58 files changed, 1572 insertions, 1051 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 7c18c27b3..e2f85c5f1 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -1,5 +1,7 @@
 add_library(video_core STATIC
-    buffer_cache.h
+    buffer_cache/buffer_block.h
+    buffer_cache/buffer_cache.h
+    buffer_cache/map_interval.h
     dma_pusher.cpp
     dma_pusher.h
     debug_utils/debug_utils.cpp
@@ -100,6 +102,7 @@ add_library(video_core STATIC
     shader/decode/integer_set.cpp
     shader/decode/half_set.cpp
     shader/decode/video.cpp
+    shader/decode/warp.cpp
     shader/decode/xmad.cpp
     shader/decode/other.cpp
     shader/control_flow.cpp
diff --git a/src/video_core/buffer_cache.h b/src/video_core/buffer_cache.h
deleted file mode 100644
index 6f868b8b4..000000000
--- a/src/video_core/buffer_cache.h
+++ /dev/null
@@ -1,299 +0,0 @@
-// Copyright 2019 yuzu Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#pragma once
-
-#include <array>
-#include <memory>
-#include <mutex>
-#include <unordered_map>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-
-#include "common/alignment.h"
-#include "common/common_types.h"
-#include "core/core.h"
-#include "video_core/memory_manager.h"
-#include "video_core/rasterizer_cache.h"
-
-namespace VideoCore {
-class RasterizerInterface;
-}
-
-namespace VideoCommon {
-
-template <typename BufferStorageType>
-class CachedBuffer final : public RasterizerCacheObject {
-public:
-    explicit CachedBuffer(VAddr cpu_addr, u8* host_ptr)
-        : RasterizerCacheObject{host_ptr}, host_ptr{host_ptr}, cpu_addr{cpu_addr} {}
-    ~CachedBuffer() override = default;
-
-    VAddr GetCpuAddr() const override {
-        return cpu_addr;
-    }
-
-    std::size_t GetSizeInBytes() const override {
-        return size;
-    }
-
-    u8* GetWritableHostPtr() const {
-        return host_ptr;
-    }
-
-    std::size_t GetSize() const {
-        return size;
-    }
-
-    std::size_t GetCapacity() const {
-        return capacity;
-    }
-
-    bool IsInternalized() const {
-        return is_internal;
-    }
-
-    const BufferStorageType& GetBuffer() const {
-        return buffer;
-    }
-
-    void SetSize(std::size_t new_size) {
-        size = new_size;
-    }
-
-    void SetInternalState(bool is_internal_) {
-        is_internal = is_internal_;
-    }
-
-    BufferStorageType ExchangeBuffer(BufferStorageType buffer_, std::size_t new_capacity) {
-        capacity = new_capacity;
-        std::swap(buffer, buffer_);
-        return buffer_;
-    }
-
-private:
-    u8* host_ptr{};
-    VAddr cpu_addr{};
-    std::size_t size{};
-    std::size_t capacity{};
-    bool is_internal{};
-    BufferStorageType buffer;
-};
-
-template <typename BufferStorageType, typename BufferType, typename StreamBuffer>
-class BufferCache : public RasterizerCache<std::shared_ptr<CachedBuffer<BufferStorageType>>> {
-public:
-    using Buffer = std::shared_ptr<CachedBuffer<BufferStorageType>>;
-    using BufferInfo = std::pair<const BufferType*, u64>;
-
-    explicit BufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system,
-                         std::unique_ptr<StreamBuffer> stream_buffer)
-        : RasterizerCache<Buffer>{rasterizer}, system{system},
-          stream_buffer{std::move(stream_buffer)}, stream_buffer_handle{
-                                                       this->stream_buffer->GetHandle()} {}
-    ~BufferCache() = default;
-
-    void Unregister(const Buffer& entry) override {
-        std::lock_guard lock{RasterizerCache<Buffer>::mutex};
-        if (entry->IsInternalized()) {
-            internalized_entries.erase(entry->GetCacheAddr());
-        }
-        ReserveBuffer(entry);
-        RasterizerCache<Buffer>::Unregister(entry);
-    }
-
-    void TickFrame() {
-        marked_for_destruction_index =
-            (marked_for_destruction_index + 1) % marked_for_destruction_ring_buffer.size();
-        MarkedForDestruction().clear();
-    }
-
-    BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4,
-                            bool internalize = false, bool is_written = false) {
-        std::lock_guard lock{RasterizerCache<Buffer>::mutex};
-
-        auto& memory_manager = system.GPU().MemoryManager();
-        const auto host_ptr = memory_manager.GetPointer(gpu_addr);
-        if (!host_ptr) {
-            return {GetEmptyBuffer(size), 0};
-        }
-        const auto cache_addr = ToCacheAddr(host_ptr);
-
-        // Cache management is a big overhead, so only cache entries with a given size.
-        // TODO: Figure out which size is the best for given games.
-        constexpr std::size_t max_stream_size = 0x800;
-        if (!internalize && size < max_stream_size &&
-            internalized_entries.find(cache_addr) == internalized_entries.end()) {
-            return StreamBufferUpload(host_ptr, size, alignment);
-        }
-
-        auto entry = RasterizerCache<Buffer>::TryGet(cache_addr);
-        if (!entry) {
-            return FixedBufferUpload(gpu_addr, host_ptr, size, internalize, is_written);
-        }
-
-        if (entry->GetSize() < size) {
-            IncreaseBufferSize(entry, size);
-        }
-        if (is_written) {
-            entry->MarkAsModified(true, *this);
-        }
-        return {ToHandle(entry->GetBuffer()), 0};
-    }
-
-    /// Uploads from a host memory. Returns the OpenGL buffer where it's located and its offset.
-    BufferInfo UploadHostMemory(const void* raw_pointer, std::size_t size,
-                                std::size_t alignment = 4) {
-        std::lock_guard lock{RasterizerCache<Buffer>::mutex};
-        return StreamBufferUpload(raw_pointer, size, alignment);
-    }
-
-    void Map(std::size_t max_size) {
-        std::tie(buffer_ptr, buffer_offset_base, invalidated) = stream_buffer->Map(max_size, 4);
-        buffer_offset = buffer_offset_base;
-    }
-
-    /// Finishes the upload stream, returns true on bindings invalidation.
-    bool Unmap() {
-        stream_buffer->Unmap(buffer_offset - buffer_offset_base);
-        return std::exchange(invalidated, false);
-    }
-
-    virtual const BufferType* GetEmptyBuffer(std::size_t size) = 0;
-
-protected:
-    void FlushObjectInner(const Buffer& entry) override {
-        DownloadBufferData(entry->GetBuffer(), 0, entry->GetSize(), entry->GetWritableHostPtr());
-    }
-
-    virtual BufferStorageType CreateBuffer(std::size_t size) = 0;
-
-    virtual const BufferType* ToHandle(const BufferStorageType& storage) = 0;
-
-    virtual void UploadBufferData(const BufferStorageType& buffer, std::size_t offset,
-                                  std::size_t size, const u8* data) = 0;
-
-    virtual void DownloadBufferData(const BufferStorageType& buffer, std::size_t offset,
-                                    std::size_t size, u8* data) = 0;
-
-    virtual void CopyBufferData(const BufferStorageType& src, const BufferStorageType& dst,
-                                std::size_t src_offset, std::size_t dst_offset,
-                                std::size_t size) = 0;
-
-private:
-    BufferInfo StreamBufferUpload(const void* raw_pointer, std::size_t size,
-                                  std::size_t alignment) {
-        AlignBuffer(alignment);
-        const std::size_t uploaded_offset = buffer_offset;
-        std::memcpy(buffer_ptr, raw_pointer, size);
-
-        buffer_ptr += size;
-        buffer_offset += size;
-        return {&stream_buffer_handle, uploaded_offset};
-    }
-
-    BufferInfo FixedBufferUpload(GPUVAddr gpu_addr, u8* host_ptr, std::size_t size,
-                                 bool internalize, bool is_written) {
-        auto& memory_manager = Core::System::GetInstance().GPU().MemoryManager();
-        const auto cpu_addr = memory_manager.GpuToCpuAddress(gpu_addr);
-        ASSERT(cpu_addr);
-
-        auto entry = GetUncachedBuffer(*cpu_addr, host_ptr);
-        entry->SetSize(size);
-        entry->SetInternalState(internalize);
-        RasterizerCache<Buffer>::Register(entry);
-
-        if (internalize) {
-            internalized_entries.emplace(ToCacheAddr(host_ptr));
-        }
-        if (is_written) {
-            entry->MarkAsModified(true, *this);
-        }
-
-        if (entry->GetCapacity() < size) {
-            MarkedForDestruction().push_back(entry->ExchangeBuffer(CreateBuffer(size), size));
-        }
-
-        UploadBufferData(entry->GetBuffer(), 0, size, host_ptr);
-        return {ToHandle(entry->GetBuffer()), 0};
-    }
-
-    void IncreaseBufferSize(Buffer& entry, std::size_t new_size) {
-        const std::size_t old_size = entry->GetSize();
-        if (entry->GetCapacity() < new_size) {
-            const auto& old_buffer = entry->GetBuffer();
-            auto new_buffer = CreateBuffer(new_size);
-
-            // Copy bits from the old buffer to the new buffer.
-            CopyBufferData(old_buffer, new_buffer, 0, 0, old_size);
-            MarkedForDestruction().push_back(
-                entry->ExchangeBuffer(std::move(new_buffer), new_size));
-
-            // This buffer could have been used
-            invalidated = true;
-        }
-        // Upload the new bits.
-        const std::size_t size_diff = new_size - old_size;
-        UploadBufferData(entry->GetBuffer(), old_size, size_diff, entry->GetHostPtr() + old_size);
-
-        // Update entry's size in the object and in the cache.
-        Unregister(entry);
-
-        entry->SetSize(new_size);
-        RasterizerCache<Buffer>::Register(entry);
-    }
-
-    Buffer GetUncachedBuffer(VAddr cpu_addr, u8* host_ptr) {
-        if (auto entry = TryGetReservedBuffer(host_ptr)) {
-            return entry;
-        }
-        return std::make_shared<CachedBuffer<BufferStorageType>>(cpu_addr, host_ptr);
-    }
-
-    Buffer TryGetReservedBuffer(u8* host_ptr) {
-        const auto it = buffer_reserve.find(ToCacheAddr(host_ptr));
-        if (it == buffer_reserve.end()) {
-            return {};
-        }
-        auto& reserve = it->second;
-        auto entry = reserve.back();
-        reserve.pop_back();
-        return entry;
-    }
-
-    void ReserveBuffer(Buffer entry) {
-        buffer_reserve[entry->GetCacheAddr()].push_back(std::move(entry));
-    }
-
-    void AlignBuffer(std::size_t alignment) {
-        // Align the offset, not the mapped pointer
-        const std::size_t offset_aligned = Common::AlignUp(buffer_offset, alignment);
-        buffer_ptr += offset_aligned - buffer_offset;
-        buffer_offset = offset_aligned;
-    }
-
-    std::vector<BufferStorageType>& MarkedForDestruction() {
-        return marked_for_destruction_ring_buffer[marked_for_destruction_index];
-    }
-
-    Core::System& system;
-
-    std::unique_ptr<StreamBuffer> stream_buffer;
-    BufferType stream_buffer_handle{};
-
-    bool invalidated = false;
-
-    u8* buffer_ptr = nullptr;
-    u64 buffer_offset = 0;
-    u64 buffer_offset_base = 0;
-
-    std::size_t marked_for_destruction_index = 0;
-    std::array<std::vector<BufferStorageType>, 4> marked_for_destruction_ring_buffer;
-
-    std::unordered_set<CacheAddr> internalized_entries;
-    std::unordered_map<CacheAddr, std::vector<Buffer>> buffer_reserve;
-};
-
-} // namespace VideoCommon
diff --git a/src/video_core/buffer_cache/buffer_block.h b/src/video_core/buffer_cache/buffer_block.h
new file mode 100644
index 000000000..4b9193182
--- /dev/null
+++ b/src/video_core/buffer_cache/buffer_block.h
@@ -0,0 +1,92 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <unordered_set>
+#include <utility>
+
+#include "common/alignment.h"
+#include "common/common_types.h"
+#include "video_core/gpu.h"
+
+namespace VideoCommon {
+
+/// Describes the address range of a cached buffer block: the cache addresses
+/// [cache_addr, cache_addr_end) plus an epoch stamp kept on behalf of the owner. The constructor
+/// and destructor are protected, so the class can only be used as a base.
+class BufferBlock {
+public:
+    /// Returns true when the half-open range [start, end) intersects this block.
+    bool Overlaps(const CacheAddr start, const CacheAddr end) const {
+        return (cache_addr < end) && (cache_addr_end > start);
+    }
+
+    /// Returns true when the range other_start..other_end lies entirely inside this block.
+    bool IsInside(const CacheAddr other_start, const CacheAddr other_end) const {
+        return cache_addr <= other_start && other_end <= cache_addr_end;
+    }
+
+    /// Returns the host pointer that corresponds to the first byte of the block.
+    u8* GetWritableHostPtr() const {
+        return FromCacheAddr(cache_addr);
+    }
+
+    /// Returns the host pointer that corresponds to `offset` bytes past the start of the block.
+    u8* GetWritableHostPtr(std::size_t offset) const {
+        return FromCacheAddr(cache_addr + offset);
+    }
+
+    /// Converts a cache address into a byte offset relative to the start of the block.
+    /// No range check is made: `in_addr` is expected to be at or after the block's start.
+    std::size_t GetOffset(const CacheAddr in_addr) const {
+        return static_cast<std::size_t>(in_addr - cache_addr);
+    }
+
+    /// Returns the first cache address covered by the block.
+    CacheAddr GetCacheAddr() const {
+        return cache_addr;
+    }
+
+    /// Returns the cache address one past the last byte covered by the block.
+    CacheAddr GetCacheAddrEnd() const {
+        return cache_addr_end;
+    }
+
+    /// Moves the block to `new_addr`; the end address follows from the fixed size.
+    void SetCacheAddr(const CacheAddr new_addr) {
+        cache_addr = new_addr;
+        cache_addr_end = new_addr + size;
+    }
+
+    /// Returns the size of the block in bytes.
+    std::size_t GetSize() const {
+        return size;
+    }
+
+    /// Stores the epoch stamp associated with this block.
+    void SetEpoch(u64 new_epoch) {
+        epoch = new_epoch;
+    }
+
+    /// Returns the epoch stamp last stored with SetEpoch (zero if it was never set).
+    u64 GetEpoch() const {
+        return epoch;
+    }
+
+protected:
+    /// Creates a block of `size` bytes that starts at `cache_addr`.
+    explicit BufferBlock(CacheAddr cache_addr, const std::size_t size) : size{size} {
+        SetCacheAddr(cache_addr);
+    }
+    ~BufferBlock() = default;
+
+private:
+    CacheAddr cache_addr{};
+    CacheAddr cache_addr_end{};
+    std::size_t size{};
+    u64 epoch{};
+};
+
+} // namespace VideoCommon
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
new file mode 100644
index 000000000..2442ddfd6
--- /dev/null
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -0,0 +1,456 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <cstring>
+#include <list>
+#include <memory>
+#include <mutex>
+#include <optional>
+#include <tuple>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include <boost/icl/interval_map.hpp>
+#include <boost/icl/interval_set.hpp>
+#include <boost/range/iterator_range.hpp>
+
+#include "common/alignment.h"
+#include "common/common_types.h"
+#include "core/core.h"
+#include "video_core/buffer_cache/buffer_block.h"
+#include "video_core/buffer_cache/map_interval.h"
+#include "video_core/memory_manager.h"
+#include "video_core/rasterizer_interface.h"
+
+namespace VideoCommon {
+
+using MapInterval = std::shared_ptr<MapIntervalBase>;
+
+template <typename TBuffer, typename TBufferType, typename StreamBuffer>
+class BufferCache {
+public:
+    using BufferInfo = std::pair<const TBufferType*, u64>;
+
+    /// Makes `size` bytes of GPU memory at `gpu_addr` available in a host buffer and returns
+    /// that buffer's handle with the offset of the data. Requests below max_stream_size that
+    /// are not `is_written` and touch no page tracked as written are copied to the stream
+    /// buffer; all others are served from a cached block. Unmapped addresses get an empty buffer.
+    BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4,
+                            bool is_written = false) {
+        std::lock_guard lock{mutex};
+
+        const auto host_ptr = system.GPU().MemoryManager().GetPointer(gpu_addr);
+        if (!host_ptr) {
+            return {GetEmptyBuffer(size), 0};
+        }
+        const auto cache_addr = ToCacheAddr(host_ptr);
+
+        // Cache management is a big overhead, so only cache entries with a given size.
+        // TODO: Figure out which size is the best for given games.
+        constexpr std::size_t max_stream_size = 0x800;
+        const bool is_small = size < max_stream_size;
+        if (is_small && !is_written && !IsRegionWritten(cache_addr, cache_addr + size - 1)) {
+            return StreamBufferUpload(host_ptr, size, alignment);
+        }
+
+        auto block = GetBlock(cache_addr, size);
+        auto map = MapAddress(block, gpu_addr, cache_addr, size);
+        if (is_written) {
+            map->MarkAsModified(true, GetModifiedTicks());
+            if (!map->IsWritten()) {
+                map->MarkAsWritten(true);
+                MarkRegionAsWritten(map->GetStart(), map->GetEnd() - 1);
+            }
+        } else if (map->IsWritten()) {
+            WriteBarrier();
+        }
+
+        // The returned offset is relative to the start of the block that holds the data.
+        const u64 offset = static_cast<u64>(block->GetOffset(cache_addr));
+        return {ToHandle(block), offset};
+    }
+
+    /// Copies host memory into the stream buffer; returns its handle and the data's offset.
+    BufferInfo UploadHostMemory(const void* raw_pointer, std::size_t size,
+                                std::size_t alignment = 4) {
+        std::lock_guard lock{mutex};
+        return StreamBufferUpload(raw_pointer, size, alignment);
+    }
+
+    /// Maps the stream buffer (passing `max_size` and an alignment of 4) and rewinds the offset.
+    void Map(std::size_t max_size) {
+        std::lock_guard lock{mutex};
+        std::tie(buffer_ptr, buffer_offset_base, invalidated) = stream_buffer->Map(max_size, 4);
+        buffer_offset = buffer_offset_base;
+    }
+
+    /// Finishes the upload stream, returns true on bindings invalidation.
+    bool Unmap() {
+        std::lock_guard lock{mutex};
+        // Tell the stream buffer how many bytes were consumed since the matching Map() call.
+        stream_buffer->Unmap(buffer_offset - buffer_offset_base);
+        return std::exchange(invalidated, false);
+    }
+
+    /// Advances the epoch and drops queued blocks whose recorded epoch is older than the new one.
+    void TickFrame() {
+        ++epoch;
+        // Only the front of the queue is examined; popping stops at the first block kept alive.
+        while (!pending_destruction.empty() &&
+               pending_destruction.front()->GetEpoch() + 1 <= epoch) {
+            pending_destruction.pop_front();
+        }
+    }
+
+    /// Writes every modified, registered map overlapping [addr, addr + size) back to memory,
+    /// starting with the map that has the lowest modification tick.
+    void FlushRegion(CacheAddr addr, std::size_t size) {
+        std::lock_guard lock{mutex};
+        const auto older_first = [](const MapInterval& lhs, const MapInterval& rhs) {
+            return lhs->GetModificationTick() < rhs->GetModificationTick();
+        };
+        std::vector<MapInterval> maps = GetMapsInRange(addr, size);
+        std::sort(maps.begin(), maps.end(), older_first);
+        for (const MapInterval& map : maps) {
+            if (map->IsRegistered() && map->IsModified()) {
+                FlushMap(map);
+            }
+        }
+    }
+
+    /// Unregisters every registered map that overlaps [addr, addr + size). Nothing is written
+    /// back to memory here.
+    void InvalidateRegion(CacheAddr addr, u64 size) {
+        std::lock_guard lock{mutex};
+        for (MapInterval& map : GetMapsInRange(addr, size)) {
+            if (!map->IsRegistered()) {
+                continue;
+            }
+            Unregister(map);
+        }
+    }
+
+    virtual const TBufferType* GetEmptyBuffer(std::size_t size) = 0;
+
+protected:
+    explicit BufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system,
+                         std::unique_ptr<StreamBuffer> stream_buffer)
+        : rasterizer{rasterizer}, system{system}, stream_buffer{std::move(stream_buffer)},
+          stream_buffer_handle{this->stream_buffer->GetHandle()} {}
+    // Protected, non-virtual destructor: the cache cannot be deleted through a base pointer.
+    ~BufferCache() = default;
+    /// Backend hook: returns the handle that UploadMemory hands out for the block `storage`.
+    virtual const TBufferType* ToHandle(const TBuffer& storage) = 0;
+    /// Backend hook: called by UploadMemory when a map marked as written is requested for reading.
+    virtual void WriteBarrier() = 0;
+    /// Backend hook: expected to create a block of `size` bytes that starts at `cache_addr`.
+    virtual TBuffer CreateBlock(CacheAddr cache_addr, std::size_t size) = 0;
+    /// Backend hook: expected to copy `size` bytes from `data` into `buffer` at `offset`.
+    virtual void UploadBlockData(const TBuffer& buffer, std::size_t offset, std::size_t size,
+                                 const u8* data) = 0;
+    /// Backend hook: expected to copy `size` bytes from `buffer` at `offset` out to `data`.
+    virtual void DownloadBlockData(const TBuffer& buffer, std::size_t offset, std::size_t size,
+                                   u8* data) = 0;
+    /// Backend hook: expected to copy `size` bytes from `src` at `src_offset` to `dst`.
+    virtual void CopyBlock(const TBuffer& src, const TBuffer& dst, std::size_t src_offset,
+                           std::size_t dst_offset, std::size_t size) = 0;
+
+    /// Registers `new_map` in the cache; logs and bails out if its addresses are unmapped.
+    void Register(const MapInterval& new_map, bool inherit_written = false) {
+        const CacheAddr start = new_map->GetStart();
+        const CacheAddr end = new_map->GetEnd();
+        const std::optional<VAddr> cpu_addr =
+            system.GPU().MemoryManager().GpuToCpuAddress(new_map->GetGpuAddress());
+        if (!start || !cpu_addr) {
+            LOG_CRITICAL(HW_GPU, "Failed to register buffer with unmapped gpu_address 0x{:016x}",
+                         new_map->GetGpuAddress());
+            return;
+        }
+        const std::size_t size = end - start;
+        new_map->SetCpuAddress(*cpu_addr);
+        new_map->MarkAsRegistered(true);
+        mapped_addresses.insert({IntervalType{start, end}, new_map});
+        rasterizer.UpdatePagesCachedCount(*cpu_addr, size, 1);
+        if (inherit_written) {
+            new_map->MarkAsWritten(true);
+            MarkRegionAsWritten(start, end - 1);
+        }
+    }
+
+    /// Removes `map` from the cache, reverting the page and written-page accounting it holds.
+    void Unregister(MapInterval& map) {
+        const IntervalType interval{map->GetStart(), map->GetEnd()};
+        const std::size_t size = interval.upper() - interval.lower();
+        rasterizer.UpdatePagesCachedCount(map->GetCpuAddress(), size, -1);
+        map->MarkAsRegistered(false);
+        if (map->IsWritten()) {
+            UnmarkRegionAsWritten(interval.lower(), interval.upper() - 1);
+        }
+        mapped_addresses.erase(interval);
+    }
+
+private:
+    /// Allocates a new map object for the range start..end associated with `gpu_addr`.
+    MapInterval CreateMap(const CacheAddr start, const CacheAddr end, const GPUVAddr gpu_addr) {
+        return std::make_shared<MapIntervalBase>(start, end, gpu_addr);
+    }
+    /// Returns the map covering [cache_addr, cache_addr + size), creating it or merging it with
+    /// every overlapping map as needed. Data no existing map covers is uploaded to `block`.
+    MapInterval MapAddress(const TBuffer& block, const GPUVAddr gpu_addr,
+                           const CacheAddr cache_addr, const std::size_t size) {
+        const CacheAddr cache_addr_end = cache_addr + size;
+        std::vector<MapInterval> overlaps = GetMapsInRange(cache_addr, size);
+        if (overlaps.empty()) {
+            // Nothing cached here yet: upload the whole range and track it with a new map.
+            MapInterval new_map = CreateMap(cache_addr, cache_addr_end, gpu_addr);
+            UploadBlockData(block, block->GetOffset(cache_addr), size, FromCacheAddr(cache_addr));
+            Register(new_map);
+            return new_map;
+        }
+
+        if (overlaps.size() == 1 && overlaps.front()->IsInside(cache_addr, cache_addr_end)) {
+            // A single existing map already contains the request.
+            return overlaps.front();
+        }
+
+        // Grow the range so that it swallows every overlap, inheriting their state flags.
+        CacheAddr new_start = cache_addr;
+        CacheAddr new_end = cache_addr_end;
+        bool write_inheritance = false;
+        bool modified_inheritance = false;
+        for (const MapInterval& overlap : overlaps) {
+            new_start = std::min(overlap->GetStart(), new_start);
+            new_end = std::max(overlap->GetEnd(), new_end);
+            write_inheritance = write_inheritance || overlap->IsWritten();
+            modified_inheritance = modified_inheritance || overlap->IsModified();
+        }
+        const GPUVAddr new_gpu_addr = gpu_addr + new_start - cache_addr;
+        // The overlapping maps are replaced by the single merged map created below.
+        for (MapInterval& overlap : overlaps) {
+            Unregister(overlap);
+        }
+        UpdateBlock(block, new_start, new_end, overlaps);
+        MapInterval new_map = CreateMap(new_start, new_end, new_gpu_addr);
+        if (modified_inheritance) {
+            new_map->MarkAsModified(true, GetModifiedTicks());
+        }
+        Register(new_map, write_inheritance);
+        return new_map;
+    }
+
+    /// Uploads to `block` the parts of [start, end) that none of `overlaps` already covers.
+    void UpdateBlock(const TBuffer& block, CacheAddr start, CacheAddr end,
+                     std::vector<MapInterval>& overlaps) {
+        IntervalSet missing{};
+        missing.add(IntervalType{start, end});
+        for (const MapInterval& overlap : overlaps) {
+            missing.subtract(IntervalType{overlap->GetStart(), overlap->GetEnd()});
+        }
+        for (const auto& gap : missing) {
+            const CacheAddr gap_start = gap.lower();
+            const std::size_t gap_size = gap.upper() - gap_start;
+            if (gap_size == 0) {
+                continue;
+            }
+            UploadBlockData(block, block->GetOffset(gap_start), gap_size, FromCacheAddr(gap_start));
+        }
+    }
+
+    /// Collects the maps stored in `mapped_addresses` that intersect [addr, addr + size).
+    std::vector<MapInterval> GetMapsInRange(CacheAddr addr, std::size_t size) {
+        std::vector<MapInterval> result{};
+        if (size == 0) {
+            return result;
+        }
+        const IntervalType query{addr, addr + size};
+        const auto [first, last] = mapped_addresses.equal_range(query);
+        for (auto it = first; it != last; ++it) {
+            result.push_back(it->second);
+        }
+        return result;
+    }
+
+    /// Advances and returns the tick counter used to order modifications of cached objects.
+    u64 GetModifiedTicks() {
+        return ++modified_ticks;
+    }
+
+    /// Downloads the data covered by `map` from its block to memory and clears its modified flag.
+    void FlushMap(MapInterval map) {
+        const CacheAddr addr = map->GetStart();
+        TBuffer block = blocks[addr >> block_page_bits];
+        DownloadBlockData(block, block->GetOffset(addr), map->GetEnd() - addr, FromCacheAddr(addr));
+        map->MarkAsModified(false, 0);
+    }
+
+    /// Copies `size` bytes from `raw_pointer` to the aligned write position of the mapped stream
+    /// buffer, advances that position and returns the stream handle with the copy's offset.
+    BufferInfo StreamBufferUpload(const void* raw_pointer, std::size_t size,
+                                  std::size_t alignment) {
+        AlignBuffer(alignment);
+        std::memcpy(buffer_ptr, raw_pointer, size);
+        buffer_ptr += size;
+        const std::size_t uploaded_offset = std::exchange(buffer_offset, buffer_offset + size);
+        return {&stream_buffer_handle, uploaded_offset};
+    }
+
+    /// Rounds `buffer_offset` up with Common::AlignUp and moves `buffer_ptr` by the same amount.
+    void AlignBuffer(std::size_t alignment) {
+        const std::size_t aligned_offset = Common::AlignUp(buffer_offset, alignment);
+        buffer_ptr += aligned_offset - buffer_offset; // the offset is aligned, not the pointer
+        buffer_offset = aligned_offset;
+    }
+
+    /// Replaces `buffer` with a block one block page larger at the same address, copying the
+    /// old contents over and queueing the old block for deferred destruction.
+    TBuffer EnlargeBlock(TBuffer buffer) {
+        const CacheAddr cache_addr = buffer->GetCacheAddr();
+        const std::size_t old_size = buffer->GetSize();
+        const std::size_t new_size = old_size + block_page_size;
+        TBuffer new_buffer = CreateBlock(cache_addr, new_size);
+        CopyBlock(buffer, new_buffer, 0, 0, old_size);
+        buffer->SetEpoch(epoch);
+        pending_destruction.push_back(buffer);
+        // Point every page of the enlarged range at the new block.
+        const u64 last_page = (cache_addr + new_size - 1) >> block_page_bits;
+        for (u64 page = cache_addr >> block_page_bits; page <= last_page; ++page) {
+            blocks[page] = new_buffer;
+        }
+        return new_buffer;
+    }
+
+    /// Fuses two blocks into one new block that starts at the lower of their addresses and is
+    /// as large as both together; both inputs are queued for deferred destruction.
+    TBuffer MergeBlocks(TBuffer first, TBuffer second) {
+        const CacheAddr first_addr = first->GetCacheAddr();
+        const CacheAddr second_addr = second->GetCacheAddr();
+        const std::size_t first_size = first->GetSize();
+        const std::size_t second_size = second->GetSize();
+        const CacheAddr new_addr = std::min(first_addr, second_addr);
+        const std::size_t new_size = first_size + second_size;
+        TBuffer new_buffer = CreateBlock(new_addr, new_size);
+        CopyBlock(first, new_buffer, 0, new_buffer->GetOffset(first_addr), first_size);
+        CopyBlock(second, new_buffer, 0, new_buffer->GetOffset(second_addr), second_size);
+        first->SetEpoch(epoch);
+        pending_destruction.push_back(first);
+        second->SetEpoch(epoch);
+        pending_destruction.push_back(second);
+        // Redirect every page of the merged range to the new block.
+        const u64 last_page = (new_addr + new_size - 1) >> block_page_bits;
+        for (u64 page = new_addr >> block_page_bits; page <= last_page; ++page) {
+            blocks[page] = new_buffer;
+        }
+        return new_buffer;
+    }
+
+    /// Returns one block that backs every block page touched by `size` bytes at `cache_addr`,
+    /// creating, enlarging or merging blocks page by page as needed.
+    TBuffer GetBlock(const CacheAddr cache_addr, const std::size_t size) {
+        TBuffer found{};
+        const CacheAddr cache_addr_end = cache_addr + size - 1;
+        const u64 first_page = cache_addr >> block_page_bits;
+        const u64 last_page = cache_addr_end >> block_page_bits;
+        for (u64 page = first_page; page <= last_page; ++page) {
+            const auto it = blocks.find(page);
+            if (it == blocks.end()) {
+                if (found) {
+                    // Unbacked page while a block is already being gathered: grow that block.
+                    found = EnlargeBlock(found);
+                } else {
+                    // Unbacked page and nothing gathered yet: start a fresh one-page block.
+                    const CacheAddr start_addr = page << block_page_bits;
+                    found = CreateBlock(start_addr, block_page_size);
+                    blocks[page] = found;
+                }
+            } else if (!found) {
+                // Nothing gathered yet: adopt the block that owns this page.
+                found = it->second;
+            } else if (!(found == it->second)) {
+                // A different block owns this page: fuse it with the one gathered so far.
+                found = MergeBlocks(found, it->second);
+            }
+        }
+
+        return found;
+    }
+
+    /// Adds one write reference to every write page touched by the inclusive range
+    /// [start, end].
+    void MarkRegionAsWritten(const CacheAddr start, const CacheAddr end) {
+        const u64 first_page = start >> write_page_bit;
+        const u64 last_page = end >> write_page_bit;
+        for (u64 page = first_page; page <= last_page; ++page) {
+            // Untracked pages are inserted with a count of one; tracked ones are incremented.
+            const auto [it, inserted] = written_pages.try_emplace(page, 1);
+            if (!inserted) {
+                ++it->second;
+            }
+        }
+    }
+
+    /// Drops one write reference from every tracked page in [start, end]; erases emptied pages.
+    void UnmarkRegionAsWritten(const CacheAddr start, const CacheAddr end) {
+        const u64 last_page = end >> write_page_bit;
+        for (u64 page = start >> write_page_bit; page <= last_page; ++page) {
+            const auto it = written_pages.find(page);
+            if (it == written_pages.end()) {
+                continue;
+            }
+            if (it->second <= 1) {
+                written_pages.erase(it);
+            } else {
+                --it->second;
+            }
+        }
+    }
+
+    /// Returns true when any write page touched by the inclusive range [start, end] is tracked
+    /// in `written_pages`.
+    bool IsRegionWritten(const CacheAddr start, const CacheAddr end) const {
+        const u64 last_page = end >> write_page_bit;
+        for (u64 page = start >> write_page_bit; page <= last_page; ++page) {
+            if (written_pages.find(page) != written_pages.end()) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    VideoCore::RasterizerInterface& rasterizer;
+    Core::System& system;
+    std::unique_ptr<StreamBuffer> stream_buffer;
+
+    TBufferType stream_buffer_handle{};
+
+    bool invalidated = false;
+
+    u8* buffer_ptr = nullptr;
+    u64 buffer_offset = 0;
+    u64 buffer_offset_base = 0;
+
+    using IntervalSet = boost::icl::interval_set<CacheAddr>;
+    using IntervalCache = boost::icl::interval_map<CacheAddr, MapInterval>;
+    using IntervalType = typename IntervalCache::interval_type;
+    IntervalCache mapped_addresses{};
+
+    static constexpr u64 write_page_bit{11};
+    std::unordered_map<u64, u32> written_pages{};
+
+    static constexpr u64 block_page_bits{21};
+    static constexpr u64 block_page_size{1 << block_page_bits};
+    std::unordered_map<u64, TBuffer> blocks{};
+
+    std::list<TBuffer> pending_destruction{};
+    u64 epoch{};
+    u64 modified_ticks{};
+
+    std::recursive_mutex mutex;
+};
+
+} // namespace VideoCommon
diff --git a/src/video_core/buffer_cache/map_interval.h b/src/video_core/buffer_cache/map_interval.h
new file mode 100644
index 000000000..3a104d5cd
--- /dev/null
+++ b/src/video_core/buffer_cache/map_interval.h
@@ -0,0 +1,96 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <tuple>
+
+#include "common/common_types.h"
+#include "video_core/gpu.h"
+
+namespace VideoCommon {
+
+/// Describes an interval of cache addresses plus the GPU/CPU addresses and the
+/// registered/modified/written state attached to it.
+class MapIntervalBase {
+public:
+    MapIntervalBase(const CacheAddr start, const CacheAddr end, const GPUVAddr gpu_addr)
+        : start{start}, end{end}, gpu_addr{gpu_addr} {}
+
+    void SetCpuAddress(VAddr new_cpu_addr) {
+        cpu_addr = new_cpu_addr;
+    }
+
+    VAddr GetCpuAddress() const {
+        return cpu_addr;
+    }
+
+    GPUVAddr GetGpuAddress() const {
+        return gpu_addr;
+    }
+
+    /// Returns true when start <= other_start and other_end <= end (other range is contained).
+    bool IsInside(const CacheAddr other_start, const CacheAddr other_end) const {
+        return (start <= other_start && other_end <= end);
+    }
+
+    /// Intervals compare equal when both bounds match; addresses and flags are not compared.
+    bool operator==(const MapIntervalBase& rhs) const {
+        return std::tie(start, end) == std::tie(rhs.start, rhs.end);
+    }
+
+    bool operator!=(const MapIntervalBase& rhs) const {
+        return !operator==(rhs);
+    }
+
+    void MarkAsRegistered(const bool registered) {
+        is_registered = registered;
+    }
+
+    bool IsRegistered() const {
+        return is_registered;
+    }
+
+    CacheAddr GetStart() const {
+        return start;
+    }
+
+    CacheAddr GetEnd() const {
+        return end;
+    }
+
+    /// Sets the modified flag and always overwrites the stored modification tick with tick.
+    void MarkAsModified(const bool is_modified_, const u64 tick) {
+        is_modified = is_modified_;
+        ticks = tick;
+    }
+
+    bool IsModified() const {
+        return is_modified;
+    }
+
+    u64 GetModificationTick() const {
+        return ticks;
+    }
+
+    void MarkAsWritten(const bool is_written_) {
+        is_written = is_written_;
+    }
+
+    bool IsWritten() const {
+        return is_written;
+    }
+
+private:
+    CacheAddr start;
+    CacheAddr end;
+    GPUVAddr gpu_addr;
+    VAddr cpu_addr{};
+    bool is_written{};
+    bool is_modified{};
+    bool is_registered{};
+    u64 ticks{};
+};
+
+} // namespace VideoCommon
diff --git a/src/video_core/engines/fermi_2d.cpp b/src/video_core/engines/fermi_2d.cpp
index 0ee228e28..98a8b5337 100644
--- a/src/video_core/engines/fermi_2d.cpp
+++ b/src/video_core/engines/fermi_2d.cpp
@@ -10,8 +10,7 @@
 
 namespace Tegra::Engines {
 
-Fermi2D::Fermi2D(VideoCore::RasterizerInterface& rasterizer, MemoryManager& memory_manager)
-    : rasterizer{rasterizer}, memory_manager{memory_manager} {}
+Fermi2D::Fermi2D(VideoCore::RasterizerInterface& rasterizer) : rasterizer{rasterizer} {}
 
 void Fermi2D::CallMethod(const GPU::MethodCall& method_call) {
     ASSERT_MSG(method_call.method < Regs::NUM_REGS,
diff --git a/src/video_core/engines/fermi_2d.h b/src/video_core/engines/fermi_2d.h
index 05421d185..0901cf2fa 100644
--- a/src/video_core/engines/fermi_2d.h
+++ b/src/video_core/engines/fermi_2d.h
@@ -33,7 +33,7 @@ namespace Tegra::Engines {
 
 class Fermi2D final {
 public:
-    explicit Fermi2D(VideoCore::RasterizerInterface& rasterizer, MemoryManager& memory_manager);
+    explicit Fermi2D(VideoCore::RasterizerInterface& rasterizer);
     ~Fermi2D() = default;
 
     /// Write the value to the register identified by method.
@@ -145,7 +145,6 @@ public:
 
 private:
     VideoCore::RasterizerInterface& rasterizer;
-    MemoryManager& memory_manager;
 
     /// Performs the copy from the source surface to the destination surface as configured in the
     /// registers.
diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp
index 44279de00..fa4a7c5c1 100644
--- a/src/video_core/engines/kepler_memory.cpp
+++ b/src/video_core/engines/kepler_memory.cpp
@@ -15,7 +15,7 @@
 namespace Tegra::Engines {
 
 KeplerMemory::KeplerMemory(Core::System& system, MemoryManager& memory_manager)
-    : system{system}, memory_manager{memory_manager}, upload_state{memory_manager, regs.upload} {}
+    : system{system}, upload_state{memory_manager, regs.upload} {}
 
 KeplerMemory::~KeplerMemory() = default;
 
diff --git a/src/video_core/engines/kepler_memory.h b/src/video_core/engines/kepler_memory.h
index f3bc675a9..e0e25c321 100644
--- a/src/video_core/engines/kepler_memory.h
+++ b/src/video_core/engines/kepler_memory.h
@@ -65,7 +65,6 @@ public:
 
 private:
     Core::System& system;
-    MemoryManager& memory_manager;
     Upload::State upload_state;
 };
 
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index 125c53360..f5158d219 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -249,16 +249,10 @@ void Maxwell3D::CallMacroMethod(u32 method, std::vector<u32> parameters) {
     executing_macro = 0;
 
     // Lookup the macro offset
-    const u32 entry{(method - MacroRegistersStart) >> 1};
-    const auto& search{macro_offsets.find(entry)};
-    if (search == macro_offsets.end()) {
-        LOG_CRITICAL(HW_GPU, "macro not found for method 0x{:X}!", method);
-        UNREACHABLE();
-        return;
-    }
+    const u32 entry = ((method - MacroRegistersStart) >> 1) % macro_positions.size();
 
     // Execute the current macro.
-    macro_interpreter.Execute(search->second, std::move(parameters));
+    macro_interpreter.Execute(macro_positions[entry], std::move(parameters));
 }
 
 void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
@@ -421,7 +415,7 @@ void Maxwell3D::ProcessMacroUpload(u32 data) {
 }
 
 void Maxwell3D::ProcessMacroBind(u32 data) {
-    macro_offsets[regs.macros.entry] = data;
+    macro_positions[regs.macros.entry++ % macro_positions.size()] = data; // keep index in bounds
 }
 
 void Maxwell3D::ProcessQueryGet() {
@@ -524,7 +518,7 @@ void Maxwell3D::ProcessQueryCondition() {
 void Maxwell3D::ProcessSyncPoint() {
     const u32 sync_point = regs.sync_info.sync_point.Value();
     const u32 increment = regs.sync_info.increment.Value();
-    const u32 cache_flush = regs.sync_info.unknown.Value();
+    [[maybe_unused]] const u32 cache_flush = regs.sync_info.unknown.Value();
     if (increment) {
         system.GPU().IncrementSyncPoint(sync_point);
     }
@@ -626,10 +620,10 @@ Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const {
     Texture::TICEntry tic_entry;
     memory_manager.ReadBlockUnsafe(tic_address_gpu, &tic_entry, sizeof(Texture::TICEntry));
 
-    const auto r_type{tic_entry.r_type.Value()};
-    const auto g_type{tic_entry.g_type.Value()};
-    const auto b_type{tic_entry.b_type.Value()};
-    const auto a_type{tic_entry.a_type.Value()};
+    [[maybe_unused]] const auto r_type{tic_entry.r_type.Value()};
+    [[maybe_unused]] const auto g_type{tic_entry.g_type.Value()};
+    [[maybe_unused]] const auto b_type{tic_entry.b_type.Value()};
+    [[maybe_unused]] const auto a_type{tic_entry.a_type.Value()};
 
     // TODO(Subv): Different data types for separate components are not supported
     DEBUG_ASSERT(r_type == g_type && r_type == b_type && r_type == a_type);
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index 1ee982b76..0184342a0 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -1270,7 +1270,7 @@ private:
     MemoryManager& memory_manager;
 
     /// Start offsets of each macro in macro_memory
-    std::unordered_map<u32, u32> macro_offsets;
+    std::array<u32, 0x80> macro_positions = {};
 
     /// Memory for macro code
     MacroMemory macro_memory;
diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp
index a28c04473..ad8453c5f 100644
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -5,18 +5,17 @@
 #include "common/assert.h"
 #include "common/logging/log.h"
 #include "core/core.h"
+#include "core/settings.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/engines/maxwell_dma.h"
 #include "video_core/memory_manager.h"
-#include "video_core/rasterizer_interface.h"
 #include "video_core/renderer_base.h"
 #include "video_core/textures/decoders.h"
 
 namespace Tegra::Engines {
 
-MaxwellDMA::MaxwellDMA(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
-                       MemoryManager& memory_manager)
-    : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager} {}
+MaxwellDMA::MaxwellDMA(Core::System& system, MemoryManager& memory_manager)
+    : system{system}, memory_manager{memory_manager} {}
 
 void MaxwellDMA::CallMethod(const GPU::MethodCall& method_call) {
     ASSERT_MSG(method_call.method < Regs::NUM_REGS,
@@ -84,13 +83,17 @@ void MaxwellDMA::HandleCopy() {
     ASSERT(regs.exec.enable_2d == 1);
 
     if (regs.exec.is_dst_linear && !regs.exec.is_src_linear) {
-        ASSERT(regs.src_params.size_z == 1);
+        ASSERT(regs.src_params.BlockDepth() == 0);
         // If the input is tiled and the output is linear, deswizzle the input and copy it over.
-        const u32 src_bytes_per_pixel = regs.src_pitch / regs.src_params.size_x;
+        const u32 bytes_per_pixel = regs.dst_pitch / regs.x_count;
         const std::size_t src_size = Texture::CalculateSize(
-            true, src_bytes_per_pixel, regs.src_params.size_x, regs.src_params.size_y,
+            true, bytes_per_pixel, regs.src_params.size_x, regs.src_params.size_y,
             regs.src_params.size_z, regs.src_params.BlockHeight(), regs.src_params.BlockDepth());
 
+        const std::size_t src_layer_size = Texture::CalculateSize(
+            true, bytes_per_pixel, regs.src_params.size_x, regs.src_params.size_y, 1,
+            regs.src_params.BlockHeight(), regs.src_params.BlockDepth());
+
         const std::size_t dst_size = regs.dst_pitch * regs.y_count;
 
         if (read_buffer.size() < src_size) {
@@ -104,23 +107,23 @@ void MaxwellDMA::HandleCopy() {
         memory_manager.ReadBlock(source, read_buffer.data(), src_size);
         memory_manager.ReadBlock(dest, write_buffer.data(), dst_size);
 
-        Texture::UnswizzleSubrect(regs.x_count, regs.y_count, regs.dst_pitch,
-                                  regs.src_params.size_x, src_bytes_per_pixel, read_buffer.data(),
-                                  write_buffer.data(), regs.src_params.BlockHeight(),
-                                  regs.src_params.pos_x, regs.src_params.pos_y);
+        Texture::UnswizzleSubrect(
+            regs.x_count, regs.y_count, regs.dst_pitch, regs.src_params.size_x, bytes_per_pixel,
+            read_buffer.data() + src_layer_size * regs.src_params.pos_z, write_buffer.data(),
+            regs.src_params.BlockHeight(), regs.src_params.pos_x, regs.src_params.pos_y);
 
         memory_manager.WriteBlock(dest, write_buffer.data(), dst_size);
     } else {
         ASSERT(regs.dst_params.BlockDepth() == 0);
 
-        const u32 src_bytes_per_pixel = regs.src_pitch / regs.x_count;
+        const u32 bytes_per_pixel = regs.src_pitch / regs.x_count;
 
         const std::size_t dst_size = Texture::CalculateSize(
-            true, src_bytes_per_pixel, regs.dst_params.size_x, regs.dst_params.size_y,
+            true, bytes_per_pixel, regs.dst_params.size_x, regs.dst_params.size_y,
             regs.dst_params.size_z, regs.dst_params.BlockHeight(), regs.dst_params.BlockDepth());
 
         const std::size_t dst_layer_size = Texture::CalculateSize(
-            true, src_bytes_per_pixel, regs.dst_params.size_x, regs.dst_params.size_y, 1,
+            true, bytes_per_pixel, regs.dst_params.size_x, regs.dst_params.size_y, 1,
             regs.dst_params.BlockHeight(), regs.dst_params.BlockDepth());
 
         const std::size_t src_size = regs.src_pitch * regs.y_count;
@@ -133,14 +136,19 @@ void MaxwellDMA::HandleCopy() {
             write_buffer.resize(dst_size);
         }
 
-        memory_manager.ReadBlock(source, read_buffer.data(), src_size);
-        memory_manager.ReadBlock(dest, write_buffer.data(), dst_size);
+        if (Settings::values.use_accurate_gpu_emulation) {
+            memory_manager.ReadBlock(source, read_buffer.data(), src_size);
+            memory_manager.ReadBlock(dest, write_buffer.data(), dst_size);
+        } else {
+            memory_manager.ReadBlockUnsafe(source, read_buffer.data(), src_size);
+            memory_manager.ReadBlockUnsafe(dest, write_buffer.data(), dst_size);
+        }
 
         // If the input is linear and the output is tiled, swizzle the input and copy it over.
-        Texture::SwizzleSubrect(regs.x_count, regs.y_count, regs.src_pitch, regs.dst_params.size_x,
-                                src_bytes_per_pixel,
-                                write_buffer.data() + dst_layer_size * regs.dst_params.pos_z,
-                                read_buffer.data(), regs.dst_params.BlockHeight());
+        Texture::SwizzleSubrect(
+            regs.x_count, regs.y_count, regs.src_pitch, regs.dst_params.size_x, bytes_per_pixel,
+            write_buffer.data() + dst_layer_size * regs.dst_params.pos_z, read_buffer.data(),
+            regs.dst_params.BlockHeight(), regs.dst_params.pos_x, regs.dst_params.pos_y);
 
         memory_manager.WriteBlock(dest, write_buffer.data(), dst_size);
     }
diff --git a/src/video_core/engines/maxwell_dma.h b/src/video_core/engines/maxwell_dma.h
index 17b015ca7..93808a9bb 100644
--- a/src/video_core/engines/maxwell_dma.h
+++ b/src/video_core/engines/maxwell_dma.h
@@ -20,10 +20,6 @@ namespace Tegra {
 class MemoryManager;
 }
 
-namespace VideoCore {
-class RasterizerInterface;
-}
-
 namespace Tegra::Engines {
 
 /**
@@ -33,8 +29,7 @@ namespace Tegra::Engines {
 
 class MaxwellDMA final {
 public:
-    explicit MaxwellDMA(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
-                        MemoryManager& memory_manager);
+    explicit MaxwellDMA(Core::System& system, MemoryManager& memory_manager);
     ~MaxwellDMA() = default;
 
     /// Write the value to the register identified by method.
@@ -180,8 +175,6 @@ public:
 private:
     Core::System& system;
 
-    VideoCore::RasterizerInterface& rasterizer;
-
     MemoryManager& memory_manager;
 
     std::vector<u8> read_buffer;
diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h
index aaa1acea9..c3678b9ea 100644
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -538,6 +538,12 @@ enum class PhysicalAttributeDirection : u64 {
     Output = 1,
 };
 
+enum class VoteOperation : u64 { // Values of the Instruction::vote.operation field.
+    All = 0, // allThreadsNV
+    Any = 1, // anyThreadNV
+    Eq = 2,  // allThreadsEqualNV
+};
+
 union Instruction {
     Instruction& operator=(const Instruction& instr) {
         value = instr.value;
@@ -565,6 +571,13 @@ union Instruction {
     } nop;
 
     union {
+        BitField<48, 2, VoteOperation> operation;
+        BitField<45, 3, u64> dest_pred;
+        BitField<39, 3, u64> value;
+        BitField<42, 1, u64> negate_value;
+    } vote;
+
+    union {
         BitField<8, 8, Register> gpr;
         BitField<20, 24, s64> offset;
     } gmem;
@@ -873,6 +886,7 @@ union Instruction {
     union {
         BitField<0, 3, u64> pred0;
         BitField<3, 3, u64> pred3;
+        BitField<6, 1, u64> neg_b;
         BitField<7, 1, u64> abs_a;
         BitField<39, 3, u64> pred39;
         BitField<42, 1, u64> neg_pred;
@@ -1006,7 +1020,6 @@ union Instruction {
     } iset;
 
     union {
-        BitField<41, 2, u64> selector; // i2i and i2f only
         BitField<45, 1, u64> negate_a;
         BitField<49, 1, u64> abs_a;
         BitField<10, 2, Register::Size> src_size;
@@ -1032,6 +1045,13 @@ union Instruction {
             }
         } f2f;
 
+        union {
+            BitField<41, 2, u64> selector;
+        } int_src;
+
+        union {
+            BitField<41, 1, u64> selector;
+        } float_src;
     } conversion;
 
     union {
@@ -1487,6 +1507,7 @@ public:
         SYNC,
         BRK,
         DEPBAR,
+        VOTE,
         BFE_C,
         BFE_R,
         BFE_IMM,
@@ -1649,6 +1670,7 @@ public:
         Hfma2,
         Flow,
         Synch,
+        Warp,
         Memory,
         Texture,
         Image,
@@ -1775,6 +1797,7 @@ private:
             INST("111000110100---", Id::BRK, Type::Flow, "BRK"),
             INST("111000110000----", Id::EXIT, Type::Flow, "EXIT"),
             INST("1111000011110---", Id::DEPBAR, Type::Synch, "DEPBAR"),
+            INST("0101000011011---", Id::VOTE, Type::Warp, "VOTE"),
             INST("1110111111011---", Id::LD_A, Type::Memory, "LD_A"),
             INST("1110111101001---", Id::LD_S, Type::Memory, "LD_S"),
             INST("1110111101000---", Id::LD_L, Type::Memory, "LD_L"),
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index c409af194..2c47541cb 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -17,27 +17,15 @@
 
 namespace Tegra {
 
-u32 FramebufferConfig::BytesPerPixel(PixelFormat format) {
-    switch (format) {
-    case PixelFormat::ABGR8:
-    case PixelFormat::BGRA8:
-        return 4;
-    default:
-        return 4;
-    }
-
-    UNREACHABLE();
-}
-
 GPU::GPU(Core::System& system, VideoCore::RendererBase& renderer, bool is_async)
     : system{system}, renderer{renderer}, is_async{is_async} {
     auto& rasterizer{renderer.Rasterizer()};
     memory_manager = std::make_unique<Tegra::MemoryManager>(system, rasterizer);
     dma_pusher = std::make_unique<Tegra::DmaPusher>(*this);
     maxwell_3d = std::make_unique<Engines::Maxwell3D>(system, rasterizer, *memory_manager);
-    fermi_2d = std::make_unique<Engines::Fermi2D>(rasterizer, *memory_manager);
+    fermi_2d = std::make_unique<Engines::Fermi2D>(rasterizer);
     kepler_compute = std::make_unique<Engines::KeplerCompute>(system, rasterizer, *memory_manager);
-    maxwell_dma = std::make_unique<Engines::MaxwellDMA>(system, rasterizer, *memory_manager);
+    maxwell_dma = std::make_unique<Engines::MaxwellDMA>(system, *memory_manager);
     kepler_memory = std::make_unique<Engines::KeplerMemory>(system, *memory_manager);
 }
 
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index 11857ff99..78bc0601a 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -19,6 +19,10 @@ inline CacheAddr ToCacheAddr(const void* host_ptr) {
     return reinterpret_cast<CacheAddr>(host_ptr);
 }
 
+inline u8* FromCacheAddr(CacheAddr cache_addr) { // Inverse of ToCacheAddr: address -> pointer.
+    return reinterpret_cast<u8*>(cache_addr);
+}
+
 namespace Core {
 class System;
 }
@@ -91,14 +95,10 @@ class DebugContext;
 struct FramebufferConfig {
     enum class PixelFormat : u32 {
         ABGR8 = 1,
+        RGB565 = 4,
         BGRA8 = 5,
     };
 
-    /**
-     * Returns the number of bytes per pixel.
-     */
-    static u32 BytesPerPixel(PixelFormat format);
-
     VAddr address;
     u32 offset;
     u32 width;
@@ -249,8 +249,7 @@ public:
     virtual void PushGPUEntries(Tegra::CommandList&& entries) = 0;
 
     /// Swap buffers (render frame)
-    virtual void SwapBuffers(
-        std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) = 0;
+    virtual void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) = 0;
 
     /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
     virtual void FlushRegion(CacheAddr addr, u64 size) = 0;
@@ -281,8 +280,8 @@ private:
 
 protected:
     std::unique_ptr<Tegra::DmaPusher> dma_pusher;
-    VideoCore::RendererBase& renderer;
     Core::System& system;
+    VideoCore::RendererBase& renderer;
 
 private:
     std::unique_ptr<Tegra::MemoryManager> memory_manager;
diff --git a/src/video_core/gpu_asynch.cpp b/src/video_core/gpu_asynch.cpp
index ea67be831..f2a3a390e 100644
--- a/src/video_core/gpu_asynch.cpp
+++ b/src/video_core/gpu_asynch.cpp
@@ -23,9 +23,8 @@ void GPUAsynch::PushGPUEntries(Tegra::CommandList&& entries) {
     gpu_thread.SubmitList(std::move(entries));
 }
 
-void GPUAsynch::SwapBuffers(
-    std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) {
-    gpu_thread.SwapBuffers(std::move(framebuffer));
+void GPUAsynch::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
+    gpu_thread.SwapBuffers(framebuffer);
 }
 
 void GPUAsynch::FlushRegion(CacheAddr addr, u64 size) {
diff --git a/src/video_core/gpu_asynch.h b/src/video_core/gpu_asynch.h
index 36377d677..a12f9bac4 100644
--- a/src/video_core/gpu_asynch.h
+++ b/src/video_core/gpu_asynch.h
@@ -14,15 +14,14 @@ class RendererBase;
 namespace VideoCommon {
 
 /// Implementation of GPU interface that runs the GPU asynchronously
-class GPUAsynch : public Tegra::GPU {
+class GPUAsynch final : public Tegra::GPU {
 public:
     explicit GPUAsynch(Core::System& system, VideoCore::RendererBase& renderer);
     ~GPUAsynch() override;
 
     void Start() override;
     void PushGPUEntries(Tegra::CommandList&& entries) override;
-    void SwapBuffers(
-        std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) override;
+    void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override;
     void FlushRegion(CacheAddr addr, u64 size) override;
     void InvalidateRegion(CacheAddr addr, u64 size) override;
     void FlushAndInvalidateRegion(CacheAddr addr, u64 size) override;
diff --git a/src/video_core/gpu_synch.cpp b/src/video_core/gpu_synch.cpp
index d4ead9c47..d48221077 100644
--- a/src/video_core/gpu_synch.cpp
+++ b/src/video_core/gpu_synch.cpp
@@ -19,9 +19,8 @@ void GPUSynch::PushGPUEntries(Tegra::CommandList&& entries) {
     dma_pusher->DispatchCalls();
 }
 
-void GPUSynch::SwapBuffers(
-    std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) {
-    renderer.SwapBuffers(std::move(framebuffer));
+void GPUSynch::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
+    renderer.SwapBuffers(framebuffer);
 }
 
 void GPUSynch::FlushRegion(CacheAddr addr, u64 size) {
diff --git a/src/video_core/gpu_synch.h b/src/video_core/gpu_synch.h
index 07bcc47f1..5eb1c461c 100644
--- a/src/video_core/gpu_synch.h
+++ b/src/video_core/gpu_synch.h
@@ -13,15 +13,14 @@ class RendererBase;
 namespace VideoCommon {
 
 /// Implementation of GPU interface that runs the GPU synchronously
-class GPUSynch : public Tegra::GPU {
+class GPUSynch final : public Tegra::GPU {
 public:
     explicit GPUSynch(Core::System& system, VideoCore::RendererBase& renderer);
     ~GPUSynch() override;
 
     void Start() override;
     void PushGPUEntries(Tegra::CommandList&& entries) override;
-    void SwapBuffers(
-        std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) override;
+    void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override;
     void FlushRegion(CacheAddr addr, u64 size) override;
     void InvalidateRegion(CacheAddr addr, u64 size) override;
     void FlushAndInvalidateRegion(CacheAddr addr, u64 size) override;
diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp
index b441e92b0..5f039e4fd 100644
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -39,7 +39,7 @@ static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_p
                 dma_pusher.Push(std::move(submit_list->entries));
                 dma_pusher.DispatchCalls();
             } else if (const auto data = std::get_if<SwapBuffersCommand>(&next.data)) {
-                renderer.SwapBuffers(std::move(data->framebuffer));
+                renderer.SwapBuffers(data->framebuffer ? &*data->framebuffer : nullptr);
             } else if (const auto data = std::get_if<FlushRegionCommand>(&next.data)) {
                 renderer.Rasterizer().FlushRegion(data->addr, data->size);
             } else if (const auto data = std::get_if<InvalidateRegionCommand>(&next.data)) {
@@ -78,9 +78,9 @@ void ThreadManager::SubmitList(Tegra::CommandList&& entries) {
     system.CoreTiming().ScheduleEvent(synchronization_ticks, synchronization_event, fence);
 }
 
-void ThreadManager::SwapBuffers(
-    std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) {
-    PushCommand(SwapBuffersCommand(std::move(framebuffer)));
+void ThreadManager::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
+    using OptionalConfig = std::optional<const Tegra::FramebufferConfig>;
+    PushCommand(SwapBuffersCommand(framebuffer ? OptionalConfig{*framebuffer} : OptionalConfig{}));
 }
 
 void ThreadManager::FlushRegion(CacheAddr addr, u64 size) {
diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h
index 1d9d0c39e..3ae0ec9f3 100644
--- a/src/video_core/gpu_thread.h
+++ b/src/video_core/gpu_thread.h
@@ -110,8 +110,7 @@ public:
     void SubmitList(Tegra::CommandList&& entries);
 
     /// Swap buffers (render frame)
-    void SwapBuffers(
-        std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer);
+    void SwapBuffers(const Tegra::FramebufferConfig* framebuffer);
 
     /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
     void FlushRegion(CacheAddr addr, u64 size);
diff --git a/src/video_core/morton.cpp b/src/video_core/morton.cpp
index 3e91cbc83..084f85e67 100644
--- a/src/video_core/morton.cpp
+++ b/src/video_core/morton.cpp
@@ -25,8 +25,8 @@ static void MortonCopy(u32 stride, u32 block_height, u32 height, u32 block_depth
 
     // With the BCn formats (DXT and DXN), each 4x4 tile is swizzled instead of just individual
     // pixel values.
-    const u32 tile_size_x{GetDefaultBlockWidth(format)};
-    const u32 tile_size_y{GetDefaultBlockHeight(format)};
+    constexpr u32 tile_size_x{GetDefaultBlockWidth(format)};
+    constexpr u32 tile_size_y{GetDefaultBlockHeight(format)};
 
     if constexpr (morton_to_linear) {
         Tegra::Texture::UnswizzleTexture(buffer, addr, tile_size_x, tile_size_y, bytes_per_pixel,
@@ -186,99 +186,6 @@ static MortonCopyFn GetSwizzleFunction(MortonSwizzleMode mode, Surface::PixelFor
     return morton_to_linear_fns[static_cast<std::size_t>(format)];
 }
 
-static u32 MortonInterleave128(u32 x, u32 y) {
-    // 128x128 Z-Order coordinate from 2D coordinates
-    static constexpr u32 xlut[] = {
-        0x0000, 0x0001, 0x0002, 0x0003, 0x0008, 0x0009, 0x000a, 0x000b, 0x0040, 0x0041, 0x0042,
-        0x0043, 0x0048, 0x0049, 0x004a, 0x004b, 0x0800, 0x0801, 0x0802, 0x0803, 0x0808, 0x0809,
-        0x080a, 0x080b, 0x0840, 0x0841, 0x0842, 0x0843, 0x0848, 0x0849, 0x084a, 0x084b, 0x1000,
-        0x1001, 0x1002, 0x1003, 0x1008, 0x1009, 0x100a, 0x100b, 0x1040, 0x1041, 0x1042, 0x1043,
-        0x1048, 0x1049, 0x104a, 0x104b, 0x1800, 0x1801, 0x1802, 0x1803, 0x1808, 0x1809, 0x180a,
-        0x180b, 0x1840, 0x1841, 0x1842, 0x1843, 0x1848, 0x1849, 0x184a, 0x184b, 0x2000, 0x2001,
-        0x2002, 0x2003, 0x2008, 0x2009, 0x200a, 0x200b, 0x2040, 0x2041, 0x2042, 0x2043, 0x2048,
-        0x2049, 0x204a, 0x204b, 0x2800, 0x2801, 0x2802, 0x2803, 0x2808, 0x2809, 0x280a, 0x280b,
-        0x2840, 0x2841, 0x2842, 0x2843, 0x2848, 0x2849, 0x284a, 0x284b, 0x3000, 0x3001, 0x3002,
-        0x3003, 0x3008, 0x3009, 0x300a, 0x300b, 0x3040, 0x3041, 0x3042, 0x3043, 0x3048, 0x3049,
-        0x304a, 0x304b, 0x3800, 0x3801, 0x3802, 0x3803, 0x3808, 0x3809, 0x380a, 0x380b, 0x3840,
-        0x3841, 0x3842, 0x3843, 0x3848, 0x3849, 0x384a, 0x384b, 0x0000, 0x0001, 0x0002, 0x0003,
-        0x0008, 0x0009, 0x000a, 0x000b, 0x0040, 0x0041, 0x0042, 0x0043, 0x0048, 0x0049, 0x004a,
-        0x004b, 0x0800, 0x0801, 0x0802, 0x0803, 0x0808, 0x0809, 0x080a, 0x080b, 0x0840, 0x0841,
-        0x0842, 0x0843, 0x0848, 0x0849, 0x084a, 0x084b, 0x1000, 0x1001, 0x1002, 0x1003, 0x1008,
-        0x1009, 0x100a, 0x100b, 0x1040, 0x1041, 0x1042, 0x1043, 0x1048, 0x1049, 0x104a, 0x104b,
-        0x1800, 0x1801, 0x1802, 0x1803, 0x1808, 0x1809, 0x180a, 0x180b, 0x1840, 0x1841, 0x1842,
-        0x1843, 0x1848, 0x1849, 0x184a, 0x184b, 0x2000, 0x2001, 0x2002, 0x2003, 0x2008, 0x2009,
-        0x200a, 0x200b, 0x2040, 0x2041, 0x2042, 0x2043, 0x2048, 0x2049, 0x204a, 0x204b, 0x2800,
-        0x2801, 0x2802, 0x2803, 0x2808, 0x2809, 0x280a, 0x280b, 0x2840, 0x2841, 0x2842, 0x2843,
-        0x2848, 0x2849, 0x284a, 0x284b, 0x3000, 0x3001, 0x3002, 0x3003, 0x3008, 0x3009, 0x300a,
-        0x300b, 0x3040, 0x3041, 0x3042, 0x3043, 0x3048, 0x3049, 0x304a, 0x304b, 0x3800, 0x3801,
-        0x3802, 0x3803, 0x3808, 0x3809, 0x380a, 0x380b, 0x3840, 0x3841, 0x3842, 0x3843, 0x3848,
-        0x3849, 0x384a, 0x384b, 0x0000, 0x0001, 0x0002, 0x0003, 0x0008, 0x0009, 0x000a, 0x000b,
-        0x0040, 0x0041, 0x0042, 0x0043, 0x0048, 0x0049, 0x004a, 0x004b, 0x0800, 0x0801, 0x0802,
-        0x0803, 0x0808, 0x0809, 0x080a, 0x080b, 0x0840, 0x0841, 0x0842, 0x0843, 0x0848, 0x0849,
-        0x084a, 0x084b, 0x1000, 0x1001, 0x1002, 0x1003, 0x1008, 0x1009, 0x100a, 0x100b, 0x1040,
-        0x1041, 0x1042, 0x1043, 0x1048, 0x1049, 0x104a, 0x104b, 0x1800, 0x1801, 0x1802, 0x1803,
-        0x1808, 0x1809, 0x180a, 0x180b, 0x1840, 0x1841, 0x1842, 0x1843, 0x1848, 0x1849, 0x184a,
-        0x184b, 0x2000, 0x2001, 0x2002, 0x2003, 0x2008, 0x2009, 0x200a, 0x200b, 0x2040, 0x2041,
-        0x2042, 0x2043, 0x2048, 0x2049, 0x204a, 0x204b, 0x2800, 0x2801, 0x2802, 0x2803, 0x2808,
-        0x2809, 0x280a, 0x280b, 0x2840, 0x2841, 0x2842, 0x2843, 0x2848, 0x2849, 0x284a, 0x284b,
-        0x3000, 0x3001, 0x3002, 0x3003, 0x3008, 0x3009, 0x300a, 0x300b, 0x3040, 0x3041, 0x3042,
-        0x3043, 0x3048, 0x3049, 0x304a, 0x304b, 0x3800, 0x3801, 0x3802, 0x3803, 0x3808, 0x3809,
-        0x380a, 0x380b, 0x3840, 0x3841, 0x3842, 0x3843, 0x3848, 0x3849, 0x384a, 0x384b,
-    };
-    static constexpr u32 ylut[] = {
-        0x0000, 0x0004, 0x0010, 0x0014, 0x0020, 0x0024, 0x0030, 0x0034, 0x0080, 0x0084, 0x0090,
-        0x0094, 0x00a0, 0x00a4, 0x00b0, 0x00b4, 0x0100, 0x0104, 0x0110, 0x0114, 0x0120, 0x0124,
-        0x0130, 0x0134, 0x0180, 0x0184, 0x0190, 0x0194, 0x01a0, 0x01a4, 0x01b0, 0x01b4, 0x0200,
-        0x0204, 0x0210, 0x0214, 0x0220, 0x0224, 0x0230, 0x0234, 0x0280, 0x0284, 0x0290, 0x0294,
-        0x02a0, 0x02a4, 0x02b0, 0x02b4, 0x0300, 0x0304, 0x0310, 0x0314, 0x0320, 0x0324, 0x0330,
-        0x0334, 0x0380, 0x0384, 0x0390, 0x0394, 0x03a0, 0x03a4, 0x03b0, 0x03b4, 0x0400, 0x0404,
-        0x0410, 0x0414, 0x0420, 0x0424, 0x0430, 0x0434, 0x0480, 0x0484, 0x0490, 0x0494, 0x04a0,
-        0x04a4, 0x04b0, 0x04b4, 0x0500, 0x0504, 0x0510, 0x0514, 0x0520, 0x0524, 0x0530, 0x0534,
-        0x0580, 0x0584, 0x0590, 0x0594, 0x05a0, 0x05a4, 0x05b0, 0x05b4, 0x0600, 0x0604, 0x0610,
-        0x0614, 0x0620, 0x0624, 0x0630, 0x0634, 0x0680, 0x0684, 0x0690, 0x0694, 0x06a0, 0x06a4,
-        0x06b0, 0x06b4, 0x0700, 0x0704, 0x0710, 0x0714, 0x0720, 0x0724, 0x0730, 0x0734, 0x0780,
-        0x0784, 0x0790, 0x0794, 0x07a0, 0x07a4, 0x07b0, 0x07b4, 0x0000, 0x0004, 0x0010, 0x0014,
-        0x0020, 0x0024, 0x0030, 0x0034, 0x0080, 0x0084, 0x0090, 0x0094, 0x00a0, 0x00a4, 0x00b0,
-        0x00b4, 0x0100, 0x0104, 0x0110, 0x0114, 0x0120, 0x0124, 0x0130, 0x0134, 0x0180, 0x0184,
-        0x0190, 0x0194, 0x01a0, 0x01a4, 0x01b0, 0x01b4, 0x0200, 0x0204, 0x0210, 0x0214, 0x0220,
-        0x0224, 0x0230, 0x0234, 0x0280, 0x0284, 0x0290, 0x0294, 0x02a0, 0x02a4, 0x02b0, 0x02b4,
-        0x0300, 0x0304, 0x0310, 0x0314, 0x0320, 0x0324, 0x0330, 0x0334, 0x0380, 0x0384, 0x0390,
-        0x0394, 0x03a0, 0x03a4, 0x03b0, 0x03b4, 0x0400, 0x0404, 0x0410, 0x0414, 0x0420, 0x0424,
-        0x0430, 0x0434, 0x0480, 0x0484, 0x0490, 0x0494, 0x04a0, 0x04a4, 0x04b0, 0x04b4, 0x0500,
-        0x0504, 0x0510, 0x0514, 0x0520, 0x0524, 0x0530, 0x0534, 0x0580, 0x0584, 0x0590, 0x0594,
-        0x05a0, 0x05a4, 0x05b0, 0x05b4, 0x0600, 0x0604, 0x0610, 0x0614, 0x0620, 0x0624, 0x0630,
-        0x0634, 0x0680, 0x0684, 0x0690, 0x0694, 0x06a0, 0x06a4, 0x06b0, 0x06b4, 0x0700, 0x0704,
-        0x0710, 0x0714, 0x0720, 0x0724, 0x0730, 0x0734, 0x0780, 0x0784, 0x0790, 0x0794, 0x07a0,
-        0x07a4, 0x07b0, 0x07b4, 0x0000, 0x0004, 0x0010, 0x0014, 0x0020, 0x0024, 0x0030, 0x0034,
-        0x0080, 0x0084, 0x0090, 0x0094, 0x00a0, 0x00a4, 0x00b0, 0x00b4, 0x0100, 0x0104, 0x0110,
-        0x0114, 0x0120, 0x0124, 0x0130, 0x0134, 0x0180, 0x0184, 0x0190, 0x0194, 0x01a0, 0x01a4,
-        0x01b0, 0x01b4, 0x0200, 0x0204, 0x0210, 0x0214, 0x0220, 0x0224, 0x0230, 0x0234, 0x0280,
-        0x0284, 0x0290, 0x0294, 0x02a0, 0x02a4, 0x02b0, 0x02b4, 0x0300, 0x0304, 0x0310, 0x0314,
-        0x0320, 0x0324, 0x0330, 0x0334, 0x0380, 0x0384, 0x0390, 0x0394, 0x03a0, 0x03a4, 0x03b0,
-        0x03b4, 0x0400, 0x0404, 0x0410, 0x0414, 0x0420, 0x0424, 0x0430, 0x0434, 0x0480, 0x0484,
-        0x0490, 0x0494, 0x04a0, 0x04a4, 0x04b0, 0x04b4, 0x0500, 0x0504, 0x0510, 0x0514, 0x0520,
-        0x0524, 0x0530, 0x0534, 0x0580, 0x0584, 0x0590, 0x0594, 0x05a0, 0x05a4, 0x05b0, 0x05b4,
-        0x0600, 0x0604, 0x0610, 0x0614, 0x0620, 0x0624, 0x0630, 0x0634, 0x0680, 0x0684, 0x0690,
-        0x0694, 0x06a0, 0x06a4, 0x06b0, 0x06b4, 0x0700, 0x0704, 0x0710, 0x0714, 0x0720, 0x0724,
-        0x0730, 0x0734, 0x0780, 0x0784, 0x0790, 0x0794, 0x07a0, 0x07a4, 0x07b0, 0x07b4,
-    };
-    return xlut[x % 128] + ylut[y % 128];
-}
-
-static u32 GetMortonOffset128(u32 x, u32 y, u32 bytes_per_pixel) {
-    // Calculates the offset of the position of the pixel in Morton order
-    // Framebuffer images are split into 128x128 tiles.
-
-    constexpr u32 block_height = 128;
-    const u32 coarse_x = x & ~127;
-
-    const u32 i = MortonInterleave128(x, y);
-
-    const u32 offset = coarse_x * block_height;
-
-    return (i + offset) * bytes_per_pixel;
-}
-
 void MortonSwizzle(MortonSwizzleMode mode, Surface::PixelFormat format, u32 stride,
                    u32 block_height, u32 height, u32 block_depth, u32 depth, u32 tile_width_spacing,
                    u8* buffer, u8* addr) {
@@ -286,23 +193,4 @@ void MortonSwizzle(MortonSwizzleMode mode, Surface::PixelFormat format, u32 stri
                                      tile_width_spacing, buffer, addr);
 }
 
-void MortonCopyPixels128(MortonSwizzleMode mode, u32 width, u32 height, u32 bytes_per_pixel,
-                         u32 linear_bytes_per_pixel, u8* morton_data, u8* linear_data) {
-    const bool morton_to_linear = mode == MortonSwizzleMode::MortonToLinear;
-    u8* data_ptrs[2];
-    for (u32 y = 0; y < height; ++y) {
-        for (u32 x = 0; x < width; ++x) {
-            const u32 coarse_y = y & ~127;
-            const u32 morton_offset =
-                GetMortonOffset128(x, y, bytes_per_pixel) + coarse_y * width * bytes_per_pixel;
-            const u32 linear_pixel_index = (x + y * width) * linear_bytes_per_pixel;
-
-            data_ptrs[morton_to_linear ? 1 : 0] = morton_data + morton_offset;
-            data_ptrs[morton_to_linear ? 0 : 1] = &linear_data[linear_pixel_index];
-
-            std::memcpy(data_ptrs[0], data_ptrs[1], bytes_per_pixel);
-        }
-    }
-}
-
 } // namespace VideoCore
diff --git a/src/video_core/morton.h b/src/video_core/morton.h
index ee5b45555..b714a7e3f 100644
--- a/src/video_core/morton.h
+++ b/src/video_core/morton.h
@@ -15,7 +15,4 @@ void MortonSwizzle(MortonSwizzleMode mode, VideoCore::Surface::PixelFormat forma
                    u32 block_height, u32 height, u32 block_depth, u32 depth, u32 tile_width_spacing,
                    u8* buffer, u8* addr);
 
-void MortonCopyPixels128(MortonSwizzleMode mode, u32 width, u32 height, u32 bytes_per_pixel,
-                         u32 linear_bytes_per_pixel, u8* morton_data, u8* linear_data);
-
 } // namespace VideoCore
diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h
index 6e44d51cf..6b3f2d50a 100644
--- a/src/video_core/rasterizer_interface.h
+++ b/src/video_core/rasterizer_interface.h
@@ -50,7 +50,7 @@ public:
     /// and invalidated
     virtual void FlushAndInvalidateRegion(CacheAddr addr, u64 size) = 0;
 
-    // Notify the rasterizer to send all written commands to the host GPU.
+    /// Notify the rasterizer to send all written commands to the host GPU.
     virtual void FlushCommands() = 0;
 
     /// Notify rasterizer that a frame is about to finish
diff --git a/src/video_core/renderer_base.h b/src/video_core/renderer_base.h
index 1d54c3723..af1bebc4f 100644
--- a/src/video_core/renderer_base.h
+++ b/src/video_core/renderer_base.h
@@ -36,8 +36,7 @@ public:
     virtual ~RendererBase();
 
     /// Swap buffers (render frame)
-    virtual void SwapBuffers(
-        std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) = 0;
+    virtual void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) = 0;
 
     /// Initialize the renderer
     virtual bool Init() = 0;
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
index 2a9b523f5..f8a807c84 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -7,28 +7,41 @@
 #include <glad/glad.h>
 
 #include "common/assert.h"
+#include "common/microprofile.h"
+#include "video_core/rasterizer_interface.h"
 #include "video_core/renderer_opengl/gl_buffer_cache.h"
 #include "video_core/renderer_opengl/gl_rasterizer.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 
 namespace OpenGL {
 
+MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128));
+
+CachedBufferBlock::CachedBufferBlock(CacheAddr cache_addr, const std::size_t size)
+    : VideoCommon::BufferBlock{cache_addr, size} { // Forwards address and size to the base.
+    gl_buffer.Create(); // Storage is allocated below with no initial data (nullptr).
+    glNamedBufferData(gl_buffer.handle, static_cast<GLsizeiptr>(size), nullptr, GL_DYNAMIC_DRAW);
+}
+
+CachedBufferBlock::~CachedBufferBlock() = default;
+
 OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
                                std::size_t stream_size)
-    : VideoCommon::BufferCache<OGLBuffer, GLuint, OGLStreamBuffer>{
+    : VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>{
           rasterizer, system, std::make_unique<OGLStreamBuffer>(stream_size, true)} {}
 
 OGLBufferCache::~OGLBufferCache() = default;
 
-OGLBuffer OGLBufferCache::CreateBuffer(std::size_t size) {
-    OGLBuffer buffer;
-    buffer.Create();
-    glNamedBufferData(buffer.handle, static_cast<GLsizeiptr>(size), nullptr, GL_DYNAMIC_DRAW);
-    return buffer;
+Buffer OGLBufferCache::CreateBlock(CacheAddr cache_addr, std::size_t size) {
+    return std::make_shared<CachedBufferBlock>(cache_addr, size); // Buffer is a shared_ptr.
+}
+
+void OGLBufferCache::WriteBarrier() {
+    glMemoryBarrier(GL_ALL_BARRIER_BITS); // Requests every barrier bit, not a specific subset.
 }
 
-const GLuint* OGLBufferCache::ToHandle(const OGLBuffer& buffer) {
-    return &buffer.handle;
+const GLuint* OGLBufferCache::ToHandle(const Buffer& buffer) {
+    return buffer->GetHandle(); // Points into the block, so it is valid only while it lives.
 }
 
 const GLuint* OGLBufferCache::GetEmptyBuffer(std::size_t) {
@@ -36,23 +49,24 @@ const GLuint* OGLBufferCache::GetEmptyBuffer(std::size_t) {
     return &null_buffer;
 }
 
-void OGLBufferCache::UploadBufferData(const OGLBuffer& buffer, std::size_t offset, std::size_t size,
-                                      const u8* data) {
-    glNamedBufferSubData(buffer.handle, static_cast<GLintptr>(offset),
+void OGLBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
+                                     const u8* data) { // Writes `size` bytes from `data`.
+    glNamedBufferSubData(*buffer->GetHandle(), static_cast<GLintptr>(offset),
                          static_cast<GLsizeiptr>(size), data);
 }
 
-void OGLBufferCache::DownloadBufferData(const OGLBuffer& buffer, std::size_t offset,
-                                        std::size_t size, u8* data) {
-    glGetNamedBufferSubData(buffer.handle, static_cast<GLintptr>(offset),
+void OGLBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
+                                       u8* data) { // Reads `size` bytes back into `data`.
+    MICROPROFILE_SCOPE(OpenGL_Buffer_Download); // Times the readback under "Buffer Download".
+    glGetNamedBufferSubData(*buffer->GetHandle(), static_cast<GLintptr>(offset),
                             static_cast<GLsizeiptr>(size), data);
 }
 
-void OGLBufferCache::CopyBufferData(const OGLBuffer& src, const OGLBuffer& dst,
-                                    std::size_t src_offset, std::size_t dst_offset,
-                                    std::size_t size) {
-    glCopyNamedBufferSubData(src.handle, dst.handle, static_cast<GLintptr>(src_offset),
-                             static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(size));
+void OGLBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
+                               std::size_t dst_offset, std::size_t size) {
+    glCopyNamedBufferSubData(*src->GetHandle(), *dst->GetHandle(),
+                             static_cast<GLintptr>(src_offset), static_cast<GLintptr>(dst_offset),
+                             static_cast<GLsizeiptr>(size)); // Copies `size` bytes src -> dst.
 }
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h
index 8c8ac4038..022e7bfa9 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.h
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.h
@@ -7,7 +7,7 @@
 #include <memory>
 
 #include "common/common_types.h"
-#include "video_core/buffer_cache.h"
+#include "video_core/buffer_cache/buffer_cache.h"
 #include "video_core/rasterizer_cache.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 #include "video_core/renderer_opengl/gl_stream_buffer.h"
@@ -21,7 +21,24 @@ namespace OpenGL {
 class OGLStreamBuffer;
 class RasterizerOpenGL;
 
-class OGLBufferCache final : public VideoCommon::BufferCache<OGLBuffer, GLuint, OGLStreamBuffer> {
+class CachedBufferBlock;
+
+using Buffer = std::shared_ptr<CachedBufferBlock>;
+
+class CachedBufferBlock : public VideoCommon::BufferBlock { // Block holding one OGLBuffer.
+public:
+    explicit CachedBufferBlock(CacheAddr cache_addr, const std::size_t size);
+    ~CachedBufferBlock();
+
+    const GLuint* GetHandle() const { // Address of the member buffer's GL handle.
+        return &gl_buffer.handle;
+    }
+
+private:
+    OGLBuffer gl_buffer{}; // GL buffer object held by value for the block's lifetime.
+};
+
+class OGLBufferCache final : public VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer> {
 public:
     explicit OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
                             std::size_t stream_size);
@@ -30,18 +47,20 @@ public:
     const GLuint* GetEmptyBuffer(std::size_t) override;
 
 protected:
-    OGLBuffer CreateBuffer(std::size_t size) override;
+    Buffer CreateBlock(CacheAddr cache_addr, std::size_t size) override;
+
+    void WriteBarrier() override;
 
-    const GLuint* ToHandle(const OGLBuffer& buffer) override;
+    const GLuint* ToHandle(const Buffer& buffer) override;
 
-    void UploadBufferData(const OGLBuffer& buffer, std::size_t offset, std::size_t size,
-                          const u8* data) override;
+    void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
+                         const u8* data) override;
 
-    void DownloadBufferData(const OGLBuffer& buffer, std::size_t offset, std::size_t size,
-                            u8* data) override;
+    void DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
+                           u8* data) override;
 
-    void CopyBufferData(const OGLBuffer& src, const OGLBuffer& dst, std::size_t src_offset,
-                        std::size_t dst_offset, std::size_t size) override;
+    void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
+                   std::size_t dst_offset, std::size_t size) override;
 };
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp
index 85424a4c9..4f59a87b4 100644
--- a/src/video_core/renderer_opengl/gl_device.cpp
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -14,12 +14,22 @@
 namespace OpenGL {
 
 namespace {
+
 template <typename T>
 T GetInteger(GLenum pname) {
     GLint temporary;
     glGetIntegerv(pname, &temporary);
     return static_cast<T>(temporary);
 }
+
+bool TestProgram(const GLchar* glsl) { // Returns true if `glsl` links as a vertex program.
+    const GLuint shader{glCreateShaderProgramv(GL_VERTEX_SHADER, 1, &glsl)};
+    GLint link_status{}; // Stays GL_FALSE if the query below errors out (e.g. shader == 0).
+    glGetProgramiv(shader, GL_LINK_STATUS, &link_status);
+    glDeleteProgram(shader);
+    return link_status == GL_TRUE;
+}
+
 } // Anonymous namespace
 
 Device::Device() {
@@ -27,42 +37,41 @@ Device::Device() {
     shader_storage_alignment = GetInteger<std::size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT);
     max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS);
     max_varyings = GetInteger<u32>(GL_MAX_VARYING_VECTORS);
+    has_warp_intrinsics = GLAD_GL_NV_gpu_shader5 && GLAD_GL_NV_shader_thread_group &&
+                          GLAD_GL_NV_shader_thread_shuffle;
     has_vertex_viewport_layer = GLAD_GL_ARB_shader_viewport_layer_array;
     has_variable_aoffi = TestVariableAoffi();
     has_component_indexing_bug = TestComponentIndexingBug();
+    has_precise_bug = TestPreciseBug();
+
+    LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", has_variable_aoffi);
+    LOG_INFO(Render_OpenGL, "Renderer_ComponentIndexingBug: {}", has_component_indexing_bug);
+    LOG_INFO(Render_OpenGL, "Renderer_PreciseBug: {}", has_precise_bug);
 }
 
 Device::Device(std::nullptr_t) {
     uniform_buffer_alignment = 0;
     max_vertex_attributes = 16;
     max_varyings = 15;
+    has_warp_intrinsics = true;
     has_vertex_viewport_layer = true;
     has_variable_aoffi = true;
     has_component_indexing_bug = false;
+    has_precise_bug = false;
 }
 
 bool Device::TestVariableAoffi() {
-    const GLchar* AOFFI_TEST = R"(#version 430 core
+    return TestProgram(R"(#version 430 core
 // This is a unit test, please ignore me on apitrace bug reports.
 uniform sampler2D tex;
 uniform ivec2 variable_offset;
 out vec4 output_attribute;
 void main() {
     output_attribute = textureOffset(tex, vec2(0), variable_offset);
-}
-)";
-    const GLuint shader{glCreateShaderProgramv(GL_VERTEX_SHADER, 1, &AOFFI_TEST)};
-    GLint link_status{};
-    glGetProgramiv(shader, GL_LINK_STATUS, &link_status);
-    glDeleteProgram(shader);
-
-    const bool supported{link_status == GL_TRUE};
-    LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", supported);
-    return supported;
+})");
 }
 
 bool Device::TestComponentIndexingBug() {
-    constexpr char log_message[] = "Renderer_ComponentIndexingBug: {}";
     const GLchar* COMPONENT_TEST = R"(#version 430 core
 layout (std430, binding = 0) buffer OutputBuffer {
     uint output_value;
@@ -102,12 +111,21 @@ void main() {
         GLuint result;
         glGetNamedBufferSubData(ssbo.handle, 0, sizeof(result), &result);
         if (result != values.at(index)) {
-            LOG_INFO(Render_OpenGL, log_message, true);
             return true;
         }
     }
-    LOG_INFO(Render_OpenGL, log_message, false);
     return false;
 }
 
+bool Device::TestPreciseBug() { // True when the driver fails to link the shader below.
+    return !TestProgram(R"(#version 430 core
+in vec3 coords;
+out float out_value;
+uniform sampler2DShadow tex;
+void main() {
+    precise float tmp_value = vec4(texture(tex, coords)).x;
+    out_value = tmp_value;
+})"); // Negated on purpose: a failed link is what signals the bug.
+}
+
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h
index dc883722d..ba6dcd3be 100644
--- a/src/video_core/renderer_opengl/gl_device.h
+++ b/src/video_core/renderer_opengl/gl_device.h
@@ -30,6 +30,10 @@ public:
         return max_varyings;
     }
 
+    bool HasWarpIntrinsics() const { // Flag fixed at construction; not re-queried here.
+        return has_warp_intrinsics;
+    }
+
     bool HasVertexViewportLayer() const {
         return has_vertex_viewport_layer;
     }
@@ -42,17 +46,24 @@ public:
         return has_component_indexing_bug;
     }
 
+    bool HasPreciseBug() const { // Flag fixed at construction; see TestPreciseBug().
+        return has_precise_bug;
+    }
+
 private:
     static bool TestVariableAoffi();
     static bool TestComponentIndexingBug();
+    static bool TestPreciseBug();
 
     std::size_t uniform_buffer_alignment{};
     std::size_t shader_storage_alignment{};
     u32 max_vertex_attributes{};
     u32 max_varyings{};
+    bool has_warp_intrinsics{};
     bool has_vertex_viewport_layer{};
     bool has_variable_aoffi{};
     bool has_component_indexing_bug{};
+    bool has_precise_bug{};
 };
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 80cfda7e4..bb09ecd52 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -708,8 +708,6 @@ void RasterizerOpenGL::DrawArrays() {
         return;
     }
 
-    const auto& regs = gpu.regs;
-
     SyncColorMask();
     SyncFragmentColorClampState();
     SyncMultiSampleState();
@@ -980,7 +978,7 @@ void RasterizerOpenGL::SetupGlobalMemory(const GLShader::GlobalMemoryEntry& entr
                                          GPUVAddr gpu_addr, std::size_t size) {
     const auto alignment{device.GetShaderStorageBufferAlignment()};
     const auto [ssbo, buffer_offset] =
-        buffer_cache.UploadMemory(gpu_addr, size, alignment, true, entry.IsWritten());
+        buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.IsWritten());
     bind_ssbo_pushbuffer.Push(ssbo, buffer_offset, static_cast<GLsizeiptr>(size));
 }
 
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index 1c90facc3..cf6a5cddf 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -212,7 +212,9 @@ CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEn
     const auto texture_buffer_usage{variant.texture_buffer_usage};
 
     std::string source = "#version 430 core\n"
-                         "#extension GL_ARB_separate_shader_objects : enable\n";
+                         "#extension GL_ARB_separate_shader_objects : enable\n"
+                         "#extension GL_NV_gpu_shader5 : enable\n"
+                         "#extension GL_NV_shader_thread_group : enable\n";
     if (entries.shader_viewport_layer_array) {
         source += "#extension GL_ARB_shader_viewport_layer_array : enable\n";
     }
@@ -247,20 +249,24 @@ CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEn
         if (!texture_buffer_usage.test(i)) {
             continue;
         }
-        source += fmt::format("#define SAMPLER_{}_IS_BUFFER", i);
+        source += fmt::format("#define SAMPLER_{}_IS_BUFFER\n", i);
+    }
+    if (texture_buffer_usage.any()) {
+        source += '\n';
     }
 
     if (program_type == ProgramType::Geometry) {
         const auto [glsl_topology, debug_name, max_vertices] =
             GetPrimitiveDescription(primitive_mode);
 
-        source += "layout (" + std::string(glsl_topology) + ") in;\n";
+        source += "layout (" + std::string(glsl_topology) + ") in;\n\n";
         source += "#define MAX_VERTEX_INPUT " + std::to_string(max_vertices) + '\n';
     }
     if (program_type == ProgramType::Compute) {
         source += "layout (local_size_variable) in;\n";
     }
 
+    source += '\n';
     source += code;
 
     OGLShader shader;
@@ -289,7 +295,7 @@ std::set<GLenum> GetSupportedFormats() {
 
 CachedShader::CachedShader(const ShaderParameters& params, ProgramType program_type,
                            GLShader::ProgramResult result)
-    : RasterizerCacheObject{params.host_ptr}, host_ptr{params.host_ptr}, cpu_addr{params.cpu_addr},
+    : RasterizerCacheObject{params.host_ptr}, cpu_addr{params.cpu_addr},
       unique_identifier{params.unique_identifier}, program_type{program_type},
       disk_cache{params.disk_cache}, precompiled_programs{params.precompiled_programs},
       entries{result.second}, code{std::move(result.first)}, shader_length{entries.shader_length} {}
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h
index a3106a0ff..2c8faf855 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_cache.h
@@ -106,7 +106,6 @@ private:
 
     ShaderDiskCacheUsage GetUsage(const ProgramVariant& variant) const;
 
-    u8* host_ptr{};
     VAddr cpu_addr{};
     u64 unique_identifier{};
     ProgramType program_type{};
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index d8f722c26..a5cc1a86f 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -39,7 +39,7 @@ using namespace VideoCommon::Shader;
 using Maxwell = Tegra::Engines::Maxwell3D::Regs;
 using Operation = const OperationNode&;
 
-enum class Type { Bool, Bool2, Float, Int, Uint, HalfFloat };
+enum class Type { Void, Bool, Bool2, Float, Int, Uint, HalfFloat };
 
 struct TextureAoffi {};
 using TextureArgument = std::pair<Type, Node>;
@@ -48,7 +48,7 @@ using TextureIR = std::variant<TextureAoffi, TextureArgument>;
 constexpr u32 MAX_CONSTBUFFER_ELEMENTS =
     static_cast<u32>(Maxwell::MaxConstBufferSize) / (4 * sizeof(float));
 
-class ShaderWriter {
+class ShaderWriter final {
 public:
     void AddExpression(std::string_view text) {
         DEBUG_ASSERT(scope >= 0);
@@ -93,9 +93,157 @@ private:
     u32 temporary_index = 1;
 };
 
+class Expression final { // A piece of generated GLSL plus the type it evaluates to.
+public:
+    Expression(std::string code, Type type) : code{std::move(code)}, type{type} {
+        ASSERT(type != Type::Void); // A typed expression must carry a real value type.
+    }
+    Expression() : type{Type::Void} {} // Void expression: empty code, yields no value.
+
+    Type GetType() const {
+        return type;
+    }
+
+    std::string GetCode() const { // Stored code, returned by copy with no conversion.
+        return code;
+    }
+
+    void CheckVoid() const { // Asserts that this expression yields no value.
+        ASSERT(type == Type::Void);
+    }
+
+    std::string As(Type type) const { // Returns the code converted to the requested type.
+        switch (type) { // `type` is the requested target here; it shadows the member.
+        case Type::Bool:
+            return AsBool();
+        case Type::Bool2:
+            return AsBool2();
+        case Type::Float:
+            return AsFloat();
+        case Type::Int:
+            return AsInt();
+        case Type::Uint:
+            return AsUint();
+        case Type::HalfFloat:
+            return AsHalfFloat();
+        default: // Type::Void (or an invalid value) cannot be requested.
+            UNREACHABLE_MSG("Invalid type");
+            return code;
+        }
+    }
+
+    std::string AsBool() const { // Only a Bool source is accepted; nothing converts to it.
+        switch (type) {
+        case Type::Bool:
+            return code;
+        default:
+            UNREACHABLE_MSG("Incompatible types");
+            return code;
+        }
+    }
+
+    std::string AsBool2() const { // Only a Bool2 source is accepted; nothing converts to it.
+        switch (type) {
+        case Type::Bool2:
+            return code;
+        default:
+            UNREACHABLE_MSG("Incompatible types");
+            return code;
+        }
+    }
+
+    std::string AsFloat() const { // NOTE(review): utof/itof are assumed to exist in the GLSL.
+        switch (type) {
+        case Type::Float:
+            return code;
+        case Type::Uint:
+            return fmt::format("utof({})", code);
+        case Type::Int:
+            return fmt::format("itof({})", code);
+        case Type::HalfFloat: // Pack the pair into one uint first, then convert via utof.
+            return fmt::format("utof(packHalf2x16({}))", code);
+        default:
+            UNREACHABLE_MSG("Incompatible types");
+            return code;
+        }
+    }
+
+    std::string AsInt() const { // NOTE(review): ftoi is assumed to exist in the GLSL.
+        switch (type) {
+        case Type::Float:
+            return fmt::format("ftoi({})", code);
+        case Type::Uint:
+            return fmt::format("int({})", code);
+        case Type::Int:
+            return code;
+        case Type::HalfFloat:
+            return fmt::format("int(packHalf2x16({}))", code);
+        default:
+            UNREACHABLE_MSG("Incompatible types");
+            return code;
+        }
+    }
+
+    std::string AsUint() const { // NOTE(review): ftou is assumed to exist in the GLSL.
+        switch (type) {
+        case Type::Float:
+            return fmt::format("ftou({})", code);
+        case Type::Uint:
+            return code;
+        case Type::Int:
+            return fmt::format("uint({})", code);
+        case Type::HalfFloat:
+            return fmt::format("packHalf2x16({})", code);
+        default:
+            UNREACHABLE_MSG("Incompatible types");
+            return code;
+        }
+    }
+
+    std::string AsHalfFloat() const { // Every source ends up as unpackHalf2x16(<uint>).
+        switch (type) {
+        case Type::Float:
+            return fmt::format("unpackHalf2x16(ftou({}))", code);
+        case Type::Uint:
+            return fmt::format("unpackHalf2x16({})", code);
+        case Type::Int: // unpackHalf2x16 takes a uint, so cast explicitly like AsUint does.
+            return fmt::format("unpackHalf2x16(uint({}))", code);
+        case Type::HalfFloat:
+            return code;
+        default:
+            UNREACHABLE_MSG("Incompatible types");
+            return code;
+        }
+    }
+
+private:
+    std::string code; // Generated source text; empty for a void expression.
+    Type type{};      // Type the code evaluates to, Type::Void when there is no value.
+};
+
+constexpr const char* GetTypeString(Type type) { // Type name string returned for `type`.
+    switch (type) {
+    case Type::Bool:
+        return "bool";
+    case Type::Bool2:
+        return "bvec2";
+    case Type::Float:
+        return "float";
+    case Type::Int:
+        return "int";
+    case Type::Uint:
+        return "uint";
+    case Type::HalfFloat:
+        return "vec2"; // Half floats are represented as a two-component vector.
+    default: // Also reached for Type::Void, which has no case above.
+        UNREACHABLE_MSG("Invalid type");
+        return "<invalid type>";
+    }
+}
+
 /// Generates code to use for a swizzle operation.
 constexpr const char* GetSwizzle(u32 element) {
-    constexpr std::array<const char*, 4> swizzle = {".x", ".y", ".z", ".w"};
+    constexpr std::array swizzle = {".x", ".y", ".z", ".w"};
     return swizzle.at(element);
 }
 
@@ -134,8 +282,8 @@ constexpr bool IsGenericAttribute(Attribute::Index index) {
     return index >= Attribute::Index::Attribute_0 && index <= Attribute::Index::Attribute_31;
 }
 
-constexpr Attribute::Index ToGenericAttribute(u32 value) {
-    return static_cast<Attribute::Index>(value + static_cast<u32>(Attribute::Index::Attribute_0));
+constexpr Attribute::Index ToGenericAttribute(u64 value) {
+    return static_cast<Attribute::Index>(value + static_cast<u64>(Attribute::Index::Attribute_0));
 }
 
 u32 GetGenericAttributeIndex(Attribute::Index index) {
@@ -191,7 +339,7 @@ public:
 
         // VM's program counter
         const auto first_address = ir.GetBasicBlocks().begin()->first;
-        code.AddLine("uint jmp_to = {}u;", first_address);
+        code.AddLine("uint jmp_to = {}U;", first_address);
 
         // TODO(Subv): Figure out the actual depth of the flow stack, for now it seems
         // unlikely that shaders will use 20 nested SSYs and PBKs.
@@ -199,7 +347,7 @@ public:
             constexpr u32 FLOW_STACK_SIZE = 20;
             for (const auto stack : std::array{MetaStackClass::Ssy, MetaStackClass::Pbk}) {
                 code.AddLine("uint {}[{}];", FlowStackName(stack), FLOW_STACK_SIZE);
-                code.AddLine("uint {} = 0u;", FlowStackTopName(stack));
+                code.AddLine("uint {} = 0U;", FlowStackTopName(stack));
             }
         }
 
@@ -210,7 +358,7 @@ public:
 
         for (const auto& pair : ir.GetBasicBlocks()) {
             const auto [address, bb] = pair;
-            code.AddLine("case 0x{:x}u: {{", address);
+            code.AddLine("case 0x{:X}U: {{", address);
             ++code.scope;
 
             VisitBlock(bb);
@@ -322,7 +470,7 @@ private:
     void DeclareRegisters() {
         const auto& registers = ir.GetRegisters();
         for (const u32 gpr : registers) {
-            code.AddLine("float {} = 0;", GetRegister(gpr));
+            code.AddLine("float {} = 0.0f;", GetRegister(gpr));
         }
         if (!registers.empty()) {
             code.AddNewLine();
@@ -348,7 +496,7 @@ private:
             return;
         }
         const auto element_count = Common::AlignUp(local_memory_size, 4) / 4;
-        code.AddLine("float {}[{}];", GetLocalMemory(), element_count);
+        code.AddLine("uint {}[{}];", GetLocalMemory(), element_count);
         code.AddNewLine();
     }
 
@@ -371,8 +519,6 @@ private:
             return "noperspective ";
         default:
         case AttributeUse::Unused:
-            UNREACHABLE_MSG("Unused attribute being fetched");
-            return {};
             UNIMPLEMENTED_MSG("Unknown attribute usage index={}", static_cast<u32>(attribute));
             return {};
         }
@@ -449,7 +595,7 @@ private:
             const auto [index, size] = entry;
             code.AddLine("layout (std140, binding = CBUF_BINDING_{}) uniform {} {{", index,
                          GetConstBufferBlock(index));
-            code.AddLine("    vec4 {}[MAX_CONSTBUFFER_ELEMENTS];", GetConstBuffer(index));
+            code.AddLine("    uvec4 {}[{}];", GetConstBuffer(index), MAX_CONSTBUFFER_ELEMENTS);
             code.AddLine("}};");
             code.AddNewLine();
         }
@@ -470,7 +616,7 @@ private:
 
             code.AddLine("layout (std430, binding = GMEM_BINDING_{}_{}) {} buffer {} {{",
                          base.cbuf_index, base.cbuf_offset, qualifier, GetGlobalMemoryBlock(base));
-            code.AddLine("    float {}[];", GetGlobalMemory(base));
+            code.AddLine("    uint {}[];", GetGlobalMemory(base));
             code.AddLine("}};");
             code.AddNewLine();
         }
@@ -528,7 +674,7 @@ private:
         if (!ir.HasPhysicalAttributes()) {
             return;
         }
-        code.AddLine("float readPhysicalAttribute(uint physical_address) {{");
+        code.AddLine("float ReadPhysicalAttribute(uint physical_address) {{");
         ++code.scope;
         code.AddLine("switch (physical_address) {{");
 
@@ -537,15 +683,16 @@ private:
         for (u32 index = 0; index < num_attributes; ++index) {
             const auto attribute{ToGenericAttribute(index)};
             for (u32 element = 0; element < 4; ++element) {
-                constexpr u32 generic_base{0x80};
-                constexpr u32 generic_stride{16};
-                constexpr u32 element_stride{4};
+                constexpr u32 generic_base = 0x80;
+                constexpr u32 generic_stride = 16;
+                constexpr u32 element_stride = 4;
                 const u32 address{generic_base + index * generic_stride + element * element_stride};
 
-                const bool declared{stage != ProgramType::Fragment ||
-                                    header.ps.GetAttributeUse(index) != AttributeUse::Unused};
-                const std::string value{declared ? ReadAttribute(attribute, element) : "0"};
-                code.AddLine("case 0x{:x}: return {};", address, value);
+                const bool declared = stage != ProgramType::Fragment ||
+                                      header.ps.GetAttributeUse(index) != AttributeUse::Unused;
+                const std::string value =
+                    declared ? ReadAttribute(attribute, element).AsFloat() : "0.0f";
+                code.AddLine("case 0x{:X}U: return {};", address, value);
             }
         }
 
@@ -565,7 +712,7 @@ private:
                 case Tegra::Shader::ImageType::Texture1D:
                     return "image1D";
                 case Tegra::Shader::ImageType::TextureBuffer:
-                    return "bufferImage";
+                    return "imageBuffer";
                 case Tegra::Shader::ImageType::Texture1DArray:
                     return "image1DArray";
                 case Tegra::Shader::ImageType::Texture2D:
@@ -590,13 +737,11 @@ private:
 
     void VisitBlock(const NodeBlock& bb) {
         for (const auto& node : bb) {
-            if (const std::string expr = Visit(node); !expr.empty()) {
-                code.AddLine(expr);
-            }
+            Visit(node).CheckVoid();
         }
     }
 
-    std::string Visit(const Node& node) {
+    Expression Visit(const Node& node) {
         if (const auto operation = std::get_if<OperationNode>(&*node)) {
             const auto operation_index = static_cast<std::size_t>(operation->GetCode());
             if (operation_index >= operation_decompilers.size()) {
@@ -614,18 +759,18 @@ private:
         if (const auto gpr = std::get_if<GprNode>(&*node)) {
             const u32 index = gpr->GetIndex();
             if (index == Register::ZeroIndex) {
-                return "0";
+                return {"0U", Type::Uint};
             }
-            return GetRegister(index);
+            return {GetRegister(index), Type::Float};
         }
 
         if (const auto immediate = std::get_if<ImmediateNode>(&*node)) {
             const u32 value = immediate->GetValue();
             if (value < 10) {
                 // For eyecandy avoid using hex numbers on single digits
-                return fmt::format("utof({}u)", immediate->GetValue());
+                return {fmt::format("{}U", immediate->GetValue()), Type::Uint};
             }
-            return fmt::format("utof(0x{:x}u)", immediate->GetValue());
+            return {fmt::format("0x{:X}U", immediate->GetValue()), Type::Uint};
         }
 
         if (const auto predicate = std::get_if<PredicateNode>(&*node)) {
@@ -640,17 +785,18 @@ private:
                 }
             }();
             if (predicate->IsNegated()) {
-                return fmt::format("!({})", value);
+                return {fmt::format("!({})", value), Type::Bool};
             }
-            return value;
+            return {value, Type::Bool};
         }
 
         if (const auto abuf = std::get_if<AbufNode>(&*node)) {
             UNIMPLEMENTED_IF_MSG(abuf->IsPhysicalBuffer() && stage == ProgramType::Geometry,
                                  "Physical attributes in geometry shaders are not implemented");
             if (abuf->IsPhysicalBuffer()) {
-                return fmt::format("readPhysicalAttribute(ftou({}))",
-                                   Visit(abuf->GetPhysicalAddress()));
+                return {fmt::format("ReadPhysicalAttribute({})",
+                                    Visit(abuf->GetPhysicalAddress()).AsUint()),
+                        Type::Float};
             }
             return ReadAttribute(abuf->GetIndex(), abuf->GetElement(), abuf->GetBuffer());
         }
@@ -661,59 +807,64 @@ private:
                 // Direct access
                 const u32 offset_imm = immediate->GetValue();
                 ASSERT_MSG(offset_imm % 4 == 0, "Unaligned cbuf direct access");
-                return fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()),
-                                   offset_imm / (4 * 4), (offset_imm / 4) % 4);
+                return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()),
+                                    offset_imm / (4 * 4), (offset_imm / 4) % 4),
+                        Type::Uint};
             }
 
             if (std::holds_alternative<OperationNode>(*offset)) {
                 // Indirect access
                 const std::string final_offset = code.GenerateTemporary();
-                code.AddLine("uint {} = ftou({}) >> 2;", final_offset, Visit(offset));
+                code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint());
 
                 if (!device.HasComponentIndexingBug()) {
-                    return fmt::format("{}[{} >> 2][{} & 3]", GetConstBuffer(cbuf->GetIndex()),
-                                       final_offset, final_offset);
+                    return {fmt::format("{}[{} >> 2][{} & 3]", GetConstBuffer(cbuf->GetIndex()),
+                                        final_offset, final_offset),
+                            Type::Uint};
                 }
 
                 // AMD's proprietary GLSL compiler emits ill code for variable component access.
                 // To bypass this driver bug generate 4 ifs, one per each component.
                 const std::string pack = code.GenerateTemporary();
-                code.AddLine("vec4 {} = {}[{} >> 2];", pack, GetConstBuffer(cbuf->GetIndex()),
+                code.AddLine("uvec4 {} = {}[{} >> 2];", pack, GetConstBuffer(cbuf->GetIndex()),
                              final_offset);
 
                 const std::string result = code.GenerateTemporary();
-                code.AddLine("float {};", result);
+                code.AddLine("uint {};", result);
                 for (u32 swizzle = 0; swizzle < 4; ++swizzle) {
                     code.AddLine("if (({} & 3) == {}) {} = {}{};", final_offset, swizzle, result,
                                  pack, GetSwizzle(swizzle));
                 }
-                return result;
+                return {result, Type::Uint};
             }
 
             UNREACHABLE_MSG("Unmanaged offset node type");
         }
 
         if (const auto gmem = std::get_if<GmemNode>(&*node)) {
-            const std::string real = Visit(gmem->GetRealAddress());
-            const std::string base = Visit(gmem->GetBaseAddress());
-            const std::string final_offset = fmt::format("(ftou({}) - ftou({})) / 4", real, base);
-            return fmt::format("{}[{}]", GetGlobalMemory(gmem->GetDescriptor()), final_offset);
+            const std::string real = Visit(gmem->GetRealAddress()).AsUint();
+            const std::string base = Visit(gmem->GetBaseAddress()).AsUint();
+            const std::string final_offset = fmt::format("({} - {}) >> 2", real, base);
+            return {fmt::format("{}[{}]", GetGlobalMemory(gmem->GetDescriptor()), final_offset),
+                    Type::Uint};
         }
 
         if (const auto lmem = std::get_if<LmemNode>(&*node)) {
             if (stage == ProgramType::Compute) {
                 LOG_WARNING(Render_OpenGL, "Local memory is stubbed on compute shaders");
             }
-            return fmt::format("{}[ftou({}) / 4]", GetLocalMemory(), Visit(lmem->GetAddress()));
+            return {
+                fmt::format("{}[{} >> 2]", GetLocalMemory(), Visit(lmem->GetAddress()).AsUint()),
+                Type::Uint};
         }
 
         if (const auto internal_flag = std::get_if<InternalFlagNode>(&*node)) {
-            return GetInternalFlag(internal_flag->GetFlag());
+            return {GetInternalFlag(internal_flag->GetFlag()), Type::Bool};
         }
 
         if (const auto conditional = std::get_if<ConditionalNode>(&*node)) {
             // It's invalid to call conditional on nested nodes, use an operation instead
-            code.AddLine("if ({}) {{", Visit(conditional->GetCondition()));
+            code.AddLine("if ({}) {{", Visit(conditional->GetCondition()).AsBool());
             ++code.scope;
 
             VisitBlock(conditional->GetCode());
@@ -724,20 +875,21 @@ private:
         }
 
         if (const auto comment = std::get_if<CommentNode>(&*node)) {
-            return "// " + comment->GetText();
+            code.AddLine("// " + comment->GetText());
+            return {};
         }
 
         UNREACHABLE();
         return {};
     }
 
-    std::string ReadAttribute(Attribute::Index attribute, u32 element, const Node& buffer = {}) {
+    Expression ReadAttribute(Attribute::Index attribute, u32 element, const Node& buffer = {}) {
         const auto GeometryPass = [&](std::string_view name) {
             if (stage == ProgramType::Geometry && buffer) {
                 // TODO(Rodrigo): Guard geometry inputs against out of bound reads. Some games
                 // set an 0x80000000 index for those and the shader fails to build. Find out why
                 // this happens and what's its intent.
-                return fmt::format("gs_{}[ftou({}) % MAX_VERTEX_INPUT]", name, Visit(buffer));
+                return fmt::format("gs_{}[{} % MAX_VERTEX_INPUT]", name, Visit(buffer).AsUint());
             }
             return std::string(name);
         };
@@ -746,25 +898,27 @@ private:
         case Attribute::Index::Position:
             switch (stage) {
             case ProgramType::Geometry:
-                return fmt::format("gl_in[ftou({})].gl_Position{}", Visit(buffer),
-                                   GetSwizzle(element));
+                return {fmt::format("gl_in[{}].gl_Position{}", Visit(buffer).AsUint(),
+                                    GetSwizzle(element)),
+                        Type::Float};
             case ProgramType::Fragment:
-                return element == 3 ? "1.0f" : ("gl_FragCoord"s + GetSwizzle(element));
+                return {element == 3 ? "1.0f" : ("gl_FragCoord"s + GetSwizzle(element)),
+                        Type::Float};
             default:
                 UNREACHABLE();
             }
         case Attribute::Index::PointCoord:
             switch (element) {
             case 0:
-                return "gl_PointCoord.x";
+                return {"gl_PointCoord.x", Type::Float};
             case 1:
-                return "gl_PointCoord.y";
+                return {"gl_PointCoord.y", Type::Float};
             case 2:
             case 3:
-                return "0";
+                return {"0.0f", Type::Float};
             }
             UNREACHABLE();
-            return "0";
+            return {"0", Type::Int};
         case Attribute::Index::TessCoordInstanceIDVertexID:
             // TODO(Subv): Find out what the values are for the first two elements when inside a
             // vertex shader, and what's the value of the fourth element when inside a Tess Eval
@@ -773,44 +927,49 @@ private:
             switch (element) {
             case 2:
                 // Config pack's first value is instance_id.
-                return "uintBitsToFloat(config_pack[0])";
+                return {"config_pack[0]", Type::Uint};
             case 3:
-                return "uintBitsToFloat(gl_VertexID)";
+                return {"gl_VertexID", Type::Int};
             }
             UNIMPLEMENTED_MSG("Unmanaged TessCoordInstanceIDVertexID element={}", element);
-            return "0";
+            return {"0", Type::Int};
         case Attribute::Index::FrontFacing:
             // TODO(Subv): Find out what the values are for the other elements.
             ASSERT(stage == ProgramType::Fragment);
             switch (element) {
             case 3:
-                return "itof(gl_FrontFacing ? -1 : 0)";
+                return {"(gl_FrontFacing ? -1 : 0)", Type::Int};
             }
             UNIMPLEMENTED_MSG("Unmanaged FrontFacing element={}", element);
-            return "0";
+            return {"0", Type::Int};
         default:
             if (IsGenericAttribute(attribute)) {
-                return GeometryPass(GetInputAttribute(attribute)) + GetSwizzle(element);
+                return {GeometryPass(GetInputAttribute(attribute)) + GetSwizzle(element),
+                        Type::Float};
             }
             break;
         }
         UNIMPLEMENTED_MSG("Unhandled input attribute: {}", static_cast<u32>(attribute));
-        return "0";
+        return {"0", Type::Int};
     }
 
-    std::string ApplyPrecise(Operation operation, const std::string& value) {
+    Expression ApplyPrecise(Operation operation, std::string value, Type type) {
         if (!IsPrecise(operation)) {
-            return value;
+            return {std::move(value), type};
         }
-        // There's a bug in NVidia's proprietary drivers that makes precise fail on fragment shaders
-        const std::string precise = stage != ProgramType::Fragment ? "precise " : "";
+        // Old Nvidia drivers have a bug with precise and texture sampling. These are more likely to
+        // be found in fragment shaders, so we disable precise there. There are vertex shaders that
+        // also fail to build but nobody seems to care about those.
+        // Note: Only bugged drivers will skip precise.
+        const bool disable_precise = device.HasPreciseBug() && stage == ProgramType::Fragment;
 
-        const std::string temporary = code.GenerateTemporary();
-        code.AddLine("{}float {} = {};", precise, temporary, value);
-        return temporary;
+        std::string temporary = code.GenerateTemporary();
+        code.AddLine("{}{} {} = {};", disable_precise ? "" : "precise ", GetTypeString(type),
+                     temporary, value);
+        return {std::move(temporary), type};
     }
 
-    std::string VisitOperand(Operation operation, std::size_t operand_index) {
+    Expression VisitOperand(Operation operation, std::size_t operand_index) {
         const auto& operand = operation[operand_index];
         const bool parent_precise = IsPrecise(operation);
         const bool child_precise = IsPrecise(operand);
@@ -819,19 +978,16 @@ private:
             return Visit(operand);
         }
 
-        const std::string temporary = code.GenerateTemporary();
-        code.AddLine("float {} = {};", temporary, Visit(operand));
-        return temporary;
-    }
-
-    std::string VisitOperand(Operation operation, std::size_t operand_index, Type type) {
-        return CastOperand(VisitOperand(operation, operand_index), type);
+        Expression value = Visit(operand);
+        std::string temporary = code.GenerateTemporary();
+        code.AddLine("{} {} = {};", GetTypeString(value.GetType()), temporary, value.GetCode());
+        return {std::move(temporary), value.GetType()};
     }
 
-    std::optional<std::pair<std::string, bool>> GetOutputAttribute(const AbufNode* abuf) {
+    Expression GetOutputAttribute(const AbufNode* abuf) {
         switch (const auto attribute = abuf->GetIndex()) {
         case Attribute::Index::Position:
-            return std::make_pair("gl_Position"s + GetSwizzle(abuf->GetElement()), false);
+            return {"gl_Position"s + GetSwizzle(abuf->GetElement()), Type::Float};
         case Attribute::Index::LayerViewportPointSize:
             switch (abuf->GetElement()) {
             case 0:
@@ -841,119 +997,79 @@ private:
                 if (IsVertexShader(stage) && !device.HasVertexViewportLayer()) {
                     return {};
                 }
-                return std::make_pair("gl_Layer", true);
+                return {"gl_Layer", Type::Int};
             case 2:
                 if (IsVertexShader(stage) && !device.HasVertexViewportLayer()) {
                     return {};
                 }
-                return std::make_pair("gl_ViewportIndex", true);
+                return {"gl_ViewportIndex", Type::Int};
             case 3:
                 UNIMPLEMENTED_MSG("Requires some state changes for gl_PointSize to work in shader");
-                return std::make_pair("gl_PointSize", false);
+                return {"gl_PointSize", Type::Float};
             }
             return {};
         case Attribute::Index::ClipDistances0123:
-            return std::make_pair(fmt::format("gl_ClipDistance[{}]", abuf->GetElement()), false);
+            return {fmt::format("gl_ClipDistance[{}]", abuf->GetElement()), Type::Float};
         case Attribute::Index::ClipDistances4567:
-            return std::make_pair(fmt::format("gl_ClipDistance[{}]", abuf->GetElement() + 4),
-                                  false);
+            return {fmt::format("gl_ClipDistance[{}]", abuf->GetElement() + 4), Type::Float};
         default:
             if (IsGenericAttribute(attribute)) {
-                return std::make_pair(
-                    GetOutputAttribute(attribute) + GetSwizzle(abuf->GetElement()), false);
+                return {GetOutputAttribute(attribute) + GetSwizzle(abuf->GetElement()),
+                        Type::Float};
             }
             UNIMPLEMENTED_MSG("Unhandled output attribute: {}", static_cast<u32>(attribute));
             return {};
         }
     }
 
-    std::string CastOperand(const std::string& value, Type type) const {
-        switch (type) {
-        case Type::Bool:
-        case Type::Bool2:
-        case Type::Float:
-            return value;
-        case Type::Int:
-            return fmt::format("ftoi({})", value);
-        case Type::Uint:
-            return fmt::format("ftou({})", value);
-        case Type::HalfFloat:
-            return fmt::format("toHalf2({})", value);
-        }
-        UNREACHABLE();
-        return value;
-    }
-
-    std::string BitwiseCastResult(const std::string& value, Type type,
-                                  bool needs_parenthesis = false) {
-        switch (type) {
-        case Type::Bool:
-        case Type::Bool2:
-        case Type::Float:
-            if (needs_parenthesis) {
-                return fmt::format("({})", value);
-            }
-            return value;
-        case Type::Int:
-            return fmt::format("itof({})", value);
-        case Type::Uint:
-            return fmt::format("utof({})", value);
-        case Type::HalfFloat:
-            return fmt::format("fromHalf2({})", value);
-        }
-        UNREACHABLE();
-        return value;
-    }
-
-    std::string GenerateUnary(Operation operation, const std::string& func, Type result_type,
-                              Type type_a, bool needs_parenthesis = true) {
-        const std::string op_str = fmt::format("{}({})", func, VisitOperand(operation, 0, type_a));
-
-        return ApplyPrecise(operation, BitwiseCastResult(op_str, result_type, needs_parenthesis));
+    Expression GenerateUnary(Operation operation, std::string_view func, Type result_type,
+                             Type type_a) {
+        std::string op_str = fmt::format("{}({})", func, VisitOperand(operation, 0).As(type_a));
+        return ApplyPrecise(operation, std::move(op_str), result_type);
     }
 
-    std::string GenerateBinaryInfix(Operation operation, const std::string& func, Type result_type,
-                                    Type type_a, Type type_b) {
-        const std::string op_a = VisitOperand(operation, 0, type_a);
-        const std::string op_b = VisitOperand(operation, 1, type_b);
-        const std::string op_str = fmt::format("({} {} {})", op_a, func, op_b);
+    Expression GenerateBinaryInfix(Operation operation, std::string_view func, Type result_type,
+                                   Type type_a, Type type_b) {
+        const std::string op_a = VisitOperand(operation, 0).As(type_a);
+        const std::string op_b = VisitOperand(operation, 1).As(type_b);
+        std::string op_str = fmt::format("({} {} {})", op_a, func, op_b);
 
-        return ApplyPrecise(operation, BitwiseCastResult(op_str, result_type));
+        return ApplyPrecise(operation, std::move(op_str), result_type);
     }
 
-    std::string GenerateBinaryCall(Operation operation, const std::string& func, Type result_type,
-                                   Type type_a, Type type_b) {
-        const std::string op_a = VisitOperand(operation, 0, type_a);
-        const std::string op_b = VisitOperand(operation, 1, type_b);
-        const std::string op_str = fmt::format("{}({}, {})", func, op_a, op_b);
+    Expression GenerateBinaryCall(Operation operation, std::string_view func, Type result_type,
+                                  Type type_a, Type type_b) {
+        const std::string op_a = VisitOperand(operation, 0).As(type_a);
+        const std::string op_b = VisitOperand(operation, 1).As(type_b);
+        std::string op_str = fmt::format("{}({}, {})", func, op_a, op_b);
 
-        return ApplyPrecise(operation, BitwiseCastResult(op_str, result_type));
+        return ApplyPrecise(operation, std::move(op_str), result_type);
     }
 
-    std::string GenerateTernary(Operation operation, const std::string& func, Type result_type,
-                                Type type_a, Type type_b, Type type_c) {
-        const std::string op_a = VisitOperand(operation, 0, type_a);
-        const std::string op_b = VisitOperand(operation, 1, type_b);
-        const std::string op_c = VisitOperand(operation, 2, type_c);
-        const std::string op_str = fmt::format("{}({}, {}, {})", func, op_a, op_b, op_c);
+    Expression GenerateTernary(Operation operation, std::string_view func, Type result_type,
+                               Type type_a, Type type_b, Type type_c) {
+        const std::string op_a = VisitOperand(operation, 0).As(type_a);
+        const std::string op_b = VisitOperand(operation, 1).As(type_b);
+        const std::string op_c = VisitOperand(operation, 2).As(type_c);
+        std::string op_str = fmt::format("{}({}, {}, {})", func, op_a, op_b, op_c);
 
-        return ApplyPrecise(operation, BitwiseCastResult(op_str, result_type));
+        return ApplyPrecise(operation, std::move(op_str), result_type);
     }
 
-    std::string GenerateQuaternary(Operation operation, const std::string& func, Type result_type,
-                                   Type type_a, Type type_b, Type type_c, Type type_d) {
-        const std::string op_a = VisitOperand(operation, 0, type_a);
-        const std::string op_b = VisitOperand(operation, 1, type_b);
-        const std::string op_c = VisitOperand(operation, 2, type_c);
-        const std::string op_d = VisitOperand(operation, 3, type_d);
-        const std::string op_str = fmt::format("{}({}, {}, {}, {})", func, op_a, op_b, op_c, op_d);
+    Expression GenerateQuaternary(Operation operation, std::string_view func, Type result_type,
+                                  Type type_a, Type type_b, Type type_c, Type type_d) {
+        const std::string op_a = VisitOperand(operation, 0).As(type_a);
+        const std::string op_b = VisitOperand(operation, 1).As(type_b);
+        const std::string op_c = VisitOperand(operation, 2).As(type_c);
+        const std::string op_d = VisitOperand(operation, 3).As(type_d);
+        std::string op_str = fmt::format("{}({}, {}, {}, {})", func, op_a, op_b, op_c, op_d);
 
-        return ApplyPrecise(operation, BitwiseCastResult(op_str, result_type));
+        return ApplyPrecise(operation, std::move(op_str), result_type);
     }
 
     std::string GenerateTexture(Operation operation, const std::string& function_suffix,
                                 const std::vector<TextureIR>& extras) {
-        constexpr std::array<const char*, 4> coord_constructors = {"float", "vec2", "vec3", "vec4"};
+        constexpr std::array coord_constructors = {"float", "vec2", "vec3", "vec4"};
 
         const auto meta = std::get_if<MetaTexture>(&operation.GetMeta());
         ASSERT(meta);
@@ -970,17 +1086,17 @@ private:
         expr += coord_constructors.at(count + (has_array ? 1 : 0) + (has_shadow ? 1 : 0) - 1);
         expr += '(';
         for (std::size_t i = 0; i < count; ++i) {
-            expr += Visit(operation[i]);
+            expr += Visit(operation[i]).AsFloat();
 
             const std::size_t next = i + 1;
             if (next < count)
                 expr += ", ";
         }
         if (has_array) {
-            expr += ", float(ftoi(" + Visit(meta->array) + "))";
+            expr += ", float(" + Visit(meta->array).AsInt() + ')';
         }
         if (has_shadow) {
-            expr += ", " + Visit(meta->depth_compare);
+            expr += ", " + Visit(meta->depth_compare).AsFloat();
         }
         expr += ')';
 
@@ -1011,11 +1127,11 @@ private:
                 // required to be constant)
                 expr += std::to_string(static_cast<s32>(immediate->GetValue()));
             } else {
-                expr += fmt::format("ftoi({})", Visit(operand));
+                expr += Visit(operand).AsInt();
             }
             break;
         case Type::Float:
-            expr += Visit(operand);
+            expr += Visit(operand).AsFloat();
             break;
         default: {
             const auto type_int = static_cast<u32>(type);
@@ -1031,7 +1147,7 @@ private:
         if (aoffi.empty()) {
             return {};
         }
-        constexpr std::array<const char*, 3> coord_constructors = {"int", "ivec2", "ivec3"};
+        constexpr std::array coord_constructors = {"int", "ivec2", "ivec3"};
         std::string expr = ", ";
         expr += coord_constructors.at(aoffi.size() - 1);
         expr += '(';
@@ -1044,7 +1160,7 @@ private:
                 expr += std::to_string(static_cast<s32>(immediate->GetValue()));
             } else if (device.HasVariableAoffi()) {
                 // Avoid using variable AOFFI on unsupported devices.
-                expr += fmt::format("ftoi({})", Visit(operand));
+                expr += Visit(operand).AsInt();
             } else {
                 // Insert 0 on devices not supporting variable AOFFI.
                 expr += '0';
@@ -1058,328 +1174,320 @@ private:
         return expr;
     }
 
-    std::string Assign(Operation operation) {
+    // Writes operation[1] into the destination named by operation[0] (register, output attribute,
+    // local memory or global memory), casting the source to the destination's type.
+    Expression Assign(Operation operation) {
         const Node& dest = operation[0];
         const Node& src = operation[1];
 
-        std::string target;
-        bool is_integer = false;
-
+        Expression target;
         if (const auto gpr = std::get_if<GprNode>(&*dest)) {
             if (gpr->GetIndex() == Register::ZeroIndex) {
                 // Writing to Register::ZeroIndex is a no op
                 return {};
             }
-            target = GetRegister(gpr->GetIndex());
+            target = {GetRegister(gpr->GetIndex()), Type::Float};
         } else if (const auto abuf = std::get_if<AbufNode>(&*dest)) {
             UNIMPLEMENTED_IF(abuf->IsPhysicalBuffer());
-            const auto result = GetOutputAttribute(abuf);
-            if (!result) {
-                return {};
-            }
-            target = result->first;
-            is_integer = result->second;
+            target = GetOutputAttribute(abuf);
+            if (target.GetCode().empty()) {
+                // No writable target (GetOutputAttribute returned an empty expression)
+                return {};
+            }
         } else if (const auto lmem = std::get_if<LmemNode>(&*dest)) {
             if (stage == ProgramType::Compute) {
                 LOG_WARNING(Render_OpenGL, "Local memory is stubbed on compute shaders");
             }
-            target = fmt::format("{}[ftou({}) / 4]", GetLocalMemory(), Visit(lmem->GetAddress()));
+            target = {
+                fmt::format("{}[{} >> 2]", GetLocalMemory(), Visit(lmem->GetAddress()).AsUint()),
+                Type::Uint};
         } else if (const auto gmem = std::get_if<GmemNode>(&*dest)) {
-            const std::string real = Visit(gmem->GetRealAddress());
-            const std::string base = Visit(gmem->GetBaseAddress());
-            const std::string final_offset = fmt::format("(ftou({}) - ftou({})) / 4", real, base);
-            target = fmt::format("{}[{}]", GetGlobalMemory(gmem->GetDescriptor()), final_offset);
+            const std::string real = Visit(gmem->GetRealAddress()).AsUint();
+            const std::string base = Visit(gmem->GetBaseAddress()).AsUint();
+            const std::string final_offset = fmt::format("({} - {}) >> 2", real, base);
+            target = {fmt::format("{}[{}]", GetGlobalMemory(gmem->GetDescriptor()), final_offset),
+                      Type::Uint};
         } else {
             UNREACHABLE_MSG("Assign called without a proper target");
         }
 
-        if (is_integer) {
-            code.AddLine("{} = ftoi({});", target, Visit(src));
-        } else {
-            code.AddLine("{} = {};", target, Visit(src));
-        }
+        code.AddLine("{} = {};", target.GetCode(), Visit(src).As(target.GetType()));
         return {};
     }
 
     template <Type type>
-    std::string Add(Operation operation) {
+    Expression Add(Operation operation) {
         return GenerateBinaryInfix(operation, "+", type, type, type);
     }
 
     template <Type type>
-    std::string Mul(Operation operation) {
+    Expression Mul(Operation operation) {
         return GenerateBinaryInfix(operation, "*", type, type, type);
     }
 
     template <Type type>
-    std::string Div(Operation operation) {
+    Expression Div(Operation operation) {
         return GenerateBinaryInfix(operation, "/", type, type, type);
     }
 
     template <Type type>
-    std::string Fma(Operation operation) {
+    Expression Fma(Operation operation) {
         return GenerateTernary(operation, "fma", type, type, type, type);
     }
 
     template <Type type>
-    std::string Negate(Operation operation) {
-        return GenerateUnary(operation, "-", type, type, true);
+    Expression Negate(Operation operation) {
+        return GenerateUnary(operation, "-", type, type);
     }
 
     template <Type type>
-    std::string Absolute(Operation operation) {
-        return GenerateUnary(operation, "abs", type, type, false);
+    Expression Absolute(Operation operation) {
+        return GenerateUnary(operation, "abs", type, type);
     }
 
-    std::string FClamp(Operation operation) {
+    Expression FClamp(Operation operation) {
         return GenerateTernary(operation, "clamp", Type::Float, Type::Float, Type::Float,
                                Type::Float);
     }
 
-    std::string FCastHalf0(Operation operation) {
-        const std::string op_a = VisitOperand(operation, 0, Type::HalfFloat);
-        return fmt::format("({})[0]", op_a);
+    Expression FCastHalf0(Operation operation) {
+        return {fmt::format("({})[0]", VisitOperand(operation, 0).AsHalfFloat()), Type::Float};
     }
 
-    std::string FCastHalf1(Operation operation) {
-        const std::string op_a = VisitOperand(operation, 0, Type::HalfFloat);
-        return fmt::format("({})[1]", op_a);
+    Expression FCastHalf1(Operation operation) {
+        return {fmt::format("({})[1]", VisitOperand(operation, 0).AsHalfFloat()), Type::Float};
     }
 
     template <Type type>
-    std::string Min(Operation operation) {
+    Expression Min(Operation operation) {
         return GenerateBinaryCall(operation, "min", type, type, type);
     }
 
     template <Type type>
-    std::string Max(Operation operation) {
+    Expression Max(Operation operation) {
         return GenerateBinaryCall(operation, "max", type, type, type);
     }
 
-    std::string Select(Operation operation) {
-        const std::string condition = Visit(operation[0]);
-        const std::string true_case = Visit(operation[1]);
-        const std::string false_case = Visit(operation[2]);
-        const std::string op_str = fmt::format("({} ? {} : {})", condition, true_case, false_case);
+    Expression Select(Operation operation) {
+        const std::string condition = Visit(operation[0]).AsBool();
+        const std::string true_case = Visit(operation[1]).AsUint();
+        const std::string false_case = Visit(operation[2]).AsUint();
+        std::string op_str = fmt::format("({} ? {} : {})", condition, true_case, false_case);
 
-        return ApplyPrecise(operation, op_str);
+        return ApplyPrecise(operation, std::move(op_str), Type::Uint);
     }
 
-    std::string FCos(Operation operation) {
-        return GenerateUnary(operation, "cos", Type::Float, Type::Float, false);
+    Expression FCos(Operation operation) {
+        return GenerateUnary(operation, "cos", Type::Float, Type::Float);
     }
 
-    std::string FSin(Operation operation) {
-        return GenerateUnary(operation, "sin", Type::Float, Type::Float, false);
+    Expression FSin(Operation operation) {
+        return GenerateUnary(operation, "sin", Type::Float, Type::Float);
     }
 
-    std::string FExp2(Operation operation) {
-        return GenerateUnary(operation, "exp2", Type::Float, Type::Float, false);
+    Expression FExp2(Operation operation) {
+        return GenerateUnary(operation, "exp2", Type::Float, Type::Float);
     }
 
-    std::string FLog2(Operation operation) {
-        return GenerateUnary(operation, "log2", Type::Float, Type::Float, false);
+    Expression FLog2(Operation operation) {
+        return GenerateUnary(operation, "log2", Type::Float, Type::Float);
     }
 
-    std::string FInverseSqrt(Operation operation) {
-        return GenerateUnary(operation, "inversesqrt", Type::Float, Type::Float, false);
+    Expression FInverseSqrt(Operation operation) {
+        return GenerateUnary(operation, "inversesqrt", Type::Float, Type::Float);
     }
 
-    std::string FSqrt(Operation operation) {
-        return GenerateUnary(operation, "sqrt", Type::Float, Type::Float, false);
+    Expression FSqrt(Operation operation) {
+        return GenerateUnary(operation, "sqrt", Type::Float, Type::Float);
     }
 
-    std::string FRoundEven(Operation operation) {
-        return GenerateUnary(operation, "roundEven", Type::Float, Type::Float, false);
+    Expression FRoundEven(Operation operation) {
+        return GenerateUnary(operation, "roundEven", Type::Float, Type::Float);
     }
 
-    std::string FFloor(Operation operation) {
-        return GenerateUnary(operation, "floor", Type::Float, Type::Float, false);
+    Expression FFloor(Operation operation) {
+        return GenerateUnary(operation, "floor", Type::Float, Type::Float);
     }
 
-    std::string FCeil(Operation operation) {
-        return GenerateUnary(operation, "ceil", Type::Float, Type::Float, false);
+    Expression FCeil(Operation operation) {
+        return GenerateUnary(operation, "ceil", Type::Float, Type::Float);
     }
 
-    std::string FTrunc(Operation operation) {
-        return GenerateUnary(operation, "trunc", Type::Float, Type::Float, false);
+    Expression FTrunc(Operation operation) {
+        return GenerateUnary(operation, "trunc", Type::Float, Type::Float);
     }
 
     template <Type type>
-    std::string FCastInteger(Operation operation) {
-        return GenerateUnary(operation, "float", Type::Float, type, false);
+    Expression FCastInteger(Operation operation) {
+        return GenerateUnary(operation, "float", Type::Float, type);
     }
 
-    std::string ICastFloat(Operation operation) {
-        return GenerateUnary(operation, "int", Type::Int, Type::Float, false);
+    Expression ICastFloat(Operation operation) {
+        return GenerateUnary(operation, "int", Type::Int, Type::Float);
     }
 
-    std::string ICastUnsigned(Operation operation) {
-        return GenerateUnary(operation, "int", Type::Int, Type::Uint, false);
+    Expression ICastUnsigned(Operation operation) {
+        return GenerateUnary(operation, "int", Type::Int, Type::Uint);
     }
 
     template <Type type>
-    std::string LogicalShiftLeft(Operation operation) {
+    Expression LogicalShiftLeft(Operation operation) {
         return GenerateBinaryInfix(operation, "<<", type, type, Type::Uint);
     }
 
-    std::string ILogicalShiftRight(Operation operation) {
-        const std::string op_a = VisitOperand(operation, 0, Type::Uint);
-        const std::string op_b = VisitOperand(operation, 1, Type::Uint);
-        const std::string op_str = fmt::format("int({} >> {})", op_a, op_b);
+    Expression ILogicalShiftRight(Operation operation) {
+        const std::string op_a = VisitOperand(operation, 0).AsUint();
+        const std::string op_b = VisitOperand(operation, 1).AsUint();
+        std::string op_str = fmt::format("int({} >> {})", op_a, op_b);
 
-        return ApplyPrecise(operation, BitwiseCastResult(op_str, Type::Int));
+        return ApplyPrecise(operation, std::move(op_str), Type::Int);
     }
 
-    std::string IArithmeticShiftRight(Operation operation) {
+    Expression IArithmeticShiftRight(Operation operation) {
         return GenerateBinaryInfix(operation, ">>", Type::Int, Type::Int, Type::Uint);
     }
 
     template <Type type>
-    std::string BitwiseAnd(Operation operation) {
+    Expression BitwiseAnd(Operation operation) {
         return GenerateBinaryInfix(operation, "&", type, type, type);
     }
 
     template <Type type>
-    std::string BitwiseOr(Operation operation) {
+    Expression BitwiseOr(Operation operation) {
         return GenerateBinaryInfix(operation, "|", type, type, type);
     }
 
     template <Type type>
-    std::string BitwiseXor(Operation operation) {
+    Expression BitwiseXor(Operation operation) {
         return GenerateBinaryInfix(operation, "^", type, type, type);
     }
 
     template <Type type>
-    std::string BitwiseNot(Operation operation) {
-        return GenerateUnary(operation, "~", type, type, false);
+    Expression BitwiseNot(Operation operation) {
+        return GenerateUnary(operation, "~", type, type);
     }
 
-    std::string UCastFloat(Operation operation) {
-        return GenerateUnary(operation, "uint", Type::Uint, Type::Float, false);
+    Expression UCastFloat(Operation operation) {
+        return GenerateUnary(operation, "uint", Type::Uint, Type::Float);
     }
 
-    std::string UCastSigned(Operation operation) {
-        return GenerateUnary(operation, "uint", Type::Uint, Type::Int, false);
+    Expression UCastSigned(Operation operation) {
+        return GenerateUnary(operation, "uint", Type::Uint, Type::Int);
     }
 
-    std::string UShiftRight(Operation operation) {
+    Expression UShiftRight(Operation operation) {
         return GenerateBinaryInfix(operation, ">>", Type::Uint, Type::Uint, Type::Uint);
     }
 
     template <Type type>
-    std::string BitfieldInsert(Operation operation) {
+    Expression BitfieldInsert(Operation operation) {
         return GenerateQuaternary(operation, "bitfieldInsert", type, type, type, Type::Int,
                                   Type::Int);
     }
 
     template <Type type>
-    std::string BitfieldExtract(Operation operation) {
+    Expression BitfieldExtract(Operation operation) {
         return GenerateTernary(operation, "bitfieldExtract", type, type, Type::Int, Type::Int);
     }
 
     template <Type type>
-    std::string BitCount(Operation operation) {
-        return GenerateUnary(operation, "bitCount", type, type, false);
+    Expression BitCount(Operation operation) {
+        return GenerateUnary(operation, "bitCount", type, type);
     }
 
-    std::string HNegate(Operation operation) {
+    Expression HNegate(Operation operation) {
         const auto GetNegate = [&](std::size_t index) {
-            return VisitOperand(operation, index, Type::Bool) + " ? -1 : 1";
+            return VisitOperand(operation, index).AsBool() + " ? -1 : 1";
         };
-        const std::string value =
-            fmt::format("({} * vec2({}, {}))", VisitOperand(operation, 0, Type::HalfFloat),
-                        GetNegate(1), GetNegate(2));
-        return BitwiseCastResult(value, Type::HalfFloat);
-    }
-
-    std::string HClamp(Operation operation) {
-        const std::string value = VisitOperand(operation, 0, Type::HalfFloat);
-        const std::string min = VisitOperand(operation, 1, Type::Float);
-        const std::string max = VisitOperand(operation, 2, Type::Float);
-        const std::string clamped = fmt::format("clamp({}, vec2({}), vec2({}))", value, min, max);
-
-        return ApplyPrecise(operation, BitwiseCastResult(clamped, Type::HalfFloat));
-    }
-
-    std::string HCastFloat(Operation operation) {
-        const std::string op_a = VisitOperand(operation, 0, Type::Float);
-        return fmt::format("fromHalf2(vec2({}, 0.0f))", op_a);
-    }
-
-    std::string HUnpack(Operation operation) {
-        const std::string operand{VisitOperand(operation, 0, Type::HalfFloat)};
-        const auto value = [&]() -> std::string {
-            switch (std::get<Tegra::Shader::HalfType>(operation.GetMeta())) {
-            case Tegra::Shader::HalfType::H0_H1:
-                return operand;
-            case Tegra::Shader::HalfType::F32:
-                return fmt::format("vec2(fromHalf2({}))", operand);
-            case Tegra::Shader::HalfType::H0_H0:
-                return fmt::format("vec2({}[0])", operand);
-            case Tegra::Shader::HalfType::H1_H1:
-                return fmt::format("vec2({}[1])", operand);
-            }
-            UNREACHABLE();
-            return "0";
-        }();
-        return fmt::format("fromHalf2({})", value);
+        return {fmt::format("({} * vec2({}, {}))", VisitOperand(operation, 0).AsHalfFloat(),
+                            GetNegate(1), GetNegate(2)),
+                Type::HalfFloat};
+    }
+
+    Expression HClamp(Operation operation) {
+        const std::string value = VisitOperand(operation, 0).AsHalfFloat();
+        const std::string min = VisitOperand(operation, 1).AsFloat();
+        const std::string max = VisitOperand(operation, 2).AsFloat();
+        std::string clamped = fmt::format("clamp({}, vec2({}), vec2({}))", value, min, max);
+
+        return ApplyPrecise(operation, std::move(clamped), Type::HalfFloat);
+    }
+
+    Expression HCastFloat(Operation operation) {
+        return {fmt::format("vec2({})", VisitOperand(operation, 0).AsFloat()), Type::HalfFloat};
     }
 
-    std::string HMergeF32(Operation operation) {
-        return fmt::format("float(toHalf2({})[0])", Visit(operation[0]));
+    Expression HUnpack(Operation operation) { // Expands operand 0 per the HalfType meta.
+        Expression operand = VisitOperand(operation, 0);
+        switch (std::get<Tegra::Shader::HalfType>(operation.GetMeta())) {
+        case Tegra::Shader::HalfType::F32: // Splat the float operand into both lanes.
+            return {fmt::format("vec2({})", operand.AsFloat()), Type::HalfFloat};
+        case Tegra::Shader::HalfType::H0_H0: // Splat lane 0 into both lanes.
+            return {fmt::format("vec2({}[0])", operand.AsHalfFloat()), Type::HalfFloat};
+        case Tegra::Shader::HalfType::H1_H1: // Splat lane 1 into both lanes.
+            return {fmt::format("vec2({}[1])", operand.AsHalfFloat()), Type::HalfFloat};
+        default: // H0_H1, or an unexpected value: pass through so every path returns.
+            return operand;
+        }
+    }
+
+    Expression HMergeF32(Operation operation) {
+        return {fmt::format("float({}[0])", VisitOperand(operation, 0).AsHalfFloat()), Type::Float};
     }
 
-    std::string HMergeH0(Operation operation) {
-        return fmt::format("fromHalf2(vec2(toHalf2({})[0], toHalf2({})[1]))", Visit(operation[1]),
-                           Visit(operation[0]));
+    Expression HMergeH0(Operation operation) {
+        std::string dest = VisitOperand(operation, 0).AsUint();
+        std::string src = VisitOperand(operation, 1).AsUint();
+        return {fmt::format("(({} & 0x0000FFFFU) | ({} & 0xFFFF0000U))", src, dest), Type::Uint};
     }
 
-    std::string HMergeH1(Operation operation) {
-        return fmt::format("fromHalf2(vec2(toHalf2({})[0], toHalf2({})[1]))", Visit(operation[0]),
-                           Visit(operation[1]));
+    Expression HMergeH1(Operation operation) {
+        std::string dest = VisitOperand(operation, 0).AsUint();
+        std::string src = VisitOperand(operation, 1).AsUint();
+        return {fmt::format("(({} & 0x0000FFFFU) | ({} & 0xFFFF0000U))", dest, src), Type::Uint};
     }
 
-    std::string HPack2(Operation operation) {
-        return fmt::format("utof(packHalf2x16(vec2({}, {})))", Visit(operation[0]),
-                           Visit(operation[1]));
+    Expression HPack2(Operation operation) {
+        return {fmt::format("vec2({}, {})", VisitOperand(operation, 0).AsFloat(),
+                            VisitOperand(operation, 1).AsFloat()),
+                Type::HalfFloat};
     }
 
     template <Type type>
-    std::string LogicalLessThan(Operation operation) {
+    Expression LogicalLessThan(Operation operation) {
         return GenerateBinaryInfix(operation, "<", Type::Bool, type, type);
     }
 
     template <Type type>
-    std::string LogicalEqual(Operation operation) {
+    Expression LogicalEqual(Operation operation) {
         return GenerateBinaryInfix(operation, "==", Type::Bool, type, type);
     }
 
     template <Type type>
-    std::string LogicalLessEqual(Operation operation) {
+    Expression LogicalLessEqual(Operation operation) {
         return GenerateBinaryInfix(operation, "<=", Type::Bool, type, type);
     }
 
     template <Type type>
-    std::string LogicalGreaterThan(Operation operation) {
+    Expression LogicalGreaterThan(Operation operation) {
         return GenerateBinaryInfix(operation, ">", Type::Bool, type, type);
     }
 
     template <Type type>
-    std::string LogicalNotEqual(Operation operation) {
+    Expression LogicalNotEqual(Operation operation) {
         return GenerateBinaryInfix(operation, "!=", Type::Bool, type, type);
     }
 
     template <Type type>
-    std::string LogicalGreaterEqual(Operation operation) {
+    Expression LogicalGreaterEqual(Operation operation) {
         return GenerateBinaryInfix(operation, ">=", Type::Bool, type, type);
     }
 
-    std::string LogicalFIsNan(Operation operation) {
-        return GenerateUnary(operation, "isnan", Type::Bool, Type::Float, false);
+    Expression LogicalFIsNan(Operation operation) {
+        return GenerateUnary(operation, "isnan", Type::Bool, Type::Float);
     }
 
-    std::string LogicalAssign(Operation operation) {
+    Expression LogicalAssign(Operation operation) {
         const Node& dest = operation[0];
         const Node& src = operation[1];
 
@@ -1400,78 +1502,80 @@ private:
             target = GetInternalFlag(flag->GetFlag());
         }
 
-        code.AddLine("{} = {};", target, Visit(src));
+        code.AddLine("{} = {};", target, Visit(src).AsBool());
         return {};
     }
 
-    std::string LogicalAnd(Operation operation) {
+    Expression LogicalAnd(Operation operation) {
         return GenerateBinaryInfix(operation, "&&", Type::Bool, Type::Bool, Type::Bool);
     }
 
-    std::string LogicalOr(Operation operation) {
+    Expression LogicalOr(Operation operation) {
         return GenerateBinaryInfix(operation, "||", Type::Bool, Type::Bool, Type::Bool);
     }
 
-    std::string LogicalXor(Operation operation) {
+    Expression LogicalXor(Operation operation) {
         return GenerateBinaryInfix(operation, "^^", Type::Bool, Type::Bool, Type::Bool);
     }
 
-    std::string LogicalNegate(Operation operation) {
-        return GenerateUnary(operation, "!", Type::Bool, Type::Bool, false);
+    Expression LogicalNegate(Operation operation) {
+        return GenerateUnary(operation, "!", Type::Bool, Type::Bool);
     }
 
-    std::string LogicalPick2(Operation operation) {
-        const std::string pair = VisitOperand(operation, 0, Type::Bool2);
-        return fmt::format("{}[{}]", pair, VisitOperand(operation, 1, Type::Uint));
+    Expression LogicalPick2(Operation operation) {
+        return {fmt::format("{}[{}]", VisitOperand(operation, 0).AsBool2(),
+                            VisitOperand(operation, 1).AsUint()),
+                Type::Bool};
     }
 
-    std::string LogicalAnd2(Operation operation) {
+    Expression LogicalAnd2(Operation operation) {
         return GenerateUnary(operation, "all", Type::Bool, Type::Bool2);
     }
 
     template <bool with_nan>
-    std::string GenerateHalfComparison(Operation operation, const std::string& compare_op) {
-        const std::string comparison{GenerateBinaryCall(operation, compare_op, Type::Bool2,
-                                                        Type::HalfFloat, Type::HalfFloat)};
+    Expression GenerateHalfComparison(Operation operation, std::string_view compare_op) {
+        Expression comparison = GenerateBinaryCall(operation, compare_op, Type::Bool2,
+                                                   Type::HalfFloat, Type::HalfFloat);
         if constexpr (!with_nan) {
             return comparison;
         }
-        return fmt::format("halfFloatNanComparison({}, {}, {})", comparison,
-                           VisitOperand(operation, 0, Type::HalfFloat),
-                           VisitOperand(operation, 1, Type::HalfFloat));
+        return {fmt::format("HalfFloatNanComparison({}, {}, {})", comparison.AsBool2(),
+                            VisitOperand(operation, 0).AsHalfFloat(),
+                            VisitOperand(operation, 1).AsHalfFloat()),
+                Type::Bool2};
     }
 
     template <bool with_nan>
-    std::string Logical2HLessThan(Operation operation) {
+    Expression Logical2HLessThan(Operation operation) {
         return GenerateHalfComparison<with_nan>(operation, "lessThan");
     }
 
     template <bool with_nan>
-    std::string Logical2HEqual(Operation operation) {
+    Expression Logical2HEqual(Operation operation) {
         return GenerateHalfComparison<with_nan>(operation, "equal");
     }
 
     template <bool with_nan>
-    std::string Logical2HLessEqual(Operation operation) {
+    Expression Logical2HLessEqual(Operation operation) {
         return GenerateHalfComparison<with_nan>(operation, "lessThanEqual");
     }
 
     template <bool with_nan>
-    std::string Logical2HGreaterThan(Operation operation) {
+    Expression Logical2HGreaterThan(Operation operation) {
         return GenerateHalfComparison<with_nan>(operation, "greaterThan");
     }
 
     template <bool with_nan>
-    std::string Logical2HNotEqual(Operation operation) {
+    Expression Logical2HNotEqual(Operation operation) {
         return GenerateHalfComparison<with_nan>(operation, "notEqual");
     }
 
     template <bool with_nan>
-    std::string Logical2HGreaterEqual(Operation operation) {
+    Expression Logical2HGreaterEqual(Operation operation) {
         return GenerateHalfComparison<with_nan>(operation, "greaterThanEqual");
     }
 
-    std::string Texture(Operation operation) {
+    Expression Texture(Operation operation) {
         const auto meta = std::get_if<MetaTexture>(&operation.GetMeta());
         ASSERT(meta);
 
@@ -1480,10 +1584,10 @@ private:
         if (meta->sampler.IsShadow()) {
             expr = "vec4(" + expr + ')';
         }
-        return expr + GetSwizzle(meta->element);
+        return {expr + GetSwizzle(meta->element), Type::Float};
     }
 
-    std::string TextureLod(Operation operation) {
+    Expression TextureLod(Operation operation) {
         const auto meta = std::get_if<MetaTexture>(&operation.GetMeta());
         ASSERT(meta);
 
@@ -1492,54 +1596,54 @@ private:
         if (meta->sampler.IsShadow()) {
             expr = "vec4(" + expr + ')';
         }
-        return expr + GetSwizzle(meta->element);
+        return {expr + GetSwizzle(meta->element), Type::Float};
     }
 
-    std::string TextureGather(Operation operation) {
+    Expression TextureGather(Operation operation) {
         const auto meta = std::get_if<MetaTexture>(&operation.GetMeta());
         ASSERT(meta);
 
         const auto type = meta->sampler.IsShadow() ? Type::Float : Type::Int;
-        return GenerateTexture(operation, "Gather",
-                               {TextureArgument{type, meta->component}, TextureAoffi{}}) +
-               GetSwizzle(meta->element);
+        return {GenerateTexture(operation, "Gather",
+                                {TextureArgument{type, meta->component}, TextureAoffi{}}) +
+                    GetSwizzle(meta->element),
+                Type::Float};
     }
 
-    std::string TextureQueryDimensions(Operation operation) {
+    Expression TextureQueryDimensions(Operation operation) {
         const auto meta = std::get_if<MetaTexture>(&operation.GetMeta());
         ASSERT(meta);
 
         const std::string sampler = GetSampler(meta->sampler);
-        const std::string lod = VisitOperand(operation, 0, Type::Int);
+        const std::string lod = VisitOperand(operation, 0).AsInt();
 
         switch (meta->element) {
         case 0:
         case 1:
-            return fmt::format("itof(int(textureSize({}, {}){}))", sampler, lod,
-                               GetSwizzle(meta->element));
-        case 2:
-            return "0";
+            return {fmt::format("textureSize({}, {}){}", sampler, lod, GetSwizzle(meta->element)),
+                    Type::Int};
         case 3:
-            return fmt::format("itof(textureQueryLevels({}))", sampler);
+            return {fmt::format("textureQueryLevels({})", sampler), Type::Int};
         }
         UNREACHABLE();
-        return "0";
+        return {"0", Type::Int};
     }
 
-    std::string TextureQueryLod(Operation operation) {
+    Expression TextureQueryLod(Operation operation) {
         const auto meta = std::get_if<MetaTexture>(&operation.GetMeta());
         ASSERT(meta);
 
         if (meta->element < 2) {
-            return fmt::format("itof(int(({} * vec2(256)){}))",
-                               GenerateTexture(operation, "QueryLod", {}),
-                               GetSwizzle(meta->element));
+            return {fmt::format("int(({} * vec2(256)){})",
+                                GenerateTexture(operation, "QueryLod", {}),
+                                GetSwizzle(meta->element)),
+                    Type::Int};
         }
-        return "0";
+        return {"0", Type::Int};
     }
 
-    std::string TexelFetch(Operation operation) {
-        constexpr std::array<const char*, 4> constructors = {"int", "ivec2", "ivec3", "ivec4"};
+    Expression TexelFetch(Operation operation) {
+        constexpr std::array constructors = {"int", "ivec2", "ivec3", "ivec4"};
         const auto meta = std::get_if<MetaTexture>(&operation.GetMeta());
         ASSERT(meta);
         UNIMPLEMENTED_IF(meta->sampler.IsArray());
@@ -1552,7 +1656,7 @@ private:
         expr += constructors.at(operation.GetOperandsCount() - 1);
         expr += '(';
         for (std::size_t i = 0; i < count; ++i) {
-            expr += VisitOperand(operation, i, Type::Int);
+            expr += VisitOperand(operation, i).AsInt();
             const std::size_t next = i + 1;
             if (next == count)
                 expr += ')';
@@ -1565,7 +1669,7 @@ private:
 
         if (meta->lod) {
             expr += ", ";
-            expr += CastOperand(Visit(meta->lod), Type::Int);
+            expr += Visit(meta->lod).AsInt();
         }
         expr += ')';
         expr += GetSwizzle(meta->element);
@@ -1580,11 +1684,11 @@ private:
         code.AddLine("float {} = {};", tmp, expr);
         code.AddLine("#endif");
 
-        return tmp;
+        return {tmp, Type::Float};
     }
 
-    std::string ImageStore(Operation operation) {
-        constexpr std::array<const char*, 4> constructors{"int(", "ivec2(", "ivec3(", "ivec4("};
+    Expression ImageStore(Operation operation) {
+        constexpr std::array constructors{"int(", "ivec2(", "ivec3(", "ivec4("};
         const auto meta{std::get<MetaImage>(operation.GetMeta())};
 
         std::string expr = "imageStore(";
@@ -1594,7 +1698,7 @@ private:
         const std::size_t coords_count{operation.GetOperandsCount()};
         expr += constructors.at(coords_count - 1);
         for (std::size_t i = 0; i < coords_count; ++i) {
-            expr += VisitOperand(operation, i, Type::Int);
+            expr += VisitOperand(operation, i).AsInt();
             if (i + 1 < coords_count) {
                 expr += ", ";
             }
@@ -1605,7 +1709,7 @@ private:
         UNIMPLEMENTED_IF(values_count != 4);
         expr += "vec4(";
         for (std::size_t i = 0; i < values_count; ++i) {
-            expr += Visit(meta.values.at(i));
+            expr += Visit(meta.values.at(i)).AsFloat();
             if (i + 1 < values_count) {
                 expr += ", ";
             }
@@ -1616,52 +1720,52 @@ private:
         return {};
     }
 
-    std::string Branch(Operation operation) {
+    Expression Branch(Operation operation) {
         const auto target = std::get_if<ImmediateNode>(&*operation[0]);
         UNIMPLEMENTED_IF(!target);
 
-        code.AddLine("jmp_to = 0x{:x}u;", target->GetValue());
+        code.AddLine("jmp_to = 0x{:X}U;", target->GetValue());
         code.AddLine("break;");
         return {};
     }
 
-    std::string BranchIndirect(Operation operation) {
-        const std::string op_a = VisitOperand(operation, 0, Type::Uint);
+    Expression BranchIndirect(Operation operation) {
+        const std::string op_a = VisitOperand(operation, 0).AsUint();
 
         code.AddLine("jmp_to = {};", op_a);
         code.AddLine("break;");
         return {};
     }
 
-    std::string PushFlowStack(Operation operation) {
+    Expression PushFlowStack(Operation operation) {
         const auto stack = std::get<MetaStackClass>(operation.GetMeta());
         const auto target = std::get_if<ImmediateNode>(&*operation[0]);
         UNIMPLEMENTED_IF(!target);
 
-        code.AddLine("{}[{}++] = 0x{:x}u;", FlowStackName(stack), FlowStackTopName(stack),
+        code.AddLine("{}[{}++] = 0x{:X}U;", FlowStackName(stack), FlowStackTopName(stack),
                      target->GetValue());
         return {};
     }
 
-    std::string PopFlowStack(Operation operation) {
+    Expression PopFlowStack(Operation operation) {
         const auto stack = std::get<MetaStackClass>(operation.GetMeta());
         code.AddLine("jmp_to = {}[--{}];", FlowStackName(stack), FlowStackTopName(stack));
         code.AddLine("break;");
         return {};
     }
 
-    std::string Exit(Operation operation) {
+    Expression Exit(Operation operation) {
         if (stage != ProgramType::Fragment) {
             code.AddLine("return;");
             return {};
         }
         const auto& used_registers = ir.GetRegisters();
-        const auto SafeGetRegister = [&](u32 reg) -> std::string {
+        const auto SafeGetRegister = [&](u32 reg) -> Expression {
             // TODO(Rodrigo): Replace with contains once C++20 releases
             if (used_registers.find(reg) != used_registers.end()) {
-                return GetRegister(reg);
+                return {GetRegister(reg), Type::Float};
             }
-            return "0.0f";
+            return {"0.0f", Type::Float};
         };
 
         UNIMPLEMENTED_IF_MSG(header.ps.omap.sample_mask != 0, "Sample mask write is unimplemented");
@@ -1674,7 +1778,7 @@ private:
             for (u32 component = 0; component < 4; ++component) {
                 if (header.ps.IsColorComponentOutputEnabled(render_target, component)) {
                     code.AddLine("FragColor{}[{}] = {};", render_target, component,
-                                 SafeGetRegister(current_reg));
+                                 SafeGetRegister(current_reg).AsFloat());
                     ++current_reg;
                 }
             }
@@ -1683,14 +1787,14 @@ private:
         if (header.ps.omap.depth) {
             // The depth output is always 2 registers after the last color output, and current_reg
             // already contains one past the last color register.
-            code.AddLine("gl_FragDepth = {};", SafeGetRegister(current_reg + 1));
+            code.AddLine("gl_FragDepth = {};", SafeGetRegister(current_reg + 1).AsFloat());
         }
 
         code.AddLine("return;");
         return {};
     }
 
-    std::string Discard(Operation operation) {
+    Expression Discard(Operation operation) {
         // Enclose "discard" in a conditional, so that GLSL compilation does not complain
         // about unexecuted instructions that may follow this.
         code.AddLine("if (true) {{");
@@ -1701,7 +1805,7 @@ private:
         return {};
     }
 
-    std::string EmitVertex(Operation operation) {
+    Expression EmitVertex(Operation operation) {
         ASSERT_MSG(stage == ProgramType::Geometry,
                    "EmitVertex is expected to be used in a geometry shader.");
 
@@ -1712,7 +1816,7 @@ private:
         return {};
     }
 
-    std::string EndPrimitive(Operation operation) {
+    Expression EndPrimitive(Operation operation) {
         ASSERT_MSG(stage == ProgramType::Geometry,
                    "EndPrimitive is expected to be used in a geometry shader.");
 
@@ -1720,19 +1824,61 @@ private:
         return {};
     }
 
-    std::string YNegate(Operation operation) {
+    Expression YNegate(Operation operation) {
         // Config pack's third value is Y_NEGATE's state.
-        return "uintBitsToFloat(config_pack[2])";
+        return {"config_pack[2]", Type::Uint};
     }
 
     template <u32 element>
-    std::string LocalInvocationId(Operation) {
-        return "utof(gl_LocalInvocationID"s + GetSwizzle(element) + ')';
+    Expression LocalInvocationId(Operation) {
+        return {"gl_LocalInvocationID"s + GetSwizzle(element), Type::Uint};
     }
 
     template <u32 element>
-    std::string WorkGroupId(Operation) {
-        return "utof(gl_WorkGroupID"s + GetSwizzle(element) + ')';
+    Expression WorkGroupId(Operation) {
+        return {"gl_WorkGroupID"s + GetSwizzle(element), Type::Uint};
+    }
+
+    Expression BallotThread(Operation operation) {
+        const std::string value = VisitOperand(operation, 0).AsBool();
+        if (!device.HasWarpIntrinsics()) {
+            LOG_ERROR(Render_OpenGL,
+                      "Nvidia warp intrinsics are not available and its required by a shader");
+            // Stub on non-Nvidia devices by simulating all threads voting the same as the active
+            // one.
+            return {fmt::format("({} ? 0xFFFFFFFFU : 0U)", value), Type::Uint};
+        }
+        return {fmt::format("ballotThreadNV({})", value), Type::Uint};
+    }
+
+    Expression Vote(Operation operation, const char* func) {
+        const std::string value = VisitOperand(operation, 0).AsBool();
+        if (!device.HasWarpIntrinsics()) {
+            LOG_ERROR(Render_OpenGL,
+                      "Nvidia vote intrinsics are not available and its required by a shader");
+            // Stub with a warp size of one.
+            return {value, Type::Bool};
+        }
+        return {fmt::format("{}({})", func, value), Type::Bool};
+    }
+
+    Expression VoteAll(Operation operation) {
+        return Vote(operation, "allThreadsNV");
+    }
+
+    Expression VoteAny(Operation operation) {
+        return Vote(operation, "anyThreadNV");
+    }
+
+    Expression VoteEqual(Operation operation) {
+        if (!device.HasWarpIntrinsics()) {
+            LOG_ERROR(Render_OpenGL,
+                      "Nvidia vote intrinsics are not available and its required by a shader");
+            // We must return true here since a stub for a theoretical warp size of 1 will always
+            // return an equal result for all its votes.
+            return {"true", Type::Bool};
+        }
+        return Vote(operation, "allThreadsEqualNV");
     }
 
     static constexpr std::array operation_decompilers = {
@@ -1885,6 +2031,11 @@ private:
         &GLSLDecompiler::WorkGroupId<0>,
         &GLSLDecompiler::WorkGroupId<1>,
         &GLSLDecompiler::WorkGroupId<2>,
+
+        &GLSLDecompiler::BallotThread,
+        &GLSLDecompiler::VoteAll,
+        &GLSLDecompiler::VoteAny,
+        &GLSLDecompiler::VoteEqual,
     };
     static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));
 
@@ -1926,8 +2077,8 @@ private:
     }
 
     std::string GetInternalFlag(InternalFlag flag) const {
-        constexpr std::array<const char*, 4> InternalFlagNames = {"zero_flag", "sign_flag",
-                                                                  "carry_flag", "overflow_flag"};
+        constexpr std::array InternalFlagNames = {"zero_flag", "sign_flag", "carry_flag",
+                                                  "overflow_flag"};
         const auto index = static_cast<u32>(flag);
         ASSERT(index < static_cast<u32>(InternalFlag::Amount));
 
@@ -1975,24 +2126,16 @@ private:
 
 std::string GetCommonDeclarations() {
     return fmt::format(
-        "#define MAX_CONSTBUFFER_ELEMENTS {}\n"
         "#define ftoi floatBitsToInt\n"
         "#define ftou floatBitsToUint\n"
         "#define itof intBitsToFloat\n"
         "#define utof uintBitsToFloat\n\n"
-        "float fromHalf2(vec2 pair) {{\n"
-        "    return utof(packHalf2x16(pair));\n"
-        "}}\n\n"
-        "vec2 toHalf2(float value) {{\n"
-        "    return unpackHalf2x16(ftou(value));\n"
-        "}}\n\n"
-        "bvec2 halfFloatNanComparison(bvec2 comparison, vec2 pair1, vec2 pair2) {{\n"
+        "bvec2 HalfFloatNanComparison(bvec2 comparison, vec2 pair1, vec2 pair2) {{\n"
         "    bvec2 is_nan1 = isnan(pair1);\n"
         "    bvec2 is_nan2 = isnan(pair2);\n"
         "    return bvec2(comparison.x || is_nan1.x || is_nan2.x, comparison.y || is_nan1.y || "
         "is_nan2.y);\n"
-        "}}\n",
-        MAX_CONSTBUFFER_ELEMENTS);
+        "}}\n\n");
 }
 
 ProgramResult Decompile(const Device& device, const ShaderIR& ir, ProgramType stage,
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp
index 408332f90..4f135fe03 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -184,6 +184,9 @@ GLint GetSwizzleSource(SwizzleSource source) {
 }
 
 void ApplyTextureDefaults(const SurfaceParams& params, GLuint texture) {
+    if (params.IsBuffer()) {
+        return;
+    }
     glTextureParameteri(texture, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
     glTextureParameteri(texture, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
     glTextureParameteri(texture, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
@@ -208,6 +211,7 @@ OGLTexture CreateTexture(const SurfaceParams& params, GLenum target, GLenum inte
         glNamedBufferStorage(texture_buffer.handle, params.width * params.GetBytesPerPixel(),
                              nullptr, GL_DYNAMIC_STORAGE_BIT);
         glTextureBuffer(texture.handle, internal_format, texture_buffer.handle);
+        break;
     case SurfaceTarget::Texture2D:
     case SurfaceTarget::TextureCubemap:
         glTextureStorage2D(texture.handle, params.emulated_levels, internal_format, params.width,
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h
index ff6ab6988..21324488a 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.h
+++ b/src/video_core/renderer_opengl/gl_texture_cache.h
@@ -51,7 +51,7 @@ public:
     }
 
 protected:
-    void DecorateSurfaceName();
+    void DecorateSurfaceName() override;
 
     View CreateView(const ViewParams& view_key) override;
     View CreateViewInner(const ViewParams& view_key, bool is_proxy);
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index a05cef3b9..af9684839 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -101,9 +101,7 @@ RendererOpenGL::RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::Syst
 
 RendererOpenGL::~RendererOpenGL() = default;
 
-void RendererOpenGL::SwapBuffers(
-    std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) {
-
+void RendererOpenGL::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
     system.GetPerfStats().EndSystemFrame();
 
     // Maintain the rasterizer's state as a priority
@@ -113,9 +111,9 @@ void RendererOpenGL::SwapBuffers(
 
     if (framebuffer) {
         // If framebuffer is provided, reload it from memory to a texture
-        if (screen_info.texture.width != (GLsizei)framebuffer->get().width ||
-            screen_info.texture.height != (GLsizei)framebuffer->get().height ||
-            screen_info.texture.pixel_format != framebuffer->get().pixel_format) {
+        if (screen_info.texture.width != static_cast<GLsizei>(framebuffer->width) ||
+            screen_info.texture.height != static_cast<GLsizei>(framebuffer->height) ||
+            screen_info.texture.pixel_format != framebuffer->pixel_format) {
             // Reallocate texture if the framebuffer size has changed.
             // This is expected to not happen very often and hence should not be a
             // performance problem.
@@ -149,43 +147,43 @@ void RendererOpenGL::SwapBuffers(
  * Loads framebuffer from emulated memory into the active OpenGL texture.
  */
 void RendererOpenGL::LoadFBToScreenInfo(const Tegra::FramebufferConfig& framebuffer) {
-    const u32 bytes_per_pixel{Tegra::FramebufferConfig::BytesPerPixel(framebuffer.pixel_format)};
-    const u64 size_in_bytes{framebuffer.stride * framebuffer.height * bytes_per_pixel};
-    const VAddr framebuffer_addr{framebuffer.address + framebuffer.offset};
-
     // Framebuffer orientation handling
     framebuffer_transform_flags = framebuffer.transform_flags;
     framebuffer_crop_rect = framebuffer.crop_rect;
 
-    // Ensure no bad interactions with GL_UNPACK_ALIGNMENT, which by default
-    // only allows rows to have a memory alignement of 4.
-    ASSERT(framebuffer.stride % 4 == 0);
-
-    if (!rasterizer->AccelerateDisplay(framebuffer, framebuffer_addr, framebuffer.stride)) {
-        // Reset the screen info's display texture to its own permanent texture
-        screen_info.display_texture = screen_info.texture.resource.handle;
-
-        rasterizer->FlushRegion(ToCacheAddr(Memory::GetPointer(framebuffer_addr)), size_in_bytes);
-
-        constexpr u32 linear_bpp = 4;
-        VideoCore::MortonCopyPixels128(VideoCore::MortonSwizzleMode::MortonToLinear,
-                                       framebuffer.width, framebuffer.height, bytes_per_pixel,
-                                       linear_bpp, Memory::GetPointer(framebuffer_addr),
-                                       gl_framebuffer_data.data());
-
-        glPixelStorei(GL_UNPACK_ROW_LENGTH, static_cast<GLint>(framebuffer.stride));
+    const VAddr framebuffer_addr{framebuffer.address + framebuffer.offset};
+    if (rasterizer->AccelerateDisplay(framebuffer, framebuffer_addr, framebuffer.stride)) {
+        return;
+    }
 
-        // Update existing texture
-        // TODO: Test what happens on hardware when you change the framebuffer dimensions so that
-        //       they differ from the LCD resolution.
-        // TODO: Applications could theoretically crash yuzu here by specifying too large
-        //       framebuffer sizes. We should make sure that this cannot happen.
-        glTextureSubImage2D(screen_info.texture.resource.handle, 0, 0, 0, framebuffer.width,
-                            framebuffer.height, screen_info.texture.gl_format,
-                            screen_info.texture.gl_type, gl_framebuffer_data.data());
+    // Reset the screen info's display texture to its own permanent texture
+    screen_info.display_texture = screen_info.texture.resource.handle;
 
-        glPixelStorei(GL_UNPACK_ROW_LENGTH, 0);
-    }
+    const auto pixel_format{
+        VideoCore::Surface::PixelFormatFromGPUPixelFormat(framebuffer.pixel_format)};
+    const u32 bytes_per_pixel{VideoCore::Surface::GetBytesPerPixel(pixel_format)};
+    const u64 size_in_bytes{framebuffer.stride * framebuffer.height * bytes_per_pixel};
+    const auto host_ptr{Memory::GetPointer(framebuffer_addr)};
+    rasterizer->FlushRegion(ToCacheAddr(host_ptr), size_in_bytes);
+
+    // TODO(Rodrigo): Read this from HLE
+    constexpr u32 block_height_log2 = 4;
+    VideoCore::MortonSwizzle(VideoCore::MortonSwizzleMode::MortonToLinear, pixel_format,
+                             framebuffer.stride, block_height_log2, framebuffer.height, 0, 1, 1,
+                             gl_framebuffer_data.data(), host_ptr);
+
+    glPixelStorei(GL_UNPACK_ROW_LENGTH, static_cast<GLint>(framebuffer.stride));
+
+    // Update existing texture
+    // TODO: Test what happens on hardware when you change the framebuffer dimensions so that
+    //       they differ from the LCD resolution.
+    // TODO: Applications could theoretically crash yuzu here by specifying too large
+    //       framebuffer sizes. We should make sure that this cannot happen.
+    glTextureSubImage2D(screen_info.texture.resource.handle, 0, 0, 0, framebuffer.width,
+                        framebuffer.height, screen_info.texture.gl_format,
+                        screen_info.texture.gl_type, gl_framebuffer_data.data());
+
+    glPixelStorei(GL_UNPACK_ROW_LENGTH, 0);
 }
 
 /**
@@ -276,22 +274,29 @@ void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture,
     texture.height = framebuffer.height;
     texture.pixel_format = framebuffer.pixel_format;
 
+    const auto pixel_format{
+        VideoCore::Surface::PixelFormatFromGPUPixelFormat(framebuffer.pixel_format)};
+    const u32 bytes_per_pixel{VideoCore::Surface::GetBytesPerPixel(pixel_format)};
+    gl_framebuffer_data.resize(texture.width * texture.height * bytes_per_pixel);
+
     GLint internal_format;
     switch (framebuffer.pixel_format) {
     case Tegra::FramebufferConfig::PixelFormat::ABGR8:
         internal_format = GL_RGBA8;
         texture.gl_format = GL_RGBA;
         texture.gl_type = GL_UNSIGNED_INT_8_8_8_8_REV;
-        gl_framebuffer_data.resize(texture.width * texture.height * 4);
+        break;
+    case Tegra::FramebufferConfig::PixelFormat::RGB565:
+        internal_format = GL_RGB565;
+        texture.gl_format = GL_RGB;
+        texture.gl_type = GL_UNSIGNED_SHORT_5_6_5;
         break;
     default:
         internal_format = GL_RGBA8;
         texture.gl_format = GL_RGBA;
         texture.gl_type = GL_UNSIGNED_INT_8_8_8_8_REV;
-        gl_framebuffer_data.resize(texture.width * texture.height * 4);
-        LOG_CRITICAL(Render_OpenGL, "Unknown framebuffer pixel format: {}",
-                     static_cast<u32>(framebuffer.pixel_format));
-        UNREACHABLE();
+        UNIMPLEMENTED_MSG("Unknown framebuffer pixel format: {}",
+                          static_cast<u32>(framebuffer.pixel_format));
     }
 
     texture.resource.Release();
diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h
index 4aebf2321..9bd086368 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.h
+++ b/src/video_core/renderer_opengl/renderer_opengl.h
@@ -43,14 +43,13 @@ struct ScreenInfo {
     TextureInfo texture;
 };
 
-class RendererOpenGL : public VideoCore::RendererBase {
+class RendererOpenGL final : public VideoCore::RendererBase {
 public:
     explicit RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::System& system);
     ~RendererOpenGL() override;
 
     /// Swap buffers (render frame)
-    void SwapBuffers(
-        std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) override;
+    void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override;
 
     /// Initialize the renderer
     bool Init() override;
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
index 24a591797..a35b45c9c 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -1072,6 +1072,26 @@ private:
         return {};
     }
 
+    Id BallotThread(Operation) { // Stub for the (bool) -> uint ballot op; not implemented yet
+        UNIMPLEMENTED();
+        return {};
+    }
+
+    Id VoteAll(Operation) { // Stub for the (bool) -> bool vote-all op; not implemented yet
+        UNIMPLEMENTED();
+        return {};
+    }
+
+    Id VoteAny(Operation) { // Stub for the (bool) -> bool vote-any op; not implemented yet
+        UNIMPLEMENTED();
+        return {};
+    }
+
+    Id VoteEqual(Operation) { // Stub for the (bool) -> bool vote-equal op; not implemented yet
+        UNIMPLEMENTED();
+        return {};
+    }
+
     Id DeclareBuiltIn(spv::BuiltIn builtin, spv::StorageClass storage, Id type,
                       const std::string& name) {
         const Id id = OpVariable(type, storage);
@@ -1364,6 +1384,11 @@ private:
         &SPIRVDecompiler::WorkGroupId<0>,
         &SPIRVDecompiler::WorkGroupId<1>,
         &SPIRVDecompiler::WorkGroupId<2>,
+
+        &SPIRVDecompiler::BallotThread,
+        &SPIRVDecompiler::VoteAll,
+        &SPIRVDecompiler::VoteAny,
+        &SPIRVDecompiler::VoteEqual,
     };
     static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));
 
diff --git a/src/video_core/shader/decode.cpp b/src/video_core/shader/decode.cpp
index b547d8323..47a9fd961 100644
--- a/src/video_core/shader/decode.cpp
+++ b/src/video_core/shader/decode.cpp
@@ -176,6 +176,7 @@ u32 ShaderIR::DecodeInstr(NodeBlock& bb, u32 pc) {
         {OpCode::Type::Ffma, &ShaderIR::DecodeFfma},
         {OpCode::Type::Hfma2, &ShaderIR::DecodeHfma2},
         {OpCode::Type::Conversion, &ShaderIR::DecodeConversion},
+        {OpCode::Type::Warp, &ShaderIR::DecodeWarp},
         {OpCode::Type::Memory, &ShaderIR::DecodeMemory},
         {OpCode::Type::Texture, &ShaderIR::DecodeTexture},
         {OpCode::Type::Image, &ShaderIR::DecodeImage},
diff --git a/src/video_core/shader/decode/conversion.cpp b/src/video_core/shader/decode/conversion.cpp
index 8973fbefa..32facd6ba 100644
--- a/src/video_core/shader/decode/conversion.cpp
+++ b/src/video_core/shader/decode/conversion.cpp
@@ -14,6 +14,12 @@ using Tegra::Shader::Instruction;
 using Tegra::Shader::OpCode;
 using Tegra::Shader::Register;
 
+namespace {
+constexpr OperationCode GetFloatSelector(u64 selector) { // 0 -> FCastHalf0, else FCastHalf1
+    return selector != 0 ? OperationCode::FCastHalf1 : OperationCode::FCastHalf0;
+}
+} // Anonymous namespace
+
 u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
     const Instruction instr = {program_code[pc]};
     const auto opcode = OpCode::Decode(instr);
@@ -22,7 +28,7 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
     case OpCode::Id::I2I_R:
     case OpCode::Id::I2I_C:
     case OpCode::Id::I2I_IMM: {
-        UNIMPLEMENTED_IF(instr.conversion.selector);
+        UNIMPLEMENTED_IF(instr.conversion.int_src.selector != 0);
         UNIMPLEMENTED_IF(instr.conversion.dst_size != Register::Size::Word);
         UNIMPLEMENTED_IF(instr.alu.saturate_d);
 
@@ -57,8 +63,8 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
     case OpCode::Id::I2F_R:
     case OpCode::Id::I2F_C:
     case OpCode::Id::I2F_IMM: {
+        UNIMPLEMENTED_IF(instr.conversion.int_src.selector != 0);
         UNIMPLEMENTED_IF(instr.conversion.dst_size == Register::Size::Long);
-        UNIMPLEMENTED_IF(instr.conversion.selector);
         UNIMPLEMENTED_IF_MSG(instr.generates_cc,
                              "Condition codes generation in I2F is not implemented");
 
@@ -113,8 +119,10 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
         }();
 
         if (instr.conversion.src_size == Register::Size::Short) {
-            // TODO: figure where extract is sey in the encoding
-            value = Operation(OperationCode::FCastHalf0, PRECISE, value);
+            value = Operation(GetFloatSelector(instr.conversion.float_src.selector), NO_PRECISE,
+                              std::move(value));
+        } else {
+            ASSERT(instr.conversion.float_src.selector == 0);
         }
 
         value = GetOperandAbsNegFloat(value, instr.conversion.abs_a, instr.conversion.negate_a);
@@ -169,8 +177,10 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
         }();
 
         if (instr.conversion.src_size == Register::Size::Short) {
-            // TODO: figure where extract is sey in the encoding
-            value = Operation(OperationCode::FCastHalf0, PRECISE, value);
+            value = Operation(GetFloatSelector(instr.conversion.float_src.selector), NO_PRECISE,
+                              std::move(value));
+        } else {
+            ASSERT(instr.conversion.float_src.selector == 0);
         }
 
         value = GetOperandAbsNegFloat(value, instr.conversion.abs_a, instr.conversion.negate_a);
diff --git a/src/video_core/shader/decode/float_set.cpp b/src/video_core/shader/decode/float_set.cpp
index f5013e44a..5614e8a0d 100644
--- a/src/video_core/shader/decode/float_set.cpp
+++ b/src/video_core/shader/decode/float_set.cpp
@@ -15,7 +15,6 @@ using Tegra::Shader::OpCode;
 
 u32 ShaderIR::DecodeFloatSet(NodeBlock& bb, u32 pc) {
     const Instruction instr = {program_code[pc]};
-    const auto opcode = OpCode::Decode(instr);
 
     const Node op_a = GetOperandAbsNegFloat(GetRegister(instr.gpr8), instr.fset.abs_a != 0,
                                             instr.fset.neg_a != 0);
diff --git a/src/video_core/shader/decode/float_set_predicate.cpp b/src/video_core/shader/decode/float_set_predicate.cpp
index 2323052b0..200c2c983 100644
--- a/src/video_core/shader/decode/float_set_predicate.cpp
+++ b/src/video_core/shader/decode/float_set_predicate.cpp
@@ -16,10 +16,9 @@ using Tegra::Shader::Pred;
 
 u32 ShaderIR::DecodeFloatSetPredicate(NodeBlock& bb, u32 pc) {
     const Instruction instr = {program_code[pc]};
-    const auto opcode = OpCode::Decode(instr);
 
-    const Node op_a = GetOperandAbsNegFloat(GetRegister(instr.gpr8), instr.fsetp.abs_a != 0,
-                                            instr.fsetp.neg_a != 0);
+    Node op_a = GetOperandAbsNegFloat(GetRegister(instr.gpr8), instr.fsetp.abs_a != 0,
+                                      instr.fsetp.neg_a != 0);
     Node op_b = [&]() {
         if (instr.is_b_imm) {
             return GetImmediate19(instr);
@@ -29,12 +28,13 @@ u32 ShaderIR::DecodeFloatSetPredicate(NodeBlock& bb, u32 pc) {
             return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset());
         }
     }();
-    op_b = GetOperandAbsNegFloat(op_b, instr.fsetp.abs_b, false);
+    op_b = GetOperandAbsNegFloat(std::move(op_b), instr.fsetp.abs_b, instr.fsetp.neg_b);
 
     // We can't use the constant predicate as destination.
     ASSERT(instr.fsetp.pred3 != static_cast<u64>(Pred::UnusedIndex));
 
-    const Node predicate = GetPredicateComparisonFloat(instr.fsetp.cond, op_a, op_b);
+    const Node predicate =
+        GetPredicateComparisonFloat(instr.fsetp.cond, std::move(op_a), std::move(op_b));
     const Node second_pred = GetPredicate(instr.fsetp.pred39, instr.fsetp.neg_pred != 0);
 
     const OperationCode combiner = GetPredicateCombiner(instr.fsetp.op);
diff --git a/src/video_core/shader/decode/half_set_predicate.cpp b/src/video_core/shader/decode/half_set_predicate.cpp
index afea33e5f..840694527 100644
--- a/src/video_core/shader/decode/half_set_predicate.cpp
+++ b/src/video_core/shader/decode/half_set_predicate.cpp
@@ -42,9 +42,8 @@ u32 ShaderIR::DecodeHalfSetPredicate(NodeBlock& bb, u32 pc) {
         cond = instr.hsetp2.reg.cond;
         h_and = instr.hsetp2.reg.h_and;
         op_b =
-            UnpackHalfFloat(GetOperandAbsNegHalf(GetRegister(instr.gpr20), instr.hsetp2.reg.abs_b,
-                                                 instr.hsetp2.reg.negate_b),
-                            instr.hsetp2.reg.type_b);
+            GetOperandAbsNegHalf(UnpackHalfFloat(GetRegister(instr.gpr20), instr.hsetp2.reg.type_b),
+                                 instr.hsetp2.reg.abs_b, instr.hsetp2.reg.negate_b);
         break;
     default:
         UNREACHABLE();
@@ -52,22 +51,22 @@ u32 ShaderIR::DecodeHalfSetPredicate(NodeBlock& bb, u32 pc) {
     }
 
     const OperationCode combiner = GetPredicateCombiner(instr.hsetp2.op);
-    const Node combined_pred = GetPredicate(instr.hsetp2.pred3, instr.hsetp2.neg_pred);
+    const Node combined_pred = GetPredicate(instr.hsetp2.pred39, instr.hsetp2.neg_pred);
 
     const auto Write = [&](u64 dest, Node src) {
         SetPredicate(bb, dest, Operation(combiner, std::move(src), combined_pred));
     };
 
     const Node comparison = GetPredicateComparisonHalf(cond, op_a, op_b);
-    const u64 first = instr.hsetp2.pred0;
-    const u64 second = instr.hsetp2.pred39;
+    const u64 first = instr.hsetp2.pred3;
+    const u64 second = instr.hsetp2.pred0;
     if (h_and) {
-        const Node joined = Operation(OperationCode::LogicalAnd2, comparison);
+        Node joined = Operation(OperationCode::LogicalAnd2, comparison);
         Write(first, joined);
-        Write(second, Operation(OperationCode::LogicalNegate, joined));
+        Write(second, Operation(OperationCode::LogicalNegate, std::move(joined)));
     } else {
-        Write(first, Operation(OperationCode::LogicalPick2, comparison, Immediate(0u)));
-        Write(second, Operation(OperationCode::LogicalPick2, comparison, Immediate(1u)));
+        Write(first, Operation(OperationCode::LogicalPick2, comparison, Immediate(0U)));
+        Write(second, Operation(OperationCode::LogicalPick2, comparison, Immediate(1U)));
     }
 
     return pc;
diff --git a/src/video_core/shader/decode/integer_set.cpp b/src/video_core/shader/decode/integer_set.cpp
index 46e3d5905..59809bcd8 100644
--- a/src/video_core/shader/decode/integer_set.cpp
+++ b/src/video_core/shader/decode/integer_set.cpp
@@ -14,7 +14,6 @@ using Tegra::Shader::OpCode;
 
 u32 ShaderIR::DecodeIntegerSet(NodeBlock& bb, u32 pc) {
     const Instruction instr = {program_code[pc]};
-    const auto opcode = OpCode::Decode(instr);
 
     const Node op_a = GetRegister(instr.gpr8);
     const Node op_b = [&]() {
diff --git a/src/video_core/shader/decode/integer_set_predicate.cpp b/src/video_core/shader/decode/integer_set_predicate.cpp
index dd20775d7..25e48fef8 100644
--- a/src/video_core/shader/decode/integer_set_predicate.cpp
+++ b/src/video_core/shader/decode/integer_set_predicate.cpp
@@ -16,7 +16,6 @@ using Tegra::Shader::Pred;
 
 u32 ShaderIR::DecodeIntegerSetPredicate(NodeBlock& bb, u32 pc) {
     const Instruction instr = {program_code[pc]};
-    const auto opcode = OpCode::Decode(instr);
 
     const Node op_a = GetRegister(instr.gpr8);
 
diff --git a/src/video_core/shader/decode/other.cpp b/src/video_core/shader/decode/other.cpp
index ac0e764d6..d46e0f823 100644
--- a/src/video_core/shader/decode/other.cpp
+++ b/src/video_core/shader/decode/other.cpp
@@ -74,6 +74,13 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
             case SystemVariable::InvocationInfo:
                 LOG_WARNING(HW_GPU, "MOV_SYS instruction with InvocationInfo is incomplete");
                 return Immediate(0u);
+            case SystemVariable::Tid: {
+                Node value = Immediate(0);
+                value = BitfieldInsert(value, Operation(OperationCode::LocalInvocationIdX), 0, 9);
+                value = BitfieldInsert(value, Operation(OperationCode::LocalInvocationIdY), 16, 9);
+                value = BitfieldInsert(value, Operation(OperationCode::LocalInvocationIdZ), 26, 5);
+                return value;
+            }
             case SystemVariable::TidX:
                 return Operation(OperationCode::LocalInvocationIdX);
             case SystemVariable::TidY:
diff --git a/src/video_core/shader/decode/predicate_set_register.cpp b/src/video_core/shader/decode/predicate_set_register.cpp
index febbfeb50..84dbc50fe 100644
--- a/src/video_core/shader/decode/predicate_set_register.cpp
+++ b/src/video_core/shader/decode/predicate_set_register.cpp
@@ -15,7 +15,6 @@ using Tegra::Shader::OpCode;
 
 u32 ShaderIR::DecodePredicateSetRegister(NodeBlock& bb, u32 pc) {
     const Instruction instr = {program_code[pc]};
-    const auto opcode = OpCode::Decode(instr);
 
     UNIMPLEMENTED_IF_MSG(instr.generates_cc,
                          "Condition codes generation in PSET is not implemented");
diff --git a/src/video_core/shader/decode/warp.cpp b/src/video_core/shader/decode/warp.cpp
new file mode 100644
index 000000000..04ca74f46
--- /dev/null
+++ b/src/video_core/shader/decode/warp.cpp
@@ -0,0 +1,55 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/assert.h"
+#include "common/common_types.h"
+#include "video_core/engines/shader_bytecode.h"
+#include "video_core/shader/node_helper.h"
+#include "video_core/shader/shader_ir.h"
+
+namespace VideoCommon::Shader {
+
+using Tegra::Shader::Instruction;
+using Tegra::Shader::OpCode;
+using Tegra::Shader::Pred;
+using Tegra::Shader::VoteOperation;
+
+namespace {
+OperationCode GetOperationCode(VoteOperation vote_op) { // VOTE mode -> IR operation code
+    if (vote_op == VoteOperation::All) {
+        return OperationCode::VoteAll;
+    }
+    if (vote_op == VoteOperation::Any) {
+        return OperationCode::VoteAny;
+    }
+    if (vote_op == VoteOperation::Eq) {
+        return OperationCode::VoteEqual;
+    }
+    UNREACHABLE_MSG("Invalid vote operation={}", static_cast<u64>(vote_op));
+    return OperationCode::VoteAll; // Fallback so every path returns a value
+}
+} // Anonymous namespace
+
+u32 ShaderIR::DecodeWarp(NodeBlock& bb, u32 pc) {
+    // Decodes the warp instruction at pc; only VOTE is handled, anything else is reported.
+    const Instruction instr = {program_code[pc]};
+    const auto opcode = OpCode::Decode(instr);
+    switch (opcode->get().GetId()) {
+    case OpCode::Id::VOTE: {
+        const bool negated = instr.vote.negate_value != 0;
+        const Node predicate = GetPredicate(instr.vote.value, negated);
+        const Node ballot = Operation(OperationCode::BallotThread, predicate);
+        const Node result = Operation(GetOperationCode(instr.vote.operation), predicate);
+        SetRegister(bb, instr.gpr0, ballot);
+        SetPredicate(bb, instr.vote.dest_pred, result);
+        break;
+    }
+    default:
+        UNIMPLEMENTED_MSG("Unhandled warp instruction: {}", opcode->get().GetName());
+        break;
+    }
+    return pc;
+}
+
+} // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h
index 5f0852364..5db9313c4 100644
--- a/src/video_core/shader/node.h
+++ b/src/video_core/shader/node.h
@@ -168,6 +168,11 @@ enum class OperationCode {
     WorkGroupIdY,       /// () -> uint
     WorkGroupIdZ,       /// () -> uint
 
+    BallotThread, /// (bool) -> uint
+    VoteAll,      /// (bool) -> bool
+    VoteAny,      /// (bool) -> bool
+    VoteEqual,    /// (bool) -> bool
+
     Amount,
 };
 
diff --git a/src/video_core/shader/shader_ir.cpp b/src/video_core/shader/shader_ir.cpp
index 5e91fe129..1e5c7f660 100644
--- a/src/video_core/shader/shader_ir.cpp
+++ b/src/video_core/shader/shader_ir.cpp
@@ -405,4 +405,9 @@ Node ShaderIR::BitfieldExtract(Node value, u32 offset, u32 bits) {
                      Immediate(offset), Immediate(bits));
 }
 
+Node ShaderIR::BitfieldInsert(Node base, Node insert, u32 offset, u32 bits) {
+    return Operation(OperationCode::UBitfieldInsert, NO_PRECISE, base, insert, Immediate(offset),
+                     Immediate(bits)); // offset and bits are passed as immediate operands
+}
+
 } // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h
index 59a083d90..bcc9b79b6 100644
--- a/src/video_core/shader/shader_ir.h
+++ b/src/video_core/shader/shader_ir.h
@@ -167,6 +167,7 @@ private:
     u32 DecodeFfma(NodeBlock& bb, u32 pc);
     u32 DecodeHfma2(NodeBlock& bb, u32 pc);
     u32 DecodeConversion(NodeBlock& bb, u32 pc);
+    u32 DecodeWarp(NodeBlock& bb, u32 pc);
     u32 DecodeMemory(NodeBlock& bb, u32 pc);
     u32 DecodeTexture(NodeBlock& bb, u32 pc);
     u32 DecodeImage(NodeBlock& bb, u32 pc);
@@ -279,6 +280,9 @@ private:
     /// Extracts a sequence of bits from a node
     Node BitfieldExtract(Node value, u32 offset, u32 bits);
 
+    /// Inserts a sequence of bits from one node (insert) into another node (base)
+    Node BitfieldInsert(Node base, Node insert, u32 offset, u32 bits);
+
     void WriteTexInstructionFloat(NodeBlock& bb, Tegra::Shader::Instruction instr,
                                   const Node4& components);
 
diff --git a/src/video_core/surface.cpp b/src/video_core/surface.cpp
index c50f6354d..4ceb219be 100644
--- a/src/video_core/surface.cpp
+++ b/src/video_core/surface.cpp
@@ -445,11 +445,12 @@ PixelFormat PixelFormatFromGPUPixelFormat(Tegra::FramebufferConfig::PixelFormat
     switch (format) {
     case Tegra::FramebufferConfig::PixelFormat::ABGR8:
         return PixelFormat::ABGR8U;
+    case Tegra::FramebufferConfig::PixelFormat::RGB565:
+        return PixelFormat::B5G6R5U;
     case Tegra::FramebufferConfig::PixelFormat::BGRA8:
         return PixelFormat::BGRA8;
     default:
-        LOG_CRITICAL(HW_GPU, "Unimplemented format={}", static_cast<u32>(format));
-        UNREACHABLE();
+        UNIMPLEMENTED_MSG("Unimplemented format={}", static_cast<u32>(format));
         return PixelFormat::ABGR8U;
     }
 }
diff --git a/src/video_core/texture_cache/surface_params.h b/src/video_core/texture_cache/surface_params.h
index 358d6757c..e7ef66ee2 100644
--- a/src/video_core/texture_cache/surface_params.h
+++ b/src/video_core/texture_cache/surface_params.h
@@ -58,7 +58,6 @@ public:
     std::size_t GetHostSizeInBytes() const {
         std::size_t host_size_in_bytes;
         if (GetCompressionType() == SurfaceCompression::Converted) {
-            constexpr std::size_t rgb8_bpp = 4ULL;
             // ASTC is uncompressed in software, in emulated as RGBA8
             host_size_in_bytes = 0;
             for (u32 level = 0; level < num_levels; ++level) {
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index a3a3770a7..2ec0203d1 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -308,8 +308,6 @@ protected:
         if (!guard_render_targets && surface->IsRenderTarget()) {
             ManageRenderTargetUnregister(surface);
         }
-        const GPUVAddr gpu_addr = surface->GetGpuAddr();
-        const CacheAddr cache_ptr = surface->GetCacheAddr();
         const std::size_t size = surface->GetSizeInBytes();
         const VAddr cpu_addr = surface->GetCpuAddr();
         rasterizer.UpdatePagesCachedCount(cpu_addr, size, -1);
diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp
index 7e8295944..7df5f1452 100644
--- a/src/video_core/textures/decoders.cpp
+++ b/src/video_core/textures/decoders.cpp
@@ -257,19 +257,21 @@ std::vector<u8> UnswizzleTexture(u8* address, u32 tile_size_x, u32 tile_size_y,
 
 void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width,
                     u32 bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data,
-                    u32 block_height_bit) {
+                    u32 block_height_bit, u32 offset_x, u32 offset_y) {
     const u32 block_height = 1U << block_height_bit;
     const u32 image_width_in_gobs{(swizzled_width * bytes_per_pixel + (gob_size_x - 1)) /
                                   gob_size_x};
     for (u32 line = 0; line < subrect_height; ++line) {
+        const u32 dst_y = line + offset_y;
         const u32 gob_address_y =
-            (line / (gob_size_y * block_height)) * gob_size * block_height * image_width_in_gobs +
-            ((line % (gob_size_y * block_height)) / gob_size_y) * gob_size;
-        const auto& table = legacy_swizzle_table[line % gob_size_y];
+            (dst_y / (gob_size_y * block_height)) * gob_size * block_height * image_width_in_gobs +
+            ((dst_y % (gob_size_y * block_height)) / gob_size_y) * gob_size;
+        const auto& table = legacy_swizzle_table[dst_y % gob_size_y];
         for (u32 x = 0; x < subrect_width; ++x) {
+            const u32 dst_x = x + offset_x;
             const u32 gob_address =
-                gob_address_y + (x * bytes_per_pixel / gob_size_x) * gob_size * block_height;
-            const u32 swizzled_offset = gob_address + table[(x * bytes_per_pixel) % gob_size_x];
+                gob_address_y + (dst_x * bytes_per_pixel / gob_size_x) * gob_size * block_height;
+            const u32 swizzled_offset = gob_address + table[(dst_x * bytes_per_pixel) % gob_size_x];
             u8* source_line = unswizzled_data + line * source_pitch + x * bytes_per_pixel;
             u8* dest_addr = swizzled_data + swizzled_offset;
 
diff --git a/src/video_core/textures/decoders.h b/src/video_core/textures/decoders.h
index eaec9b5a5..f1e3952bc 100644
--- a/src/video_core/textures/decoders.h
+++ b/src/video_core/textures/decoders.h
@@ -44,7 +44,8 @@ std::size_t CalculateSize(bool tiled, u32 bytes_per_pixel, u32 width, u32 height
 
 /// Copies an untiled subrectangle into a tiled surface.
 void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width,
-                    u32 bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, u32 block_height);
+                    u32 bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, u32 block_height,
+                    u32 offset_x, u32 offset_y);
 
 /// Copies a tiled subrectangle into a linear surface.
 void UnswizzleSubrect(u32 subrect_width, u32 subrect_height, u32 dest_pitch, u32 swizzled_width,
diff --git a/src/video_core/textures/texture.h b/src/video_core/textures/texture.h
index e3be018b9..e36bc2c04 100644
--- a/src/video_core/textures/texture.h
+++ b/src/video_core/textures/texture.h
@@ -213,7 +213,7 @@ struct TICEntry {
         if (header_version != TICHeaderVersion::OneDBuffer) {
             return width_minus_1 + 1;
         }
-        return (buffer_high_width_minus_one << 16) | buffer_low_width_minus_one;
+        return ((buffer_high_width_minus_one << 16) | buffer_low_width_minus_one) + 1;
     }
 
     u32 Height() const {