49 files changed, 953 insertions, 408 deletions
diff --git a/CMakeModules/GenerateSCMRev.cmake b/CMakeModules/GenerateSCMRev.cmake
index abdc74428..a1ace89cb 100644
--- a/CMakeModules/GenerateSCMRev.cmake
+++ b/CMakeModules/GenerateSCMRev.cmake
@@ -81,6 +81,7 @@ set(HASH_FILES
     "${VIDEO_CORE}/shader/decode/register_set_predicate.cpp"
     "${VIDEO_CORE}/shader/decode/shift.cpp"
     "${VIDEO_CORE}/shader/decode/video.cpp"
+    "${VIDEO_CORE}/shader/decode/warp.cpp"
     "${VIDEO_CORE}/shader/decode/xmad.cpp"
     "${VIDEO_CORE}/shader/control_flow.cpp"
     "${VIDEO_CORE}/shader/control_flow.h"
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
index 2b4266f29..01abdb3bb 100644
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -55,6 +55,7 @@ add_custom_command(OUTPUT scm_rev.cpp
       "${VIDEO_CORE}/shader/decode/register_set_predicate.cpp"
       "${VIDEO_CORE}/shader/decode/shift.cpp"
       "${VIDEO_CORE}/shader/decode/video.cpp"
+      "${VIDEO_CORE}/shader/decode/warp.cpp"
       "${VIDEO_CORE}/shader/decode/xmad.cpp"
       "${VIDEO_CORE}/shader/control_flow.cpp"
       "${VIDEO_CORE}/shader/control_flow.h"
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 7c18c27b3..e2f85c5f1 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -1,5 +1,7 @@
 add_library(video_core STATIC
-    buffer_cache.h
+    buffer_cache/buffer_block.h
+    buffer_cache/buffer_cache.h
+    buffer_cache/map_interval.h
     dma_pusher.cpp
     dma_pusher.h
     debug_utils/debug_utils.cpp
@@ -100,6 +102,7 @@ add_library(video_core STATIC
     shader/decode/integer_set.cpp
     shader/decode/half_set.cpp
     shader/decode/video.cpp
+    shader/decode/warp.cpp
     shader/decode/xmad.cpp
     shader/decode/other.cpp
     shader/control_flow.cpp
diff --git a/src/video_core/buffer_cache.h b/src/video_core/buffer_cache.h
deleted file mode 100644
index 6f868b8b4..000000000
--- a/src/video_core/buffer_cache.h
+++ /dev/null
@@ -1,299 +0,0 @@
-// Copyright 2019 yuzu Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#pragma once
-
-#include <array>
-#include <memory>
-#include <mutex>
-#include <unordered_map>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-
-#include "common/alignment.h"
-#include "common/common_types.h"
-#include "core/core.h"
-#include "video_core/memory_manager.h"
-#include "video_core/rasterizer_cache.h"
-
-namespace VideoCore {
-class RasterizerInterface;
-}
-
-namespace VideoCommon {
-
-template <typename BufferStorageType>
-class CachedBuffer final : public RasterizerCacheObject {
-public:
-    explicit CachedBuffer(VAddr cpu_addr, u8* host_ptr)
-        : RasterizerCacheObject{host_ptr}, host_ptr{host_ptr}, cpu_addr{cpu_addr} {}
-    ~CachedBuffer() override = default;
-
-    VAddr GetCpuAddr() const override {
-        return cpu_addr;
-    }
-
-    std::size_t GetSizeInBytes() const override {
-        return size;
-    }
-
-    u8* GetWritableHostPtr() const {
-        return host_ptr;
-    }
-
-    std::size_t GetSize() const {
-        return size;
-    }
-
-    std::size_t GetCapacity() const {
-        return capacity;
-    }
-
-    bool IsInternalized() const {
-        return is_internal;
-    }
-
-    const BufferStorageType& GetBuffer() const {
-        return buffer;
-    }
-
-    void SetSize(std::size_t new_size) {
-        size = new_size;
-    }
-
-    void SetInternalState(bool is_internal_) {
-        is_internal = is_internal_;
-    }
-
-    BufferStorageType ExchangeBuffer(BufferStorageType buffer_, std::size_t new_capacity) {
-        capacity = new_capacity;
-        std::swap(buffer, buffer_);
-        return buffer_;
-    }
-
-private:
-    u8* host_ptr{};
-    VAddr cpu_addr{};
-    std::size_t size{};
-    std::size_t capacity{};
-    bool is_internal{};
-    BufferStorageType buffer;
-};
-
-template <typename BufferStorageType, typename BufferType, typename StreamBuffer>
-class BufferCache : public RasterizerCache<std::shared_ptr<CachedBuffer<BufferStorageType>>> {
-public:
-    using Buffer = std::shared_ptr<CachedBuffer<BufferStorageType>>;
-    using BufferInfo = std::pair<const BufferType*, u64>;
-
-    explicit BufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system,
-                         std::unique_ptr<StreamBuffer> stream_buffer)
-        : RasterizerCache<Buffer>{rasterizer}, system{system},
-          stream_buffer{std::move(stream_buffer)}, stream_buffer_handle{
-                                                       this->stream_buffer->GetHandle()} {}
-    ~BufferCache() = default;
-
-    void Unregister(const Buffer& entry) override {
-        std::lock_guard lock{RasterizerCache<Buffer>::mutex};
-        if (entry->IsInternalized()) {
-            internalized_entries.erase(entry->GetCacheAddr());
-        }
-        ReserveBuffer(entry);
-        RasterizerCache<Buffer>::Unregister(entry);
-    }
-
-    void TickFrame() {
-        marked_for_destruction_index =
-            (marked_for_destruction_index + 1) % marked_for_destruction_ring_buffer.size();
-        MarkedForDestruction().clear();
-    }
-
-    BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4,
-                            bool internalize = false, bool is_written = false) {
-        std::lock_guard lock{RasterizerCache<Buffer>::mutex};
-
-        auto& memory_manager = system.GPU().MemoryManager();
-        const auto host_ptr = memory_manager.GetPointer(gpu_addr);
-        if (!host_ptr) {
-            return {GetEmptyBuffer(size), 0};
-        }
-        const auto cache_addr = ToCacheAddr(host_ptr);
-
-        // Cache management is a big overhead, so only cache entries with a given size.
-        // TODO: Figure out which size is the best for given games.
-        constexpr std::size_t max_stream_size = 0x800;
-        if (!internalize && size < max_stream_size &&
-            internalized_entries.find(cache_addr) == internalized_entries.end()) {
-            return StreamBufferUpload(host_ptr, size, alignment);
-        }
-
-        auto entry = RasterizerCache<Buffer>::TryGet(cache_addr);
-        if (!entry) {
-            return FixedBufferUpload(gpu_addr, host_ptr, size, internalize, is_written);
-        }
-
-        if (entry->GetSize() < size) {
-            IncreaseBufferSize(entry, size);
-        }
-        if (is_written) {
-            entry->MarkAsModified(true, *this);
-        }
-        return {ToHandle(entry->GetBuffer()), 0};
-    }
-
-    /// Uploads from a host memory. Returns the OpenGL buffer where it's located and its offset.
-    BufferInfo UploadHostMemory(const void* raw_pointer, std::size_t size,
-                                std::size_t alignment = 4) {
-        std::lock_guard lock{RasterizerCache<Buffer>::mutex};
-        return StreamBufferUpload(raw_pointer, size, alignment);
-    }
-
-    void Map(std::size_t max_size) {
-        std::tie(buffer_ptr, buffer_offset_base, invalidated) = stream_buffer->Map(max_size, 4);
-        buffer_offset = buffer_offset_base;
-    }
-
-    /// Finishes the upload stream, returns true on bindings invalidation.
-    bool Unmap() {
-        stream_buffer->Unmap(buffer_offset - buffer_offset_base);
-        return std::exchange(invalidated, false);
-    }
-
-    virtual const BufferType* GetEmptyBuffer(std::size_t size) = 0;
-
-protected:
-    void FlushObjectInner(const Buffer& entry) override {
-        DownloadBufferData(entry->GetBuffer(), 0, entry->GetSize(), entry->GetWritableHostPtr());
-    }
-
-    virtual BufferStorageType CreateBuffer(std::size_t size) = 0;
-
-    virtual const BufferType* ToHandle(const BufferStorageType& storage) = 0;
-
-    virtual void UploadBufferData(const BufferStorageType& buffer, std::size_t offset,
-                                  std::size_t size, const u8* data) = 0;
-
-    virtual void DownloadBufferData(const BufferStorageType& buffer, std::size_t offset,
-                                    std::size_t size, u8* data) = 0;
-
-    virtual void CopyBufferData(const BufferStorageType& src, const BufferStorageType& dst,
-                                std::size_t src_offset, std::size_t dst_offset,
-                                std::size_t size) = 0;
-
-private:
-    BufferInfo StreamBufferUpload(const void* raw_pointer, std::size_t size,
-                                  std::size_t alignment) {
-        AlignBuffer(alignment);
-        const std::size_t uploaded_offset = buffer_offset;
-        std::memcpy(buffer_ptr, raw_pointer, size);
-
-        buffer_ptr += size;
-        buffer_offset += size;
-        return {&stream_buffer_handle, uploaded_offset};
-    }
-
-    BufferInfo FixedBufferUpload(GPUVAddr gpu_addr, u8* host_ptr, std::size_t size,
-                                 bool internalize, bool is_written) {
-        auto& memory_manager = Core::System::GetInstance().GPU().MemoryManager();
-        const auto cpu_addr = memory_manager.GpuToCpuAddress(gpu_addr);
-        ASSERT(cpu_addr);
-
-        auto entry = GetUncachedBuffer(*cpu_addr, host_ptr);
-        entry->SetSize(size);
-        entry->SetInternalState(internalize);
-        RasterizerCache<Buffer>::Register(entry);
-
-        if (internalize) {
-            internalized_entries.emplace(ToCacheAddr(host_ptr));
-        }
-        if (is_written) {
-            entry->MarkAsModified(true, *this);
-        }
-
-        if (entry->GetCapacity() < size) {
-            MarkedForDestruction().push_back(entry->ExchangeBuffer(CreateBuffer(size), size));
-        }
-
-        UploadBufferData(entry->GetBuffer(), 0, size, host_ptr);
-        return {ToHandle(entry->GetBuffer()), 0};
-    }
-
-    void IncreaseBufferSize(Buffer& entry, std::size_t new_size) {
-        const std::size_t old_size = entry->GetSize();
-        if (entry->GetCapacity() < new_size) {
-            const auto& old_buffer = entry->GetBuffer();
-            auto new_buffer = CreateBuffer(new_size);
-
-            // Copy bits from the old buffer to the new buffer.
-            CopyBufferData(old_buffer, new_buffer, 0, 0, old_size);
-            MarkedForDestruction().push_back(
-                entry->ExchangeBuffer(std::move(new_buffer), new_size));
-
-            // This buffer could have been used
-            invalidated = true;
-        }
-        // Upload the new bits.
-        const std::size_t size_diff = new_size - old_size;
-        UploadBufferData(entry->GetBuffer(), old_size, size_diff, entry->GetHostPtr() + old_size);
-
-        // Update entry's size in the object and in the cache.
-        Unregister(entry);
-
-        entry->SetSize(new_size);
-        RasterizerCache<Buffer>::Register(entry);
-    }
-
-    Buffer GetUncachedBuffer(VAddr cpu_addr, u8* host_ptr) {
-        if (auto entry = TryGetReservedBuffer(host_ptr)) {
-            return entry;
-        }
-        return std::make_shared<CachedBuffer<BufferStorageType>>(cpu_addr, host_ptr);
-    }
-
-    Buffer TryGetReservedBuffer(u8* host_ptr) {
-        const auto it = buffer_reserve.find(ToCacheAddr(host_ptr));
-        if (it == buffer_reserve.end()) {
-            return {};
-        }
-        auto& reserve = it->second;
-        auto entry = reserve.back();
-        reserve.pop_back();
-        return entry;
-    }
-
-    void ReserveBuffer(Buffer entry) {
-        buffer_reserve[entry->GetCacheAddr()].push_back(std::move(entry));
-    }
-
-    void AlignBuffer(std::size_t alignment) {
-        // Align the offset, not the mapped pointer
-        const std::size_t offset_aligned = Common::AlignUp(buffer_offset, alignment);
-        buffer_ptr += offset_aligned - buffer_offset;
-        buffer_offset = offset_aligned;
-    }
-
-    std::vector<BufferStorageType>& MarkedForDestruction() {
-        return marked_for_destruction_ring_buffer[marked_for_destruction_index];
-    }
-
-    Core::System& system;
-
-    std::unique_ptr<StreamBuffer> stream_buffer;
-    BufferType stream_buffer_handle{};
-
-    bool invalidated = false;
-
-    u8* buffer_ptr = nullptr;
-    u64 buffer_offset = 0;
-    u64 buffer_offset_base = 0;
-
-    std::size_t marked_for_destruction_index = 0;
-    std::array<std::vector<BufferStorageType>, 4> marked_for_destruction_ring_buffer;
-
-    std::unordered_set<CacheAddr> internalized_entries;
-    std::unordered_map<CacheAddr, std::vector<Buffer>> buffer_reserve;
-};
-
-} // namespace VideoCommon
diff --git a/src/video_core/buffer_cache/buffer_block.h b/src/video_core/buffer_cache/buffer_block.h
new file mode 100644
index 000000000..4b9193182
--- /dev/null
+++ b/src/video_core/buffer_cache/buffer_block.h
@@ -0,0 +1,90 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <unordered_set>
+#include <utility>
+
+#include "common/alignment.h"
+#include "common/common_types.h"
+#include "video_core/gpu.h"
+
+namespace VideoCommon {
+
+/// Address-range bookkeeping for one cached buffer: [cache_addr, cache_addr + size).
+/// Construction and destruction are protected, so it is only usable as a base class.
+class BufferBlock {
+public:
+    /// Returns true when [start, end) intersects this block's range.
+    bool Overlaps(const CacheAddr start, const CacheAddr end) const {
+        return (cache_addr < end) && (cache_addr_end > start);
+    }
+
+    /// Returns true when [other_start, other_end] lies entirely within this block's range.
+    bool IsInside(const CacheAddr other_start, const CacheAddr other_end) const {
+        return cache_addr <= other_start && other_end <= cache_addr_end;
+    }
+
+    /// Returns the pointer FromCacheAddr yields for the start of the block.
+    u8* GetWritableHostPtr() const {
+        return FromCacheAddr(cache_addr);
+    }
+
+    /// Returns the pointer FromCacheAddr yields `offset` bytes past the start of the block.
+    u8* GetWritableHostPtr(std::size_t offset) const {
+        return FromCacheAddr(cache_addr + offset);
+    }
+
+    /// Returns the distance of `in_addr` from the start of the block; no bounds check is made.
+    std::size_t GetOffset(const CacheAddr in_addr) const {
+        return static_cast<std::size_t>(in_addr - cache_addr);
+    }
+
+    /// Returns the first address covered by the block.
+    CacheAddr GetCacheAddr() const {
+        return cache_addr;
+    }
+
+    /// Returns the address one past the last byte covered by the block.
+    CacheAddr GetCacheAddrEnd() const {
+        return cache_addr_end;
+    }
+
+    /// Moves the block to `new_addr`; the end address is recomputed from the stored size.
+    void SetCacheAddr(const CacheAddr new_addr) {
+        cache_addr = new_addr;
+        cache_addr_end = new_addr + size;
+    }
+
+    /// Returns the size in bytes given at construction.
+    std::size_t GetSize() const {
+        return size;
+    }
+
+    /// Stores an epoch value on the block.
+    void SetEpoch(u64 new_epoch) {
+        epoch = new_epoch;
+    }
+
+    /// Returns the epoch last stored with SetEpoch (zero if never set).
+    u64 GetEpoch() const {
+        return epoch;
+    }
+
+protected:
+    /// `size` is initialized first because SetCacheAddr derives the end address from it.
+    explicit BufferBlock(CacheAddr cache_addr, const std::size_t size) : size{size} {
+        SetCacheAddr(cache_addr);
+    }
+    ~BufferBlock() = default;
+
+private:
+    CacheAddr cache_addr{};
+    CacheAddr cache_addr_end{};
+    std::size_t size{};
+    u64 epoch{};
+};
+
+} // namespace VideoCommon
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
new file mode 100644
index 000000000..2442ddfd6
--- /dev/null
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -0,0 +1,495 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <cstring>
+#include <list>
+#include <memory>
+#include <mutex>
+#include <optional>
+#include <tuple>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include <boost/icl/interval_map.hpp>
+#include <boost/icl/interval_set.hpp>
+#include <boost/range/iterator_range.hpp>
+
+#include "common/alignment.h"
+#include "common/common_types.h"
+#include "common/logging/log.h"
+#include "core/core.h"
+#include "video_core/buffer_cache/buffer_block.h"
+#include "video_core/buffer_cache/map_interval.h"
+#include "video_core/memory_manager.h"
+#include "video_core/rasterizer_interface.h"
+
+namespace VideoCommon {
+
+using MapInterval = std::shared_ptr<MapIntervalBase>;
+
+/// Caches uploads in page-granular blocks and tracks which address ranges are mapped.
+/// Small uploads that are not written skip the cache and are copied into the stream buffer.
+/// Buffer creation and data transfer are delegated to the pure virtuals a subclass provides.
+template <typename TBuffer, typename TBufferType, typename StreamBuffer>
+class BufferCache {
+public:
+    using BufferInfo = std::pair<const TBufferType*, u64>;
+
+    /// Makes `size` bytes at `gpu_addr` available and returns the {buffer handle, offset} pair.
+    /// Yields GetEmptyBuffer(size) at offset 0 when `gpu_addr` has no host pointer.
+    /// With `is_written` set, the mapping is marked modified so FlushRegion downloads it.
+    BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4,
+                            bool is_written = false) {
+        std::lock_guard lock{mutex};
+
+        auto& memory_manager = system.GPU().MemoryManager();
+        const auto host_ptr = memory_manager.GetPointer(gpu_addr);
+        if (!host_ptr) {
+            return {GetEmptyBuffer(size), 0};
+        }
+        const auto cache_addr = ToCacheAddr(host_ptr);
+
+        // Cache management is a big overhead, so only cache entries with a given size.
+        // TODO: Figure out which size is the best for given games.
+        constexpr std::size_t max_stream_size = 0x800;
+        if (size < max_stream_size) {
+            // Small reads are streamed unless a written mapping touches the same pages.
+            if (!is_written && !IsRegionWritten(cache_addr, cache_addr + size - 1)) {
+                return StreamBufferUpload(host_ptr, size, alignment);
+            }
+        }
+
+        auto block = GetBlock(cache_addr, size);
+        auto map = MapAddress(block, gpu_addr, cache_addr, size);
+        if (is_written) {
+            map->MarkAsModified(true, GetModifiedTicks());
+            if (!map->IsWritten()) {
+                map->MarkAsWritten(true);
+                MarkRegionAsWritten(map->GetStart(), map->GetEnd() - 1);
+            }
+        } else {
+            if (map->IsWritten()) {
+                WriteBarrier();
+            }
+        }
+
+        const u64 offset = static_cast<u64>(block->GetOffset(cache_addr));
+
+        return {ToHandle(block), offset};
+    }
+
+    /// Uploads from a host memory. Returns the OpenGL buffer where it's located and its offset.
+    BufferInfo UploadHostMemory(const void* raw_pointer, std::size_t size,
+                                std::size_t alignment = 4) {
+        std::lock_guard lock{mutex};
+        return StreamBufferUpload(raw_pointer, size, alignment);
+    }
+
+    /// Maps the stream buffer for up to `max_size` bytes and resets the running upload offset.
+    void Map(std::size_t max_size) {
+        std::lock_guard lock{mutex};
+
+        std::tie(buffer_ptr, buffer_offset_base, invalidated) = stream_buffer->Map(max_size, 4);
+        buffer_offset = buffer_offset_base;
+    }
+
+    /// Finishes the upload stream, returns true on bindings invalidation.
+    bool Unmap() {
+        std::lock_guard lock{mutex};
+
+        stream_buffer->Unmap(buffer_offset - buffer_offset_base);
+        return std::exchange(invalidated, false);
+    }
+
+    /// Advances the epoch and pops blocks retired before this tick from pending_destruction.
+    void TickFrame() {
+        ++epoch;
+        while (!pending_destruction.empty()) {
+            if (pending_destruction.front()->GetEpoch() + 1 > epoch) {
+                break;
+            }
+            pending_destruction.pop_front();
+        }
+    }
+
+    /// Write any cached resources overlapping the specified region back to memory
+    void FlushRegion(CacheAddr addr, std::size_t size) {
+        std::lock_guard lock{mutex};
+
+        std::vector<MapInterval> objects = GetMapsInRange(addr, size);
+        std::sort(objects.begin(), objects.end(), [](const MapInterval& a, const MapInterval& b) {
+            return a->GetModificationTick() < b->GetModificationTick();
+        });
+        for (auto& object : objects) {
+            if (object->IsModified() && object->IsRegistered()) {
+                FlushMap(object);
+            }
+        }
+    }
+
+    /// Mark the specified region as being invalidated
+    void InvalidateRegion(CacheAddr addr, u64 size) {
+        std::lock_guard lock{mutex};
+
+        std::vector<MapInterval> objects = GetMapsInRange(addr, size);
+        for (auto& object : objects) {
+            if (object->IsRegistered()) {
+                Unregister(object);
+            }
+        }
+    }
+
+    /// Returns the buffer handed out by UploadMemory when the GPU address has no host pointer.
+    virtual const TBufferType* GetEmptyBuffer(std::size_t size) = 0;
+
+protected:
+    /// Stores the collaborators and caches the stream buffer's handle for StreamBufferUpload.
+    explicit BufferCache(VideoCore::RasterizerInterface& rasterizer, Core::System& system,
+                         std::unique_ptr<StreamBuffer> stream_buffer)
+        : rasterizer{rasterizer}, system{system}, stream_buffer{std::move(stream_buffer)},
+          stream_buffer_handle{this->stream_buffer->GetHandle()} {}
+
+    ~BufferCache() = default;
+
+    /// Converts a block into the handle type returned to callers in BufferInfo.
+    virtual const TBufferType* ToHandle(const TBuffer& storage) = 0;
+
+    /// Invoked when a mapping marked as written is requested without `is_written`.
+    virtual void WriteBarrier() = 0;
+
+    /// Expected to create a block of `size` bytes whose range starts at `cache_addr`.
+    virtual TBuffer CreateBlock(CacheAddr cache_addr, std::size_t size) = 0;
+
+    /// Expected to copy `size` bytes from `data` into `buffer` starting at `offset`.
+    virtual void UploadBlockData(const TBuffer& buffer, std::size_t offset, std::size_t size,
+                                 const u8* data) = 0;
+
+    /// Expected to copy `size` bytes from `buffer` starting at `offset` out to `data`.
+    virtual void DownloadBlockData(const TBuffer& buffer, std::size_t offset, std::size_t size,
+                                   u8* data) = 0;
+
+    /// Expected to copy `size` bytes from `src` at `src_offset` to `dst` at `dst_offset`.
+    virtual void CopyBlock(const TBuffer& src, const TBuffer& dst, std::size_t src_offset,
+                           std::size_t dst_offset, std::size_t size) = 0;
+
+    /// Register an object into the cache
+    void Register(const MapInterval& new_map, bool inherit_written = false) {
+        const CacheAddr cache_ptr = new_map->GetStart();
+        const std::optional<VAddr> cpu_addr =
+            system.GPU().MemoryManager().GpuToCpuAddress(new_map->GetGpuAddress());
+        if (!cache_ptr || !cpu_addr) {
+            LOG_CRITICAL(HW_GPU, "Failed to register buffer with unmapped gpu_address 0x{:016x}",
+                         new_map->GetGpuAddress());
+            return;
+        }
+        const std::size_t size = new_map->GetEnd() - new_map->GetStart();
+        new_map->SetCpuAddress(*cpu_addr);
+        new_map->MarkAsRegistered(true);
+        const IntervalType interval{new_map->GetStart(), new_map->GetEnd()};
+        mapped_addresses.insert({interval, new_map});
+        rasterizer.UpdatePagesCachedCount(*cpu_addr, size, 1);
+        if (inherit_written) {
+            MarkRegionAsWritten(new_map->GetStart(), new_map->GetEnd() - 1);
+            new_map->MarkAsWritten(true);
+        }
+    }
+
+    /// Unregisters an object from the cache
+    void Unregister(MapInterval& map) {
+        const std::size_t size = map->GetEnd() - map->GetStart();
+        rasterizer.UpdatePagesCachedCount(map->GetCpuAddress(), size, -1);
+        map->MarkAsRegistered(false);
+        if (map->IsWritten()) {
+            UnmarkRegionAsWritten(map->GetStart(), map->GetEnd() - 1);
+        }
+        const IntervalType delete_interval{map->GetStart(), map->GetEnd()};
+        mapped_addresses.erase(delete_interval);
+    }
+
+private:
+    /// Allocates an unregistered mapping covering [start, end).
+    MapInterval CreateMap(const CacheAddr start, const CacheAddr end, const GPUVAddr gpu_addr) {
+        return std::make_shared<MapIntervalBase>(start, end, gpu_addr);
+    }
+
+    /// Returns a mapping that covers [cache_addr, cache_addr + size) inside `block`.
+    /// Overlapping mappings are unregistered and replaced by one mapping over their union,
+    /// which inherits their written/modified state; only the uncovered gaps are uploaded.
+    MapInterval MapAddress(const TBuffer& block, const GPUVAddr gpu_addr,
+                           const CacheAddr cache_addr, const std::size_t size) {
+
+        std::vector<MapInterval> overlaps = GetMapsInRange(cache_addr, size);
+        if (overlaps.empty()) {
+            const CacheAddr cache_addr_end = cache_addr + size;
+            MapInterval new_map = CreateMap(cache_addr, cache_addr_end, gpu_addr);
+            u8* host_ptr = FromCacheAddr(cache_addr);
+            UploadBlockData(block, block->GetOffset(cache_addr), size, host_ptr);
+            Register(new_map);
+            return new_map;
+        }
+
+        const CacheAddr cache_addr_end = cache_addr + size;
+        if (overlaps.size() == 1) {
+            MapInterval& current_map = overlaps[0];
+            if (current_map->IsInside(cache_addr, cache_addr_end)) {
+                return current_map;
+            }
+        }
+        CacheAddr new_start = cache_addr;
+        CacheAddr new_end = cache_addr_end;
+        bool write_inheritance = false;
+        bool modified_inheritance = false;
+        // Calculate new buffer parameters
+        for (auto& overlap : overlaps) {
+            new_start = std::min(overlap->GetStart(), new_start);
+            new_end = std::max(overlap->GetEnd(), new_end);
+            write_inheritance |= overlap->IsWritten();
+            modified_inheritance |= overlap->IsModified();
+        }
+        GPUVAddr new_gpu_addr = gpu_addr + new_start - cache_addr;
+        for (auto& overlap : overlaps) {
+            Unregister(overlap);
+        }
+        UpdateBlock(block, new_start, new_end, overlaps);
+        MapInterval new_map = CreateMap(new_start, new_end, new_gpu_addr);
+        if (modified_inheritance) {
+            new_map->MarkAsModified(true, GetModifiedTicks());
+        }
+        Register(new_map, write_inheritance);
+        return new_map;
+    }
+
+    /// Uploads the parts of [start, end) that none of `overlaps` already covers.
+    void UpdateBlock(const TBuffer& block, CacheAddr start, CacheAddr end,
+                     std::vector<MapInterval>& overlaps) {
+        const IntervalType base_interval{start, end};
+        IntervalSet interval_set{};
+        interval_set.add(base_interval);
+        for (auto& overlap : overlaps) {
+            const IntervalType subtract{overlap->GetStart(), overlap->GetEnd()};
+            interval_set.subtract(subtract);
+        }
+        for (auto& interval : interval_set) {
+            std::size_t size = interval.upper() - interval.lower();
+            if (size > 0) {
+                u8* host_ptr = FromCacheAddr(interval.lower());
+                UploadBlockData(block, block->GetOffset(interval.lower()), size, host_ptr);
+            }
+        }
+    }
+
+    /// Collects every registered mapping that intersects [addr, addr + size).
+    std::vector<MapInterval> GetMapsInRange(CacheAddr addr, std::size_t size) {
+        if (size == 0) {
+            return {};
+        }
+
+        std::vector<MapInterval> objects{};
+        const IntervalType interval{addr, addr + size};
+        for (auto& pair : boost::make_iterator_range(mapped_addresses.equal_range(interval))) {
+            objects.push_back(pair.second);
+        }
+
+        return objects;
+    }
+
+    /// Returns a ticks counter used for tracking when cached objects were last modified
+    u64 GetModifiedTicks() {
+        return ++modified_ticks;
+    }
+
+    /// Downloads the mapped range from its block to host memory and clears the modified flag.
+    void FlushMap(MapInterval map) {
+        std::size_t size = map->GetEnd() - map->GetStart();
+        TBuffer block = blocks[map->GetStart() >> block_page_bits];
+        u8* host_ptr = FromCacheAddr(map->GetStart());
+        DownloadBlockData(block, block->GetOffset(map->GetStart()), size, host_ptr);
+        map->MarkAsModified(false, 0);
+    }
+
+    /// Copies `size` bytes into the mapped stream buffer; returns its handle and the offset.
+    BufferInfo StreamBufferUpload(const void* raw_pointer, std::size_t size,
+                                  std::size_t alignment) {
+        AlignBuffer(alignment);
+        const std::size_t uploaded_offset = buffer_offset;
+        std::memcpy(buffer_ptr, raw_pointer, size);
+
+        buffer_ptr += size;
+        buffer_offset += size;
+        return {&stream_buffer_handle, uploaded_offset};
+    }
+
+    /// Advances the stream write position so the next upload offset honours `alignment`.
+    void AlignBuffer(std::size_t alignment) {
+        // Align the offset, not the mapped pointer
+        const std::size_t offset_aligned = Common::AlignUp(buffer_offset, alignment);
+        buffer_ptr += offset_aligned - buffer_offset;
+        buffer_offset = offset_aligned;
+    }
+
+    /// Replaces `buffer` with a block one page larger at the same address, copying its data.
+    /// The old block is queued in pending_destruction rather than dropped immediately.
+    TBuffer EnlargeBlock(TBuffer buffer) {
+        const std::size_t old_size = buffer->GetSize();
+        const std::size_t new_size = old_size + block_page_size;
+        const CacheAddr cache_addr = buffer->GetCacheAddr();
+        TBuffer new_buffer = CreateBlock(cache_addr, new_size);
+        CopyBlock(buffer, new_buffer, 0, 0, old_size);
+        buffer->SetEpoch(epoch);
+        pending_destruction.push_back(buffer);
+        const CacheAddr cache_addr_end = cache_addr + new_size - 1;
+        u64 page_start = cache_addr >> block_page_bits;
+        const u64 page_end = cache_addr_end >> block_page_bits;
+        while (page_start <= page_end) {
+            blocks[page_start] = new_buffer;
+            ++page_start;
+        }
+        return new_buffer;
+    }
+
+    /// Builds one block sized as the sum of both inputs, starting at the lower address,
+    /// copies both contents into it, and queues the two old blocks in pending_destruction.
+    TBuffer MergeBlocks(TBuffer first, TBuffer second) {
+        const std::size_t size_1 = first->GetSize();
+        const std::size_t size_2 = second->GetSize();
+        const CacheAddr first_addr = first->GetCacheAddr();
+        const CacheAddr second_addr = second->GetCacheAddr();
+        const CacheAddr new_addr = std::min(first_addr, second_addr);
+        const std::size_t new_size = size_1 + size_2;
+        TBuffer new_buffer = CreateBlock(new_addr, new_size);
+        CopyBlock(first, new_buffer, 0, new_buffer->GetOffset(first_addr), size_1);
+        CopyBlock(second, new_buffer, 0, new_buffer->GetOffset(second_addr), size_2);
+        first->SetEpoch(epoch);
+        second->SetEpoch(epoch);
+        pending_destruction.push_back(first);
+        pending_destruction.push_back(second);
+        const CacheAddr cache_addr_end = new_addr + new_size - 1;
+        u64 page_start = new_addr >> block_page_bits;
+        const u64 page_end = cache_addr_end >> block_page_bits;
+        while (page_start <= page_end) {
+            blocks[page_start] = new_buffer;
+            ++page_start;
+        }
+        return new_buffer;
+    }
+
+    /// Returns a single block spanning every page touched by [cache_addr, cache_addr + size),
+    /// creating, enlarging or merging blocks page by page as required.
+    TBuffer GetBlock(const CacheAddr cache_addr, const std::size_t size) {
+        TBuffer found{};
+        const CacheAddr cache_addr_end = cache_addr + size - 1;
+        u64 page_start = cache_addr >> block_page_bits;
+        const u64 page_end = cache_addr_end >> block_page_bits;
+        while (page_start <= page_end) {
+            auto it = blocks.find(page_start);
+            if (it == blocks.end()) {
+                if (found) {
+                    found = EnlargeBlock(found);
+                } else {
+                    const CacheAddr start_addr = (page_start << block_page_bits);
+                    found = CreateBlock(start_addr, block_page_size);
+                    blocks[page_start] = found;
+                }
+            } else {
+                if (found) {
+                    if (found == it->second) {
+                        ++page_start;
+                        continue;
+                    }
+                    found = MergeBlocks(found, it->second);
+                } else {
+                    found = it->second;
+                }
+            }
+            ++page_start;
+        }
+        return found;
+    }
+
+    /// Increments the written counter of every write page in [start, end] (inclusive).
+    void MarkRegionAsWritten(const CacheAddr start, const CacheAddr end) {
+        u64 page_start = start >> write_page_bit;
+        const u64 page_end = end >> write_page_bit;
+        while (page_start <= page_end) {
+            auto it = written_pages.find(page_start);
+            if (it != written_pages.end()) {
+                it->second = it->second + 1;
+            } else {
+                written_pages[page_start] = 1;
+            }
+            page_start++;
+        }
+    }
+
+    /// Decrements the written counter of every write page in [start, end], erasing at zero.
+    void UnmarkRegionAsWritten(const CacheAddr start, const CacheAddr end) {
+        u64 page_start = start >> write_page_bit;
+        const u64 page_end = end >> write_page_bit;
+        while (page_start <= page_end) {
+            auto it = written_pages.find(page_start);
+            if (it != written_pages.end()) {
+                if (it->second > 1) {
+                    it->second = it->second - 1;
+                } else {
+                    written_pages.erase(it);
+                }
+            }
+            page_start++;
+        }
+    }
+
+    /// Returns true when any write page in [start, end] (inclusive) has a written counter.
+    bool IsRegionWritten(const CacheAddr start, const CacheAddr end) const {
+        u64 page_start = start >> write_page_bit;
+        const u64 page_end = end >> write_page_bit;
+        while (page_start <= page_end) {
+            if (written_pages.count(page_start) > 0) {
+                return true;
+            }
+            page_start++;
+        }
+        return false;
+    }
+
+    VideoCore::RasterizerInterface& rasterizer;
+    Core::System& system;
+    std::unique_ptr<StreamBuffer> stream_buffer;
+
+    TBufferType stream_buffer_handle{};
+
+    bool invalidated = false;
+
+    u8* buffer_ptr = nullptr;
+    u64 buffer_offset = 0;
+    u64 buffer_offset_base = 0;
+
+    using IntervalSet = boost::icl::interval_set<CacheAddr>;
+    using IntervalCache = boost::icl::interval_map<CacheAddr, MapInterval>;
+    using IntervalType = typename IntervalCache::interval_type;
+    IntervalCache mapped_addresses{};
+
+    // Number of written mappings touching each (1 << write_page_bit)-byte page.
+    static constexpr u64 write_page_bit{11};
+    std::unordered_map<u64, u32> written_pages{};
+
+    // Block assigned to each (1 << block_page_bits)-byte page, keyed by page index.
+    static constexpr u64 block_page_bits{21};
+    static constexpr u64 block_page_size{1 << block_page_bits};
+    std::unordered_map<u64, TBuffer> blocks{};
+
+    // Retired blocks, held until TickFrame observes a newer epoch.
+    std::list<TBuffer> pending_destruction{};
+    u64 epoch{};
+    u64 modified_ticks{};
+
+    std::recursive_mutex mutex;
+};
+
+} // namespace VideoCommon
diff --git a/src/video_core/buffer_cache/map_interval.h b/src/video_core/buffer_cache/map_interval.h
new file mode 100644
index 000000000..3a104d5cd
--- /dev/null
+++ b/src/video_core/buffer_cache/map_interval.h
@@ -0,0 +1,89 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include "common/common_types.h"
+#include "video_core/gpu.h"
+
+namespace VideoCommon {
+
+class MapIntervalBase { // Bookkeeping record for one mapped range of cache addresses.
+public:
+    MapIntervalBase(const CacheAddr start, const CacheAddr end, const GPUVAddr gpu_addr)
+        : start{start}, end{end}, gpu_addr{gpu_addr} {}
+
+    void SetCpuAddress(VAddr new_cpu_addr) {
+        cpu_addr = new_cpu_addr;
+    }
+
+    VAddr GetCpuAddress() const {
+        return cpu_addr;
+    }
+
+    GPUVAddr GetGpuAddress() const {
+        return gpu_addr;
+    }
+    /// True when other_start..other_end is contained in start..end (bounds may coincide).
+    bool IsInside(const CacheAddr other_start, const CacheAddr other_end) const {
+        return (start <= other_start && other_end <= end);
+    }
+    /// Equality looks only at the start/end pair; addresses, flags and ticks are ignored.
+    bool operator==(const MapIntervalBase& rhs) const {
+        return start == rhs.start && end == rhs.end; // No std::tie: <tuple> is not included.
+    }
+
+    bool operator!=(const MapIntervalBase& rhs) const {
+        return !operator==(rhs);
+    }
+
+    void MarkAsRegistered(const bool registered) {
+        is_registered = registered;
+    }
+
+    bool IsRegistered() const {
+        return is_registered;
+    }
+
+    CacheAddr GetStart() const {
+        return start;
+    }
+
+    CacheAddr GetEnd() const {
+        return end;
+    }
+    /// Stores the modified flag and always overwrites the tick, even when clearing the flag.
+    void MarkAsModified(const bool is_modified_, const u64 tick) {
+        is_modified = is_modified_;
+        ticks = tick;
+    }
+
+    bool IsModified() const {
+        return is_modified;
+    }
+
+    u64 GetModificationTick() const {
+        return ticks;
+    }
+
+    void MarkAsWritten(const bool is_written_) {
+        is_written = is_written_;
+    }
+
+    bool IsWritten() const {
+        return is_written;
+    }
+
+private:
+    CacheAddr start;       // Lower bound of the range, fixed at construction.
+    CacheAddr end;         // Upper bound of the range, fixed at construction.
+    GPUVAddr gpu_addr;     // GPU address given to the constructor.
+    VAddr cpu_addr{};      // Zero until SetCpuAddress is called.
+    bool is_written{};     // Set through MarkAsWritten.
+    bool is_modified{};    // Set through MarkAsModified.
+    bool is_registered{};  // Set through MarkAsRegistered.
+    u64 ticks{};           // Tick recorded by the latest MarkAsModified call.
+};
+
+} // namespace VideoCommon
diff --git a/src/video_core/engines/fermi_2d.cpp b/src/video_core/engines/fermi_2d.cpp
index 0ee228e28..98a8b5337 100644
--- a/src/video_core/engines/fermi_2d.cpp
+++ b/src/video_core/engines/fermi_2d.cpp
@@ -10,8 +10,7 @@
 
 namespace Tegra::Engines {
 
-Fermi2D::Fermi2D(VideoCore::RasterizerInterface& rasterizer, MemoryManager& memory_manager)
-    : rasterizer{rasterizer}, memory_manager{memory_manager} {}
+Fermi2D::Fermi2D(VideoCore::RasterizerInterface& rasterizer) : rasterizer{rasterizer} {}
 
 void Fermi2D::CallMethod(const GPU::MethodCall& method_call) {
     ASSERT_MSG(method_call.method < Regs::NUM_REGS,
diff --git a/src/video_core/engines/fermi_2d.h b/src/video_core/engines/fermi_2d.h
index 05421d185..0901cf2fa 100644
--- a/src/video_core/engines/fermi_2d.h
+++ b/src/video_core/engines/fermi_2d.h
@@ -33,7 +33,7 @@ namespace Tegra::Engines {
 
 class Fermi2D final {
 public:
-    explicit Fermi2D(VideoCore::RasterizerInterface& rasterizer, MemoryManager& memory_manager);
+    explicit Fermi2D(VideoCore::RasterizerInterface& rasterizer);
     ~Fermi2D() = default;
 
     /// Write the value to the register identified by method.
@@ -145,7 +145,6 @@ public:
 
 private:
     VideoCore::RasterizerInterface& rasterizer;
-    MemoryManager& memory_manager;
 
     /// Performs the copy from the source surface to the destination surface as configured in the
     /// registers.
diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp
index 44279de00..fa4a7c5c1 100644
--- a/src/video_core/engines/kepler_memory.cpp
+++ b/src/video_core/engines/kepler_memory.cpp
@@ -15,7 +15,7 @@
 namespace Tegra::Engines {
 
 KeplerMemory::KeplerMemory(Core::System& system, MemoryManager& memory_manager)
-    : system{system}, memory_manager{memory_manager}, upload_state{memory_manager, regs.upload} {}
+    : system{system}, upload_state{memory_manager, regs.upload} {}
 
 KeplerMemory::~KeplerMemory() = default;
 
diff --git a/src/video_core/engines/kepler_memory.h b/src/video_core/engines/kepler_memory.h
index f3bc675a9..e0e25c321 100644
--- a/src/video_core/engines/kepler_memory.h
+++ b/src/video_core/engines/kepler_memory.h
@@ -65,7 +65,6 @@ public:
 
 private:
     Core::System& system;
-    MemoryManager& memory_manager;
     Upload::State upload_state;
 };
 
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index 125c53360..f5158d219 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -249,16 +249,10 @@ void Maxwell3D::CallMacroMethod(u32 method, std::vector<u32> parameters) {
     executing_macro = 0;
 
     // Lookup the macro offset
-    const u32 entry{(method - MacroRegistersStart) >> 1};
-    const auto& search{macro_offsets.find(entry)};
-    if (search == macro_offsets.end()) {
-        LOG_CRITICAL(HW_GPU, "macro not found for method 0x{:X}!", method);
-        UNREACHABLE();
-        return;
-    }
+    const u32 entry = ((method - MacroRegistersStart) >> 1) % macro_positions.size();
 
     // Execute the current macro.
-    macro_interpreter.Execute(search->second, std::move(parameters));
+    macro_interpreter.Execute(macro_positions[entry], std::move(parameters));
 }
 
 void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) {
@@ -421,7 +415,7 @@ void Maxwell3D::ProcessMacroUpload(u32 data) {
 }
 
 void Maxwell3D::ProcessMacroBind(u32 data) {
-    macro_offsets[regs.macros.entry] = data;
+    macro_positions[regs.macros.entry++] = data;
 }
 
 void Maxwell3D::ProcessQueryGet() {
@@ -524,7 +518,7 @@ void Maxwell3D::ProcessQueryCondition() {
 void Maxwell3D::ProcessSyncPoint() {
     const u32 sync_point = regs.sync_info.sync_point.Value();
     const u32 increment = regs.sync_info.increment.Value();
-    const u32 cache_flush = regs.sync_info.unknown.Value();
+    [[maybe_unused]] const u32 cache_flush = regs.sync_info.unknown.Value();
     if (increment) {
         system.GPU().IncrementSyncPoint(sync_point);
     }
@@ -626,10 +620,10 @@ Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const {
     Texture::TICEntry tic_entry;
     memory_manager.ReadBlockUnsafe(tic_address_gpu, &tic_entry, sizeof(Texture::TICEntry));
 
-    const auto r_type{tic_entry.r_type.Value()};
-    const auto g_type{tic_entry.g_type.Value()};
-    const auto b_type{tic_entry.b_type.Value()};
-    const auto a_type{tic_entry.a_type.Value()};
+    [[maybe_unused]] const auto r_type{tic_entry.r_type.Value()};
+    [[maybe_unused]] const auto g_type{tic_entry.g_type.Value()};
+    [[maybe_unused]] const auto b_type{tic_entry.b_type.Value()};
+    [[maybe_unused]] const auto a_type{tic_entry.a_type.Value()};
 
     // TODO(Subv): Different data types for separate components are not supported
     DEBUG_ASSERT(r_type == g_type && r_type == b_type && r_type == a_type);
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index 1ee982b76..0184342a0 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -1270,7 +1270,7 @@ private:
     MemoryManager& memory_manager;
 
     /// Start offsets of each macro in macro_memory
-    std::unordered_map<u32, u32> macro_offsets;
+    std::array<u32, 0x80> macro_positions = {};
 
     /// Memory for macro code
     MacroMemory macro_memory;
diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp
index a28c04473..ad8453c5f 100644
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -5,18 +5,17 @@
 #include "common/assert.h"
 #include "common/logging/log.h"
 #include "core/core.h"
+#include "core/settings.h"
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/engines/maxwell_dma.h"
 #include "video_core/memory_manager.h"
-#include "video_core/rasterizer_interface.h"
 #include "video_core/renderer_base.h"
 #include "video_core/textures/decoders.h"
 
 namespace Tegra::Engines {
 
-MaxwellDMA::MaxwellDMA(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
-                       MemoryManager& memory_manager)
-    : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager} {}
+MaxwellDMA::MaxwellDMA(Core::System& system, MemoryManager& memory_manager)
+    : system{system}, memory_manager{memory_manager} {}
 
 void MaxwellDMA::CallMethod(const GPU::MethodCall& method_call) {
     ASSERT_MSG(method_call.method < Regs::NUM_REGS,
@@ -84,13 +83,17 @@ void MaxwellDMA::HandleCopy() {
     ASSERT(regs.exec.enable_2d == 1);
 
     if (regs.exec.is_dst_linear && !regs.exec.is_src_linear) {
-        ASSERT(regs.src_params.size_z == 1);
+        ASSERT(regs.src_params.BlockDepth() == 0);
         // If the input is tiled and the output is linear, deswizzle the input and copy it over.
-        const u32 src_bytes_per_pixel = regs.src_pitch / regs.src_params.size_x;
+        const u32 bytes_per_pixel = regs.dst_pitch / regs.x_count;
         const std::size_t src_size = Texture::CalculateSize(
-            true, src_bytes_per_pixel, regs.src_params.size_x, regs.src_params.size_y,
+            true, bytes_per_pixel, regs.src_params.size_x, regs.src_params.size_y,
             regs.src_params.size_z, regs.src_params.BlockHeight(), regs.src_params.BlockDepth());
 
+        const std::size_t src_layer_size = Texture::CalculateSize(
+            true, bytes_per_pixel, regs.src_params.size_x, regs.src_params.size_y, 1,
+            regs.src_params.BlockHeight(), regs.src_params.BlockDepth());
+
         const std::size_t dst_size = regs.dst_pitch * regs.y_count;
 
         if (read_buffer.size() < src_size) {
@@ -104,23 +107,23 @@ void MaxwellDMA::HandleCopy() {
         memory_manager.ReadBlock(source, read_buffer.data(), src_size);
         memory_manager.ReadBlock(dest, write_buffer.data(), dst_size);
 
-        Texture::UnswizzleSubrect(regs.x_count, regs.y_count, regs.dst_pitch,
-                                  regs.src_params.size_x, src_bytes_per_pixel, read_buffer.data(),
-                                  write_buffer.data(), regs.src_params.BlockHeight(),
-                                  regs.src_params.pos_x, regs.src_params.pos_y);
+        Texture::UnswizzleSubrect(
+            regs.x_count, regs.y_count, regs.dst_pitch, regs.src_params.size_x, bytes_per_pixel,
+            read_buffer.data() + src_layer_size * regs.src_params.pos_z, write_buffer.data(),
+            regs.src_params.BlockHeight(), regs.src_params.pos_x, regs.src_params.pos_y);
 
         memory_manager.WriteBlock(dest, write_buffer.data(), dst_size);
     } else {
         ASSERT(regs.dst_params.BlockDepth() == 0);
 
-        const u32 src_bytes_per_pixel = regs.src_pitch / regs.x_count;
+        const u32 bytes_per_pixel = regs.src_pitch / regs.x_count;
 
         const std::size_t dst_size = Texture::CalculateSize(
-            true, src_bytes_per_pixel, regs.dst_params.size_x, regs.dst_params.size_y,
+            true, bytes_per_pixel, regs.dst_params.size_x, regs.dst_params.size_y,
             regs.dst_params.size_z, regs.dst_params.BlockHeight(), regs.dst_params.BlockDepth());
 
         const std::size_t dst_layer_size = Texture::CalculateSize(
-            true, src_bytes_per_pixel, regs.dst_params.size_x, regs.dst_params.size_y, 1,
+            true, bytes_per_pixel, regs.dst_params.size_x, regs.dst_params.size_y, 1,
             regs.dst_params.BlockHeight(), regs.dst_params.BlockDepth());
 
         const std::size_t src_size = regs.src_pitch * regs.y_count;
@@ -133,14 +136,19 @@ void MaxwellDMA::HandleCopy() {
             write_buffer.resize(dst_size);
         }
 
-        memory_manager.ReadBlock(source, read_buffer.data(), src_size);
-        memory_manager.ReadBlock(dest, write_buffer.data(), dst_size);
+        if (Settings::values.use_accurate_gpu_emulation) {
+            memory_manager.ReadBlock(source, read_buffer.data(), src_size);
+            memory_manager.ReadBlock(dest, write_buffer.data(), dst_size);
+        } else {
+            memory_manager.ReadBlockUnsafe(source, read_buffer.data(), src_size);
+            memory_manager.ReadBlockUnsafe(dest, write_buffer.data(), dst_size);
+        }
 
         // If the input is linear and the output is tiled, swizzle the input and copy it over.
-        Texture::SwizzleSubrect(regs.x_count, regs.y_count, regs.src_pitch, regs.dst_params.size_x,
-                                src_bytes_per_pixel,
-                                write_buffer.data() + dst_layer_size * regs.dst_params.pos_z,
-                                read_buffer.data(), regs.dst_params.BlockHeight());
+        Texture::SwizzleSubrect(
+            regs.x_count, regs.y_count, regs.src_pitch, regs.dst_params.size_x, bytes_per_pixel,
+            write_buffer.data() + dst_layer_size * regs.dst_params.pos_z, read_buffer.data(),
+            regs.dst_params.BlockHeight(), regs.dst_params.pos_x, regs.dst_params.pos_y);
 
         memory_manager.WriteBlock(dest, write_buffer.data(), dst_size);
     }
diff --git a/src/video_core/engines/maxwell_dma.h b/src/video_core/engines/maxwell_dma.h
index 17b015ca7..93808a9bb 100644
--- a/src/video_core/engines/maxwell_dma.h
+++ b/src/video_core/engines/maxwell_dma.h
@@ -20,10 +20,6 @@ namespace Tegra {
 class MemoryManager;
 }
 
-namespace VideoCore {
-class RasterizerInterface;
-}
-
 namespace Tegra::Engines {
 
 /**
@@ -33,8 +29,7 @@ namespace Tegra::Engines {
 
 class MaxwellDMA final {
 public:
-    explicit MaxwellDMA(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
-                        MemoryManager& memory_manager);
+    explicit MaxwellDMA(Core::System& system, MemoryManager& memory_manager);
     ~MaxwellDMA() = default;
 
     /// Write the value to the register identified by method.
@@ -180,8 +175,6 @@ public:
 private:
     Core::System& system;
 
-    VideoCore::RasterizerInterface& rasterizer;
-
     MemoryManager& memory_manager;
 
     std::vector<u8> read_buffer;
diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h
index 103f39e91..ba28ff51c 100644
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -538,6 +538,12 @@ enum class PhysicalAttributeDirection : u64 {
     Output = 1,
 };
 
+enum class VoteOperation : u64 { // Type of the 2-bit Instruction::vote.operation field.
+    All = 0, // Corresponds to GLSL allThreadsNV
+    Any = 1, // Corresponds to GLSL anyThreadNV
+    Eq = 2,  // Corresponds to GLSL allThreadsEqualNV
+};
+
 union Instruction {
     Instruction& operator=(const Instruction& instr) {
         value = instr.value;
@@ -565,6 +571,13 @@ union Instruction {
     } nop;
 
     union {
+        BitField<48, 2, VoteOperation> operation;
+        BitField<45, 3, u64> dest_pred;
+        BitField<39, 3, u64> value;
+        BitField<42, 1, u64> negate_value;
+    } vote;
+
+    union {
         BitField<8, 8, Register> gpr;
         BitField<20, 24, s64> offset;
     } gmem;
@@ -1488,6 +1501,7 @@ public:
         SYNC,
         BRK,
         DEPBAR,
+        VOTE,
         BFE_C,
         BFE_R,
         BFE_IMM,
@@ -1650,6 +1664,7 @@ public:
         Hfma2,
         Flow,
         Synch,
+        Warp,
         Memory,
         Texture,
         Image,
@@ -1776,6 +1791,7 @@ private:
             INST("111000110100---", Id::BRK, Type::Flow, "BRK"),
             INST("111000110000----", Id::EXIT, Type::Flow, "EXIT"),
             INST("1111000011110---", Id::DEPBAR, Type::Synch, "DEPBAR"),
+            INST("0101000011011---", Id::VOTE, Type::Warp, "VOTE"),
             INST("1110111111011---", Id::LD_A, Type::Memory, "LD_A"),
             INST("1110111101001---", Id::LD_S, Type::Memory, "LD_S"),
             INST("1110111101000---", Id::LD_L, Type::Memory, "LD_L"),
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index c409af194..8d9db45f5 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -35,9 +35,9 @@ GPU::GPU(Core::System& system, VideoCore::RendererBase& renderer, bool is_async)
     memory_manager = std::make_unique<Tegra::MemoryManager>(system, rasterizer);
     dma_pusher = std::make_unique<Tegra::DmaPusher>(*this);
     maxwell_3d = std::make_unique<Engines::Maxwell3D>(system, rasterizer, *memory_manager);
-    fermi_2d = std::make_unique<Engines::Fermi2D>(rasterizer, *memory_manager);
+    fermi_2d = std::make_unique<Engines::Fermi2D>(rasterizer);
     kepler_compute = std::make_unique<Engines::KeplerCompute>(system, rasterizer, *memory_manager);
-    maxwell_dma = std::make_unique<Engines::MaxwellDMA>(system, rasterizer, *memory_manager);
+    maxwell_dma = std::make_unique<Engines::MaxwellDMA>(system, *memory_manager);
     kepler_memory = std::make_unique<Engines::KeplerMemory>(system, *memory_manager);
 }
 
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index 11857ff99..544340ecd 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -19,6 +19,10 @@ inline CacheAddr ToCacheAddr(const void* host_ptr) {
     return reinterpret_cast<CacheAddr>(host_ptr);
 }
 
+inline u8* FromCacheAddr(CacheAddr cache_addr) {
+    return reinterpret_cast<u8*>(cache_addr); // Inverse of ToCacheAddr.
+}
+
 namespace Core {
 class System;
 }
@@ -281,8 +285,8 @@ private:
 
 protected:
     std::unique_ptr<Tegra::DmaPusher> dma_pusher;
-    VideoCore::RendererBase& renderer;
     Core::System& system;
+    VideoCore::RendererBase& renderer;
 
 private:
     std::unique_ptr<Tegra::MemoryManager> memory_manager;
diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h
index 6e44d51cf..6b3f2d50a 100644
--- a/src/video_core/rasterizer_interface.h
+++ b/src/video_core/rasterizer_interface.h
@@ -50,7 +50,7 @@ public:
     /// and invalidated
     virtual void FlushAndInvalidateRegion(CacheAddr addr, u64 size) = 0;
 
-    // Notify the rasterizer to send all written commands to the host GPU.
+    /// Notify the rasterizer to send all written commands to the host GPU.
     virtual void FlushCommands() = 0;
 
     /// Notify rasterizer that a frame is about to finish
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
index 2a9b523f5..f8a807c84 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp
@@ -7,28 +7,41 @@
 #include <glad/glad.h>
 
 #include "common/assert.h"
+#include "common/microprofile.h"
+#include "video_core/rasterizer_interface.h"
 #include "video_core/renderer_opengl/gl_buffer_cache.h"
 #include "video_core/renderer_opengl/gl_rasterizer.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 
 namespace OpenGL {
 
+MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128));
+
+CachedBufferBlock::CachedBufferBlock(CacheAddr cache_addr, const std::size_t size)
+    : VideoCommon::BufferBlock{cache_addr, size} {
+    gl_buffer.Create(); // Storage is allocated below; nullptr means no initial data is uploaded.
+    glNamedBufferData(gl_buffer.handle, static_cast<GLsizeiptr>(size), nullptr, GL_DYNAMIC_DRAW);
+}
+
+CachedBufferBlock::~CachedBufferBlock() = default;
+
 OGLBufferCache::OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
                                std::size_t stream_size)
-    : VideoCommon::BufferCache<OGLBuffer, GLuint, OGLStreamBuffer>{
+    : VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer>{
           rasterizer, system, std::make_unique<OGLStreamBuffer>(stream_size, true)} {}
 
 OGLBufferCache::~OGLBufferCache() = default;
 
-OGLBuffer OGLBufferCache::CreateBuffer(std::size_t size) {
-    OGLBuffer buffer;
-    buffer.Create();
-    glNamedBufferData(buffer.handle, static_cast<GLsizeiptr>(size), nullptr, GL_DYNAMIC_DRAW);
-    return buffer;
+Buffer OGLBufferCache::CreateBlock(CacheAddr cache_addr, std::size_t size) {
+    return std::make_shared<CachedBufferBlock>(cache_addr, size);
+}
+
+void OGLBufferCache::WriteBarrier() {
+    glMemoryBarrier(GL_ALL_BARRIER_BITS);
 }
 
-const GLuint* OGLBufferCache::ToHandle(const OGLBuffer& buffer) {
-    return &buffer.handle;
+const GLuint* OGLBufferCache::ToHandle(const Buffer& buffer) {
+    return buffer->GetHandle();
 }
 
 const GLuint* OGLBufferCache::GetEmptyBuffer(std::size_t) {
@@ -36,23 +49,24 @@ const GLuint* OGLBufferCache::GetEmptyBuffer(std::size_t) {
     return &null_buffer;
 }
 
-void OGLBufferCache::UploadBufferData(const OGLBuffer& buffer, std::size_t offset, std::size_t size,
-                                      const u8* data) {
-    glNamedBufferSubData(buffer.handle, static_cast<GLintptr>(offset),
+void OGLBufferCache::UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
+                                     const u8* data) {
+    glNamedBufferSubData(*buffer->GetHandle(), static_cast<GLintptr>(offset),
                          static_cast<GLsizeiptr>(size), data);
 }
 
-void OGLBufferCache::DownloadBufferData(const OGLBuffer& buffer, std::size_t offset,
-                                        std::size_t size, u8* data) {
-    glGetNamedBufferSubData(buffer.handle, static_cast<GLintptr>(offset),
+void OGLBufferCache::DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
+                                       u8* data) {
+    MICROPROFILE_SCOPE(OpenGL_Buffer_Download);
+    glGetNamedBufferSubData(*buffer->GetHandle(), static_cast<GLintptr>(offset),
                             static_cast<GLsizeiptr>(size), data);
 }
 
-void OGLBufferCache::CopyBufferData(const OGLBuffer& src, const OGLBuffer& dst,
-                                    std::size_t src_offset, std::size_t dst_offset,
-                                    std::size_t size) {
-    glCopyNamedBufferSubData(src.handle, dst.handle, static_cast<GLintptr>(src_offset),
-                             static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(size));
+void OGLBufferCache::CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
+                               std::size_t dst_offset, std::size_t size) {
+    glCopyNamedBufferSubData(*src->GetHandle(), *dst->GetHandle(),
+                             static_cast<GLintptr>(src_offset), static_cast<GLintptr>(dst_offset),
+                             static_cast<GLsizeiptr>(size));
 }
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h
index 8c8ac4038..022e7bfa9 100644
--- a/src/video_core/renderer_opengl/gl_buffer_cache.h
+++ b/src/video_core/renderer_opengl/gl_buffer_cache.h
@@ -7,7 +7,7 @@
 #include <memory>
 
 #include "common/common_types.h"
-#include "video_core/buffer_cache.h"
+#include "video_core/buffer_cache/buffer_cache.h"
 #include "video_core/rasterizer_cache.h"
 #include "video_core/renderer_opengl/gl_resource_manager.h"
 #include "video_core/renderer_opengl/gl_stream_buffer.h"
@@ -21,7 +21,24 @@ namespace OpenGL {
 class OGLStreamBuffer;
 class RasterizerOpenGL;
 
-class OGLBufferCache final : public VideoCommon::BufferCache<OGLBuffer, GLuint, OGLStreamBuffer> {
+class CachedBufferBlock;
+
+using Buffer = std::shared_ptr<CachedBufferBlock>;
+
+class CachedBufferBlock : public VideoCommon::BufferBlock { // Block owning an OGLBuffer.
+public:
+    explicit CachedBufferBlock(CacheAddr cache_addr, const std::size_t size);
+    ~CachedBufferBlock();
+
+    const GLuint* GetHandle() const {
+        return &gl_buffer.handle; // Points into this object; valid while it lives.
+    }
+
+private:
+    OGLBuffer gl_buffer{}; // Created and sized by the constructor (see gl_buffer_cache.cpp).
+};
+
+class OGLBufferCache final : public VideoCommon::BufferCache<Buffer, GLuint, OGLStreamBuffer> {
 public:
     explicit OGLBufferCache(RasterizerOpenGL& rasterizer, Core::System& system,
                             std::size_t stream_size);
@@ -30,18 +47,20 @@ public:
     const GLuint* GetEmptyBuffer(std::size_t) override;
 
 protected:
-    OGLBuffer CreateBuffer(std::size_t size) override;
+    Buffer CreateBlock(CacheAddr cache_addr, std::size_t size) override;
+
+    void WriteBarrier() override;
 
-    const GLuint* ToHandle(const OGLBuffer& buffer) override;
+    const GLuint* ToHandle(const Buffer& buffer) override;
 
-    void UploadBufferData(const OGLBuffer& buffer, std::size_t offset, std::size_t size,
-                          const u8* data) override;
+    void UploadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
+                         const u8* data) override;
 
-    void DownloadBufferData(const OGLBuffer& buffer, std::size_t offset, std::size_t size,
-                            u8* data) override;
+    void DownloadBlockData(const Buffer& buffer, std::size_t offset, std::size_t size,
+                           u8* data) override;
 
-    void CopyBufferData(const OGLBuffer& src, const OGLBuffer& dst, std::size_t src_offset,
-                        std::size_t dst_offset, std::size_t size) override;
+    void CopyBlock(const Buffer& src, const Buffer& dst, std::size_t src_offset,
+                   std::size_t dst_offset, std::size_t size) override;
 };
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp
index 85424a4c9..03d434b28 100644
--- a/src/video_core/renderer_opengl/gl_device.cpp
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -27,6 +27,8 @@ Device::Device() {
     shader_storage_alignment = GetInteger<std::size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT);
     max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS);
     max_varyings = GetInteger<u32>(GL_MAX_VARYING_VECTORS);
+    has_warp_intrinsics = GLAD_GL_NV_gpu_shader5 && GLAD_GL_NV_shader_thread_group &&
+                          GLAD_GL_NV_shader_thread_shuffle;
     has_vertex_viewport_layer = GLAD_GL_ARB_shader_viewport_layer_array;
     has_variable_aoffi = TestVariableAoffi();
     has_component_indexing_bug = TestComponentIndexingBug();
@@ -36,6 +38,7 @@ Device::Device(std::nullptr_t) {
     uniform_buffer_alignment = 0;
     max_vertex_attributes = 16;
     max_varyings = 15;
+    has_warp_intrinsics = true;
     has_vertex_viewport_layer = true;
     has_variable_aoffi = true;
     has_component_indexing_bug = false;
diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h
index dc883722d..3ef7c6dd8 100644
--- a/src/video_core/renderer_opengl/gl_device.h
+++ b/src/video_core/renderer_opengl/gl_device.h
@@ -30,6 +30,10 @@ public:
         return max_varyings;
     }
 
+    bool HasWarpIntrinsics() const {
+        return has_warp_intrinsics; // Assigned by the Device constructors in gl_device.cpp.
+    }
+
     bool HasVertexViewportLayer() const {
         return has_vertex_viewport_layer;
     }
@@ -50,6 +54,7 @@ private:
     std::size_t shader_storage_alignment{};
     u32 max_vertex_attributes{};
     u32 max_varyings{};
+    bool has_warp_intrinsics{};
     bool has_vertex_viewport_layer{};
     bool has_variable_aoffi{};
     bool has_component_indexing_bug{};
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 80cfda7e4..bb09ecd52 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -708,8 +708,6 @@ void RasterizerOpenGL::DrawArrays() {
         return;
     }
 
-    const auto& regs = gpu.regs;
-
     SyncColorMask();
     SyncFragmentColorClampState();
     SyncMultiSampleState();
@@ -980,7 +978,7 @@ void RasterizerOpenGL::SetupGlobalMemory(const GLShader::GlobalMemoryEntry& entr
                                          GPUVAddr gpu_addr, std::size_t size) {
     const auto alignment{device.GetShaderStorageBufferAlignment()};
     const auto [ssbo, buffer_offset] =
-        buffer_cache.UploadMemory(gpu_addr, size, alignment, true, entry.IsWritten());
+        buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.IsWritten());
     bind_ssbo_pushbuffer.Push(ssbo, buffer_offset, static_cast<GLsizeiptr>(size));
 }
 
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index 1c90facc3..cf6a5cddf 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -212,7 +212,9 @@ CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEn
     const auto texture_buffer_usage{variant.texture_buffer_usage};
 
     std::string source = "#version 430 core\n"
-                         "#extension GL_ARB_separate_shader_objects : enable\n";
+                         "#extension GL_ARB_separate_shader_objects : enable\n"
+                         "#extension GL_NV_gpu_shader5 : enable\n"
+                         "#extension GL_NV_shader_thread_group : enable\n";
     if (entries.shader_viewport_layer_array) {
         source += "#extension GL_ARB_shader_viewport_layer_array : enable\n";
     }
@@ -247,20 +249,24 @@ CachedProgram SpecializeShader(const std::string& code, const GLShader::ShaderEn
         if (!texture_buffer_usage.test(i)) {
             continue;
         }
-        source += fmt::format("#define SAMPLER_{}_IS_BUFFER", i);
+        source += fmt::format("#define SAMPLER_{}_IS_BUFFER\n", i);
+    }
+    if (texture_buffer_usage.any()) {
+        source += '\n';
     }
 
     if (program_type == ProgramType::Geometry) {
         const auto [glsl_topology, debug_name, max_vertices] =
             GetPrimitiveDescription(primitive_mode);
 
-        source += "layout (" + std::string(glsl_topology) + ") in;\n";
+        source += "layout (" + std::string(glsl_topology) + ") in;\n\n";
         source += "#define MAX_VERTEX_INPUT " + std::to_string(max_vertices) + '\n';
     }
     if (program_type == ProgramType::Compute) {
         source += "layout (local_size_variable) in;\n";
     }
 
+    source += '\n';
     source += code;
 
     OGLShader shader;
@@ -289,7 +295,7 @@ std::set<GLenum> GetSupportedFormats() {
 
 CachedShader::CachedShader(const ShaderParameters& params, ProgramType program_type,
                            GLShader::ProgramResult result)
-    : RasterizerCacheObject{params.host_ptr}, host_ptr{params.host_ptr}, cpu_addr{params.cpu_addr},
+    : RasterizerCacheObject{params.host_ptr}, cpu_addr{params.cpu_addr},
       unique_identifier{params.unique_identifier}, program_type{program_type},
       disk_cache{params.disk_cache}, precompiled_programs{params.precompiled_programs},
       entries{result.second}, code{std::move(result.first)}, shader_length{entries.shader_length} {}
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h
index a3106a0ff..2c8faf855 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_cache.h
@@ -106,7 +106,6 @@ private:
 
     ShaderDiskCacheUsage GetUsage(const ProgramVariant& variant) const;
 
-    u8* host_ptr{};
     VAddr cpu_addr{};
     u64 unique_identifier{};
     ProgramType program_type{};
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index d8f722c26..359d58cbe 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -565,7 +565,7 @@ private:
                 case Tegra::Shader::ImageType::Texture1D:
                     return "image1D";
                 case Tegra::Shader::ImageType::TextureBuffer:
-                    return "bufferImage";
+                    return "imageBuffer";
                 case Tegra::Shader::ImageType::Texture1DArray:
                     return "image1DArray";
                 case Tegra::Shader::ImageType::Texture2D:
@@ -1735,6 +1735,48 @@ private:
         return "utof(gl_WorkGroupID"s + GetSwizzle(element) + ')';
     }
 
+    std::string BallotThread(Operation operation) {
+        const std::string value = VisitOperand(operation, 0, Type::Bool);
+        if (!device.HasWarpIntrinsics()) {
+            LOG_ERROR(Render_OpenGL,
+                      "Nvidia warp intrinsics are not available and its required by a shader");
+            // Stub on non-Nvidia devices by simulating all threads voting the same as the active
+            // one.
+            return fmt::format("utof({} ? 0xFFFFFFFFU : 0U)", value);
+        }
+        return fmt::format("utof(ballotThreadNV({}))", value);
+    }
+
+    std::string Vote(Operation operation, const char* func) {
+        const std::string value = VisitOperand(operation, 0, Type::Bool);
+        if (!device.HasWarpIntrinsics()) {
+            LOG_ERROR(Render_OpenGL,
+                      "Nvidia vote intrinsics are not available and its required by a shader");
+            // Stub with a warp size of one.
+            return value;
+        }
+        return fmt::format("{}({})", func, value);
+    }
+
+    std::string VoteAll(Operation operation) {
+        return Vote(operation, "allThreadsNV");
+    }
+
+    std::string VoteAny(Operation operation) {
+        return Vote(operation, "anyThreadNV");
+    }
+
+    std::string VoteEqual(Operation operation) {
+        if (!device.HasWarpIntrinsics()) {
+            LOG_ERROR(Render_OpenGL,
+                      "Nvidia vote intrinsics are not available and its required by a shader");
+            // We must return true here since a stub for a theoretical warp size of 1 will always
+            // return an equal result for all its votes.
+            return "true";
+        }
+        return Vote(operation, "allThreadsEqualNV");
+    }
+
     static constexpr std::array operation_decompilers = {
         &GLSLDecompiler::Assign,
 
@@ -1885,6 +1927,11 @@ private:
         &GLSLDecompiler::WorkGroupId<0>,
         &GLSLDecompiler::WorkGroupId<1>,
         &GLSLDecompiler::WorkGroupId<2>,
+
+        &GLSLDecompiler::BallotThread,
+        &GLSLDecompiler::VoteAll,
+        &GLSLDecompiler::VoteAny,
+        &GLSLDecompiler::VoteEqual,
     };
     static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));
 
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp
index 408332f90..4f135fe03 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -184,6 +184,9 @@ GLint GetSwizzleSource(SwizzleSource source) {
 }
 
 void ApplyTextureDefaults(const SurfaceParams& params, GLuint texture) {
+    if (params.IsBuffer()) {
+        return;
+    }
     glTextureParameteri(texture, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
     glTextureParameteri(texture, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
     glTextureParameteri(texture, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
@@ -208,6 +211,7 @@ OGLTexture CreateTexture(const SurfaceParams& params, GLenum target, GLenum inte
         glNamedBufferStorage(texture_buffer.handle, params.width * params.GetBytesPerPixel(),
                              nullptr, GL_DYNAMIC_STORAGE_BIT);
         glTextureBuffer(texture.handle, internal_format, texture_buffer.handle);
+        break;
     case SurfaceTarget::Texture2D:
     case SurfaceTarget::TextureCubemap:
         glTextureStorage2D(texture.handle, params.emulated_levels, internal_format, params.width,
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h
index ff6ab6988..21324488a 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.h
+++ b/src/video_core/renderer_opengl/gl_texture_cache.h
@@ -51,7 +51,7 @@ public:
     }
 
 protected:
-    void DecorateSurfaceName();
+    void DecorateSurfaceName() override;
 
     View CreateView(const ViewParams& view_key) override;
     View CreateViewInner(const ViewParams& view_key, bool is_proxy);
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
index 24a591797..a35b45c9c 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -1072,6 +1072,26 @@ private:
         return {};
     }
 
+    Id BallotThread(Operation) {
+        UNIMPLEMENTED();
+        return {};
+    }
+
+    Id VoteAll(Operation) {
+        UNIMPLEMENTED();
+        return {};
+    }
+
+    Id VoteAny(Operation) {
+        UNIMPLEMENTED();
+        return {};
+    }
+
+    Id VoteEqual(Operation) {
+        UNIMPLEMENTED();
+        return {};
+    }
+
     Id DeclareBuiltIn(spv::BuiltIn builtin, spv::StorageClass storage, Id type,
                       const std::string& name) {
         const Id id = OpVariable(type, storage);
@@ -1364,6 +1384,11 @@ private:
         &SPIRVDecompiler::WorkGroupId<0>,
         &SPIRVDecompiler::WorkGroupId<1>,
         &SPIRVDecompiler::WorkGroupId<2>,
+
+        &SPIRVDecompiler::BallotThread,
+        &SPIRVDecompiler::VoteAll,
+        &SPIRVDecompiler::VoteAny,
+        &SPIRVDecompiler::VoteEqual,
     };
     static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount));
 
diff --git a/src/video_core/shader/decode.cpp b/src/video_core/shader/decode.cpp
index b547d8323..47a9fd961 100644
--- a/src/video_core/shader/decode.cpp
+++ b/src/video_core/shader/decode.cpp
@@ -176,6 +176,7 @@ u32 ShaderIR::DecodeInstr(NodeBlock& bb, u32 pc) {
         {OpCode::Type::Ffma, &ShaderIR::DecodeFfma},
         {OpCode::Type::Hfma2, &ShaderIR::DecodeHfma2},
         {OpCode::Type::Conversion, &ShaderIR::DecodeConversion},
+        {OpCode::Type::Warp, &ShaderIR::DecodeWarp},
         {OpCode::Type::Memory, &ShaderIR::DecodeMemory},
         {OpCode::Type::Texture, &ShaderIR::DecodeTexture},
         {OpCode::Type::Image, &ShaderIR::DecodeImage},
diff --git a/src/video_core/shader/decode/float_set.cpp b/src/video_core/shader/decode/float_set.cpp
index f5013e44a..5614e8a0d 100644
--- a/src/video_core/shader/decode/float_set.cpp
+++ b/src/video_core/shader/decode/float_set.cpp
@@ -15,7 +15,6 @@ using Tegra::Shader::OpCode;
 
 u32 ShaderIR::DecodeFloatSet(NodeBlock& bb, u32 pc) {
     const Instruction instr = {program_code[pc]};
-    const auto opcode = OpCode::Decode(instr);
 
     const Node op_a = GetOperandAbsNegFloat(GetRegister(instr.gpr8), instr.fset.abs_a != 0,
                                             instr.fset.neg_a != 0);
diff --git a/src/video_core/shader/decode/float_set_predicate.cpp b/src/video_core/shader/decode/float_set_predicate.cpp
index 815464f28..200c2c983 100644
--- a/src/video_core/shader/decode/float_set_predicate.cpp
+++ b/src/video_core/shader/decode/float_set_predicate.cpp
@@ -16,7 +16,6 @@ using Tegra::Shader::Pred;
 
 u32 ShaderIR::DecodeFloatSetPredicate(NodeBlock& bb, u32 pc) {
     const Instruction instr = {program_code[pc]};
-    const auto opcode = OpCode::Decode(instr);
 
     Node op_a = GetOperandAbsNegFloat(GetRegister(instr.gpr8), instr.fsetp.abs_a != 0,
                                       instr.fsetp.neg_a != 0);
diff --git a/src/video_core/shader/decode/integer_set.cpp b/src/video_core/shader/decode/integer_set.cpp
index 46e3d5905..59809bcd8 100644
--- a/src/video_core/shader/decode/integer_set.cpp
+++ b/src/video_core/shader/decode/integer_set.cpp
@@ -14,7 +14,6 @@ using Tegra::Shader::OpCode;
 
 u32 ShaderIR::DecodeIntegerSet(NodeBlock& bb, u32 pc) {
     const Instruction instr = {program_code[pc]};
-    const auto opcode = OpCode::Decode(instr);
 
     const Node op_a = GetRegister(instr.gpr8);
     const Node op_b = [&]() {
diff --git a/src/video_core/shader/decode/integer_set_predicate.cpp b/src/video_core/shader/decode/integer_set_predicate.cpp
index dd20775d7..25e48fef8 100644
--- a/src/video_core/shader/decode/integer_set_predicate.cpp
+++ b/src/video_core/shader/decode/integer_set_predicate.cpp
@@ -16,7 +16,6 @@ using Tegra::Shader::Pred;
 
 u32 ShaderIR::DecodeIntegerSetPredicate(NodeBlock& bb, u32 pc) {
     const Instruction instr = {program_code[pc]};
-    const auto opcode = OpCode::Decode(instr);
 
     const Node op_a = GetRegister(instr.gpr8);
 
diff --git a/src/video_core/shader/decode/other.cpp b/src/video_core/shader/decode/other.cpp
index ac0e764d6..d46e0f823 100644
--- a/src/video_core/shader/decode/other.cpp
+++ b/src/video_core/shader/decode/other.cpp
@@ -74,6 +74,13 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) {
             case SystemVariable::InvocationInfo:
                 LOG_WARNING(HW_GPU, "MOV_SYS instruction with InvocationInfo is incomplete");
                 return Immediate(0u);
+            case SystemVariable::Tid: {
+                Node value = Immediate(0);
+                value = BitfieldInsert(value, Operation(OperationCode::LocalInvocationIdX), 0, 9);
+                value = BitfieldInsert(value, Operation(OperationCode::LocalInvocationIdY), 16, 9);
+                value = BitfieldInsert(value, Operation(OperationCode::LocalInvocationIdZ), 26, 5);
+                return value;
+            }
             case SystemVariable::TidX:
                 return Operation(OperationCode::LocalInvocationIdX);
             case SystemVariable::TidY:
diff --git a/src/video_core/shader/decode/predicate_set_register.cpp b/src/video_core/shader/decode/predicate_set_register.cpp
index febbfeb50..84dbc50fe 100644
--- a/src/video_core/shader/decode/predicate_set_register.cpp
+++ b/src/video_core/shader/decode/predicate_set_register.cpp
@@ -15,7 +15,6 @@ using Tegra::Shader::OpCode;
 
 u32 ShaderIR::DecodePredicateSetRegister(NodeBlock& bb, u32 pc) {
     const Instruction instr = {program_code[pc]};
-    const auto opcode = OpCode::Decode(instr);
 
     UNIMPLEMENTED_IF_MSG(instr.generates_cc,
                          "Condition codes generation in PSET is not implemented");
diff --git a/src/video_core/shader/decode/warp.cpp b/src/video_core/shader/decode/warp.cpp
new file mode 100644
index 000000000..04ca74f46
--- /dev/null
+++ b/src/video_core/shader/decode/warp.cpp
@@ -0,0 +1,55 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/assert.h"
+#include "common/common_types.h"
+#include "video_core/engines/shader_bytecode.h"
+#include "video_core/shader/node_helper.h"
+#include "video_core/shader/shader_ir.h"
+
+namespace VideoCommon::Shader {
+
+using Tegra::Shader::Instruction;
+using Tegra::Shader::OpCode;
+using Tegra::Shader::Pred;
+using Tegra::Shader::VoteOperation;
+
+namespace {
+OperationCode GetOperationCode(VoteOperation vote_op) {
+    switch (vote_op) {
+    case VoteOperation::All:
+        return OperationCode::VoteAll;
+    case VoteOperation::Any:
+        return OperationCode::VoteAny;
+    case VoteOperation::Eq:
+        return OperationCode::VoteEqual;
+    default:
+        UNREACHABLE_MSG("Invalid vote operation={}", static_cast<u64>(vote_op));
+        return OperationCode::VoteAll;
+    }
+}
+} // Anonymous namespace
+
+u32 ShaderIR::DecodeWarp(NodeBlock& bb, u32 pc) {
+    const Instruction instr = {program_code[pc]};
+    const auto opcode = OpCode::Decode(instr);
+
+    switch (opcode->get().GetId()) {
+    case OpCode::Id::VOTE: {
+        const Node value = GetPredicate(instr.vote.value, instr.vote.negate_value != 0);
+        const Node active = Operation(OperationCode::BallotThread, value);
+        const Node vote = Operation(GetOperationCode(instr.vote.operation), value);
+        SetRegister(bb, instr.gpr0, active);
+        SetPredicate(bb, instr.vote.dest_pred, vote);
+        break;
+    }
+    default:
+        UNIMPLEMENTED_MSG("Unhandled warp instruction: {}", opcode->get().GetName());
+        break;
+    }
+
+    return pc;
+}
+
+} // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h
index 5f0852364..5db9313c4 100644
--- a/src/video_core/shader/node.h
+++ b/src/video_core/shader/node.h
@@ -168,6 +168,11 @@ enum class OperationCode {
     WorkGroupIdY,       /// () -> uint
     WorkGroupIdZ,       /// () -> uint
 
+    BallotThread, /// (bool) -> uint
+    VoteAll,      /// (bool) -> bool
+    VoteAny,      /// (bool) -> bool
+    VoteEqual,    /// (bool) -> bool
+
     Amount,
 };
 
diff --git a/src/video_core/shader/shader_ir.cpp b/src/video_core/shader/shader_ir.cpp
index 5e91fe129..1e5c7f660 100644
--- a/src/video_core/shader/shader_ir.cpp
+++ b/src/video_core/shader/shader_ir.cpp
@@ -405,4 +405,9 @@ Node ShaderIR::BitfieldExtract(Node value, u32 offset, u32 bits) {
                      Immediate(offset), Immediate(bits));
 }
 
+Node ShaderIR::BitfieldInsert(Node base, Node insert, u32 offset, u32 bits) {
+    return Operation(OperationCode::UBitfieldInsert, NO_PRECISE, base, insert, Immediate(offset),
+                     Immediate(bits));
+}
+
 } // namespace VideoCommon::Shader
diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h
index 59a083d90..bcc9b79b6 100644
--- a/src/video_core/shader/shader_ir.h
+++ b/src/video_core/shader/shader_ir.h
@@ -167,6 +167,7 @@ private:
     u32 DecodeFfma(NodeBlock& bb, u32 pc);
     u32 DecodeHfma2(NodeBlock& bb, u32 pc);
     u32 DecodeConversion(NodeBlock& bb, u32 pc);
+    u32 DecodeWarp(NodeBlock& bb, u32 pc);
     u32 DecodeMemory(NodeBlock& bb, u32 pc);
     u32 DecodeTexture(NodeBlock& bb, u32 pc);
     u32 DecodeImage(NodeBlock& bb, u32 pc);
@@ -279,6 +280,9 @@ private:
     /// Extracts a sequence of bits from a node
     Node BitfieldExtract(Node value, u32 offset, u32 bits);
 
+    /// Inserts a sequence of bits from one node (insert) into another node (base)
+    Node BitfieldInsert(Node base, Node insert, u32 offset, u32 bits);
+
     void WriteTexInstructionFloat(NodeBlock& bb, Tegra::Shader::Instruction instr,
                                   const Node4& components);
 
diff --git a/src/video_core/texture_cache/surface_params.h b/src/video_core/texture_cache/surface_params.h
index 358d6757c..e7ef66ee2 100644
--- a/src/video_core/texture_cache/surface_params.h
+++ b/src/video_core/texture_cache/surface_params.h
@@ -58,7 +58,6 @@ public:
     std::size_t GetHostSizeInBytes() const {
         std::size_t host_size_in_bytes;
         if (GetCompressionType() == SurfaceCompression::Converted) {
-            constexpr std::size_t rgb8_bpp = 4ULL;
             // ASTC is uncompressed in software, in emulated as RGBA8
             host_size_in_bytes = 0;
             for (u32 level = 0; level < num_levels; ++level) {
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h
index a3a3770a7..2ec0203d1 100644
--- a/src/video_core/texture_cache/texture_cache.h
+++ b/src/video_core/texture_cache/texture_cache.h
@@ -308,8 +308,6 @@ protected:
         if (!guard_render_targets && surface->IsRenderTarget()) {
             ManageRenderTargetUnregister(surface);
         }
-        const GPUVAddr gpu_addr = surface->GetGpuAddr();
-        const CacheAddr cache_ptr = surface->GetCacheAddr();
         const std::size_t size = surface->GetSizeInBytes();
         const VAddr cpu_addr = surface->GetCpuAddr();
         rasterizer.UpdatePagesCachedCount(cpu_addr, size, -1);
diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp
index 7e8295944..7df5f1452 100644
--- a/src/video_core/textures/decoders.cpp
+++ b/src/video_core/textures/decoders.cpp
@@ -257,19 +257,21 @@ std::vector<u8> UnswizzleTexture(u8* address, u32 tile_size_x, u32 tile_size_y,
 
 void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width,
                     u32 bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data,
-                    u32 block_height_bit) {
+                    u32 block_height_bit, u32 offset_x, u32 offset_y) {
     const u32 block_height = 1U << block_height_bit;
     const u32 image_width_in_gobs{(swizzled_width * bytes_per_pixel + (gob_size_x - 1)) /
                                   gob_size_x};
     for (u32 line = 0; line < subrect_height; ++line) {
+        const u32 dst_y = line + offset_y;
         const u32 gob_address_y =
-            (line / (gob_size_y * block_height)) * gob_size * block_height * image_width_in_gobs +
-            ((line % (gob_size_y * block_height)) / gob_size_y) * gob_size;
-        const auto& table = legacy_swizzle_table[line % gob_size_y];
+            (dst_y / (gob_size_y * block_height)) * gob_size * block_height * image_width_in_gobs +
+            ((dst_y % (gob_size_y * block_height)) / gob_size_y) * gob_size;
+        const auto& table = legacy_swizzle_table[dst_y % gob_size_y];
         for (u32 x = 0; x < subrect_width; ++x) {
+            const u32 dst_x = x + offset_x;
             const u32 gob_address =
-                gob_address_y + (x * bytes_per_pixel / gob_size_x) * gob_size * block_height;
-            const u32 swizzled_offset = gob_address + table[(x * bytes_per_pixel) % gob_size_x];
+                gob_address_y + (dst_x * bytes_per_pixel / gob_size_x) * gob_size * block_height;
+            const u32 swizzled_offset = gob_address + table[(dst_x * bytes_per_pixel) % gob_size_x];
             u8* source_line = unswizzled_data + line * source_pitch + x * bytes_per_pixel;
             u8* dest_addr = swizzled_data + swizzled_offset;
 
diff --git a/src/video_core/textures/decoders.h b/src/video_core/textures/decoders.h
index eaec9b5a5..f1e3952bc 100644
--- a/src/video_core/textures/decoders.h
+++ b/src/video_core/textures/decoders.h
@@ -44,7 +44,8 @@ std::size_t CalculateSize(bool tiled, u32 bytes_per_pixel, u32 width, u32 height
 
 /// Copies an untiled subrectangle into a tiled surface.
 void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width,
-                    u32 bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, u32 block_height);
+                    u32 bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, u32 block_height,
+                    u32 offset_x, u32 offset_y);
 
 /// Copies a tiled subrectangle into a linear surface.
 void UnswizzleSubrect(u32 subrect_width, u32 subrect_height, u32 dest_pitch, u32 swizzled_width,
diff --git a/src/video_core/textures/texture.h b/src/video_core/textures/texture.h
index e3be018b9..e36bc2c04 100644
--- a/src/video_core/textures/texture.h
+++ b/src/video_core/textures/texture.h
@@ -213,7 +213,7 @@ struct TICEntry {
         if (header_version != TICHeaderVersion::OneDBuffer) {
             return width_minus_1 + 1;
         }
-        return (buffer_high_width_minus_one << 16) | buffer_low_width_minus_one;
+        return ((buffer_high_width_minus_one << 16) | buffer_low_width_minus_one) + 1;
     }
 
     u32 Height() const {
diff --git a/src/yuzu/configuration/config.cpp b/src/yuzu/configuration/config.cpp
index 5d0fb3f9f..0456248ac 100644
--- a/src/yuzu/configuration/config.cpp
+++ b/src/yuzu/configuration/config.cpp
@@ -516,6 +516,7 @@ void Config::ReadPathValues() {
 
     UISettings::values.roms_path = ReadSetting(QStringLiteral("romsPath")).toString();
     UISettings::values.symbols_path = ReadSetting(QStringLiteral("symbolsPath")).toString();
+    UISettings::values.screenshot_path = ReadSetting(QStringLiteral("screenshotPath")).toString();
     UISettings::values.game_directory_path =
         ReadSetting(QStringLiteral("gameListRootDir"), QStringLiteral(".")).toString();
     UISettings::values.game_directory_deepscan =
diff --git a/src/yuzu/main.cpp b/src/yuzu/main.cpp
index a7c656fdb..ac57229d5 100644
--- a/src/yuzu/main.cpp
+++ b/src/yuzu/main.cpp
@@ -119,6 +119,7 @@ Q_IMPORT_PLUGIN(QWindowsIntegrationPlugin);
 #endif
 
 #ifdef _WIN32
+#include <windows.h>
 extern "C" {
 // tells Nvidia and AMD drivers to use the dedicated GPU by default on laptops with switchable
 // graphics
@@ -747,6 +748,18 @@ void GMainWindow::OnDisplayTitleBars(bool show) {
     }
 }
 
+void GMainWindow::PreventOSSleep() {
+#ifdef _WIN32
+    SetThreadExecutionState(ES_CONTINUOUS | ES_SYSTEM_REQUIRED | ES_DISPLAY_REQUIRED);
+#endif
+}
+
+void GMainWindow::AllowOSSleep() {
+#ifdef _WIN32
+    SetThreadExecutionState(ES_CONTINUOUS);
+#endif
+}
+
 QStringList GMainWindow::GetUnsupportedGLExtensions() {
     QStringList unsupported_ext;
 
@@ -966,6 +979,8 @@ void GMainWindow::BootGame(const QString& filename) {
 }
 
 void GMainWindow::ShutdownGame() {
+    AllowOSSleep();
+
     discord_rpc->Pause();
     emu_thread->RequestStop();
 
@@ -1567,6 +1582,8 @@ void GMainWindow::OnMenuRecentFile() {
 }
 
 void GMainWindow::OnStartGame() {
+    PreventOSSleep();
+
     emu_thread->SetRunning(true);
 
     qRegisterMetaType<Core::Frontend::SoftwareKeyboardParameters>(
@@ -1598,6 +1615,8 @@ void GMainWindow::OnPauseGame() {
     ui.action_Pause->setEnabled(false);
     ui.action_Stop->setEnabled(true);
     ui.action_Capture_Screenshot->setEnabled(false);
+
+    AllowOSSleep();
 }
 
 void GMainWindow::OnStopGame() {
diff --git a/src/yuzu/main.h b/src/yuzu/main.h
index 1137bbc7a..501608ddc 100644
--- a/src/yuzu/main.h
+++ b/src/yuzu/main.h
@@ -130,6 +130,9 @@ private:
     void ConnectWidgetEvents();
     void ConnectMenuEvents();
 
+    void PreventOSSleep();
+    void AllowOSSleep();
+
     QStringList GetUnsupportedGLExtensions();
     bool LoadROM(const QString& filename);
     void BootGame(const QString& filename);