From a16c2611316e534bda310f99319f4e8c74c49c92 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Sun, 20 Nov 2022 00:09:56 +0100 Subject: Buffer Cache: Fully rework the buffer cache. --- src/video_core/CMakeLists.txt | 5 + src/video_core/buffer_cache/buffer_base.h | 459 +--------- src/video_core/buffer_cache/buffer_cache.cpp | 4 +- src/video_core/buffer_cache/buffer_cache.h | 990 +++++++-------------- src/video_core/buffer_cache/buffer_cache_base.h | 507 +++++++++++ src/video_core/buffer_cache/memory_tracker_base.h | 258 ++++++ src/video_core/buffer_cache/word_manager.h | 474 ++++++++++ src/video_core/renderer_opengl/gl_buffer_cache.h | 4 + .../renderer_opengl/gl_buffer_cache_base.cpp | 9 + src/video_core/renderer_vulkan/vk_buffer_cache.cpp | 8 +- src/video_core/renderer_vulkan/vk_buffer_cache.h | 8 +- .../renderer_vulkan/vk_buffer_cache_base.cpp | 9 + 12 files changed, 1644 insertions(+), 1091 deletions(-) create mode 100644 src/video_core/buffer_cache/buffer_cache_base.h create mode 100644 src/video_core/buffer_cache/memory_tracker_base.h create mode 100644 src/video_core/buffer_cache/word_manager.h create mode 100644 src/video_core/renderer_opengl/gl_buffer_cache_base.cpp create mode 100644 src/video_core/renderer_vulkan/vk_buffer_cache_base.cpp (limited to 'src/video_core') diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index e904573d7..92cab93f3 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -11,8 +11,11 @@ endif() add_library(video_core STATIC buffer_cache/buffer_base.h + buffer_cache/buffer_cache_base.h buffer_cache/buffer_cache.cpp buffer_cache/buffer_cache.h + buffer_cache/memory_tracker_base.h + buffer_cache/word_manager.h cache_types.h cdma_pusher.cpp cdma_pusher.h @@ -104,6 +107,7 @@ add_library(video_core STATIC renderer_null/renderer_null.h renderer_opengl/blit_image.cpp renderer_opengl/blit_image.h + renderer_opengl/gl_buffer_cache_base.cpp renderer_opengl/gl_buffer_cache.cpp renderer_opengl/gl_buffer_cache.h renderer_opengl/gl_compute_pipeline.cpp @@ -154,6 +158,7 @@ add_library(video_core STATIC renderer_vulkan/renderer_vulkan.cpp renderer_vulkan/vk_blit_screen.cpp renderer_vulkan/vk_blit_screen.h + renderer_vulkan/vk_buffer_cache_base.cpp renderer_vulkan/vk_buffer_cache.cpp renderer_vulkan/vk_buffer_cache.h renderer_vulkan/vk_command_pool.cpp diff --git a/src/video_core/buffer_cache/buffer_base.h b/src/video_core/buffer_cache/buffer_base.h index 1b4d63616..66d8bb43c 100644 --- a/src/video_core/buffer_cache/buffer_base.h +++ b/src/video_core/buffer_cache/buffer_base.h @@ -1,5 +1,5 @@ -// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later +// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later #pragma once @@ -11,9 +11,7 @@ #include "common/alignment.h" #include "common/common_funcs.h" #include "common/common_types.h" -#include "common/div_ceil.h" -#include "common/settings.h" -#include "core/memory.h" +#include "video_core/buffer_cache/word_manager.h" namespace VideoCommon { @@ -36,116 +34,14 @@ struct NullBufferParams {}; */ template class BufferBase { - static constexpr u64 PAGES_PER_WORD = 64; - static constexpr u64 BYTES_PER_PAGE = Core::Memory::YUZU_PAGESIZE; - static constexpr u64 BYTES_PER_WORD = PAGES_PER_WORD * BYTES_PER_PAGE; - - /// Vector tracking modified pages tightly packed with small vector optimization - union WordsArray { - /// Returns the pointer to the words state - [[nodiscard]] const u64* Pointer(bool is_short) const noexcept { - return is_short ? &stack : heap; - } - - /// Returns the pointer to the words state - [[nodiscard]] u64* Pointer(bool is_short) noexcept { - return is_short ? &stack : heap; - } - - u64 stack = 0; ///< Small buffers storage - u64* heap; ///< Not-small buffers pointer to the storage - }; - - struct Words { - explicit Words() = default; - explicit Words(u64 size_bytes_) : size_bytes{size_bytes_} { - if (IsShort()) { - cpu.stack = ~u64{0}; - gpu.stack = 0; - cached_cpu.stack = 0; - untracked.stack = ~u64{0}; - } else { - // Share allocation between CPU and GPU pages and set their default values - const size_t num_words = NumWords(); - u64* const alloc = new u64[num_words * 4]; - cpu.heap = alloc; - gpu.heap = alloc + num_words; - cached_cpu.heap = alloc + num_words * 2; - untracked.heap = alloc + num_words * 3; - std::fill_n(cpu.heap, num_words, ~u64{0}); - std::fill_n(gpu.heap, num_words, 0); - std::fill_n(cached_cpu.heap, num_words, 0); - std::fill_n(untracked.heap, num_words, ~u64{0}); - } - // Clean up tailing bits - const u64 last_word_size = size_bytes % BYTES_PER_WORD; - const u64 last_local_page = Common::DivCeil(last_word_size, BYTES_PER_PAGE); - const u64 shift = (PAGES_PER_WORD - last_local_page) % PAGES_PER_WORD; - const u64 last_word = (~u64{0} << shift) >> shift; - cpu.Pointer(IsShort())[NumWords() - 1] = last_word; - untracked.Pointer(IsShort())[NumWords() - 1] = last_word; - } - - ~Words() { - Release(); - } - - Words& operator=(Words&& rhs) noexcept { - Release(); - size_bytes = rhs.size_bytes; - cpu = rhs.cpu; - gpu = rhs.gpu; - cached_cpu = rhs.cached_cpu; - untracked = rhs.untracked; - rhs.cpu.heap = nullptr; - return *this; - } - - Words(Words&& rhs) noexcept - : size_bytes{rhs.size_bytes}, cpu{rhs.cpu}, gpu{rhs.gpu}, - cached_cpu{rhs.cached_cpu}, untracked{rhs.untracked} { - rhs.cpu.heap = nullptr; - } - - Words& operator=(const Words&) = delete; - Words(const Words&) = delete; - - /// Returns true when the buffer fits in the small vector optimization - [[nodiscard]] bool IsShort() const noexcept { - return size_bytes <= BYTES_PER_WORD; - } - - /// Returns the number of words of the buffer - [[nodiscard]] size_t NumWords() const noexcept { - return Common::DivCeil(size_bytes, BYTES_PER_WORD); - } - - /// Release buffer resources - void Release() { - if (!IsShort()) { - // CPU written words is the base for the heap allocation - delete[] cpu.heap; - } - } - - u64 size_bytes = 0; - WordsArray cpu; - WordsArray gpu; - WordsArray cached_cpu; - WordsArray untracked; - }; - - enum class Type { - CPU, - GPU, - CachedCPU, - Untracked, - }; - public: + static constexpr u64 BASE_PAGE_BITS = 16; + static constexpr u64 BASE_PAGE_SIZE = 1ULL << BASE_PAGE_BITS; + explicit BufferBase(RasterizerInterface& rasterizer_, VAddr cpu_addr_, u64 size_bytes) - : rasterizer{&rasterizer_}, cpu_addr{Common::AlignDown(cpu_addr_, BYTES_PER_PAGE)}, - words(Common::AlignUp(size_bytes + (cpu_addr_ - cpu_addr), BYTES_PER_PAGE)) {} + : cpu_addr{Common::AlignDown(cpu_addr_, BASE_PAGE_SIZE)}, + word_manager(cpu_addr, rasterizer_, + Common::AlignUp(size_bytes + (cpu_addr_ - cpu_addr), BASE_PAGE_SIZE)) {} explicit BufferBase(NullBufferParams) {} @@ -159,94 +55,82 @@ public: [[nodiscard]] std::pair ModifiedCpuRegion(VAddr query_cpu_addr, u64 query_size) const noexcept { const u64 offset = query_cpu_addr - cpu_addr; - return ModifiedRegion(offset, query_size); + return word_manager.ModifiedRegion(offset, query_size); } /// Returns the inclusive GPU modified range in a begin end pair [[nodiscard]] std::pair ModifiedGpuRegion(VAddr query_cpu_addr, u64 query_size) const noexcept { const u64 offset = query_cpu_addr - cpu_addr; - return ModifiedRegion(offset, query_size); + return word_manager.ModifiedRegion(offset, query_size); } /// Returns true if a region has been modified from the CPU [[nodiscard]] bool IsRegionCpuModified(VAddr query_cpu_addr, u64 query_size) const noexcept { const u64 offset = query_cpu_addr - cpu_addr; - return IsRegionModified(offset, query_size); + return word_manager.IsRegionModified(offset, query_size); } /// Returns true if a region has been modified from the GPU [[nodiscard]] bool IsRegionGpuModified(VAddr query_cpu_addr, u64 query_size) const noexcept { const u64 offset = query_cpu_addr - cpu_addr; - return IsRegionModified(offset, query_size); + return word_manager.IsRegionModified(offset, query_size); } /// Mark region as CPU modified, notifying the rasterizer about this change void MarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 size) { - ChangeRegionState(dirty_cpu_addr, size); + word_manager.ChangeRegionState(dirty_cpu_addr, size); } /// Unmark region as CPU modified, notifying the rasterizer about this change void UnmarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 size) { - ChangeRegionState(dirty_cpu_addr, size); + word_manager.ChangeRegionState(dirty_cpu_addr, size); } /// Mark region as modified from the host GPU void MarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 size) noexcept { - ChangeRegionState(dirty_cpu_addr, size); + word_manager.ChangeRegionState(dirty_cpu_addr, size); } /// Unmark region as modified from the host GPU void UnmarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 size) noexcept { - ChangeRegionState(dirty_cpu_addr, size); + word_manager.ChangeRegionState(dirty_cpu_addr, size); } /// Mark region as modified from the CPU /// but don't mark it as modified until FlusHCachedWrites is called. void CachedCpuWrite(VAddr dirty_cpu_addr, u64 size) { flags |= BufferFlagBits::CachedWrites; - ChangeRegionState(dirty_cpu_addr, size); + word_manager.ChangeRegionState(dirty_cpu_addr, size); } /// Flushes cached CPU writes, and notify the rasterizer about the deltas void FlushCachedWrites() noexcept { flags &= ~BufferFlagBits::CachedWrites; - const u64 num_words = NumWords(); - u64* const cached_words = Array(); - u64* const untracked_words = Array(); - u64* const cpu_words = Array(); - for (u64 word_index = 0; word_index < num_words; ++word_index) { - const u64 cached_bits = cached_words[word_index]; - NotifyRasterizer(word_index, untracked_words[word_index], cached_bits); - untracked_words[word_index] |= cached_bits; - cpu_words[word_index] |= cached_bits; - if (!Settings::values.use_pessimistic_flushes) { - cached_words[word_index] = 0; - } - } + word_manager.FlushCachedWrites(); } /// Call 'func' for each CPU modified range and unmark those pages as CPU modified template void ForEachUploadRange(VAddr query_cpu_range, u64 size, Func&& func) { - ForEachModifiedRange(query_cpu_range, size, true, func); + word_manager.ForEachModifiedRange(query_cpu_range, size, true, func); } /// Call 'func' for each GPU modified range and unmark those pages as GPU modified template void ForEachDownloadRange(VAddr query_cpu_range, u64 size, bool clear, Func&& func) { - ForEachModifiedRange(query_cpu_range, size, clear, func); + word_manager.ForEachModifiedRange(query_cpu_range, size, clear, func); } template void ForEachDownloadRangeAndClear(VAddr query_cpu_range, u64 size, Func&& func) { - ForEachModifiedRange(query_cpu_range, size, true, func); + word_manager.ForEachModifiedRange(query_cpu_range, size, true, func); } /// Call 'func' for each GPU modified range and unmark those pages as GPU modified template void ForEachDownloadRange(Func&& func) { - ForEachModifiedRange(cpu_addr, SizeBytes(), true, func); + word_manager.ForEachModifiedRange(cpu_addr, SizeBytes(), true, func); } /// Mark buffer as picked @@ -297,7 +181,7 @@ public: /// Returns the size in bytes of the buffer [[nodiscard]] u64 SizeBytes() const noexcept { - return words.size_bytes; + return word_manager.SizeBytes(); } size_t getLRUID() const noexcept { @@ -309,301 +193,8 @@ public: } private: - template - u64* Array() noexcept { - if constexpr (type == Type::CPU) { - return words.cpu.Pointer(IsShort()); - } else if constexpr (type == Type::GPU) { - return words.gpu.Pointer(IsShort()); - } else if constexpr (type == Type::CachedCPU) { - return words.cached_cpu.Pointer(IsShort()); - } else if constexpr (type == Type::Untracked) { - return words.untracked.Pointer(IsShort()); - } - } - - template - const u64* Array() const noexcept { - if constexpr (type == Type::CPU) { - return words.cpu.Pointer(IsShort()); - } else if constexpr (type == Type::GPU) { - return words.gpu.Pointer(IsShort()); - } else if constexpr (type == Type::CachedCPU) { - return words.cached_cpu.Pointer(IsShort()); - } else if constexpr (type == Type::Untracked) { - return words.untracked.Pointer(IsShort()); - } - } - - /** - * Change the state of a range of pages - * - * @param dirty_addr Base address to mark or unmark as modified - * @param size Size in bytes to mark or unmark as modified - */ - template - void ChangeRegionState(u64 dirty_addr, s64 size) noexcept(type == Type::GPU) { - const s64 difference = dirty_addr - cpu_addr; - const u64 offset = std::max(difference, 0); - size += std::min(difference, 0); - if (offset >= SizeBytes() || size < 0) { - return; - } - u64* const untracked_words = Array(); - u64* const state_words = Array(); - const u64 offset_end = std::min(offset + size, SizeBytes()); - const u64 begin_page_index = offset / BYTES_PER_PAGE; - const u64 begin_word_index = begin_page_index / PAGES_PER_WORD; - const u64 end_page_index = Common::DivCeil(offset_end, BYTES_PER_PAGE); - const u64 end_word_index = Common::DivCeil(end_page_index, PAGES_PER_WORD); - u64 page_index = begin_page_index % PAGES_PER_WORD; - u64 word_index = begin_word_index; - while (word_index < end_word_index) { - const u64 next_word_first_page = (word_index + 1) * PAGES_PER_WORD; - const u64 left_offset = - std::min(next_word_first_page - end_page_index, PAGES_PER_WORD) % PAGES_PER_WORD; - const u64 right_offset = page_index; - u64 bits = ~u64{0}; - bits = (bits >> right_offset) << right_offset; - bits = (bits << left_offset) >> left_offset; - if constexpr (type == Type::CPU || type == Type::CachedCPU) { - NotifyRasterizer(word_index, untracked_words[word_index], bits); - } - if constexpr (enable) { - state_words[word_index] |= bits; - if constexpr (type == Type::CPU || type == Type::CachedCPU) { - untracked_words[word_index] |= bits; - } - } else { - state_words[word_index] &= ~bits; - if constexpr (type == Type::CPU || type == Type::CachedCPU) { - untracked_words[word_index] &= ~bits; - } - } - page_index = 0; - ++word_index; - } - } - - /** - * Notify rasterizer about changes in the CPU tracking state of a word in the buffer - * - * @param word_index Index to the word to notify to the rasterizer - * @param current_bits Current state of the word - * @param new_bits New state of the word - * - * @tparam add_to_rasterizer True when the rasterizer should start tracking the new pages - */ - template - void NotifyRasterizer(u64 word_index, u64 current_bits, u64 new_bits) const { - u64 changed_bits = (add_to_rasterizer ? current_bits : ~current_bits) & new_bits; - VAddr addr = cpu_addr + word_index * BYTES_PER_WORD; - while (changed_bits != 0) { - const int empty_bits = std::countr_zero(changed_bits); - addr += empty_bits * BYTES_PER_PAGE; - changed_bits >>= empty_bits; - - const u32 continuous_bits = std::countr_one(changed_bits); - const u64 size = continuous_bits * BYTES_PER_PAGE; - const VAddr begin_addr = addr; - addr += size; - changed_bits = continuous_bits < PAGES_PER_WORD ? (changed_bits >> continuous_bits) : 0; - rasterizer->UpdatePagesCachedCount(begin_addr, size, add_to_rasterizer ? 1 : -1); - } - } - - /** - * Loop over each page in the given range, turn off those bits and notify the rasterizer if - * needed. Call the given function on each turned off range. - * - * @param query_cpu_range Base CPU address to loop over - * @param size Size in bytes of the CPU range to loop over - * @param func Function to call for each turned off region - */ - template - void ForEachModifiedRange(VAddr query_cpu_range, s64 size, bool clear, Func&& func) { - static_assert(type != Type::Untracked); - - const s64 difference = query_cpu_range - cpu_addr; - const u64 query_begin = std::max(difference, 0); - size += std::min(difference, 0); - if (query_begin >= SizeBytes() || size < 0) { - return; - } - u64* const untracked_words = Array(); - u64* const state_words = Array(); - const u64 query_end = query_begin + std::min(static_cast(size), SizeBytes()); - u64* const words_begin = state_words + query_begin / BYTES_PER_WORD; - u64* const words_end = state_words + Common::DivCeil(query_end, BYTES_PER_WORD); - - const auto modified = [](u64 word) { return word != 0; }; - const auto first_modified_word = std::find_if(words_begin, words_end, modified); - if (first_modified_word == words_end) { - // Exit early when the buffer is not modified - return; - } - const auto last_modified_word = std::find_if_not(first_modified_word, words_end, modified); - - const u64 word_index_begin = std::distance(state_words, first_modified_word); - const u64 word_index_end = std::distance(state_words, last_modified_word); - - const unsigned local_page_begin = std::countr_zero(*first_modified_word); - const unsigned local_page_end = - static_cast(PAGES_PER_WORD) - std::countl_zero(last_modified_word[-1]); - const u64 word_page_begin = word_index_begin * PAGES_PER_WORD; - const u64 word_page_end = (word_index_end - 1) * PAGES_PER_WORD; - const u64 query_page_begin = query_begin / BYTES_PER_PAGE; - const u64 query_page_end = Common::DivCeil(query_end, BYTES_PER_PAGE); - const u64 page_index_begin = std::max(word_page_begin + local_page_begin, query_page_begin); - const u64 page_index_end = std::min(word_page_end + local_page_end, query_page_end); - const u64 first_word_page_begin = page_index_begin % PAGES_PER_WORD; - const u64 last_word_page_end = (page_index_end - 1) % PAGES_PER_WORD + 1; - - u64 page_begin = first_word_page_begin; - u64 current_base = 0; - u64 current_size = 0; - bool on_going = false; - for (u64 word_index = word_index_begin; word_index < word_index_end; ++word_index) { - const bool is_last_word = word_index + 1 == word_index_end; - const u64 page_end = is_last_word ? last_word_page_end : PAGES_PER_WORD; - const u64 right_offset = page_begin; - const u64 left_offset = PAGES_PER_WORD - page_end; - u64 bits = ~u64{0}; - bits = (bits >> right_offset) << right_offset; - bits = (bits << left_offset) >> left_offset; - - const u64 current_word = state_words[word_index] & bits; - if (clear) { - state_words[word_index] &= ~bits; - } - - if constexpr (type == Type::CPU) { - const u64 current_bits = untracked_words[word_index] & bits; - untracked_words[word_index] &= ~bits; - NotifyRasterizer(word_index, current_bits, ~u64{0}); - } - // Exclude CPU modified pages when visiting GPU pages - const u64 word = current_word & ~(type == Type::GPU ? untracked_words[word_index] : 0); - u64 page = page_begin; - page_begin = 0; - - while (page < page_end) { - const int empty_bits = std::countr_zero(word >> page); - if (on_going && empty_bits != 0) { - InvokeModifiedRange(func, current_size, current_base); - current_size = 0; - on_going = false; - } - if (empty_bits == PAGES_PER_WORD) { - break; - } - page += empty_bits; - - const int continuous_bits = std::countr_one(word >> page); - if (!on_going && continuous_bits != 0) { - current_base = word_index * PAGES_PER_WORD + page; - on_going = true; - } - current_size += continuous_bits; - page += continuous_bits; - } - } - if (on_going && current_size > 0) { - InvokeModifiedRange(func, current_size, current_base); - } - } - - template - void InvokeModifiedRange(Func&& func, u64 current_size, u64 current_base) { - const u64 current_size_bytes = current_size * BYTES_PER_PAGE; - const u64 offset_begin = current_base * BYTES_PER_PAGE; - const u64 offset_end = std::min(offset_begin + current_size_bytes, SizeBytes()); - func(offset_begin, offset_end - offset_begin); - } - - /** - * Returns true when a region has been modified - * - * @param offset Offset in bytes from the start of the buffer - * @param size Size in bytes of the region to query for modifications - */ - template - [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept { - static_assert(type != Type::Untracked); - - const u64* const untracked_words = Array(); - const u64* const state_words = Array(); - const u64 num_query_words = size / BYTES_PER_WORD + 1; - const u64 word_begin = offset / BYTES_PER_WORD; - const u64 word_end = std::min(word_begin + num_query_words, NumWords()); - const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE); - u64 page_index = (offset / BYTES_PER_PAGE) % PAGES_PER_WORD; - for (u64 word_index = word_begin; word_index < word_end; ++word_index, page_index = 0) { - const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0; - const u64 word = state_words[word_index] & ~off_word; - if (word == 0) { - continue; - } - const u64 page_end = std::min((word_index + 1) * PAGES_PER_WORD, page_limit); - const u64 local_page_end = page_end % PAGES_PER_WORD; - const u64 page_end_shift = (PAGES_PER_WORD - local_page_end) % PAGES_PER_WORD; - if (((word >> page_index) << page_index) << page_end_shift != 0) { - return true; - } - } - return false; - } - - /** - * Returns a begin end pair with the inclusive modified region - * - * @param offset Offset in bytes from the start of the buffer - * @param size Size in bytes of the region to query for modifications - */ - template - [[nodiscard]] std::pair ModifiedRegion(u64 offset, u64 size) const noexcept { - static_assert(type != Type::Untracked); - - const u64* const untracked_words = Array(); - const u64* const state_words = Array(); - const u64 num_query_words = size / BYTES_PER_WORD + 1; - const u64 word_begin = offset / BYTES_PER_WORD; - const u64 word_end = std::min(word_begin + num_query_words, NumWords()); - const u64 page_base = offset / BYTES_PER_PAGE; - const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE); - u64 begin = std::numeric_limits::max(); - u64 end = 0; - for (u64 word_index = word_begin; word_index < word_end; ++word_index) { - const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0; - const u64 word = state_words[word_index] & ~off_word; - if (word == 0) { - continue; - } - const u64 local_page_begin = std::countr_zero(word); - const u64 local_page_end = PAGES_PER_WORD - std::countl_zero(word); - const u64 page_index = word_index * PAGES_PER_WORD; - const u64 page_begin = std::max(page_index + local_page_begin, page_base); - const u64 page_end = std::min(page_index + local_page_end, page_limit); - begin = std::min(begin, page_begin); - end = std::max(end, page_end); - } - static constexpr std::pair EMPTY{0, 0}; - return begin < end ? std::make_pair(begin * BYTES_PER_PAGE, end * BYTES_PER_PAGE) : EMPTY; - } - - /// Returns the number of words of the buffer - [[nodiscard]] size_t NumWords() const noexcept { - return words.NumWords(); - } - - /// Returns true when the buffer fits in the small vector optimization - [[nodiscard]] bool IsShort() const noexcept { - return words.IsShort(); - } - - RasterizerInterface* rasterizer = nullptr; VAddr cpu_addr = 0; - Words words; + WordManager word_manager; BufferFlagBits flags{}; int stream_score = 0; size_t lru_id = SIZE_MAX; diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp index a16308b60..40db243d2 100644 --- a/src/video_core/buffer_cache/buffer_cache.cpp +++ b/src/video_core/buffer_cache/buffer_cache.cpp @@ -1,5 +1,5 @@ -// SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later +// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later #include "common/microprofile.h" diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index abdc593df..a0701ce4e 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -1,482 +1,21 @@ -// SPDX-FileCopyrightText: Copyright 2019 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later +// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later #pragma once #include -#include #include -#include #include -#include -#include - -#include -#include - -#include "common/common_types.h" -#include "common/div_ceil.h" -#include "common/literals.h" -#include "common/lru_cache.h" -#include "common/microprofile.h" -#include "common/polyfill_ranges.h" -#include "common/scratch_buffer.h" -#include "common/settings.h" -#include "core/memory.h" -#include "video_core/buffer_cache/buffer_base.h" -#include "video_core/control/channel_state_cache.h" -#include "video_core/delayed_destruction_ring.h" -#include "video_core/dirty_flags.h" -#include "video_core/engines/draw_manager.h" -#include "video_core/engines/kepler_compute.h" -#include "video_core/engines/maxwell_3d.h" -#include "video_core/memory_manager.h" -#include "video_core/rasterizer_interface.h" -#include "video_core/surface.h" -#include "video_core/texture_cache/slot_vector.h" -#include "video_core/texture_cache/types.h" -namespace VideoCommon { - -MICROPROFILE_DECLARE(GPU_PrepareBuffers); -MICROPROFILE_DECLARE(GPU_BindUploadBuffers); -MICROPROFILE_DECLARE(GPU_DownloadMemory); - -using BufferId = SlotId; - -using VideoCore::Surface::PixelFormat; -using namespace Common::Literals; - -constexpr u32 NUM_VERTEX_BUFFERS = 32; -constexpr u32 NUM_TRANSFORM_FEEDBACK_BUFFERS = 4; -constexpr u32 NUM_GRAPHICS_UNIFORM_BUFFERS = 18; -constexpr u32 NUM_COMPUTE_UNIFORM_BUFFERS = 8; -constexpr u32 NUM_STORAGE_BUFFERS = 16; -constexpr u32 NUM_TEXTURE_BUFFERS = 16; -constexpr u32 NUM_STAGES = 5; - -enum class ObtainBufferSynchronize : u32 { - NoSynchronize = 0, - FullSynchronize = 1, - SynchronizeNoDirty = 2, -}; - -enum class ObtainBufferOperation : u32 { - DoNothing = 0, - MarkAsWritten = 1, - DiscardWrite = 2, - MarkQuery = 3, -}; - -using UniformBufferSizes = std::array, NUM_STAGES>; -using ComputeUniformBufferSizes = std::array; - -template -class BufferCache : public VideoCommon::ChannelSetupCaches { - - // Page size for caching purposes. - // This is unrelated to the CPU page size and it can be changed as it seems optimal. - static constexpr u32 YUZU_PAGEBITS = 16; - static constexpr u64 YUZU_PAGESIZE = u64{1} << YUZU_PAGEBITS; - - static constexpr bool IS_OPENGL = P::IS_OPENGL; - static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = - P::HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS; - static constexpr bool HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT = - P::HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT; - static constexpr bool NEEDS_BIND_UNIFORM_INDEX = P::NEEDS_BIND_UNIFORM_INDEX; - static constexpr bool NEEDS_BIND_STORAGE_INDEX = P::NEEDS_BIND_STORAGE_INDEX; - static constexpr bool USE_MEMORY_MAPS = P::USE_MEMORY_MAPS; - static constexpr bool SEPARATE_IMAGE_BUFFERS_BINDINGS = P::SEPARATE_IMAGE_BUFFER_BINDINGS; - - static constexpr BufferId NULL_BUFFER_ID{0}; - - static constexpr s64 DEFAULT_EXPECTED_MEMORY = 512_MiB; - static constexpr s64 DEFAULT_CRITICAL_MEMORY = 1_GiB; - static constexpr s64 TARGET_THRESHOLD = 4_GiB; - - using Maxwell = Tegra::Engines::Maxwell3D::Regs; - - using Runtime = typename P::Runtime; - using Buffer = typename P::Buffer; - - using IntervalSet = boost::icl::interval_set; - using IntervalType = typename IntervalSet::interval_type; - - struct Empty {}; - - struct OverlapResult { - std::vector ids; - VAddr begin; - VAddr end; - bool has_stream_leap = false; - }; - - struct Binding { - VAddr cpu_addr{}; - u32 size{}; - BufferId buffer_id; - }; - - struct TextureBufferBinding : Binding { - PixelFormat format; - }; - - static constexpr Binding NULL_BINDING{ - .cpu_addr = 0, - .size = 0, - .buffer_id = NULL_BUFFER_ID, - }; - -public: - static constexpr u32 DEFAULT_SKIP_CACHE_SIZE = static_cast(4_KiB); - - explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_, - Core::Memory::Memory& cpu_memory_, Runtime& runtime_); - - void TickFrame(); - - void WriteMemory(VAddr cpu_addr, u64 size); - - void CachedWriteMemory(VAddr cpu_addr, u64 size); - - void DownloadMemory(VAddr cpu_addr, u64 size); - - bool InlineMemory(VAddr dest_address, size_t copy_size, std::span inlined_buffer); - - void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size); - - void DisableGraphicsUniformBuffer(size_t stage, u32 index); - - void UpdateGraphicsBuffers(bool is_indexed); - - void UpdateComputeBuffers(); - - void BindHostGeometryBuffers(bool is_indexed); - - void BindHostStageBuffers(size_t stage); - - void BindHostComputeBuffers(); - - void SetUniformBuffersState(const std::array& mask, - const UniformBufferSizes* sizes); - - void SetComputeUniformBufferState(u32 mask, const ComputeUniformBufferSizes* sizes); - - void UnbindGraphicsStorageBuffers(size_t stage); - - void BindGraphicsStorageBuffer(size_t stage, size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset, - bool is_written); - - void UnbindGraphicsTextureBuffers(size_t stage); - - void BindGraphicsTextureBuffer(size_t stage, size_t tbo_index, GPUVAddr gpu_addr, u32 size, - PixelFormat format, bool is_written, bool is_image); - - void UnbindComputeStorageBuffers(); - - void BindComputeStorageBuffer(size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset, - bool is_written); - - void UnbindComputeTextureBuffers(); - - void BindComputeTextureBuffer(size_t tbo_index, GPUVAddr gpu_addr, u32 size, PixelFormat format, - bool is_written, bool is_image); - - void FlushCachedWrites(); - - /// Return true when there are uncommitted buffers to be downloaded - [[nodiscard]] bool HasUncommittedFlushes() const noexcept; - - void AccumulateFlushes(); - - /// Return true when the caller should wait for async downloads - [[nodiscard]] bool ShouldWaitAsyncFlushes() const noexcept; - - /// Commit asynchronous downloads - void CommitAsyncFlushes(); - void CommitAsyncFlushesHigh(); - - /// Pop asynchronous downloads - void PopAsyncFlushes(); - - bool DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount); - - bool DMAClear(GPUVAddr src_address, u64 amount, u32 value); - - [[nodiscard]] std::pair ObtainBuffer(GPUVAddr gpu_addr, u32 size, - ObtainBufferSynchronize sync_info, - ObtainBufferOperation post_op); - - /// Return true when a CPU region is modified from the GPU - [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size); - - /// Return true when a region is registered on the cache - [[nodiscard]] bool IsRegionRegistered(VAddr addr, size_t size); - - /// Return true when a CPU region is modified from the CPU - [[nodiscard]] bool IsRegionCpuModified(VAddr addr, size_t size); - - void SetDrawIndirect( - const Tegra::Engines::DrawManager::IndirectParams* current_draw_indirect_) { - current_draw_indirect = current_draw_indirect_; - } - - [[nodiscard]] std::pair GetDrawIndirectCount(); - - [[nodiscard]] std::pair GetDrawIndirectBuffer(); - - std::recursive_mutex mutex; - Runtime& runtime; - -private: - template - static void ForEachEnabledBit(u32 enabled_mask, Func&& func) { - for (u32 index = 0; enabled_mask != 0; ++index, enabled_mask >>= 1) { - const int disabled_bits = std::countr_zero(enabled_mask); - index += disabled_bits; - enabled_mask >>= disabled_bits; - func(index); - } - } - - template - void ForEachBufferInRange(VAddr cpu_addr, u64 size, Func&& func) { - const u64 page_end = Common::DivCeil(cpu_addr + size, YUZU_PAGESIZE); - for (u64 page = cpu_addr >> YUZU_PAGEBITS; page < page_end;) { - const BufferId buffer_id = page_table[page]; - if (!buffer_id) { - ++page; - continue; - } - Buffer& buffer = slot_buffers[buffer_id]; - func(buffer_id, buffer); - - const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes(); - page = Common::DivCeil(end_addr, YUZU_PAGESIZE); - } - } - - template - void ForEachWrittenRange(VAddr cpu_addr, u64 size, Func&& func) { - const VAddr start_address = cpu_addr; - const VAddr end_address = start_address + size; - const VAddr search_base = - static_cast(std::min(0LL, static_cast(start_address - size))); - const IntervalType search_interval{search_base, search_base + 1}; - auto it = common_ranges.lower_bound(search_interval); - if (it == common_ranges.end()) { - it = common_ranges.begin(); - } - for (; it != common_ranges.end(); it++) { - VAddr inter_addr_end = it->upper(); - VAddr inter_addr = it->lower(); - if (inter_addr >= end_address) { - break; - } - if (inter_addr_end <= start_address) { - continue; - } - if (inter_addr_end > end_address) { - inter_addr_end = end_address; - } - if (inter_addr < start_address) { - inter_addr = start_address; - } - func(inter_addr, inter_addr_end); - } - } - - static bool IsRangeGranular(VAddr cpu_addr, size_t size) { - return (cpu_addr & ~Core::Memory::YUZU_PAGEMASK) == - ((cpu_addr + size) & ~Core::Memory::YUZU_PAGEMASK); - } - - void RunGarbageCollector(); - - void BindHostIndexBuffer(); - - void BindHostVertexBuffers(); - - void BindHostDrawIndirectBuffers(); - - void BindHostGraphicsUniformBuffers(size_t stage); - - void BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 binding_index, bool needs_bind); - - void BindHostGraphicsStorageBuffers(size_t stage); - - void BindHostGraphicsTextureBuffers(size_t stage); - - void BindHostTransformFeedbackBuffers(); - - void BindHostComputeUniformBuffers(); - - void BindHostComputeStorageBuffers(); - - void BindHostComputeTextureBuffers(); - - void DoUpdateGraphicsBuffers(bool is_indexed); - - void DoUpdateComputeBuffers(); - - void UpdateIndexBuffer(); - - void UpdateVertexBuffers(); - - void UpdateVertexBuffer(u32 index); - - void UpdateDrawIndirect(); - - void UpdateUniformBuffers(size_t stage); - - void UpdateStorageBuffers(size_t stage); - - void UpdateTextureBuffers(size_t stage); - - void UpdateTransformFeedbackBuffers(); - - void UpdateTransformFeedbackBuffer(u32 index); - - void UpdateComputeUniformBuffers(); - - void UpdateComputeStorageBuffers(); - - void UpdateComputeTextureBuffers(); - - void MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size); - - [[nodiscard]] BufferId FindBuffer(VAddr cpu_addr, u32 size); - - [[nodiscard]] OverlapResult ResolveOverlaps(VAddr cpu_addr, u32 wanted_size); - - void JoinOverlap(BufferId new_buffer_id, BufferId overlap_id, bool accumulate_stream_score); - - [[nodiscard]] BufferId CreateBuffer(VAddr cpu_addr, u32 wanted_size); - - void Register(BufferId buffer_id); - - void Unregister(BufferId buffer_id); - - template - void ChangeRegister(BufferId buffer_id); - - void TouchBuffer(Buffer& buffer, BufferId buffer_id) noexcept; - - bool SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size); - - bool SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size); - - void UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy, - std::span copies); - - void ImmediateUploadMemory(Buffer& buffer, u64 largest_copy, - std::span copies); - - void MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, std::span copies); - - void DownloadBufferMemory(Buffer& buffer_id); - - void DownloadBufferMemory(Buffer& buffer_id, VAddr cpu_addr, u64 size); - - void DeleteBuffer(BufferId buffer_id); - - void NotifyBufferDeletion(); - - [[nodiscard]] Binding StorageBufferBinding(GPUVAddr ssbo_addr, u32 cbuf_index, - bool is_written = false) const; - - [[nodiscard]] TextureBufferBinding GetTextureBufferBinding(GPUVAddr gpu_addr, u32 size, - PixelFormat format); - - [[nodiscard]] std::span ImmediateBufferWithData(VAddr cpu_addr, size_t size); - - [[nodiscard]] std::span ImmediateBuffer(size_t wanted_capacity); - - [[nodiscard]] bool HasFastUniformBufferBound(size_t stage, u32 binding_index) const noexcept; - - void ClearDownload(IntervalType subtract_interval); - - VideoCore::RasterizerInterface& rasterizer; - Core::Memory::Memory& cpu_memory; - - SlotVector slot_buffers; - DelayedDestructionRing delayed_destruction_ring; - - const Tegra::Engines::DrawManager::IndirectParams* current_draw_indirect{}; - - u32 last_index_count = 0; - - Binding index_buffer; - std::array vertex_buffers; - std::array, NUM_STAGES> uniform_buffers; - std::array, NUM_STAGES> storage_buffers; - std::array, NUM_STAGES> texture_buffers; - std::array transform_feedback_buffers; - Binding count_buffer_binding; - Binding indirect_buffer_binding; - - std::array compute_uniform_buffers; - std::array compute_storage_buffers; - std::array compute_texture_buffers; - - std::array enabled_uniform_buffer_masks{}; - u32 enabled_compute_uniform_buffer_mask = 0; - - const UniformBufferSizes* uniform_buffer_sizes{}; - const ComputeUniformBufferSizes* compute_uniform_buffer_sizes{}; - - std::array enabled_storage_buffers{}; - std::array written_storage_buffers{}; - u32 enabled_compute_storage_buffers = 0; - u32 written_compute_storage_buffers = 0; - - std::array enabled_texture_buffers{}; - std::array written_texture_buffers{}; - std::array image_texture_buffers{}; - u32 enabled_compute_texture_buffers = 0; - u32 written_compute_texture_buffers = 0; - u32 image_compute_texture_buffers = 0; - - std::array uniform_cache_hits{}; - std::array uniform_cache_shots{}; - - u32 uniform_buffer_skip_cache_size = DEFAULT_SKIP_CACHE_SIZE; - - bool has_deleted_buffers = false; - - std::conditional_t, Empty> - dirty_uniform_buffers{}; - std::conditional_t, Empty> fast_bound_uniform_buffers{}; - std::conditional_t, NUM_STAGES>, Empty> - uniform_buffer_binding_sizes{}; - - std::vector cached_write_buffer_ids; - - IntervalSet uncommitted_ranges; - IntervalSet common_ranges; - std::deque committed_ranges; - - Common::ScratchBuffer immediate_buffer_alloc; - - struct LRUItemParams { - using ObjectType = BufferId; - using TickType = u64; - }; - Common::LeastRecentlyUsedCache lru_cache; - u64 frame_tick = 0; - u64 total_used_memory = 0; - u64 minimum_memory = 0; - u64 critical_memory = 0; +#include "video_core/buffer_cache/buffer_cache_base.h" - std::array> YUZU_PAGEBITS)> page_table; -}; +namespace VideoCommon { template BufferCache

::BufferCache(VideoCore::RasterizerInterface& rasterizer_, Core::Memory::Memory& cpu_memory_, Runtime& runtime_) - : runtime{runtime_}, rasterizer{rasterizer_}, cpu_memory{cpu_memory_} { + : runtime{runtime_}, rasterizer{rasterizer_}, cpu_memory{cpu_memory_}, memory_tracker{ + rasterizer} { // Ensure the first slot is used for the null buffer void(slot_buffers.insert(runtime, NullBufferParams{})); common_ranges.clear(); @@ -547,19 +86,18 @@ void BufferCache

::TickFrame() { template void BufferCache

::WriteMemory(VAddr cpu_addr, u64 size) { - ForEachBufferInRange(cpu_addr, size, [&](BufferId, Buffer& buffer) { - buffer.MarkRegionAsCpuModified(cpu_addr, size); - }); + memory_tracker.MarkRegionAsCpuModified(cpu_addr, size); + const IntervalType subtract_interval{cpu_addr, cpu_addr + size}; + ClearDownload(subtract_interval); + common_ranges.subtract(subtract_interval); } template void BufferCache

::CachedWriteMemory(VAddr cpu_addr, u64 size) { - ForEachBufferInRange(cpu_addr, size, [&](BufferId buffer_id, Buffer& buffer) { - if (!buffer.HasCachedWrites()) { - cached_write_buffer_ids.push_back(buffer_id); - } - buffer.CachedCpuWrite(cpu_addr, size); - }); + memory_tracker.CachedCpuWrite(cpu_addr, size); + const IntervalType add_interval{Common::AlignDown(cpu_addr, YUZU_PAGESIZE), + Common::AlignUp(cpu_addr + size, YUZU_PAGESIZE)}; + cached_ranges.add(add_interval); } template @@ -572,6 +110,9 @@ void BufferCache

::DownloadMemory(VAddr cpu_addr, u64 size) { template void BufferCache

::ClearDownload(IntervalType subtract_interval) { uncommitted_ranges.subtract(subtract_interval); + for (auto& interval_set : async_downloads) { + interval_set.subtract(subtract_interval); + } for (auto& interval_set : committed_ranges) { interval_set.subtract(subtract_interval); } @@ -611,15 +152,19 @@ bool BufferCache

::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am }}; boost::container::small_vector tmp_intervals; + const bool is_high_accuracy = + Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::High; auto mirror = [&](VAddr base_address, VAddr base_address_end) { const u64 size = base_address_end - base_address; const VAddr diff = base_address - *cpu_src_address; const VAddr new_base_address = *cpu_dest_address + diff; const IntervalType add_interval{new_base_address, new_base_address + size}; - uncommitted_ranges.add(add_interval); tmp_intervals.push_back(add_interval); + if (is_high_accuracy) { + uncommitted_ranges.add(add_interval); + } }; - ForEachWrittenRange(*cpu_src_address, amount, mirror); + ForEachInRangeSet(common_ranges, *cpu_src_address, amount, mirror); // This subtraction in this order is important for overlapping copies. common_ranges.subtract(subtract_interval); const bool has_new_downloads = tmp_intervals.size() != 0; @@ -628,7 +173,7 @@ bool BufferCache

::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am } runtime.CopyBuffer(dest_buffer, src_buffer, copies); if (has_new_downloads) { - dest_buffer.MarkRegionAsGpuModified(*cpu_dest_address, amount); + memory_tracker.MarkRegionAsGpuModified(*cpu_dest_address, amount); } std::vector tmp_buffer(amount); cpu_memory.ReadBlockUnsafe(*cpu_src_address, tmp_buffer.data(), amount); @@ -866,23 +411,24 @@ void BufferCache

::BindComputeTextureBuffer(size_t tbo_index, GPUVAddr gpu_add template void BufferCache

::FlushCachedWrites() { - for (const BufferId buffer_id : cached_write_buffer_ids) { - slot_buffers[buffer_id].FlushCachedWrites(); - } cached_write_buffer_ids.clear(); + memory_tracker.FlushCachedWrites(); + /*for (auto& interval : cached_ranges) { + VAddr cpu_addr = interval.lower(); + const std::size_t size = interval.upper() - interval.lower(); + memory_tracker.FlushCachedWrites(cpu_addr, size); + // common_ranges.subtract(interval); + }*/ + cached_ranges.clear(); } template bool BufferCache

::HasUncommittedFlushes() const noexcept { - return !uncommitted_ranges.empty() || !committed_ranges.empty(); + return !uncommitted_ranges.empty() || !committed_ranges.empty() || !pending_queries.empty(); } template void BufferCache

::AccumulateFlushes() { - if (Settings::values.gpu_accuracy.GetValue() != Settings::GPUAccuracy::High) { - uncommitted_ranges.clear(); - return; - } if (uncommitted_ranges.empty()) { return; } @@ -891,7 +437,8 @@ void BufferCache

::AccumulateFlushes() { template bool BufferCache

::ShouldWaitAsyncFlushes() const noexcept { - return false; + return (!async_buffers.empty() && async_buffers.front().has_value()) || + (!query_async_buffers.empty() && query_async_buffers.front().has_value()); } template @@ -899,11 +446,10 @@ void BufferCache

::CommitAsyncFlushesHigh() { AccumulateFlushes(); if (committed_ranges.empty()) { + async_buffers.emplace_back(std::optional{}); return; } MICROPROFILE_SCOPE(GPU_DownloadMemory); - const bool is_accuracy_normal = - Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::Normal; auto it = committed_ranges.begin(); while (it != committed_ranges.end()) { @@ -926,11 +472,12 @@ void BufferCache

::CommitAsyncFlushesHigh() { const std::size_t size = interval.upper() - interval.lower(); const VAddr cpu_addr = interval.lower(); ForEachBufferInRange(cpu_addr, size, [&](BufferId buffer_id, Buffer& buffer) { - buffer.ForEachDownloadRangeAndClear( - cpu_addr, size, [&](u64 range_offset, u64 range_size) { - if (is_accuracy_normal) { - return; - } + const VAddr buffer_start = buffer.CpuAddr(); + const VAddr buffer_end = buffer_start + buffer.SizeBytes(); + const VAddr new_start = std::max(buffer_start, cpu_addr); + const VAddr new_end = std::min(buffer_end, cpu_addr + size); + memory_tracker.ForEachDownloadRange( + new_start, new_end - new_start, false, [&](u64 cpu_addr_out, u64 range_size) { const VAddr buffer_addr = buffer.CpuAddr(); const auto add_download = [&](VAddr start, VAddr end) { const u64 new_offset = start - buffer_addr; @@ -950,38 +497,36 @@ void BufferCache

::CommitAsyncFlushesHigh() { largest_copy = std::max(largest_copy, new_size); }; - const VAddr start_address = buffer_addr + range_offset; - const VAddr end_address = start_address + range_size; - ForEachWrittenRange(start_address, range_size, add_download); - const IntervalType subtract_interval{start_address, end_address}; - common_ranges.subtract(subtract_interval); + ForEachInRangeSet(common_ranges, cpu_addr_out, range_size, add_download); }); }); } } committed_ranges.clear(); if (downloads.empty()) { + async_buffers.emplace_back(std::optional{}); return; } - if constexpr (USE_MEMORY_MAPS) { - auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes); + if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { + auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes, true); + boost::container::small_vector normalized_copies; + IntervalSet new_async_range{}; runtime.PreCopyBarrier(); for (auto& [copy, buffer_id] : downloads) { - // Have in mind the staging buffer offset for the copy copy.dst_offset += download_staging.offset; const std::array copies{copy}; - runtime.CopyBuffer(download_staging.buffer, slot_buffers[buffer_id], copies, false); - } - runtime.PostCopyBarrier(); - runtime.Finish(); - for (const auto& [copy, buffer_id] : downloads) { - const Buffer& buffer = slot_buffers[buffer_id]; - const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset; - // Undo the modified offset - const u64 dst_offset = copy.dst_offset - download_staging.offset; - const u8* read_mapped_memory = download_staging.mapped_span.data() + dst_offset; - cpu_memory.WriteBlockUnsafe(cpu_addr, read_mapped_memory, copy.size); + BufferCopy second_copy{copy}; + Buffer& buffer = slot_buffers[buffer_id]; + second_copy.src_offset = static_cast(buffer.CpuAddr()) + copy.src_offset; + VAddr orig_cpu_addr = static_cast(second_copy.src_offset); + const IntervalType base_interval{orig_cpu_addr, orig_cpu_addr + copy.size}; + new_async_range.add(base_interval); + runtime.CopyBuffer(download_staging.buffer, buffer, copies, false); + normalized_copies.push_back(second_copy); } + async_downloads.emplace_back(std::move(new_async_range)); + pending_downloads.emplace_back(std::move(normalized_copies)); + async_buffers.emplace_back(download_staging); } else { const std::span immediate_buffer = ImmediateBuffer(largest_copy); for (const auto& [copy, buffer_id] : downloads) { @@ -994,42 +539,154 @@ void BufferCache

::CommitAsyncFlushesHigh() { } template -void BufferCache

::CommitAsyncFlushes() { - if (Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::High) { - CommitAsyncFlushesHigh(); +void BufferCache

::CommitAsyncQueries() { + if (pending_queries.empty()) { + query_async_buffers.emplace_back(std::optional{}); + return; + } + + MICROPROFILE_SCOPE(GPU_DownloadMemory); + boost::container::small_vector, 8> downloads; + u64 total_size_bytes = 0; + u64 largest_copy = 0; + do { + has_deleted_buffers = false; + downloads.clear(); + total_size_bytes = 0; + largest_copy = 0; + for (const auto& query_info : pending_queries) { + const std::size_t size = query_info.second; + const VAddr cpu_addr = query_info.first; + const BufferId buffer_id = FindBuffer(cpu_addr, static_cast(size)); + Buffer& buffer = slot_buffers[buffer_id]; + if (has_deleted_buffers) { + break; + } + downloads.push_back({ + BufferCopy{ + .src_offset = buffer.Offset(cpu_addr), + .dst_offset = total_size_bytes, + .size = size, + }, + buffer_id, + }); + constexpr u64 align = 8ULL; + constexpr u64 mask = ~(align - 1ULL); + total_size_bytes += (size + align - 1) & mask; + largest_copy = std::max(largest_copy, size); + } + } while (has_deleted_buffers); + pending_queries.clear(); + if (downloads.empty()) { + query_async_buffers.push_back(std::optional{}); + return; + } + if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { + auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes, true); + boost::container::small_vector normalized_copies; + runtime.PreCopyBarrier(); + for (auto& [copy, buffer_id] : downloads) { + // Have in mind the staging buffer offset for the copy + copy.dst_offset += download_staging.offset; + const std::array copies{copy}; + const Buffer& buffer = slot_buffers[buffer_id]; + BufferCopy second_copy{copy}; + second_copy.src_offset = static_cast(buffer.CpuAddr()) + second_copy.src_offset; + runtime.CopyBuffer(download_staging.buffer, buffer, copies, false); + normalized_copies.push_back(second_copy); + } + committed_queries.emplace_back(std::move(normalized_copies)); + query_async_buffers.emplace_back(download_staging); } else { - uncommitted_ranges.clear(); - committed_ranges.clear(); + query_async_buffers.push_back(std::optional{}); } } template -void BufferCache

::PopAsyncFlushes() {} +void BufferCache

::CommitAsyncFlushes() { + CommitAsyncFlushesHigh(); + CommitAsyncQueries(); +} template -bool BufferCache

::IsRegionGpuModified(VAddr addr, size_t size) { - const u64 page_end = Common::DivCeil(addr + size, YUZU_PAGESIZE); - for (u64 page = addr >> YUZU_PAGEBITS; page < page_end;) { - const BufferId image_id = page_table[page]; - if (!image_id) { - ++page; - continue; +void BufferCache

::PopAsyncFlushes() { + MICROPROFILE_SCOPE(GPU_DownloadMemory); + PopAsyncBuffers(); + PopAsyncQueries(); +} + +template +void BufferCache

::PopAsyncBuffers() { + if (async_buffers.empty()) { + return; + } + if (!async_buffers.front().has_value()) { + async_buffers.pop_front(); + return; + } + if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { + auto& downloads = pending_downloads.front(); + auto& async_buffer = async_buffers.front(); + auto& async_range = async_downloads.front(); + u8* base = async_buffer->mapped_span.data(); + const size_t base_offset = async_buffer->offset; + for (const auto& copy : downloads) { + const VAddr cpu_addr = static_cast(copy.src_offset); + const u64 dst_offset = copy.dst_offset - base_offset; + const u8* read_mapped_memory = base + dst_offset; + ForEachInRangeSet(async_range, cpu_addr, copy.size, [&](VAddr start, VAddr end) { + const size_t diff = start - cpu_addr; + const size_t new_size = end - start; + cpu_memory.WriteBlockUnsafe(start, &read_mapped_memory[diff], new_size); + const IntervalType base_interval{start, end}; + common_ranges.subtract(base_interval); + }); } - Buffer& buffer = slot_buffers[image_id]; - if (buffer.IsRegionGpuModified(addr, size)) { - return true; + runtime.FreeDeferredStagingBuffer(*async_buffer); + async_buffers.pop_front(); + pending_downloads.pop_front(); + async_downloads.pop_front(); + } +} + +template +void BufferCache

::PopAsyncQueries() { + if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { + if (query_async_buffers.empty()) { + return; + } + if (!query_async_buffers.front().has_value()) { + query_async_buffers.pop_front(); + return; } - const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes(); - page = Common::DivCeil(end_addr, YUZU_PAGESIZE); + auto& downloads = committed_queries.front(); + auto& async_buffer = query_async_buffers.front(); + flushed_queries.clear(); + u8* base = async_buffer->mapped_span.data(); + const size_t base_offset = async_buffer->offset; + for (const auto& copy : downloads) { + const size_t dst_offset = copy.dst_offset - base_offset; + const u8* read_mapped_memory = base + dst_offset; + u64 new_value{}; + std::memcpy(&new_value, read_mapped_memory, copy.size); + flushed_queries.push_back(new_value); + } + runtime.FreeDeferredStagingBuffer(*async_buffer); + committed_queries.pop_front(); + query_async_buffers.pop_front(); } - return false; +} + +template +bool BufferCache

::IsRegionGpuModified(VAddr addr, size_t size) { + return memory_tracker.IsRegionGpuModified(addr, size); } template bool BufferCache

::IsRegionRegistered(VAddr addr, size_t size) { const VAddr end_addr = addr + size; - const u64 page_end = Common::DivCeil(end_addr, YUZU_PAGESIZE); - for (u64 page = addr >> YUZU_PAGEBITS; page < page_end;) { + const u64 page_end = Common::DivCeil(end_addr, PAGE_SIZE); + for (u64 page = addr >> PAGE_BITS; page < page_end;) { const BufferId buffer_id = page_table[page]; if (!buffer_id) { ++page; @@ -1041,28 +698,14 @@ bool BufferCache

::IsRegionRegistered(VAddr addr, size_t size) { if (buf_start_addr < end_addr && addr < buf_end_addr) { return true; } - page = Common::DivCeil(end_addr, YUZU_PAGESIZE); + page = Common::DivCeil(end_addr, PAGE_SIZE); } return false; } template bool BufferCache

::IsRegionCpuModified(VAddr addr, size_t size) { - const u64 page_end = Common::DivCeil(addr + size, YUZU_PAGESIZE); - for (u64 page = addr >> YUZU_PAGEBITS; page < page_end;) { - const BufferId image_id = page_table[page]; - if (!image_id) { - ++page; - continue; - } - Buffer& buffer = slot_buffers[image_id]; - if (buffer.IsRegionCpuModified(addr, size)) { - return true; - } - const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes(); - page = Common::DivCeil(end_addr, YUZU_PAGESIZE); - } - return false; + return memory_tracker.IsRegionCpuModified(addr, size); } template @@ -1155,7 +798,7 @@ void BufferCache

::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 TouchBuffer(buffer, binding.buffer_id); const bool use_fast_buffer = binding.buffer_id != NULL_BUFFER_ID && size <= uniform_buffer_skip_cache_size && - !buffer.IsRegionGpuModified(cpu_addr, size); + !memory_tracker.IsRegionGpuModified(cpu_addr, size); if (use_fast_buffer) { if constexpr (IS_OPENGL) { if (runtime.HasFastBufferSubData()) { @@ -1378,27 +1021,28 @@ void BufferCache

::UpdateIndexBuffer() { // We have to check for the dirty flags and index count // The index count is currently changed without updating the dirty flags const auto& draw_state = maxwell3d->draw_manager->GetDrawState(); - const auto& index_array = draw_state.index_buffer; + const auto& index_buffer_ref = draw_state.index_buffer; auto& flags = maxwell3d->dirty.flags; if (!flags[Dirty::IndexBuffer]) { return; } flags[Dirty::IndexBuffer] = false; - last_index_count = index_array.count; if (!draw_state.inline_index_draw_indexes.empty()) { auto inline_index_size = static_cast(draw_state.inline_index_draw_indexes.size()); index_buffer = Binding{ .cpu_addr = 0, .size = inline_index_size, - .buffer_id = CreateBuffer(0, inline_index_size), + .buffer_id = FindBuffer(0, inline_index_size), }; return; } - const GPUVAddr gpu_addr_begin = index_array.StartAddress(); - const GPUVAddr gpu_addr_end = index_array.EndAddress(); + + const GPUVAddr gpu_addr_begin = index_buffer_ref.StartAddress(); + const GPUVAddr gpu_addr_end = index_buffer_ref.EndAddress(); const std::optional cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr_begin); const u32 address_size = static_cast(gpu_addr_end - gpu_addr_begin); - const u32 draw_size = (index_array.count + index_array.first) * index_array.FormatSizeInBytes(); + const u32 draw_size = + (index_buffer_ref.count + index_buffer_ref.first) * index_buffer_ref.FormatSizeInBytes(); const u32 size = std::min(address_size, draw_size); if (size == 0 || !cpu_addr) { index_buffer = NULL_BINDING; @@ -1434,17 +1078,15 @@ void BufferCache

::UpdateVertexBuffer(u32 index) { const GPUVAddr gpu_addr_begin = array.Address(); const GPUVAddr gpu_addr_end = limit.Address() + 1; const std::optional cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr_begin); - u32 address_size = static_cast( - std::min(gpu_addr_end - gpu_addr_begin, static_cast(std::numeric_limits::max()))); - if (array.enable == 0 || address_size == 0 || !cpu_addr) { + const u32 address_size = static_cast(gpu_addr_end - gpu_addr_begin); + u32 size = address_size; // TODO: Analyze stride and number of vertices + if (array.enable == 0 || size == 0 || !cpu_addr) { vertex_buffers[index] = NULL_BINDING; return; } if (!gpu_memory->IsWithinGPUAddressRange(gpu_addr_end)) { - address_size = - static_cast(gpu_memory->MaxContinuousRange(gpu_addr_begin, address_size)); + size = static_cast(gpu_memory->MaxContinuousRange(gpu_addr_begin, size)); } - const u32 size = address_size; // TODO: Analyze stride and number of vertices vertex_buffers[index] = Binding{ .cpu_addr = *cpu_addr, .size = size, @@ -1590,18 +1232,17 @@ void BufferCache

::UpdateComputeTextureBuffers() { } template -void BufferCache

::MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size) { - Buffer& buffer = slot_buffers[buffer_id]; - buffer.MarkRegionAsGpuModified(cpu_addr, size); +void BufferCache

::MarkWrittenBuffer(BufferId, VAddr cpu_addr, u32 size) { + memory_tracker.MarkRegionAsGpuModified(cpu_addr, size); const IntervalType base_interval{cpu_addr, cpu_addr + size}; common_ranges.add(base_interval); - - const bool is_async = Settings::values.use_asynchronous_gpu_emulation.GetValue(); - if (!is_async) { - return; + for (auto& interval_set : async_downloads) { + interval_set.subtract(base_interval); + } + if (Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::High) { + uncommitted_ranges.add(base_interval); } - uncommitted_ranges.add(base_interval); } template @@ -1609,7 +1250,7 @@ BufferId BufferCache

::FindBuffer(VAddr cpu_addr, u32 size) { if (cpu_addr == 0) { return NULL_BUFFER_ID; } - const u64 page = cpu_addr >> YUZU_PAGEBITS; + const u64 page = cpu_addr >> PAGE_BITS; const BufferId buffer_id = page_table[page]; if (!buffer_id) { return CreateBuffer(cpu_addr, size); @@ -1638,9 +1279,8 @@ typename BufferCache

::OverlapResult BufferCache

::ResolveOverlaps(VAddr cpu .has_stream_leap = has_stream_leap, }; } - for (; cpu_addr >> YUZU_PAGEBITS < Common::DivCeil(end, YUZU_PAGESIZE); - cpu_addr += YUZU_PAGESIZE) { - const BufferId overlap_id = page_table[cpu_addr >> YUZU_PAGEBITS]; + for (; cpu_addr >> PAGE_BITS < Common::DivCeil(end, PAGE_SIZE); cpu_addr += PAGE_SIZE) { + const BufferId overlap_id = page_table[cpu_addr >> PAGE_BITS]; if (!overlap_id) { continue; } @@ -1666,11 +1306,11 @@ typename BufferCache

::OverlapResult BufferCache

::ResolveOverlaps(VAddr cpu // as a stream buffer. Increase the size to skip constantly recreating buffers. has_stream_leap = true; if (expands_right) { - begin -= YUZU_PAGESIZE * 256; + begin -= PAGE_SIZE * 256; cpu_addr = begin; } if (expands_left) { - end += YUZU_PAGESIZE * 256; + end += PAGE_SIZE * 256; } } } @@ -1690,21 +1330,15 @@ void BufferCache

::JoinOverlap(BufferId new_buffer_id, BufferId overlap_id, if (accumulate_stream_score) { new_buffer.IncreaseStreamScore(overlap.StreamScore() + 1); } - std::vector copies; + boost::container::small_vector copies; const size_t dst_base_offset = overlap.CpuAddr() - new_buffer.CpuAddr(); - overlap.ForEachDownloadRange([&](u64 begin, u64 range_size) { - copies.push_back(BufferCopy{ - .src_offset = begin, - .dst_offset = dst_base_offset + begin, - .size = range_size, - }); - new_buffer.UnmarkRegionAsCpuModified(begin, range_size); - new_buffer.MarkRegionAsGpuModified(begin, range_size); + copies.push_back(BufferCopy{ + .src_offset = 0, + .dst_offset = dst_base_offset, + .size = overlap.SizeBytes(), }); - if (!copies.empty()) { - runtime.CopyBuffer(slot_buffers[new_buffer_id], overlap, copies); - } - DeleteBuffer(overlap_id); + runtime.CopyBuffer(new_buffer, overlap, copies); + DeleteBuffer(overlap_id, true); } template @@ -1718,7 +1352,7 @@ BufferId BufferCache

::CreateBuffer(VAddr cpu_addr, u32 wanted_size) { JoinOverlap(new_buffer_id, overlap_id, !overlap.has_stream_leap); } Register(new_buffer_id); - TouchBuffer(slot_buffers[new_buffer_id], new_buffer_id); + TouchBuffer(new_buffer, new_buffer_id); return new_buffer_id; } @@ -1746,8 +1380,8 @@ void BufferCache

::ChangeRegister(BufferId buffer_id) { } const VAddr cpu_addr_begin = buffer.CpuAddr(); const VAddr cpu_addr_end = cpu_addr_begin + size; - const u64 page_begin = cpu_addr_begin / YUZU_PAGESIZE; - const u64 page_end = Common::DivCeil(cpu_addr_end, YUZU_PAGESIZE); + const u64 page_begin = cpu_addr_begin / PAGE_SIZE; + const u64 page_end = Common::DivCeil(cpu_addr_end, PAGE_SIZE); for (u64 page = page_begin; page != page_end; ++page) { if constexpr (insert) { page_table[page] = buffer_id; @@ -1766,9 +1400,6 @@ void BufferCache

::TouchBuffer(Buffer& buffer, BufferId buffer_id) noexcept { template bool BufferCache

::SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size) { - if (buffer.CpuAddr() == 0) { - return true; - } return SynchronizeBufferImpl(buffer, cpu_addr, size); } @@ -1777,10 +1408,11 @@ bool BufferCache

::SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 s boost::container::small_vector copies; u64 total_size_bytes = 0; u64 largest_copy = 0; - buffer.ForEachUploadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) { + VAddr buffer_start = buffer.CpuAddr(); + memory_tracker.ForEachUploadRange(cpu_addr, size, [&](u64 cpu_addr_out, u64 range_size) { copies.push_back(BufferCopy{ .src_offset = total_size_bytes, - .dst_offset = range_offset, + .dst_offset = cpu_addr_out - buffer_start, .size = range_size, }); total_size_bytes += range_size; @@ -1794,6 +1426,51 @@ bool BufferCache

::SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 s return false; } +template +bool BufferCache

::SynchronizeBufferNoModified(Buffer& buffer, VAddr cpu_addr, u32 size) { + boost::container::small_vector copies; + u64 total_size_bytes = 0; + u64 largest_copy = 0; + IntervalSet found_sets{}; + auto make_copies = [&] { + for (auto& interval : found_sets) { + const std::size_t sub_size = interval.upper() - interval.lower(); + const VAddr cpu_addr = interval.lower(); + copies.push_back(BufferCopy{ + .src_offset = total_size_bytes, + .dst_offset = cpu_addr - buffer.CpuAddr(), + .size = sub_size, + }); + total_size_bytes += sub_size; + largest_copy = std::max(largest_copy, sub_size); + } + const std::span copies_span(copies.data(), copies.size()); + UploadMemory(buffer, total_size_bytes, largest_copy, copies_span); + }; + memory_tracker.ForEachUploadRange(cpu_addr, size, [&](u64 cpu_addr_out, u64 range_size) { + const VAddr base_adr = cpu_addr_out; + const VAddr end_adr = base_adr + range_size; + const IntervalType add_interval{base_adr, end_adr}; + found_sets.add(add_interval); + }); + if (found_sets.empty()) { + return true; + } + const IntervalType search_interval{cpu_addr, cpu_addr + size}; + auto it = common_ranges.lower_bound(search_interval); + auto it_end = common_ranges.upper_bound(search_interval); + if (it == common_ranges.end()) { + make_copies(); + return false; + } + while (it != it_end) { + found_sets.subtract(*it); + it++; + } + make_copies(); + return false; +} + template void BufferCache

::UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy, std::span copies) { @@ -1805,39 +1482,45 @@ void BufferCache

::UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 larg } template -void BufferCache

::ImmediateUploadMemory(Buffer& buffer, u64 largest_copy, - std::span copies) { - std::span immediate_buffer; - for (const BufferCopy& copy : copies) { - std::span upload_span; - const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset; - if (IsRangeGranular(cpu_addr, copy.size)) { - upload_span = std::span(cpu_memory.GetPointer(cpu_addr), copy.size); - } else { - if (immediate_buffer.empty()) { - immediate_buffer = ImmediateBuffer(largest_copy); +void BufferCache

::ImmediateUploadMemory([[maybe_unused]] Buffer& buffer, + [[maybe_unused]] u64 largest_copy, + [[maybe_unused]] std::span copies) { + if constexpr (!USE_MEMORY_MAPS) { + std::span immediate_buffer; + for (const BufferCopy& copy : copies) { + std::span upload_span; + const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset; + if (IsRangeGranular(cpu_addr, copy.size)) { + upload_span = std::span(cpu_memory.GetPointer(cpu_addr), copy.size); + } else { + if (immediate_buffer.empty()) { + immediate_buffer = ImmediateBuffer(largest_copy); + } + cpu_memory.ReadBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size); + upload_span = immediate_buffer.subspan(0, copy.size); } - cpu_memory.ReadBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size); - upload_span = immediate_buffer.subspan(0, copy.size); + buffer.ImmediateUpload(copy.dst_offset, upload_span); } - buffer.ImmediateUpload(copy.dst_offset, upload_span); } } template -void BufferCache

::MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, - std::span copies) { - auto upload_staging = runtime.UploadStagingBuffer(total_size_bytes); - const std::span staging_pointer = upload_staging.mapped_span; - for (BufferCopy& copy : copies) { - u8* const src_pointer = staging_pointer.data() + copy.src_offset; - const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset; - cpu_memory.ReadBlockUnsafe(cpu_addr, src_pointer, copy.size); +void BufferCache

::MappedUploadMemory([[maybe_unused]] Buffer& buffer, + [[maybe_unused]] u64 total_size_bytes, + [[maybe_unused]] std::span copies) { + if constexpr (USE_MEMORY_MAPS) { + auto upload_staging = runtime.UploadStagingBuffer(total_size_bytes); + const std::span staging_pointer = upload_staging.mapped_span; + for (BufferCopy& copy : copies) { + u8* const src_pointer = staging_pointer.data() + copy.src_offset; + const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset; + cpu_memory.ReadBlockUnsafe(cpu_addr, src_pointer, copy.size); - // Apply the staging offset - copy.src_offset += upload_staging.offset; + // Apply the staging offset + copy.src_offset += upload_staging.offset; + } + runtime.CopyBuffer(buffer, upload_staging.buffer, copies); } - runtime.CopyBuffer(buffer, upload_staging.buffer, copies); } template @@ -1886,30 +1569,31 @@ void BufferCache

::DownloadBufferMemory(Buffer& buffer, VAddr cpu_addr, u64 si boost::container::small_vector copies; u64 total_size_bytes = 0; u64 largest_copy = 0; - buffer.ForEachDownloadRangeAndClear(cpu_addr, size, [&](u64 range_offset, u64 range_size) { - const VAddr buffer_addr = buffer.CpuAddr(); - const auto add_download = [&](VAddr start, VAddr end) { - const u64 new_offset = start - buffer_addr; - const u64 new_size = end - start; - copies.push_back(BufferCopy{ - .src_offset = new_offset, - .dst_offset = total_size_bytes, - .size = new_size, - }); - // Align up to avoid cache conflicts - constexpr u64 align = 256ULL; - constexpr u64 mask = ~(align - 1ULL); - total_size_bytes += (new_size + align - 1) & mask; - largest_copy = std::max(largest_copy, new_size); - }; - - const VAddr start_address = buffer_addr + range_offset; - const VAddr end_address = start_address + range_size; - ForEachWrittenRange(start_address, range_size, add_download); - const IntervalType subtract_interval{start_address, end_address}; - ClearDownload(subtract_interval); - common_ranges.subtract(subtract_interval); - }); + memory_tracker.ForEachDownloadRangeAndClear( + cpu_addr, size, [&](u64 cpu_addr_out, u64 range_size) { + const VAddr buffer_addr = buffer.CpuAddr(); + const auto add_download = [&](VAddr start, VAddr end) { + const u64 new_offset = start - buffer_addr; + const u64 new_size = end - start; + copies.push_back(BufferCopy{ + .src_offset = new_offset, + .dst_offset = total_size_bytes, + .size = new_size, + }); + // Align up to avoid cache conflicts + constexpr u64 align = 8ULL; + constexpr u64 mask = ~(align - 1ULL); + total_size_bytes += (new_size + align - 1) & mask; + largest_copy = std::max(largest_copy, new_size); + }; + + const VAddr start_address = cpu_addr_out; + const VAddr end_address = start_address + range_size; + ForEachInRangeSet(common_ranges, start_address, range_size, add_download); + const IntervalType subtract_interval{start_address, end_address}; + ClearDownload(subtract_interval); + common_ranges.subtract(subtract_interval); + }); if (total_size_bytes == 0) { return; } @@ -1943,7 +1627,7 @@ void BufferCache

::DownloadBufferMemory(Buffer& buffer, VAddr cpu_addr, u64 si } template -void BufferCache

::DeleteBuffer(BufferId buffer_id) { +void BufferCache

::DeleteBuffer(BufferId buffer_id, bool do_not_mark) { const auto scalar_replace = [buffer_id](Binding& binding) { if (binding.buffer_id == buffer_id) { binding.buffer_id = BufferId{}; @@ -1962,8 +1646,10 @@ void BufferCache

::DeleteBuffer(BufferId buffer_id) { std::erase(cached_write_buffer_ids, buffer_id); // Mark the whole buffer as CPU written to stop tracking CPU writes - Buffer& buffer = slot_buffers[buffer_id]; - buffer.MarkRegionAsCpuModified(buffer.CpuAddr(), buffer.SizeBytes()); + if (!do_not_mark) { + Buffer& buffer = slot_buffers[buffer_id]; + memory_tracker.MarkRegionAsCpuModified(buffer.CpuAddr(), buffer.SizeBytes()); + } Unregister(buffer_id); delayed_destruction_ring.Push(std::move(slot_buffers[buffer_id])); @@ -2011,7 +1697,7 @@ typename BufferCache

::Binding BufferCache

::StorageBufferBinding(GPUVAddr s LOG_WARNING(HW_GPU, "Failed to find storage buffer for cbuf index {}", cbuf_index); return NULL_BINDING; } - const VAddr cpu_end = Common::AlignUp(*cpu_addr + size, Core::Memory::YUZU_PAGESIZE); + const VAddr cpu_end = Common::AlignUp(*cpu_addr + size, YUZU_PAGESIZE); const Binding binding{ .cpu_addr = *cpu_addr, .size = is_written ? size : static_cast(cpu_end - *cpu_addr), diff --git a/src/video_core/buffer_cache/buffer_cache_base.h b/src/video_core/buffer_cache/buffer_cache_base.h new file mode 100644 index 000000000..4b3677da3 --- /dev/null +++ b/src/video_core/buffer_cache/buffer_cache_base.h @@ -0,0 +1,507 @@ +// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#define BOOST_NO_MT +#include +#undef BOOST_NO_MT +#include +#include +#include + +#include "common/common_types.h" +#include "common/div_ceil.h" +#include "common/literals.h" +#include "common/lru_cache.h" +#include "common/microprofile.h" +#include "common/scope_exit.h" +#include "common/settings.h" +#include "core/memory.h" +#include "video_core/buffer_cache/buffer_base.h" +#include "video_core/control/channel_state_cache.h" +#include "video_core/delayed_destruction_ring.h" +#include "video_core/dirty_flags.h" +#include "video_core/engines/draw_manager.h" +#include "video_core/engines/kepler_compute.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/memory_manager.h" +#include "video_core/rasterizer_interface.h" +#include "video_core/surface.h" +#include "video_core/texture_cache/slot_vector.h" +#include "video_core/texture_cache/types.h" + + +namespace boost { +template +class fast_pool_allocator; +} + +namespace VideoCommon { + +MICROPROFILE_DECLARE(GPU_PrepareBuffers); +MICROPROFILE_DECLARE(GPU_BindUploadBuffers); +MICROPROFILE_DECLARE(GPU_DownloadMemory); + +using BufferId = SlotId; + +using VideoCore::Surface::PixelFormat; +using namespace Common::Literals; + +constexpr u32 NUM_VERTEX_BUFFERS = 32; +constexpr u32 NUM_TRANSFORM_FEEDBACK_BUFFERS = 4; +constexpr u32 NUM_GRAPHICS_UNIFORM_BUFFERS = 18; +constexpr u32 NUM_COMPUTE_UNIFORM_BUFFERS = 8; +constexpr u32 NUM_STORAGE_BUFFERS = 16; +constexpr u32 NUM_TEXTURE_BUFFERS = 16; +constexpr u32 NUM_STAGES = 5; + +using UniformBufferSizes = std::array, NUM_STAGES>; +using ComputeUniformBufferSizes = std::array; + +enum class ObtainBufferSynchronize : u32 { + NoSynchronize = 0, + FullSynchronize = 1, + SynchronizeNoDirty = 2, +}; + +enum class ObtainBufferOperation : u32 { + DoNothing = 0, + MarkAsWritten = 1, + DiscardWrite = 2, + MarkQuery = 3, +}; + +template +class BufferCache : public VideoCommon::ChannelSetupCaches { + // Page size for caching purposes. + // This is unrelated to the CPU page size and it can be changed as it seems optimal. + static constexpr u32 PAGE_BITS = 16; + static constexpr u64 PAGE_SIZE = u64{1} << PAGE_BITS; + static constexpr u32 CPU_PAGE_BITS = 12; + static constexpr u64 CPU_PAGE_SIZE = u64{1} << CPU_PAGE_BITS; + + static constexpr bool IS_OPENGL = P::IS_OPENGL; + static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = + P::HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS; + static constexpr bool HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT = + P::HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT; + static constexpr bool NEEDS_BIND_UNIFORM_INDEX = P::NEEDS_BIND_UNIFORM_INDEX; + static constexpr bool NEEDS_BIND_STORAGE_INDEX = P::NEEDS_BIND_STORAGE_INDEX; + static constexpr bool USE_MEMORY_MAPS = P::USE_MEMORY_MAPS; + static constexpr bool SEPARATE_IMAGE_BUFFERS_BINDINGS = P::SEPARATE_IMAGE_BUFFER_BINDINGS; + static constexpr bool IMPLEMENTS_ASYNC_DOWNLOADS = P::IMPLEMENTS_ASYNC_DOWNLOADS; + + static constexpr BufferId NULL_BUFFER_ID{0}; + + static constexpr s64 DEFAULT_EXPECTED_MEMORY = 512_MiB; + static constexpr s64 DEFAULT_CRITICAL_MEMORY = 1_GiB; + static constexpr s64 TARGET_THRESHOLD = 4_GiB; + + using Maxwell = Tegra::Engines::Maxwell3D::Regs; + + using Runtime = typename P::Runtime; + using Buffer = typename P::Buffer; + using Async_Buffer = typename P::Async_Buffer; + using MemoryTracker = typename P::MemoryTracker; + + using IntervalCompare = ICL_COMPARE_INSTANCE(ICL_COMPARE_DEFAULT, VAddr); + using IntervalInstance = ICL_INTERVAL_INSTANCE(ICL_INTERVAL_DEFAULT, VAddr, IntervalCompare); + using IntervalAllocator = boost::fast_pool_allocator; + using IntervalSet = + boost::icl::interval_set; + using IntervalType = typename IntervalSet::interval_type; + + struct Empty {}; + + struct OverlapResult { + std::vector ids; + VAddr begin; + VAddr end; + bool has_stream_leap = false; + }; + + struct Binding { + VAddr cpu_addr{}; + u32 size{}; + BufferId buffer_id; + }; + + struct TextureBufferBinding : Binding { + PixelFormat format; + }; + + static constexpr Binding NULL_BINDING{ + .cpu_addr = 0, + .size = 0, + .buffer_id = NULL_BUFFER_ID, + }; + +public: + static constexpr u32 DEFAULT_SKIP_CACHE_SIZE = static_cast(4_KiB); + + explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_, + Core::Memory::Memory& cpu_memory_, Runtime& runtime_); + + void TickFrame(); + + void WriteMemory(VAddr cpu_addr, u64 size); + + void CachedWriteMemory(VAddr cpu_addr, u64 size); + + void DownloadMemory(VAddr cpu_addr, u64 size); + + bool InlineMemory(VAddr dest_address, size_t copy_size, std::span inlined_buffer); + + void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size); + + void DisableGraphicsUniformBuffer(size_t stage, u32 index); + + void UpdateGraphicsBuffers(bool is_indexed); + + void UpdateComputeBuffers(); + + void BindHostGeometryBuffers(bool is_indexed); + + void BindHostStageBuffers(size_t stage); + + void BindHostComputeBuffers(); + + void SetUniformBuffersState(const std::array& mask, + const UniformBufferSizes* sizes); + + void SetComputeUniformBufferState(u32 mask, const ComputeUniformBufferSizes* sizes); + + void UnbindGraphicsStorageBuffers(size_t stage); + + void BindGraphicsStorageBuffer(size_t stage, size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset, + bool is_written); + + void UnbindGraphicsTextureBuffers(size_t stage); + + void BindGraphicsTextureBuffer(size_t stage, size_t tbo_index, GPUVAddr gpu_addr, u32 size, + PixelFormat format, bool is_written, bool is_image); + + void UnbindComputeStorageBuffers(); + + void BindComputeStorageBuffer(size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset, + bool is_written); + + void UnbindComputeTextureBuffers(); + + void BindComputeTextureBuffer(size_t tbo_index, GPUVAddr gpu_addr, u32 size, PixelFormat format, + bool is_written, bool is_image); + + [[nodiscard]] std::pair ObtainBuffer(GPUVAddr gpu_addr, u32 size, + ObtainBufferSynchronize sync_info, + ObtainBufferOperation post_op); + void FlushCachedWrites(); + + /// Return true when there are uncommitted buffers to be downloaded + [[nodiscard]] bool HasUncommittedFlushes() const noexcept; + + void AccumulateFlushes(); + + /// Return true when the caller should wait for async downloads + [[nodiscard]] bool ShouldWaitAsyncFlushes() const noexcept; + + /// Commit asynchronous downloads + void CommitAsyncFlushes(); + void CommitAsyncFlushesHigh(); + void CommitAsyncQueries(); + + /// Pop asynchronous downloads + void PopAsyncFlushes(); + + void PopAsyncQueries(); + void PopAsyncBuffers(); + + bool DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount); + + bool DMAClear(GPUVAddr src_address, u64 amount, u32 value); + + /// Return true when a CPU region is modified from the GPU + [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size); + + /// Return true when a region is registered on the cache + [[nodiscard]] bool IsRegionRegistered(VAddr addr, size_t size); + + /// Return true when a CPU region is modified from the CPU + [[nodiscard]] bool IsRegionCpuModified(VAddr addr, size_t size); + + void SetDrawIndirect(const Tegra::Engines::DrawManager::IndirectParams* current_draw_indirect_) { + current_draw_indirect = current_draw_indirect_; + } + + [[nodiscard]] std::pair GetDrawIndirectCount(); + + [[nodiscard]] std::pair GetDrawIndirectBuffer(); + + std::recursive_mutex mutex; + Runtime& runtime; + +private: + template + static void ForEachEnabledBit(u32 enabled_mask, Func&& func) { + for (u32 index = 0; enabled_mask != 0; ++index, enabled_mask >>= 1) { + const int disabled_bits = std::countr_zero(enabled_mask); + index += disabled_bits; + enabled_mask >>= disabled_bits; + func(index); + } + } + + template + void ForEachBufferInRange(VAddr cpu_addr, u64 size, Func&& func) { + const u64 page_end = Common::DivCeil(cpu_addr + size, PAGE_SIZE); + for (u64 page = cpu_addr >> PAGE_BITS; page < page_end;) { + const BufferId buffer_id = page_table[page]; + if (!buffer_id) { + ++page; + continue; + } + Buffer& buffer = slot_buffers[buffer_id]; + func(buffer_id, buffer); + + const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes(); + page = Common::DivCeil(end_addr, PAGE_SIZE); + } + } + + template + void ForEachInRangeSet(IntervalSet& current_range, VAddr cpu_addr, u64 size, Func&& func) { + const VAddr start_address = cpu_addr; + const VAddr end_address = start_address + size; + const IntervalType search_interval{start_address, end_address}; + auto it = current_range.lower_bound(search_interval); + if (it == current_range.end()) { + return; + } + auto end_it = current_range.upper_bound(search_interval); + for (; it != end_it; it++) { + VAddr inter_addr_end = it->upper(); + VAddr inter_addr = it->lower(); + if (inter_addr_end > end_address) { + inter_addr_end = end_address; + } + if (inter_addr < start_address) { + inter_addr = start_address; + } + func(inter_addr, inter_addr_end); + } + } + + static bool IsRangeGranular(VAddr cpu_addr, size_t size) { + return (cpu_addr & ~Core::Memory::YUZU_PAGEMASK) == + ((cpu_addr + size) & ~Core::Memory::YUZU_PAGEMASK); + } + + void RunGarbageCollector(); + + void BindHostIndexBuffer(); + + void BindHostVertexBuffers(); + + void BindHostDrawIndirectBuffers(); + + void BindHostGraphicsUniformBuffers(size_t stage); + + void BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 binding_index, bool needs_bind); + + void BindHostGraphicsStorageBuffers(size_t stage); + + void BindHostGraphicsTextureBuffers(size_t stage); + + void BindHostTransformFeedbackBuffers(); + + void BindHostComputeUniformBuffers(); + + void BindHostComputeStorageBuffers(); + + void BindHostComputeTextureBuffers(); + + void DoUpdateGraphicsBuffers(bool is_indexed); + + void DoUpdateComputeBuffers(); + + void UpdateIndexBuffer(); + + void UpdateVertexBuffers(); + + void UpdateVertexBuffer(u32 index); + + void UpdateDrawIndirect(); + + void UpdateUniformBuffers(size_t stage); + + void UpdateStorageBuffers(size_t stage); + + void UpdateTextureBuffers(size_t stage); + + void UpdateTransformFeedbackBuffers(); + + void UpdateTransformFeedbackBuffer(u32 index); + + void UpdateComputeUniformBuffers(); + + void UpdateComputeStorageBuffers(); + + void UpdateComputeTextureBuffers(); + + void MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size); + + [[nodiscard]] BufferId FindBuffer(VAddr cpu_addr, u32 size); + + [[nodiscard]] OverlapResult ResolveOverlaps(VAddr cpu_addr, u32 wanted_size); + + void JoinOverlap(BufferId new_buffer_id, BufferId overlap_id, bool accumulate_stream_score); + + [[nodiscard]] BufferId CreateBuffer(VAddr cpu_addr, u32 wanted_size); + + void Register(BufferId buffer_id); + + void Unregister(BufferId buffer_id); + + template + void ChangeRegister(BufferId buffer_id); + + void TouchBuffer(Buffer& buffer, BufferId buffer_id) noexcept; + + bool SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size); + + bool SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size); + + bool SynchronizeBufferNoModified(Buffer& buffer, VAddr cpu_addr, u32 size); + + void UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy, + std::span copies); + + void ImmediateUploadMemory(Buffer& buffer, u64 largest_copy, + std::span copies); + + void MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, std::span copies); + + void DownloadBufferMemory(Buffer& buffer_id); + + void DownloadBufferMemory(Buffer& buffer_id, VAddr cpu_addr, u64 size); + + void DeleteBuffer(BufferId buffer_id, bool do_not_mark = false); + + void NotifyBufferDeletion(); + + [[nodiscard]] Binding StorageBufferBinding(GPUVAddr ssbo_addr, u32 cbuf_index, bool is_written) const; + + [[nodiscard]] TextureBufferBinding GetTextureBufferBinding(GPUVAddr gpu_addr, u32 size, + PixelFormat format); + + [[nodiscard]] std::span ImmediateBufferWithData(VAddr cpu_addr, size_t size); + + [[nodiscard]] std::span ImmediateBuffer(size_t wanted_capacity); + + [[nodiscard]] bool HasFastUniformBufferBound(size_t stage, u32 binding_index) const noexcept; + + void ClearDownload(IntervalType subtract_interval); + + VideoCore::RasterizerInterface& rasterizer; + Core::Memory::Memory& cpu_memory; + + SlotVector slot_buffers; + DelayedDestructionRing delayed_destruction_ring; + + const Tegra::Engines::DrawManager::IndirectParams* current_draw_indirect{}; + + u32 last_index_count = 0; + + Binding index_buffer; + std::array vertex_buffers; + std::array, NUM_STAGES> uniform_buffers; + std::array, NUM_STAGES> storage_buffers; + std::array, NUM_STAGES> texture_buffers; + std::array transform_feedback_buffers; + Binding count_buffer_binding; + Binding indirect_buffer_binding; + + std::array compute_uniform_buffers; + std::array compute_storage_buffers; + std::array compute_texture_buffers; + + std::array enabled_uniform_buffer_masks{}; + u32 enabled_compute_uniform_buffer_mask = 0; + + const UniformBufferSizes* uniform_buffer_sizes{}; + const ComputeUniformBufferSizes* compute_uniform_buffer_sizes{}; + + std::array enabled_storage_buffers{}; + std::array written_storage_buffers{}; + u32 enabled_compute_storage_buffers = 0; + u32 written_compute_storage_buffers = 0; + + std::array enabled_texture_buffers{}; + std::array written_texture_buffers{}; + std::array image_texture_buffers{}; + u32 enabled_compute_texture_buffers = 0; + u32 written_compute_texture_buffers = 0; + u32 image_compute_texture_buffers = 0; + + std::array uniform_cache_hits{}; + std::array uniform_cache_shots{}; + + u32 uniform_buffer_skip_cache_size = DEFAULT_SKIP_CACHE_SIZE; + + bool has_deleted_buffers = false; + + std::conditional_t, Empty> + dirty_uniform_buffers{}; + std::conditional_t, Empty> fast_bound_uniform_buffers{}; + std::conditional_t, NUM_STAGES>, Empty> + uniform_buffer_binding_sizes{}; + + std::vector cached_write_buffer_ids; + + MemoryTracker memory_tracker; + IntervalSet uncommitted_ranges; + IntervalSet common_ranges; + IntervalSet cached_ranges; + std::deque committed_ranges; + + // Async Buffers + std::deque async_downloads; + std::deque> async_buffers; + std::deque> pending_downloads; + std::optional current_buffer; + + // queries + boost::container::small_vector, 8> pending_queries; + std::deque> committed_queries; + boost::container::small_vector flushed_queries; + std::deque> query_async_buffers; + + size_t immediate_buffer_capacity = 0; + Common::ScratchBuffer immediate_buffer_alloc; + + struct LRUItemParams { + using ObjectType = BufferId; + using TickType = u64; + }; + Common::LeastRecentlyUsedCache lru_cache; + u64 frame_tick = 0; + u64 total_used_memory = 0; + u64 minimum_memory = 0; + u64 critical_memory = 0; + + std::array> PAGE_BITS)> page_table; +}; + +} // namespace VideoCommon diff --git a/src/video_core/buffer_cache/memory_tracker_base.h b/src/video_core/buffer_cache/memory_tracker_base.h new file mode 100644 index 000000000..93bd779c9 --- /dev/null +++ b/src/video_core/buffer_cache/memory_tracker_base.h @@ -0,0 +1,258 @@ +// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "common/alignment.h" +#include "common/common_types.h" +#include "video_core/buffer_cache/word_manager.h" + +namespace VideoCommon { + +template +class MemoryTrackerBase { + static constexpr size_t MAX_CPU_PAGE_BITS = 39; + static constexpr size_t HIGHER_PAGE_BITS = 22; + static constexpr size_t HIGHER_PAGE_SIZE = 1ULL << HIGHER_PAGE_BITS; + static constexpr size_t HIGHER_PAGE_MASK = HIGHER_PAGE_SIZE - 1ULL; + static constexpr size_t NUM_HIGH_PAGES = 1ULL << (MAX_CPU_PAGE_BITS - HIGHER_PAGE_BITS); + static constexpr size_t MANAGER_POOL_SIZE = 32; + static constexpr size_t WORDS_STACK_NEEDED = HIGHER_PAGE_SIZE / BYTES_PER_WORD; + using Manager = WordManager; + +public: + MemoryTrackerBase(RasterizerInterface& rasterizer_) : rasterizer{&rasterizer_} {} + ~MemoryTrackerBase() = default; + + /// Returns the inclusive CPU modified range in a begin end pair + [[nodiscard]] std::pair ModifiedCpuRegion(VAddr query_cpu_addr, + u64 query_size) noexcept { + return IteratePairs(query_cpu_addr, query_size, + [](Manager* manager, u64 offset, size_t size) { + return manager->ModifiedRegion(offset, size); + }); + } + + /// Returns the inclusive GPU modified range in a begin end pair + [[nodiscard]] std::pair ModifiedGpuRegion(VAddr query_cpu_addr, + u64 query_size) noexcept { + return IteratePairs(query_cpu_addr, query_size, + [](Manager* manager, u64 offset, size_t size) { + return manager->ModifiedRegion(offset, size); + }); + } + + /// Returns true if a region has been modified from the CPU + [[nodiscard]] bool IsRegionCpuModified(VAddr query_cpu_addr, u64 query_size) noexcept { + return IteratePages(query_cpu_addr, query_size, + [](Manager* manager, u64 offset, size_t size) { + return manager->IsRegionModified(offset, size); + }); + } + + /// Returns true if a region has been modified from the GPU + [[nodiscard]] bool IsRegionGpuModified(VAddr query_cpu_addr, u64 query_size) noexcept { + return IteratePages(query_cpu_addr, query_size, + [](Manager* manager, u64 offset, size_t size) { + return manager->IsRegionModified(offset, size); + }); + } + + /// Mark region as CPU modified, notifying the rasterizer about this change + void MarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 query_size) { + IteratePages( + dirty_cpu_addr, query_size, [](Manager* manager, u64 offset, size_t size) { + manager->ChangeRegionState(manager->GetCpuAddr() + offset, size); + }); + } + + /// Unmark region as CPU modified, notifying the rasterizer about this change + void UnmarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 query_size) { + IteratePages( + dirty_cpu_addr, query_size, [](Manager* manager, u64 offset, size_t size) { + manager->ChangeRegionState(manager->GetCpuAddr() + offset, size); + }); + } + + /// Mark region as modified from the host GPU + void MarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 query_size) noexcept { + IteratePages( + dirty_cpu_addr, query_size, [](Manager* manager, u64 offset, size_t size) { + manager->ChangeRegionState(manager->GetCpuAddr() + offset, size); + }); + } + + /// Unmark region as modified from the host GPU + void UnmarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 query_size) noexcept { + IteratePages( + dirty_cpu_addr, query_size, [](Manager* manager, u64 offset, size_t size) { + manager->ChangeRegionState(manager->GetCpuAddr() + offset, size); + }); + } + + /// Mark region as modified from the CPU + /// but don't mark it as modified until FlusHCachedWrites is called. + void CachedCpuWrite(VAddr dirty_cpu_addr, u64 query_size) { + IteratePages( + dirty_cpu_addr, query_size, [this](Manager* manager, u64 offset, size_t size) { + const VAddr cpu_address = manager->GetCpuAddr() + offset; + manager->ChangeRegionState(cpu_address, size); + cached_pages.insert(static_cast(cpu_address >> HIGHER_PAGE_BITS)); + }); + } + + /// Flushes cached CPU writes, and notify the rasterizer about the deltas + void FlushCachedWrites(VAddr query_cpu_addr, u64 query_size) noexcept { + IteratePages(query_cpu_addr, query_size, + [](Manager* manager, [[maybe_unused]] u64 offset, + [[maybe_unused]] size_t size) { manager->FlushCachedWrites(); }); + } + + void FlushCachedWrites() noexcept { + for (auto id : cached_pages) { + top_tier[id]->FlushCachedWrites(); + } + cached_pages.clear(); + } + + /// Call 'func' for each CPU modified range and unmark those pages as CPU modified + template + void ForEachUploadRange(VAddr query_cpu_range, u64 query_size, Func&& func) { + IteratePages(query_cpu_range, query_size, + [&func](Manager* manager, u64 offset, size_t size) { + manager->ForEachModifiedRange( + manager->GetCpuAddr() + offset, size, true, func); + }); + } + + /// Call 'func' for each GPU modified range and unmark those pages as GPU modified + template + void ForEachDownloadRange(VAddr query_cpu_range, u64 query_size, bool clear, Func&& func) { + IteratePages(query_cpu_range, query_size, + [&func, clear](Manager* manager, u64 offset, size_t size) { + manager->ForEachModifiedRange( + manager->GetCpuAddr() + offset, size, clear, func); + }); + } + + template + void ForEachDownloadRangeAndClear(VAddr query_cpu_range, u64 query_size, Func&& func) { + IteratePages(query_cpu_range, query_size, + [&func](Manager* manager, u64 offset, size_t size) { + manager->ForEachModifiedRange( + manager->GetCpuAddr() + offset, size, true, func); + }); + } + +private: + template + bool IteratePages(VAddr cpu_address, size_t size, Func&& func) { + using FuncReturn = typename std::invoke_result::type; + static constexpr bool BOOL_BREAK = std::is_same_v; + std::size_t remaining_size{size}; + std::size_t page_index{cpu_address >> HIGHER_PAGE_BITS}; + u64 page_offset{cpu_address & HIGHER_PAGE_MASK}; + while (remaining_size > 0) { + const std::size_t copy_amount{std::min(HIGHER_PAGE_SIZE - page_offset, remaining_size)}; + auto* manager{top_tier[page_index]}; + if (manager) { + if constexpr (BOOL_BREAK) { + if (func(manager, page_offset, copy_amount)) { + return true; + } + } else { + func(manager, page_offset, copy_amount); + } + } else if constexpr (create_region_on_fail) { + CreateRegion(page_index); + manager = top_tier[page_index]; + if constexpr (BOOL_BREAK) { + if (func(manager, page_offset, copy_amount)) { + return true; + } + } else { + func(manager, page_offset, copy_amount); + } + } + page_index++; + page_offset = 0; + remaining_size -= copy_amount; + } + return false; + } + + template + std::pair IteratePairs(VAddr cpu_address, size_t size, Func&& func) { + std::size_t remaining_size{size}; + std::size_t page_index{cpu_address >> HIGHER_PAGE_BITS}; + u64 page_offset{cpu_address & HIGHER_PAGE_MASK}; + u64 begin = std::numeric_limits::max(); + u64 end = 0; + while (remaining_size > 0) { + const std::size_t copy_amount{std::min(HIGHER_PAGE_SIZE - page_offset, remaining_size)}; + auto* manager{top_tier[page_index]}; + const auto execute = [&] { + auto [new_begin, new_end] = func(manager, page_offset, copy_amount); + if (new_begin != 0 || new_end != 0) { + const u64 base_address = page_index << HIGHER_PAGE_BITS; + begin = std::min(new_begin + base_address, begin); + end = std::max(new_end + base_address, end); + } + }; + if (manager) { + execute(); + } else if constexpr (create_region_on_fail) { + CreateRegion(page_index); + manager = top_tier[page_index]; + execute(); + } + page_index++; + page_offset = 0; + remaining_size -= copy_amount; + } + return begin < end ? std::make_pair(begin, end) : std::make_pair(0ULL, 0ULL); + } + + void CreateRegion(std::size_t page_index) { + const VAddr base_cpu_addr = page_index << HIGHER_PAGE_BITS; + top_tier[page_index] = GetNewManager(base_cpu_addr); + } + + Manager* GetNewManager(VAddr base_cpu_addess) { + const auto on_return = [&] { + auto* new_manager = free_managers.front(); + new_manager->SetCpuAddress(base_cpu_addess); + free_managers.pop_front(); + return new_manager; + }; + if (!free_managers.empty()) { + return on_return(); + } + manager_pool.emplace_back(); + auto& last_pool = manager_pool.back(); + for (size_t i = 0; i < MANAGER_POOL_SIZE; i++) { + new (&last_pool[i]) Manager(0, *rasterizer, HIGHER_PAGE_SIZE); + free_managers.push_back(&last_pool[i]); + } + return on_return(); + } + + std::deque> manager_pool; + std::deque free_managers; + + std::array top_tier{}; + + std::unordered_set cached_pages; + + RasterizerInterface* rasterizer = nullptr; +}; + +} // namespace VideoCommon diff --git a/src/video_core/buffer_cache/word_manager.h b/src/video_core/buffer_cache/word_manager.h new file mode 100644 index 000000000..782951fe7 --- /dev/null +++ b/src/video_core/buffer_cache/word_manager.h @@ -0,0 +1,474 @@ +// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + +#pragma once + +#include +#include +#include +#include + +#include "common/alignment.h" +#include "common/common_funcs.h" +#include "common/common_types.h" +#include "common/div_ceil.h" +#include "core/memory.h" + +namespace VideoCommon { + +constexpr u64 PAGES_PER_WORD = 64; +constexpr u64 BYTES_PER_PAGE = Core::Memory::YUZU_PAGESIZE; +constexpr u64 BYTES_PER_WORD = PAGES_PER_WORD * BYTES_PER_PAGE; + +/// Vector tracking modified pages tightly packed with small vector optimization +template +union WordsArray { + /// Returns the pointer to the words state + [[nodiscard]] const u64* Pointer(bool is_short) const noexcept { + return is_short ? stack.data() : heap; + } + + /// Returns the pointer to the words state + [[nodiscard]] u64* Pointer(bool is_short) noexcept { + return is_short ? stack.data() : heap; + } + + std::array stack{}; ///< Small buffers storage + u64* heap; ///< Not-small buffers pointer to the storage +}; + +template +struct Words { + explicit Words() = default; + explicit Words(u64 size_bytes_) : size_bytes{size_bytes_} { + if (IsShort()) { + cpu.stack.fill(~u64{0}); + gpu.stack.fill(0); + cached_cpu.stack.fill(0); + untracked.stack.fill(~u64{0}); + } else { + const size_t num_words = NumWords(); + // Share allocation between CPU and GPU pages and set their default values + u64* const alloc = new u64[num_words * 4]; + cpu.heap = alloc; + gpu.heap = alloc + num_words; + cached_cpu.heap = alloc + num_words * 2; + untracked.heap = alloc + num_words * 3; + std::fill_n(cpu.heap, num_words, ~u64{0}); + std::fill_n(gpu.heap, num_words, 0); + std::fill_n(cached_cpu.heap, num_words, 0); + std::fill_n(untracked.heap, num_words, ~u64{0}); + } + // Clean up tailing bits + const u64 last_word_size = size_bytes % BYTES_PER_WORD; + const u64 last_local_page = Common::DivCeil(last_word_size, BYTES_PER_PAGE); + const u64 shift = (PAGES_PER_WORD - last_local_page) % PAGES_PER_WORD; + const u64 last_word = (~u64{0} << shift) >> shift; + cpu.Pointer(IsShort())[NumWords() - 1] = last_word; + untracked.Pointer(IsShort())[NumWords() - 1] = last_word; + } + + ~Words() { + Release(); + } + + Words& operator=(Words&& rhs) noexcept { + Release(); + size_bytes = rhs.size_bytes; + cpu = rhs.cpu; + gpu = rhs.gpu; + cached_cpu = rhs.cached_cpu; + untracked = rhs.untracked; + rhs.cpu.heap = nullptr; + return *this; + } + + Words(Words&& rhs) noexcept + : size_bytes{rhs.size_bytes}, cpu{rhs.cpu}, gpu{rhs.gpu}, + cached_cpu{rhs.cached_cpu}, untracked{rhs.untracked} { + rhs.cpu.heap = nullptr; + } + + Words& operator=(const Words&) = delete; + Words(const Words&) = delete; + + /// Returns true when the buffer fits in the small vector optimization + [[nodiscard]] bool IsShort() const noexcept { + return size_bytes <= stack_words * BYTES_PER_WORD; + } + + /// Returns the number of words of the buffer + [[nodiscard]] size_t NumWords() const noexcept { + return Common::DivCeil(size_bytes, BYTES_PER_WORD); + } + + /// Release buffer resources + void Release() { + if (!IsShort()) { + // CPU written words is the base for the heap allocation + delete[] cpu.heap; + } + } + + u64 size_bytes = 0; + WordsArray cpu; + WordsArray gpu; + WordsArray cached_cpu; + WordsArray untracked; +}; + +enum class Type { + CPU, + GPU, + CachedCPU, + Untracked, +}; + +template +class WordManager { +public: + explicit WordManager(VAddr cpu_addr_, RasterizerInterface& rasterizer_, u64 size_bytes) + : cpu_addr{cpu_addr_}, rasterizer{&rasterizer_}, words{size_bytes} {} + + explicit WordManager() = default; + + void SetCpuAddress(VAddr new_cpu_addr) { + cpu_addr = new_cpu_addr; + } + + VAddr GetCpuAddr() const { + return cpu_addr; + } + + /** + * Change the state of a range of pages + * + * @param dirty_addr Base address to mark or unmark as modified + * @param size Size in bytes to mark or unmark as modified + */ + template + void ChangeRegionState(u64 dirty_addr, s64 size) noexcept(type == Type::GPU) { + const s64 difference = dirty_addr - cpu_addr; + const u64 offset = std::max(difference, 0); + size += std::min(difference, 0); + if (offset >= SizeBytes() || size < 0) { + return; + } + u64* const untracked_words = Array(); + u64* const state_words = Array(); + const u64 offset_end = std::min(offset + size, SizeBytes()); + const u64 begin_page_index = offset / BYTES_PER_PAGE; + const u64 begin_word_index = begin_page_index / PAGES_PER_WORD; + const u64 end_page_index = Common::DivCeil(offset_end, BYTES_PER_PAGE); + const u64 end_word_index = Common::DivCeil(end_page_index, PAGES_PER_WORD); + u64 page_index = begin_page_index % PAGES_PER_WORD; + u64 word_index = begin_word_index; + while (word_index < end_word_index) { + const u64 next_word_first_page = (word_index + 1) * PAGES_PER_WORD; + const u64 left_offset = + std::min(next_word_first_page - end_page_index, PAGES_PER_WORD) % PAGES_PER_WORD; + const u64 right_offset = page_index; + u64 bits = ~u64{0}; + bits = (bits >> right_offset) << right_offset; + bits = (bits << left_offset) >> left_offset; + if constexpr (type == Type::CPU || type == Type::CachedCPU) { + NotifyRasterizer(word_index, untracked_words[word_index], bits); + } + if constexpr (enable) { + state_words[word_index] |= bits; + if constexpr (type == Type::CPU || type == Type::CachedCPU) { + untracked_words[word_index] |= bits; + } + } else { + state_words[word_index] &= ~bits; + if constexpr (type == Type::CPU || type == Type::CachedCPU) { + untracked_words[word_index] &= ~bits; + } + } + page_index = 0; + ++word_index; + } + } + + /** + * Loop over each page in the given range, turn off those bits and notify the rasterizer if + * needed. Call the given function on each turned off range. + * + * @param query_cpu_range Base CPU address to loop over + * @param size Size in bytes of the CPU range to loop over + * @param func Function to call for each turned off region + */ + template + void ForEachModifiedRange(VAddr query_cpu_range, s64 size, bool clear, Func&& func) { + static_assert(type != Type::Untracked); + + const s64 difference = query_cpu_range - cpu_addr; + const u64 query_begin = std::max(difference, 0); + size += std::min(difference, 0); + if (query_begin >= SizeBytes() || size < 0) { + return; + } + [[maybe_unused]] u64* const untracked_words = Array(); + [[maybe_unused]] u64* const cpu_words = Array(); + u64* const state_words = Array(); + const u64 query_end = query_begin + std::min(static_cast(size), SizeBytes()); + u64* const words_begin = state_words + query_begin / BYTES_PER_WORD; + u64* const words_end = state_words + Common::DivCeil(query_end, BYTES_PER_WORD); + u64 first_page = (query_begin / BYTES_PER_PAGE) % PAGES_PER_WORD; + + const auto modified = [](u64 word) { return word != 0; }; + const auto first_modified_word = std::find_if(words_begin, words_end, modified); + if (first_modified_word == words_end) { + // Exit early when the buffer is not modified + return; + } + if (first_modified_word != words_begin) { + first_page = 0; + } + std::reverse_iterator first_word_reverse(first_modified_word); + std::reverse_iterator last_word_iterator(words_end); + auto last_word_result = std::find_if(last_word_iterator, first_word_reverse, modified); + u64* const last_modified_word = &(*last_word_result) + 1; + + const u64 word_index_begin = std::distance(state_words, first_modified_word); + const u64 word_index_end = std::distance(state_words, last_modified_word); + const unsigned local_page_begin = std::countr_zero(*first_modified_word); + const unsigned local_page_end = + static_cast(PAGES_PER_WORD) - std::countl_zero(last_modified_word[-1]); + const u64 word_page_begin = word_index_begin * PAGES_PER_WORD; + const u64 word_page_end = (word_index_end - 1) * PAGES_PER_WORD; + const u64 query_page_begin = query_begin / BYTES_PER_PAGE; + const u64 query_page_end = Common::DivCeil(query_end, BYTES_PER_PAGE); + const u64 page_index_begin = std::max(word_page_begin + local_page_begin, query_page_begin); + const u64 page_index_end = std::min(word_page_end + local_page_end, query_page_end); + const u64 first_word_page_begin = page_index_begin % PAGES_PER_WORD; + const u64 last_word_page_end = (page_index_end - 1) % PAGES_PER_WORD + 1; + + u64 page_begin = std::max(first_word_page_begin, first_page); + u64 current_base = 0; + u64 current_size = 0; + bool on_going = false; + for (u64 word_index = word_index_begin; word_index < word_index_end; ++word_index) { + const bool is_last_word = word_index + 1 == word_index_end; + const u64 page_end = is_last_word ? last_word_page_end : PAGES_PER_WORD; + const u64 right_offset = page_begin; + const u64 left_offset = PAGES_PER_WORD - page_end; + u64 bits = ~u64{0}; + bits = (bits >> right_offset) << right_offset; + bits = (bits << left_offset) >> left_offset; + + const u64 current_word = state_words[word_index] & bits; + if (clear) { + state_words[word_index] &= ~bits; + } + + if constexpr (type == Type::CachedCPU) { + NotifyRasterizer(word_index, untracked_words[word_index], current_word); + untracked_words[word_index] |= current_word; + cpu_words[word_index] |= current_word; + } + + if constexpr (type == Type::CPU) { + const u64 current_bits = untracked_words[word_index] & bits; + untracked_words[word_index] &= ~bits; + NotifyRasterizer(word_index, current_bits, ~u64{0}); + } + const u64 word = current_word; + u64 page = page_begin; + page_begin = 0; + + while (page < page_end) { + const int empty_bits = std::countr_zero(word >> page); + if (on_going && empty_bits != 0) { + InvokeModifiedRange(func, current_size, current_base); + current_size = 0; + on_going = false; + } + if (empty_bits == PAGES_PER_WORD) { + break; + } + page += empty_bits; + + const int continuous_bits = std::countr_one(word >> page); + if (!on_going && continuous_bits != 0) { + current_base = word_index * PAGES_PER_WORD + page; + on_going = true; + } + current_size += continuous_bits; + page += continuous_bits; + } + } + if (on_going && current_size > 0) { + InvokeModifiedRange(func, current_size, current_base); + } + } + + template + void InvokeModifiedRange(Func&& func, u64 current_size, u64 current_base) { + const u64 current_size_bytes = current_size * BYTES_PER_PAGE; + const u64 offset_begin = current_base * BYTES_PER_PAGE; + const u64 offset_end = std::min(offset_begin + current_size_bytes, SizeBytes()); + func(cpu_addr + offset_begin, offset_end - offset_begin); + } + + /** + * Returns true when a region has been modified + * + * @param offset Offset in bytes from the start of the buffer + * @param size Size in bytes of the region to query for modifications + */ + template + [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept { + static_assert(type != Type::Untracked); + + const u64* const state_words = Array(); + const u64 num_query_words = size / BYTES_PER_WORD + 1; + const u64 word_begin = offset / BYTES_PER_WORD; + const u64 word_end = std::min(word_begin + num_query_words, NumWords()); + const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE); + u64 page_index = (offset / BYTES_PER_PAGE) % PAGES_PER_WORD; + for (u64 word_index = word_begin; word_index < word_end; ++word_index, page_index = 0) { + const u64 word = state_words[word_index]; + if (word == 0) { + continue; + } + const u64 page_end = std::min((word_index + 1) * PAGES_PER_WORD, page_limit); + const u64 local_page_end = page_end % PAGES_PER_WORD; + const u64 page_end_shift = (PAGES_PER_WORD - local_page_end) % PAGES_PER_WORD; + if (((word >> page_index) << page_index) << page_end_shift != 0) { + return true; + } + } + return false; + } + + /** + * Returns a begin end pair with the inclusive modified region + * + * @param offset Offset in bytes from the start of the buffer + * @param size Size in bytes of the region to query for modifications + */ + template + [[nodiscard]] std::pair ModifiedRegion(u64 offset, u64 size) const noexcept { + static_assert(type != Type::Untracked); + const u64* const state_words = Array(); + const u64 num_query_words = size / BYTES_PER_WORD + 1; + const u64 word_begin = offset / BYTES_PER_WORD; + const u64 word_end = std::min(word_begin + num_query_words, NumWords()); + const u64 page_base = offset / BYTES_PER_PAGE; + u64 page_begin = page_base & (PAGES_PER_WORD - 1); + u64 page_end = + Common::DivCeil(offset + size, BYTES_PER_PAGE) - (page_base & ~(PAGES_PER_WORD - 1)); + u64 begin = std::numeric_limits::max(); + u64 end = 0; + for (u64 word_index = word_begin; word_index < word_end; ++word_index) { + const u64 base_mask = (1ULL << page_begin) - 1ULL; + const u64 end_mask = page_end >= PAGES_PER_WORD ? 0ULL : ~((1ULL << page_end) - 1ULL); + const u64 off_word = end_mask | base_mask; + const u64 word = state_words[word_index] & ~off_word; + if (word == 0) { + page_begin = 0; + page_end -= PAGES_PER_WORD; + continue; + } + const u64 local_page_begin = std::countr_zero(word); + const u64 local_page_end = PAGES_PER_WORD - std::countl_zero(word); + const u64 page_index = word_index * PAGES_PER_WORD; + begin = std::min(begin, page_index + local_page_begin); + end = page_index + local_page_end; + page_begin = 0; + page_end -= PAGES_PER_WORD; + } + static constexpr std::pair EMPTY{0, 0}; + return begin < end ? std::make_pair(begin * BYTES_PER_PAGE, end * BYTES_PER_PAGE) : EMPTY; + } + + /// Returns the number of words of the manager + [[nodiscard]] size_t NumWords() const noexcept { + return words.NumWords(); + } + + /// Returns the size in bytes of the manager + [[nodiscard]] u64 SizeBytes() const noexcept { + return words.size_bytes; + } + + /// Returns true when the buffer fits in the small vector optimization + [[nodiscard]] bool IsShort() const noexcept { + return words.IsShort(); + } + + void FlushCachedWrites() noexcept { + const u64 num_words = NumWords(); + u64* const cached_words = Array(); + u64* const untracked_words = Array(); + u64* const cpu_words = Array(); + for (u64 word_index = 0; word_index < num_words; ++word_index) { + const u64 cached_bits = cached_words[word_index]; + NotifyRasterizer(word_index, untracked_words[word_index], cached_bits); + untracked_words[word_index] |= cached_bits; + cpu_words[word_index] |= cached_bits; + cached_words[word_index] = 0; + } + } + +private: + template + u64* Array() noexcept { + if constexpr (type == Type::CPU) { + return words.cpu.Pointer(IsShort()); + } else if constexpr (type == Type::GPU) { + return words.gpu.Pointer(IsShort()); + } else if constexpr (type == Type::CachedCPU) { + return words.cached_cpu.Pointer(IsShort()); + } else if constexpr (type == Type::Untracked) { + return words.untracked.Pointer(IsShort()); + } + } + + template + const u64* Array() const noexcept { + if constexpr (type == Type::CPU) { + return words.cpu.Pointer(IsShort()); + } else if constexpr (type == Type::GPU) { + return words.gpu.Pointer(IsShort()); + } else if constexpr (type == Type::CachedCPU) { + return words.cached_cpu.Pointer(IsShort()); + } else if constexpr (type == Type::Untracked) { + return words.untracked.Pointer(IsShort()); + } + } + + /** + * Notify rasterizer about changes in the CPU tracking state of a word in the buffer + * + * @param word_index Index to the word to notify to the rasterizer + * @param current_bits Current state of the word + * @param new_bits New state of the word + * + * @tparam add_to_rasterizer True when the rasterizer should start tracking the new pages + */ + template + void NotifyRasterizer(u64 word_index, u64 current_bits, u64 new_bits) const { + u64 changed_bits = (add_to_rasterizer ? current_bits : ~current_bits) & new_bits; + VAddr addr = cpu_addr + word_index * BYTES_PER_WORD; + while (changed_bits != 0) { + const int empty_bits = std::countr_zero(changed_bits); + addr += empty_bits * BYTES_PER_PAGE; + changed_bits >>= empty_bits; + + const u32 continuous_bits = std::countr_one(changed_bits); + const u64 size = continuous_bits * BYTES_PER_PAGE; + const VAddr begin_addr = addr; + addr += size; + changed_bits = continuous_bits < PAGES_PER_WORD ? (changed_bits >> continuous_bits) : 0; + rasterizer->UpdatePagesCachedCount(begin_addr, size, add_to_rasterizer ? 1 : -1); + } + } + + VAddr cpu_addr = 0; + RasterizerInterface* rasterizer = nullptr; + Words words; +}; + +} // namespace VideoCommon diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h index a8c3f8b67..18d3c3ac0 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.h +++ b/src/video_core/renderer_opengl/gl_buffer_cache.h @@ -8,6 +8,7 @@ #include "common/common_types.h" #include "video_core/buffer_cache/buffer_cache.h" +#include "video_core/buffer_cache/memory_tracker_base.h" #include "video_core/rasterizer_interface.h" #include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_resource_manager.h" @@ -200,6 +201,8 @@ private: struct BufferCacheParams { using Runtime = OpenGL::BufferCacheRuntime; using Buffer = OpenGL::Buffer; + using Async_Buffer = u32; + using MemoryTracker = VideoCommon::MemoryTrackerBase; static constexpr bool IS_OPENGL = true; static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = true; @@ -208,6 +211,7 @@ struct BufferCacheParams { static constexpr bool NEEDS_BIND_STORAGE_INDEX = true; static constexpr bool USE_MEMORY_MAPS = false; static constexpr bool SEPARATE_IMAGE_BUFFER_BINDINGS = true; + static constexpr bool IMPLEMENTS_ASYNC_DOWNLOADS = false; }; using BufferCache = VideoCommon::BufferCache; diff --git a/src/video_core/renderer_opengl/gl_buffer_cache_base.cpp b/src/video_core/renderer_opengl/gl_buffer_cache_base.cpp new file mode 100644 index 000000000..f15ae8e25 --- /dev/null +++ b/src/video_core/renderer_opengl/gl_buffer_cache_base.cpp @@ -0,0 +1,9 @@ +// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "video_core/buffer_cache/buffer_cache.h" +#include "video_core/renderer_opengl/gl_buffer_cache.h" + +namespace VideoCommon { +template class VideoCommon::BufferCache; +} diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp index 9cbcb3c8f..510602e8e 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp @@ -314,8 +314,12 @@ StagingBufferRef BufferCacheRuntime::UploadStagingBuffer(size_t size) { return staging_pool.Request(size, MemoryUsage::Upload); } -StagingBufferRef BufferCacheRuntime::DownloadStagingBuffer(size_t size) { - return staging_pool.Request(size, MemoryUsage::Download); +StagingBufferRef BufferCacheRuntime::DownloadStagingBuffer(size_t size, bool deferred) { + return staging_pool.Request(size, MemoryUsage::Download, deferred); +} + +void BufferCacheRuntime::FreeDeferredStagingBuffer(StagingBufferRef& ref) { + staging_pool.FreeDeferred(ref); } u64 BufferCacheRuntime::GetDeviceLocalMemory() const { diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h index 183b33632..05968e6a6 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.h +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h @@ -4,6 +4,7 @@ #pragma once #include "video_core/buffer_cache/buffer_cache.h" +#include "video_core/buffer_cache/memory_tracker_base.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/renderer_vulkan/vk_compute_pass.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" @@ -75,7 +76,9 @@ public: [[nodiscard]] StagingBufferRef UploadStagingBuffer(size_t size); - [[nodiscard]] StagingBufferRef DownloadStagingBuffer(size_t size); + [[nodiscard]] StagingBufferRef DownloadStagingBuffer(size_t size, bool deferred = false); + + void FreeDeferredStagingBuffer(StagingBufferRef& ref); void PreCopyBarrier(); @@ -142,6 +145,8 @@ private: struct BufferCacheParams { using Runtime = Vulkan::BufferCacheRuntime; using Buffer = Vulkan::Buffer; + using Async_Buffer = Vulkan::StagingBufferRef; + using MemoryTracker = VideoCommon::MemoryTrackerBase; static constexpr bool IS_OPENGL = false; static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = false; @@ -150,6 +155,7 @@ struct BufferCacheParams { static constexpr bool NEEDS_BIND_STORAGE_INDEX = false; static constexpr bool USE_MEMORY_MAPS = true; static constexpr bool SEPARATE_IMAGE_BUFFER_BINDINGS = false; + static constexpr bool IMPLEMENTS_ASYNC_DOWNLOADS = true; }; using BufferCache = VideoCommon::BufferCache; diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache_base.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache_base.cpp new file mode 100644 index 000000000..f9e271507 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_buffer_cache_base.cpp @@ -0,0 +1,9 @@ +// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "video_core/buffer_cache/buffer_cache.h" +#include "video_core/renderer_vulkan/vk_buffer_cache.h" + +namespace VideoCommon { +template class VideoCommon::BufferCache; +} -- cgit v1.2.3 From f2d3212de97ebed710bc03792343fae45b3203f3 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Sat, 22 Apr 2023 13:36:18 +0200 Subject: Buffer Cache rework: Setup async downloads. --- src/video_core/buffer_cache/buffer_cache.h | 229 ++++++++++-------------- src/video_core/buffer_cache/buffer_cache_base.h | 65 ++++++- 2 files changed, 154 insertions(+), 140 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index a0701ce4e..43fe5b080 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -11,6 +11,8 @@ namespace VideoCommon { +using Core::Memory::YUZU_PAGESIZE; + template BufferCache

::BufferCache(VideoCore::RasterizerInterface& rasterizer_, Core::Memory::Memory& cpu_memory_, Runtime& runtime_) @@ -87,9 +89,11 @@ void BufferCache

::TickFrame() { template void BufferCache

::WriteMemory(VAddr cpu_addr, u64 size) { memory_tracker.MarkRegionAsCpuModified(cpu_addr, size); - const IntervalType subtract_interval{cpu_addr, cpu_addr + size}; - ClearDownload(subtract_interval); - common_ranges.subtract(subtract_interval); + if (memory_tracker.IsRegionGpuModified(cpu_addr, size)) { + const IntervalType subtract_interval{cpu_addr, cpu_addr + size}; + ClearDownload(subtract_interval); + common_ranges.subtract(subtract_interval); + } } template @@ -102,17 +106,33 @@ void BufferCache

::CachedWriteMemory(VAddr cpu_addr, u64 size) { template void BufferCache

::DownloadMemory(VAddr cpu_addr, u64 size) { + WaitOnAsyncFlushes(cpu_addr, size); ForEachBufferInRange(cpu_addr, size, [&](BufferId, Buffer& buffer) { DownloadBufferMemory(buffer, cpu_addr, size); }); } +template +void BufferCache

::WaitOnAsyncFlushes(VAddr cpu_addr, u64 size) { + bool must_wait = false; + ForEachInOverlapCounter(async_downloads, cpu_addr, size, + [&](VAddr, VAddr, int) { must_wait = true; }); + bool must_release = false; + ForEachInRangeSet(pending_ranges, cpu_addr, size, [&](VAddr, VAddr) { must_release = true; }); + if (must_release) { + std::function tmp([]() {}); + rasterizer.SignalFence(std::move(tmp)); + } + if (must_wait || must_release) { + rasterizer.ReleaseFences(); + } +} + template void BufferCache

::ClearDownload(IntervalType subtract_interval) { + async_downloads -= std::make_pair(subtract_interval, std::numeric_limits::max()); uncommitted_ranges.subtract(subtract_interval); - for (auto& interval_set : async_downloads) { - interval_set.subtract(subtract_interval); - } + pending_ranges.subtract(subtract_interval); for (auto& interval_set : committed_ranges) { interval_set.subtract(subtract_interval); } @@ -132,6 +152,7 @@ bool BufferCache

::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am } const IntervalType subtract_interval{*cpu_dest_address, *cpu_dest_address + amount}; + WaitOnAsyncFlushes(*cpu_src_address, static_cast(amount)); ClearDownload(subtract_interval); BufferId buffer_a; @@ -162,6 +183,7 @@ bool BufferCache

::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am tmp_intervals.push_back(add_interval); if (is_high_accuracy) { uncommitted_ranges.add(add_interval); + pending_ranges.add(add_interval); } }; ForEachInRangeSet(common_ranges, *cpu_src_address, amount, mirror); @@ -413,18 +435,15 @@ template void BufferCache

::FlushCachedWrites() { cached_write_buffer_ids.clear(); memory_tracker.FlushCachedWrites(); - /*for (auto& interval : cached_ranges) { - VAddr cpu_addr = interval.lower(); - const std::size_t size = interval.upper() - interval.lower(); - memory_tracker.FlushCachedWrites(cpu_addr, size); - // common_ranges.subtract(interval); - }*/ + for (auto& interval : cached_ranges) { + ClearDownload(interval); + } cached_ranges.clear(); } template bool BufferCache

::HasUncommittedFlushes() const noexcept { - return !uncommitted_ranges.empty() || !committed_ranges.empty() || !pending_queries.empty(); + return !uncommitted_ranges.empty() || !committed_ranges.empty(); } template @@ -437,8 +456,11 @@ void BufferCache

::AccumulateFlushes() { template bool BufferCache

::ShouldWaitAsyncFlushes() const noexcept { - return (!async_buffers.empty() && async_buffers.front().has_value()) || - (!query_async_buffers.empty() && query_async_buffers.front().has_value()); + if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { + return (!async_buffers.empty() && async_buffers.front().has_value()); + } else { + return false; + } } template @@ -446,11 +468,14 @@ void BufferCache

::CommitAsyncFlushesHigh() { AccumulateFlushes(); if (committed_ranges.empty()) { - async_buffers.emplace_back(std::optional{}); + if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { + async_buffers.emplace_back(std::optional{}); + } return; } MICROPROFILE_SCOPE(GPU_DownloadMemory); + pending_ranges.clear(); auto it = committed_ranges.begin(); while (it != committed_ranges.end()) { auto& current_intervals = *it; @@ -491,7 +516,7 @@ void BufferCache

::CommitAsyncFlushesHigh() { buffer_id, }); // Align up to avoid cache conflicts - constexpr u64 align = 8ULL; + constexpr u64 align = 64ULL; constexpr u64 mask = ~(align - 1ULL); total_size_bytes += (new_size + align - 1) & mask; largest_copy = std::max(largest_copy, new_size); @@ -504,7 +529,9 @@ void BufferCache

::CommitAsyncFlushesHigh() { } committed_ranges.clear(); if (downloads.empty()) { - async_buffers.emplace_back(std::optional{}); + if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { + async_buffers.emplace_back(std::optional{}); + } return; } if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { @@ -520,99 +547,54 @@ void BufferCache

::CommitAsyncFlushesHigh() { second_copy.src_offset = static_cast(buffer.CpuAddr()) + copy.src_offset; VAddr orig_cpu_addr = static_cast(second_copy.src_offset); const IntervalType base_interval{orig_cpu_addr, orig_cpu_addr + copy.size}; - new_async_range.add(base_interval); + async_downloads += std::make_pair(base_interval, 1); runtime.CopyBuffer(download_staging.buffer, buffer, copies, false); normalized_copies.push_back(second_copy); } - async_downloads.emplace_back(std::move(new_async_range)); + runtime.PostCopyBarrier(); pending_downloads.emplace_back(std::move(normalized_copies)); async_buffers.emplace_back(download_staging); } else { - const std::span immediate_buffer = ImmediateBuffer(largest_copy); - for (const auto& [copy, buffer_id] : downloads) { - Buffer& buffer = slot_buffers[buffer_id]; - buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size)); - const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset; - cpu_memory.WriteBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size); - } - } -} - -template -void BufferCache

::CommitAsyncQueries() { - if (pending_queries.empty()) { - query_async_buffers.emplace_back(std::optional{}); - return; - } - - MICROPROFILE_SCOPE(GPU_DownloadMemory); - boost::container::small_vector, 8> downloads; - u64 total_size_bytes = 0; - u64 largest_copy = 0; - do { - has_deleted_buffers = false; - downloads.clear(); - total_size_bytes = 0; - largest_copy = 0; - for (const auto& query_info : pending_queries) { - const std::size_t size = query_info.second; - const VAddr cpu_addr = query_info.first; - const BufferId buffer_id = FindBuffer(cpu_addr, static_cast(size)); - Buffer& buffer = slot_buffers[buffer_id]; - if (has_deleted_buffers) { - break; + if constexpr (USE_MEMORY_MAPS) { + auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes); + runtime.PreCopyBarrier(); + for (auto& [copy, buffer_id] : downloads) { + // Have in mind the staging buffer offset for the copy + copy.dst_offset += download_staging.offset; + const std::array copies{copy}; + runtime.CopyBuffer(download_staging.buffer, slot_buffers[buffer_id], copies, false); + } + runtime.PostCopyBarrier(); + runtime.Finish(); + for (const auto& [copy, buffer_id] : downloads) { + const Buffer& buffer = slot_buffers[buffer_id]; + const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset; + // Undo the modified offset + const u64 dst_offset = copy.dst_offset - download_staging.offset; + const u8* read_mapped_memory = download_staging.mapped_span.data() + dst_offset; + cpu_memory.WriteBlockUnsafe(cpu_addr, read_mapped_memory, copy.size); + } + } else { + const std::span immediate_buffer = ImmediateBuffer(largest_copy); + for (const auto& [copy, buffer_id] : downloads) { + Buffer& buffer = slot_buffers[buffer_id]; + buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size)); + const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset; + cpu_memory.WriteBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size); } - downloads.push_back({ - BufferCopy{ - .src_offset = buffer.Offset(cpu_addr), - .dst_offset = total_size_bytes, - .size = size, - }, - buffer_id, - }); - constexpr u64 align = 8ULL; - constexpr u64 mask = ~(align - 1ULL); - total_size_bytes += (size + align - 1) & mask; - largest_copy = std::max(largest_copy, size); - } - } while (has_deleted_buffers); - pending_queries.clear(); - if (downloads.empty()) { - query_async_buffers.push_back(std::optional{}); - return; - } - if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { - auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes, true); - boost::container::small_vector normalized_copies; - runtime.PreCopyBarrier(); - for (auto& [copy, buffer_id] : downloads) { - // Have in mind the staging buffer offset for the copy - copy.dst_offset += download_staging.offset; - const std::array copies{copy}; - const Buffer& buffer = slot_buffers[buffer_id]; - BufferCopy second_copy{copy}; - second_copy.src_offset = static_cast(buffer.CpuAddr()) + second_copy.src_offset; - runtime.CopyBuffer(download_staging.buffer, buffer, copies, false); - normalized_copies.push_back(second_copy); } - committed_queries.emplace_back(std::move(normalized_copies)); - query_async_buffers.emplace_back(download_staging); - } else { - query_async_buffers.push_back(std::optional{}); } } template void BufferCache

::CommitAsyncFlushes() { CommitAsyncFlushesHigh(); - CommitAsyncQueries(); } template void BufferCache

::PopAsyncFlushes() { MICROPROFILE_SCOPE(GPU_DownloadMemory); PopAsyncBuffers(); - PopAsyncQueries(); } template @@ -627,59 +609,34 @@ void BufferCache

::PopAsyncBuffers() { if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { auto& downloads = pending_downloads.front(); auto& async_buffer = async_buffers.front(); - auto& async_range = async_downloads.front(); u8* base = async_buffer->mapped_span.data(); const size_t base_offset = async_buffer->offset; for (const auto& copy : downloads) { const VAddr cpu_addr = static_cast(copy.src_offset); const u64 dst_offset = copy.dst_offset - base_offset; const u8* read_mapped_memory = base + dst_offset; - ForEachInRangeSet(async_range, cpu_addr, copy.size, [&](VAddr start, VAddr end) { - const size_t diff = start - cpu_addr; - const size_t new_size = end - start; - cpu_memory.WriteBlockUnsafe(start, &read_mapped_memory[diff], new_size); - const IntervalType base_interval{start, end}; - common_ranges.subtract(base_interval); - }); + ForEachInOverlapCounter( + async_downloads, cpu_addr, copy.size, [&](VAddr start, VAddr end, int count) { + cpu_memory.WriteBlockUnsafe(start, &read_mapped_memory[start - cpu_addr], + end - start); + if (count == 1) { + const IntervalType base_interval{start, end}; + common_ranges.subtract(base_interval); + } + }); + async_downloads -= std::make_pair(IntervalType(cpu_addr, cpu_addr + copy.size), 1); } runtime.FreeDeferredStagingBuffer(*async_buffer); async_buffers.pop_front(); pending_downloads.pop_front(); - async_downloads.pop_front(); - } -} - -template -void BufferCache

::PopAsyncQueries() { - if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { - if (query_async_buffers.empty()) { - return; - } - if (!query_async_buffers.front().has_value()) { - query_async_buffers.pop_front(); - return; - } - auto& downloads = committed_queries.front(); - auto& async_buffer = query_async_buffers.front(); - flushed_queries.clear(); - u8* base = async_buffer->mapped_span.data(); - const size_t base_offset = async_buffer->offset; - for (const auto& copy : downloads) { - const size_t dst_offset = copy.dst_offset - base_offset; - const u8* read_mapped_memory = base + dst_offset; - u64 new_value{}; - std::memcpy(&new_value, read_mapped_memory, copy.size); - flushed_queries.push_back(new_value); - } - runtime.FreeDeferredStagingBuffer(*async_buffer); - committed_queries.pop_front(); - query_async_buffers.pop_front(); } } template bool BufferCache

::IsRegionGpuModified(VAddr addr, size_t size) { - return memory_tracker.IsRegionGpuModified(addr, size); + bool is_dirty = false; + ForEachInRangeSet(common_ranges, addr, size, [&](VAddr, VAddr) { is_dirty = true; }); + return is_dirty; } template @@ -1232,16 +1189,18 @@ void BufferCache

::UpdateComputeTextureBuffers() { } template -void BufferCache

::MarkWrittenBuffer(BufferId, VAddr cpu_addr, u32 size) { +void BufferCache

::MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size) { memory_tracker.MarkRegionAsGpuModified(cpu_addr, size); + if (memory_tracker.IsRegionCpuModified(cpu_addr, size)) { + SynchronizeBuffer(slot_buffers[buffer_id], cpu_addr, size); + } + const IntervalType base_interval{cpu_addr, cpu_addr + size}; common_ranges.add(base_interval); - for (auto& interval_set : async_downloads) { - interval_set.subtract(base_interval); - } if (Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::High) { uncommitted_ranges.add(base_interval); + pending_ranges.add(base_interval); } } @@ -1530,7 +1489,9 @@ bool BufferCache

::InlineMemory(VAddr dest_address, size_t copy_size, if (!is_dirty) { return false; } - if (!IsRegionGpuModified(dest_address, copy_size)) { + VAddr aligned_start = Common::AlignDown(dest_address, YUZU_PAGESIZE); + VAddr aligned_end = Common::AlignUp(dest_address + copy_size, YUZU_PAGESIZE); + if (!IsRegionGpuModified(aligned_start, aligned_end - aligned_start)) { return false; } diff --git a/src/video_core/buffer_cache/buffer_cache_base.h b/src/video_core/buffer_cache/buffer_cache_base.h index 4b3677da3..6f29cba25 100644 --- a/src/video_core/buffer_cache/buffer_cache_base.h +++ b/src/video_core/buffer_cache/buffer_cache_base.h @@ -17,6 +17,7 @@ #include #undef BOOST_NO_MT #include +#include #include #include @@ -44,8 +45,7 @@ namespace boost { template -class fast_pool_allocator; +class fast_pool_allocator; } namespace VideoCommon { @@ -123,6 +123,31 @@ class BufferCache : public VideoCommon::ChannelSetupCaches; using IntervalType = typename IntervalSet::interval_type; + template + struct counter_add_functor : public boost::icl::identity_based_inplace_combine { + // types + typedef counter_add_functor type; + typedef boost::icl::identity_based_inplace_combine base_type; + + // public member functions + void operator()(Type& current, const Type& added) const { + current += added; + if (current < base_type::identity_element()) { + current = base_type::identity_element(); + } + } + + // public static functions + static void version(Type&){}; + }; + + using OverlapCombine = ICL_COMBINE_INSTANCE(counter_add_functor, int); + using OverlapSection = ICL_SECTION_INSTANCE(boost::icl::inter_section, int); + using OverlapCounter = + boost::icl::split_interval_map; + struct Empty {}; struct OverlapResult { @@ -219,12 +244,9 @@ public: /// Commit asynchronous downloads void CommitAsyncFlushes(); void CommitAsyncFlushesHigh(); - void CommitAsyncQueries(); /// Pop asynchronous downloads void PopAsyncFlushes(); - - void PopAsyncQueries(); void PopAsyncBuffers(); bool DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount); @@ -302,6 +324,34 @@ private: } } + template + void ForEachInOverlapCounter(OverlapCounter& current_range, VAddr cpu_addr, u64 size, + Func&& func) { + const VAddr start_address = cpu_addr; + const VAddr end_address = start_address + size; + const IntervalType search_interval{start_address, end_address}; + auto it = current_range.lower_bound(search_interval); + if (it == current_range.end()) { + return; + } + auto end_it = current_range.upper_bound(search_interval); + for (; it != end_it; it++) { + auto& inter = it->first; + VAddr inter_addr_end = inter.upper(); + VAddr inter_addr = inter.lower(); + if (inter_addr_end > end_address) { + inter_addr_end = end_address; + } + if (inter_addr < start_address) { + inter_addr = start_address; + } + if (it->second <= 0) { + __debugbreak(); + } + func(inter_addr, inter_addr_end, it->second); + } + } + static bool IsRangeGranular(VAddr cpu_addr, size_t size) { return (cpu_addr & ~Core::Memory::YUZU_PAGEMASK) == ((cpu_addr + size) & ~Core::Memory::YUZU_PAGEMASK); @@ -309,6 +359,8 @@ private: void RunGarbageCollector(); + void WaitOnAsyncFlushes(VAddr cpu_addr, u64 size); + void BindHostIndexBuffer(); void BindHostVertexBuffers(); @@ -474,10 +526,11 @@ private: IntervalSet uncommitted_ranges; IntervalSet common_ranges; IntervalSet cached_ranges; + IntervalSet pending_ranges; std::deque committed_ranges; // Async Buffers - std::deque async_downloads; + OverlapCounter async_downloads; std::deque> async_buffers; std::deque> pending_downloads; std::optional current_buffer; -- cgit v1.2.3 From ed4553806a08e4130fcea36230985cb74d1b326a Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Sat, 22 Apr 2023 20:10:40 +0200 Subject: Implement Async downloads in normal and fix a few issues. --- src/video_core/buffer_cache/buffer_cache.h | 69 +++++++++++++------------ src/video_core/buffer_cache/buffer_cache_base.h | 25 +++++++-- src/video_core/buffer_cache/word_manager.h | 6 ++- 3 files changed, 61 insertions(+), 39 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 43fe5b080..faa48a678 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -22,6 +22,8 @@ BufferCache

::BufferCache(VideoCore::RasterizerInterface& rasterizer_, void(slot_buffers.insert(runtime, NullBufferParams{})); common_ranges.clear(); + active_async_buffers = IMPLEMENTS_ASYNC_DOWNLOADS && !Settings::IsGPULevelHigh(); + if (!runtime.CanReportMemoryUsage()) { minimum_memory = DEFAULT_EXPECTED_MEMORY; critical_memory = DEFAULT_CRITICAL_MEMORY; @@ -72,6 +74,8 @@ void BufferCache

::TickFrame() { uniform_cache_hits[0] = 0; uniform_cache_shots[0] = 0; + active_async_buffers = IMPLEMENTS_ASYNC_DOWNLOADS && !Settings::IsGPULevelHigh(); + const bool skip_preferred = hits * 256 < shots * 251; uniform_buffer_skip_cache_size = skip_preferred ? DEFAULT_SKIP_CACHE_SIZE : 0; @@ -130,7 +134,7 @@ void BufferCache

::WaitOnAsyncFlushes(VAddr cpu_addr, u64 size) { template void BufferCache

::ClearDownload(IntervalType subtract_interval) { - async_downloads -= std::make_pair(subtract_interval, std::numeric_limits::max()); + RemoveEachInOverlapCounter(async_downloads, subtract_interval, -1024); uncommitted_ranges.subtract(subtract_interval); pending_ranges.subtract(subtract_interval); for (auto& interval_set : committed_ranges) { @@ -173,18 +177,14 @@ bool BufferCache

::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am }}; boost::container::small_vector tmp_intervals; - const bool is_high_accuracy = - Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::High; auto mirror = [&](VAddr base_address, VAddr base_address_end) { const u64 size = base_address_end - base_address; const VAddr diff = base_address - *cpu_src_address; const VAddr new_base_address = *cpu_dest_address + diff; const IntervalType add_interval{new_base_address, new_base_address + size}; tmp_intervals.push_back(add_interval); - if (is_high_accuracy) { - uncommitted_ranges.add(add_interval); - pending_ranges.add(add_interval); - } + uncommitted_ranges.add(add_interval); + pending_ranges.add(add_interval); }; ForEachInRangeSet(common_ranges, *cpu_src_address, amount, mirror); // This subtraction in this order is important for overlapping copies. @@ -468,7 +468,7 @@ void BufferCache

::CommitAsyncFlushesHigh() { AccumulateFlushes(); if (committed_ranges.empty()) { - if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { + if (active_async_buffers) { async_buffers.emplace_back(std::optional{}); } return; @@ -529,31 +529,33 @@ void BufferCache

::CommitAsyncFlushesHigh() { } committed_ranges.clear(); if (downloads.empty()) { - if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { + if (active_async_buffers) { async_buffers.emplace_back(std::optional{}); } return; } - if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { - auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes, true); - boost::container::small_vector normalized_copies; - IntervalSet new_async_range{}; - runtime.PreCopyBarrier(); - for (auto& [copy, buffer_id] : downloads) { - copy.dst_offset += download_staging.offset; - const std::array copies{copy}; - BufferCopy second_copy{copy}; - Buffer& buffer = slot_buffers[buffer_id]; - second_copy.src_offset = static_cast(buffer.CpuAddr()) + copy.src_offset; - VAddr orig_cpu_addr = static_cast(second_copy.src_offset); - const IntervalType base_interval{orig_cpu_addr, orig_cpu_addr + copy.size}; - async_downloads += std::make_pair(base_interval, 1); - runtime.CopyBuffer(download_staging.buffer, buffer, copies, false); - normalized_copies.push_back(second_copy); + if (active_async_buffers) { + if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { + auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes, true); + boost::container::small_vector normalized_copies; + IntervalSet new_async_range{}; + runtime.PreCopyBarrier(); + for (auto& [copy, buffer_id] : downloads) { + copy.dst_offset += download_staging.offset; + const std::array copies{copy}; + BufferCopy second_copy{copy}; + Buffer& buffer = slot_buffers[buffer_id]; + second_copy.src_offset = static_cast(buffer.CpuAddr()) + copy.src_offset; + VAddr orig_cpu_addr = static_cast(second_copy.src_offset); + const IntervalType base_interval{orig_cpu_addr, orig_cpu_addr + copy.size}; + async_downloads += std::make_pair(base_interval, 1); + runtime.CopyBuffer(download_staging.buffer, buffer, copies, false); + normalized_copies.push_back(second_copy); + } + runtime.PostCopyBarrier(); + pending_downloads.emplace_back(std::move(normalized_copies)); + async_buffers.emplace_back(download_staging); } - runtime.PostCopyBarrier(); - pending_downloads.emplace_back(std::move(normalized_copies)); - async_buffers.emplace_back(download_staging); } else { if constexpr (USE_MEMORY_MAPS) { auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes); @@ -624,7 +626,8 @@ void BufferCache

::PopAsyncBuffers() { common_ranges.subtract(base_interval); } }); - async_downloads -= std::make_pair(IntervalType(cpu_addr, cpu_addr + copy.size), 1); + const IntervalType subtract_interval{cpu_addr, cpu_addr + copy.size}; + RemoveEachInOverlapCounter(async_downloads, subtract_interval, -1); } runtime.FreeDeferredStagingBuffer(*async_buffer); async_buffers.pop_front(); @@ -1198,10 +1201,8 @@ void BufferCache

::MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 s const IntervalType base_interval{cpu_addr, cpu_addr + size}; common_ranges.add(base_interval); - if (Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::High) { - uncommitted_ranges.add(base_interval); - pending_ranges.add(base_interval); - } + uncommitted_ranges.add(base_interval); + pending_ranges.add(base_interval); } template @@ -1542,7 +1543,7 @@ void BufferCache

::DownloadBufferMemory(Buffer& buffer, VAddr cpu_addr, u64 si .size = new_size, }); // Align up to avoid cache conflicts - constexpr u64 align = 8ULL; + constexpr u64 align = 64ULL; constexpr u64 mask = ~(align - 1ULL); total_size_bytes += (new_size + align - 1) & mask; largest_copy = std::max(largest_copy, new_size); diff --git a/src/video_core/buffer_cache/buffer_cache_base.h b/src/video_core/buffer_cache/buffer_cache_base.h index 6f29cba25..d4914a8f5 100644 --- a/src/video_core/buffer_cache/buffer_cache_base.h +++ b/src/video_core/buffer_cache/buffer_cache_base.h @@ -345,13 +345,30 @@ private: if (inter_addr < start_address) { inter_addr = start_address; } - if (it->second <= 0) { - __debugbreak(); - } func(inter_addr, inter_addr_end, it->second); } } + void RemoveEachInOverlapCounter(OverlapCounter& current_range, const IntervalType search_interval, int subtract_value) { + bool any_removals = false; + current_range.add(std::make_pair(search_interval, subtract_value)); + do { + any_removals = false; + auto it = current_range.lower_bound(search_interval); + if (it == current_range.end()) { + return; + } + auto end_it = current_range.upper_bound(search_interval); + for (; it != end_it; it++) { + if (it->second <= 0) { + any_removals = true; + current_range.erase(it); + break; + } + } + } while (any_removals); + } + static bool IsRangeGranular(VAddr cpu_addr, size_t size) { return (cpu_addr & ~Core::Memory::YUZU_PAGEMASK) == ((cpu_addr + size) & ~Core::Memory::YUZU_PAGEMASK); @@ -554,6 +571,8 @@ private: u64 minimum_memory = 0; u64 critical_memory = 0; + bool active_async_buffers = false; + std::array> PAGE_BITS)> page_table; }; diff --git a/src/video_core/buffer_cache/word_manager.h b/src/video_core/buffer_cache/word_manager.h index 782951fe7..21729752b 100644 --- a/src/video_core/buffer_cache/word_manager.h +++ b/src/video_core/buffer_cache/word_manager.h @@ -273,7 +273,7 @@ public: untracked_words[word_index] &= ~bits; NotifyRasterizer(word_index, current_bits, ~u64{0}); } - const u64 word = current_word; + const u64 word = current_word & ~(type == Type::GPU ? untracked_words[word_index] : 0); u64 page = page_begin; page_begin = 0; @@ -321,6 +321,7 @@ public: [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept { static_assert(type != Type::Untracked); + const u64* const untracked_words = Array(); const u64* const state_words = Array(); const u64 num_query_words = size / BYTES_PER_WORD + 1; const u64 word_begin = offset / BYTES_PER_WORD; @@ -328,7 +329,8 @@ public: const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE); u64 page_index = (offset / BYTES_PER_PAGE) % PAGES_PER_WORD; for (u64 word_index = word_begin; word_index < word_end; ++word_index, page_index = 0) { - const u64 word = state_words[word_index]; + const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0; + const u64 word = state_words[word_index] & ~off_word; if (word == 0) { continue; } -- cgit v1.2.3 From 80480fe3f70997d0520ef8bf38f5fe530c54f8e5 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Sun, 23 Apr 2023 03:58:16 +0200 Subject: Clang: format and ficx compile errors. --- src/video_core/buffer_cache/buffer_base.h | 26 +++---- src/video_core/buffer_cache/buffer_cache.h | 4 +- src/video_core/buffer_cache/buffer_cache_base.h | 32 ++++----- src/video_core/buffer_cache/memory_tracker_base.h | 82 +++++++++++++---------- src/video_core/renderer_vulkan/vk_buffer_cache.h | 2 +- 5 files changed, 78 insertions(+), 68 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/buffer_cache/buffer_base.h b/src/video_core/buffer_cache/buffer_base.h index 66d8bb43c..095f79387 100644 --- a/src/video_core/buffer_cache/buffer_base.h +++ b/src/video_core/buffer_cache/buffer_base.h @@ -55,53 +55,53 @@ public: [[nodiscard]] std::pair ModifiedCpuRegion(VAddr query_cpu_addr, u64 query_size) const noexcept { const u64 offset = query_cpu_addr - cpu_addr; - return word_manager.ModifiedRegion(offset, query_size); + return word_manager.template ModifiedRegion(offset, query_size); } /// Returns the inclusive GPU modified range in a begin end pair [[nodiscard]] std::pair ModifiedGpuRegion(VAddr query_cpu_addr, u64 query_size) const noexcept { const u64 offset = query_cpu_addr - cpu_addr; - return word_manager.ModifiedRegion(offset, query_size); + return word_manager.template ModifiedRegion(offset, query_size); } /// Returns true if a region has been modified from the CPU [[nodiscard]] bool IsRegionCpuModified(VAddr query_cpu_addr, u64 query_size) const noexcept { const u64 offset = query_cpu_addr - cpu_addr; - return word_manager.IsRegionModified(offset, query_size); + return word_manager.template IsRegionModified(offset, query_size); } /// Returns true if a region has been modified from the GPU [[nodiscard]] bool IsRegionGpuModified(VAddr query_cpu_addr, u64 query_size) const noexcept { const u64 offset = query_cpu_addr - cpu_addr; - return word_manager.IsRegionModified(offset, query_size); + return word_manager.template IsRegionModified(offset, query_size); } /// Mark region as CPU modified, notifying the rasterizer about this change void MarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 size) { - word_manager.ChangeRegionState(dirty_cpu_addr, size); + word_manager.template ChangeRegionState(dirty_cpu_addr, size); } /// Unmark region as CPU modified, notifying the rasterizer about this change void UnmarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 size) { - word_manager.ChangeRegionState(dirty_cpu_addr, size); + word_manager.template ChangeRegionState(dirty_cpu_addr, size); } /// Mark region as modified from the host GPU void MarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 size) noexcept { - word_manager.ChangeRegionState(dirty_cpu_addr, size); + word_manager.template ChangeRegionState(dirty_cpu_addr, size); } /// Unmark region as modified from the host GPU void UnmarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 size) noexcept { - word_manager.ChangeRegionState(dirty_cpu_addr, size); + word_manager.template ChangeRegionState(dirty_cpu_addr, size); } /// Mark region as modified from the CPU /// but don't mark it as modified until FlusHCachedWrites is called. void CachedCpuWrite(VAddr dirty_cpu_addr, u64 size) { flags |= BufferFlagBits::CachedWrites; - word_manager.ChangeRegionState(dirty_cpu_addr, size); + word_manager.template ChangeRegionState(dirty_cpu_addr, size); } /// Flushes cached CPU writes, and notify the rasterizer about the deltas @@ -113,24 +113,24 @@ public: /// Call 'func' for each CPU modified range and unmark those pages as CPU modified template void ForEachUploadRange(VAddr query_cpu_range, u64 size, Func&& func) { - word_manager.ForEachModifiedRange(query_cpu_range, size, true, func); + word_manager.template ForEachModifiedRange(query_cpu_range, size, true, func); } /// Call 'func' for each GPU modified range and unmark those pages as GPU modified template void ForEachDownloadRange(VAddr query_cpu_range, u64 size, bool clear, Func&& func) { - word_manager.ForEachModifiedRange(query_cpu_range, size, clear, func); + word_manager.template ForEachModifiedRange(query_cpu_range, size, clear, func); } template void ForEachDownloadRangeAndClear(VAddr query_cpu_range, u64 size, Func&& func) { - word_manager.ForEachModifiedRange(query_cpu_range, size, true, func); + word_manager.template ForEachModifiedRange(query_cpu_range, size, true, func); } /// Call 'func' for each GPU modified range and unmark those pages as GPU modified template void ForEachDownloadRange(Func&& func) { - word_manager.ForEachModifiedRange(cpu_addr, SizeBytes(), true, func); + word_manager.template ForEachModifiedRange(cpu_addr, SizeBytes(), true, func); } /// Mark buffer as picked diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index faa48a678..8fed08dab 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -1395,10 +1395,10 @@ bool BufferCache

::SynchronizeBufferNoModified(Buffer& buffer, VAddr cpu_addr, auto make_copies = [&] { for (auto& interval : found_sets) { const std::size_t sub_size = interval.upper() - interval.lower(); - const VAddr cpu_addr = interval.lower(); + const VAddr cpu_addr_ = interval.lower(); copies.push_back(BufferCopy{ .src_offset = total_size_bytes, - .dst_offset = cpu_addr - buffer.CpuAddr(), + .dst_offset = cpu_addr_ - buffer.CpuAddr(), .size = sub_size, }); total_size_bytes += sub_size; diff --git a/src/video_core/buffer_cache/buffer_cache_base.h b/src/video_core/buffer_cache/buffer_cache_base.h index d4914a8f5..acff22d4f 100644 --- a/src/video_core/buffer_cache/buffer_cache_base.h +++ b/src/video_core/buffer_cache/buffer_cache_base.h @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -16,10 +17,13 @@ #define BOOST_NO_MT #include #undef BOOST_NO_MT +#include +#include #include #include #include #include +#include #include "common/common_types.h" #include "common/div_ceil.h" @@ -42,7 +46,6 @@ #include "video_core/texture_cache/slot_vector.h" #include "video_core/texture_cache/types.h" - namespace boost { template class fast_pool_allocator; @@ -116,11 +119,10 @@ class BufferCache : public VideoCommon::ChannelSetupCaches; + using IntervalCompare = std::less; + using IntervalInstance = boost::icl::interval_type_default; + using IntervalAllocator = boost::fast_pool_allocator; + using IntervalSet = boost::icl::interval_set; using IntervalType = typename IntervalSet::interval_type; template @@ -141,12 +143,9 @@ class BufferCache : public VideoCommon::ChannelSetupCaches; + using OverlapCombine = counter_add_functor; + using OverlapSection = boost::icl::inter_section; + using OverlapCounter = boost::icl::split_interval_map; struct Empty {}; @@ -262,7 +261,8 @@ public: /// Return true when a CPU region is modified from the CPU [[nodiscard]] bool IsRegionCpuModified(VAddr addr, size_t size); - void SetDrawIndirect(const Tegra::Engines::DrawManager::IndirectParams* current_draw_indirect_) { + void SetDrawIndirect( + const Tegra::Engines::DrawManager::IndirectParams* current_draw_indirect_) { current_draw_indirect = current_draw_indirect_; } @@ -349,7 +349,8 @@ private: } } - void RemoveEachInOverlapCounter(OverlapCounter& current_range, const IntervalType search_interval, int subtract_value) { + void RemoveEachInOverlapCounter(OverlapCounter& current_range, + const IntervalType search_interval, int subtract_value) { bool any_removals = false; current_range.add(std::make_pair(search_interval, subtract_value)); do { @@ -469,7 +470,8 @@ private: void NotifyBufferDeletion(); - [[nodiscard]] Binding StorageBufferBinding(GPUVAddr ssbo_addr, u32 cbuf_index, bool is_written) const; + [[nodiscard]] Binding StorageBufferBinding(GPUVAddr ssbo_addr, u32 cbuf_index, + bool is_written) const; [[nodiscard]] TextureBufferBinding GetTextureBufferBinding(GPUVAddr gpu_addr, u32 size, PixelFormat format); diff --git a/src/video_core/buffer_cache/memory_tracker_base.h b/src/video_core/buffer_cache/memory_tracker_base.h index 93bd779c9..016d8430f 100644 --- a/src/video_core/buffer_cache/memory_tracker_base.h +++ b/src/video_core/buffer_cache/memory_tracker_base.h @@ -35,67 +35,71 @@ public: /// Returns the inclusive CPU modified range in a begin end pair [[nodiscard]] std::pair ModifiedCpuRegion(VAddr query_cpu_addr, u64 query_size) noexcept { - return IteratePairs(query_cpu_addr, query_size, - [](Manager* manager, u64 offset, size_t size) { - return manager->ModifiedRegion(offset, size); - }); + return IteratePairs( + query_cpu_addr, query_size, [](Manager* manager, u64 offset, size_t size) { + return manager->template ModifiedRegion(offset, size); + }); } /// Returns the inclusive GPU modified range in a begin end pair [[nodiscard]] std::pair ModifiedGpuRegion(VAddr query_cpu_addr, u64 query_size) noexcept { - return IteratePairs(query_cpu_addr, query_size, - [](Manager* manager, u64 offset, size_t size) { - return manager->ModifiedRegion(offset, size); - }); + return IteratePairs( + query_cpu_addr, query_size, [](Manager* manager, u64 offset, size_t size) { + return manager->template ModifiedRegion(offset, size); + }); } /// Returns true if a region has been modified from the CPU [[nodiscard]] bool IsRegionCpuModified(VAddr query_cpu_addr, u64 query_size) noexcept { - return IteratePages(query_cpu_addr, query_size, - [](Manager* manager, u64 offset, size_t size) { - return manager->IsRegionModified(offset, size); - }); + return IteratePages( + query_cpu_addr, query_size, [](Manager* manager, u64 offset, size_t size) { + return manager->template IsRegionModified(offset, size); + }); } /// Returns true if a region has been modified from the GPU [[nodiscard]] bool IsRegionGpuModified(VAddr query_cpu_addr, u64 query_size) noexcept { - return IteratePages(query_cpu_addr, query_size, - [](Manager* manager, u64 offset, size_t size) { - return manager->IsRegionModified(offset, size); - }); + return IteratePages( + query_cpu_addr, query_size, [](Manager* manager, u64 offset, size_t size) { + return manager->template IsRegionModified(offset, size); + }); } /// Mark region as CPU modified, notifying the rasterizer about this change void MarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 query_size) { - IteratePages( - dirty_cpu_addr, query_size, [](Manager* manager, u64 offset, size_t size) { - manager->ChangeRegionState(manager->GetCpuAddr() + offset, size); - }); + IteratePages(dirty_cpu_addr, query_size, + [](Manager* manager, u64 offset, size_t size) { + manager->template ChangeRegionState( + manager->GetCpuAddr() + offset, size); + }); } /// Unmark region as CPU modified, notifying the rasterizer about this change void UnmarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 query_size) { - IteratePages( - dirty_cpu_addr, query_size, [](Manager* manager, u64 offset, size_t size) { - manager->ChangeRegionState(manager->GetCpuAddr() + offset, size); - }); + IteratePages(dirty_cpu_addr, query_size, + [](Manager* manager, u64 offset, size_t size) { + manager->template ChangeRegionState( + manager->GetCpuAddr() + offset, size); + }); } /// Mark region as modified from the host GPU void MarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 query_size) noexcept { - IteratePages( - dirty_cpu_addr, query_size, [](Manager* manager, u64 offset, size_t size) { - manager->ChangeRegionState(manager->GetCpuAddr() + offset, size); - }); + IteratePages(dirty_cpu_addr, query_size, + [](Manager* manager, u64 offset, size_t size) { + manager->template ChangeRegionState( + manager->GetCpuAddr() + offset, size); + }); } /// Unmark region as modified from the host GPU void UnmarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 query_size) noexcept { - IteratePages( - dirty_cpu_addr, query_size, [](Manager* manager, u64 offset, size_t size) { - manager->ChangeRegionState(manager->GetCpuAddr() + offset, size); - }); + IteratePages(dirty_cpu_addr, query_size, + [](Manager* manager, u64 offset, size_t size) { + manager->template ChangeRegionState( + manager->GetCpuAddr() + offset, size); + }); } /// Mark region as modified from the CPU @@ -104,7 +108,7 @@ public: IteratePages( dirty_cpu_addr, query_size, [this](Manager* manager, u64 offset, size_t size) { const VAddr cpu_address = manager->GetCpuAddr() + offset; - manager->ChangeRegionState(cpu_address, size); + manager->template ChangeRegionState(cpu_address, size); cached_pages.insert(static_cast(cpu_address >> HIGHER_PAGE_BITS)); }); } @@ -128,7 +132,7 @@ public: void ForEachUploadRange(VAddr query_cpu_range, u64 query_size, Func&& func) { IteratePages(query_cpu_range, query_size, [&func](Manager* manager, u64 offset, size_t size) { - manager->ForEachModifiedRange( + manager->template ForEachModifiedRange( manager->GetCpuAddr() + offset, size, true, func); }); } @@ -138,7 +142,7 @@ public: void ForEachDownloadRange(VAddr query_cpu_range, u64 query_size, bool clear, Func&& func) { IteratePages(query_cpu_range, query_size, [&func, clear](Manager* manager, u64 offset, size_t size) { - manager->ForEachModifiedRange( + manager->template ForEachModifiedRange( manager->GetCpuAddr() + offset, size, clear, func); }); } @@ -147,7 +151,7 @@ public: void ForEachDownloadRangeAndClear(VAddr query_cpu_range, u64 query_size, Func&& func) { IteratePages(query_cpu_range, query_size, [&func](Manager* manager, u64 offset, size_t size) { - manager->ForEachModifiedRange( + manager->template ForEachModifiedRange( manager->GetCpuAddr() + offset, size, true, func); }); } @@ -218,7 +222,11 @@ private: page_offset = 0; remaining_size -= copy_amount; } - return begin < end ? std::make_pair(begin, end) : std::make_pair(0ULL, 0ULL); + if (begin < end) { + return std::make_pair(begin, end); + } else { + return std::make_pair(0ULL, 0ULL); + } } void CreateRegion(std::size_t page_index) { diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h index 05968e6a6..879f1ed94 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.h +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h @@ -3,7 +3,7 @@ #pragma once -#include "video_core/buffer_cache/buffer_cache.h" +#include "video_core/buffer_cache/buffer_cache_base.h" #include "video_core/buffer_cache/memory_tracker_base.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/renderer_vulkan/vk_compute_pass.h" -- cgit v1.2.3 From f8d31d1ae18c27be37590a7d15e5e7b691bed14f Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Sun, 23 Apr 2023 21:03:50 +0200 Subject: Buffer Cache: Release stagging buffers on tick frame --- src/video_core/buffer_cache/buffer_cache.h | 28 ++++++++++++++++++------- src/video_core/buffer_cache/buffer_cache_base.h | 6 +----- 2 files changed, 22 insertions(+), 12 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 8fed08dab..e5c626c36 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -22,7 +22,7 @@ BufferCache

::BufferCache(VideoCore::RasterizerInterface& rasterizer_, void(slot_buffers.insert(runtime, NullBufferParams{})); common_ranges.clear(); - active_async_buffers = IMPLEMENTS_ASYNC_DOWNLOADS && !Settings::IsGPULevelHigh(); + active_async_buffers = !Settings::IsGPULevelHigh(); if (!runtime.CanReportMemoryUsage()) { minimum_memory = DEFAULT_EXPECTED_MEMORY; @@ -74,7 +74,7 @@ void BufferCache

::TickFrame() { uniform_cache_hits[0] = 0; uniform_cache_shots[0] = 0; - active_async_buffers = IMPLEMENTS_ASYNC_DOWNLOADS && !Settings::IsGPULevelHigh(); + active_async_buffers = !Settings::IsGPULevelHigh(); const bool skip_preferred = hits * 256 < shots * 251; uniform_buffer_skip_cache_size = skip_preferred ? DEFAULT_SKIP_CACHE_SIZE : 0; @@ -88,6 +88,13 @@ void BufferCache

::TickFrame() { } ++frame_tick; delayed_destruction_ring.Tick(); + + if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { + for (auto& buffer : async_buffers_death_ring) { + runtime.FreeDeferredStagingBuffer(buffer); + } + async_buffers_death_ring.clear(); + } } template @@ -468,8 +475,10 @@ void BufferCache

::CommitAsyncFlushesHigh() { AccumulateFlushes(); if (committed_ranges.empty()) { - if (active_async_buffers) { - async_buffers.emplace_back(std::optional{}); + if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { + if (active_async_buffers) { + async_buffers.emplace_back(std::optional{}); + } } return; } @@ -529,8 +538,10 @@ void BufferCache

::CommitAsyncFlushesHigh() { } committed_ranges.clear(); if (downloads.empty()) { - if (active_async_buffers) { - async_buffers.emplace_back(std::optional{}); + if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { + if (active_async_buffers) { + async_buffers.emplace_back(std::optional{}); + } } return; } @@ -555,6 +566,9 @@ void BufferCache

::CommitAsyncFlushesHigh() { runtime.PostCopyBarrier(); pending_downloads.emplace_back(std::move(normalized_copies)); async_buffers.emplace_back(download_staging); + } else { + committed_ranges.clear(); + uncommitted_ranges.clear(); } } else { if constexpr (USE_MEMORY_MAPS) { @@ -629,7 +643,7 @@ void BufferCache

::PopAsyncBuffers() { const IntervalType subtract_interval{cpu_addr, cpu_addr + copy.size}; RemoveEachInOverlapCounter(async_downloads, subtract_interval, -1); } - runtime.FreeDeferredStagingBuffer(*async_buffer); + async_buffers_death_ring.emplace_back(*async_buffer); async_buffers.pop_front(); pending_downloads.pop_front(); } diff --git a/src/video_core/buffer_cache/buffer_cache_base.h b/src/video_core/buffer_cache/buffer_cache_base.h index acff22d4f..75cb98ba3 100644 --- a/src/video_core/buffer_cache/buffer_cache_base.h +++ b/src/video_core/buffer_cache/buffer_cache_base.h @@ -554,11 +554,7 @@ private: std::deque> pending_downloads; std::optional current_buffer; - // queries - boost::container::small_vector, 8> pending_queries; - std::deque> committed_queries; - boost::container::small_vector flushed_queries; - std::deque> query_async_buffers; + std::deque async_buffers_death_ring; size_t immediate_buffer_capacity = 0; Common::ScratchBuffer immediate_buffer_alloc; -- cgit v1.2.3 From d6f565e5da22ec6a6a77ffabd88e59f3a25bcc96 Mon Sep 17 00:00:00 2001 From: Fernando Sahmkow Date: Fri, 28 Apr 2023 23:54:54 +0200 Subject: BufferCache: Fixes and address feedback --- src/video_core/buffer_cache/buffer_base.h | 99 +----- src/video_core/buffer_cache/buffer_cache.h | 42 ++- src/video_core/buffer_cache/buffer_cache_base.h | 19 +- src/video_core/buffer_cache/memory_tracker_base.h | 17 +- src/video_core/buffer_cache/word_manager.h | 384 +++++++++++----------- 5 files changed, 240 insertions(+), 321 deletions(-) (limited to 'src/video_core') diff --git a/src/video_core/buffer_cache/buffer_base.h b/src/video_core/buffer_cache/buffer_base.h index 095f79387..9cbd95c4b 100644 --- a/src/video_core/buffer_cache/buffer_base.h +++ b/src/video_core/buffer_cache/buffer_base.h @@ -38,10 +38,8 @@ public: static constexpr u64 BASE_PAGE_BITS = 16; static constexpr u64 BASE_PAGE_SIZE = 1ULL << BASE_PAGE_BITS; - explicit BufferBase(RasterizerInterface& rasterizer_, VAddr cpu_addr_, u64 size_bytes) - : cpu_addr{Common::AlignDown(cpu_addr_, BASE_PAGE_SIZE)}, - word_manager(cpu_addr, rasterizer_, - Common::AlignUp(size_bytes + (cpu_addr_ - cpu_addr), BASE_PAGE_SIZE)) {} + explicit BufferBase(RasterizerInterface& rasterizer_, VAddr cpu_addr_, u64 size_bytes_) + : cpu_addr{cpu_addr_}, size_bytes{size_bytes_} {} explicit BufferBase(NullBufferParams) {} @@ -51,88 +49,6 @@ public: BufferBase& operator=(BufferBase&&) = default; BufferBase(BufferBase&&) = default; - /// Returns the inclusive CPU modified range in a begin end pair - [[nodiscard]] std::pair ModifiedCpuRegion(VAddr query_cpu_addr, - u64 query_size) const noexcept { - const u64 offset = query_cpu_addr - cpu_addr; - return word_manager.template ModifiedRegion(offset, query_size); - } - - /// Returns the inclusive GPU modified range in a begin end pair - [[nodiscard]] std::pair ModifiedGpuRegion(VAddr query_cpu_addr, - u64 query_size) const noexcept { - const u64 offset = query_cpu_addr - cpu_addr; - return word_manager.template ModifiedRegion(offset, query_size); - } - - /// Returns true if a region has been modified from the CPU - [[nodiscard]] bool IsRegionCpuModified(VAddr query_cpu_addr, u64 query_size) const noexcept { - const u64 offset = query_cpu_addr - cpu_addr; - return word_manager.template IsRegionModified(offset, query_size); - } - - /// Returns true if a region has been modified from the GPU - [[nodiscard]] bool IsRegionGpuModified(VAddr query_cpu_addr, u64 query_size) const noexcept { - const u64 offset = query_cpu_addr - cpu_addr; - return word_manager.template IsRegionModified(offset, query_size); - } - - /// Mark region as CPU modified, notifying the rasterizer about this change - void MarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 size) { - word_manager.template ChangeRegionState(dirty_cpu_addr, size); - } - - /// Unmark region as CPU modified, notifying the rasterizer about this change - void UnmarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 size) { - word_manager.template ChangeRegionState(dirty_cpu_addr, size); - } - - /// Mark region as modified from the host GPU - void MarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 size) noexcept { - word_manager.template ChangeRegionState(dirty_cpu_addr, size); - } - - /// Unmark region as modified from the host GPU - void UnmarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 size) noexcept { - word_manager.template ChangeRegionState(dirty_cpu_addr, size); - } - - /// Mark region as modified from the CPU - /// but don't mark it as modified until FlusHCachedWrites is called. - void CachedCpuWrite(VAddr dirty_cpu_addr, u64 size) { - flags |= BufferFlagBits::CachedWrites; - word_manager.template ChangeRegionState(dirty_cpu_addr, size); - } - - /// Flushes cached CPU writes, and notify the rasterizer about the deltas - void FlushCachedWrites() noexcept { - flags &= ~BufferFlagBits::CachedWrites; - word_manager.FlushCachedWrites(); - } - - /// Call 'func' for each CPU modified range and unmark those pages as CPU modified - template - void ForEachUploadRange(VAddr query_cpu_range, u64 size, Func&& func) { - word_manager.template ForEachModifiedRange(query_cpu_range, size, true, func); - } - - /// Call 'func' for each GPU modified range and unmark those pages as GPU modified - template - void ForEachDownloadRange(VAddr query_cpu_range, u64 size, bool clear, Func&& func) { - word_manager.template ForEachModifiedRange(query_cpu_range, size, clear, func); - } - - template - void ForEachDownloadRangeAndClear(VAddr query_cpu_range, u64 size, Func&& func) { - word_manager.template ForEachModifiedRange(query_cpu_range, size, true, func); - } - - /// Call 'func' for each GPU modified range and unmark those pages as GPU modified - template - void ForEachDownloadRange(Func&& func) { - word_manager.template ForEachModifiedRange(cpu_addr, SizeBytes(), true, func); - } - /// Mark buffer as picked void Pick() noexcept { flags |= BufferFlagBits::Picked; @@ -179,11 +95,6 @@ public: return static_cast(other_cpu_addr - cpu_addr); } - /// Returns the size in bytes of the buffer - [[nodiscard]] u64 SizeBytes() const noexcept { - return word_manager.SizeBytes(); - } - size_t getLRUID() const noexcept { return lru_id; } @@ -192,12 +103,16 @@ public: lru_id = lru_id_; } + size_t SizeBytes() const { + return size_bytes; + } + private: VAddr cpu_addr = 0; - WordManager word_manager; BufferFlagBits flags{}; int stream_score = 0; size_t lru_id = SIZE_MAX; + size_t size_bytes = 0; }; } // namespace VideoCommon diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index e5c626c36..7975564b5 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -21,6 +21,7 @@ BufferCache

::BufferCache(VideoCore::RasterizerInterface& rasterizer_, // Ensure the first slot is used for the null buffer void(slot_buffers.insert(runtime, NullBufferParams{})); common_ranges.clear(); + inline_buffer_id = NULL_BUFFER_ID; active_async_buffers = !Settings::IsGPULevelHigh(); @@ -442,9 +443,6 @@ template void BufferCache

::FlushCachedWrites() { cached_write_buffer_ids.clear(); memory_tracker.FlushCachedWrites(); - for (auto& interval : cached_ranges) { - ClearDownload(interval); - } cached_ranges.clear(); } @@ -659,8 +657,8 @@ bool BufferCache

::IsRegionGpuModified(VAddr addr, size_t size) { template bool BufferCache

::IsRegionRegistered(VAddr addr, size_t size) { const VAddr end_addr = addr + size; - const u64 page_end = Common::DivCeil(end_addr, PAGE_SIZE); - for (u64 page = addr >> PAGE_BITS; page < page_end;) { + const u64 page_end = Common::DivCeil(end_addr, CACHING_PAGESIZE); + for (u64 page = addr >> CACHING_PAGEBITS; page < page_end;) { const BufferId buffer_id = page_table[page]; if (!buffer_id) { ++page; @@ -672,7 +670,7 @@ bool BufferCache

::IsRegionRegistered(VAddr addr, size_t size) { if (buf_start_addr < end_addr && addr < buf_end_addr) { return true; } - page = Common::DivCeil(end_addr, PAGE_SIZE); + page = Common::DivCeil(end_addr, CACHING_PAGESIZE); } return false; } @@ -689,7 +687,7 @@ void BufferCache

::BindHostIndexBuffer() { const u32 offset = buffer.Offset(index_buffer.cpu_addr); const u32 size = index_buffer.size; const auto& draw_state = maxwell3d->draw_manager->GetDrawState(); - if (!draw_state.inline_index_draw_indexes.empty()) { + if (!draw_state.inline_index_draw_indexes.empty()) [[unlikely]] { if constexpr (USE_MEMORY_MAPS) { auto upload_staging = runtime.UploadStagingBuffer(size); std::array copies{ @@ -1001,12 +999,20 @@ void BufferCache

::UpdateIndexBuffer() { return; } flags[Dirty::IndexBuffer] = false; - if (!draw_state.inline_index_draw_indexes.empty()) { + if (!draw_state.inline_index_draw_indexes.empty()) [[unlikely]] { auto inline_index_size = static_cast(draw_state.inline_index_draw_indexes.size()); + u32 buffer_size = Common::AlignUp(inline_index_size, CACHING_PAGESIZE); + if (inline_buffer_id == NULL_BUFFER_ID) [[unlikely]] { + inline_buffer_id = CreateBuffer(0, buffer_size); + } + if (slot_buffers[inline_buffer_id].SizeBytes() < buffer_size) [[unlikely]] { + slot_buffers.erase(inline_buffer_id); + inline_buffer_id = CreateBuffer(0, buffer_size); + } index_buffer = Binding{ .cpu_addr = 0, .size = inline_index_size, - .buffer_id = FindBuffer(0, inline_index_size), + .buffer_id = inline_buffer_id, }; return; } @@ -1224,7 +1230,7 @@ BufferId BufferCache

::FindBuffer(VAddr cpu_addr, u32 size) { if (cpu_addr == 0) { return NULL_BUFFER_ID; } - const u64 page = cpu_addr >> PAGE_BITS; + const u64 page = cpu_addr >> CACHING_PAGEBITS; const BufferId buffer_id = page_table[page]; if (!buffer_id) { return CreateBuffer(cpu_addr, size); @@ -1253,8 +1259,9 @@ typename BufferCache

::OverlapResult BufferCache

::ResolveOverlaps(VAddr cpu .has_stream_leap = has_stream_leap, }; } - for (; cpu_addr >> PAGE_BITS < Common::DivCeil(end, PAGE_SIZE); cpu_addr += PAGE_SIZE) { - const BufferId overlap_id = page_table[cpu_addr >> PAGE_BITS]; + for (; cpu_addr >> CACHING_PAGEBITS < Common::DivCeil(end, CACHING_PAGESIZE); + cpu_addr += CACHING_PAGESIZE) { + const BufferId overlap_id = page_table[cpu_addr >> CACHING_PAGEBITS]; if (!overlap_id) { continue; } @@ -1280,11 +1287,11 @@ typename BufferCache

::OverlapResult BufferCache

::ResolveOverlaps(VAddr cpu // as a stream buffer. Increase the size to skip constantly recreating buffers. has_stream_leap = true; if (expands_right) { - begin -= PAGE_SIZE * 256; + begin -= CACHING_PAGESIZE * 256; cpu_addr = begin; } if (expands_left) { - end += PAGE_SIZE * 256; + end += CACHING_PAGESIZE * 256; } } } @@ -1317,6 +1324,9 @@ void BufferCache

::JoinOverlap(BufferId new_buffer_id, BufferId overlap_id, template BufferId BufferCache

::CreateBuffer(VAddr cpu_addr, u32 wanted_size) { + VAddr cpu_addr_end = Common::AlignUp(cpu_addr + wanted_size, CACHING_PAGESIZE); + cpu_addr = Common::AlignDown(cpu_addr, CACHING_PAGESIZE); + wanted_size = static_cast(cpu_addr_end - cpu_addr); const OverlapResult overlap = ResolveOverlaps(cpu_addr, wanted_size); const u32 size = static_cast(overlap.end - overlap.begin); const BufferId new_buffer_id = slot_buffers.insert(runtime, rasterizer, overlap.begin, size); @@ -1354,8 +1364,8 @@ void BufferCache

::ChangeRegister(BufferId buffer_id) { } const VAddr cpu_addr_begin = buffer.CpuAddr(); const VAddr cpu_addr_end = cpu_addr_begin + size; - const u64 page_begin = cpu_addr_begin / PAGE_SIZE; - const u64 page_end = Common::DivCeil(cpu_addr_end, PAGE_SIZE); + const u64 page_begin = cpu_addr_begin / CACHING_PAGESIZE; + const u64 page_end = Common::DivCeil(cpu_addr_end, CACHING_PAGESIZE); for (u64 page = page_begin; page != page_end; ++page) { if constexpr (insert) { page_table[page] = buffer_id; diff --git a/src/video_core/buffer_cache/buffer_cache_base.h b/src/video_core/buffer_cache/buffer_cache_base.h index 75cb98ba3..656baa550 100644 --- a/src/video_core/buffer_cache/buffer_cache_base.h +++ b/src/video_core/buffer_cache/buffer_cache_base.h @@ -90,10 +90,8 @@ template class BufferCache : public VideoCommon::ChannelSetupCaches { // Page size for caching purposes. // This is unrelated to the CPU page size and it can be changed as it seems optimal. - static constexpr u32 PAGE_BITS = 16; - static constexpr u64 PAGE_SIZE = u64{1} << PAGE_BITS; - static constexpr u32 CPU_PAGE_BITS = 12; - static constexpr u64 CPU_PAGE_SIZE = u64{1} << CPU_PAGE_BITS; + static constexpr u32 CACHING_PAGEBITS = 16; + static constexpr u64 CACHING_PAGESIZE = u64{1} << CACHING_PAGEBITS; static constexpr bool IS_OPENGL = P::IS_OPENGL; static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = @@ -112,6 +110,10 @@ class BufferCache : public VideoCommon::ChannelSetupCaches void ForEachBufferInRange(VAddr cpu_addr, u64 size, Func&& func) { - const u64 page_end = Common::DivCeil(cpu_addr + size, PAGE_SIZE); - for (u64 page = cpu_addr >> PAGE_BITS; page < page_end;) { + const u64 page_end = Common::DivCeil(cpu_addr + size, CACHING_PAGESIZE); + for (u64 page = cpu_addr >> CACHING_PAGEBITS; page < page_end;) { const BufferId buffer_id = page_table[page]; if (!buffer_id) { ++page; @@ -297,7 +299,7 @@ private: func(buffer_id, buffer); const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes(); - page = Common::DivCeil(end_addr, PAGE_SIZE); + page = Common::DivCeil(end_addr, CACHING_PAGESIZE); } } @@ -568,10 +570,11 @@ private: u64 total_used_memory = 0; u64 minimum_memory = 0; u64 critical_memory = 0; + BufferId inline_buffer_id; bool active_async_buffers = false; - std::array> PAGE_BITS)> page_table; + std::array> CACHING_PAGEBITS)> page_table; }; } // namespace VideoCommon diff --git a/src/video_core/buffer_cache/memory_tracker_base.h b/src/video_core/buffer_cache/memory_tracker_base.h index 016d8430f..4bc59017f 100644 --- a/src/video_core/buffer_cache/memory_tracker_base.h +++ b/src/video_core/buffer_cache/memory_tracker_base.h @@ -132,8 +132,8 @@ public: void ForEachUploadRange(VAddr query_cpu_range, u64 query_size, Func&& func) { IteratePages(query_cpu_range, query_size, [&func](Manager* manager, u64 offset, size_t size) { - manager->template ForEachModifiedRange( - manager->GetCpuAddr() + offset, size, true, func); + manager->template ForEachModifiedRange( + manager->GetCpuAddr() + offset, size, func); }); } @@ -142,8 +142,13 @@ public: void ForEachDownloadRange(VAddr query_cpu_range, u64 query_size, bool clear, Func&& func) { IteratePages(query_cpu_range, query_size, [&func, clear](Manager* manager, u64 offset, size_t size) { - manager->template ForEachModifiedRange( - manager->GetCpuAddr() + offset, size, clear, func); + if (clear) { + manager->template ForEachModifiedRange( + manager->GetCpuAddr() + offset, size, func); + } else { + manager->template ForEachModifiedRange( + manager->GetCpuAddr() + offset, size, func); + } }); } @@ -151,8 +156,8 @@ public: void ForEachDownloadRangeAndClear(VAddr query_cpu_range, u64 query_size, Func&& func) { IteratePages(query_cpu_range, query_size, [&func](Manager* manager, u64 offset, size_t size) { - manager->template ForEachModifiedRange( - manager->GetCpuAddr() + offset, size, true, func); + manager->template ForEachModifiedRange( + manager->GetCpuAddr() + offset, size, func); }); } diff --git a/src/video_core/buffer_cache/word_manager.h b/src/video_core/buffer_cache/word_manager.h index 21729752b..a42455045 100644 --- a/src/video_core/buffer_cache/word_manager.h +++ b/src/video_core/buffer_cache/word_manager.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include "common/alignment.h" @@ -20,9 +21,16 @@ constexpr u64 PAGES_PER_WORD = 64; constexpr u64 BYTES_PER_PAGE = Core::Memory::YUZU_PAGESIZE; constexpr u64 BYTES_PER_WORD = PAGES_PER_WORD * BYTES_PER_PAGE; +enum class Type { + CPU, + GPU, + CachedCPU, + Untracked, +}; + /// Vector tracking modified pages tightly packed with small vector optimization template -union WordsArray { +struct WordsArray { /// Returns the pointer to the words state [[nodiscard]] const u64* Pointer(bool is_short) const noexcept { return is_short ? stack.data() : heap; @@ -41,13 +49,13 @@ template struct Words { explicit Words() = default; explicit Words(u64 size_bytes_) : size_bytes{size_bytes_} { + num_words = Common::DivCeil(size_bytes, BYTES_PER_WORD); if (IsShort()) { cpu.stack.fill(~u64{0}); gpu.stack.fill(0); cached_cpu.stack.fill(0); untracked.stack.fill(~u64{0}); } else { - const size_t num_words = NumWords(); // Share allocation between CPU and GPU pages and set their default values u64* const alloc = new u64[num_words * 4]; cpu.heap = alloc; @@ -75,6 +83,7 @@ struct Words { Words& operator=(Words&& rhs) noexcept { Release(); size_bytes = rhs.size_bytes; + num_words = rhs.num_words; cpu = rhs.cpu; gpu = rhs.gpu; cached_cpu = rhs.cached_cpu; @@ -84,7 +93,7 @@ struct Words { } Words(Words&& rhs) noexcept - : size_bytes{rhs.size_bytes}, cpu{rhs.cpu}, gpu{rhs.gpu}, + : size_bytes{rhs.size_bytes}, num_words{rhs.num_words}, cpu{rhs.cpu}, gpu{rhs.gpu}, cached_cpu{rhs.cached_cpu}, untracked{rhs.untracked} { rhs.cpu.heap = nullptr; } @@ -94,12 +103,12 @@ struct Words { /// Returns true when the buffer fits in the small vector optimization [[nodiscard]] bool IsShort() const noexcept { - return size_bytes <= stack_words * BYTES_PER_WORD; + return num_words <= stack_words; } /// Returns the number of words of the buffer [[nodiscard]] size_t NumWords() const noexcept { - return Common::DivCeil(size_bytes, BYTES_PER_WORD); + return num_words; } /// Release buffer resources @@ -110,20 +119,40 @@ struct Words { } } + template + std::span Span() noexcept { + if constexpr (type == Type::CPU) { + return std::span(cpu.Pointer(IsShort()), num_words); + } else if constexpr (type == Type::GPU) { + return std::span(gpu.Pointer(IsShort()), num_words); + } else if constexpr (type == Type::CachedCPU) { + return std::span(cached_cpu.Pointer(IsShort()), num_words); + } else if constexpr (type == Type::Untracked) { + return std::span(untracked.Pointer(IsShort()), num_words); + } + } + + template + std::span Span() const noexcept { + if constexpr (type == Type::CPU) { + return std::span(cpu.Pointer(IsShort()), num_words); + } else if constexpr (type == Type::GPU) { + return std::span(gpu.Pointer(IsShort()), num_words); + } else if constexpr (type == Type::CachedCPU) { + return std::span(cached_cpu.Pointer(IsShort()), num_words); + } else if constexpr (type == Type::Untracked) { + return std::span(untracked.Pointer(IsShort()), num_words); + } + } + u64 size_bytes = 0; + size_t num_words = 0; WordsArray cpu; WordsArray gpu; WordsArray cached_cpu; WordsArray untracked; }; -enum class Type { - CPU, - GPU, - CachedCPU, - Untracked, -}; - template class WordManager { public: @@ -140,6 +169,69 @@ public: return cpu_addr; } + static u64 ExtractBits(u64 word, size_t page_start, size_t page_end) { + constexpr size_t number_bits = sizeof(u64) * 8; + const size_t limit_page_end = number_bits - std::min(page_end, number_bits); + u64 bits = (word >> page_start) << page_start; + bits = (bits << limit_page_end) >> limit_page_end; + return bits; + } + + static std::pair GetWordPage(VAddr address) { + const size_t converted_address = static_cast(address); + const size_t word_number = converted_address / BYTES_PER_WORD; + const size_t amount_pages = converted_address % BYTES_PER_WORD; + return std::make_pair(word_number, amount_pages / BYTES_PER_PAGE); + } + + template + void IterateWords(size_t offset, size_t size, Func&& func) const { + using FuncReturn = std::invoke_result_t; + static constexpr bool BOOL_BREAK = std::is_same_v; + const size_t start = static_cast(std::max(static_cast(offset), 0LL)); + const size_t end = static_cast(std::max(static_cast(offset + size), 0LL)); + if (start >= SizeBytes() || end <= start) { + return; + } + auto [start_word, start_page] = GetWordPage(start); + auto [end_word, end_page] = GetWordPage(end + BYTES_PER_PAGE - 1ULL); + const size_t num_words = NumWords(); + start_word = std::min(start_word, num_words); + end_word = std::min(end_word, num_words); + const size_t diff = end_word - start_word; + end_word += (end_page + PAGES_PER_WORD - 1ULL) / PAGES_PER_WORD; + end_word = std::min(end_word, num_words); + end_page += diff * PAGES_PER_WORD; + constexpr u64 base_mask{~0ULL}; + for (size_t word_index = start_word; word_index < end_word; word_index++) { + const u64 mask = ExtractBits(base_mask, start_page, end_page); + start_page = 0; + end_page -= PAGES_PER_WORD; + if constexpr (BOOL_BREAK) { + if (func(word_index, mask)) { + return; + } + } else { + func(word_index, mask); + } + } + } + + template + void IteratePages(u64 mask, Func&& func) const { + size_t offset = 0; + while (mask != 0) { + const size_t empty_bits = std::countr_zero(mask); + offset += empty_bits; + mask = mask >> empty_bits; + + const size_t continuous_bits = std::countr_one(mask); + func(offset, continuous_bits); + mask = continuous_bits < PAGES_PER_WORD ? (mask >> continuous_bits) : 0; + offset += continuous_bits; + } + } + /** * Change the state of a range of pages * @@ -147,47 +239,33 @@ public: * @param size Size in bytes to mark or unmark as modified */ template - void ChangeRegionState(u64 dirty_addr, s64 size) noexcept(type == Type::GPU) { - const s64 difference = dirty_addr - cpu_addr; - const u64 offset = std::max(difference, 0); - size += std::min(difference, 0); - if (offset >= SizeBytes() || size < 0) { - return; - } - u64* const untracked_words = Array(); - u64* const state_words = Array(); - const u64 offset_end = std::min(offset + size, SizeBytes()); - const u64 begin_page_index = offset / BYTES_PER_PAGE; - const u64 begin_word_index = begin_page_index / PAGES_PER_WORD; - const u64 end_page_index = Common::DivCeil(offset_end, BYTES_PER_PAGE); - const u64 end_word_index = Common::DivCeil(end_page_index, PAGES_PER_WORD); - u64 page_index = begin_page_index % PAGES_PER_WORD; - u64 word_index = begin_word_index; - while (word_index < end_word_index) { - const u64 next_word_first_page = (word_index + 1) * PAGES_PER_WORD; - const u64 left_offset = - std::min(next_word_first_page - end_page_index, PAGES_PER_WORD) % PAGES_PER_WORD; - const u64 right_offset = page_index; - u64 bits = ~u64{0}; - bits = (bits >> right_offset) << right_offset; - bits = (bits << left_offset) >> left_offset; + void ChangeRegionState(u64 dirty_addr, u64 size) noexcept(type == Type::GPU) { + std::span state_words = words.template Span(); + [[maybe_unused]] std::span untracked_words = words.template Span(); + [[maybe_unused]] std::span cached_words = words.template Span(); + IterateWords(dirty_addr - cpu_addr, size, [&](size_t index, u64 mask) { if constexpr (type == Type::CPU || type == Type::CachedCPU) { - NotifyRasterizer(word_index, untracked_words[word_index], bits); + NotifyRasterizer(index, untracked_words[index], mask); } if constexpr (enable) { - state_words[word_index] |= bits; + state_words[index] |= mask; if constexpr (type == Type::CPU || type == Type::CachedCPU) { - untracked_words[word_index] |= bits; + untracked_words[index] |= mask; + } + if constexpr (type == Type::CPU) { + cached_words[index] &= ~mask; } } else { - state_words[word_index] &= ~bits; + if constexpr (type == Type::CPU) { + const u64 word = state_words[index] & mask; + cached_words[index] &= ~word; + } + state_words[index] &= ~mask; if constexpr (type == Type::CPU || type == Type::CachedCPU) { - untracked_words[word_index] &= ~bits; + untracked_words[index] &= ~mask; } } - page_index = 0; - ++word_index; - } + }); } /** @@ -198,119 +276,59 @@ public: * @param size Size in bytes of the CPU range to loop over * @param func Function to call for each turned off region */ - template - void ForEachModifiedRange(VAddr query_cpu_range, s64 size, bool clear, Func&& func) { + template + void ForEachModifiedRange(VAddr query_cpu_range, s64 size, Func&& func) { static_assert(type != Type::Untracked); - const s64 difference = query_cpu_range - cpu_addr; - const u64 query_begin = std::max(difference, 0); - size += std::min(difference, 0); - if (query_begin >= SizeBytes() || size < 0) { - return; - } - [[maybe_unused]] u64* const untracked_words = Array(); - [[maybe_unused]] u64* const cpu_words = Array(); - u64* const state_words = Array(); - const u64 query_end = query_begin + std::min(static_cast(size), SizeBytes()); - u64* const words_begin = state_words + query_begin / BYTES_PER_WORD; - u64* const words_end = state_words + Common::DivCeil(query_end, BYTES_PER_WORD); - u64 first_page = (query_begin / BYTES_PER_PAGE) % PAGES_PER_WORD; - - const auto modified = [](u64 word) { return word != 0; }; - const auto first_modified_word = std::find_if(words_begin, words_end, modified); - if (first_modified_word == words_end) { - // Exit early when the buffer is not modified - return; - } - if (first_modified_word != words_begin) { - first_page = 0; - } - std::reverse_iterator first_word_reverse(first_modified_word); - std::reverse_iterator last_word_iterator(words_end); - auto last_word_result = std::find_if(last_word_iterator, first_word_reverse, modified); - u64* const last_modified_word = &(*last_word_result) + 1; - - const u64 word_index_begin = std::distance(state_words, first_modified_word); - const u64 word_index_end = std::distance(state_words, last_modified_word); - const unsigned local_page_begin = std::countr_zero(*first_modified_word); - const unsigned local_page_end = - static_cast(PAGES_PER_WORD) - std::countl_zero(last_modified_word[-1]); - const u64 word_page_begin = word_index_begin * PAGES_PER_WORD; - const u64 word_page_end = (word_index_end - 1) * PAGES_PER_WORD; - const u64 query_page_begin = query_begin / BYTES_PER_PAGE; - const u64 query_page_end = Common::DivCeil(query_end, BYTES_PER_PAGE); - const u64 page_index_begin = std::max(word_page_begin + local_page_begin, query_page_begin); - const u64 page_index_end = std::min(word_page_end + local_page_end, query_page_end); - const u64 first_word_page_begin = page_index_begin % PAGES_PER_WORD; - const u64 last_word_page_end = (page_index_end - 1) % PAGES_PER_WORD + 1; - - u64 page_begin = std::max(first_word_page_begin, first_page); - u64 current_base = 0; - u64 current_size = 0; - bool on_going = false; - for (u64 word_index = word_index_begin; word_index < word_index_end; ++word_index) { - const bool is_last_word = word_index + 1 == word_index_end; - const u64 page_end = is_last_word ? last_word_page_end : PAGES_PER_WORD; - const u64 right_offset = page_begin; - const u64 left_offset = PAGES_PER_WORD - page_end; - u64 bits = ~u64{0}; - bits = (bits >> right_offset) << right_offset; - bits = (bits << left_offset) >> left_offset; - - const u64 current_word = state_words[word_index] & bits; - if (clear) { - state_words[word_index] &= ~bits; - } - - if constexpr (type == Type::CachedCPU) { - NotifyRasterizer(word_index, untracked_words[word_index], current_word); - untracked_words[word_index] |= current_word; - cpu_words[word_index] |= current_word; - } - - if constexpr (type == Type::CPU) { - const u64 current_bits = untracked_words[word_index] & bits; - untracked_words[word_index] &= ~bits; - NotifyRasterizer(word_index, current_bits, ~u64{0}); - } - const u64 word = current_word & ~(type == Type::GPU ? untracked_words[word_index] : 0); - u64 page = page_begin; - page_begin = 0; - - while (page < page_end) { - const int empty_bits = std::countr_zero(word >> page); - if (on_going && empty_bits != 0) { - InvokeModifiedRange(func, current_size, current_base); - current_size = 0; - on_going = false; + std::span state_words = words.template Span(); + [[maybe_unused]] std::span untracked_words = words.template Span(); + [[maybe_unused]] std::span cached_words = words.template Span(); + const size_t offset = query_cpu_range - cpu_addr; + bool pending = false; + size_t pending_offset{}; + size_t pending_pointer{}; + const auto release = [&]() { + func(cpu_addr + pending_offset * BYTES_PER_PAGE, + (pending_pointer - pending_offset) * BYTES_PER_PAGE); + }; + IterateWords(offset, size, [&](size_t index, u64 mask) { + const u64 word = state_words[index] & mask; + if constexpr (clear) { + if constexpr (type == Type::CPU || type == Type::CachedCPU) { + NotifyRasterizer(index, untracked_words[index], mask); } - if (empty_bits == PAGES_PER_WORD) { - break; + state_words[index] &= ~mask; + if constexpr (type == Type::CPU || type == Type::CachedCPU) { + untracked_words[index] &= ~mask; } - page += empty_bits; - - const int continuous_bits = std::countr_one(word >> page); - if (!on_going && continuous_bits != 0) { - current_base = word_index * PAGES_PER_WORD + page; - on_going = true; + if constexpr (type == Type::CPU) { + cached_words[index] &= ~word; } - current_size += continuous_bits; - page += continuous_bits; } - } - if (on_going && current_size > 0) { - InvokeModifiedRange(func, current_size, current_base); + const size_t base_offset = index * PAGES_PER_WORD; + IteratePages(word, [&](size_t pages_offset, size_t pages_size) { + const auto reset = [&]() { + pending_offset = base_offset + pages_offset; + pending_pointer = base_offset + pages_offset + pages_size; + }; + if (!pending) { + reset(); + pending = true; + return; + } + if (pending_pointer == base_offset + pages_offset) { + pending_pointer += pages_size; + return; + } + release(); + reset(); + }); + }); + if (pending) { + release(); } } - template - void InvokeModifiedRange(Func&& func, u64 current_size, u64 current_base) { - const u64 current_size_bytes = current_size * BYTES_PER_PAGE; - const u64 offset_begin = current_base * BYTES_PER_PAGE; - const u64 offset_end = std::min(offset_begin + current_size_bytes, SizeBytes()); - func(cpu_addr + offset_begin, offset_end - offset_begin); - } - /** * Returns true when a region has been modified * @@ -321,27 +339,17 @@ public: [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept { static_assert(type != Type::Untracked); - const u64* const untracked_words = Array(); - const u64* const state_words = Array(); - const u64 num_query_words = size / BYTES_PER_WORD + 1; - const u64 word_begin = offset / BYTES_PER_WORD; - const u64 word_end = std::min(word_begin + num_query_words, NumWords()); - const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE); - u64 page_index = (offset / BYTES_PER_PAGE) % PAGES_PER_WORD; - for (u64 word_index = word_begin; word_index < word_end; ++word_index, page_index = 0) { - const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0; - const u64 word = state_words[word_index] & ~off_word; - if (word == 0) { - continue; - } - const u64 page_end = std::min((word_index + 1) * PAGES_PER_WORD, page_limit); - const u64 local_page_end = page_end % PAGES_PER_WORD; - const u64 page_end_shift = (PAGES_PER_WORD - local_page_end) % PAGES_PER_WORD; - if (((word >> page_index) << page_index) << page_end_shift != 0) { + const std::span state_words = words.template Span(); + bool result = false; + IterateWords(offset, size, [&](size_t index, u64 mask) { + const u64 word = state_words[index] & mask; + if (word != 0) { + result = true; return true; } - } - return false; + return false; + }); + return result; } /** @@ -353,34 +361,20 @@ public: template [[nodiscard]] std::pair ModifiedRegion(u64 offset, u64 size) const noexcept { static_assert(type != Type::Untracked); - const u64* const state_words = Array(); - const u64 num_query_words = size / BYTES_PER_WORD + 1; - const u64 word_begin = offset / BYTES_PER_WORD; - const u64 word_end = std::min(word_begin + num_query_words, NumWords()); - const u64 page_base = offset / BYTES_PER_PAGE; - u64 page_begin = page_base & (PAGES_PER_WORD - 1); - u64 page_end = - Common::DivCeil(offset + size, BYTES_PER_PAGE) - (page_base & ~(PAGES_PER_WORD - 1)); + const std::span state_words = words.template Span(); u64 begin = std::numeric_limits::max(); u64 end = 0; - for (u64 word_index = word_begin; word_index < word_end; ++word_index) { - const u64 base_mask = (1ULL << page_begin) - 1ULL; - const u64 end_mask = page_end >= PAGES_PER_WORD ? 0ULL : ~((1ULL << page_end) - 1ULL); - const u64 off_word = end_mask | base_mask; - const u64 word = state_words[word_index] & ~off_word; + IterateWords(offset, size, [&](size_t index, u64 mask) { + const u64 word = state_words[index] & mask; if (word == 0) { - page_begin = 0; - page_end -= PAGES_PER_WORD; - continue; + return; } const u64 local_page_begin = std::countr_zero(word); const u64 local_page_end = PAGES_PER_WORD - std::countl_zero(word); - const u64 page_index = word_index * PAGES_PER_WORD; + const u64 page_index = index * PAGES_PER_WORD; begin = std::min(begin, page_index + local_page_begin); end = page_index + local_page_end; - page_begin = 0; - page_end -= PAGES_PER_WORD; - } + }); static constexpr std::pair EMPTY{0, 0}; return begin < end ? std::make_pair(begin * BYTES_PER_PAGE, end * BYTES_PER_PAGE) : EMPTY; } @@ -454,18 +448,10 @@ private: void NotifyRasterizer(u64 word_index, u64 current_bits, u64 new_bits) const { u64 changed_bits = (add_to_rasterizer ? current_bits : ~current_bits) & new_bits; VAddr addr = cpu_addr + word_index * BYTES_PER_WORD; - while (changed_bits != 0) { - const int empty_bits = std::countr_zero(changed_bits); - addr += empty_bits * BYTES_PER_PAGE; - changed_bits >>= empty_bits; - - const u32 continuous_bits = std::countr_one(changed_bits); - const u64 size = continuous_bits * BYTES_PER_PAGE; - const VAddr begin_addr = addr; - addr += size; - changed_bits = continuous_bits < PAGES_PER_WORD ? (changed_bits >> continuous_bits) : 0; - rasterizer->UpdatePagesCachedCount(begin_addr, size, add_to_rasterizer ? 1 : -1); - } + IteratePages(changed_bits, [&](size_t offset, size_t size) { + rasterizer->UpdatePagesCachedCount(addr + offset * BYTES_PER_PAGE, + size * BYTES_PER_PAGE, add_to_rasterizer ? 1 : -1); + }); } VAddr cpu_addr = 0; -- cgit v1.2.3