diff options
Diffstat (limited to 'src')
92 files changed, 4177 insertions, 2138 deletions
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 312a49f42..5e3a74c0f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -113,6 +113,9 @@ else() $<$<CXX_COMPILER_ID:Clang>:-Wno-braced-scalar-init> $<$<CXX_COMPILER_ID:Clang>:-Wno-unused-private-field> + $<$<CXX_COMPILER_ID:Clang>:-Werror=shadow-uncaptured-local> + $<$<CXX_COMPILER_ID:Clang>:-Werror=implicit-fallthrough> + $<$<CXX_COMPILER_ID:Clang>:-Werror=type-limits> $<$<CXX_COMPILER_ID:AppleClang>:-Wno-braced-scalar-init> $<$<CXX_COMPILER_ID:AppleClang>:-Wno-unused-private-field> ) diff --git a/src/common/intrusive_list.h b/src/common/intrusive_list.h new file mode 100644 index 000000000..d330dc1c2 --- /dev/null +++ b/src/common/intrusive_list.h @@ -0,0 +1,631 @@ +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include "common/common_funcs.h" +#include "common/parent_of_member.h" + +namespace Common { + +// Forward declare implementation class for Node. +namespace impl { + +class IntrusiveListImpl; + +} + +class IntrusiveListNode { + YUZU_NON_COPYABLE(IntrusiveListNode); + +private: + friend class impl::IntrusiveListImpl; + + IntrusiveListNode* m_prev; + IntrusiveListNode* m_next; + +public: + constexpr IntrusiveListNode() : m_prev(this), m_next(this) {} + + constexpr bool IsLinked() const { + return m_next != this; + } + +private: + constexpr void LinkPrev(IntrusiveListNode* node) { + // We can't link an already linked node. + ASSERT(!node->IsLinked()); + this->SplicePrev(node, node); + } + + constexpr void SplicePrev(IntrusiveListNode* first, IntrusiveListNode* last) { + // Splice a range into the list. + auto last_prev = last->m_prev; + first->m_prev = m_prev; + last_prev->m_next = this; + m_prev->m_next = first; + m_prev = last_prev; + } + + constexpr void LinkNext(IntrusiveListNode* node) { + // We can't link an already linked node. + ASSERT(!node->IsLinked()); + return this->SpliceNext(node, node); + } + + constexpr void SpliceNext(IntrusiveListNode* first, IntrusiveListNode* last) { + // Splice a range into the list. + auto last_prev = last->m_prev; + first->m_prev = this; + last_prev->m_next = m_next; + m_next->m_prev = last_prev; + m_next = first; + } + + constexpr void Unlink() { + this->Unlink(m_next); + } + + constexpr void Unlink(IntrusiveListNode* last) { + // Unlink a node from a next node. + auto last_prev = last->m_prev; + m_prev->m_next = last; + last->m_prev = m_prev; + last_prev->m_next = this; + m_prev = last_prev; + } + + constexpr IntrusiveListNode* GetPrev() { + return m_prev; + } + + constexpr const IntrusiveListNode* GetPrev() const { + return m_prev; + } + + constexpr IntrusiveListNode* GetNext() { + return m_next; + } + + constexpr const IntrusiveListNode* GetNext() const { + return m_next; + } +}; +// DEPRECATED: static_assert(std::is_literal_type<IntrusiveListNode>::value); + +namespace impl { + +class IntrusiveListImpl { + YUZU_NON_COPYABLE(IntrusiveListImpl); + +private: + IntrusiveListNode m_root_node; + +public: + template <bool Const> + class Iterator; + + using value_type = IntrusiveListNode; + using size_type = size_t; + using difference_type = ptrdiff_t; + using pointer = value_type*; + using const_pointer = const value_type*; + using reference = value_type&; + using const_reference = const value_type&; + using iterator = Iterator<false>; + using const_iterator = Iterator<true>; + using reverse_iterator = std::reverse_iterator<iterator>; + using const_reverse_iterator = std::reverse_iterator<const_iterator>; + + template <bool Const> + class Iterator { + public: + using iterator_category = std::bidirectional_iterator_tag; + using value_type = typename IntrusiveListImpl::value_type; + using difference_type = typename IntrusiveListImpl::difference_type; + using pointer = + std::conditional_t<Const, IntrusiveListImpl::const_pointer, IntrusiveListImpl::pointer>; + using reference = std::conditional_t<Const, IntrusiveListImpl::const_reference, + IntrusiveListImpl::reference>; + + private: + pointer m_node; + + public: + constexpr explicit Iterator(pointer n) : m_node(n) {} + + constexpr bool operator==(const Iterator& rhs) const { + return m_node == rhs.m_node; + } + + constexpr pointer operator->() const { + return m_node; + } + + constexpr reference operator*() const { + return *m_node; + } + + constexpr Iterator& operator++() { + m_node = m_node->m_next; + return *this; + } + + constexpr Iterator& operator--() { + m_node = m_node->m_prev; + return *this; + } + + constexpr Iterator operator++(int) { + const Iterator it{*this}; + ++(*this); + return it; + } + + constexpr Iterator operator--(int) { + const Iterator it{*this}; + --(*this); + return it; + } + + constexpr operator Iterator<true>() const { + return Iterator<true>(m_node); + } + + constexpr Iterator<false> GetNonConstIterator() const { + return Iterator<false>(const_cast<IntrusiveListImpl::pointer>(m_node)); + } + }; + +public: + constexpr IntrusiveListImpl() : m_root_node() {} + + // Iterator accessors. + constexpr iterator begin() { + return iterator(m_root_node.GetNext()); + } + + constexpr const_iterator begin() const { + return const_iterator(m_root_node.GetNext()); + } + + constexpr iterator end() { + return iterator(std::addressof(m_root_node)); + } + + constexpr const_iterator end() const { + return const_iterator(std::addressof(m_root_node)); + } + + constexpr iterator iterator_to(reference v) { + // Only allow iterator_to for values in lists. + ASSERT(v.IsLinked()); + return iterator(std::addressof(v)); + } + + constexpr const_iterator iterator_to(const_reference v) const { + // Only allow iterator_to for values in lists. + ASSERT(v.IsLinked()); + return const_iterator(std::addressof(v)); + } + + // Content management. + constexpr bool empty() const { + return !m_root_node.IsLinked(); + } + + constexpr size_type size() const { + return static_cast<size_type>(std::distance(this->begin(), this->end())); + } + + constexpr reference back() { + return *m_root_node.GetPrev(); + } + + constexpr const_reference back() const { + return *m_root_node.GetPrev(); + } + + constexpr reference front() { + return *m_root_node.GetNext(); + } + + constexpr const_reference front() const { + return *m_root_node.GetNext(); + } + + constexpr void push_back(reference node) { + m_root_node.LinkPrev(std::addressof(node)); + } + + constexpr void push_front(reference node) { + m_root_node.LinkNext(std::addressof(node)); + } + + constexpr void pop_back() { + m_root_node.GetPrev()->Unlink(); + } + + constexpr void pop_front() { + m_root_node.GetNext()->Unlink(); + } + + constexpr iterator insert(const_iterator pos, reference node) { + pos.GetNonConstIterator()->LinkPrev(std::addressof(node)); + return iterator(std::addressof(node)); + } + + constexpr void splice(const_iterator pos, IntrusiveListImpl& o) { + splice_impl(pos, o.begin(), o.end()); + } + + constexpr void splice(const_iterator pos, IntrusiveListImpl& o, const_iterator first) { + const_iterator last(first); + std::advance(last, 1); + splice_impl(pos, first, last); + } + + constexpr void splice(const_iterator pos, IntrusiveListImpl& o, const_iterator first, + const_iterator last) { + splice_impl(pos, first, last); + } + + constexpr iterator erase(const_iterator pos) { + if (pos == this->end()) { + return this->end(); + } + iterator it(pos.GetNonConstIterator()); + (it++)->Unlink(); + return it; + } + + constexpr void clear() { + while (!this->empty()) { + this->pop_front(); + } + } + +private: + constexpr void splice_impl(const_iterator _pos, const_iterator _first, const_iterator _last) { + if (_first == _last) { + return; + } + iterator pos(_pos.GetNonConstIterator()); + iterator first(_first.GetNonConstIterator()); + iterator last(_last.GetNonConstIterator()); + first->Unlink(std::addressof(*last)); + pos->SplicePrev(std::addressof(*first), std::addressof(*first)); + } +}; + +} // namespace impl + +template <class T, class Traits> +class IntrusiveList { + YUZU_NON_COPYABLE(IntrusiveList); + +private: + impl::IntrusiveListImpl m_impl; + +public: + template <bool Const> + class Iterator; + + using value_type = T; + using size_type = size_t; + using difference_type = ptrdiff_t; + using pointer = value_type*; + using const_pointer = const value_type*; + using reference = value_type&; + using const_reference = const value_type&; + using iterator = Iterator<false>; + using const_iterator = Iterator<true>; + using reverse_iterator = std::reverse_iterator<iterator>; + using const_reverse_iterator = std::reverse_iterator<const_iterator>; + + template <bool Const> + class Iterator { + public: + friend class Common::IntrusiveList<T, Traits>; + + using ImplIterator = + std::conditional_t<Const, Common::impl::IntrusiveListImpl::const_iterator, + Common::impl::IntrusiveListImpl::iterator>; + + using iterator_category = std::bidirectional_iterator_tag; + using value_type = typename IntrusiveList::value_type; + using difference_type = typename IntrusiveList::difference_type; + using pointer = + std::conditional_t<Const, IntrusiveList::const_pointer, IntrusiveList::pointer>; + using reference = + std::conditional_t<Const, IntrusiveList::const_reference, IntrusiveList::reference>; + + private: + ImplIterator m_iterator; + + private: + constexpr explicit Iterator(ImplIterator it) : m_iterator(it) {} + + constexpr ImplIterator GetImplIterator() const { + return m_iterator; + } + + public: + constexpr bool operator==(const Iterator& rhs) const { + return m_iterator == rhs.m_iterator; + } + + constexpr pointer operator->() const { + return std::addressof(Traits::GetParent(*m_iterator)); + } + + constexpr reference operator*() const { + return Traits::GetParent(*m_iterator); + } + + constexpr Iterator& operator++() { + ++m_iterator; + return *this; + } + + constexpr Iterator& operator--() { + --m_iterator; + return *this; + } + + constexpr Iterator operator++(int) { + const Iterator it{*this}; + ++m_iterator; + return it; + } + + constexpr Iterator operator--(int) { + const Iterator it{*this}; + --m_iterator; + return it; + } + + constexpr operator Iterator<true>() const { + return Iterator<true>(m_iterator); + } + }; + +private: + static constexpr IntrusiveListNode& GetNode(reference ref) { + return Traits::GetNode(ref); + } + + static constexpr IntrusiveListNode const& GetNode(const_reference ref) { + return Traits::GetNode(ref); + } + + static constexpr reference GetParent(IntrusiveListNode& node) { + return Traits::GetParent(node); + } + + static constexpr const_reference GetParent(IntrusiveListNode const& node) { + return Traits::GetParent(node); + } + +public: + constexpr IntrusiveList() : m_impl() {} + + // Iterator accessors. + constexpr iterator begin() { + return iterator(m_impl.begin()); + } + + constexpr const_iterator begin() const { + return const_iterator(m_impl.begin()); + } + + constexpr iterator end() { + return iterator(m_impl.end()); + } + + constexpr const_iterator end() const { + return const_iterator(m_impl.end()); + } + + constexpr const_iterator cbegin() const { + return this->begin(); + } + + constexpr const_iterator cend() const { + return this->end(); + } + + constexpr reverse_iterator rbegin() { + return reverse_iterator(this->end()); + } + + constexpr const_reverse_iterator rbegin() const { + return const_reverse_iterator(this->end()); + } + + constexpr reverse_iterator rend() { + return reverse_iterator(this->begin()); + } + + constexpr const_reverse_iterator rend() const { + return const_reverse_iterator(this->begin()); + } + + constexpr const_reverse_iterator crbegin() const { + return this->rbegin(); + } + + constexpr const_reverse_iterator crend() const { + return this->rend(); + } + + constexpr iterator iterator_to(reference v) { + return iterator(m_impl.iterator_to(GetNode(v))); + } + + constexpr const_iterator iterator_to(const_reference v) const { + return const_iterator(m_impl.iterator_to(GetNode(v))); + } + + // Content management. + constexpr bool empty() const { + return m_impl.empty(); + } + + constexpr size_type size() const { + return m_impl.size(); + } + + constexpr reference back() { + return GetParent(m_impl.back()); + } + + constexpr const_reference back() const { + return GetParent(m_impl.back()); + } + + constexpr reference front() { + return GetParent(m_impl.front()); + } + + constexpr const_reference front() const { + return GetParent(m_impl.front()); + } + + constexpr void push_back(reference ref) { + m_impl.push_back(GetNode(ref)); + } + + constexpr void push_front(reference ref) { + m_impl.push_front(GetNode(ref)); + } + + constexpr void pop_back() { + m_impl.pop_back(); + } + + constexpr void pop_front() { + m_impl.pop_front(); + } + + constexpr iterator insert(const_iterator pos, reference ref) { + return iterator(m_impl.insert(pos.GetImplIterator(), GetNode(ref))); + } + + constexpr void splice(const_iterator pos, IntrusiveList& o) { + m_impl.splice(pos.GetImplIterator(), o.m_impl); + } + + constexpr void splice(const_iterator pos, IntrusiveList& o, const_iterator first) { + m_impl.splice(pos.GetImplIterator(), o.m_impl, first.GetImplIterator()); + } + + constexpr void splice(const_iterator pos, IntrusiveList& o, const_iterator first, + const_iterator last) { + m_impl.splice(pos.GetImplIterator(), o.m_impl, first.GetImplIterator(), + last.GetImplIterator()); + } + + constexpr iterator erase(const_iterator pos) { + return iterator(m_impl.erase(pos.GetImplIterator())); + } + + constexpr void clear() { + m_impl.clear(); + } +}; + +template <auto T, class Derived = Common::impl::GetParentType<T>> +class IntrusiveListMemberTraits; + +template <class Parent, IntrusiveListNode Parent::*Member, class Derived> +class IntrusiveListMemberTraits<Member, Derived> { +public: + using ListType = IntrusiveList<Derived, IntrusiveListMemberTraits>; + +private: + friend class IntrusiveList<Derived, IntrusiveListMemberTraits>; + + static constexpr IntrusiveListNode& GetNode(Derived& parent) { + return parent.*Member; + } + + static constexpr IntrusiveListNode const& GetNode(Derived const& parent) { + return parent.*Member; + } + + static Derived& GetParent(IntrusiveListNode& node) { + return Common::GetParentReference<Member, Derived>(std::addressof(node)); + } + + static Derived const& GetParent(IntrusiveListNode const& node) { + return Common::GetParentReference<Member, Derived>(std::addressof(node)); + } +}; + +template <auto T, class Derived = Common::impl::GetParentType<T>> +class IntrusiveListMemberTraitsByNonConstexprOffsetOf; + +template <class Parent, IntrusiveListNode Parent::*Member, class Derived> +class IntrusiveListMemberTraitsByNonConstexprOffsetOf<Member, Derived> { +public: + using ListType = IntrusiveList<Derived, IntrusiveListMemberTraitsByNonConstexprOffsetOf>; + +private: + friend class IntrusiveList<Derived, IntrusiveListMemberTraitsByNonConstexprOffsetOf>; + + static constexpr IntrusiveListNode& GetNode(Derived& parent) { + return parent.*Member; + } + + static constexpr IntrusiveListNode const& GetNode(Derived const& parent) { + return parent.*Member; + } + + static Derived& GetParent(IntrusiveListNode& node) { + return *reinterpret_cast<Derived*>(reinterpret_cast<char*>(std::addressof(node)) - + GetOffset()); + } + + static Derived const& GetParent(IntrusiveListNode const& node) { + return *reinterpret_cast<const Derived*>( + reinterpret_cast<const char*>(std::addressof(node)) - GetOffset()); + } + + static uintptr_t GetOffset() { + return reinterpret_cast<uintptr_t>(std::addressof(reinterpret_cast<Derived*>(0)->*Member)); + } +}; + +template <class Derived> +class IntrusiveListBaseNode : public IntrusiveListNode {}; + +template <class Derived> +class IntrusiveListBaseTraits { +public: + using ListType = IntrusiveList<Derived, IntrusiveListBaseTraits>; + +private: + friend class IntrusiveList<Derived, IntrusiveListBaseTraits>; + + static constexpr IntrusiveListNode& GetNode(Derived& parent) { + return static_cast<IntrusiveListNode&>( + static_cast<IntrusiveListBaseNode<Derived>&>(parent)); + } + + static constexpr IntrusiveListNode const& GetNode(Derived const& parent) { + return static_cast<const IntrusiveListNode&>( + static_cast<const IntrusiveListBaseNode<Derived>&>(parent)); + } + + static constexpr Derived& GetParent(IntrusiveListNode& node) { + return static_cast<Derived&>(static_cast<IntrusiveListBaseNode<Derived>&>(node)); + } + + static constexpr Derived const& GetParent(IntrusiveListNode const& node) { + return static_cast<const Derived&>( + static_cast<const IntrusiveListBaseNode<Derived>&>(node)); + } +}; + +} // namespace Common diff --git a/src/common/settings.cpp b/src/common/settings.cpp index 84955030b..174460c5e 100644 --- a/src/common/settings.cpp +++ b/src/common/settings.cpp @@ -45,6 +45,7 @@ void LogSettings() { log_setting("System_LanguageIndex", values.language_index.GetValue()); log_setting("System_RegionIndex", values.region_index.GetValue()); log_setting("System_TimeZoneIndex", values.time_zone_index.GetValue()); + log_setting("System_UnsafeMemoryLayout", values.use_unsafe_extended_memory_layout.GetValue()); log_setting("Core_UseMultiCore", values.use_multi_core.GetValue()); log_setting("CPU_Accuracy", values.cpu_accuracy.GetValue()); log_setting("Renderer_UseResolutionScaling", values.resolution_setup.GetValue()); @@ -191,7 +192,7 @@ void RestoreGlobalState(bool is_powered_on) { // Core values.use_multi_core.SetGlobal(true); - values.use_extended_memory_layout.SetGlobal(true); + values.use_unsafe_extended_memory_layout.SetGlobal(true); // CPU values.cpu_accuracy.SetGlobal(true); @@ -205,6 +206,7 @@ void RestoreGlobalState(bool is_powered_on) { // Renderer values.fsr_sharpening_slider.SetGlobal(true); values.renderer_backend.SetGlobal(true); + values.async_presentation.SetGlobal(true); values.renderer_force_max_clock.SetGlobal(true); values.vulkan_device.SetGlobal(true); values.fullscreen_mode.SetGlobal(true); @@ -225,7 +227,6 @@ void RestoreGlobalState(bool is_powered_on) { values.shader_backend.SetGlobal(true); values.use_asynchronous_shaders.SetGlobal(true); values.use_fast_gpu_time.SetGlobal(true); - values.use_pessimistic_flushes.SetGlobal(true); values.use_vulkan_driver_pipeline_cache.SetGlobal(true); values.bg_red.SetGlobal(true); values.bg_green.SetGlobal(true); diff --git a/src/common/settings.h b/src/common/settings.h index b77a1580a..55200c36f 100644 --- a/src/common/settings.h +++ b/src/common/settings.h @@ -388,7 +388,8 @@ struct Values { // Core SwitchableSetting<bool> use_multi_core{true, "use_multi_core"}; - SwitchableSetting<bool> use_extended_memory_layout{false, "use_extended_memory_layout"}; + SwitchableSetting<bool> use_unsafe_extended_memory_layout{false, + "use_unsafe_extended_memory_layout"}; // Cpu SwitchableSetting<CPUAccuracy, true> cpu_accuracy{CPUAccuracy::Auto, CPUAccuracy::Auto, @@ -422,6 +423,7 @@ struct Values { // Renderer SwitchableSetting<RendererBackend, true> renderer_backend{ RendererBackend::Vulkan, RendererBackend::OpenGL, RendererBackend::Null, "backend"}; + SwitchableSetting<bool> async_presentation{false, "async_presentation"}; SwitchableSetting<bool> renderer_force_max_clock{false, "force_max_clock"}; Setting<bool> renderer_debug{false, "debug"}; Setting<bool> renderer_shader_feedback{false, "shader_feedback"}; @@ -459,7 +461,6 @@ struct Values { ShaderBackend::SPIRV, "shader_backend"}; SwitchableSetting<bool> use_asynchronous_shaders{false, "use_asynchronous_shaders"}; SwitchableSetting<bool> use_fast_gpu_time{true, "use_fast_gpu_time"}; - SwitchableSetting<bool> use_pessimistic_flushes{false, "use_pessimistic_flushes"}; SwitchableSetting<bool> use_vulkan_driver_pipeline_cache{true, "use_vulkan_driver_pipeline_cache"}; diff --git a/src/core/core.cpp b/src/core/core.cpp index caa6a77be..06fba4ce5 100644 --- a/src/core/core.cpp +++ b/src/core/core.cpp @@ -137,7 +137,7 @@ struct System::Impl { device_memory = std::make_unique<Core::DeviceMemory>(); is_multicore = Settings::values.use_multi_core.GetValue(); - extended_memory_layout = Settings::values.use_extended_memory_layout.GetValue(); + extended_memory_layout = Settings::values.use_unsafe_extended_memory_layout.GetValue(); core_timing.SetMulticore(is_multicore); core_timing.Initialize([&system]() { system.RegisterHostThread(); }); @@ -169,7 +169,7 @@ struct System::Impl { void ReinitializeIfNecessary(System& system) { const bool must_reinitialize = is_multicore != Settings::values.use_multi_core.GetValue() || - extended_memory_layout != Settings::values.use_extended_memory_layout.GetValue(); + extended_memory_layout != Settings::values.use_unsafe_extended_memory_layout.GetValue(); if (!must_reinitialize) { return; @@ -178,7 +178,7 @@ struct System::Impl { LOG_DEBUG(Kernel, "Re-initializing"); is_multicore = Settings::values.use_multi_core.GetValue(); - extended_memory_layout = Settings::values.use_extended_memory_layout.GetValue(); + extended_memory_layout = Settings::values.use_unsafe_extended_memory_layout.GetValue(); Initialize(system); } @@ -293,6 +293,7 @@ struct System::Impl { ASSERT(Kernel::KProcess::Initialize(main_process, system, "main", Kernel::KProcess::ProcessType::Userland, resource_limit) .IsSuccess()); + Kernel::KProcess::Register(system.Kernel(), main_process); kernel.MakeApplicationProcess(main_process); const auto [load_result, load_parameters] = app_loader->Load(*main_process, system); if (load_result != Loader::ResultStatus::Success) { diff --git a/src/core/hle/kernel/board/nintendo/nx/k_system_control.cpp b/src/core/hle/kernel/board/nintendo/nx/k_system_control.cpp index 36d0d20d2..49bdc671e 100644 --- a/src/core/hle/kernel/board/nintendo/nx/k_system_control.cpp +++ b/src/core/hle/kernel/board/nintendo/nx/k_system_control.cpp @@ -35,12 +35,13 @@ namespace { using namespace Common::Literals; u32 GetMemorySizeForInit() { - return Settings::values.use_extended_memory_layout ? Smc::MemorySize_8GB : Smc::MemorySize_4GB; + return Settings::values.use_unsafe_extended_memory_layout ? Smc::MemorySize_8GB + : Smc::MemorySize_4GB; } Smc::MemoryArrangement GetMemoryArrangeForInit() { - return Settings::values.use_extended_memory_layout ? Smc::MemoryArrangement_8GB - : Smc::MemoryArrangement_4GB; + return Settings::values.use_unsafe_extended_memory_layout ? Smc::MemoryArrangement_8GB + : Smc::MemoryArrangement_4GB; } } // namespace diff --git a/src/core/hle/kernel/k_auto_object.h b/src/core/hle/kernel/k_auto_object.h index 9b71fe371..f384b1568 100644 --- a/src/core/hle/kernel/k_auto_object.h +++ b/src/core/hle/kernel/k_auto_object.h @@ -182,8 +182,8 @@ public: explicit KAutoObjectWithList(KernelCore& kernel) : KAutoObject(kernel) {} static int Compare(const KAutoObjectWithList& lhs, const KAutoObjectWithList& rhs) { - const u64 lid = lhs.GetId(); - const u64 rid = rhs.GetId(); + const uintptr_t lid = reinterpret_cast<uintptr_t>(std::addressof(lhs)); + const uintptr_t rid = reinterpret_cast<uintptr_t>(std::addressof(rhs)); if (lid < rid) { return -1; diff --git a/src/core/hle/kernel/k_event_info.h b/src/core/hle/kernel/k_event_info.h index 25b3ff594..eacfa5dc6 100644 --- a/src/core/hle/kernel/k_event_info.h +++ b/src/core/hle/kernel/k_event_info.h @@ -5,14 +5,15 @@ #include <array> -#include <boost/intrusive/list.hpp> +#include "common/intrusive_list.h" #include "core/hle/kernel/slab_helpers.h" #include "core/hle/kernel/svc_types.h" namespace Kernel { -class KEventInfo : public KSlabAllocated<KEventInfo>, public boost::intrusive::list_base_hook<> { +class KEventInfo : public KSlabAllocated<KEventInfo>, + public Common::IntrusiveListBaseNode<KEventInfo> { public: struct InfoCreateThread { u32 thread_id{}; diff --git a/src/core/hle/kernel/k_object_name.h b/src/core/hle/kernel/k_object_name.h index 2d97fc777..a8876fe37 100644 --- a/src/core/hle/kernel/k_object_name.h +++ b/src/core/hle/kernel/k_object_name.h @@ -5,7 +5,8 @@ #include <array> #include <memory> -#include <boost/intrusive/list.hpp> + +#include "common/intrusive_list.h" #include "core/hle/kernel/k_light_lock.h" #include "core/hle/kernel/slab_helpers.h" @@ -15,13 +16,14 @@ namespace Kernel { class KObjectNameGlobalData; -class KObjectName : public KSlabAllocated<KObjectName>, public boost::intrusive::list_base_hook<> { +class KObjectName : public KSlabAllocated<KObjectName>, + public Common::IntrusiveListBaseNode<KObjectName> { public: explicit KObjectName(KernelCore&) {} virtual ~KObjectName() = default; static constexpr size_t NameLengthMax = 12; - using List = boost::intrusive::list<KObjectName>; + using List = Common::IntrusiveListBaseTraits<KObjectName>::ListType; static Result NewFromName(KernelCore& kernel, KAutoObject* obj, const char* name); static Result Delete(KernelCore& kernel, KAutoObject* obj, const char* name); diff --git a/src/core/hle/kernel/k_server_port.h b/src/core/hle/kernel/k_server_port.h index 21c040e62..625280290 100644 --- a/src/core/hle/kernel/k_server_port.h +++ b/src/core/hle/kernel/k_server_port.h @@ -7,7 +7,7 @@ #include <string> #include <utility> -#include <boost/intrusive/list.hpp> +#include "common/intrusive_list.h" #include "core/hle/kernel/k_server_session.h" #include "core/hle/kernel/k_synchronization_object.h" @@ -42,7 +42,7 @@ public: bool IsSignaled() const override; private: - using SessionList = boost::intrusive::list<KServerSession>; + using SessionList = Common::IntrusiveListBaseTraits<KServerSession>::ListType; void CleanupSessions(); diff --git a/src/core/hle/kernel/k_server_session.h b/src/core/hle/kernel/k_server_session.h index 5ee02f556..403891919 100644 --- a/src/core/hle/kernel/k_server_session.h +++ b/src/core/hle/kernel/k_server_session.h @@ -8,7 +8,7 @@ #include <string> #include <utility> -#include <boost/intrusive/list.hpp> +#include "common/intrusive_list.h" #include "core/hle/kernel/k_light_lock.h" #include "core/hle/kernel/k_session_request.h" @@ -27,7 +27,7 @@ class KSession; class KThread; class KServerSession final : public KSynchronizationObject, - public boost::intrusive::list_base_hook<> { + public Common::IntrusiveListBaseNode<KServerSession> { KERNEL_AUTOOBJECT_TRAITS(KServerSession, KSynchronizationObject); friend class ServiceThread; @@ -67,7 +67,8 @@ private: KSession* m_parent{}; /// List of threads which are pending a reply. - boost::intrusive::list<KSessionRequest> m_request_list{}; + using RequestList = Common::IntrusiveListBaseTraits<KSessionRequest>::ListType; + RequestList m_request_list{}; KSessionRequest* m_current_request{}; KLightLock m_lock; diff --git a/src/core/hle/kernel/k_session_request.h b/src/core/hle/kernel/k_session_request.h index b5f04907b..283669e0a 100644 --- a/src/core/hle/kernel/k_session_request.h +++ b/src/core/hle/kernel/k_session_request.h @@ -5,6 +5,8 @@ #include <array> +#include "common/intrusive_list.h" + #include "core/hle/kernel/k_auto_object.h" #include "core/hle/kernel/k_event.h" #include "core/hle/kernel/k_memory_block.h" @@ -16,7 +18,7 @@ namespace Kernel { class KSessionRequest final : public KSlabAllocated<KSessionRequest>, public KAutoObject, - public boost::intrusive::list_base_hook<> { + public Common::IntrusiveListBaseNode<KSessionRequest> { KERNEL_AUTOOBJECT_TRAITS(KSessionRequest, KAutoObject); public: diff --git a/src/core/hle/kernel/k_shared_memory_info.h b/src/core/hle/kernel/k_shared_memory_info.h index 75b73ba39..2d8ff20d6 100644 --- a/src/core/hle/kernel/k_shared_memory_info.h +++ b/src/core/hle/kernel/k_shared_memory_info.h @@ -3,7 +3,7 @@ #pragma once -#include <boost/intrusive/list.hpp> +#include "common/intrusive_list.h" #include "core/hle/kernel/slab_helpers.h" @@ -12,7 +12,7 @@ namespace Kernel { class KSharedMemory; class KSharedMemoryInfo final : public KSlabAllocated<KSharedMemoryInfo>, - public boost::intrusive::list_base_hook<> { + public Common::IntrusiveListBaseNode<KSharedMemoryInfo> { public: explicit KSharedMemoryInfo(KernelCore&) {} diff --git a/src/core/hle/kernel/k_thread.h b/src/core/hle/kernel/k_thread.h index 9c1a41128..f9814ac8f 100644 --- a/src/core/hle/kernel/k_thread.h +++ b/src/core/hle/kernel/k_thread.h @@ -12,7 +12,7 @@ #include <utility> #include <vector> -#include <boost/intrusive/list.hpp> +#include "common/intrusive_list.h" #include "common/intrusive_red_black_tree.h" #include "common/spin_lock.h" @@ -119,7 +119,7 @@ s32 GetCurrentCoreId(KernelCore& kernel); Core::Memory::Memory& GetCurrentMemory(KernelCore& kernel); class KThread final : public KAutoObjectWithSlabHeapAndContainer<KThread, KWorkerTask>, - public boost::intrusive::list_base_hook<>, + public Common::IntrusiveListBaseNode<KThread>, public KTimerTask { KERNEL_AUTOOBJECT_TRAITS(KThread, KSynchronizationObject); @@ -138,7 +138,7 @@ public: public: using ThreadContext32 = Core::ARM_Interface::ThreadContext32; using ThreadContext64 = Core::ARM_Interface::ThreadContext64; - using WaiterList = boost::intrusive::list<KThread>; + using WaiterList = Common::IntrusiveListBaseTraits<KThread>::ListType; /** * Gets the thread's current priority @@ -750,8 +750,9 @@ private: ConditionVariableThreadTreeTraits::TreeType<LockWithPriorityInheritanceComparator>; public: - class LockWithPriorityInheritanceInfo : public KSlabAllocated<LockWithPriorityInheritanceInfo>, - public boost::intrusive::list_base_hook<> { + class LockWithPriorityInheritanceInfo + : public KSlabAllocated<LockWithPriorityInheritanceInfo>, + public Common::IntrusiveListBaseNode<LockWithPriorityInheritanceInfo> { public: explicit LockWithPriorityInheritanceInfo(KernelCore&) {} @@ -839,7 +840,7 @@ public: private: using LockWithPriorityInheritanceInfoList = - boost::intrusive::list<LockWithPriorityInheritanceInfo>; + Common::IntrusiveListBaseTraits<LockWithPriorityInheritanceInfo>::ListType; ConditionVariableThreadTree* m_condvar_tree{}; u64 m_condvar_key{}; diff --git a/src/core/hle/kernel/kernel.cpp b/src/core/hle/kernel/kernel.cpp index 4f3366c9d..f33600ca5 100644 --- a/src/core/hle/kernel/kernel.cpp +++ b/src/core/hle/kernel/kernel.cpp @@ -95,7 +95,7 @@ struct KernelCore::Impl { pt_heap_region.GetSize()); } - InitializeHackSharedMemory(); + InitializeHackSharedMemory(kernel); RegisterHostThread(nullptr); } @@ -216,10 +216,12 @@ struct KernelCore::Impl { auto* main_thread{Kernel::KThread::Create(system.Kernel())}; main_thread->SetCurrentCore(core); ASSERT(Kernel::KThread::InitializeMainThread(system, main_thread, core).IsSuccess()); + KThread::Register(system.Kernel(), main_thread); auto* idle_thread{Kernel::KThread::Create(system.Kernel())}; idle_thread->SetCurrentCore(core); ASSERT(Kernel::KThread::InitializeIdleThread(system, idle_thread, core).IsSuccess()); + KThread::Register(system.Kernel(), idle_thread); schedulers[i]->Initialize(main_thread, idle_thread, core); } @@ -230,6 +232,7 @@ struct KernelCore::Impl { const Core::Timing::CoreTiming& core_timing) { system_resource_limit = KResourceLimit::Create(system.Kernel()); system_resource_limit->Initialize(&core_timing); + KResourceLimit::Register(kernel, system_resource_limit); const auto sizes{memory_layout->GetTotalAndKernelMemorySizes()}; const auto total_size{sizes.first}; @@ -355,6 +358,7 @@ struct KernelCore::Impl { ASSERT(KThread::InitializeHighPriorityThread(system, shutdown_threads[core_id], {}, {}, core_id) .IsSuccess()); + KThread::Register(system.Kernel(), shutdown_threads[core_id]); } } @@ -729,7 +733,7 @@ struct KernelCore::Impl { memory_manager->Initialize(management_region.GetAddress(), management_region.GetSize()); } - void InitializeHackSharedMemory() { + void InitializeHackSharedMemory(KernelCore& kernel) { // Setup memory regions for emulated processes // TODO(bunnei): These should not be hardcoded regions initialized within the kernel constexpr std::size_t hid_size{0x40000}; @@ -746,14 +750,23 @@ struct KernelCore::Impl { hid_shared_mem->Initialize(system.DeviceMemory(), nullptr, Svc::MemoryPermission::None, Svc::MemoryPermission::Read, hid_size); + KSharedMemory::Register(kernel, hid_shared_mem); + font_shared_mem->Initialize(system.DeviceMemory(), nullptr, Svc::MemoryPermission::None, Svc::MemoryPermission::Read, font_size); + KSharedMemory::Register(kernel, font_shared_mem); + irs_shared_mem->Initialize(system.DeviceMemory(), nullptr, Svc::MemoryPermission::None, Svc::MemoryPermission::Read, irs_size); + KSharedMemory::Register(kernel, irs_shared_mem); + time_shared_mem->Initialize(system.DeviceMemory(), nullptr, Svc::MemoryPermission::None, Svc::MemoryPermission::Read, time_size); + KSharedMemory::Register(kernel, time_shared_mem); + hidbus_shared_mem->Initialize(system.DeviceMemory(), nullptr, Svc::MemoryPermission::None, Svc::MemoryPermission::Read, hidbus_size); + KSharedMemory::Register(kernel, hidbus_shared_mem); } std::mutex registered_objects_lock; @@ -1072,12 +1085,15 @@ static std::jthread RunHostThreadFunc(KernelCore& kernel, KProcess* process, // Commit the thread reservation. thread_reservation.Commit(); + // Register the thread. + KThread::Register(kernel, thread); + return std::jthread( [&kernel, thread, thread_name{std::move(thread_name)}, func{std::move(func)}] { // Set the thread name. Common::SetCurrentThreadName(thread_name.c_str()); - // Register the thread. + // Set the thread as current. kernel.RegisterHostThread(thread); // Run the callback. @@ -1099,6 +1115,9 @@ std::jthread KernelCore::RunOnHostCoreProcess(std::string&& process_name, // Ensure that we don't hold onto any extra references. SCOPE_EXIT({ process->Close(); }); + // Register the new process. + KProcess::Register(*this, process); + // Run the host thread. return RunHostThreadFunc(*this, process, std::move(process_name), std::move(func)); } @@ -1124,6 +1143,9 @@ void KernelCore::RunOnGuestCoreProcess(std::string&& process_name, std::function // Ensure that we don't hold onto any extra references. SCOPE_EXIT({ process->Close(); }); + // Register the new process. + KProcess::Register(*this, process); + // Reserve a new thread from the process resource limit. KScopedResourceReservation thread_reservation(process, LimitableResource::ThreadCountMax); ASSERT(thread_reservation.Succeeded()); @@ -1136,6 +1158,9 @@ void KernelCore::RunOnGuestCoreProcess(std::string&& process_name, std::function // Commit the thread reservation. thread_reservation.Commit(); + // Register the new thread. + KThread::Register(*this, thread); + // Begin running the thread. ASSERT(R_SUCCEEDED(thread->Run())); } diff --git a/src/core/hle/service/ipc_helpers.h b/src/core/hle/service/ipc_helpers.h index e4cb4e1f2..0e222362e 100644 --- a/src/core/hle/service/ipc_helpers.h +++ b/src/core/hle/service/ipc_helpers.h @@ -156,6 +156,7 @@ public: auto* session = Kernel::KSession::Create(kernel); session->Initialize(nullptr, 0); + Kernel::KSession::Register(kernel, session); auto next_manager = std::make_shared<Service::SessionRequestManager>( kernel, manager->GetServerManager()); diff --git a/src/core/hle/service/kernel_helpers.cpp b/src/core/hle/service/kernel_helpers.cpp index a39ce5212..6a313a03b 100644 --- a/src/core/hle/service/kernel_helpers.cpp +++ b/src/core/hle/service/kernel_helpers.cpp @@ -25,6 +25,9 @@ ServiceContext::ServiceContext(Core::System& system_, std::string name_) Kernel::KProcess::ProcessType::KernelInternal, kernel.GetSystemResourceLimit()) .IsSuccess()); + + // Register the process. + Kernel::KProcess::Register(kernel, process); process_created = true; } diff --git a/src/core/hle/service/mutex.cpp b/src/core/hle/service/mutex.cpp index 07589a0f0..b0ff71d1b 100644 --- a/src/core/hle/service/mutex.cpp +++ b/src/core/hle/service/mutex.cpp @@ -12,6 +12,9 @@ Mutex::Mutex(Core::System& system) : m_system(system) { m_event = Kernel::KEvent::Create(system.Kernel()); m_event->Initialize(nullptr); + // Register the event. + Kernel::KEvent::Register(system.Kernel(), m_event); + ASSERT(R_SUCCEEDED(m_event->Signal())); } diff --git a/src/core/hle/service/server_manager.cpp b/src/core/hle/service/server_manager.cpp index 6b4a1291e..156bc27d8 100644 --- a/src/core/hle/service/server_manager.cpp +++ b/src/core/hle/service/server_manager.cpp @@ -33,6 +33,9 @@ ServerManager::ServerManager(Core::System& system) : m_system{system}, m_serve_m // Initialize event. m_event = Kernel::KEvent::Create(system.Kernel()); m_event->Initialize(nullptr); + + // Register event. + Kernel::KEvent::Register(system.Kernel(), m_event); } ServerManager::~ServerManager() { @@ -160,6 +163,9 @@ Result ServerManager::ManageDeferral(Kernel::KEvent** out_event) { // Initialize the event. m_deferral_event->Initialize(nullptr); + // Register the event. + Kernel::KEvent::Register(m_system.Kernel(), m_deferral_event); + // Set the output. *out_event = m_deferral_event; diff --git a/src/core/hle/service/sm/sm.cpp b/src/core/hle/service/sm/sm.cpp index c45be5726..1608fa24c 100644 --- a/src/core/hle/service/sm/sm.cpp +++ b/src/core/hle/service/sm/sm.cpp @@ -64,6 +64,9 @@ Result ServiceManager::RegisterService(std::string name, u32 max_sessions, auto* port = Kernel::KPort::Create(kernel); port->Initialize(ServerSessionCountMax, false, 0); + // Register the port. + Kernel::KPort::Register(kernel, port); + service_ports.emplace(name, port); registered_services.emplace(name, handler); if (deferral_event) { diff --git a/src/core/hle/service/sm/sm_controller.cpp b/src/core/hle/service/sm/sm_controller.cpp index 419c1df2b..7dce28fe0 100644 --- a/src/core/hle/service/sm/sm_controller.cpp +++ b/src/core/hle/service/sm/sm_controller.cpp @@ -49,6 +49,9 @@ void Controller::CloneCurrentObject(HLERequestContext& ctx) { // Commit the session reservation. session_reservation.Commit(); + // Register the session. + Kernel::KSession::Register(system.Kernel(), session); + // Register with server manager. session_manager->GetServerManager().RegisterSession(&session->GetServerSession(), session_manager); diff --git a/src/core/memory.cpp b/src/core/memory.cpp index 432310632..a9667463f 100644 --- a/src/core/memory.cpp +++ b/src/core/memory.cpp @@ -462,7 +462,7 @@ struct Memory::Impl { } if (Settings::IsFastmemEnabled()) { - const bool is_read_enable = Settings::IsGPULevelHigh() || !cached; + const bool is_read_enable = !Settings::IsGPULevelExtreme() || !cached; system.DeviceMemory().buffer.Protect(vaddr, size, is_read_enable, !cached); } diff --git a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp index 0cd87a48f..fee510f7b 100644 --- a/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp +++ b/src/shader_recompiler/backend/spirv/emit_spirv_context_get_set.cpp @@ -473,7 +473,8 @@ void EmitSetFragColor(EmitContext& ctx, u32 index, u32 component, Id value) { } void EmitSetSampleMask(EmitContext& ctx, Id value) { - ctx.OpStore(ctx.sample_mask, value); + const Id pointer{ctx.OpAccessChain(ctx.output_u32, ctx.sample_mask, ctx.u32_zero_value)}; + ctx.OpStore(pointer, value); } void EmitSetFragDepth(EmitContext& ctx, Id value) { diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp index d48d4860e..47739794f 100644 --- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp +++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp @@ -1572,7 +1572,8 @@ void EmitContext::DefineOutputs(const IR::Program& program) { Decorate(frag_depth, spv::Decoration::BuiltIn, spv::BuiltIn::FragDepth); } if (info.stores_sample_mask) { - sample_mask = DefineOutput(*this, U32[1], std::nullopt); + const Id array_type{TypeArray(U32[1], Const(1U))}; + sample_mask = DefineOutput(*this, array_type, std::nullopt); Decorate(sample_mask, spv::Decoration::BuiltIn, spv::BuiltIn::SampleMask); } break; diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt index 39b774c98..1e158f375 100644 --- a/src/tests/CMakeLists.txt +++ b/src/tests/CMakeLists.txt @@ -15,7 +15,7 @@ add_executable(tests core/core_timing.cpp core/internal_network/network.cpp precompiled_headers.h - video_core/buffer_base.cpp + video_core/memory_tracker.cpp input_common/calibration_configuration_job.cpp ) diff --git a/src/tests/video_core/buffer_base.cpp b/src/tests/video_core/buffer_base.cpp deleted file mode 100644 index 734dbf4b6..000000000 --- a/src/tests/video_core/buffer_base.cpp +++ /dev/null @@ -1,549 +0,0 @@ -// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#include <stdexcept> -#include <unordered_map> - -#include <catch2/catch_test_macros.hpp> - -#include "common/alignment.h" -#include "common/common_types.h" -#include "video_core/buffer_cache/buffer_base.h" - -namespace { -using VideoCommon::BufferBase; -using Range = std::pair<u64, u64>; - -constexpr u64 PAGE = 4096; -constexpr u64 WORD = 4096 * 64; - -constexpr VAddr c = 0x1328914000; - -class RasterizerInterface { -public: - void UpdatePagesCachedCount(VAddr addr, u64 size, int delta) { - const u64 page_start{addr >> Core::Memory::YUZU_PAGEBITS}; - const u64 page_end{(addr + size + Core::Memory::YUZU_PAGESIZE - 1) >> - Core::Memory::YUZU_PAGEBITS}; - for (u64 page = page_start; page < page_end; ++page) { - int& value = page_table[page]; - value += delta; - if (value < 0) { - throw std::logic_error{"negative page"}; - } - if (value == 0) { - page_table.erase(page); - } - } - } - - [[nodiscard]] int Count(VAddr addr) const noexcept { - const auto it = page_table.find(addr >> Core::Memory::YUZU_PAGEBITS); - return it == page_table.end() ? 0 : it->second; - } - - [[nodiscard]] unsigned Count() const noexcept { - unsigned count = 0; - for (const auto& [index, value] : page_table) { - count += value; - } - return count; - } - -private: - std::unordered_map<u64, int> page_table; -}; -} // Anonymous namespace - -TEST_CASE("BufferBase: Small buffer", "[video_core]") { - RasterizerInterface rasterizer; - BufferBase buffer(rasterizer, c, WORD); - REQUIRE(rasterizer.Count() == 0); - buffer.UnmarkRegionAsCpuModified(c, WORD); - REQUIRE(rasterizer.Count() == WORD / PAGE); - REQUIRE(buffer.ModifiedCpuRegion(c, WORD) == Range{0, 0}); - - buffer.MarkRegionAsCpuModified(c + PAGE, 1); - REQUIRE(buffer.ModifiedCpuRegion(c, WORD) == Range{PAGE * 1, PAGE * 2}); -} - -TEST_CASE("BufferBase: Large buffer", "[video_core]") { - RasterizerInterface rasterizer; - BufferBase buffer(rasterizer, c, WORD * 32); - buffer.UnmarkRegionAsCpuModified(c, WORD * 32); - buffer.MarkRegionAsCpuModified(c + 4096, WORD * 4); - REQUIRE(buffer.ModifiedCpuRegion(c, WORD + PAGE * 2) == Range{PAGE, WORD + PAGE * 2}); - REQUIRE(buffer.ModifiedCpuRegion(c + PAGE * 2, PAGE * 6) == Range{PAGE * 2, PAGE * 8}); - REQUIRE(buffer.ModifiedCpuRegion(c, WORD * 32) == Range{PAGE, WORD * 4 + PAGE}); - REQUIRE(buffer.ModifiedCpuRegion(c + WORD * 4, PAGE) == Range{WORD * 4, WORD * 4 + PAGE}); - REQUIRE(buffer.ModifiedCpuRegion(c + WORD * 3 + PAGE * 63, PAGE) == - Range{WORD * 3 + PAGE * 63, WORD * 4}); - - buffer.MarkRegionAsCpuModified(c + WORD * 5 + PAGE * 6, PAGE); - buffer.MarkRegionAsCpuModified(c + WORD * 5 + PAGE * 8, PAGE); - REQUIRE(buffer.ModifiedCpuRegion(c + WORD * 5, WORD) == - Range{WORD * 5 + PAGE * 6, WORD * 5 + PAGE * 9}); - - buffer.UnmarkRegionAsCpuModified(c + WORD * 5 + PAGE * 8, PAGE); - REQUIRE(buffer.ModifiedCpuRegion(c + WORD * 5, WORD) == - Range{WORD * 5 + PAGE * 6, WORD * 5 + PAGE * 7}); - - buffer.MarkRegionAsCpuModified(c + PAGE, WORD * 31 + PAGE * 63); - REQUIRE(buffer.ModifiedCpuRegion(c, WORD * 32) == Range{PAGE, WORD * 32}); - - buffer.UnmarkRegionAsCpuModified(c + PAGE * 4, PAGE); - buffer.UnmarkRegionAsCpuModified(c + PAGE * 6, PAGE); - - buffer.UnmarkRegionAsCpuModified(c, WORD * 32); - REQUIRE(buffer.ModifiedCpuRegion(c, WORD * 32) == Range{0, 0}); -} - -TEST_CASE("BufferBase: Rasterizer counting", "[video_core]") { - RasterizerInterface rasterizer; - BufferBase buffer(rasterizer, c, PAGE * 2); - REQUIRE(rasterizer.Count() == 0); - buffer.UnmarkRegionAsCpuModified(c, PAGE); - REQUIRE(rasterizer.Count() == 1); - buffer.MarkRegionAsCpuModified(c, PAGE * 2); - REQUIRE(rasterizer.Count() == 0); - buffer.UnmarkRegionAsCpuModified(c, PAGE); - buffer.UnmarkRegionAsCpuModified(c + PAGE, PAGE); - REQUIRE(rasterizer.Count() == 2); - buffer.MarkRegionAsCpuModified(c, PAGE * 2); - REQUIRE(rasterizer.Count() == 0); -} - -TEST_CASE("BufferBase: Basic range", "[video_core]") { - RasterizerInterface rasterizer; - BufferBase buffer(rasterizer, c, WORD); - buffer.UnmarkRegionAsCpuModified(c, WORD); - buffer.MarkRegionAsCpuModified(c, PAGE); - int num = 0; - buffer.ForEachUploadRange(c, WORD, [&](u64 offset, u64 size) { - REQUIRE(offset == 0U); - REQUIRE(size == PAGE); - ++num; - }); - REQUIRE(num == 1U); -} - -TEST_CASE("BufferBase: Border upload", "[video_core]") { - RasterizerInterface rasterizer; - BufferBase buffer(rasterizer, c, WORD * 2); - buffer.UnmarkRegionAsCpuModified(c, WORD * 2); - buffer.MarkRegionAsCpuModified(c + WORD - PAGE, PAGE * 2); - buffer.ForEachUploadRange(c, WORD * 2, [](u64 offset, u64 size) { - REQUIRE(offset == WORD - PAGE); - REQUIRE(size == PAGE * 2); - }); -} - -TEST_CASE("BufferBase: Border upload range", "[video_core]") { - RasterizerInterface rasterizer; - BufferBase buffer(rasterizer, c, WORD * 2); - buffer.UnmarkRegionAsCpuModified(c, WORD * 2); - buffer.MarkRegionAsCpuModified(c + WORD - PAGE, PAGE * 2); - buffer.ForEachUploadRange(c + WORD - PAGE, PAGE * 2, [](u64 offset, u64 size) { - REQUIRE(offset == WORD - PAGE); - REQUIRE(size == PAGE * 2); - }); - buffer.MarkRegionAsCpuModified(c + WORD - PAGE, PAGE * 2); - buffer.ForEachUploadRange(c + WORD - PAGE, PAGE, [](u64 offset, u64 size) { - REQUIRE(offset == WORD - PAGE); - REQUIRE(size == PAGE); - }); - buffer.ForEachUploadRange(c + WORD, PAGE, [](u64 offset, u64 size) { - REQUIRE(offset == WORD); - REQUIRE(size == PAGE); - }); -} - -TEST_CASE("BufferBase: Border upload partial range", "[video_core]") { - RasterizerInterface rasterizer; - BufferBase buffer(rasterizer, c, WORD * 2); - buffer.UnmarkRegionAsCpuModified(c, WORD * 2); - buffer.MarkRegionAsCpuModified(c + WORD - PAGE, PAGE * 2); - buffer.ForEachUploadRange(c + WORD - 1, 2, [](u64 offset, u64 size) { - REQUIRE(offset == WORD - PAGE); - REQUIRE(size == PAGE * 2); - }); - buffer.MarkRegionAsCpuModified(c + WORD - PAGE, PAGE * 2); - buffer.ForEachUploadRange(c + WORD - 1, 1, [](u64 offset, u64 size) { - REQUIRE(offset == WORD - PAGE); - REQUIRE(size == PAGE); - }); - buffer.ForEachUploadRange(c + WORD + 50, 1, [](u64 offset, u64 size) { - REQUIRE(offset == WORD); - REQUIRE(size == PAGE); - }); -} - -TEST_CASE("BufferBase: Partial word uploads", "[video_core]") { - RasterizerInterface rasterizer; - BufferBase buffer(rasterizer, c, 0x9d000); - int num = 0; - buffer.ForEachUploadRange(c, WORD, [&](u64 offset, u64 size) { - REQUIRE(offset == 0U); - REQUIRE(size == WORD); - ++num; - }); - REQUIRE(num == 1); - buffer.ForEachUploadRange(c + WORD, WORD, [&](u64 offset, u64 size) { - REQUIRE(offset == WORD); - REQUIRE(size == WORD); - ++num; - }); - REQUIRE(num == 2); - buffer.ForEachUploadRange(c + 0x79000, 0x24000, [&](u64 offset, u64 size) { - REQUIRE(offset == WORD * 2); - REQUIRE(size == PAGE * 0x1d); - ++num; - }); - REQUIRE(num == 3); -} - -TEST_CASE("BufferBase: Partial page upload", "[video_core]") { - RasterizerInterface rasterizer; - BufferBase buffer(rasterizer, c, WORD); - buffer.UnmarkRegionAsCpuModified(c, WORD); - int num = 0; - buffer.MarkRegionAsCpuModified(c + PAGE * 2, PAGE); - buffer.MarkRegionAsCpuModified(c + PAGE * 9, PAGE); - buffer.ForEachUploadRange(c, PAGE * 3, [&](u64 offset, u64 size) { - REQUIRE(offset == PAGE * 2); - REQUIRE(size == PAGE); - ++num; - }); - REQUIRE(num == 1); - buffer.ForEachUploadRange(c + PAGE * 7, PAGE * 3, [&](u64 offset, u64 size) { - REQUIRE(offset == PAGE * 9); - REQUIRE(size == PAGE); - ++num; - }); - REQUIRE(num == 2); -} - -TEST_CASE("BufferBase: Partial page upload with multiple words on the right") { - RasterizerInterface rasterizer; - BufferBase buffer(rasterizer, c, WORD * 8); - buffer.UnmarkRegionAsCpuModified(c, WORD * 8); - buffer.MarkRegionAsCpuModified(c + PAGE * 13, WORD * 7); - int num = 0; - buffer.ForEachUploadRange(c + PAGE * 10, WORD * 7, [&](u64 offset, u64 size) { - REQUIRE(offset == PAGE * 13); - REQUIRE(size == WORD * 7 - PAGE * 3); - ++num; - }); - REQUIRE(num == 1); - buffer.ForEachUploadRange(c + PAGE, WORD * 8, [&](u64 offset, u64 size) { - REQUIRE(offset == WORD * 7 + PAGE * 10); - REQUIRE(size == PAGE * 3); - ++num; - }); - REQUIRE(num == 2); -} - -TEST_CASE("BufferBase: Partial page upload with multiple words on the left", "[video_core]") { - RasterizerInterface rasterizer; - BufferBase buffer(rasterizer, c, WORD * 8); - buffer.UnmarkRegionAsCpuModified(c, WORD * 8); - buffer.MarkRegionAsCpuModified(c + PAGE * 13, WORD * 7); - int num = 0; - buffer.ForEachUploadRange(c + PAGE * 16, WORD * 7, [&](u64 offset, u64 size) { - REQUIRE(offset == PAGE * 16); - REQUIRE(size == WORD * 7 - PAGE * 3); - ++num; - }); - REQUIRE(num == 1); - buffer.ForEachUploadRange(c + PAGE, WORD, [&](u64 offset, u64 size) { - REQUIRE(offset == PAGE * 13); - REQUIRE(size == PAGE * 3); - ++num; - }); - REQUIRE(num == 2); -} - -TEST_CASE("BufferBase: Partial page upload with multiple words in the middle", "[video_core]") { - RasterizerInterface rasterizer; - BufferBase buffer(rasterizer, c, WORD * 8); - buffer.UnmarkRegionAsCpuModified(c, WORD * 8); - buffer.MarkRegionAsCpuModified(c + PAGE * 13, PAGE * 140); - int num = 0; - buffer.ForEachUploadRange(c + PAGE * 16, WORD, [&](u64 offset, u64 size) { - REQUIRE(offset == PAGE * 16); - REQUIRE(size == WORD); - ++num; - }); - REQUIRE(num == 1); - buffer.ForEachUploadRange(c, WORD, [&](u64 offset, u64 size) { - REQUIRE(offset == PAGE * 13); - REQUIRE(size == PAGE * 3); - ++num; - }); - REQUIRE(num == 2); - buffer.ForEachUploadRange(c, WORD * 8, [&](u64 offset, u64 size) { - REQUIRE(offset == WORD + PAGE * 16); - REQUIRE(size == PAGE * 73); - ++num; - }); - REQUIRE(num == 3); -} - -TEST_CASE("BufferBase: Empty right bits", "[video_core]") { - RasterizerInterface rasterizer; - BufferBase buffer(rasterizer, c, WORD * 2048); - buffer.UnmarkRegionAsCpuModified(c, WORD * 2048); - buffer.MarkRegionAsCpuModified(c + WORD - PAGE, PAGE * 2); - buffer.ForEachUploadRange(c, WORD * 2048, [](u64 offset, u64 size) { - REQUIRE(offset == WORD - PAGE); - REQUIRE(size == PAGE * 2); - }); -} - -TEST_CASE("BufferBase: Out of bound ranges 1", "[video_core]") { - RasterizerInterface rasterizer; - BufferBase buffer(rasterizer, c, WORD); - buffer.UnmarkRegionAsCpuModified(c, WORD); - buffer.MarkRegionAsCpuModified(c, PAGE); - int num = 0; - buffer.ForEachUploadRange(c - WORD, WORD, [&](u64 offset, u64 size) { ++num; }); - buffer.ForEachUploadRange(c + WORD, WORD, [&](u64 offset, u64 size) { ++num; }); - buffer.ForEachUploadRange(c - PAGE, PAGE, [&](u64 offset, u64 size) { ++num; }); - REQUIRE(num == 0); - buffer.ForEachUploadRange(c - PAGE, PAGE * 2, [&](u64 offset, u64 size) { ++num; }); - REQUIRE(num == 1); - buffer.MarkRegionAsCpuModified(c, WORD); - REQUIRE(rasterizer.Count() == 0); -} - -TEST_CASE("BufferBase: Out of bound ranges 2", "[video_core]") { - RasterizerInterface rasterizer; - BufferBase buffer(rasterizer, c, 0x22000); - REQUIRE_NOTHROW(buffer.UnmarkRegionAsCpuModified(c + 0x22000, PAGE)); - REQUIRE_NOTHROW(buffer.UnmarkRegionAsCpuModified(c + 0x28000, PAGE)); - REQUIRE(rasterizer.Count() == 0); - REQUIRE_NOTHROW(buffer.UnmarkRegionAsCpuModified(c + 0x21100, PAGE - 0x100)); - REQUIRE(rasterizer.Count() == 1); - REQUIRE_NOTHROW(buffer.UnmarkRegionAsCpuModified(c - 0x1000, PAGE * 2)); - buffer.UnmarkRegionAsCpuModified(c - 0x3000, PAGE * 2); - buffer.UnmarkRegionAsCpuModified(c - 0x2000, PAGE * 2); - REQUIRE(rasterizer.Count() == 2); -} - -TEST_CASE("BufferBase: Out of bound ranges 3", "[video_core]") { - RasterizerInterface rasterizer; - BufferBase buffer(rasterizer, c, 0x310720); - buffer.UnmarkRegionAsCpuModified(c, 0x310720); - REQUIRE(rasterizer.Count(c) == 1); - REQUIRE(rasterizer.Count(c + PAGE) == 1); - REQUIRE(rasterizer.Count(c + WORD) == 1); - REQUIRE(rasterizer.Count(c + WORD + PAGE) == 1); -} - -TEST_CASE("BufferBase: Sparse regions 1", "[video_core]") { - RasterizerInterface rasterizer; - BufferBase buffer(rasterizer, c, WORD); - buffer.UnmarkRegionAsCpuModified(c, WORD); - buffer.MarkRegionAsCpuModified(c + PAGE * 1, PAGE); - buffer.MarkRegionAsCpuModified(c + PAGE * 3, PAGE * 4); - buffer.ForEachUploadRange(c, WORD, [i = 0](u64 offset, u64 size) mutable { - static constexpr std::array<u64, 2> offsets{PAGE, PAGE * 3}; - static constexpr std::array<u64, 2> sizes{PAGE, PAGE * 4}; - REQUIRE(offset == offsets.at(i)); - REQUIRE(size == sizes.at(i)); - ++i; - }); -} - -TEST_CASE("BufferBase: Sparse regions 2", "[video_core]") { - RasterizerInterface rasterizer; - BufferBase buffer(rasterizer, c, 0x22000); - buffer.UnmarkRegionAsCpuModified(c, 0x22000); - REQUIRE(rasterizer.Count() == 0x22); - buffer.MarkRegionAsCpuModified(c + PAGE * 0x1B, PAGE); - buffer.MarkRegionAsCpuModified(c + PAGE * 0x21, PAGE); - buffer.ForEachUploadRange(c, WORD, [i = 0](u64 offset, u64 size) mutable { - static constexpr std::array<u64, 2> offsets{PAGE * 0x1B, PAGE * 0x21}; - static constexpr std::array<u64, 2> sizes{PAGE, PAGE}; - REQUIRE(offset == offsets.at(i)); - REQUIRE(size == sizes.at(i)); - ++i; - }); -} - -TEST_CASE("BufferBase: Single page modified range", "[video_core]") { - RasterizerInterface rasterizer; - BufferBase buffer(rasterizer, c, PAGE); - REQUIRE(buffer.IsRegionCpuModified(c, PAGE)); - buffer.UnmarkRegionAsCpuModified(c, PAGE); - REQUIRE(!buffer.IsRegionCpuModified(c, PAGE)); -} - -TEST_CASE("BufferBase: Two page modified range", "[video_core]") { - RasterizerInterface rasterizer; - BufferBase buffer(rasterizer, c, PAGE * 2); - REQUIRE(buffer.IsRegionCpuModified(c, PAGE)); - REQUIRE(buffer.IsRegionCpuModified(c + PAGE, PAGE)); - REQUIRE(buffer.IsRegionCpuModified(c, PAGE * 2)); - buffer.UnmarkRegionAsCpuModified(c, PAGE); - REQUIRE(!buffer.IsRegionCpuModified(c, PAGE)); -} - -TEST_CASE("BufferBase: Multi word modified ranges", "[video_core]") { - for (int offset = 0; offset < 4; ++offset) { - const VAddr address = c + WORD * offset; - RasterizerInterface rasterizer; - BufferBase buffer(rasterizer, address, WORD * 4); - REQUIRE(buffer.IsRegionCpuModified(address, PAGE)); - REQUIRE(buffer.IsRegionCpuModified(address + PAGE * 48, PAGE)); - REQUIRE(buffer.IsRegionCpuModified(address + PAGE * 56, PAGE)); - - buffer.UnmarkRegionAsCpuModified(address + PAGE * 32, PAGE); - REQUIRE(buffer.IsRegionCpuModified(address + PAGE, WORD)); - REQUIRE(buffer.IsRegionCpuModified(address + PAGE * 31, PAGE)); - REQUIRE(!buffer.IsRegionCpuModified(address + PAGE * 32, PAGE)); - REQUIRE(buffer.IsRegionCpuModified(address + PAGE * 33, PAGE)); - REQUIRE(buffer.IsRegionCpuModified(address + PAGE * 31, PAGE * 2)); - REQUIRE(buffer.IsRegionCpuModified(address + PAGE * 32, PAGE * 2)); - - buffer.UnmarkRegionAsCpuModified(address + PAGE * 33, PAGE); - REQUIRE(!buffer.IsRegionCpuModified(address + PAGE * 32, PAGE * 2)); - } -} - -TEST_CASE("BufferBase: Single page in large buffer", "[video_core]") { - RasterizerInterface rasterizer; - BufferBase buffer(rasterizer, c, WORD * 16); - buffer.UnmarkRegionAsCpuModified(c, WORD * 16); - REQUIRE(!buffer.IsRegionCpuModified(c, WORD * 16)); - - buffer.MarkRegionAsCpuModified(c + WORD * 12 + PAGE * 8, PAGE); - REQUIRE(buffer.IsRegionCpuModified(c, WORD * 16)); - REQUIRE(buffer.IsRegionCpuModified(c + WORD * 10, WORD * 2)); - REQUIRE(buffer.IsRegionCpuModified(c + WORD * 11, WORD * 2)); - REQUIRE(buffer.IsRegionCpuModified(c + WORD * 12, WORD * 2)); - REQUIRE(buffer.IsRegionCpuModified(c + WORD * 12 + PAGE * 4, PAGE * 8)); - REQUIRE(buffer.IsRegionCpuModified(c + WORD * 12 + PAGE * 6, PAGE * 8)); - REQUIRE(!buffer.IsRegionCpuModified(c + WORD * 12 + PAGE * 6, PAGE)); - REQUIRE(buffer.IsRegionCpuModified(c + WORD * 12 + PAGE * 7, PAGE * 2)); - REQUIRE(buffer.IsRegionCpuModified(c + WORD * 12 + PAGE * 8, PAGE * 2)); -} - -TEST_CASE("BufferBase: Out of bounds region query") { - RasterizerInterface rasterizer; - BufferBase buffer(rasterizer, c, WORD * 16); - REQUIRE(!buffer.IsRegionCpuModified(c - PAGE, PAGE)); - REQUIRE(!buffer.IsRegionCpuModified(c - PAGE * 2, PAGE)); - REQUIRE(!buffer.IsRegionCpuModified(c + WORD * 16, PAGE)); - REQUIRE(buffer.IsRegionCpuModified(c + WORD * 16 - PAGE, WORD * 64)); - REQUIRE(!buffer.IsRegionCpuModified(c + WORD * 16, WORD * 64)); -} - -TEST_CASE("BufferBase: Wrap word regions") { - RasterizerInterface rasterizer; - BufferBase buffer(rasterizer, c, WORD * 2); - buffer.UnmarkRegionAsCpuModified(c, WORD * 2); - buffer.MarkRegionAsCpuModified(c + PAGE * 63, PAGE * 2); - REQUIRE(buffer.IsRegionCpuModified(c, WORD * 2)); - REQUIRE(!buffer.IsRegionCpuModified(c + PAGE * 62, PAGE)); - REQUIRE(buffer.IsRegionCpuModified(c + PAGE * 63, PAGE)); - REQUIRE(buffer.IsRegionCpuModified(c + PAGE * 64, PAGE)); - REQUIRE(buffer.IsRegionCpuModified(c + PAGE * 63, PAGE * 2)); - REQUIRE(buffer.IsRegionCpuModified(c + PAGE * 63, PAGE * 8)); - REQUIRE(buffer.IsRegionCpuModified(c + PAGE * 60, PAGE * 8)); - - REQUIRE(!buffer.IsRegionCpuModified(c + PAGE * 127, WORD * 16)); - buffer.MarkRegionAsCpuModified(c + PAGE * 127, PAGE); - REQUIRE(buffer.IsRegionCpuModified(c + PAGE * 127, WORD * 16)); - REQUIRE(buffer.IsRegionCpuModified(c + PAGE * 127, PAGE)); - REQUIRE(!buffer.IsRegionCpuModified(c + PAGE * 126, PAGE)); - REQUIRE(buffer.IsRegionCpuModified(c + PAGE * 126, PAGE * 2)); - REQUIRE(!buffer.IsRegionCpuModified(c + PAGE * 128, WORD * 16)); -} - -TEST_CASE("BufferBase: Unaligned page region query") { - RasterizerInterface rasterizer; - BufferBase buffer(rasterizer, c, WORD); - buffer.UnmarkRegionAsCpuModified(c, WORD); - buffer.MarkRegionAsCpuModified(c + 4000, 1000); - REQUIRE(buffer.IsRegionCpuModified(c, PAGE)); - REQUIRE(buffer.IsRegionCpuModified(c + PAGE, PAGE)); - REQUIRE(buffer.IsRegionCpuModified(c + 4000, 1000)); - REQUIRE(buffer.IsRegionCpuModified(c + 4000, 1)); -} - -TEST_CASE("BufferBase: Cached write") { - RasterizerInterface rasterizer; - BufferBase buffer(rasterizer, c, WORD); - buffer.UnmarkRegionAsCpuModified(c, WORD); - buffer.CachedCpuWrite(c + PAGE, PAGE); - REQUIRE(!buffer.IsRegionCpuModified(c + PAGE, PAGE)); - buffer.FlushCachedWrites(); - REQUIRE(buffer.IsRegionCpuModified(c + PAGE, PAGE)); - buffer.MarkRegionAsCpuModified(c, WORD); - REQUIRE(rasterizer.Count() == 0); -} - -TEST_CASE("BufferBase: Multiple cached write") { - RasterizerInterface rasterizer; - BufferBase buffer(rasterizer, c, WORD); - buffer.UnmarkRegionAsCpuModified(c, WORD); - buffer.CachedCpuWrite(c + PAGE, PAGE); - buffer.CachedCpuWrite(c + PAGE * 3, PAGE); - REQUIRE(!buffer.IsRegionCpuModified(c + PAGE, PAGE)); - REQUIRE(!buffer.IsRegionCpuModified(c + PAGE * 3, PAGE)); - buffer.FlushCachedWrites(); - REQUIRE(buffer.IsRegionCpuModified(c + PAGE, PAGE)); - REQUIRE(buffer.IsRegionCpuModified(c + PAGE * 3, PAGE)); - buffer.MarkRegionAsCpuModified(c, WORD); - REQUIRE(rasterizer.Count() == 0); -} - -TEST_CASE("BufferBase: Cached write unmarked") { - RasterizerInterface rasterizer; - BufferBase buffer(rasterizer, c, WORD); - buffer.UnmarkRegionAsCpuModified(c, WORD); - buffer.CachedCpuWrite(c + PAGE, PAGE); - buffer.UnmarkRegionAsCpuModified(c + PAGE, PAGE); - REQUIRE(!buffer.IsRegionCpuModified(c + PAGE, PAGE)); - buffer.FlushCachedWrites(); - REQUIRE(buffer.IsRegionCpuModified(c + PAGE, PAGE)); - buffer.MarkRegionAsCpuModified(c, WORD); - REQUIRE(rasterizer.Count() == 0); -} - -TEST_CASE("BufferBase: Cached write iterated") { - RasterizerInterface rasterizer; - BufferBase buffer(rasterizer, c, WORD); - buffer.UnmarkRegionAsCpuModified(c, WORD); - buffer.CachedCpuWrite(c + PAGE, PAGE); - int num = 0; - buffer.ForEachUploadRange(c, WORD, [&](u64 offset, u64 size) { ++num; }); - REQUIRE(num == 0); - REQUIRE(!buffer.IsRegionCpuModified(c + PAGE, PAGE)); - buffer.FlushCachedWrites(); - REQUIRE(buffer.IsRegionCpuModified(c + PAGE, PAGE)); - buffer.MarkRegionAsCpuModified(c, WORD); - REQUIRE(rasterizer.Count() == 0); -} - -TEST_CASE("BufferBase: Cached write downloads") { - RasterizerInterface rasterizer; - BufferBase buffer(rasterizer, c, WORD); - buffer.UnmarkRegionAsCpuModified(c, WORD); - REQUIRE(rasterizer.Count() == 64); - buffer.CachedCpuWrite(c + PAGE, PAGE); - REQUIRE(rasterizer.Count() == 63); - buffer.MarkRegionAsGpuModified(c + PAGE, PAGE); - int num = 0; - buffer.ForEachDownloadRangeAndClear(c, WORD, [&](u64 offset, u64 size) { ++num; }); - buffer.ForEachUploadRange(c, WORD, [&](u64 offset, u64 size) { ++num; }); - REQUIRE(num == 0); - REQUIRE(!buffer.IsRegionCpuModified(c + PAGE, PAGE)); - REQUIRE(!buffer.IsRegionGpuModified(c + PAGE, PAGE)); - buffer.FlushCachedWrites(); - REQUIRE(buffer.IsRegionCpuModified(c + PAGE, PAGE)); - REQUIRE(!buffer.IsRegionGpuModified(c + PAGE, PAGE)); - buffer.MarkRegionAsCpuModified(c, WORD); - REQUIRE(rasterizer.Count() == 0); -} diff --git a/src/tests/video_core/memory_tracker.cpp b/src/tests/video_core/memory_tracker.cpp new file mode 100644 index 000000000..3981907a2 --- /dev/null +++ b/src/tests/video_core/memory_tracker.cpp @@ -0,0 +1,549 @@ +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + +#include <memory> +#include <stdexcept> +#include <unordered_map> + +#include <catch2/catch_test_macros.hpp> + +#include "common/alignment.h" +#include "common/common_types.h" +#include "video_core/buffer_cache/memory_tracker_base.h" + +namespace { +using Range = std::pair<u64, u64>; + +constexpr u64 PAGE = 4096; +constexpr u64 WORD = 4096 * 64; +constexpr u64 HIGH_PAGE_BITS = 22; +constexpr u64 HIGH_PAGE_SIZE = 1ULL << HIGH_PAGE_BITS; + +constexpr VAddr c = 16 * HIGH_PAGE_SIZE; + +class RasterizerInterface { +public: + void UpdatePagesCachedCount(VAddr addr, u64 size, int delta) { + const u64 page_start{addr >> Core::Memory::YUZU_PAGEBITS}; + const u64 page_end{(addr + size + Core::Memory::YUZU_PAGESIZE - 1) >> + Core::Memory::YUZU_PAGEBITS}; + for (u64 page = page_start; page < page_end; ++page) { + int& value = page_table[page]; + value += delta; + if (value < 0) { + throw std::logic_error{"negative page"}; + } + if (value == 0) { + page_table.erase(page); + } + } + } + + [[nodiscard]] int Count(VAddr addr) const noexcept { + const auto it = page_table.find(addr >> Core::Memory::YUZU_PAGEBITS); + return it == page_table.end() ? 0 : it->second; + } + + [[nodiscard]] unsigned Count() const noexcept { + unsigned count = 0; + for (const auto& [index, value] : page_table) { + count += value; + } + return count; + } + +private: + std::unordered_map<u64, int> page_table; +}; +} // Anonymous namespace + +using MemoryTracker = VideoCommon::MemoryTrackerBase<RasterizerInterface>; + +TEST_CASE("MemoryTracker: Small region", "[video_core]") { + RasterizerInterface rasterizer; + std::unique_ptr<MemoryTracker> memory_track(std::make_unique<MemoryTracker>(rasterizer)); + REQUIRE(rasterizer.Count() == 0); + memory_track->UnmarkRegionAsCpuModified(c, WORD); + REQUIRE(rasterizer.Count() == WORD / PAGE); + REQUIRE(memory_track->ModifiedCpuRegion(c, WORD) == Range{0, 0}); + + memory_track->MarkRegionAsCpuModified(c + PAGE, 1); + REQUIRE(memory_track->ModifiedCpuRegion(c, WORD) == Range{c + PAGE * 1, c + PAGE * 2}); +} + +TEST_CASE("MemoryTracker: Large region", "[video_core]") { + RasterizerInterface rasterizer; + std::unique_ptr<MemoryTracker> memory_track(std::make_unique<MemoryTracker>(rasterizer)); + memory_track->UnmarkRegionAsCpuModified(c, WORD * 32); + memory_track->MarkRegionAsCpuModified(c + 4096, WORD * 4); + REQUIRE(memory_track->ModifiedCpuRegion(c, WORD + PAGE * 2) == + Range{c + PAGE, c + WORD + PAGE * 2}); + REQUIRE(memory_track->ModifiedCpuRegion(c + PAGE * 2, PAGE * 6) == + Range{c + PAGE * 2, c + PAGE * 8}); + REQUIRE(memory_track->ModifiedCpuRegion(c, WORD * 32) == Range{c + PAGE, c + WORD * 4 + PAGE}); + REQUIRE(memory_track->ModifiedCpuRegion(c + WORD * 4, PAGE) == + Range{c + WORD * 4, c + WORD * 4 + PAGE}); + REQUIRE(memory_track->ModifiedCpuRegion(c + WORD * 3 + PAGE * 63, PAGE) == + Range{c + WORD * 3 + PAGE * 63, c + WORD * 4}); + + memory_track->MarkRegionAsCpuModified(c + WORD * 5 + PAGE * 6, PAGE); + memory_track->MarkRegionAsCpuModified(c + WORD * 5 + PAGE * 8, PAGE); + REQUIRE(memory_track->ModifiedCpuRegion(c + WORD * 5, WORD) == + Range{c + WORD * 5 + PAGE * 6, c + WORD * 5 + PAGE * 9}); + + memory_track->UnmarkRegionAsCpuModified(c + WORD * 5 + PAGE * 8, PAGE); + REQUIRE(memory_track->ModifiedCpuRegion(c + WORD * 5, WORD) == + Range{c + WORD * 5 + PAGE * 6, c + WORD * 5 + PAGE * 7}); + + memory_track->MarkRegionAsCpuModified(c + PAGE, WORD * 31 + PAGE * 63); + REQUIRE(memory_track->ModifiedCpuRegion(c, WORD * 32) == Range{c + PAGE, c + WORD * 32}); + + memory_track->UnmarkRegionAsCpuModified(c + PAGE * 4, PAGE); + memory_track->UnmarkRegionAsCpuModified(c + PAGE * 6, PAGE); + + memory_track->UnmarkRegionAsCpuModified(c, WORD * 32); + REQUIRE(memory_track->ModifiedCpuRegion(c, WORD * 32) == Range{0, 0}); +} + +TEST_CASE("MemoryTracker: Rasterizer counting", "[video_core]") { + RasterizerInterface rasterizer; + std::unique_ptr<MemoryTracker> memory_track(std::make_unique<MemoryTracker>(rasterizer)); + REQUIRE(rasterizer.Count() == 0); + memory_track->UnmarkRegionAsCpuModified(c, PAGE); + REQUIRE(rasterizer.Count() == 1); + memory_track->MarkRegionAsCpuModified(c, PAGE * 2); + REQUIRE(rasterizer.Count() == 0); + memory_track->UnmarkRegionAsCpuModified(c, PAGE); + memory_track->UnmarkRegionAsCpuModified(c + PAGE, PAGE); + REQUIRE(rasterizer.Count() == 2); + memory_track->MarkRegionAsCpuModified(c, PAGE * 2); + REQUIRE(rasterizer.Count() == 0); +} + +TEST_CASE("MemoryTracker: Basic range", "[video_core]") { + RasterizerInterface rasterizer; + std::unique_ptr<MemoryTracker> memory_track(std::make_unique<MemoryTracker>(rasterizer)); + memory_track->UnmarkRegionAsCpuModified(c, WORD); + memory_track->MarkRegionAsCpuModified(c, PAGE); + int num = 0; + memory_track->ForEachUploadRange(c, WORD, [&](u64 offset, u64 size) { + REQUIRE(offset == c); + REQUIRE(size == PAGE); + ++num; + }); + REQUIRE(num == 1U); +} + +TEST_CASE("MemoryTracker: Border upload", "[video_core]") { + RasterizerInterface rasterizer; + std::unique_ptr<MemoryTracker> memory_track(std::make_unique<MemoryTracker>(rasterizer)); + memory_track->UnmarkRegionAsCpuModified(c, WORD * 2); + memory_track->MarkRegionAsCpuModified(c + WORD - PAGE, PAGE * 2); + memory_track->ForEachUploadRange(c, WORD * 2, [](u64 offset, u64 size) { + REQUIRE(offset == c + WORD - PAGE); + REQUIRE(size == PAGE * 2); + }); +} + +TEST_CASE("MemoryTracker: Border upload range", "[video_core]") { + RasterizerInterface rasterizer; + std::unique_ptr<MemoryTracker> memory_track(std::make_unique<MemoryTracker>(rasterizer)); + memory_track->UnmarkRegionAsCpuModified(c, WORD * 2); + memory_track->MarkRegionAsCpuModified(c + WORD - PAGE, PAGE * 2); + memory_track->ForEachUploadRange(c + WORD - PAGE, PAGE * 2, [](u64 offset, u64 size) { + REQUIRE(offset == c + WORD - PAGE); + REQUIRE(size == PAGE * 2); + }); + memory_track->MarkRegionAsCpuModified(c + WORD - PAGE, PAGE * 2); + memory_track->ForEachUploadRange(c + WORD - PAGE, PAGE, [](u64 offset, u64 size) { + REQUIRE(offset == c + WORD - PAGE); + REQUIRE(size == PAGE); + }); + memory_track->ForEachUploadRange(c + WORD, PAGE, [](u64 offset, u64 size) { + REQUIRE(offset == c + WORD); + REQUIRE(size == PAGE); + }); +} + +TEST_CASE("MemoryTracker: Border upload partial range", "[video_core]") { + RasterizerInterface rasterizer; + std::unique_ptr<MemoryTracker> memory_track(std::make_unique<MemoryTracker>(rasterizer)); + memory_track->UnmarkRegionAsCpuModified(c, WORD * 2); + memory_track->MarkRegionAsCpuModified(c + WORD - PAGE, PAGE * 2); + memory_track->ForEachUploadRange(c + WORD - 1, 2, [](u64 offset, u64 size) { + REQUIRE(offset == c + WORD - PAGE); + REQUIRE(size == PAGE * 2); + }); + memory_track->MarkRegionAsCpuModified(c + WORD - PAGE, PAGE * 2); + memory_track->ForEachUploadRange(c + WORD - 1, 1, [](u64 offset, u64 size) { + REQUIRE(offset == c + WORD - PAGE); + REQUIRE(size == PAGE); + }); + memory_track->ForEachUploadRange(c + WORD + 50, 1, [](u64 offset, u64 size) { + REQUIRE(offset == c + WORD); + REQUIRE(size == PAGE); + }); +} + +TEST_CASE("MemoryTracker: Partial word uploads", "[video_core]") { + RasterizerInterface rasterizer; + std::unique_ptr<MemoryTracker> memory_track(std::make_unique<MemoryTracker>(rasterizer)); + int num = 0; + memory_track->ForEachUploadRange(c, WORD, [&](u64 offset, u64 size) { + REQUIRE(offset == c); + REQUIRE(size == WORD); + ++num; + }); + REQUIRE(num == 1); + memory_track->ForEachUploadRange(c + WORD, WORD, [&](u64 offset, u64 size) { + REQUIRE(offset == c + WORD); + REQUIRE(size == WORD); + ++num; + }); + REQUIRE(num == 2); + memory_track->ForEachUploadRange(c + 0x79000, 0x24000, [&](u64 offset, u64 size) { + REQUIRE(offset == c + WORD * 2); + REQUIRE(size == PAGE * 0x1d); + ++num; + }); + REQUIRE(num == 3); +} + +TEST_CASE("MemoryTracker: Partial page upload", "[video_core]") { + RasterizerInterface rasterizer; + std::unique_ptr<MemoryTracker> memory_track(std::make_unique<MemoryTracker>(rasterizer)); + memory_track->UnmarkRegionAsCpuModified(c, WORD); + int num = 0; + memory_track->MarkRegionAsCpuModified(c + PAGE * 2, PAGE); + memory_track->MarkRegionAsCpuModified(c + PAGE * 9, PAGE); + memory_track->ForEachUploadRange(c, PAGE * 3, [&](u64 offset, u64 size) { + REQUIRE(offset == c + PAGE * 2); + REQUIRE(size == PAGE); + ++num; + }); + REQUIRE(num == 1); + memory_track->ForEachUploadRange(c + PAGE * 7, PAGE * 3, [&](u64 offset, u64 size) { + REQUIRE(offset == c + PAGE * 9); + REQUIRE(size == PAGE); + ++num; + }); + REQUIRE(num == 2); +} + +TEST_CASE("MemoryTracker: Partial page upload with multiple words on the right") { + RasterizerInterface rasterizer; + std::unique_ptr<MemoryTracker> memory_track(std::make_unique<MemoryTracker>(rasterizer)); + memory_track->UnmarkRegionAsCpuModified(c, WORD * 9); + memory_track->MarkRegionAsCpuModified(c + PAGE * 13, WORD * 7); + int num = 0; + memory_track->ForEachUploadRange(c + PAGE * 10, WORD * 7, [&](u64 offset, u64 size) { + REQUIRE(offset == c + PAGE * 13); + REQUIRE(size == WORD * 7 - PAGE * 3); + ++num; + }); + REQUIRE(num == 1); + memory_track->ForEachUploadRange(c + PAGE, WORD * 8, [&](u64 offset, u64 size) { + REQUIRE(offset == c + WORD * 7 + PAGE * 10); + REQUIRE(size == PAGE * 3); + ++num; + }); + REQUIRE(num == 2); +} + +TEST_CASE("MemoryTracker: Partial page upload with multiple words on the left", "[video_core]") { + RasterizerInterface rasterizer; + std::unique_ptr<MemoryTracker> memory_track(std::make_unique<MemoryTracker>(rasterizer)); + memory_track->UnmarkRegionAsCpuModified(c, WORD * 8); + memory_track->MarkRegionAsCpuModified(c + PAGE * 13, WORD * 7); + int num = 0; + memory_track->ForEachUploadRange(c + PAGE * 16, WORD * 7, [&](u64 offset, u64 size) { + REQUIRE(offset == c + PAGE * 16); + REQUIRE(size == WORD * 7 - PAGE * 3); + ++num; + }); + REQUIRE(num == 1); + memory_track->ForEachUploadRange(c + PAGE, WORD, [&](u64 offset, u64 size) { + REQUIRE(offset == c + PAGE * 13); + REQUIRE(size == PAGE * 3); + ++num; + }); + REQUIRE(num == 2); +} + +TEST_CASE("MemoryTracker: Partial page upload with multiple words in the middle", "[video_core]") { + RasterizerInterface rasterizer; + std::unique_ptr<MemoryTracker> memory_track(std::make_unique<MemoryTracker>(rasterizer)); + memory_track->UnmarkRegionAsCpuModified(c, WORD * 8); + memory_track->MarkRegionAsCpuModified(c + PAGE * 13, PAGE * 140); + int num = 0; + memory_track->ForEachUploadRange(c + PAGE * 16, WORD, [&](u64 offset, u64 size) { + REQUIRE(offset == c + PAGE * 16); + REQUIRE(size == WORD); + ++num; + }); + REQUIRE(num == 1); + memory_track->ForEachUploadRange(c, WORD, [&](u64 offset, u64 size) { + REQUIRE(offset == c + PAGE * 13); + REQUIRE(size == PAGE * 3); + ++num; + }); + REQUIRE(num == 2); + memory_track->ForEachUploadRange(c, WORD * 8, [&](u64 offset, u64 size) { + REQUIRE(offset == c + WORD + PAGE * 16); + REQUIRE(size == PAGE * 73); + ++num; + }); + REQUIRE(num == 3); +} + +TEST_CASE("MemoryTracker: Empty right bits", "[video_core]") { + RasterizerInterface rasterizer; + std::unique_ptr<MemoryTracker> memory_track(std::make_unique<MemoryTracker>(rasterizer)); + memory_track->UnmarkRegionAsCpuModified(c, WORD * 2048); + memory_track->MarkRegionAsCpuModified(c + WORD - PAGE, PAGE * 2); + memory_track->ForEachUploadRange(c, WORD * 2048, [](u64 offset, u64 size) { + REQUIRE(offset == c + WORD - PAGE); + REQUIRE(size == PAGE * 2); + }); +} + +TEST_CASE("MemoryTracker: Out of bound ranges 1", "[video_core]") { + RasterizerInterface rasterizer; + std::unique_ptr<MemoryTracker> memory_track(std::make_unique<MemoryTracker>(rasterizer)); + memory_track->UnmarkRegionAsCpuModified(c - WORD, 3 * WORD); + memory_track->MarkRegionAsCpuModified(c, PAGE); + REQUIRE(rasterizer.Count() == (3 * WORD - PAGE) / PAGE); + int num = 0; + memory_track->ForEachUploadRange(c - WORD, WORD, [&](u64 offset, u64 size) { ++num; }); + memory_track->ForEachUploadRange(c + WORD, WORD, [&](u64 offset, u64 size) { ++num; }); + memory_track->ForEachUploadRange(c - PAGE, PAGE, [&](u64 offset, u64 size) { ++num; }); + REQUIRE(num == 0); + memory_track->ForEachUploadRange(c - PAGE, PAGE * 2, [&](u64 offset, u64 size) { ++num; }); + REQUIRE(num == 1); + memory_track->MarkRegionAsCpuModified(c, WORD); + REQUIRE(rasterizer.Count() == 2 * WORD / PAGE); +} + +TEST_CASE("MemoryTracker: Out of bound ranges 2", "[video_core]") { + RasterizerInterface rasterizer; + std::unique_ptr<MemoryTracker> memory_track(std::make_unique<MemoryTracker>(rasterizer)); + REQUIRE_NOTHROW(memory_track->UnmarkRegionAsCpuModified(c + 0x22000, PAGE)); + REQUIRE_NOTHROW(memory_track->UnmarkRegionAsCpuModified(c + 0x28000, PAGE)); + REQUIRE(rasterizer.Count() == 2); + REQUIRE_NOTHROW(memory_track->UnmarkRegionAsCpuModified(c + 0x21100, PAGE - 0x100)); + REQUIRE(rasterizer.Count() == 3); + REQUIRE_NOTHROW(memory_track->UnmarkRegionAsCpuModified(c - PAGE, PAGE * 2)); + memory_track->UnmarkRegionAsCpuModified(c - PAGE * 3, PAGE * 2); + memory_track->UnmarkRegionAsCpuModified(c - PAGE * 2, PAGE * 2); + REQUIRE(rasterizer.Count() == 7); +} + +TEST_CASE("MemoryTracker: Out of bound ranges 3", "[video_core]") { + RasterizerInterface rasterizer; + std::unique_ptr<MemoryTracker> memory_track(std::make_unique<MemoryTracker>(rasterizer)); + memory_track->UnmarkRegionAsCpuModified(c, 0x310720); + REQUIRE(rasterizer.Count(c) == 1); + REQUIRE(rasterizer.Count(c + PAGE) == 1); + REQUIRE(rasterizer.Count(c + WORD) == 1); + REQUIRE(rasterizer.Count(c + WORD + PAGE) == 1); +} + +TEST_CASE("MemoryTracker: Sparse regions 1", "[video_core]") { + RasterizerInterface rasterizer; + std::unique_ptr<MemoryTracker> memory_track(std::make_unique<MemoryTracker>(rasterizer)); + memory_track->UnmarkRegionAsCpuModified(c, WORD); + memory_track->MarkRegionAsCpuModified(c + PAGE * 1, PAGE); + memory_track->MarkRegionAsCpuModified(c + PAGE * 3, PAGE * 4); + memory_track->ForEachUploadRange(c, WORD, [i = 0](u64 offset, u64 size) mutable { + static constexpr std::array<u64, 2> offsets{c + PAGE, c + PAGE * 3}; + static constexpr std::array<u64, 2> sizes{PAGE, PAGE * 4}; + REQUIRE(offset == offsets.at(i)); + REQUIRE(size == sizes.at(i)); + ++i; + }); +} + +TEST_CASE("MemoryTracker: Sparse regions 2", "[video_core]") { + RasterizerInterface rasterizer; + std::unique_ptr<MemoryTracker> memory_track(std::make_unique<MemoryTracker>(rasterizer)); + memory_track->UnmarkRegionAsCpuModified(c, PAGE * 0x23); + REQUIRE(rasterizer.Count() == 0x23); + memory_track->MarkRegionAsCpuModified(c + PAGE * 0x1B, PAGE); + memory_track->MarkRegionAsCpuModified(c + PAGE * 0x21, PAGE); + memory_track->ForEachUploadRange(c, PAGE * 0x23, [i = 0](u64 offset, u64 size) mutable { + static constexpr std::array<u64, 3> offsets{c + PAGE * 0x1B, c + PAGE * 0x21}; + static constexpr std::array<u64, 3> sizes{PAGE, PAGE}; + REQUIRE(offset == offsets.at(i)); + REQUIRE(size == sizes.at(i)); + ++i; + }); +} + +TEST_CASE("MemoryTracker: Single page modified range", "[video_core]") { + RasterizerInterface rasterizer; + std::unique_ptr<MemoryTracker> memory_track(std::make_unique<MemoryTracker>(rasterizer)); + REQUIRE(memory_track->IsRegionCpuModified(c, PAGE)); + memory_track->UnmarkRegionAsCpuModified(c, PAGE); + REQUIRE(!memory_track->IsRegionCpuModified(c, PAGE)); +} + +TEST_CASE("MemoryTracker: Two page modified range", "[video_core]") { + RasterizerInterface rasterizer; + std::unique_ptr<MemoryTracker> memory_track(std::make_unique<MemoryTracker>(rasterizer)); + REQUIRE(memory_track->IsRegionCpuModified(c, PAGE)); + REQUIRE(memory_track->IsRegionCpuModified(c + PAGE, PAGE)); + REQUIRE(memory_track->IsRegionCpuModified(c, PAGE * 2)); + memory_track->UnmarkRegionAsCpuModified(c, PAGE); + REQUIRE(!memory_track->IsRegionCpuModified(c, PAGE)); +} + +TEST_CASE("MemoryTracker: Multi word modified ranges", "[video_core]") { + for (int offset = 0; offset < 4; ++offset) { + const VAddr address = c + WORD * offset; + RasterizerInterface rasterizer; + std::unique_ptr<MemoryTracker> memory_track(std::make_unique<MemoryTracker>(rasterizer)); + REQUIRE(memory_track->IsRegionCpuModified(address, PAGE)); + REQUIRE(memory_track->IsRegionCpuModified(address + PAGE * 48, PAGE)); + REQUIRE(memory_track->IsRegionCpuModified(address + PAGE * 56, PAGE)); + + memory_track->UnmarkRegionAsCpuModified(address + PAGE * 32, PAGE); + REQUIRE(memory_track->IsRegionCpuModified(address + PAGE, WORD)); + REQUIRE(memory_track->IsRegionCpuModified(address + PAGE * 31, PAGE)); + REQUIRE(!memory_track->IsRegionCpuModified(address + PAGE * 32, PAGE)); + REQUIRE(memory_track->IsRegionCpuModified(address + PAGE * 33, PAGE)); + REQUIRE(memory_track->IsRegionCpuModified(address + PAGE * 31, PAGE * 2)); + REQUIRE(memory_track->IsRegionCpuModified(address + PAGE * 32, PAGE * 2)); + + memory_track->UnmarkRegionAsCpuModified(address + PAGE * 33, PAGE); + REQUIRE(!memory_track->IsRegionCpuModified(address + PAGE * 32, PAGE * 2)); + } +} + +TEST_CASE("MemoryTracker: Single page in large region", "[video_core]") { + RasterizerInterface rasterizer; + std::unique_ptr<MemoryTracker> memory_track(std::make_unique<MemoryTracker>(rasterizer)); + memory_track->UnmarkRegionAsCpuModified(c, WORD * 16); + REQUIRE(!memory_track->IsRegionCpuModified(c, WORD * 16)); + + memory_track->MarkRegionAsCpuModified(c + WORD * 12 + PAGE * 8, PAGE); + REQUIRE(memory_track->IsRegionCpuModified(c, WORD * 16)); + REQUIRE(!memory_track->IsRegionCpuModified(c + WORD * 10, WORD * 2)); + REQUIRE(memory_track->IsRegionCpuModified(c + WORD * 11, WORD * 2)); + REQUIRE(memory_track->IsRegionCpuModified(c + WORD * 12, WORD * 2)); + REQUIRE(memory_track->IsRegionCpuModified(c + WORD * 12 + PAGE * 4, PAGE * 8)); + REQUIRE(memory_track->IsRegionCpuModified(c + WORD * 12 + PAGE * 6, PAGE * 8)); + REQUIRE(!memory_track->IsRegionCpuModified(c + WORD * 12 + PAGE * 6, PAGE)); + REQUIRE(memory_track->IsRegionCpuModified(c + WORD * 12 + PAGE * 7, PAGE * 2)); + REQUIRE(memory_track->IsRegionCpuModified(c + WORD * 12 + PAGE * 8, PAGE * 2)); +} + +TEST_CASE("MemoryTracker: Wrap word regions") { + RasterizerInterface rasterizer; + std::unique_ptr<MemoryTracker> memory_track(std::make_unique<MemoryTracker>(rasterizer)); + memory_track->UnmarkRegionAsCpuModified(c, WORD * 32); + memory_track->MarkRegionAsCpuModified(c + PAGE * 63, PAGE * 2); + REQUIRE(memory_track->IsRegionCpuModified(c, WORD * 2)); + REQUIRE(!memory_track->IsRegionCpuModified(c + PAGE * 62, PAGE)); + REQUIRE(memory_track->IsRegionCpuModified(c + PAGE * 63, PAGE)); + REQUIRE(memory_track->IsRegionCpuModified(c + PAGE * 64, PAGE)); + REQUIRE(memory_track->IsRegionCpuModified(c + PAGE * 63, PAGE * 2)); + REQUIRE(memory_track->IsRegionCpuModified(c + PAGE * 63, PAGE * 8)); + REQUIRE(memory_track->IsRegionCpuModified(c + PAGE * 60, PAGE * 8)); + + REQUIRE(!memory_track->IsRegionCpuModified(c + PAGE * 127, WORD * 16)); + memory_track->MarkRegionAsCpuModified(c + PAGE * 127, PAGE); + REQUIRE(memory_track->IsRegionCpuModified(c + PAGE * 127, WORD * 16)); + REQUIRE(memory_track->IsRegionCpuModified(c + PAGE * 127, PAGE)); + REQUIRE(!memory_track->IsRegionCpuModified(c + PAGE * 126, PAGE)); + REQUIRE(memory_track->IsRegionCpuModified(c + PAGE * 126, PAGE * 2)); + REQUIRE(!memory_track->IsRegionCpuModified(c + PAGE * 128, WORD * 16)); +} + +TEST_CASE("MemoryTracker: Unaligned page region query") { + RasterizerInterface rasterizer; + std::unique_ptr<MemoryTracker> memory_track(std::make_unique<MemoryTracker>(rasterizer)); + memory_track->UnmarkRegionAsCpuModified(c, WORD); + memory_track->MarkRegionAsCpuModified(c + 4000, 1000); + REQUIRE(memory_track->IsRegionCpuModified(c, PAGE)); + REQUIRE(memory_track->IsRegionCpuModified(c + PAGE, PAGE)); + REQUIRE(memory_track->IsRegionCpuModified(c + 4000, 1000)); + REQUIRE(memory_track->IsRegionCpuModified(c + 4000, 1)); +} + +TEST_CASE("MemoryTracker: Cached write") { + RasterizerInterface rasterizer; + std::unique_ptr<MemoryTracker> memory_track(std::make_unique<MemoryTracker>(rasterizer)); + memory_track->UnmarkRegionAsCpuModified(c, WORD); + memory_track->CachedCpuWrite(c + PAGE, c + PAGE); + REQUIRE(!memory_track->IsRegionCpuModified(c + PAGE, PAGE)); + memory_track->FlushCachedWrites(); + REQUIRE(memory_track->IsRegionCpuModified(c + PAGE, PAGE)); + memory_track->MarkRegionAsCpuModified(c, WORD); + REQUIRE(rasterizer.Count() == 0); +} + +TEST_CASE("MemoryTracker: Multiple cached write") { + RasterizerInterface rasterizer; + std::unique_ptr<MemoryTracker> memory_track(std::make_unique<MemoryTracker>(rasterizer)); + memory_track->UnmarkRegionAsCpuModified(c, WORD); + memory_track->CachedCpuWrite(c + PAGE, PAGE); + memory_track->CachedCpuWrite(c + PAGE * 3, PAGE); + REQUIRE(!memory_track->IsRegionCpuModified(c + PAGE, PAGE)); + REQUIRE(!memory_track->IsRegionCpuModified(c + PAGE * 3, PAGE)); + memory_track->FlushCachedWrites(); + REQUIRE(memory_track->IsRegionCpuModified(c + PAGE, PAGE)); + REQUIRE(memory_track->IsRegionCpuModified(c + PAGE * 3, PAGE)); + memory_track->MarkRegionAsCpuModified(c, WORD); + REQUIRE(rasterizer.Count() == 0); +} + +TEST_CASE("MemoryTracker: Cached write unmarked") { + RasterizerInterface rasterizer; + std::unique_ptr<MemoryTracker> memory_track(std::make_unique<MemoryTracker>(rasterizer)); + memory_track->UnmarkRegionAsCpuModified(c, WORD); + memory_track->CachedCpuWrite(c + PAGE, PAGE); + memory_track->UnmarkRegionAsCpuModified(c + PAGE, PAGE); + REQUIRE(!memory_track->IsRegionCpuModified(c + PAGE, PAGE)); + memory_track->FlushCachedWrites(); + REQUIRE(memory_track->IsRegionCpuModified(c + PAGE, PAGE)); + memory_track->MarkRegionAsCpuModified(c, WORD); + REQUIRE(rasterizer.Count() == 0); +} + +TEST_CASE("MemoryTracker: Cached write iterated") { + RasterizerInterface rasterizer; + std::unique_ptr<MemoryTracker> memory_track(std::make_unique<MemoryTracker>(rasterizer)); + memory_track->UnmarkRegionAsCpuModified(c, WORD); + memory_track->CachedCpuWrite(c + PAGE, PAGE); + int num = 0; + memory_track->ForEachUploadRange(c, WORD, [&](u64 offset, u64 size) { ++num; }); + REQUIRE(num == 0); + REQUIRE(!memory_track->IsRegionCpuModified(c + PAGE, PAGE)); + memory_track->FlushCachedWrites(); + REQUIRE(memory_track->IsRegionCpuModified(c + PAGE, PAGE)); + memory_track->MarkRegionAsCpuModified(c, WORD); + REQUIRE(rasterizer.Count() == 0); +} + +TEST_CASE("MemoryTracker: Cached write downloads") { + RasterizerInterface rasterizer; + std::unique_ptr<MemoryTracker> memory_track(std::make_unique<MemoryTracker>(rasterizer)); + memory_track->UnmarkRegionAsCpuModified(c, WORD); + REQUIRE(rasterizer.Count() == 64); + memory_track->CachedCpuWrite(c + PAGE, PAGE); + REQUIRE(rasterizer.Count() == 63); + memory_track->MarkRegionAsGpuModified(c + PAGE, PAGE); + int num = 0; + memory_track->ForEachDownloadRangeAndClear(c, WORD, [&](u64 offset, u64 size) { ++num; }); + REQUIRE(num == 1); + num = 0; + memory_track->ForEachUploadRange(c, WORD, [&](u64 offset, u64 size) { ++num; }); + REQUIRE(num == 0); + REQUIRE(!memory_track->IsRegionCpuModified(c + PAGE, PAGE)); + REQUIRE(!memory_track->IsRegionGpuModified(c + PAGE, PAGE)); + memory_track->FlushCachedWrites(); + REQUIRE(memory_track->IsRegionCpuModified(c + PAGE, PAGE)); + REQUIRE(!memory_track->IsRegionGpuModified(c + PAGE, PAGE)); + memory_track->MarkRegionAsCpuModified(c, WORD); + REQUIRE(rasterizer.Count() == 0); +}
\ No newline at end of file diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index e904573d7..a0009a36f 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -11,8 +11,11 @@ endif() add_library(video_core STATIC buffer_cache/buffer_base.h + buffer_cache/buffer_cache_base.h buffer_cache/buffer_cache.cpp buffer_cache/buffer_cache.h + buffer_cache/memory_tracker_base.h + buffer_cache/word_manager.h cache_types.h cdma_pusher.cpp cdma_pusher.h @@ -104,6 +107,7 @@ add_library(video_core STATIC renderer_null/renderer_null.h renderer_opengl/blit_image.cpp renderer_opengl/blit_image.h + renderer_opengl/gl_buffer_cache_base.cpp renderer_opengl/gl_buffer_cache.cpp renderer_opengl/gl_buffer_cache.h renderer_opengl/gl_compute_pipeline.cpp @@ -154,6 +158,7 @@ add_library(video_core STATIC renderer_vulkan/renderer_vulkan.cpp renderer_vulkan/vk_blit_screen.cpp renderer_vulkan/vk_blit_screen.h + renderer_vulkan/vk_buffer_cache_base.cpp renderer_vulkan/vk_buffer_cache.cpp renderer_vulkan/vk_buffer_cache.h renderer_vulkan/vk_command_pool.cpp @@ -174,6 +179,8 @@ add_library(video_core STATIC renderer_vulkan/vk_master_semaphore.h renderer_vulkan/vk_pipeline_cache.cpp renderer_vulkan/vk_pipeline_cache.h + renderer_vulkan/vk_present_manager.cpp + renderer_vulkan/vk_present_manager.h renderer_vulkan/vk_query_cache.cpp renderer_vulkan/vk_query_cache.h renderer_vulkan/vk_rasterizer.cpp diff --git a/src/video_core/buffer_cache/buffer_base.h b/src/video_core/buffer_cache/buffer_base.h index 1b4d63616..9cbd95c4b 100644 --- a/src/video_core/buffer_cache/buffer_base.h +++ b/src/video_core/buffer_cache/buffer_base.h @@ -1,5 +1,5 @@ -// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later +// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later #pragma once @@ -11,9 +11,7 @@ #include "common/alignment.h" #include "common/common_funcs.h" #include "common/common_types.h" -#include "common/div_ceil.h" -#include "common/settings.h" -#include "core/memory.h" +#include "video_core/buffer_cache/word_manager.h" namespace VideoCommon { @@ -36,116 +34,12 @@ struct NullBufferParams {}; */ template <class RasterizerInterface> class BufferBase { - static constexpr u64 PAGES_PER_WORD = 64; - static constexpr u64 BYTES_PER_PAGE = Core::Memory::YUZU_PAGESIZE; - static constexpr u64 BYTES_PER_WORD = PAGES_PER_WORD * BYTES_PER_PAGE; - - /// Vector tracking modified pages tightly packed with small vector optimization - union WordsArray { - /// Returns the pointer to the words state - [[nodiscard]] const u64* Pointer(bool is_short) const noexcept { - return is_short ? &stack : heap; - } - - /// Returns the pointer to the words state - [[nodiscard]] u64* Pointer(bool is_short) noexcept { - return is_short ? &stack : heap; - } - - u64 stack = 0; ///< Small buffers storage - u64* heap; ///< Not-small buffers pointer to the storage - }; - - struct Words { - explicit Words() = default; - explicit Words(u64 size_bytes_) : size_bytes{size_bytes_} { - if (IsShort()) { - cpu.stack = ~u64{0}; - gpu.stack = 0; - cached_cpu.stack = 0; - untracked.stack = ~u64{0}; - } else { - // Share allocation between CPU and GPU pages and set their default values - const size_t num_words = NumWords(); - u64* const alloc = new u64[num_words * 4]; - cpu.heap = alloc; - gpu.heap = alloc + num_words; - cached_cpu.heap = alloc + num_words * 2; - untracked.heap = alloc + num_words * 3; - std::fill_n(cpu.heap, num_words, ~u64{0}); - std::fill_n(gpu.heap, num_words, 0); - std::fill_n(cached_cpu.heap, num_words, 0); - std::fill_n(untracked.heap, num_words, ~u64{0}); - } - // Clean up tailing bits - const u64 last_word_size = size_bytes % BYTES_PER_WORD; - const u64 last_local_page = Common::DivCeil(last_word_size, BYTES_PER_PAGE); - const u64 shift = (PAGES_PER_WORD - last_local_page) % PAGES_PER_WORD; - const u64 last_word = (~u64{0} << shift) >> shift; - cpu.Pointer(IsShort())[NumWords() - 1] = last_word; - untracked.Pointer(IsShort())[NumWords() - 1] = last_word; - } - - ~Words() { - Release(); - } - - Words& operator=(Words&& rhs) noexcept { - Release(); - size_bytes = rhs.size_bytes; - cpu = rhs.cpu; - gpu = rhs.gpu; - cached_cpu = rhs.cached_cpu; - untracked = rhs.untracked; - rhs.cpu.heap = nullptr; - return *this; - } - - Words(Words&& rhs) noexcept - : size_bytes{rhs.size_bytes}, cpu{rhs.cpu}, gpu{rhs.gpu}, - cached_cpu{rhs.cached_cpu}, untracked{rhs.untracked} { - rhs.cpu.heap = nullptr; - } - - Words& operator=(const Words&) = delete; - Words(const Words&) = delete; - - /// Returns true when the buffer fits in the small vector optimization - [[nodiscard]] bool IsShort() const noexcept { - return size_bytes <= BYTES_PER_WORD; - } - - /// Returns the number of words of the buffer - [[nodiscard]] size_t NumWords() const noexcept { - return Common::DivCeil(size_bytes, BYTES_PER_WORD); - } - - /// Release buffer resources - void Release() { - if (!IsShort()) { - // CPU written words is the base for the heap allocation - delete[] cpu.heap; - } - } - - u64 size_bytes = 0; - WordsArray cpu; - WordsArray gpu; - WordsArray cached_cpu; - WordsArray untracked; - }; - - enum class Type { - CPU, - GPU, - CachedCPU, - Untracked, - }; - public: - explicit BufferBase(RasterizerInterface& rasterizer_, VAddr cpu_addr_, u64 size_bytes) - : rasterizer{&rasterizer_}, cpu_addr{Common::AlignDown(cpu_addr_, BYTES_PER_PAGE)}, - words(Common::AlignUp(size_bytes + (cpu_addr_ - cpu_addr), BYTES_PER_PAGE)) {} + static constexpr u64 BASE_PAGE_BITS = 16; + static constexpr u64 BASE_PAGE_SIZE = 1ULL << BASE_PAGE_BITS; + + explicit BufferBase(RasterizerInterface& rasterizer_, VAddr cpu_addr_, u64 size_bytes_) + : cpu_addr{cpu_addr_}, size_bytes{size_bytes_} {} explicit BufferBase(NullBufferParams) {} @@ -155,100 +49,6 @@ public: BufferBase& operator=(BufferBase&&) = default; BufferBase(BufferBase&&) = default; - /// Returns the inclusive CPU modified range in a begin end pair - [[nodiscard]] std::pair<u64, u64> ModifiedCpuRegion(VAddr query_cpu_addr, - u64 query_size) const noexcept { - const u64 offset = query_cpu_addr - cpu_addr; - return ModifiedRegion<Type::CPU>(offset, query_size); - } - - /// Returns the inclusive GPU modified range in a begin end pair - [[nodiscard]] std::pair<u64, u64> ModifiedGpuRegion(VAddr query_cpu_addr, - u64 query_size) const noexcept { - const u64 offset = query_cpu_addr - cpu_addr; - return ModifiedRegion<Type::GPU>(offset, query_size); - } - - /// Returns true if a region has been modified from the CPU - [[nodiscard]] bool IsRegionCpuModified(VAddr query_cpu_addr, u64 query_size) const noexcept { - const u64 offset = query_cpu_addr - cpu_addr; - return IsRegionModified<Type::CPU>(offset, query_size); - } - - /// Returns true if a region has been modified from the GPU - [[nodiscard]] bool IsRegionGpuModified(VAddr query_cpu_addr, u64 query_size) const noexcept { - const u64 offset = query_cpu_addr - cpu_addr; - return IsRegionModified<Type::GPU>(offset, query_size); - } - - /// Mark region as CPU modified, notifying the rasterizer about this change - void MarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 size) { - ChangeRegionState<Type::CPU, true>(dirty_cpu_addr, size); - } - - /// Unmark region as CPU modified, notifying the rasterizer about this change - void UnmarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 size) { - ChangeRegionState<Type::CPU, false>(dirty_cpu_addr, size); - } - - /// Mark region as modified from the host GPU - void MarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 size) noexcept { - ChangeRegionState<Type::GPU, true>(dirty_cpu_addr, size); - } - - /// Unmark region as modified from the host GPU - void UnmarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 size) noexcept { - ChangeRegionState<Type::GPU, false>(dirty_cpu_addr, size); - } - - /// Mark region as modified from the CPU - /// but don't mark it as modified until FlusHCachedWrites is called. - void CachedCpuWrite(VAddr dirty_cpu_addr, u64 size) { - flags |= BufferFlagBits::CachedWrites; - ChangeRegionState<Type::CachedCPU, true>(dirty_cpu_addr, size); - } - - /// Flushes cached CPU writes, and notify the rasterizer about the deltas - void FlushCachedWrites() noexcept { - flags &= ~BufferFlagBits::CachedWrites; - const u64 num_words = NumWords(); - u64* const cached_words = Array<Type::CachedCPU>(); - u64* const untracked_words = Array<Type::Untracked>(); - u64* const cpu_words = Array<Type::CPU>(); - for (u64 word_index = 0; word_index < num_words; ++word_index) { - const u64 cached_bits = cached_words[word_index]; - NotifyRasterizer<false>(word_index, untracked_words[word_index], cached_bits); - untracked_words[word_index] |= cached_bits; - cpu_words[word_index] |= cached_bits; - if (!Settings::values.use_pessimistic_flushes) { - cached_words[word_index] = 0; - } - } - } - - /// Call 'func' for each CPU modified range and unmark those pages as CPU modified - template <typename Func> - void ForEachUploadRange(VAddr query_cpu_range, u64 size, Func&& func) { - ForEachModifiedRange<Type::CPU>(query_cpu_range, size, true, func); - } - - /// Call 'func' for each GPU modified range and unmark those pages as GPU modified - template <typename Func> - void ForEachDownloadRange(VAddr query_cpu_range, u64 size, bool clear, Func&& func) { - ForEachModifiedRange<Type::GPU>(query_cpu_range, size, clear, func); - } - - template <typename Func> - void ForEachDownloadRangeAndClear(VAddr query_cpu_range, u64 size, Func&& func) { - ForEachModifiedRange<Type::GPU>(query_cpu_range, size, true, func); - } - - /// Call 'func' for each GPU modified range and unmark those pages as GPU modified - template <typename Func> - void ForEachDownloadRange(Func&& func) { - ForEachModifiedRange<Type::GPU>(cpu_addr, SizeBytes(), true, func); - } - /// Mark buffer as picked void Pick() noexcept { flags |= BufferFlagBits::Picked; @@ -295,11 +95,6 @@ public: return static_cast<u32>(other_cpu_addr - cpu_addr); } - /// Returns the size in bytes of the buffer - [[nodiscard]] u64 SizeBytes() const noexcept { - return words.size_bytes; - } - size_t getLRUID() const noexcept { return lru_id; } @@ -308,305 +103,16 @@ public: lru_id = lru_id_; } -private: - template <Type type> - u64* Array() noexcept { - if constexpr (type == Type::CPU) { - return words.cpu.Pointer(IsShort()); - } else if constexpr (type == Type::GPU) { - return words.gpu.Pointer(IsShort()); - } else if constexpr (type == Type::CachedCPU) { - return words.cached_cpu.Pointer(IsShort()); - } else if constexpr (type == Type::Untracked) { - return words.untracked.Pointer(IsShort()); - } - } - - template <Type type> - const u64* Array() const noexcept { - if constexpr (type == Type::CPU) { - return words.cpu.Pointer(IsShort()); - } else if constexpr (type == Type::GPU) { - return words.gpu.Pointer(IsShort()); - } else if constexpr (type == Type::CachedCPU) { - return words.cached_cpu.Pointer(IsShort()); - } else if constexpr (type == Type::Untracked) { - return words.untracked.Pointer(IsShort()); - } - } - - /** - * Change the state of a range of pages - * - * @param dirty_addr Base address to mark or unmark as modified - * @param size Size in bytes to mark or unmark as modified - */ - template <Type type, bool enable> - void ChangeRegionState(u64 dirty_addr, s64 size) noexcept(type == Type::GPU) { - const s64 difference = dirty_addr - cpu_addr; - const u64 offset = std::max<s64>(difference, 0); - size += std::min<s64>(difference, 0); - if (offset >= SizeBytes() || size < 0) { - return; - } - u64* const untracked_words = Array<Type::Untracked>(); - u64* const state_words = Array<type>(); - const u64 offset_end = std::min(offset + size, SizeBytes()); - const u64 begin_page_index = offset / BYTES_PER_PAGE; - const u64 begin_word_index = begin_page_index / PAGES_PER_WORD; - const u64 end_page_index = Common::DivCeil(offset_end, BYTES_PER_PAGE); - const u64 end_word_index = Common::DivCeil(end_page_index, PAGES_PER_WORD); - u64 page_index = begin_page_index % PAGES_PER_WORD; - u64 word_index = begin_word_index; - while (word_index < end_word_index) { - const u64 next_word_first_page = (word_index + 1) * PAGES_PER_WORD; - const u64 left_offset = - std::min(next_word_first_page - end_page_index, PAGES_PER_WORD) % PAGES_PER_WORD; - const u64 right_offset = page_index; - u64 bits = ~u64{0}; - bits = (bits >> right_offset) << right_offset; - bits = (bits << left_offset) >> left_offset; - if constexpr (type == Type::CPU || type == Type::CachedCPU) { - NotifyRasterizer<!enable>(word_index, untracked_words[word_index], bits); - } - if constexpr (enable) { - state_words[word_index] |= bits; - if constexpr (type == Type::CPU || type == Type::CachedCPU) { - untracked_words[word_index] |= bits; - } - } else { - state_words[word_index] &= ~bits; - if constexpr (type == Type::CPU || type == Type::CachedCPU) { - untracked_words[word_index] &= ~bits; - } - } - page_index = 0; - ++word_index; - } - } - - /** - * Notify rasterizer about changes in the CPU tracking state of a word in the buffer - * - * @param word_index Index to the word to notify to the rasterizer - * @param current_bits Current state of the word - * @param new_bits New state of the word - * - * @tparam add_to_rasterizer True when the rasterizer should start tracking the new pages - */ - template <bool add_to_rasterizer> - void NotifyRasterizer(u64 word_index, u64 current_bits, u64 new_bits) const { - u64 changed_bits = (add_to_rasterizer ? current_bits : ~current_bits) & new_bits; - VAddr addr = cpu_addr + word_index * BYTES_PER_WORD; - while (changed_bits != 0) { - const int empty_bits = std::countr_zero(changed_bits); - addr += empty_bits * BYTES_PER_PAGE; - changed_bits >>= empty_bits; - - const u32 continuous_bits = std::countr_one(changed_bits); - const u64 size = continuous_bits * BYTES_PER_PAGE; - const VAddr begin_addr = addr; - addr += size; - changed_bits = continuous_bits < PAGES_PER_WORD ? (changed_bits >> continuous_bits) : 0; - rasterizer->UpdatePagesCachedCount(begin_addr, size, add_to_rasterizer ? 1 : -1); - } - } - - /** - * Loop over each page in the given range, turn off those bits and notify the rasterizer if - * needed. Call the given function on each turned off range. - * - * @param query_cpu_range Base CPU address to loop over - * @param size Size in bytes of the CPU range to loop over - * @param func Function to call for each turned off region - */ - template <Type type, typename Func> - void ForEachModifiedRange(VAddr query_cpu_range, s64 size, bool clear, Func&& func) { - static_assert(type != Type::Untracked); - - const s64 difference = query_cpu_range - cpu_addr; - const u64 query_begin = std::max<s64>(difference, 0); - size += std::min<s64>(difference, 0); - if (query_begin >= SizeBytes() || size < 0) { - return; - } - u64* const untracked_words = Array<Type::Untracked>(); - u64* const state_words = Array<type>(); - const u64 query_end = query_begin + std::min(static_cast<u64>(size), SizeBytes()); - u64* const words_begin = state_words + query_begin / BYTES_PER_WORD; - u64* const words_end = state_words + Common::DivCeil(query_end, BYTES_PER_WORD); - - const auto modified = [](u64 word) { return word != 0; }; - const auto first_modified_word = std::find_if(words_begin, words_end, modified); - if (first_modified_word == words_end) { - // Exit early when the buffer is not modified - return; - } - const auto last_modified_word = std::find_if_not(first_modified_word, words_end, modified); - - const u64 word_index_begin = std::distance(state_words, first_modified_word); - const u64 word_index_end = std::distance(state_words, last_modified_word); - - const unsigned local_page_begin = std::countr_zero(*first_modified_word); - const unsigned local_page_end = - static_cast<unsigned>(PAGES_PER_WORD) - std::countl_zero(last_modified_word[-1]); - const u64 word_page_begin = word_index_begin * PAGES_PER_WORD; - const u64 word_page_end = (word_index_end - 1) * PAGES_PER_WORD; - const u64 query_page_begin = query_begin / BYTES_PER_PAGE; - const u64 query_page_end = Common::DivCeil(query_end, BYTES_PER_PAGE); - const u64 page_index_begin = std::max(word_page_begin + local_page_begin, query_page_begin); - const u64 page_index_end = std::min(word_page_end + local_page_end, query_page_end); - const u64 first_word_page_begin = page_index_begin % PAGES_PER_WORD; - const u64 last_word_page_end = (page_index_end - 1) % PAGES_PER_WORD + 1; - - u64 page_begin = first_word_page_begin; - u64 current_base = 0; - u64 current_size = 0; - bool on_going = false; - for (u64 word_index = word_index_begin; word_index < word_index_end; ++word_index) { - const bool is_last_word = word_index + 1 == word_index_end; - const u64 page_end = is_last_word ? last_word_page_end : PAGES_PER_WORD; - const u64 right_offset = page_begin; - const u64 left_offset = PAGES_PER_WORD - page_end; - u64 bits = ~u64{0}; - bits = (bits >> right_offset) << right_offset; - bits = (bits << left_offset) >> left_offset; - - const u64 current_word = state_words[word_index] & bits; - if (clear) { - state_words[word_index] &= ~bits; - } - - if constexpr (type == Type::CPU) { - const u64 current_bits = untracked_words[word_index] & bits; - untracked_words[word_index] &= ~bits; - NotifyRasterizer<true>(word_index, current_bits, ~u64{0}); - } - // Exclude CPU modified pages when visiting GPU pages - const u64 word = current_word & ~(type == Type::GPU ? untracked_words[word_index] : 0); - u64 page = page_begin; - page_begin = 0; - - while (page < page_end) { - const int empty_bits = std::countr_zero(word >> page); - if (on_going && empty_bits != 0) { - InvokeModifiedRange(func, current_size, current_base); - current_size = 0; - on_going = false; - } - if (empty_bits == PAGES_PER_WORD) { - break; - } - page += empty_bits; - - const int continuous_bits = std::countr_one(word >> page); - if (!on_going && continuous_bits != 0) { - current_base = word_index * PAGES_PER_WORD + page; - on_going = true; - } - current_size += continuous_bits; - page += continuous_bits; - } - } - if (on_going && current_size > 0) { - InvokeModifiedRange(func, current_size, current_base); - } - } - - template <typename Func> - void InvokeModifiedRange(Func&& func, u64 current_size, u64 current_base) { - const u64 current_size_bytes = current_size * BYTES_PER_PAGE; - const u64 offset_begin = current_base * BYTES_PER_PAGE; - const u64 offset_end = std::min(offset_begin + current_size_bytes, SizeBytes()); - func(offset_begin, offset_end - offset_begin); + size_t SizeBytes() const { + return size_bytes; } - /** - * Returns true when a region has been modified - * - * @param offset Offset in bytes from the start of the buffer - * @param size Size in bytes of the region to query for modifications - */ - template <Type type> - [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept { - static_assert(type != Type::Untracked); - - const u64* const untracked_words = Array<Type::Untracked>(); - const u64* const state_words = Array<type>(); - const u64 num_query_words = size / BYTES_PER_WORD + 1; - const u64 word_begin = offset / BYTES_PER_WORD; - const u64 word_end = std::min<u64>(word_begin + num_query_words, NumWords()); - const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE); - u64 page_index = (offset / BYTES_PER_PAGE) % PAGES_PER_WORD; - for (u64 word_index = word_begin; word_index < word_end; ++word_index, page_index = 0) { - const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0; - const u64 word = state_words[word_index] & ~off_word; - if (word == 0) { - continue; - } - const u64 page_end = std::min((word_index + 1) * PAGES_PER_WORD, page_limit); - const u64 local_page_end = page_end % PAGES_PER_WORD; - const u64 page_end_shift = (PAGES_PER_WORD - local_page_end) % PAGES_PER_WORD; - if (((word >> page_index) << page_index) << page_end_shift != 0) { - return true; - } - } - return false; - } - - /** - * Returns a begin end pair with the inclusive modified region - * - * @param offset Offset in bytes from the start of the buffer - * @param size Size in bytes of the region to query for modifications - */ - template <Type type> - [[nodiscard]] std::pair<u64, u64> ModifiedRegion(u64 offset, u64 size) const noexcept { - static_assert(type != Type::Untracked); - - const u64* const untracked_words = Array<Type::Untracked>(); - const u64* const state_words = Array<type>(); - const u64 num_query_words = size / BYTES_PER_WORD + 1; - const u64 word_begin = offset / BYTES_PER_WORD; - const u64 word_end = std::min<u64>(word_begin + num_query_words, NumWords()); - const u64 page_base = offset / BYTES_PER_PAGE; - const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE); - u64 begin = std::numeric_limits<u64>::max(); - u64 end = 0; - for (u64 word_index = word_begin; word_index < word_end; ++word_index) { - const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0; - const u64 word = state_words[word_index] & ~off_word; - if (word == 0) { - continue; - } - const u64 local_page_begin = std::countr_zero(word); - const u64 local_page_end = PAGES_PER_WORD - std::countl_zero(word); - const u64 page_index = word_index * PAGES_PER_WORD; - const u64 page_begin = std::max(page_index + local_page_begin, page_base); - const u64 page_end = std::min(page_index + local_page_end, page_limit); - begin = std::min(begin, page_begin); - end = std::max(end, page_end); - } - static constexpr std::pair<u64, u64> EMPTY{0, 0}; - return begin < end ? std::make_pair(begin * BYTES_PER_PAGE, end * BYTES_PER_PAGE) : EMPTY; - } - - /// Returns the number of words of the buffer - [[nodiscard]] size_t NumWords() const noexcept { - return words.NumWords(); - } - - /// Returns true when the buffer fits in the small vector optimization - [[nodiscard]] bool IsShort() const noexcept { - return words.IsShort(); - } - - RasterizerInterface* rasterizer = nullptr; +private: VAddr cpu_addr = 0; - Words words; BufferFlagBits flags{}; int stream_score = 0; size_t lru_id = SIZE_MAX; + size_t size_bytes = 0; }; } // namespace VideoCommon diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp index a16308b60..40db243d2 100644 --- a/src/video_core/buffer_cache/buffer_cache.cpp +++ b/src/video_core/buffer_cache/buffer_cache.cpp @@ -1,5 +1,5 @@ -// SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later +// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later #include "common/microprofile.h" diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index abdc593df..e534e1e9c 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -1,485 +1,29 @@ -// SPDX-FileCopyrightText: Copyright 2019 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later +// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later #pragma once #include <algorithm> -#include <array> #include <memory> -#include <mutex> #include <numeric> -#include <span> -#include <vector> - -#include <boost/container/small_vector.hpp> -#include <boost/icl/interval_set.hpp> - -#include "common/common_types.h" -#include "common/div_ceil.h" -#include "common/literals.h" -#include "common/lru_cache.h" -#include "common/microprofile.h" -#include "common/polyfill_ranges.h" -#include "common/scratch_buffer.h" -#include "common/settings.h" -#include "core/memory.h" -#include "video_core/buffer_cache/buffer_base.h" -#include "video_core/control/channel_state_cache.h" -#include "video_core/delayed_destruction_ring.h" -#include "video_core/dirty_flags.h" -#include "video_core/engines/draw_manager.h" -#include "video_core/engines/kepler_compute.h" -#include "video_core/engines/maxwell_3d.h" -#include "video_core/memory_manager.h" -#include "video_core/rasterizer_interface.h" -#include "video_core/surface.h" -#include "video_core/texture_cache/slot_vector.h" -#include "video_core/texture_cache/types.h" -namespace VideoCommon { - -MICROPROFILE_DECLARE(GPU_PrepareBuffers); -MICROPROFILE_DECLARE(GPU_BindUploadBuffers); -MICROPROFILE_DECLARE(GPU_DownloadMemory); - -using BufferId = SlotId; - -using VideoCore::Surface::PixelFormat; -using namespace Common::Literals; - -constexpr u32 NUM_VERTEX_BUFFERS = 32; -constexpr u32 NUM_TRANSFORM_FEEDBACK_BUFFERS = 4; -constexpr u32 NUM_GRAPHICS_UNIFORM_BUFFERS = 18; -constexpr u32 NUM_COMPUTE_UNIFORM_BUFFERS = 8; -constexpr u32 NUM_STORAGE_BUFFERS = 16; -constexpr u32 NUM_TEXTURE_BUFFERS = 16; -constexpr u32 NUM_STAGES = 5; - -enum class ObtainBufferSynchronize : u32 { - NoSynchronize = 0, - FullSynchronize = 1, - SynchronizeNoDirty = 2, -}; - -enum class ObtainBufferOperation : u32 { - DoNothing = 0, - MarkAsWritten = 1, - DiscardWrite = 2, - MarkQuery = 3, -}; - -using UniformBufferSizes = std::array<std::array<u32, NUM_GRAPHICS_UNIFORM_BUFFERS>, NUM_STAGES>; -using ComputeUniformBufferSizes = std::array<u32, NUM_COMPUTE_UNIFORM_BUFFERS>; - -template <typename P> -class BufferCache : public VideoCommon::ChannelSetupCaches<VideoCommon::ChannelInfo> { - - // Page size for caching purposes. - // This is unrelated to the CPU page size and it can be changed as it seems optimal. - static constexpr u32 YUZU_PAGEBITS = 16; - static constexpr u64 YUZU_PAGESIZE = u64{1} << YUZU_PAGEBITS; - - static constexpr bool IS_OPENGL = P::IS_OPENGL; - static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = - P::HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS; - static constexpr bool HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT = - P::HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT; - static constexpr bool NEEDS_BIND_UNIFORM_INDEX = P::NEEDS_BIND_UNIFORM_INDEX; - static constexpr bool NEEDS_BIND_STORAGE_INDEX = P::NEEDS_BIND_STORAGE_INDEX; - static constexpr bool USE_MEMORY_MAPS = P::USE_MEMORY_MAPS; - static constexpr bool SEPARATE_IMAGE_BUFFERS_BINDINGS = P::SEPARATE_IMAGE_BUFFER_BINDINGS; - - static constexpr BufferId NULL_BUFFER_ID{0}; - - static constexpr s64 DEFAULT_EXPECTED_MEMORY = 512_MiB; - static constexpr s64 DEFAULT_CRITICAL_MEMORY = 1_GiB; - static constexpr s64 TARGET_THRESHOLD = 4_GiB; - - using Maxwell = Tegra::Engines::Maxwell3D::Regs; - - using Runtime = typename P::Runtime; - using Buffer = typename P::Buffer; - - using IntervalSet = boost::icl::interval_set<VAddr>; - using IntervalType = typename IntervalSet::interval_type; - - struct Empty {}; - - struct OverlapResult { - std::vector<BufferId> ids; - VAddr begin; - VAddr end; - bool has_stream_leap = false; - }; - - struct Binding { - VAddr cpu_addr{}; - u32 size{}; - BufferId buffer_id; - }; - - struct TextureBufferBinding : Binding { - PixelFormat format; - }; - - static constexpr Binding NULL_BINDING{ - .cpu_addr = 0, - .size = 0, - .buffer_id = NULL_BUFFER_ID, - }; - -public: - static constexpr u32 DEFAULT_SKIP_CACHE_SIZE = static_cast<u32>(4_KiB); - - explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_, - Core::Memory::Memory& cpu_memory_, Runtime& runtime_); - - void TickFrame(); - - void WriteMemory(VAddr cpu_addr, u64 size); - - void CachedWriteMemory(VAddr cpu_addr, u64 size); - - void DownloadMemory(VAddr cpu_addr, u64 size); - - bool InlineMemory(VAddr dest_address, size_t copy_size, std::span<const u8> inlined_buffer); - - void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size); - - void DisableGraphicsUniformBuffer(size_t stage, u32 index); - - void UpdateGraphicsBuffers(bool is_indexed); - - void UpdateComputeBuffers(); - - void BindHostGeometryBuffers(bool is_indexed); - - void BindHostStageBuffers(size_t stage); - - void BindHostComputeBuffers(); - - void SetUniformBuffersState(const std::array<u32, NUM_STAGES>& mask, - const UniformBufferSizes* sizes); - - void SetComputeUniformBufferState(u32 mask, const ComputeUniformBufferSizes* sizes); - - void UnbindGraphicsStorageBuffers(size_t stage); - - void BindGraphicsStorageBuffer(size_t stage, size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset, - bool is_written); - - void UnbindGraphicsTextureBuffers(size_t stage); - - void BindGraphicsTextureBuffer(size_t stage, size_t tbo_index, GPUVAddr gpu_addr, u32 size, - PixelFormat format, bool is_written, bool is_image); - - void UnbindComputeStorageBuffers(); - - void BindComputeStorageBuffer(size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset, - bool is_written); - - void UnbindComputeTextureBuffers(); - - void BindComputeTextureBuffer(size_t tbo_index, GPUVAddr gpu_addr, u32 size, PixelFormat format, - bool is_written, bool is_image); - - void FlushCachedWrites(); - - /// Return true when there are uncommitted buffers to be downloaded - [[nodiscard]] bool HasUncommittedFlushes() const noexcept; - - void AccumulateFlushes(); - - /// Return true when the caller should wait for async downloads - [[nodiscard]] bool ShouldWaitAsyncFlushes() const noexcept; - - /// Commit asynchronous downloads - void CommitAsyncFlushes(); - void CommitAsyncFlushesHigh(); - - /// Pop asynchronous downloads - void PopAsyncFlushes(); - - bool DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount); - - bool DMAClear(GPUVAddr src_address, u64 amount, u32 value); - - [[nodiscard]] std::pair<Buffer*, u32> ObtainBuffer(GPUVAddr gpu_addr, u32 size, - ObtainBufferSynchronize sync_info, - ObtainBufferOperation post_op); - - /// Return true when a CPU region is modified from the GPU - [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size); - - /// Return true when a region is registered on the cache - [[nodiscard]] bool IsRegionRegistered(VAddr addr, size_t size); - - /// Return true when a CPU region is modified from the CPU - [[nodiscard]] bool IsRegionCpuModified(VAddr addr, size_t size); - - void SetDrawIndirect( - const Tegra::Engines::DrawManager::IndirectParams* current_draw_indirect_) { - current_draw_indirect = current_draw_indirect_; - } - - [[nodiscard]] std::pair<Buffer*, u32> GetDrawIndirectCount(); - - [[nodiscard]] std::pair<Buffer*, u32> GetDrawIndirectBuffer(); - - std::recursive_mutex mutex; - Runtime& runtime; - -private: - template <typename Func> - static void ForEachEnabledBit(u32 enabled_mask, Func&& func) { - for (u32 index = 0; enabled_mask != 0; ++index, enabled_mask >>= 1) { - const int disabled_bits = std::countr_zero(enabled_mask); - index += disabled_bits; - enabled_mask >>= disabled_bits; - func(index); - } - } - - template <typename Func> - void ForEachBufferInRange(VAddr cpu_addr, u64 size, Func&& func) { - const u64 page_end = Common::DivCeil(cpu_addr + size, YUZU_PAGESIZE); - for (u64 page = cpu_addr >> YUZU_PAGEBITS; page < page_end;) { - const BufferId buffer_id = page_table[page]; - if (!buffer_id) { - ++page; - continue; - } - Buffer& buffer = slot_buffers[buffer_id]; - func(buffer_id, buffer); - - const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes(); - page = Common::DivCeil(end_addr, YUZU_PAGESIZE); - } - } - - template <typename Func> - void ForEachWrittenRange(VAddr cpu_addr, u64 size, Func&& func) { - const VAddr start_address = cpu_addr; - const VAddr end_address = start_address + size; - const VAddr search_base = - static_cast<VAddr>(std::min<s64>(0LL, static_cast<s64>(start_address - size))); - const IntervalType search_interval{search_base, search_base + 1}; - auto it = common_ranges.lower_bound(search_interval); - if (it == common_ranges.end()) { - it = common_ranges.begin(); - } - for (; it != common_ranges.end(); it++) { - VAddr inter_addr_end = it->upper(); - VAddr inter_addr = it->lower(); - if (inter_addr >= end_address) { - break; - } - if (inter_addr_end <= start_address) { - continue; - } - if (inter_addr_end > end_address) { - inter_addr_end = end_address; - } - if (inter_addr < start_address) { - inter_addr = start_address; - } - func(inter_addr, inter_addr_end); - } - } - - static bool IsRangeGranular(VAddr cpu_addr, size_t size) { - return (cpu_addr & ~Core::Memory::YUZU_PAGEMASK) == - ((cpu_addr + size) & ~Core::Memory::YUZU_PAGEMASK); - } - - void RunGarbageCollector(); - - void BindHostIndexBuffer(); - - void BindHostVertexBuffers(); - - void BindHostDrawIndirectBuffers(); - - void BindHostGraphicsUniformBuffers(size_t stage); - - void BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 binding_index, bool needs_bind); - - void BindHostGraphicsStorageBuffers(size_t stage); - - void BindHostGraphicsTextureBuffers(size_t stage); - - void BindHostTransformFeedbackBuffers(); - - void BindHostComputeUniformBuffers(); - - void BindHostComputeStorageBuffers(); - - void BindHostComputeTextureBuffers(); - - void DoUpdateGraphicsBuffers(bool is_indexed); - - void DoUpdateComputeBuffers(); - - void UpdateIndexBuffer(); - - void UpdateVertexBuffers(); - - void UpdateVertexBuffer(u32 index); - - void UpdateDrawIndirect(); - - void UpdateUniformBuffers(size_t stage); - - void UpdateStorageBuffers(size_t stage); - - void UpdateTextureBuffers(size_t stage); - - void UpdateTransformFeedbackBuffers(); - - void UpdateTransformFeedbackBuffer(u32 index); - - void UpdateComputeUniformBuffers(); - - void UpdateComputeStorageBuffers(); - - void UpdateComputeTextureBuffers(); - - void MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size); - - [[nodiscard]] BufferId FindBuffer(VAddr cpu_addr, u32 size); - - [[nodiscard]] OverlapResult ResolveOverlaps(VAddr cpu_addr, u32 wanted_size); - - void JoinOverlap(BufferId new_buffer_id, BufferId overlap_id, bool accumulate_stream_score); - - [[nodiscard]] BufferId CreateBuffer(VAddr cpu_addr, u32 wanted_size); - - void Register(BufferId buffer_id); - - void Unregister(BufferId buffer_id); - - template <bool insert> - void ChangeRegister(BufferId buffer_id); - - void TouchBuffer(Buffer& buffer, BufferId buffer_id) noexcept; - - bool SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size); - - bool SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size); - - void UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy, - std::span<BufferCopy> copies); - - void ImmediateUploadMemory(Buffer& buffer, u64 largest_copy, - std::span<const BufferCopy> copies); - - void MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, std::span<BufferCopy> copies); - - void DownloadBufferMemory(Buffer& buffer_id); - - void DownloadBufferMemory(Buffer& buffer_id, VAddr cpu_addr, u64 size); - - void DeleteBuffer(BufferId buffer_id); - - void NotifyBufferDeletion(); - - [[nodiscard]] Binding StorageBufferBinding(GPUVAddr ssbo_addr, u32 cbuf_index, - bool is_written = false) const; - - [[nodiscard]] TextureBufferBinding GetTextureBufferBinding(GPUVAddr gpu_addr, u32 size, - PixelFormat format); - - [[nodiscard]] std::span<const u8> ImmediateBufferWithData(VAddr cpu_addr, size_t size); - - [[nodiscard]] std::span<u8> ImmediateBuffer(size_t wanted_capacity); - - [[nodiscard]] bool HasFastUniformBufferBound(size_t stage, u32 binding_index) const noexcept; - - void ClearDownload(IntervalType subtract_interval); - - VideoCore::RasterizerInterface& rasterizer; - Core::Memory::Memory& cpu_memory; - - SlotVector<Buffer> slot_buffers; - DelayedDestructionRing<Buffer, 8> delayed_destruction_ring; - - const Tegra::Engines::DrawManager::IndirectParams* current_draw_indirect{}; - - u32 last_index_count = 0; - - Binding index_buffer; - std::array<Binding, NUM_VERTEX_BUFFERS> vertex_buffers; - std::array<std::array<Binding, NUM_GRAPHICS_UNIFORM_BUFFERS>, NUM_STAGES> uniform_buffers; - std::array<std::array<Binding, NUM_STORAGE_BUFFERS>, NUM_STAGES> storage_buffers; - std::array<std::array<TextureBufferBinding, NUM_TEXTURE_BUFFERS>, NUM_STAGES> texture_buffers; - std::array<Binding, NUM_TRANSFORM_FEEDBACK_BUFFERS> transform_feedback_buffers; - Binding count_buffer_binding; - Binding indirect_buffer_binding; - - std::array<Binding, NUM_COMPUTE_UNIFORM_BUFFERS> compute_uniform_buffers; - std::array<Binding, NUM_STORAGE_BUFFERS> compute_storage_buffers; - std::array<TextureBufferBinding, NUM_TEXTURE_BUFFERS> compute_texture_buffers; - - std::array<u32, NUM_STAGES> enabled_uniform_buffer_masks{}; - u32 enabled_compute_uniform_buffer_mask = 0; - - const UniformBufferSizes* uniform_buffer_sizes{}; - const ComputeUniformBufferSizes* compute_uniform_buffer_sizes{}; - - std::array<u32, NUM_STAGES> enabled_storage_buffers{}; - std::array<u32, NUM_STAGES> written_storage_buffers{}; - u32 enabled_compute_storage_buffers = 0; - u32 written_compute_storage_buffers = 0; - - std::array<u32, NUM_STAGES> enabled_texture_buffers{}; - std::array<u32, NUM_STAGES> written_texture_buffers{}; - std::array<u32, NUM_STAGES> image_texture_buffers{}; - u32 enabled_compute_texture_buffers = 0; - u32 written_compute_texture_buffers = 0; - u32 image_compute_texture_buffers = 0; - - std::array<u32, 16> uniform_cache_hits{}; - std::array<u32, 16> uniform_cache_shots{}; - - u32 uniform_buffer_skip_cache_size = DEFAULT_SKIP_CACHE_SIZE; - - bool has_deleted_buffers = false; +#include "video_core/buffer_cache/buffer_cache_base.h" - std::conditional_t<HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS, std::array<u32, NUM_STAGES>, Empty> - dirty_uniform_buffers{}; - std::conditional_t<IS_OPENGL, std::array<u32, NUM_STAGES>, Empty> fast_bound_uniform_buffers{}; - std::conditional_t<HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS, - std::array<std::array<u32, NUM_GRAPHICS_UNIFORM_BUFFERS>, NUM_STAGES>, Empty> - uniform_buffer_binding_sizes{}; - - std::vector<BufferId> cached_write_buffer_ids; - - IntervalSet uncommitted_ranges; - IntervalSet common_ranges; - std::deque<IntervalSet> committed_ranges; - - Common::ScratchBuffer<u8> immediate_buffer_alloc; - - struct LRUItemParams { - using ObjectType = BufferId; - using TickType = u64; - }; - Common::LeastRecentlyUsedCache<LRUItemParams> lru_cache; - u64 frame_tick = 0; - u64 total_used_memory = 0; - u64 minimum_memory = 0; - u64 critical_memory = 0; +namespace VideoCommon { - std::array<BufferId, ((1ULL << 39) >> YUZU_PAGEBITS)> page_table; -}; +using Core::Memory::YUZU_PAGESIZE; template <class P> BufferCache<P>::BufferCache(VideoCore::RasterizerInterface& rasterizer_, Core::Memory::Memory& cpu_memory_, Runtime& runtime_) - : runtime{runtime_}, rasterizer{rasterizer_}, cpu_memory{cpu_memory_} { + : runtime{runtime_}, rasterizer{rasterizer_}, cpu_memory{cpu_memory_}, memory_tracker{ + rasterizer} { // Ensure the first slot is used for the null buffer void(slot_buffers.insert(runtime, NullBufferParams{})); common_ranges.clear(); + inline_buffer_id = NULL_BUFFER_ID; + + active_async_buffers = !Settings::IsGPULevelHigh(); if (!runtime.CanReportMemoryUsage()) { minimum_memory = DEFAULT_EXPECTED_MEMORY; @@ -531,6 +75,8 @@ void BufferCache<P>::TickFrame() { uniform_cache_hits[0] = 0; uniform_cache_shots[0] = 0; + active_async_buffers = !Settings::IsGPULevelHigh(); + const bool skip_preferred = hits * 256 < shots * 251; uniform_buffer_skip_cache_size = skip_preferred ? DEFAULT_SKIP_CACHE_SIZE : 0; @@ -543,35 +89,62 @@ void BufferCache<P>::TickFrame() { } ++frame_tick; delayed_destruction_ring.Tick(); + + if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { + for (auto& buffer : async_buffers_death_ring) { + runtime.FreeDeferredStagingBuffer(buffer); + } + async_buffers_death_ring.clear(); + } } template <class P> void BufferCache<P>::WriteMemory(VAddr cpu_addr, u64 size) { - ForEachBufferInRange(cpu_addr, size, [&](BufferId, Buffer& buffer) { - buffer.MarkRegionAsCpuModified(cpu_addr, size); - }); + memory_tracker.MarkRegionAsCpuModified(cpu_addr, size); + if (memory_tracker.IsRegionGpuModified(cpu_addr, size)) { + const IntervalType subtract_interval{cpu_addr, cpu_addr + size}; + ClearDownload(subtract_interval); + common_ranges.subtract(subtract_interval); + } } template <class P> void BufferCache<P>::CachedWriteMemory(VAddr cpu_addr, u64 size) { - ForEachBufferInRange(cpu_addr, size, [&](BufferId buffer_id, Buffer& buffer) { - if (!buffer.HasCachedWrites()) { - cached_write_buffer_ids.push_back(buffer_id); - } - buffer.CachedCpuWrite(cpu_addr, size); - }); + memory_tracker.CachedCpuWrite(cpu_addr, size); + const IntervalType add_interval{Common::AlignDown(cpu_addr, YUZU_PAGESIZE), + Common::AlignUp(cpu_addr + size, YUZU_PAGESIZE)}; + cached_ranges.add(add_interval); } template <class P> void BufferCache<P>::DownloadMemory(VAddr cpu_addr, u64 size) { + WaitOnAsyncFlushes(cpu_addr, size); ForEachBufferInRange(cpu_addr, size, [&](BufferId, Buffer& buffer) { DownloadBufferMemory(buffer, cpu_addr, size); }); } template <class P> +void BufferCache<P>::WaitOnAsyncFlushes(VAddr cpu_addr, u64 size) { + bool must_wait = false; + ForEachInOverlapCounter(async_downloads, cpu_addr, size, + [&](VAddr, VAddr, int) { must_wait = true; }); + bool must_release = false; + ForEachInRangeSet(pending_ranges, cpu_addr, size, [&](VAddr, VAddr) { must_release = true; }); + if (must_release) { + std::function<void()> tmp([]() {}); + rasterizer.SignalFence(std::move(tmp)); + } + if (must_wait || must_release) { + rasterizer.ReleaseFences(); + } +} + +template <class P> void BufferCache<P>::ClearDownload(IntervalType subtract_interval) { + RemoveEachInOverlapCounter(async_downloads, subtract_interval, -1024); uncommitted_ranges.subtract(subtract_interval); + pending_ranges.subtract(subtract_interval); for (auto& interval_set : committed_ranges) { interval_set.subtract(subtract_interval); } @@ -591,6 +164,7 @@ bool BufferCache<P>::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am } const IntervalType subtract_interval{*cpu_dest_address, *cpu_dest_address + amount}; + WaitOnAsyncFlushes(*cpu_src_address, static_cast<u32>(amount)); ClearDownload(subtract_interval); BufferId buffer_a; @@ -616,10 +190,11 @@ bool BufferCache<P>::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am const VAddr diff = base_address - *cpu_src_address; const VAddr new_base_address = *cpu_dest_address + diff; const IntervalType add_interval{new_base_address, new_base_address + size}; - uncommitted_ranges.add(add_interval); tmp_intervals.push_back(add_interval); + uncommitted_ranges.add(add_interval); + pending_ranges.add(add_interval); }; - ForEachWrittenRange(*cpu_src_address, amount, mirror); + ForEachInRangeSet(common_ranges, *cpu_src_address, amount, mirror); // This subtraction in this order is important for overlapping copies. common_ranges.subtract(subtract_interval); const bool has_new_downloads = tmp_intervals.size() != 0; @@ -628,7 +203,7 @@ bool BufferCache<P>::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am } runtime.CopyBuffer(dest_buffer, src_buffer, copies); if (has_new_downloads) { - dest_buffer.MarkRegionAsGpuModified(*cpu_dest_address, amount); + memory_tracker.MarkRegionAsGpuModified(*cpu_dest_address, amount); } std::vector<u8> tmp_buffer(amount); cpu_memory.ReadBlockUnsafe(*cpu_src_address, tmp_buffer.data(), amount); @@ -866,10 +441,9 @@ void BufferCache<P>::BindComputeTextureBuffer(size_t tbo_index, GPUVAddr gpu_add template <class P> void BufferCache<P>::FlushCachedWrites() { - for (const BufferId buffer_id : cached_write_buffer_ids) { - slot_buffers[buffer_id].FlushCachedWrites(); - } cached_write_buffer_ids.clear(); + memory_tracker.FlushCachedWrites(); + cached_ranges.clear(); } template <class P> @@ -879,10 +453,6 @@ bool BufferCache<P>::HasUncommittedFlushes() const noexcept { template <class P> void BufferCache<P>::AccumulateFlushes() { - if (Settings::values.gpu_accuracy.GetValue() != Settings::GPUAccuracy::High) { - uncommitted_ranges.clear(); - return; - } if (uncommitted_ranges.empty()) { return; } @@ -891,7 +461,11 @@ void BufferCache<P>::AccumulateFlushes() { template <class P> bool BufferCache<P>::ShouldWaitAsyncFlushes() const noexcept { - return false; + if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { + return (!async_buffers.empty() && async_buffers.front().has_value()); + } else { + return false; + } } template <class P> @@ -899,12 +473,16 @@ void BufferCache<P>::CommitAsyncFlushesHigh() { AccumulateFlushes(); if (committed_ranges.empty()) { + if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { + if (active_async_buffers) { + async_buffers.emplace_back(std::optional<Async_Buffer>{}); + } + } return; } MICROPROFILE_SCOPE(GPU_DownloadMemory); - const bool is_accuracy_normal = - Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::Normal; + pending_ranges.clear(); auto it = committed_ranges.begin(); while (it != committed_ranges.end()) { auto& current_intervals = *it; @@ -926,11 +504,12 @@ void BufferCache<P>::CommitAsyncFlushesHigh() { const std::size_t size = interval.upper() - interval.lower(); const VAddr cpu_addr = interval.lower(); ForEachBufferInRange(cpu_addr, size, [&](BufferId buffer_id, Buffer& buffer) { - buffer.ForEachDownloadRangeAndClear( - cpu_addr, size, [&](u64 range_offset, u64 range_size) { - if (is_accuracy_normal) { - return; - } + const VAddr buffer_start = buffer.CpuAddr(); + const VAddr buffer_end = buffer_start + buffer.SizeBytes(); + const VAddr new_start = std::max(buffer_start, cpu_addr); + const VAddr new_end = std::min(buffer_end, cpu_addr + size); + memory_tracker.ForEachDownloadRange( + new_start, new_end - new_start, false, [&](u64 cpu_addr_out, u64 range_size) { const VAddr buffer_addr = buffer.CpuAddr(); const auto add_download = [&](VAddr start, VAddr end) { const u64 new_offset = start - buffer_addr; @@ -944,92 +523,142 @@ void BufferCache<P>::CommitAsyncFlushesHigh() { buffer_id, }); // Align up to avoid cache conflicts - constexpr u64 align = 8ULL; + constexpr u64 align = 64ULL; constexpr u64 mask = ~(align - 1ULL); total_size_bytes += (new_size + align - 1) & mask; largest_copy = std::max(largest_copy, new_size); }; - const VAddr start_address = buffer_addr + range_offset; - const VAddr end_address = start_address + range_size; - ForEachWrittenRange(start_address, range_size, add_download); - const IntervalType subtract_interval{start_address, end_address}; - common_ranges.subtract(subtract_interval); + ForEachInRangeSet(common_ranges, cpu_addr_out, range_size, add_download); }); }); } } committed_ranges.clear(); if (downloads.empty()) { + if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { + if (active_async_buffers) { + async_buffers.emplace_back(std::optional<Async_Buffer>{}); + } + } return; } - if constexpr (USE_MEMORY_MAPS) { - auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes); - runtime.PreCopyBarrier(); - for (auto& [copy, buffer_id] : downloads) { - // Have in mind the staging buffer offset for the copy - copy.dst_offset += download_staging.offset; - const std::array copies{copy}; - runtime.CopyBuffer(download_staging.buffer, slot_buffers[buffer_id], copies, false); - } - runtime.PostCopyBarrier(); - runtime.Finish(); - for (const auto& [copy, buffer_id] : downloads) { - const Buffer& buffer = slot_buffers[buffer_id]; - const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset; - // Undo the modified offset - const u64 dst_offset = copy.dst_offset - download_staging.offset; - const u8* read_mapped_memory = download_staging.mapped_span.data() + dst_offset; - cpu_memory.WriteBlockUnsafe(cpu_addr, read_mapped_memory, copy.size); + if (active_async_buffers) { + if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { + auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes, true); + boost::container::small_vector<BufferCopy, 4> normalized_copies; + IntervalSet new_async_range{}; + runtime.PreCopyBarrier(); + for (auto& [copy, buffer_id] : downloads) { + copy.dst_offset += download_staging.offset; + const std::array copies{copy}; + BufferCopy second_copy{copy}; + Buffer& buffer = slot_buffers[buffer_id]; + second_copy.src_offset = static_cast<size_t>(buffer.CpuAddr()) + copy.src_offset; + VAddr orig_cpu_addr = static_cast<VAddr>(second_copy.src_offset); + const IntervalType base_interval{orig_cpu_addr, orig_cpu_addr + copy.size}; + async_downloads += std::make_pair(base_interval, 1); + runtime.CopyBuffer(download_staging.buffer, buffer, copies, false); + normalized_copies.push_back(second_copy); + } + runtime.PostCopyBarrier(); + pending_downloads.emplace_back(std::move(normalized_copies)); + async_buffers.emplace_back(download_staging); + } else { + committed_ranges.clear(); + uncommitted_ranges.clear(); } } else { - const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy); - for (const auto& [copy, buffer_id] : downloads) { - Buffer& buffer = slot_buffers[buffer_id]; - buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size)); - const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset; - cpu_memory.WriteBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size); + if constexpr (USE_MEMORY_MAPS) { + auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes); + runtime.PreCopyBarrier(); + for (auto& [copy, buffer_id] : downloads) { + // Have in mind the staging buffer offset for the copy + copy.dst_offset += download_staging.offset; + const std::array copies{copy}; + runtime.CopyBuffer(download_staging.buffer, slot_buffers[buffer_id], copies, false); + } + runtime.PostCopyBarrier(); + runtime.Finish(); + for (const auto& [copy, buffer_id] : downloads) { + const Buffer& buffer = slot_buffers[buffer_id]; + const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset; + // Undo the modified offset + const u64 dst_offset = copy.dst_offset - download_staging.offset; + const u8* read_mapped_memory = download_staging.mapped_span.data() + dst_offset; + cpu_memory.WriteBlockUnsafe(cpu_addr, read_mapped_memory, copy.size); + } + } else { + const std::span<u8> immediate_buffer = ImmediateBuffer(largest_copy); + for (const auto& [copy, buffer_id] : downloads) { + Buffer& buffer = slot_buffers[buffer_id]; + buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size)); + const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset; + cpu_memory.WriteBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size); + } } } } template <class P> void BufferCache<P>::CommitAsyncFlushes() { - if (Settings::values.gpu_accuracy.GetValue() == Settings::GPUAccuracy::High) { - CommitAsyncFlushesHigh(); - } else { - uncommitted_ranges.clear(); - committed_ranges.clear(); - } + CommitAsyncFlushesHigh(); } template <class P> -void BufferCache<P>::PopAsyncFlushes() {} +void BufferCache<P>::PopAsyncFlushes() { + MICROPROFILE_SCOPE(GPU_DownloadMemory); + PopAsyncBuffers(); +} template <class P> -bool BufferCache<P>::IsRegionGpuModified(VAddr addr, size_t size) { - const u64 page_end = Common::DivCeil(addr + size, YUZU_PAGESIZE); - for (u64 page = addr >> YUZU_PAGEBITS; page < page_end;) { - const BufferId image_id = page_table[page]; - if (!image_id) { - ++page; - continue; - } - Buffer& buffer = slot_buffers[image_id]; - if (buffer.IsRegionGpuModified(addr, size)) { - return true; +void BufferCache<P>::PopAsyncBuffers() { + if (async_buffers.empty()) { + return; + } + if (!async_buffers.front().has_value()) { + async_buffers.pop_front(); + return; + } + if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { + auto& downloads = pending_downloads.front(); + auto& async_buffer = async_buffers.front(); + u8* base = async_buffer->mapped_span.data(); + const size_t base_offset = async_buffer->offset; + for (const auto& copy : downloads) { + const VAddr cpu_addr = static_cast<VAddr>(copy.src_offset); + const u64 dst_offset = copy.dst_offset - base_offset; + const u8* read_mapped_memory = base + dst_offset; + ForEachInOverlapCounter( + async_downloads, cpu_addr, copy.size, [&](VAddr start, VAddr end, int count) { + cpu_memory.WriteBlockUnsafe(start, &read_mapped_memory[start - cpu_addr], + end - start); + if (count == 1) { + const IntervalType base_interval{start, end}; + common_ranges.subtract(base_interval); + } + }); + const IntervalType subtract_interval{cpu_addr, cpu_addr + copy.size}; + RemoveEachInOverlapCounter(async_downloads, subtract_interval, -1); } - const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes(); - page = Common::DivCeil(end_addr, YUZU_PAGESIZE); + async_buffers_death_ring.emplace_back(*async_buffer); + async_buffers.pop_front(); + pending_downloads.pop_front(); } - return false; +} + +template <class P> +bool BufferCache<P>::IsRegionGpuModified(VAddr addr, size_t size) { + bool is_dirty = false; + ForEachInRangeSet(common_ranges, addr, size, [&](VAddr, VAddr) { is_dirty = true; }); + return is_dirty; } template <class P> bool BufferCache<P>::IsRegionRegistered(VAddr addr, size_t size) { const VAddr end_addr = addr + size; - const u64 page_end = Common::DivCeil(end_addr, YUZU_PAGESIZE); - for (u64 page = addr >> YUZU_PAGEBITS; page < page_end;) { + const u64 page_end = Common::DivCeil(end_addr, CACHING_PAGESIZE); + for (u64 page = addr >> CACHING_PAGEBITS; page < page_end;) { const BufferId buffer_id = page_table[page]; if (!buffer_id) { ++page; @@ -1041,28 +670,14 @@ bool BufferCache<P>::IsRegionRegistered(VAddr addr, size_t size) { if (buf_start_addr < end_addr && addr < buf_end_addr) { return true; } - page = Common::DivCeil(end_addr, YUZU_PAGESIZE); + page = Common::DivCeil(end_addr, CACHING_PAGESIZE); } return false; } template <class P> bool BufferCache<P>::IsRegionCpuModified(VAddr addr, size_t size) { - const u64 page_end = Common::DivCeil(addr + size, YUZU_PAGESIZE); - for (u64 page = addr >> YUZU_PAGEBITS; page < page_end;) { - const BufferId image_id = page_table[page]; - if (!image_id) { - ++page; - continue; - } - Buffer& buffer = slot_buffers[image_id]; - if (buffer.IsRegionCpuModified(addr, size)) { - return true; - } - const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes(); - page = Common::DivCeil(end_addr, YUZU_PAGESIZE); - } - return false; + return memory_tracker.IsRegionCpuModified(addr, size); } template <class P> @@ -1072,7 +687,7 @@ void BufferCache<P>::BindHostIndexBuffer() { const u32 offset = buffer.Offset(index_buffer.cpu_addr); const u32 size = index_buffer.size; const auto& draw_state = maxwell3d->draw_manager->GetDrawState(); - if (!draw_state.inline_index_draw_indexes.empty()) { + if (!draw_state.inline_index_draw_indexes.empty()) [[unlikely]] { if constexpr (USE_MEMORY_MAPS) { auto upload_staging = runtime.UploadStagingBuffer(size); std::array<BufferCopy, 1> copies{ @@ -1155,7 +770,7 @@ void BufferCache<P>::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 TouchBuffer(buffer, binding.buffer_id); const bool use_fast_buffer = binding.buffer_id != NULL_BUFFER_ID && size <= uniform_buffer_skip_cache_size && - !buffer.IsRegionGpuModified(cpu_addr, size); + !memory_tracker.IsRegionGpuModified(cpu_addr, size); if (use_fast_buffer) { if constexpr (IS_OPENGL) { if (runtime.HasFastBufferSubData()) { @@ -1378,27 +993,36 @@ void BufferCache<P>::UpdateIndexBuffer() { // We have to check for the dirty flags and index count // The index count is currently changed without updating the dirty flags const auto& draw_state = maxwell3d->draw_manager->GetDrawState(); - const auto& index_array = draw_state.index_buffer; + const auto& index_buffer_ref = draw_state.index_buffer; auto& flags = maxwell3d->dirty.flags; if (!flags[Dirty::IndexBuffer]) { return; } flags[Dirty::IndexBuffer] = false; - last_index_count = index_array.count; - if (!draw_state.inline_index_draw_indexes.empty()) { + if (!draw_state.inline_index_draw_indexes.empty()) [[unlikely]] { auto inline_index_size = static_cast<u32>(draw_state.inline_index_draw_indexes.size()); + u32 buffer_size = Common::AlignUp(inline_index_size, CACHING_PAGESIZE); + if (inline_buffer_id == NULL_BUFFER_ID) [[unlikely]] { + inline_buffer_id = CreateBuffer(0, buffer_size); + } + if (slot_buffers[inline_buffer_id].SizeBytes() < buffer_size) [[unlikely]] { + slot_buffers.erase(inline_buffer_id); + inline_buffer_id = CreateBuffer(0, buffer_size); + } index_buffer = Binding{ .cpu_addr = 0, .size = inline_index_size, - .buffer_id = CreateBuffer(0, inline_index_size), + .buffer_id = inline_buffer_id, }; return; } - const GPUVAddr gpu_addr_begin = index_array.StartAddress(); - const GPUVAddr gpu_addr_end = index_array.EndAddress(); + + const GPUVAddr gpu_addr_begin = index_buffer_ref.StartAddress(); + const GPUVAddr gpu_addr_end = index_buffer_ref.EndAddress(); const std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr_begin); const u32 address_size = static_cast<u32>(gpu_addr_end - gpu_addr_begin); - const u32 draw_size = (index_array.count + index_array.first) * index_array.FormatSizeInBytes(); + const u32 draw_size = + (index_buffer_ref.count + index_buffer_ref.first) * index_buffer_ref.FormatSizeInBytes(); const u32 size = std::min(address_size, draw_size); if (size == 0 || !cpu_addr) { index_buffer = NULL_BINDING; @@ -1434,17 +1058,15 @@ void BufferCache<P>::UpdateVertexBuffer(u32 index) { const GPUVAddr gpu_addr_begin = array.Address(); const GPUVAddr gpu_addr_end = limit.Address() + 1; const std::optional<VAddr> cpu_addr = gpu_memory->GpuToCpuAddress(gpu_addr_begin); - u32 address_size = static_cast<u32>( - std::min(gpu_addr_end - gpu_addr_begin, static_cast<u64>(std::numeric_limits<u32>::max()))); - if (array.enable == 0 || address_size == 0 || !cpu_addr) { + const u32 address_size = static_cast<u32>(gpu_addr_end - gpu_addr_begin); + u32 size = address_size; // TODO: Analyze stride and number of vertices + if (array.enable == 0 || size == 0 || !cpu_addr) { vertex_buffers[index] = NULL_BINDING; return; } if (!gpu_memory->IsWithinGPUAddressRange(gpu_addr_end)) { - address_size = - static_cast<u32>(gpu_memory->MaxContinuousRange(gpu_addr_begin, address_size)); + size = static_cast<u32>(gpu_memory->MaxContinuousRange(gpu_addr_begin, size)); } - const u32 size = address_size; // TODO: Analyze stride and number of vertices vertex_buffers[index] = Binding{ .cpu_addr = *cpu_addr, .size = size, @@ -1591,17 +1213,16 @@ void BufferCache<P>::UpdateComputeTextureBuffers() { template <class P> void BufferCache<P>::MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size) { - Buffer& buffer = slot_buffers[buffer_id]; - buffer.MarkRegionAsGpuModified(cpu_addr, size); + memory_tracker.MarkRegionAsGpuModified(cpu_addr, size); + + if (memory_tracker.IsRegionCpuModified(cpu_addr, size)) { + SynchronizeBuffer(slot_buffers[buffer_id], cpu_addr, size); + } const IntervalType base_interval{cpu_addr, cpu_addr + size}; common_ranges.add(base_interval); - - const bool is_async = Settings::values.use_asynchronous_gpu_emulation.GetValue(); - if (!is_async) { - return; - } uncommitted_ranges.add(base_interval); + pending_ranges.add(base_interval); } template <class P> @@ -1609,7 +1230,7 @@ BufferId BufferCache<P>::FindBuffer(VAddr cpu_addr, u32 size) { if (cpu_addr == 0) { return NULL_BUFFER_ID; } - const u64 page = cpu_addr >> YUZU_PAGEBITS; + const u64 page = cpu_addr >> CACHING_PAGEBITS; const BufferId buffer_id = page_table[page]; if (!buffer_id) { return CreateBuffer(cpu_addr, size); @@ -1638,9 +1259,9 @@ typename BufferCache<P>::OverlapResult BufferCache<P>::ResolveOverlaps(VAddr cpu .has_stream_leap = has_stream_leap, }; } - for (; cpu_addr >> YUZU_PAGEBITS < Common::DivCeil(end, YUZU_PAGESIZE); - cpu_addr += YUZU_PAGESIZE) { - const BufferId overlap_id = page_table[cpu_addr >> YUZU_PAGEBITS]; + for (; cpu_addr >> CACHING_PAGEBITS < Common::DivCeil(end, CACHING_PAGESIZE); + cpu_addr += CACHING_PAGESIZE) { + const BufferId overlap_id = page_table[cpu_addr >> CACHING_PAGEBITS]; if (!overlap_id) { continue; } @@ -1666,11 +1287,11 @@ typename BufferCache<P>::OverlapResult BufferCache<P>::ResolveOverlaps(VAddr cpu // as a stream buffer. Increase the size to skip constantly recreating buffers. has_stream_leap = true; if (expands_right) { - begin -= YUZU_PAGESIZE * 256; + begin -= CACHING_PAGESIZE * 256; cpu_addr = begin; } if (expands_left) { - end += YUZU_PAGESIZE * 256; + end += CACHING_PAGESIZE * 256; } } } @@ -1690,25 +1311,22 @@ void BufferCache<P>::JoinOverlap(BufferId new_buffer_id, BufferId overlap_id, if (accumulate_stream_score) { new_buffer.IncreaseStreamScore(overlap.StreamScore() + 1); } - std::vector<BufferCopy> copies; + boost::container::small_vector<BufferCopy, 1> copies; const size_t dst_base_offset = overlap.CpuAddr() - new_buffer.CpuAddr(); - overlap.ForEachDownloadRange([&](u64 begin, u64 range_size) { - copies.push_back(BufferCopy{ - .src_offset = begin, - .dst_offset = dst_base_offset + begin, - .size = range_size, - }); - new_buffer.UnmarkRegionAsCpuModified(begin, range_size); - new_buffer.MarkRegionAsGpuModified(begin, range_size); + copies.push_back(BufferCopy{ + .src_offset = 0, + .dst_offset = dst_base_offset, + .size = overlap.SizeBytes(), }); - if (!copies.empty()) { - runtime.CopyBuffer(slot_buffers[new_buffer_id], overlap, copies); - } - DeleteBuffer(overlap_id); + runtime.CopyBuffer(new_buffer, overlap, copies); + DeleteBuffer(overlap_id, true); } template <class P> BufferId BufferCache<P>::CreateBuffer(VAddr cpu_addr, u32 wanted_size) { + VAddr cpu_addr_end = Common::AlignUp(cpu_addr + wanted_size, CACHING_PAGESIZE); + cpu_addr = Common::AlignDown(cpu_addr, CACHING_PAGESIZE); + wanted_size = static_cast<u32>(cpu_addr_end - cpu_addr); const OverlapResult overlap = ResolveOverlaps(cpu_addr, wanted_size); const u32 size = static_cast<u32>(overlap.end - overlap.begin); const BufferId new_buffer_id = slot_buffers.insert(runtime, rasterizer, overlap.begin, size); @@ -1718,7 +1336,7 @@ BufferId BufferCache<P>::CreateBuffer(VAddr cpu_addr, u32 wanted_size) { JoinOverlap(new_buffer_id, overlap_id, !overlap.has_stream_leap); } Register(new_buffer_id); - TouchBuffer(slot_buffers[new_buffer_id], new_buffer_id); + TouchBuffer(new_buffer, new_buffer_id); return new_buffer_id; } @@ -1746,8 +1364,8 @@ void BufferCache<P>::ChangeRegister(BufferId buffer_id) { } const VAddr cpu_addr_begin = buffer.CpuAddr(); const VAddr cpu_addr_end = cpu_addr_begin + size; - const u64 page_begin = cpu_addr_begin / YUZU_PAGESIZE; - const u64 page_end = Common::DivCeil(cpu_addr_end, YUZU_PAGESIZE); + const u64 page_begin = cpu_addr_begin / CACHING_PAGESIZE; + const u64 page_end = Common::DivCeil(cpu_addr_end, CACHING_PAGESIZE); for (u64 page = page_begin; page != page_end; ++page) { if constexpr (insert) { page_table[page] = buffer_id; @@ -1766,9 +1384,6 @@ void BufferCache<P>::TouchBuffer(Buffer& buffer, BufferId buffer_id) noexcept { template <class P> bool BufferCache<P>::SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size) { - if (buffer.CpuAddr() == 0) { - return true; - } return SynchronizeBufferImpl(buffer, cpu_addr, size); } @@ -1777,10 +1392,11 @@ bool BufferCache<P>::SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 s boost::container::small_vector<BufferCopy, 4> copies; u64 total_size_bytes = 0; u64 largest_copy = 0; - buffer.ForEachUploadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) { + VAddr buffer_start = buffer.CpuAddr(); + memory_tracker.ForEachUploadRange(cpu_addr, size, [&](u64 cpu_addr_out, u64 range_size) { copies.push_back(BufferCopy{ .src_offset = total_size_bytes, - .dst_offset = range_offset, + .dst_offset = cpu_addr_out - buffer_start, .size = range_size, }); total_size_bytes += range_size; @@ -1795,6 +1411,51 @@ bool BufferCache<P>::SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 s } template <class P> +bool BufferCache<P>::SynchronizeBufferNoModified(Buffer& buffer, VAddr cpu_addr, u32 size) { + boost::container::small_vector<BufferCopy, 4> copies; + u64 total_size_bytes = 0; + u64 largest_copy = 0; + IntervalSet found_sets{}; + auto make_copies = [&] { + for (auto& interval : found_sets) { + const std::size_t sub_size = interval.upper() - interval.lower(); + const VAddr cpu_addr_ = interval.lower(); + copies.push_back(BufferCopy{ + .src_offset = total_size_bytes, + .dst_offset = cpu_addr_ - buffer.CpuAddr(), + .size = sub_size, + }); + total_size_bytes += sub_size; + largest_copy = std::max<u64>(largest_copy, sub_size); + } + const std::span<BufferCopy> copies_span(copies.data(), copies.size()); + UploadMemory(buffer, total_size_bytes, largest_copy, copies_span); + }; + memory_tracker.ForEachUploadRange(cpu_addr, size, [&](u64 cpu_addr_out, u64 range_size) { + const VAddr base_adr = cpu_addr_out; + const VAddr end_adr = base_adr + range_size; + const IntervalType add_interval{base_adr, end_adr}; + found_sets.add(add_interval); + }); + if (found_sets.empty()) { + return true; + } + const IntervalType search_interval{cpu_addr, cpu_addr + size}; + auto it = common_ranges.lower_bound(search_interval); + auto it_end = common_ranges.upper_bound(search_interval); + if (it == common_ranges.end()) { + make_copies(); + return false; + } + while (it != it_end) { + found_sets.subtract(*it); + it++; + } + make_copies(); + return false; +} + +template <class P> void BufferCache<P>::UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy, std::span<BufferCopy> copies) { if constexpr (USE_MEMORY_MAPS) { @@ -1805,39 +1466,45 @@ void BufferCache<P>::UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 larg } template <class P> -void BufferCache<P>::ImmediateUploadMemory(Buffer& buffer, u64 largest_copy, - std::span<const BufferCopy> copies) { - std::span<u8> immediate_buffer; - for (const BufferCopy& copy : copies) { - std::span<const u8> upload_span; - const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset; - if (IsRangeGranular(cpu_addr, copy.size)) { - upload_span = std::span(cpu_memory.GetPointer(cpu_addr), copy.size); - } else { - if (immediate_buffer.empty()) { - immediate_buffer = ImmediateBuffer(largest_copy); +void BufferCache<P>::ImmediateUploadMemory([[maybe_unused]] Buffer& buffer, + [[maybe_unused]] u64 largest_copy, + [[maybe_unused]] std::span<const BufferCopy> copies) { + if constexpr (!USE_MEMORY_MAPS) { + std::span<u8> immediate_buffer; + for (const BufferCopy& copy : copies) { + std::span<const u8> upload_span; + const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset; + if (IsRangeGranular(cpu_addr, copy.size)) { + upload_span = std::span(cpu_memory.GetPointer(cpu_addr), copy.size); + } else { + if (immediate_buffer.empty()) { + immediate_buffer = ImmediateBuffer(largest_copy); + } + cpu_memory.ReadBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size); + upload_span = immediate_buffer.subspan(0, copy.size); } - cpu_memory.ReadBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size); - upload_span = immediate_buffer.subspan(0, copy.size); + buffer.ImmediateUpload(copy.dst_offset, upload_span); } - buffer.ImmediateUpload(copy.dst_offset, upload_span); } } template <class P> -void BufferCache<P>::MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, - std::span<BufferCopy> copies) { - auto upload_staging = runtime.UploadStagingBuffer(total_size_bytes); - const std::span<u8> staging_pointer = upload_staging.mapped_span; - for (BufferCopy& copy : copies) { - u8* const src_pointer = staging_pointer.data() + copy.src_offset; - const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset; - cpu_memory.ReadBlockUnsafe(cpu_addr, src_pointer, copy.size); +void BufferCache<P>::MappedUploadMemory([[maybe_unused]] Buffer& buffer, + [[maybe_unused]] u64 total_size_bytes, + [[maybe_unused]] std::span<BufferCopy> copies) { + if constexpr (USE_MEMORY_MAPS) { + auto upload_staging = runtime.UploadStagingBuffer(total_size_bytes); + const std::span<u8> staging_pointer = upload_staging.mapped_span; + for (BufferCopy& copy : copies) { + u8* const src_pointer = staging_pointer.data() + copy.src_offset; + const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset; + cpu_memory.ReadBlockUnsafe(cpu_addr, src_pointer, copy.size); - // Apply the staging offset - copy.src_offset += upload_staging.offset; + // Apply the staging offset + copy.src_offset += upload_staging.offset; + } + runtime.CopyBuffer(buffer, upload_staging.buffer, copies); } - runtime.CopyBuffer(buffer, upload_staging.buffer, copies); } template <class P> @@ -1847,7 +1514,9 @@ bool BufferCache<P>::InlineMemory(VAddr dest_address, size_t copy_size, if (!is_dirty) { return false; } - if (!IsRegionGpuModified(dest_address, copy_size)) { + VAddr aligned_start = Common::AlignDown(dest_address, YUZU_PAGESIZE); + VAddr aligned_end = Common::AlignUp(dest_address + copy_size, YUZU_PAGESIZE); + if (!IsRegionGpuModified(aligned_start, aligned_end - aligned_start)) { return false; } @@ -1886,30 +1555,31 @@ void BufferCache<P>::DownloadBufferMemory(Buffer& buffer, VAddr cpu_addr, u64 si boost::container::small_vector<BufferCopy, 1> copies; u64 total_size_bytes = 0; u64 largest_copy = 0; - buffer.ForEachDownloadRangeAndClear(cpu_addr, size, [&](u64 range_offset, u64 range_size) { - const VAddr buffer_addr = buffer.CpuAddr(); - const auto add_download = [&](VAddr start, VAddr end) { - const u64 new_offset = start - buffer_addr; - const u64 new_size = end - start; - copies.push_back(BufferCopy{ - .src_offset = new_offset, - .dst_offset = total_size_bytes, - .size = new_size, - }); - // Align up to avoid cache conflicts - constexpr u64 align = 256ULL; - constexpr u64 mask = ~(align - 1ULL); - total_size_bytes += (new_size + align - 1) & mask; - largest_copy = std::max(largest_copy, new_size); - }; - - const VAddr start_address = buffer_addr + range_offset; - const VAddr end_address = start_address + range_size; - ForEachWrittenRange(start_address, range_size, add_download); - const IntervalType subtract_interval{start_address, end_address}; - ClearDownload(subtract_interval); - common_ranges.subtract(subtract_interval); - }); + memory_tracker.ForEachDownloadRangeAndClear( + cpu_addr, size, [&](u64 cpu_addr_out, u64 range_size) { + const VAddr buffer_addr = buffer.CpuAddr(); + const auto add_download = [&](VAddr start, VAddr end) { + const u64 new_offset = start - buffer_addr; + const u64 new_size = end - start; + copies.push_back(BufferCopy{ + .src_offset = new_offset, + .dst_offset = total_size_bytes, + .size = new_size, + }); + // Align up to avoid cache conflicts + constexpr u64 align = 64ULL; + constexpr u64 mask = ~(align - 1ULL); + total_size_bytes += (new_size + align - 1) & mask; + largest_copy = std::max(largest_copy, new_size); + }; + + const VAddr start_address = cpu_addr_out; + const VAddr end_address = start_address + range_size; + ForEachInRangeSet(common_ranges, start_address, range_size, add_download); + const IntervalType subtract_interval{start_address, end_address}; + ClearDownload(subtract_interval); + common_ranges.subtract(subtract_interval); + }); if (total_size_bytes == 0) { return; } @@ -1943,7 +1613,7 @@ void BufferCache<P>::DownloadBufferMemory(Buffer& buffer, VAddr cpu_addr, u64 si } template <class P> -void BufferCache<P>::DeleteBuffer(BufferId buffer_id) { +void BufferCache<P>::DeleteBuffer(BufferId buffer_id, bool do_not_mark) { const auto scalar_replace = [buffer_id](Binding& binding) { if (binding.buffer_id == buffer_id) { binding.buffer_id = BufferId{}; @@ -1962,8 +1632,10 @@ void BufferCache<P>::DeleteBuffer(BufferId buffer_id) { std::erase(cached_write_buffer_ids, buffer_id); // Mark the whole buffer as CPU written to stop tracking CPU writes - Buffer& buffer = slot_buffers[buffer_id]; - buffer.MarkRegionAsCpuModified(buffer.CpuAddr(), buffer.SizeBytes()); + if (!do_not_mark) { + Buffer& buffer = slot_buffers[buffer_id]; + memory_tracker.MarkRegionAsCpuModified(buffer.CpuAddr(), buffer.SizeBytes()); + } Unregister(buffer_id); delayed_destruction_ring.Push(std::move(slot_buffers[buffer_id])); @@ -2011,7 +1683,7 @@ typename BufferCache<P>::Binding BufferCache<P>::StorageBufferBinding(GPUVAddr s LOG_WARNING(HW_GPU, "Failed to find storage buffer for cbuf index {}", cbuf_index); return NULL_BINDING; } - const VAddr cpu_end = Common::AlignUp(*cpu_addr + size, Core::Memory::YUZU_PAGESIZE); + const VAddr cpu_end = Common::AlignUp(*cpu_addr + size, YUZU_PAGESIZE); const Binding binding{ .cpu_addr = *cpu_addr, .size = is_written ? size : static_cast<u32>(cpu_end - *cpu_addr), diff --git a/src/video_core/buffer_cache/buffer_cache_base.h b/src/video_core/buffer_cache/buffer_cache_base.h new file mode 100644 index 000000000..656baa550 --- /dev/null +++ b/src/video_core/buffer_cache/buffer_cache_base.h @@ -0,0 +1,580 @@ +// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + +#pragma once + +#include <algorithm> +#include <array> +#include <functional> +#include <memory> +#include <mutex> +#include <numeric> +#include <span> +#include <unordered_map> +#include <vector> + +#include <boost/container/small_vector.hpp> +#define BOOST_NO_MT +#include <boost/pool/detail/mutex.hpp> +#undef BOOST_NO_MT +#include <boost/icl/interval.hpp> +#include <boost/icl/interval_base_set.hpp> +#include <boost/icl/interval_set.hpp> +#include <boost/icl/split_interval_map.hpp> +#include <boost/pool/pool.hpp> +#include <boost/pool/pool_alloc.hpp> +#include <boost/pool/poolfwd.hpp> + +#include "common/common_types.h" +#include "common/div_ceil.h" +#include "common/literals.h" +#include "common/lru_cache.h" +#include "common/microprofile.h" +#include "common/scope_exit.h" +#include "common/settings.h" +#include "core/memory.h" +#include "video_core/buffer_cache/buffer_base.h" +#include "video_core/control/channel_state_cache.h" +#include "video_core/delayed_destruction_ring.h" +#include "video_core/dirty_flags.h" +#include "video_core/engines/draw_manager.h" +#include "video_core/engines/kepler_compute.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/memory_manager.h" +#include "video_core/rasterizer_interface.h" +#include "video_core/surface.h" +#include "video_core/texture_cache/slot_vector.h" +#include "video_core/texture_cache/types.h" + +namespace boost { +template <typename T> +class fast_pool_allocator<T, default_user_allocator_new_delete, details::pool::null_mutex, 4096, 0>; +} + +namespace VideoCommon { + +MICROPROFILE_DECLARE(GPU_PrepareBuffers); +MICROPROFILE_DECLARE(GPU_BindUploadBuffers); +MICROPROFILE_DECLARE(GPU_DownloadMemory); + +using BufferId = SlotId; + +using VideoCore::Surface::PixelFormat; +using namespace Common::Literals; + +constexpr u32 NUM_VERTEX_BUFFERS = 32; +constexpr u32 NUM_TRANSFORM_FEEDBACK_BUFFERS = 4; +constexpr u32 NUM_GRAPHICS_UNIFORM_BUFFERS = 18; +constexpr u32 NUM_COMPUTE_UNIFORM_BUFFERS = 8; +constexpr u32 NUM_STORAGE_BUFFERS = 16; +constexpr u32 NUM_TEXTURE_BUFFERS = 16; +constexpr u32 NUM_STAGES = 5; + +using UniformBufferSizes = std::array<std::array<u32, NUM_GRAPHICS_UNIFORM_BUFFERS>, NUM_STAGES>; +using ComputeUniformBufferSizes = std::array<u32, NUM_COMPUTE_UNIFORM_BUFFERS>; + +enum class ObtainBufferSynchronize : u32 { + NoSynchronize = 0, + FullSynchronize = 1, + SynchronizeNoDirty = 2, +}; + +enum class ObtainBufferOperation : u32 { + DoNothing = 0, + MarkAsWritten = 1, + DiscardWrite = 2, + MarkQuery = 3, +}; + +template <typename P> +class BufferCache : public VideoCommon::ChannelSetupCaches<VideoCommon::ChannelInfo> { + // Page size for caching purposes. + // This is unrelated to the CPU page size and it can be changed as it seems optimal. + static constexpr u32 CACHING_PAGEBITS = 16; + static constexpr u64 CACHING_PAGESIZE = u64{1} << CACHING_PAGEBITS; + + static constexpr bool IS_OPENGL = P::IS_OPENGL; + static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = + P::HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS; + static constexpr bool HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT = + P::HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT; + static constexpr bool NEEDS_BIND_UNIFORM_INDEX = P::NEEDS_BIND_UNIFORM_INDEX; + static constexpr bool NEEDS_BIND_STORAGE_INDEX = P::NEEDS_BIND_STORAGE_INDEX; + static constexpr bool USE_MEMORY_MAPS = P::USE_MEMORY_MAPS; + static constexpr bool SEPARATE_IMAGE_BUFFERS_BINDINGS = P::SEPARATE_IMAGE_BUFFER_BINDINGS; + static constexpr bool IMPLEMENTS_ASYNC_DOWNLOADS = P::IMPLEMENTS_ASYNC_DOWNLOADS; + + static constexpr BufferId NULL_BUFFER_ID{0}; + + static constexpr s64 DEFAULT_EXPECTED_MEMORY = 512_MiB; + static constexpr s64 DEFAULT_CRITICAL_MEMORY = 1_GiB; + static constexpr s64 TARGET_THRESHOLD = 4_GiB; + + // Debug Flags. + + static constexpr bool DISABLE_DOWNLOADS = true; + + using Maxwell = Tegra::Engines::Maxwell3D::Regs; + + using Runtime = typename P::Runtime; + using Buffer = typename P::Buffer; + using Async_Buffer = typename P::Async_Buffer; + using MemoryTracker = typename P::MemoryTracker; + + using IntervalCompare = std::less<VAddr>; + using IntervalInstance = boost::icl::interval_type_default<VAddr, std::less>; + using IntervalAllocator = boost::fast_pool_allocator<VAddr>; + using IntervalSet = boost::icl::interval_set<VAddr>; + using IntervalType = typename IntervalSet::interval_type; + + template <typename Type> + struct counter_add_functor : public boost::icl::identity_based_inplace_combine<Type> { + // types + typedef counter_add_functor<Type> type; + typedef boost::icl::identity_based_inplace_combine<Type> base_type; + + // public member functions + void operator()(Type& current, const Type& added) const { + current += added; + if (current < base_type::identity_element()) { + current = base_type::identity_element(); + } + } + + // public static functions + static void version(Type&){}; + }; + + using OverlapCombine = counter_add_functor<int>; + using OverlapSection = boost::icl::inter_section<int>; + using OverlapCounter = boost::icl::split_interval_map<VAddr, int>; + + struct Empty {}; + + struct OverlapResult { + std::vector<BufferId> ids; + VAddr begin; + VAddr end; + bool has_stream_leap = false; + }; + + struct Binding { + VAddr cpu_addr{}; + u32 size{}; + BufferId buffer_id; + }; + + struct TextureBufferBinding : Binding { + PixelFormat format; + }; + + static constexpr Binding NULL_BINDING{ + .cpu_addr = 0, + .size = 0, + .buffer_id = NULL_BUFFER_ID, + }; + +public: + static constexpr u32 DEFAULT_SKIP_CACHE_SIZE = static_cast<u32>(4_KiB); + + explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_, + Core::Memory::Memory& cpu_memory_, Runtime& runtime_); + + void TickFrame(); + + void WriteMemory(VAddr cpu_addr, u64 size); + + void CachedWriteMemory(VAddr cpu_addr, u64 size); + + void DownloadMemory(VAddr cpu_addr, u64 size); + + bool InlineMemory(VAddr dest_address, size_t copy_size, std::span<const u8> inlined_buffer); + + void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size); + + void DisableGraphicsUniformBuffer(size_t stage, u32 index); + + void UpdateGraphicsBuffers(bool is_indexed); + + void UpdateComputeBuffers(); + + void BindHostGeometryBuffers(bool is_indexed); + + void BindHostStageBuffers(size_t stage); + + void BindHostComputeBuffers(); + + void SetUniformBuffersState(const std::array<u32, NUM_STAGES>& mask, + const UniformBufferSizes* sizes); + + void SetComputeUniformBufferState(u32 mask, const ComputeUniformBufferSizes* sizes); + + void UnbindGraphicsStorageBuffers(size_t stage); + + void BindGraphicsStorageBuffer(size_t stage, size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset, + bool is_written); + + void UnbindGraphicsTextureBuffers(size_t stage); + + void BindGraphicsTextureBuffer(size_t stage, size_t tbo_index, GPUVAddr gpu_addr, u32 size, + PixelFormat format, bool is_written, bool is_image); + + void UnbindComputeStorageBuffers(); + + void BindComputeStorageBuffer(size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset, + bool is_written); + + void UnbindComputeTextureBuffers(); + + void BindComputeTextureBuffer(size_t tbo_index, GPUVAddr gpu_addr, u32 size, PixelFormat format, + bool is_written, bool is_image); + + [[nodiscard]] std::pair<Buffer*, u32> ObtainBuffer(GPUVAddr gpu_addr, u32 size, + ObtainBufferSynchronize sync_info, + ObtainBufferOperation post_op); + void FlushCachedWrites(); + + /// Return true when there are uncommitted buffers to be downloaded + [[nodiscard]] bool HasUncommittedFlushes() const noexcept; + + void AccumulateFlushes(); + + /// Return true when the caller should wait for async downloads + [[nodiscard]] bool ShouldWaitAsyncFlushes() const noexcept; + + /// Commit asynchronous downloads + void CommitAsyncFlushes(); + void CommitAsyncFlushesHigh(); + + /// Pop asynchronous downloads + void PopAsyncFlushes(); + void PopAsyncBuffers(); + + bool DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 amount); + + bool DMAClear(GPUVAddr src_address, u64 amount, u32 value); + + /// Return true when a CPU region is modified from the GPU + [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size); + + /// Return true when a region is registered on the cache + [[nodiscard]] bool IsRegionRegistered(VAddr addr, size_t size); + + /// Return true when a CPU region is modified from the CPU + [[nodiscard]] bool IsRegionCpuModified(VAddr addr, size_t size); + + void SetDrawIndirect( + const Tegra::Engines::DrawManager::IndirectParams* current_draw_indirect_) { + current_draw_indirect = current_draw_indirect_; + } + + [[nodiscard]] std::pair<Buffer*, u32> GetDrawIndirectCount(); + + [[nodiscard]] std::pair<Buffer*, u32> GetDrawIndirectBuffer(); + + std::recursive_mutex mutex; + Runtime& runtime; + +private: + template <typename Func> + static void ForEachEnabledBit(u32 enabled_mask, Func&& func) { + for (u32 index = 0; enabled_mask != 0; ++index, enabled_mask >>= 1) { + const int disabled_bits = std::countr_zero(enabled_mask); + index += disabled_bits; + enabled_mask >>= disabled_bits; + func(index); + } + } + + template <typename Func> + void ForEachBufferInRange(VAddr cpu_addr, u64 size, Func&& func) { + const u64 page_end = Common::DivCeil(cpu_addr + size, CACHING_PAGESIZE); + for (u64 page = cpu_addr >> CACHING_PAGEBITS; page < page_end;) { + const BufferId buffer_id = page_table[page]; + if (!buffer_id) { + ++page; + continue; + } + Buffer& buffer = slot_buffers[buffer_id]; + func(buffer_id, buffer); + + const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes(); + page = Common::DivCeil(end_addr, CACHING_PAGESIZE); + } + } + + template <typename Func> + void ForEachInRangeSet(IntervalSet& current_range, VAddr cpu_addr, u64 size, Func&& func) { + const VAddr start_address = cpu_addr; + const VAddr end_address = start_address + size; + const IntervalType search_interval{start_address, end_address}; + auto it = current_range.lower_bound(search_interval); + if (it == current_range.end()) { + return; + } + auto end_it = current_range.upper_bound(search_interval); + for (; it != end_it; it++) { + VAddr inter_addr_end = it->upper(); + VAddr inter_addr = it->lower(); + if (inter_addr_end > end_address) { + inter_addr_end = end_address; + } + if (inter_addr < start_address) { + inter_addr = start_address; + } + func(inter_addr, inter_addr_end); + } + } + + template <typename Func> + void ForEachInOverlapCounter(OverlapCounter& current_range, VAddr cpu_addr, u64 size, + Func&& func) { + const VAddr start_address = cpu_addr; + const VAddr end_address = start_address + size; + const IntervalType search_interval{start_address, end_address}; + auto it = current_range.lower_bound(search_interval); + if (it == current_range.end()) { + return; + } + auto end_it = current_range.upper_bound(search_interval); + for (; it != end_it; it++) { + auto& inter = it->first; + VAddr inter_addr_end = inter.upper(); + VAddr inter_addr = inter.lower(); + if (inter_addr_end > end_address) { + inter_addr_end = end_address; + } + if (inter_addr < start_address) { + inter_addr = start_address; + } + func(inter_addr, inter_addr_end, it->second); + } + } + + void RemoveEachInOverlapCounter(OverlapCounter& current_range, + const IntervalType search_interval, int subtract_value) { + bool any_removals = false; + current_range.add(std::make_pair(search_interval, subtract_value)); + do { + any_removals = false; + auto it = current_range.lower_bound(search_interval); + if (it == current_range.end()) { + return; + } + auto end_it = current_range.upper_bound(search_interval); + for (; it != end_it; it++) { + if (it->second <= 0) { + any_removals = true; + current_range.erase(it); + break; + } + } + } while (any_removals); + } + + static bool IsRangeGranular(VAddr cpu_addr, size_t size) { + return (cpu_addr & ~Core::Memory::YUZU_PAGEMASK) == + ((cpu_addr + size) & ~Core::Memory::YUZU_PAGEMASK); + } + + void RunGarbageCollector(); + + void WaitOnAsyncFlushes(VAddr cpu_addr, u64 size); + + void BindHostIndexBuffer(); + + void BindHostVertexBuffers(); + + void BindHostDrawIndirectBuffers(); + + void BindHostGraphicsUniformBuffers(size_t stage); + + void BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 binding_index, bool needs_bind); + + void BindHostGraphicsStorageBuffers(size_t stage); + + void BindHostGraphicsTextureBuffers(size_t stage); + + void BindHostTransformFeedbackBuffers(); + + void BindHostComputeUniformBuffers(); + + void BindHostComputeStorageBuffers(); + + void BindHostComputeTextureBuffers(); + + void DoUpdateGraphicsBuffers(bool is_indexed); + + void DoUpdateComputeBuffers(); + + void UpdateIndexBuffer(); + + void UpdateVertexBuffers(); + + void UpdateVertexBuffer(u32 index); + + void UpdateDrawIndirect(); + + void UpdateUniformBuffers(size_t stage); + + void UpdateStorageBuffers(size_t stage); + + void UpdateTextureBuffers(size_t stage); + + void UpdateTransformFeedbackBuffers(); + + void UpdateTransformFeedbackBuffer(u32 index); + + void UpdateComputeUniformBuffers(); + + void UpdateComputeStorageBuffers(); + + void UpdateComputeTextureBuffers(); + + void MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size); + + [[nodiscard]] BufferId FindBuffer(VAddr cpu_addr, u32 size); + + [[nodiscard]] OverlapResult ResolveOverlaps(VAddr cpu_addr, u32 wanted_size); + + void JoinOverlap(BufferId new_buffer_id, BufferId overlap_id, bool accumulate_stream_score); + + [[nodiscard]] BufferId CreateBuffer(VAddr cpu_addr, u32 wanted_size); + + void Register(BufferId buffer_id); + + void Unregister(BufferId buffer_id); + + template <bool insert> + void ChangeRegister(BufferId buffer_id); + + void TouchBuffer(Buffer& buffer, BufferId buffer_id) noexcept; + + bool SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size); + + bool SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size); + + bool SynchronizeBufferNoModified(Buffer& buffer, VAddr cpu_addr, u32 size); + + void UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy, + std::span<BufferCopy> copies); + + void ImmediateUploadMemory(Buffer& buffer, u64 largest_copy, + std::span<const BufferCopy> copies); + + void MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, std::span<BufferCopy> copies); + + void DownloadBufferMemory(Buffer& buffer_id); + + void DownloadBufferMemory(Buffer& buffer_id, VAddr cpu_addr, u64 size); + + void DeleteBuffer(BufferId buffer_id, bool do_not_mark = false); + + void NotifyBufferDeletion(); + + [[nodiscard]] Binding StorageBufferBinding(GPUVAddr ssbo_addr, u32 cbuf_index, + bool is_written) const; + + [[nodiscard]] TextureBufferBinding GetTextureBufferBinding(GPUVAddr gpu_addr, u32 size, + PixelFormat format); + + [[nodiscard]] std::span<const u8> ImmediateBufferWithData(VAddr cpu_addr, size_t size); + + [[nodiscard]] std::span<u8> ImmediateBuffer(size_t wanted_capacity); + + [[nodiscard]] bool HasFastUniformBufferBound(size_t stage, u32 binding_index) const noexcept; + + void ClearDownload(IntervalType subtract_interval); + + VideoCore::RasterizerInterface& rasterizer; + Core::Memory::Memory& cpu_memory; + + SlotVector<Buffer> slot_buffers; + DelayedDestructionRing<Buffer, 8> delayed_destruction_ring; + + const Tegra::Engines::DrawManager::IndirectParams* current_draw_indirect{}; + + u32 last_index_count = 0; + + Binding index_buffer; + std::array<Binding, NUM_VERTEX_BUFFERS> vertex_buffers; + std::array<std::array<Binding, NUM_GRAPHICS_UNIFORM_BUFFERS>, NUM_STAGES> uniform_buffers; + std::array<std::array<Binding, NUM_STORAGE_BUFFERS>, NUM_STAGES> storage_buffers; + std::array<std::array<TextureBufferBinding, NUM_TEXTURE_BUFFERS>, NUM_STAGES> texture_buffers; + std::array<Binding, NUM_TRANSFORM_FEEDBACK_BUFFERS> transform_feedback_buffers; + Binding count_buffer_binding; + Binding indirect_buffer_binding; + + std::array<Binding, NUM_COMPUTE_UNIFORM_BUFFERS> compute_uniform_buffers; + std::array<Binding, NUM_STORAGE_BUFFERS> compute_storage_buffers; + std::array<TextureBufferBinding, NUM_TEXTURE_BUFFERS> compute_texture_buffers; + + std::array<u32, NUM_STAGES> enabled_uniform_buffer_masks{}; + u32 enabled_compute_uniform_buffer_mask = 0; + + const UniformBufferSizes* uniform_buffer_sizes{}; + const ComputeUniformBufferSizes* compute_uniform_buffer_sizes{}; + + std::array<u32, NUM_STAGES> enabled_storage_buffers{}; + std::array<u32, NUM_STAGES> written_storage_buffers{}; + u32 enabled_compute_storage_buffers = 0; + u32 written_compute_storage_buffers = 0; + + std::array<u32, NUM_STAGES> enabled_texture_buffers{}; + std::array<u32, NUM_STAGES> written_texture_buffers{}; + std::array<u32, NUM_STAGES> image_texture_buffers{}; + u32 enabled_compute_texture_buffers = 0; + u32 written_compute_texture_buffers = 0; + u32 image_compute_texture_buffers = 0; + + std::array<u32, 16> uniform_cache_hits{}; + std::array<u32, 16> uniform_cache_shots{}; + + u32 uniform_buffer_skip_cache_size = DEFAULT_SKIP_CACHE_SIZE; + + bool has_deleted_buffers = false; + + std::conditional_t<HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS, std::array<u32, NUM_STAGES>, Empty> + dirty_uniform_buffers{}; + std::conditional_t<IS_OPENGL, std::array<u32, NUM_STAGES>, Empty> fast_bound_uniform_buffers{}; + std::conditional_t<HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS, + std::array<std::array<u32, NUM_GRAPHICS_UNIFORM_BUFFERS>, NUM_STAGES>, Empty> + uniform_buffer_binding_sizes{}; + + std::vector<BufferId> cached_write_buffer_ids; + + MemoryTracker memory_tracker; + IntervalSet uncommitted_ranges; + IntervalSet common_ranges; + IntervalSet cached_ranges; + IntervalSet pending_ranges; + std::deque<IntervalSet> committed_ranges; + + // Async Buffers + OverlapCounter async_downloads; + std::deque<std::optional<Async_Buffer>> async_buffers; + std::deque<boost::container::small_vector<BufferCopy, 4>> pending_downloads; + std::optional<Async_Buffer> current_buffer; + + std::deque<Async_Buffer> async_buffers_death_ring; + + size_t immediate_buffer_capacity = 0; + Common::ScratchBuffer<u8> immediate_buffer_alloc; + + struct LRUItemParams { + using ObjectType = BufferId; + using TickType = u64; + }; + Common::LeastRecentlyUsedCache<LRUItemParams> lru_cache; + u64 frame_tick = 0; + u64 total_used_memory = 0; + u64 minimum_memory = 0; + u64 critical_memory = 0; + BufferId inline_buffer_id; + + bool active_async_buffers = false; + + std::array<BufferId, ((1ULL << 39) >> CACHING_PAGEBITS)> page_table; +}; + +} // namespace VideoCommon diff --git a/src/video_core/buffer_cache/memory_tracker_base.h b/src/video_core/buffer_cache/memory_tracker_base.h new file mode 100644 index 000000000..dc4ebfcaa --- /dev/null +++ b/src/video_core/buffer_cache/memory_tracker_base.h @@ -0,0 +1,273 @@ +// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + +#pragma once + +#include <algorithm> +#include <bit> +#include <deque> +#include <limits> +#include <type_traits> +#include <unordered_set> +#include <utility> + +#include "common/alignment.h" +#include "common/common_types.h" +#include "video_core/buffer_cache/word_manager.h" + +namespace VideoCommon { + +template <class RasterizerInterface> +class MemoryTrackerBase { + static constexpr size_t MAX_CPU_PAGE_BITS = 39; + static constexpr size_t HIGHER_PAGE_BITS = 22; + static constexpr size_t HIGHER_PAGE_SIZE = 1ULL << HIGHER_PAGE_BITS; + static constexpr size_t HIGHER_PAGE_MASK = HIGHER_PAGE_SIZE - 1ULL; + static constexpr size_t NUM_HIGH_PAGES = 1ULL << (MAX_CPU_PAGE_BITS - HIGHER_PAGE_BITS); + static constexpr size_t MANAGER_POOL_SIZE = 32; + static constexpr size_t WORDS_STACK_NEEDED = HIGHER_PAGE_SIZE / BYTES_PER_WORD; + using Manager = WordManager<RasterizerInterface, WORDS_STACK_NEEDED>; + +public: + MemoryTrackerBase(RasterizerInterface& rasterizer_) : rasterizer{&rasterizer_} {} + ~MemoryTrackerBase() = default; + + /// Returns the inclusive CPU modified range in a begin end pair + [[nodiscard]] std::pair<u64, u64> ModifiedCpuRegion(VAddr query_cpu_addr, + u64 query_size) noexcept { + return IteratePairs<true>( + query_cpu_addr, query_size, [](Manager* manager, u64 offset, size_t size) { + return manager->template ModifiedRegion<Type::CPU>(offset, size); + }); + } + + /// Returns the inclusive GPU modified range in a begin end pair + [[nodiscard]] std::pair<u64, u64> ModifiedGpuRegion(VAddr query_cpu_addr, + u64 query_size) noexcept { + return IteratePairs<false>( + query_cpu_addr, query_size, [](Manager* manager, u64 offset, size_t size) { + return manager->template ModifiedRegion<Type::GPU>(offset, size); + }); + } + + /// Returns true if a region has been modified from the CPU + [[nodiscard]] bool IsRegionCpuModified(VAddr query_cpu_addr, u64 query_size) noexcept { + return IteratePages<true>( + query_cpu_addr, query_size, [](Manager* manager, u64 offset, size_t size) { + return manager->template IsRegionModified<Type::CPU>(offset, size); + }); + } + + /// Returns true if a region has been modified from the GPU + [[nodiscard]] bool IsRegionGpuModified(VAddr query_cpu_addr, u64 query_size) noexcept { + return IteratePages<false>( + query_cpu_addr, query_size, [](Manager* manager, u64 offset, size_t size) { + return manager->template IsRegionModified<Type::GPU>(offset, size); + }); + } + + /// Mark region as CPU modified, notifying the rasterizer about this change + void MarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 query_size) { + IteratePages<true>(dirty_cpu_addr, query_size, + [](Manager* manager, u64 offset, size_t size) { + manager->template ChangeRegionState<Type::CPU, true>( + manager->GetCpuAddr() + offset, size); + }); + } + + /// Unmark region as CPU modified, notifying the rasterizer about this change + void UnmarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 query_size) { + IteratePages<true>(dirty_cpu_addr, query_size, + [](Manager* manager, u64 offset, size_t size) { + manager->template ChangeRegionState<Type::CPU, false>( + manager->GetCpuAddr() + offset, size); + }); + } + + /// Mark region as modified from the host GPU + void MarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 query_size) noexcept { + IteratePages<true>(dirty_cpu_addr, query_size, + [](Manager* manager, u64 offset, size_t size) { + manager->template ChangeRegionState<Type::GPU, true>( + manager->GetCpuAddr() + offset, size); + }); + } + + /// Unmark region as modified from the host GPU + void UnmarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 query_size) noexcept { + IteratePages<true>(dirty_cpu_addr, query_size, + [](Manager* manager, u64 offset, size_t size) { + manager->template ChangeRegionState<Type::GPU, false>( + manager->GetCpuAddr() + offset, size); + }); + } + + /// Mark region as modified from the CPU + /// but don't mark it as modified until FlusHCachedWrites is called. + void CachedCpuWrite(VAddr dirty_cpu_addr, u64 query_size) { + IteratePages<true>( + dirty_cpu_addr, query_size, [this](Manager* manager, u64 offset, size_t size) { + const VAddr cpu_address = manager->GetCpuAddr() + offset; + manager->template ChangeRegionState<Type::CachedCPU, true>(cpu_address, size); + cached_pages.insert(static_cast<u32>(cpu_address >> HIGHER_PAGE_BITS)); + }); + } + + /// Flushes cached CPU writes, and notify the rasterizer about the deltas + void FlushCachedWrites(VAddr query_cpu_addr, u64 query_size) noexcept { + IteratePages<false>(query_cpu_addr, query_size, + [](Manager* manager, [[maybe_unused]] u64 offset, + [[maybe_unused]] size_t size) { manager->FlushCachedWrites(); }); + } + + void FlushCachedWrites() noexcept { + for (auto id : cached_pages) { + top_tier[id]->FlushCachedWrites(); + } + cached_pages.clear(); + } + + /// Call 'func' for each CPU modified range and unmark those pages as CPU modified + template <typename Func> + void ForEachUploadRange(VAddr query_cpu_range, u64 query_size, Func&& func) { + IteratePages<true>(query_cpu_range, query_size, + [&func](Manager* manager, u64 offset, size_t size) { + manager->template ForEachModifiedRange<Type::CPU, true>( + manager->GetCpuAddr() + offset, size, func); + }); + } + + /// Call 'func' for each GPU modified range and unmark those pages as GPU modified + template <typename Func> + void ForEachDownloadRange(VAddr query_cpu_range, u64 query_size, bool clear, Func&& func) { + IteratePages<false>(query_cpu_range, query_size, + [&func, clear](Manager* manager, u64 offset, size_t size) { + if (clear) { + manager->template ForEachModifiedRange<Type::GPU, true>( + manager->GetCpuAddr() + offset, size, func); + } else { + manager->template ForEachModifiedRange<Type::GPU, false>( + manager->GetCpuAddr() + offset, size, func); + } + }); + } + + template <typename Func> + void ForEachDownloadRangeAndClear(VAddr query_cpu_range, u64 query_size, Func&& func) { + IteratePages<false>(query_cpu_range, query_size, + [&func](Manager* manager, u64 offset, size_t size) { + manager->template ForEachModifiedRange<Type::GPU, true>( + manager->GetCpuAddr() + offset, size, func); + }); + } + +private: + template <bool create_region_on_fail, typename Func> + bool IteratePages(VAddr cpu_address, size_t size, Func&& func) { + using FuncReturn = typename std::invoke_result<Func, Manager*, u64, size_t>::type; + static constexpr bool BOOL_BREAK = std::is_same_v<FuncReturn, bool>; + std::size_t remaining_size{size}; + std::size_t page_index{cpu_address >> HIGHER_PAGE_BITS}; + u64 page_offset{cpu_address & HIGHER_PAGE_MASK}; + while (remaining_size > 0) { + const std::size_t copy_amount{ + std::min<std::size_t>(HIGHER_PAGE_SIZE - page_offset, remaining_size)}; + auto* manager{top_tier[page_index]}; + if (manager) { + if constexpr (BOOL_BREAK) { + if (func(manager, page_offset, copy_amount)) { + return true; + } + } else { + func(manager, page_offset, copy_amount); + } + } else if constexpr (create_region_on_fail) { + CreateRegion(page_index); + manager = top_tier[page_index]; + if constexpr (BOOL_BREAK) { + if (func(manager, page_offset, copy_amount)) { + return true; + } + } else { + func(manager, page_offset, copy_amount); + } + } + page_index++; + page_offset = 0; + remaining_size -= copy_amount; + } + return false; + } + + template <bool create_region_on_fail, typename Func> + std::pair<u64, u64> IteratePairs(VAddr cpu_address, size_t size, Func&& func) { + std::size_t remaining_size{size}; + std::size_t page_index{cpu_address >> HIGHER_PAGE_BITS}; + u64 page_offset{cpu_address & HIGHER_PAGE_MASK}; + u64 begin = std::numeric_limits<u64>::max(); + u64 end = 0; + while (remaining_size > 0) { + const std::size_t copy_amount{ + std::min<std::size_t>(HIGHER_PAGE_SIZE - page_offset, remaining_size)}; + auto* manager{top_tier[page_index]}; + const auto execute = [&] { + auto [new_begin, new_end] = func(manager, page_offset, copy_amount); + if (new_begin != 0 || new_end != 0) { + const u64 base_address = page_index << HIGHER_PAGE_BITS; + begin = std::min(new_begin + base_address, begin); + end = std::max(new_end + base_address, end); + } + }; + if (manager) { + execute(); + } else if constexpr (create_region_on_fail) { + CreateRegion(page_index); + manager = top_tier[page_index]; + execute(); + } + page_index++; + page_offset = 0; + remaining_size -= copy_amount; + } + if (begin < end) { + return std::make_pair(begin, end); + } else { + return std::make_pair(0ULL, 0ULL); + } + } + + void CreateRegion(std::size_t page_index) { + const VAddr base_cpu_addr = page_index << HIGHER_PAGE_BITS; + top_tier[page_index] = GetNewManager(base_cpu_addr); + } + + Manager* GetNewManager(VAddr base_cpu_addess) { + const auto on_return = [&] { + auto* new_manager = free_managers.front(); + new_manager->SetCpuAddress(base_cpu_addess); + free_managers.pop_front(); + return new_manager; + }; + if (!free_managers.empty()) { + return on_return(); + } + manager_pool.emplace_back(); + auto& last_pool = manager_pool.back(); + for (size_t i = 0; i < MANAGER_POOL_SIZE; i++) { + new (&last_pool[i]) Manager(0, *rasterizer, HIGHER_PAGE_SIZE); + free_managers.push_back(&last_pool[i]); + } + return on_return(); + } + + std::deque<std::array<Manager, MANAGER_POOL_SIZE>> manager_pool; + std::deque<Manager*> free_managers; + + std::array<Manager*, NUM_HIGH_PAGES> top_tier{}; + + std::unordered_set<u32> cached_pages; + + RasterizerInterface* rasterizer = nullptr; +}; + +} // namespace VideoCommon diff --git a/src/video_core/buffer_cache/word_manager.h b/src/video_core/buffer_cache/word_manager.h new file mode 100644 index 000000000..a42455045 --- /dev/null +++ b/src/video_core/buffer_cache/word_manager.h @@ -0,0 +1,462 @@ +// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + +#pragma once + +#include <algorithm> +#include <bit> +#include <limits> +#include <span> +#include <utility> + +#include "common/alignment.h" +#include "common/common_funcs.h" +#include "common/common_types.h" +#include "common/div_ceil.h" +#include "core/memory.h" + +namespace VideoCommon { + +constexpr u64 PAGES_PER_WORD = 64; +constexpr u64 BYTES_PER_PAGE = Core::Memory::YUZU_PAGESIZE; +constexpr u64 BYTES_PER_WORD = PAGES_PER_WORD * BYTES_PER_PAGE; + +enum class Type { + CPU, + GPU, + CachedCPU, + Untracked, +}; + +/// Vector tracking modified pages tightly packed with small vector optimization +template <size_t stack_words = 1> +struct WordsArray { + /// Returns the pointer to the words state + [[nodiscard]] const u64* Pointer(bool is_short) const noexcept { + return is_short ? stack.data() : heap; + } + + /// Returns the pointer to the words state + [[nodiscard]] u64* Pointer(bool is_short) noexcept { + return is_short ? stack.data() : heap; + } + + std::array<u64, stack_words> stack{}; ///< Small buffers storage + u64* heap; ///< Not-small buffers pointer to the storage +}; + +template <size_t stack_words = 1> +struct Words { + explicit Words() = default; + explicit Words(u64 size_bytes_) : size_bytes{size_bytes_} { + num_words = Common::DivCeil(size_bytes, BYTES_PER_WORD); + if (IsShort()) { + cpu.stack.fill(~u64{0}); + gpu.stack.fill(0); + cached_cpu.stack.fill(0); + untracked.stack.fill(~u64{0}); + } else { + // Share allocation between CPU and GPU pages and set their default values + u64* const alloc = new u64[num_words * 4]; + cpu.heap = alloc; + gpu.heap = alloc + num_words; + cached_cpu.heap = alloc + num_words * 2; + untracked.heap = alloc + num_words * 3; + std::fill_n(cpu.heap, num_words, ~u64{0}); + std::fill_n(gpu.heap, num_words, 0); + std::fill_n(cached_cpu.heap, num_words, 0); + std::fill_n(untracked.heap, num_words, ~u64{0}); + } + // Clean up tailing bits + const u64 last_word_size = size_bytes % BYTES_PER_WORD; + const u64 last_local_page = Common::DivCeil(last_word_size, BYTES_PER_PAGE); + const u64 shift = (PAGES_PER_WORD - last_local_page) % PAGES_PER_WORD; + const u64 last_word = (~u64{0} << shift) >> shift; + cpu.Pointer(IsShort())[NumWords() - 1] = last_word; + untracked.Pointer(IsShort())[NumWords() - 1] = last_word; + } + + ~Words() { + Release(); + } + + Words& operator=(Words&& rhs) noexcept { + Release(); + size_bytes = rhs.size_bytes; + num_words = rhs.num_words; + cpu = rhs.cpu; + gpu = rhs.gpu; + cached_cpu = rhs.cached_cpu; + untracked = rhs.untracked; + rhs.cpu.heap = nullptr; + return *this; + } + + Words(Words&& rhs) noexcept + : size_bytes{rhs.size_bytes}, num_words{rhs.num_words}, cpu{rhs.cpu}, gpu{rhs.gpu}, + cached_cpu{rhs.cached_cpu}, untracked{rhs.untracked} { + rhs.cpu.heap = nullptr; + } + + Words& operator=(const Words&) = delete; + Words(const Words&) = delete; + + /// Returns true when the buffer fits in the small vector optimization + [[nodiscard]] bool IsShort() const noexcept { + return num_words <= stack_words; + } + + /// Returns the number of words of the buffer + [[nodiscard]] size_t NumWords() const noexcept { + return num_words; + } + + /// Release buffer resources + void Release() { + if (!IsShort()) { + // CPU written words is the base for the heap allocation + delete[] cpu.heap; + } + } + + template <Type type> + std::span<u64> Span() noexcept { + if constexpr (type == Type::CPU) { + return std::span<u64>(cpu.Pointer(IsShort()), num_words); + } else if constexpr (type == Type::GPU) { + return std::span<u64>(gpu.Pointer(IsShort()), num_words); + } else if constexpr (type == Type::CachedCPU) { + return std::span<u64>(cached_cpu.Pointer(IsShort()), num_words); + } else if constexpr (type == Type::Untracked) { + return std::span<u64>(untracked.Pointer(IsShort()), num_words); + } + } + + template <Type type> + std::span<const u64> Span() const noexcept { + if constexpr (type == Type::CPU) { + return std::span<const u64>(cpu.Pointer(IsShort()), num_words); + } else if constexpr (type == Type::GPU) { + return std::span<const u64>(gpu.Pointer(IsShort()), num_words); + } else if constexpr (type == Type::CachedCPU) { + return std::span<const u64>(cached_cpu.Pointer(IsShort()), num_words); + } else if constexpr (type == Type::Untracked) { + return std::span<const u64>(untracked.Pointer(IsShort()), num_words); + } + } + + u64 size_bytes = 0; + size_t num_words = 0; + WordsArray<stack_words> cpu; + WordsArray<stack_words> gpu; + WordsArray<stack_words> cached_cpu; + WordsArray<stack_words> untracked; +}; + +template <class RasterizerInterface, size_t stack_words = 1> +class WordManager { +public: + explicit WordManager(VAddr cpu_addr_, RasterizerInterface& rasterizer_, u64 size_bytes) + : cpu_addr{cpu_addr_}, rasterizer{&rasterizer_}, words{size_bytes} {} + + explicit WordManager() = default; + + void SetCpuAddress(VAddr new_cpu_addr) { + cpu_addr = new_cpu_addr; + } + + VAddr GetCpuAddr() const { + return cpu_addr; + } + + static u64 ExtractBits(u64 word, size_t page_start, size_t page_end) { + constexpr size_t number_bits = sizeof(u64) * 8; + const size_t limit_page_end = number_bits - std::min(page_end, number_bits); + u64 bits = (word >> page_start) << page_start; + bits = (bits << limit_page_end) >> limit_page_end; + return bits; + } + + static std::pair<size_t, size_t> GetWordPage(VAddr address) { + const size_t converted_address = static_cast<size_t>(address); + const size_t word_number = converted_address / BYTES_PER_WORD; + const size_t amount_pages = converted_address % BYTES_PER_WORD; + return std::make_pair(word_number, amount_pages / BYTES_PER_PAGE); + } + + template <typename Func> + void IterateWords(size_t offset, size_t size, Func&& func) const { + using FuncReturn = std::invoke_result_t<Func, std::size_t, u64>; + static constexpr bool BOOL_BREAK = std::is_same_v<FuncReturn, bool>; + const size_t start = static_cast<size_t>(std::max<s64>(static_cast<s64>(offset), 0LL)); + const size_t end = static_cast<size_t>(std::max<s64>(static_cast<s64>(offset + size), 0LL)); + if (start >= SizeBytes() || end <= start) { + return; + } + auto [start_word, start_page] = GetWordPage(start); + auto [end_word, end_page] = GetWordPage(end + BYTES_PER_PAGE - 1ULL); + const size_t num_words = NumWords(); + start_word = std::min(start_word, num_words); + end_word = std::min(end_word, num_words); + const size_t diff = end_word - start_word; + end_word += (end_page + PAGES_PER_WORD - 1ULL) / PAGES_PER_WORD; + end_word = std::min(end_word, num_words); + end_page += diff * PAGES_PER_WORD; + constexpr u64 base_mask{~0ULL}; + for (size_t word_index = start_word; word_index < end_word; word_index++) { + const u64 mask = ExtractBits(base_mask, start_page, end_page); + start_page = 0; + end_page -= PAGES_PER_WORD; + if constexpr (BOOL_BREAK) { + if (func(word_index, mask)) { + return; + } + } else { + func(word_index, mask); + } + } + } + + template <typename Func> + void IteratePages(u64 mask, Func&& func) const { + size_t offset = 0; + while (mask != 0) { + const size_t empty_bits = std::countr_zero(mask); + offset += empty_bits; + mask = mask >> empty_bits; + + const size_t continuous_bits = std::countr_one(mask); + func(offset, continuous_bits); + mask = continuous_bits < PAGES_PER_WORD ? (mask >> continuous_bits) : 0; + offset += continuous_bits; + } + } + + /** + * Change the state of a range of pages + * + * @param dirty_addr Base address to mark or unmark as modified + * @param size Size in bytes to mark or unmark as modified + */ + template <Type type, bool enable> + void ChangeRegionState(u64 dirty_addr, u64 size) noexcept(type == Type::GPU) { + std::span<u64> state_words = words.template Span<type>(); + [[maybe_unused]] std::span<u64> untracked_words = words.template Span<Type::Untracked>(); + [[maybe_unused]] std::span<u64> cached_words = words.template Span<Type::CachedCPU>(); + IterateWords(dirty_addr - cpu_addr, size, [&](size_t index, u64 mask) { + if constexpr (type == Type::CPU || type == Type::CachedCPU) { + NotifyRasterizer<!enable>(index, untracked_words[index], mask); + } + if constexpr (enable) { + state_words[index] |= mask; + if constexpr (type == Type::CPU || type == Type::CachedCPU) { + untracked_words[index] |= mask; + } + if constexpr (type == Type::CPU) { + cached_words[index] &= ~mask; + } + } else { + if constexpr (type == Type::CPU) { + const u64 word = state_words[index] & mask; + cached_words[index] &= ~word; + } + state_words[index] &= ~mask; + if constexpr (type == Type::CPU || type == Type::CachedCPU) { + untracked_words[index] &= ~mask; + } + } + }); + } + + /** + * Loop over each page in the given range, turn off those bits and notify the rasterizer if + * needed. Call the given function on each turned off range. + * + * @param query_cpu_range Base CPU address to loop over + * @param size Size in bytes of the CPU range to loop over + * @param func Function to call for each turned off region + */ + template <Type type, bool clear, typename Func> + void ForEachModifiedRange(VAddr query_cpu_range, s64 size, Func&& func) { + static_assert(type != Type::Untracked); + + std::span<u64> state_words = words.template Span<type>(); + [[maybe_unused]] std::span<u64> untracked_words = words.template Span<Type::Untracked>(); + [[maybe_unused]] std::span<u64> cached_words = words.template Span<Type::CachedCPU>(); + const size_t offset = query_cpu_range - cpu_addr; + bool pending = false; + size_t pending_offset{}; + size_t pending_pointer{}; + const auto release = [&]() { + func(cpu_addr + pending_offset * BYTES_PER_PAGE, + (pending_pointer - pending_offset) * BYTES_PER_PAGE); + }; + IterateWords(offset, size, [&](size_t index, u64 mask) { + const u64 word = state_words[index] & mask; + if constexpr (clear) { + if constexpr (type == Type::CPU || type == Type::CachedCPU) { + NotifyRasterizer<true>(index, untracked_words[index], mask); + } + state_words[index] &= ~mask; + if constexpr (type == Type::CPU || type == Type::CachedCPU) { + untracked_words[index] &= ~mask; + } + if constexpr (type == Type::CPU) { + cached_words[index] &= ~word; + } + } + const size_t base_offset = index * PAGES_PER_WORD; + IteratePages(word, [&](size_t pages_offset, size_t pages_size) { + const auto reset = [&]() { + pending_offset = base_offset + pages_offset; + pending_pointer = base_offset + pages_offset + pages_size; + }; + if (!pending) { + reset(); + pending = true; + return; + } + if (pending_pointer == base_offset + pages_offset) { + pending_pointer += pages_size; + return; + } + release(); + reset(); + }); + }); + if (pending) { + release(); + } + } + + /** + * Returns true when a region has been modified + * + * @param offset Offset in bytes from the start of the buffer + * @param size Size in bytes of the region to query for modifications + */ + template <Type type> + [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept { + static_assert(type != Type::Untracked); + + const std::span<const u64> state_words = words.template Span<type>(); + bool result = false; + IterateWords(offset, size, [&](size_t index, u64 mask) { + const u64 word = state_words[index] & mask; + if (word != 0) { + result = true; + return true; + } + return false; + }); + return result; + } + + /** + * Returns a begin end pair with the inclusive modified region + * + * @param offset Offset in bytes from the start of the buffer + * @param size Size in bytes of the region to query for modifications + */ + template <Type type> + [[nodiscard]] std::pair<u64, u64> ModifiedRegion(u64 offset, u64 size) const noexcept { + static_assert(type != Type::Untracked); + const std::span<const u64> state_words = words.template Span<type>(); + u64 begin = std::numeric_limits<u64>::max(); + u64 end = 0; + IterateWords(offset, size, [&](size_t index, u64 mask) { + const u64 word = state_words[index] & mask; + if (word == 0) { + return; + } + const u64 local_page_begin = std::countr_zero(word); + const u64 local_page_end = PAGES_PER_WORD - std::countl_zero(word); + const u64 page_index = index * PAGES_PER_WORD; + begin = std::min(begin, page_index + local_page_begin); + end = page_index + local_page_end; + }); + static constexpr std::pair<u64, u64> EMPTY{0, 0}; + return begin < end ? std::make_pair(begin * BYTES_PER_PAGE, end * BYTES_PER_PAGE) : EMPTY; + } + + /// Returns the number of words of the manager + [[nodiscard]] size_t NumWords() const noexcept { + return words.NumWords(); + } + + /// Returns the size in bytes of the manager + [[nodiscard]] u64 SizeBytes() const noexcept { + return words.size_bytes; + } + + /// Returns true when the buffer fits in the small vector optimization + [[nodiscard]] bool IsShort() const noexcept { + return words.IsShort(); + } + + void FlushCachedWrites() noexcept { + const u64 num_words = NumWords(); + u64* const cached_words = Array<Type::CachedCPU>(); + u64* const untracked_words = Array<Type::Untracked>(); + u64* const cpu_words = Array<Type::CPU>(); + for (u64 word_index = 0; word_index < num_words; ++word_index) { + const u64 cached_bits = cached_words[word_index]; + NotifyRasterizer<false>(word_index, untracked_words[word_index], cached_bits); + untracked_words[word_index] |= cached_bits; + cpu_words[word_index] |= cached_bits; + cached_words[word_index] = 0; + } + } + +private: + template <Type type> + u64* Array() noexcept { + if constexpr (type == Type::CPU) { + return words.cpu.Pointer(IsShort()); + } else if constexpr (type == Type::GPU) { + return words.gpu.Pointer(IsShort()); + } else if constexpr (type == Type::CachedCPU) { + return words.cached_cpu.Pointer(IsShort()); + } else if constexpr (type == Type::Untracked) { + return words.untracked.Pointer(IsShort()); + } + } + + template <Type type> + const u64* Array() const noexcept { + if constexpr (type == Type::CPU) { + return words.cpu.Pointer(IsShort()); + } else if constexpr (type == Type::GPU) { + return words.gpu.Pointer(IsShort()); + } else if constexpr (type == Type::CachedCPU) { + return words.cached_cpu.Pointer(IsShort()); + } else if constexpr (type == Type::Untracked) { + return words.untracked.Pointer(IsShort()); + } + } + + /** + * Notify rasterizer about changes in the CPU tracking state of a word in the buffer + * + * @param word_index Index to the word to notify to the rasterizer + * @param current_bits Current state of the word + * @param new_bits New state of the word + * + * @tparam add_to_rasterizer True when the rasterizer should start tracking the new pages + */ + template <bool add_to_rasterizer> + void NotifyRasterizer(u64 word_index, u64 current_bits, u64 new_bits) const { + u64 changed_bits = (add_to_rasterizer ? current_bits : ~current_bits) & new_bits; + VAddr addr = cpu_addr + word_index * BYTES_PER_WORD; + IteratePages(changed_bits, [&](size_t offset, size_t size) { + rasterizer->UpdatePagesCachedCount(addr + offset * BYTES_PER_PAGE, + size * BYTES_PER_PAGE, add_to_rasterizer ? 1 : -1); + }); + } + + VAddr cpu_addr = 0; + RasterizerInterface* rasterizer = nullptr; + Words<stack_words> words; +}; + +} // namespace VideoCommon diff --git a/src/video_core/compatible_formats.cpp b/src/video_core/compatible_formats.cpp index 4e75f33ca..ab4f4d407 100644 --- a/src/video_core/compatible_formats.cpp +++ b/src/video_core/compatible_formats.cpp @@ -126,15 +126,14 @@ constexpr std::array VIEW_CLASS_ASTC_8x8_RGBA{ PixelFormat::ASTC_2D_8X8_SRGB, }; -// Missing formats: -// PixelFormat::ASTC_2D_10X5_UNORM -// PixelFormat::ASTC_2D_10X5_SRGB - -// Missing formats: -// PixelFormat::ASTC_2D_10X6_SRGB +constexpr std::array VIEW_CLASS_ASTC_10x5_RGBA{ + PixelFormat::ASTC_2D_10X5_UNORM, + PixelFormat::ASTC_2D_10X5_SRGB, +}; constexpr std::array VIEW_CLASS_ASTC_10x6_RGBA{ PixelFormat::ASTC_2D_10X6_UNORM, + PixelFormat::ASTC_2D_10X6_SRGB, }; constexpr std::array VIEW_CLASS_ASTC_10x8_RGBA{ @@ -147,9 +146,10 @@ constexpr std::array VIEW_CLASS_ASTC_10x10_RGBA{ PixelFormat::ASTC_2D_10X10_SRGB, }; -// Missing formats -// ASTC_2D_12X10_UNORM, -// ASTC_2D_12X10_SRGB, +constexpr std::array VIEW_CLASS_ASTC_12x10_RGBA{ + PixelFormat::ASTC_2D_12X10_UNORM, + PixelFormat::ASTC_2D_12X10_SRGB, +}; constexpr std::array VIEW_CLASS_ASTC_12x12_RGBA{ PixelFormat::ASTC_2D_12X12_UNORM, @@ -229,9 +229,11 @@ constexpr Table MakeViewTable() { EnableRange(view, VIEW_CLASS_ASTC_6x6_RGBA); EnableRange(view, VIEW_CLASS_ASTC_8x5_RGBA); EnableRange(view, VIEW_CLASS_ASTC_8x8_RGBA); + EnableRange(view, VIEW_CLASS_ASTC_10x5_RGBA); EnableRange(view, VIEW_CLASS_ASTC_10x6_RGBA); EnableRange(view, VIEW_CLASS_ASTC_10x8_RGBA); EnableRange(view, VIEW_CLASS_ASTC_10x10_RGBA); + EnableRange(view, VIEW_CLASS_ASTC_12x10_RGBA); EnableRange(view, VIEW_CLASS_ASTC_12x12_RGBA); return view; } diff --git a/src/video_core/fence_manager.h b/src/video_core/fence_manager.h index c390ac91b..3b2f6aab6 100644 --- a/src/video_core/fence_manager.h +++ b/src/video_core/fence_manager.h @@ -4,13 +4,20 @@ #pragma once #include <algorithm> +#include <condition_variable> #include <cstring> #include <deque> #include <functional> #include <memory> +#include <mutex> +#include <thread> #include <queue> #include "common/common_types.h" +#include "common/microprofile.h" +#include "common/scope_exit.h" +#include "common/settings.h" +#include "common/thread.h" #include "video_core/delayed_destruction_ring.h" #include "video_core/gpu.h" #include "video_core/host1x/host1x.h" @@ -23,15 +30,26 @@ class FenceBase { public: explicit FenceBase(bool is_stubbed_) : is_stubbed{is_stubbed_} {} + bool IsStubbed() const { + return is_stubbed; + } + protected: bool is_stubbed; }; -template <typename TFence, typename TTextureCache, typename TTBufferCache, typename TQueryCache> +template <typename Traits> class FenceManager { + using TFence = typename Traits::FenceType; + using TTextureCache = typename Traits::TextureCacheType; + using TBufferCache = typename Traits::BufferCacheType; + using TQueryCache = typename Traits::QueryCacheType; + static constexpr bool can_async_check = Traits::HAS_ASYNC_CHECK; + public: /// Notify the fence manager about a new frame void TickFrame() { + std::unique_lock lock(ring_guard); delayed_destruction_ring.Tick(); } @@ -46,17 +64,33 @@ public: } void SignalFence(std::function<void()>&& func) { - TryReleasePendingFences(); + rasterizer.InvalidateGPUCache(); + bool delay_fence = Settings::IsGPULevelHigh(); + if constexpr (!can_async_check) { + TryReleasePendingFences<false>(); + } const bool should_flush = ShouldFlush(); CommitAsyncFlushes(); - uncommitted_operations.emplace_back(std::move(func)); - CommitOperations(); TFence new_fence = CreateFence(!should_flush); - fences.push(new_fence); + if constexpr (can_async_check) { + guard.lock(); + } + if (delay_fence) { + uncommitted_operations.emplace_back(std::move(func)); + } + pending_operations.emplace_back(std::move(uncommitted_operations)); QueueFence(new_fence); + if (!delay_fence) { + func(); + } + fences.push(std::move(new_fence)); if (should_flush) { rasterizer.FlushCommands(); } + if constexpr (can_async_check) { + guard.unlock(); + cv.notify_all(); + } } void SignalSyncPoint(u32 value) { @@ -66,29 +100,30 @@ public: } void WaitPendingFences() { - while (!fences.empty()) { - TFence& current_fence = fences.front(); - if (ShouldWait()) { - WaitFence(current_fence); - } - PopAsyncFlushes(); - auto operations = std::move(pending_operations.front()); - pending_operations.pop_front(); - for (auto& operation : operations) { - operation(); - } - PopFence(); + if constexpr (!can_async_check) { + TryReleasePendingFences<true>(); } } protected: explicit FenceManager(VideoCore::RasterizerInterface& rasterizer_, Tegra::GPU& gpu_, - TTextureCache& texture_cache_, TTBufferCache& buffer_cache_, + TTextureCache& texture_cache_, TBufferCache& buffer_cache_, TQueryCache& query_cache_) : rasterizer{rasterizer_}, gpu{gpu_}, syncpoint_manager{gpu.Host1x().GetSyncpointManager()}, - texture_cache{texture_cache_}, buffer_cache{buffer_cache_}, query_cache{query_cache_} {} + texture_cache{texture_cache_}, buffer_cache{buffer_cache_}, query_cache{query_cache_} { + if constexpr (can_async_check) { + fence_thread = + std::jthread([this](std::stop_token token) { ReleaseThreadFunc(token); }); + } + } - virtual ~FenceManager() = default; + virtual ~FenceManager() { + if constexpr (can_async_check) { + fence_thread.request_stop(); + cv.notify_all(); + fence_thread.join(); + } + } /// Creates a Fence Interface, does not create a backend fence if 'is_stubbed' is /// true @@ -104,15 +139,20 @@ protected: Tegra::GPU& gpu; Tegra::Host1x::SyncpointManager& syncpoint_manager; TTextureCache& texture_cache; - TTBufferCache& buffer_cache; + TBufferCache& buffer_cache; TQueryCache& query_cache; private: + template <bool force_wait> void TryReleasePendingFences() { while (!fences.empty()) { TFence& current_fence = fences.front(); if (ShouldWait() && !IsFenceSignaled(current_fence)) { - return; + if constexpr (force_wait) { + WaitFence(current_fence); + } else { + return; + } } PopAsyncFlushes(); auto operations = std::move(pending_operations.front()); @@ -120,7 +160,49 @@ private: for (auto& operation : operations) { operation(); } - PopFence(); + { + std::unique_lock lock(ring_guard); + delayed_destruction_ring.Push(std::move(current_fence)); + } + fences.pop(); + } + } + + void ReleaseThreadFunc(std::stop_token stop_token) { + std::string name = "GPUFencingThread"; + MicroProfileOnThreadCreate(name.c_str()); + + // Cleanup + SCOPE_EXIT({ MicroProfileOnThreadExit(); }); + + Common::SetCurrentThreadName(name.c_str()); + Common::SetCurrentThreadPriority(Common::ThreadPriority::High); + + TFence current_fence; + std::deque<std::function<void()>> current_operations; + while (!stop_token.stop_requested()) { + { + std::unique_lock lock(guard); + cv.wait(lock, [&] { return stop_token.stop_requested() || !fences.empty(); }); + if (stop_token.stop_requested()) [[unlikely]] { + return; + } + current_fence = std::move(fences.front()); + current_operations = std::move(pending_operations.front()); + fences.pop(); + pending_operations.pop_front(); + } + if (!current_fence->IsStubbed()) { + WaitFence(current_fence); + } + PopAsyncFlushes(); + for (auto& operation : current_operations) { + operation(); + } + { + std::unique_lock lock(ring_guard); + delayed_destruction_ring.Push(std::move(current_fence)); + } } } @@ -154,19 +236,16 @@ private: query_cache.CommitAsyncFlushes(); } - void PopFence() { - delayed_destruction_ring.Push(std::move(fences.front())); - fences.pop(); - } - - void CommitOperations() { - pending_operations.emplace_back(std::move(uncommitted_operations)); - } - std::queue<TFence> fences; std::deque<std::function<void()>> uncommitted_operations; std::deque<std::deque<std::function<void()>>> pending_operations; + std::mutex guard; + std::mutex ring_guard; + std::condition_variable cv; + + std::jthread fence_thread; + DelayedDestructionRing<TFence, 6> delayed_destruction_ring; }; diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp index 01fb5b546..7b2cde7a7 100644 --- a/src/video_core/memory_manager.cpp +++ b/src/video_core/memory_manager.cpp @@ -82,6 +82,7 @@ void MemoryManager::SetEntry(size_t position, MemoryManager::EntryType entry) { } PTEKind MemoryManager::GetPageKind(GPUVAddr gpu_addr) const { + std::unique_lock<std::mutex> lock(guard); return kind_map.GetValueAt(gpu_addr); } @@ -160,7 +161,10 @@ GPUVAddr MemoryManager::BigPageTableOp(GPUVAddr gpu_addr, [[maybe_unused]] VAddr } remaining_size -= big_page_size; } - kind_map.Map(gpu_addr, gpu_addr + size, kind); + { + std::unique_lock<std::mutex> lock(guard); + kind_map.Map(gpu_addr, gpu_addr + size, kind); + } return gpu_addr; } @@ -553,6 +557,7 @@ size_t MemoryManager::MaxContinuousRange(GPUVAddr gpu_addr, size_t size) const { } size_t MemoryManager::GetMemoryLayoutSize(GPUVAddr gpu_addr, size_t max_size) const { + std::unique_lock<std::mutex> lock(guard); return kind_map.GetContinuousSizeFrom(gpu_addr); } @@ -745,10 +750,10 @@ void MemoryManager::FlushCaching() { return; } accumulator->Callback([this](GPUVAddr addr, size_t size) { - GetSubmappedRangeImpl<false>(addr, size, page_stash); + GetSubmappedRangeImpl<false>(addr, size, page_stash2); }); - rasterizer->InnerInvalidation(page_stash); - page_stash.clear(); + rasterizer->InnerInvalidation(page_stash2); + page_stash2.clear(); accumulator->Clear(); } diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h index fbbe856c4..794535122 100644 --- a/src/video_core/memory_manager.h +++ b/src/video_core/memory_manager.h @@ -5,6 +5,7 @@ #include <atomic> #include <map> +#include <mutex> #include <optional> #include <vector> @@ -215,6 +216,9 @@ private: std::vector<u64> big_page_continuous; std::vector<std::pair<VAddr, std::size_t>> page_stash{}; + std::vector<std::pair<VAddr, std::size_t>> page_stash2{}; + + mutable std::mutex guard; static constexpr size_t continuous_bits = 64; diff --git a/src/video_core/query_cache.h b/src/video_core/query_cache.h index 8906ba6d8..941de95c1 100644 --- a/src/video_core/query_cache.h +++ b/src/video_core/query_cache.h @@ -6,6 +6,7 @@ #include <algorithm> #include <array> #include <cstring> +#include <functional> #include <iterator> #include <list> #include <memory> @@ -17,13 +18,19 @@ #include "common/assert.h" #include "common/settings.h" +#include "core/memory.h" #include "video_core/control/channel_state_cache.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/memory_manager.h" #include "video_core/rasterizer_interface.h" +#include "video_core/texture_cache/slot_vector.h" namespace VideoCommon { +using AsyncJobId = SlotId; + +static constexpr AsyncJobId NULL_ASYNC_JOB_ID{0}; + template <class QueryCache, class HostCounter> class CounterStreamBase { public: @@ -93,9 +100,13 @@ private: template <class QueryCache, class CachedQuery, class CounterStream, class HostCounter> class QueryCacheBase : public VideoCommon::ChannelSetupCaches<VideoCommon::ChannelInfo> { public: - explicit QueryCacheBase(VideoCore::RasterizerInterface& rasterizer_) - : rasterizer{rasterizer_}, streams{{CounterStream{static_cast<QueryCache&>(*this), - VideoCore::QueryType::SamplesPassed}}} {} + explicit QueryCacheBase(VideoCore::RasterizerInterface& rasterizer_, + Core::Memory::Memory& cpu_memory_) + : rasterizer{rasterizer_}, + cpu_memory{cpu_memory_}, streams{{CounterStream{static_cast<QueryCache&>(*this), + VideoCore::QueryType::SamplesPassed}}} { + (void)slot_async_jobs.insert(); // Null value + } void InvalidateRegion(VAddr addr, std::size_t size) { std::unique_lock lock{mutex}; @@ -126,10 +137,15 @@ public: query = Register(type, *cpu_addr, host_ptr, timestamp.has_value()); } - query->BindCounter(Stream(type).Current(), timestamp); - if (Settings::values.use_asynchronous_gpu_emulation.GetValue()) { - AsyncFlushQuery(*cpu_addr); + auto result = query->BindCounter(Stream(type).Current(), timestamp); + if (result) { + auto async_job_id = query->GetAsyncJob(); + auto& async_job = slot_async_jobs[async_job_id]; + async_job.collected = true; + async_job.value = *result; + query->SetAsyncJob(NULL_ASYNC_JOB_ID); } + AsyncFlushQuery(query, timestamp, lock); } /// Updates counters from GPU state. Expected to be called once per draw, clear or dispatch. @@ -173,15 +189,18 @@ public: } void CommitAsyncFlushes() { + std::unique_lock lock{mutex}; committed_flushes.push_back(uncommitted_flushes); uncommitted_flushes.reset(); } bool HasUncommittedFlushes() const { + std::unique_lock lock{mutex}; return uncommitted_flushes != nullptr; } bool ShouldWaitAsyncFlushes() const { + std::unique_lock lock{mutex}; if (committed_flushes.empty()) { return false; } @@ -189,6 +208,7 @@ public: } void PopAsyncFlushes() { + std::unique_lock lock{mutex}; if (committed_flushes.empty()) { return; } @@ -197,15 +217,25 @@ public: committed_flushes.pop_front(); return; } - for (VAddr query_address : *flush_list) { - FlushAndRemoveRegion(query_address, 4); + for (AsyncJobId async_job_id : *flush_list) { + AsyncJob& async_job = slot_async_jobs[async_job_id]; + if (!async_job.collected) { + FlushAndRemoveRegion(async_job.query_location, 2, true); + } } committed_flushes.pop_front(); } private: + struct AsyncJob { + bool collected = false; + u64 value = 0; + VAddr query_location = 0; + std::optional<u64> timestamp{}; + }; + /// Flushes a memory range to guest memory and removes it from the cache. - void FlushAndRemoveRegion(VAddr addr, std::size_t size) { + void FlushAndRemoveRegion(VAddr addr, std::size_t size, bool async = false) { const u64 addr_begin = addr; const u64 addr_end = addr_begin + size; const auto in_range = [addr_begin, addr_end](const CachedQuery& query) { @@ -226,7 +256,16 @@ private: continue; } rasterizer.UpdatePagesCachedCount(query.GetCpuAddr(), query.SizeInBytes(), -1); - query.Flush(); + AsyncJobId async_job_id = query.GetAsyncJob(); + auto flush_result = query.Flush(async); + if (async_job_id == NULL_ASYNC_JOB_ID) { + ASSERT_MSG(false, "This should not be reachable at all"); + continue; + } + AsyncJob& async_job = slot_async_jobs[async_job_id]; + async_job.collected = true; + async_job.value = flush_result; + query.SetAsyncJob(NULL_ASYNC_JOB_ID); } std::erase_if(contents, in_range); } @@ -253,26 +292,60 @@ private: return found != std::end(contents) ? &*found : nullptr; } - void AsyncFlushQuery(VAddr addr) { - if (!uncommitted_flushes) { - uncommitted_flushes = std::make_shared<std::vector<VAddr>>(); + void AsyncFlushQuery(CachedQuery* query, std::optional<u64> timestamp, + std::unique_lock<std::recursive_mutex>& lock) { + const AsyncJobId new_async_job_id = slot_async_jobs.insert(); + { + AsyncJob& async_job = slot_async_jobs[new_async_job_id]; + query->SetAsyncJob(new_async_job_id); + async_job.query_location = query->GetCpuAddr(); + async_job.collected = false; + + if (!uncommitted_flushes) { + uncommitted_flushes = std::make_shared<std::vector<AsyncJobId>>(); + } + uncommitted_flushes->push_back(new_async_job_id); } - uncommitted_flushes->push_back(addr); + lock.unlock(); + std::function<void()> operation([this, new_async_job_id, timestamp] { + std::unique_lock local_lock{mutex}; + AsyncJob& async_job = slot_async_jobs[new_async_job_id]; + u64 value = async_job.value; + VAddr address = async_job.query_location; + slot_async_jobs.erase(new_async_job_id); + local_lock.unlock(); + if (timestamp) { + u64 timestamp_value = *timestamp; + cpu_memory.WriteBlockUnsafe(address + sizeof(u64), ×tamp_value, sizeof(u64)); + cpu_memory.WriteBlockUnsafe(address, &value, sizeof(u64)); + rasterizer.InvalidateRegion(address, sizeof(u64) * 2, + VideoCommon::CacheType::NoQueryCache); + } else { + u32 small_value = static_cast<u32>(value); + cpu_memory.WriteBlockUnsafe(address, &small_value, sizeof(u32)); + rasterizer.InvalidateRegion(address, sizeof(u32), + VideoCommon::CacheType::NoQueryCache); + } + }); + rasterizer.SyncOperation(std::move(operation)); } static constexpr std::uintptr_t YUZU_PAGESIZE = 4096; static constexpr unsigned YUZU_PAGEBITS = 12; + SlotVector<AsyncJob> slot_async_jobs; + VideoCore::RasterizerInterface& rasterizer; + Core::Memory::Memory& cpu_memory; - std::recursive_mutex mutex; + mutable std::recursive_mutex mutex; std::unordered_map<u64, std::vector<CachedQuery>> cached_queries; std::array<CounterStream, VideoCore::NumQueryTypes> streams; - std::shared_ptr<std::vector<VAddr>> uncommitted_flushes{}; - std::list<std::shared_ptr<std::vector<VAddr>>> committed_flushes; + std::shared_ptr<std::vector<AsyncJobId>> uncommitted_flushes{}; + std::list<std::shared_ptr<std::vector<AsyncJobId>>> committed_flushes; }; template <class QueryCache, class HostCounter> @@ -291,12 +364,12 @@ public: virtual ~HostCounterBase() = default; /// Returns the current value of the query. - u64 Query() { + u64 Query(bool async = false) { if (result) { return *result; } - u64 value = BlockingQuery() + base_result; + u64 value = BlockingQuery(async) + base_result; if (dependency) { value += dependency->Query(); dependency = nullptr; @@ -317,7 +390,7 @@ public: protected: /// Returns the value of query from the backend API blocking as needed. - virtual u64 BlockingQuery() const = 0; + virtual u64 BlockingQuery(bool async = false) const = 0; private: std::shared_ptr<HostCounter> dependency; ///< Counter to add to this value. @@ -340,26 +413,33 @@ public: CachedQueryBase& operator=(const CachedQueryBase&) = delete; /// Flushes the query to guest memory. - virtual void Flush() { + virtual u64 Flush(bool async = false) { // When counter is nullptr it means that it's just been reset. We are supposed to write a // zero in these cases. - const u64 value = counter ? counter->Query() : 0; + const u64 value = counter ? counter->Query(async) : 0; + if (async) { + return value; + } std::memcpy(host_ptr, &value, sizeof(u64)); if (timestamp) { std::memcpy(host_ptr + TIMESTAMP_OFFSET, &*timestamp, sizeof(u64)); } + return value; } /// Binds a counter to this query. - void BindCounter(std::shared_ptr<HostCounter> counter_, std::optional<u64> timestamp_) { + std::optional<u64> BindCounter(std::shared_ptr<HostCounter> counter_, + std::optional<u64> timestamp_) { + std::optional<u64> result{}; if (counter) { // If there's an old counter set it means the query is being rewritten by the game. // To avoid losing the data forever, flush here. - Flush(); + result = std::make_optional(Flush()); } counter = std::move(counter_); timestamp = timestamp_; + return result; } VAddr GetCpuAddr() const noexcept { @@ -374,6 +454,14 @@ public: return with_timestamp ? LARGE_QUERY_SIZE : SMALL_QUERY_SIZE; } + void SetAsyncJob(AsyncJobId assigned_async_job_) { + assigned_async_job = assigned_async_job_; + } + + AsyncJobId GetAsyncJob() const { + return assigned_async_job; + } + protected: /// Returns true when querying the counter may potentially block. bool WaitPending() const noexcept { @@ -389,6 +477,7 @@ private: u8* host_ptr; ///< Writable host pointer. std::shared_ptr<HostCounter> counter; ///< Host counter to query, owns the dependency tree. std::optional<u64> timestamp; ///< Timestamp to flush to guest memory. + AsyncJobId assigned_async_job; }; } // namespace VideoCommon diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h index a8c3f8b67..18d3c3ac0 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.h +++ b/src/video_core/renderer_opengl/gl_buffer_cache.h @@ -8,6 +8,7 @@ #include "common/common_types.h" #include "video_core/buffer_cache/buffer_cache.h" +#include "video_core/buffer_cache/memory_tracker_base.h" #include "video_core/rasterizer_interface.h" #include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_resource_manager.h" @@ -200,6 +201,8 @@ private: struct BufferCacheParams { using Runtime = OpenGL::BufferCacheRuntime; using Buffer = OpenGL::Buffer; + using Async_Buffer = u32; + using MemoryTracker = VideoCommon::MemoryTrackerBase<VideoCore::RasterizerInterface>; static constexpr bool IS_OPENGL = true; static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = true; @@ -208,6 +211,7 @@ struct BufferCacheParams { static constexpr bool NEEDS_BIND_STORAGE_INDEX = true; static constexpr bool USE_MEMORY_MAPS = false; static constexpr bool SEPARATE_IMAGE_BUFFER_BINDINGS = true; + static constexpr bool IMPLEMENTS_ASYNC_DOWNLOADS = false; }; using BufferCache = VideoCommon::BufferCache<BufferCacheParams>; diff --git a/src/video_core/renderer_opengl/gl_buffer_cache_base.cpp b/src/video_core/renderer_opengl/gl_buffer_cache_base.cpp new file mode 100644 index 000000000..f15ae8e25 --- /dev/null +++ b/src/video_core/renderer_opengl/gl_buffer_cache_base.cpp @@ -0,0 +1,9 @@ +// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "video_core/buffer_cache/buffer_cache.h" +#include "video_core/renderer_opengl/gl_buffer_cache.h" + +namespace VideoCommon { +template class VideoCommon::BufferCache<OpenGL::BufferCacheParams>; +} diff --git a/src/video_core/renderer_opengl/gl_fence_manager.h b/src/video_core/renderer_opengl/gl_fence_manager.h index f1446e732..e21b19dcc 100644 --- a/src/video_core/renderer_opengl/gl_fence_manager.h +++ b/src/video_core/renderer_opengl/gl_fence_manager.h @@ -30,7 +30,17 @@ private: }; using Fence = std::shared_ptr<GLInnerFence>; -using GenericFenceManager = VideoCommon::FenceManager<Fence, TextureCache, BufferCache, QueryCache>; + +struct FenceManagerParams { + using FenceType = Fence; + using BufferCacheType = BufferCache; + using TextureCacheType = TextureCache; + using QueryCacheType = QueryCache; + + static constexpr bool HAS_ASYNC_CHECK = false; +}; + +using GenericFenceManager = VideoCommon::FenceManager<FenceManagerParams>; class FenceManagerOpenGL final : public GenericFenceManager { public: diff --git a/src/video_core/renderer_opengl/gl_query_cache.cpp b/src/video_core/renderer_opengl/gl_query_cache.cpp index 5070db441..99d7347f5 100644 --- a/src/video_core/renderer_opengl/gl_query_cache.cpp +++ b/src/video_core/renderer_opengl/gl_query_cache.cpp @@ -26,8 +26,8 @@ constexpr GLenum GetTarget(VideoCore::QueryType type) { } // Anonymous namespace -QueryCache::QueryCache(RasterizerOpenGL& rasterizer_) - : QueryCacheBase(rasterizer_), gl_rasterizer{rasterizer_} {} +QueryCache::QueryCache(RasterizerOpenGL& rasterizer_, Core::Memory::Memory& cpu_memory_) + : QueryCacheBase(rasterizer_, cpu_memory_), gl_rasterizer{rasterizer_} {} QueryCache::~QueryCache() = default; @@ -74,7 +74,7 @@ void HostCounter::EndQuery() { glEndQuery(GetTarget(type)); } -u64 HostCounter::BlockingQuery() const { +u64 HostCounter::BlockingQuery([[maybe_unused]] bool async) const { GLint64 value; glGetQueryObjecti64v(query.handle, GL_QUERY_RESULT, &value); return static_cast<u64>(value); @@ -96,7 +96,7 @@ CachedQuery& CachedQuery::operator=(CachedQuery&& rhs) noexcept { return *this; } -void CachedQuery::Flush() { +u64 CachedQuery::Flush([[maybe_unused]] bool async) { // Waiting for a query while another query of the same target is enabled locks Nvidia's driver. // To avoid this disable and re-enable keeping the dependency stream. // But we only have to do this if we have pending waits to be done. @@ -106,11 +106,13 @@ void CachedQuery::Flush() { stream.Update(false); } - VideoCommon::CachedQueryBase<HostCounter>::Flush(); + auto result = VideoCommon::CachedQueryBase<HostCounter>::Flush(); if (slice_counter) { stream.Update(true); } + + return result; } } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_query_cache.h b/src/video_core/renderer_opengl/gl_query_cache.h index 14ce59990..872513f22 100644 --- a/src/video_core/renderer_opengl/gl_query_cache.h +++ b/src/video_core/renderer_opengl/gl_query_cache.h @@ -28,7 +28,7 @@ using CounterStream = VideoCommon::CounterStreamBase<QueryCache, HostCounter>; class QueryCache final : public VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream, HostCounter> { public: - explicit QueryCache(RasterizerOpenGL& rasterizer_); + explicit QueryCache(RasterizerOpenGL& rasterizer_, Core::Memory::Memory& cpu_memory_); ~QueryCache(); OGLQuery AllocateQuery(VideoCore::QueryType type); @@ -51,7 +51,7 @@ public: void EndQuery(); private: - u64 BlockingQuery() const override; + u64 BlockingQuery(bool async = false) const override; QueryCache& cache; const VideoCore::QueryType type; @@ -70,7 +70,7 @@ public: CachedQuery(const CachedQuery&) = delete; CachedQuery& operator=(const CachedQuery&) = delete; - void Flush() override; + u64 Flush(bool async = false) override; private: QueryCache* cache; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 4993d4709..0089b4b27 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -63,7 +63,7 @@ RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window_, Tegra buffer_cache(*this, cpu_memory_, buffer_cache_runtime), shader_cache(*this, emu_window_, device, texture_cache, buffer_cache, program_manager, state_tracker, gpu.ShaderNotify()), - query_cache(*this), accelerate_dma(buffer_cache, texture_cache), + query_cache(*this, cpu_memory_), accelerate_dma(buffer_cache, texture_cache), fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache), blit_image(program_manager_) {} diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 032a8ebc5..47cccd0e5 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -861,9 +861,12 @@ GLuint Image::StorageHandle() noexcept { case PixelFormat::ASTC_2D_8X5_SRGB: case PixelFormat::ASTC_2D_5X4_SRGB: case PixelFormat::ASTC_2D_5X5_SRGB: + case PixelFormat::ASTC_2D_10X5_SRGB: + case PixelFormat::ASTC_2D_10X6_SRGB: case PixelFormat::ASTC_2D_10X8_SRGB: case PixelFormat::ASTC_2D_6X6_SRGB: case PixelFormat::ASTC_2D_10X10_SRGB: + case PixelFormat::ASTC_2D_12X10_SRGB: case PixelFormat::ASTC_2D_12X12_SRGB: case PixelFormat::ASTC_2D_8X6_SRGB: case PixelFormat::ASTC_2D_6X5_SRGB: diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h index ef1190e1f..c7dc7e0a1 100644 --- a/src/video_core/renderer_opengl/maxwell_to_gl.h +++ b/src/video_core/renderer_opengl/maxwell_to_gl.h @@ -100,10 +100,13 @@ constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> FORMAT_TAB {GL_COMPRESSED_RGBA_ASTC_6x6_KHR}, // ASTC_2D_6X6_UNORM {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_6x6_KHR}, // ASTC_2D_6X6_SRGB {GL_COMPRESSED_RGBA_ASTC_10x6_KHR}, // ASTC_2D_10X6_UNORM + {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x6_KHR}, // ASTC_2D_10X6_SRGB {GL_COMPRESSED_RGBA_ASTC_10x5_KHR}, // ASTC_2D_10X5_UNORM {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x5_KHR}, // ASTC_2D_10X5_SRGB {GL_COMPRESSED_RGBA_ASTC_10x10_KHR}, // ASTC_2D_10X10_UNORM {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x10_KHR}, // ASTC_2D_10X10_SRGB + {GL_COMPRESSED_RGBA_ASTC_12x10_KHR}, // ASTC_2D_12X10_UNORM + {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x10_KHR}, // ASTC_2D_12X10_SRGB {GL_COMPRESSED_RGBA_ASTC_12x12_KHR}, // ASTC_2D_12X12_UNORM {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x12_KHR}, // ASTC_2D_12X12_SRGB {GL_COMPRESSED_RGBA_ASTC_8x6_KHR}, // ASTC_2D_8X6_UNORM diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp index 5dce51be8..8853cf0f7 100644 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp @@ -197,10 +197,13 @@ struct FormatTuple { {VK_FORMAT_ASTC_6x6_UNORM_BLOCK}, // ASTC_2D_6X6_UNORM {VK_FORMAT_ASTC_6x6_SRGB_BLOCK}, // ASTC_2D_6X6_SRGB {VK_FORMAT_ASTC_10x6_UNORM_BLOCK}, // ASTC_2D_10X6_UNORM + {VK_FORMAT_ASTC_10x6_SRGB_BLOCK}, // ASTC_2D_10X6_SRGB {VK_FORMAT_ASTC_10x5_UNORM_BLOCK}, // ASTC_2D_10X5_UNORM {VK_FORMAT_ASTC_10x5_SRGB_BLOCK}, // ASTC_2D_10X5_SRGB {VK_FORMAT_ASTC_10x10_UNORM_BLOCK}, // ASTC_2D_10X10_UNORM {VK_FORMAT_ASTC_10x10_SRGB_BLOCK}, // ASTC_2D_10X10_SRGB + {VK_FORMAT_ASTC_12x10_UNORM_BLOCK}, // ASTC_2D_12X10_UNORM + {VK_FORMAT_ASTC_12x10_SRGB_BLOCK}, // ASTC_2D_12X10_SRGB {VK_FORMAT_ASTC_12x12_UNORM_BLOCK}, // ASTC_2D_12X12_UNORM {VK_FORMAT_ASTC_12x12_SRGB_BLOCK}, // ASTC_2D_12X12_SRGB {VK_FORMAT_ASTC_8x6_UNORM_BLOCK}, // ASTC_2D_8X6_UNORM diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp index 2a8d9e377..908625c66 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp +++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp @@ -93,8 +93,9 @@ RendererVulkan::RendererVulkan(Core::TelemetrySession& telemetry_session_, state_tracker(), scheduler(device, state_tracker), swapchain(*surface, device, scheduler, render_window.GetFramebufferLayout().width, render_window.GetFramebufferLayout().height, false), - blit_screen(cpu_memory, render_window, device, memory_allocator, swapchain, scheduler, - screen_info), + present_manager(render_window, device, memory_allocator, scheduler, swapchain), + blit_screen(cpu_memory, render_window, device, memory_allocator, swapchain, present_manager, + scheduler, screen_info), rasterizer(render_window, gpu, cpu_memory, screen_info, device, memory_allocator, state_tracker, scheduler) { if (Settings::values.renderer_force_max_clock.GetValue() && device.ShouldBoostClocks()) { @@ -121,46 +122,19 @@ void RendererVulkan::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { return; } // Update screen info if the framebuffer size has changed. - if (screen_info.width != framebuffer->width || screen_info.height != framebuffer->height) { - screen_info.width = framebuffer->width; - screen_info.height = framebuffer->height; - } + screen_info.width = framebuffer->width; + screen_info.height = framebuffer->height; + const VAddr framebuffer_addr = framebuffer->address + framebuffer->offset; const bool use_accelerated = rasterizer.AccelerateDisplay(*framebuffer, framebuffer_addr, framebuffer->stride); const bool is_srgb = use_accelerated && screen_info.is_srgb; RenderScreenshot(*framebuffer, use_accelerated); - bool has_been_recreated = false; - const auto recreate_swapchain = [&](u32 width, u32 height) { - if (!has_been_recreated) { - has_been_recreated = true; - scheduler.Finish(); - } - swapchain.Create(width, height, is_srgb); - }; - - const Layout::FramebufferLayout layout = render_window.GetFramebufferLayout(); - if (swapchain.NeedsRecreation(is_srgb) || swapchain.GetWidth() != layout.width || - swapchain.GetHeight() != layout.height) { - recreate_swapchain(layout.width, layout.height); - } - bool is_outdated; - do { - swapchain.AcquireNextImage(); - is_outdated = swapchain.IsOutDated(); - if (is_outdated) { - recreate_swapchain(layout.width, layout.height); - } - } while (is_outdated); - if (has_been_recreated) { - blit_screen.Recreate(); - } - const VkSemaphore render_semaphore = blit_screen.DrawToSwapchain(*framebuffer, use_accelerated); - const VkSemaphore present_semaphore = swapchain.CurrentPresentSemaphore(); - scheduler.Flush(render_semaphore, present_semaphore); - scheduler.WaitWorker(); - swapchain.Present(render_semaphore); + Frame* frame = present_manager.GetRenderFrame(); + blit_screen.DrawToSwapchain(frame, *framebuffer, use_accelerated, is_srgb); + scheduler.Flush(*frame->render_ready); + present_manager.Present(frame); gpu.RendererFrameEndNotify(); rasterizer.TickFrame(); @@ -246,8 +220,7 @@ void Vulkan::RendererVulkan::RenderScreenshot(const Tegra::FramebufferConfig& fr }); const VkExtent2D render_area{.width = layout.width, .height = layout.height}; const vk::Framebuffer screenshot_fb = blit_screen.CreateFramebuffer(*dst_view, render_area); - // Since we're not rendering to the screen, ignore the render semaphore. - void(blit_screen.Draw(framebuffer, *screenshot_fb, layout, render_area, use_accelerated)); + blit_screen.Draw(framebuffer, *screenshot_fb, layout, render_area, use_accelerated); const auto buffer_size = static_cast<VkDeviceSize>(layout.width * layout.height * 4); const VkBufferCreateInfo dst_buffer_info{ @@ -270,7 +243,7 @@ void Vulkan::RendererVulkan::RenderScreenshot(const Tegra::FramebufferConfig& fr .pNext = nullptr, .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT, .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT, - .oldLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR, + .oldLayout = VK_IMAGE_LAYOUT_GENERAL, .newLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.h b/src/video_core/renderer_vulkan/renderer_vulkan.h index 009e75e0d..f44367cb2 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.h +++ b/src/video_core/renderer_vulkan/renderer_vulkan.h @@ -9,6 +9,7 @@ #include "common/dynamic_library.h" #include "video_core/renderer_base.h" #include "video_core/renderer_vulkan/vk_blit_screen.h" +#include "video_core/renderer_vulkan/vk_present_manager.h" #include "video_core/renderer_vulkan/vk_rasterizer.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_state_tracker.h" @@ -76,6 +77,7 @@ private: StateTracker state_tracker; Scheduler scheduler; Swapchain swapchain; + PresentManager present_manager; BlitScreen blit_screen; RasterizerVulkan rasterizer; std::optional<TurboMode> turbo_mode; diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.cpp b/src/video_core/renderer_vulkan/vk_blit_screen.cpp index 2f0cc27e8..1e0fdd3d9 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.cpp +++ b/src/video_core/renderer_vulkan/vk_blit_screen.cpp @@ -122,10 +122,12 @@ struct BlitScreen::BufferData { BlitScreen::BlitScreen(Core::Memory::Memory& cpu_memory_, Core::Frontend::EmuWindow& render_window_, const Device& device_, MemoryAllocator& memory_allocator_, - Swapchain& swapchain_, Scheduler& scheduler_, const ScreenInfo& screen_info_) + Swapchain& swapchain_, PresentManager& present_manager_, + Scheduler& scheduler_, const ScreenInfo& screen_info_) : cpu_memory{cpu_memory_}, render_window{render_window_}, device{device_}, - memory_allocator{memory_allocator_}, swapchain{swapchain_}, scheduler{scheduler_}, - image_count{swapchain.GetImageCount()}, screen_info{screen_info_} { + memory_allocator{memory_allocator_}, swapchain{swapchain_}, present_manager{present_manager_}, + scheduler{scheduler_}, image_count{swapchain.GetImageCount()}, screen_info{screen_info_}, + current_srgb{swapchain.IsSrgb()}, image_view_format{swapchain.GetImageViewFormat()} { resource_ticks.resize(image_count); CreateStaticResources(); @@ -135,25 +137,20 @@ BlitScreen::BlitScreen(Core::Memory::Memory& cpu_memory_, Core::Frontend::EmuWin BlitScreen::~BlitScreen() = default; void BlitScreen::Recreate() { + present_manager.WaitPresent(); + scheduler.Finish(); + device.GetLogical().WaitIdle(); CreateDynamicResources(); } -VkSemaphore BlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, - const VkFramebuffer& host_framebuffer, - const Layout::FramebufferLayout layout, VkExtent2D render_area, - bool use_accelerated) { +void BlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, + const VkFramebuffer& host_framebuffer, const Layout::FramebufferLayout layout, + VkExtent2D render_area, bool use_accelerated) { RefreshResources(framebuffer); // Finish any pending renderpass scheduler.RequestOutsideRenderPassOperationContext(); - if (const auto swapchain_images = swapchain.GetImageCount(); swapchain_images != image_count) { - image_count = swapchain_images; - Recreate(); - } - - const std::size_t image_index = swapchain.GetImageIndex(); - scheduler.Wait(resource_ticks[image_index]); resource_ticks[image_index] = scheduler.CurrentTick(); @@ -169,7 +166,7 @@ VkSemaphore BlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, std::memcpy(mapped_span.data(), &data, sizeof(data)); if (!use_accelerated) { - const u64 image_offset = GetRawImageOffset(framebuffer, image_index); + const u64 image_offset = GetRawImageOffset(framebuffer); const VAddr framebuffer_addr = framebuffer.address + framebuffer.offset; const u8* const host_ptr = cpu_memory.GetPointer(framebuffer_addr); @@ -204,8 +201,8 @@ VkSemaphore BlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, .depth = 1, }, }; - scheduler.Record([this, copy, image_index](vk::CommandBuffer cmdbuf) { - const VkImage image = *raw_images[image_index]; + scheduler.Record([this, copy, index = image_index](vk::CommandBuffer cmdbuf) { + const VkImage image = *raw_images[index]; const VkImageMemoryBarrier base_barrier{ .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, .pNext = nullptr, @@ -245,14 +242,15 @@ VkSemaphore BlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, const auto anti_alias_pass = Settings::values.anti_aliasing.GetValue(); if (use_accelerated && anti_alias_pass == Settings::AntiAliasing::Fxaa) { - UpdateAADescriptorSet(image_index, source_image_view, false); + UpdateAADescriptorSet(source_image_view, false); const u32 up_scale = Settings::values.resolution_info.up_scale; const u32 down_shift = Settings::values.resolution_info.down_shift; VkExtent2D size{ .width = (up_scale * framebuffer.width) >> down_shift, .height = (up_scale * framebuffer.height) >> down_shift, }; - scheduler.Record([this, image_index, size, anti_alias_pass](vk::CommandBuffer cmdbuf) { + scheduler.Record([this, index = image_index, size, + anti_alias_pass](vk::CommandBuffer cmdbuf) { const VkImageMemoryBarrier base_barrier{ .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, .pNext = nullptr, @@ -326,7 +324,7 @@ VkSemaphore BlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, cmdbuf.BindVertexBuffer(0, *buffer, offsetof(BufferData, vertices)); cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_GRAPHICS, *aa_pipeline_layout, 0, - aa_descriptor_sets[image_index], {}); + aa_descriptor_sets[index], {}); cmdbuf.Draw(4, 1, 0, 0); cmdbuf.EndRenderPass(); @@ -369,81 +367,99 @@ VkSemaphore BlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, }; VkImageView fsr_image_view = fsr->Draw(scheduler, image_index, source_image_view, fsr_input_size, crop_rect); - UpdateDescriptorSet(image_index, fsr_image_view, true); + UpdateDescriptorSet(fsr_image_view, true); } else { const bool is_nn = Settings::values.scaling_filter.GetValue() == Settings::ScalingFilter::NearestNeighbor; - UpdateDescriptorSet(image_index, source_image_view, is_nn); + UpdateDescriptorSet(source_image_view, is_nn); } - scheduler.Record( - [this, host_framebuffer, image_index, size = render_area](vk::CommandBuffer cmdbuf) { - const f32 bg_red = Settings::values.bg_red.GetValue() / 255.0f; - const f32 bg_green = Settings::values.bg_green.GetValue() / 255.0f; - const f32 bg_blue = Settings::values.bg_blue.GetValue() / 255.0f; - const VkClearValue clear_color{ - .color = {.float32 = {bg_red, bg_green, bg_blue, 1.0f}}, - }; - const VkRenderPassBeginInfo renderpass_bi{ - .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO, - .pNext = nullptr, - .renderPass = *renderpass, - .framebuffer = host_framebuffer, - .renderArea = - { - .offset = {0, 0}, - .extent = size, - }, - .clearValueCount = 1, - .pClearValues = &clear_color, - }; - const VkViewport viewport{ - .x = 0.0f, - .y = 0.0f, - .width = static_cast<float>(size.width), - .height = static_cast<float>(size.height), - .minDepth = 0.0f, - .maxDepth = 1.0f, - }; - const VkRect2D scissor{ - .offset = {0, 0}, - .extent = size, - }; - cmdbuf.BeginRenderPass(renderpass_bi, VK_SUBPASS_CONTENTS_INLINE); - auto graphics_pipeline = [this]() { - switch (Settings::values.scaling_filter.GetValue()) { - case Settings::ScalingFilter::NearestNeighbor: - case Settings::ScalingFilter::Bilinear: - return *bilinear_pipeline; - case Settings::ScalingFilter::Bicubic: - return *bicubic_pipeline; - case Settings::ScalingFilter::Gaussian: - return *gaussian_pipeline; - case Settings::ScalingFilter::ScaleForce: - return *scaleforce_pipeline; - default: - return *bilinear_pipeline; - } - }(); - cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, graphics_pipeline); - cmdbuf.SetViewport(0, viewport); - cmdbuf.SetScissor(0, scissor); - - cmdbuf.BindVertexBuffer(0, *buffer, offsetof(BufferData, vertices)); - cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_GRAPHICS, *pipeline_layout, 0, - descriptor_sets[image_index], {}); - cmdbuf.Draw(4, 1, 0, 0); - cmdbuf.EndRenderPass(); - }); - return *semaphores[image_index]; + scheduler.Record([this, host_framebuffer, index = image_index, + size = render_area](vk::CommandBuffer cmdbuf) { + const f32 bg_red = Settings::values.bg_red.GetValue() / 255.0f; + const f32 bg_green = Settings::values.bg_green.GetValue() / 255.0f; + const f32 bg_blue = Settings::values.bg_blue.GetValue() / 255.0f; + const VkClearValue clear_color{ + .color = {.float32 = {bg_red, bg_green, bg_blue, 1.0f}}, + }; + const VkRenderPassBeginInfo renderpass_bi{ + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO, + .pNext = nullptr, + .renderPass = *renderpass, + .framebuffer = host_framebuffer, + .renderArea = + { + .offset = {0, 0}, + .extent = size, + }, + .clearValueCount = 1, + .pClearValues = &clear_color, + }; + const VkViewport viewport{ + .x = 0.0f, + .y = 0.0f, + .width = static_cast<float>(size.width), + .height = static_cast<float>(size.height), + .minDepth = 0.0f, + .maxDepth = 1.0f, + }; + const VkRect2D scissor{ + .offset = {0, 0}, + .extent = size, + }; + cmdbuf.BeginRenderPass(renderpass_bi, VK_SUBPASS_CONTENTS_INLINE); + auto graphics_pipeline = [this]() { + switch (Settings::values.scaling_filter.GetValue()) { + case Settings::ScalingFilter::NearestNeighbor: + case Settings::ScalingFilter::Bilinear: + return *bilinear_pipeline; + case Settings::ScalingFilter::Bicubic: + return *bicubic_pipeline; + case Settings::ScalingFilter::Gaussian: + return *gaussian_pipeline; + case Settings::ScalingFilter::ScaleForce: + return *scaleforce_pipeline; + default: + return *bilinear_pipeline; + } + }(); + cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, graphics_pipeline); + cmdbuf.SetViewport(0, viewport); + cmdbuf.SetScissor(0, scissor); + + cmdbuf.BindVertexBuffer(0, *buffer, offsetof(BufferData, vertices)); + cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_GRAPHICS, *pipeline_layout, 0, + descriptor_sets[index], {}); + cmdbuf.Draw(4, 1, 0, 0); + cmdbuf.EndRenderPass(); + }); } -VkSemaphore BlitScreen::DrawToSwapchain(const Tegra::FramebufferConfig& framebuffer, - bool use_accelerated) { - const std::size_t image_index = swapchain.GetImageIndex(); - const VkExtent2D render_area = swapchain.GetSize(); +void BlitScreen::DrawToSwapchain(Frame* frame, const Tegra::FramebufferConfig& framebuffer, + bool use_accelerated, bool is_srgb) { + // Recreate dynamic resources if the the image count or colorspace changed + if (const std::size_t swapchain_images = swapchain.GetImageCount(); + swapchain_images != image_count || current_srgb != is_srgb) { + current_srgb = is_srgb; + image_view_format = current_srgb ? VK_FORMAT_B8G8R8A8_SRGB : VK_FORMAT_B8G8R8A8_UNORM; + image_count = swapchain_images; + Recreate(); + } + + // Recreate the presentation frame if the dimensions of the window changed const Layout::FramebufferLayout layout = render_window.GetFramebufferLayout(); - return Draw(framebuffer, *framebuffers[image_index], layout, render_area, use_accelerated); + if (layout.width != frame->width || layout.height != frame->height || + is_srgb != frame->is_srgb) { + Recreate(); + present_manager.RecreateFrame(frame, layout.width, layout.height, is_srgb, + image_view_format, *renderpass); + } + + const VkExtent2D render_area{frame->width, frame->height}; + Draw(framebuffer, *frame->framebuffer, layout, render_area, use_accelerated); + if (++image_index >= image_count) { + image_index = 0; + } } vk::Framebuffer BlitScreen::CreateFramebuffer(const VkImageView& image_view, VkExtent2D extent) { @@ -471,13 +487,11 @@ void BlitScreen::CreateStaticResources() { } void BlitScreen::CreateDynamicResources() { - CreateSemaphores(); CreateDescriptorPool(); CreateDescriptorSetLayout(); CreateDescriptorSets(); CreatePipelineLayout(); CreateRenderPass(); - CreateFramebuffers(); CreateGraphicsPipeline(); fsr.reset(); smaa.reset(); @@ -525,11 +539,6 @@ void BlitScreen::CreateShaders() { } } -void BlitScreen::CreateSemaphores() { - semaphores.resize(image_count); - std::ranges::generate(semaphores, [this] { return device.GetLogical().CreateSemaphore(); }); -} - void BlitScreen::CreateDescriptorPool() { const std::array<VkDescriptorPoolSize, 2> pool_sizes{{ { @@ -571,10 +580,10 @@ void BlitScreen::CreateDescriptorPool() { } void BlitScreen::CreateRenderPass() { - renderpass = CreateRenderPassImpl(swapchain.GetImageViewFormat()); + renderpass = CreateRenderPassImpl(image_view_format); } -vk::RenderPass BlitScreen::CreateRenderPassImpl(VkFormat format, bool is_present) { +vk::RenderPass BlitScreen::CreateRenderPassImpl(VkFormat format) { const VkAttachmentDescription color_attachment{ .flags = 0, .format = format, @@ -584,7 +593,7 @@ vk::RenderPass BlitScreen::CreateRenderPassImpl(VkFormat format, bool is_present .stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE, .stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE, .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, - .finalLayout = is_present ? VK_IMAGE_LAYOUT_PRESENT_SRC_KHR : VK_IMAGE_LAYOUT_GENERAL, + .finalLayout = VK_IMAGE_LAYOUT_GENERAL, }; const VkAttachmentReference color_attachment_ref{ @@ -1052,16 +1061,6 @@ void BlitScreen::CreateSampler() { nn_sampler = device.GetLogical().CreateSampler(ci_nn); } -void BlitScreen::CreateFramebuffers() { - const VkExtent2D size{swapchain.GetSize()}; - framebuffers.resize(image_count); - - for (std::size_t i = 0; i < image_count; ++i) { - const VkImageView image_view{swapchain.GetImageViewIndex(i)}; - framebuffers[i] = CreateFramebuffer(image_view, size, renderpass); - } -} - void BlitScreen::ReleaseRawImages() { for (const u64 tick : resource_ticks) { scheduler.Wait(tick); @@ -1175,7 +1174,7 @@ void BlitScreen::CreateRawImages(const Tegra::FramebufferConfig& framebuffer) { aa_framebuffer = CreateFramebuffer(*aa_image_view, size, aa_renderpass); return; } - aa_renderpass = CreateRenderPassImpl(GetFormat(framebuffer), false); + aa_renderpass = CreateRenderPassImpl(GetFormat(framebuffer)); aa_framebuffer = CreateFramebuffer(*aa_image_view, size, aa_renderpass); const std::array<VkPipelineShaderStageCreateInfo, 2> fxaa_shader_stages{{ @@ -1319,8 +1318,7 @@ void BlitScreen::CreateRawImages(const Tegra::FramebufferConfig& framebuffer) { aa_pipeline = device.GetLogical().CreateGraphicsPipeline(fxaa_pipeline_ci); } -void BlitScreen::UpdateAADescriptorSet(std::size_t image_index, VkImageView image_view, - bool nn) const { +void BlitScreen::UpdateAADescriptorSet(VkImageView image_view, bool nn) const { const VkDescriptorImageInfo image_info{ .sampler = nn ? *nn_sampler : *sampler, .imageView = image_view, @@ -1356,8 +1354,7 @@ void BlitScreen::UpdateAADescriptorSet(std::size_t image_index, VkImageView imag device.GetLogical().UpdateDescriptorSets(std::array{sampler_write, sampler_write_2}, {}); } -void BlitScreen::UpdateDescriptorSet(std::size_t image_index, VkImageView image_view, - bool nn) const { +void BlitScreen::UpdateDescriptorSet(VkImageView image_view, bool nn) const { const VkDescriptorBufferInfo buffer_info{ .buffer = *buffer, .offset = offsetof(BufferData, uniform), @@ -1480,8 +1477,7 @@ u64 BlitScreen::CalculateBufferSize(const Tegra::FramebufferConfig& framebuffer) return sizeof(BufferData) + GetSizeInBytes(framebuffer) * image_count; } -u64 BlitScreen::GetRawImageOffset(const Tegra::FramebufferConfig& framebuffer, - std::size_t image_index) const { +u64 BlitScreen::GetRawImageOffset(const Tegra::FramebufferConfig& framebuffer) const { constexpr auto first_image_offset = static_cast<u64>(sizeof(BufferData)); return first_image_offset + GetSizeInBytes(framebuffer) * image_index; } diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.h b/src/video_core/renderer_vulkan/vk_blit_screen.h index ebe10b08b..68ec20253 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.h +++ b/src/video_core/renderer_vulkan/vk_blit_screen.h @@ -5,6 +5,7 @@ #include <memory> +#include "core/frontend/framebuffer_layout.h" #include "video_core/vulkan_common/vulkan_memory_allocator.h" #include "video_core/vulkan_common/vulkan_wrapper.h" @@ -42,6 +43,9 @@ class RasterizerVulkan; class Scheduler; class SMAA; class Swapchain; +class PresentManager; + +struct Frame; struct ScreenInfo { VkImage image{}; @@ -55,18 +59,17 @@ class BlitScreen { public: explicit BlitScreen(Core::Memory::Memory& cpu_memory, Core::Frontend::EmuWindow& render_window, const Device& device, MemoryAllocator& memory_manager, Swapchain& swapchain, - Scheduler& scheduler, const ScreenInfo& screen_info); + PresentManager& present_manager, Scheduler& scheduler, + const ScreenInfo& screen_info); ~BlitScreen(); void Recreate(); - [[nodiscard]] VkSemaphore Draw(const Tegra::FramebufferConfig& framebuffer, - const VkFramebuffer& host_framebuffer, - const Layout::FramebufferLayout layout, VkExtent2D render_area, - bool use_accelerated); + void Draw(const Tegra::FramebufferConfig& framebuffer, const VkFramebuffer& host_framebuffer, + const Layout::FramebufferLayout layout, VkExtent2D render_area, bool use_accelerated); - [[nodiscard]] VkSemaphore DrawToSwapchain(const Tegra::FramebufferConfig& framebuffer, - bool use_accelerated); + void DrawToSwapchain(Frame* frame, const Tegra::FramebufferConfig& framebuffer, + bool use_accelerated, bool is_srgb); [[nodiscard]] vk::Framebuffer CreateFramebuffer(const VkImageView& image_view, VkExtent2D extent); @@ -79,10 +82,9 @@ private: void CreateStaticResources(); void CreateShaders(); - void CreateSemaphores(); void CreateDescriptorPool(); void CreateRenderPass(); - vk::RenderPass CreateRenderPassImpl(VkFormat, bool is_present = true); + vk::RenderPass CreateRenderPassImpl(VkFormat format); void CreateDescriptorSetLayout(); void CreateDescriptorSets(); void CreatePipelineLayout(); @@ -90,15 +92,14 @@ private: void CreateSampler(); void CreateDynamicResources(); - void CreateFramebuffers(); void RefreshResources(const Tegra::FramebufferConfig& framebuffer); void ReleaseRawImages(); void CreateStagingBuffer(const Tegra::FramebufferConfig& framebuffer); void CreateRawImages(const Tegra::FramebufferConfig& framebuffer); - void UpdateDescriptorSet(std::size_t image_index, VkImageView image_view, bool nn) const; - void UpdateAADescriptorSet(std::size_t image_index, VkImageView image_view, bool nn) const; + void UpdateDescriptorSet(VkImageView image_view, bool nn) const; + void UpdateAADescriptorSet(VkImageView image_view, bool nn) const; void SetUniformData(BufferData& data, const Layout::FramebufferLayout layout) const; void SetVertexData(BufferData& data, const Tegra::FramebufferConfig& framebuffer, const Layout::FramebufferLayout layout) const; @@ -107,16 +108,17 @@ private: void CreateFSR(); u64 CalculateBufferSize(const Tegra::FramebufferConfig& framebuffer) const; - u64 GetRawImageOffset(const Tegra::FramebufferConfig& framebuffer, - std::size_t image_index) const; + u64 GetRawImageOffset(const Tegra::FramebufferConfig& framebuffer) const; Core::Memory::Memory& cpu_memory; Core::Frontend::EmuWindow& render_window; const Device& device; MemoryAllocator& memory_allocator; Swapchain& swapchain; + PresentManager& present_manager; Scheduler& scheduler; std::size_t image_count; + std::size_t image_index{}; const ScreenInfo& screen_info; vk::ShaderModule vertex_shader; @@ -135,7 +137,6 @@ private: vk::Pipeline gaussian_pipeline; vk::Pipeline scaleforce_pipeline; vk::RenderPass renderpass; - std::vector<vk::Framebuffer> framebuffers; vk::DescriptorSets descriptor_sets; vk::Sampler nn_sampler; vk::Sampler sampler; @@ -145,7 +146,6 @@ private: std::vector<u64> resource_ticks; - std::vector<vk::Semaphore> semaphores; std::vector<vk::Image> raw_images; std::vector<vk::ImageView> raw_image_views; std::vector<MemoryCommit> raw_buffer_commits; @@ -164,6 +164,8 @@ private: u32 raw_width = 0; u32 raw_height = 0; Service::android::PixelFormat pixel_format{}; + bool current_srgb; + VkFormat image_view_format; std::unique_ptr<FSR> fsr; std::unique_ptr<SMAA> smaa; diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp index 9cbcb3c8f..510602e8e 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp @@ -314,8 +314,12 @@ StagingBufferRef BufferCacheRuntime::UploadStagingBuffer(size_t size) { return staging_pool.Request(size, MemoryUsage::Upload); } -StagingBufferRef BufferCacheRuntime::DownloadStagingBuffer(size_t size) { - return staging_pool.Request(size, MemoryUsage::Download); +StagingBufferRef BufferCacheRuntime::DownloadStagingBuffer(size_t size, bool deferred) { + return staging_pool.Request(size, MemoryUsage::Download, deferred); +} + +void BufferCacheRuntime::FreeDeferredStagingBuffer(StagingBufferRef& ref) { + staging_pool.FreeDeferred(ref); } u64 BufferCacheRuntime::GetDeviceLocalMemory() const { diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h index 183b33632..879f1ed94 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.h +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h @@ -3,7 +3,8 @@ #pragma once -#include "video_core/buffer_cache/buffer_cache.h" +#include "video_core/buffer_cache/buffer_cache_base.h" +#include "video_core/buffer_cache/memory_tracker_base.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/renderer_vulkan/vk_compute_pass.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" @@ -75,7 +76,9 @@ public: [[nodiscard]] StagingBufferRef UploadStagingBuffer(size_t size); - [[nodiscard]] StagingBufferRef DownloadStagingBuffer(size_t size); + [[nodiscard]] StagingBufferRef DownloadStagingBuffer(size_t size, bool deferred = false); + + void FreeDeferredStagingBuffer(StagingBufferRef& ref); void PreCopyBarrier(); @@ -142,6 +145,8 @@ private: struct BufferCacheParams { using Runtime = Vulkan::BufferCacheRuntime; using Buffer = Vulkan::Buffer; + using Async_Buffer = Vulkan::StagingBufferRef; + using MemoryTracker = VideoCommon::MemoryTrackerBase<VideoCore::RasterizerInterface>; static constexpr bool IS_OPENGL = false; static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = false; @@ -150,6 +155,7 @@ struct BufferCacheParams { static constexpr bool NEEDS_BIND_STORAGE_INDEX = false; static constexpr bool USE_MEMORY_MAPS = true; static constexpr bool SEPARATE_IMAGE_BUFFER_BINDINGS = false; + static constexpr bool IMPLEMENTS_ASYNC_DOWNLOADS = true; }; using BufferCache = VideoCommon::BufferCache<BufferCacheParams>; diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache_base.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache_base.cpp new file mode 100644 index 000000000..f9e271507 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_buffer_cache_base.cpp @@ -0,0 +1,9 @@ +// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "video_core/buffer_cache/buffer_cache.h" +#include "video_core/renderer_vulkan/vk_buffer_cache.h" + +namespace VideoCommon { +template class VideoCommon::BufferCache<Vulkan::BufferCacheParams>; +} diff --git a/src/video_core/renderer_vulkan/vk_fence_manager.cpp b/src/video_core/renderer_vulkan/vk_fence_manager.cpp index 0214b103a..fad9e3832 100644 --- a/src/video_core/renderer_vulkan/vk_fence_manager.cpp +++ b/src/video_core/renderer_vulkan/vk_fence_manager.cpp @@ -5,6 +5,7 @@ #include "video_core/renderer_vulkan/vk_buffer_cache.h" #include "video_core/renderer_vulkan/vk_fence_manager.h" +#include "video_core/renderer_vulkan/vk_query_cache.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_texture_cache.h" #include "video_core/vulkan_common/vulkan_device.h" diff --git a/src/video_core/renderer_vulkan/vk_fence_manager.h b/src/video_core/renderer_vulkan/vk_fence_manager.h index 7fe2afcd9..145359d4e 100644 --- a/src/video_core/renderer_vulkan/vk_fence_manager.h +++ b/src/video_core/renderer_vulkan/vk_fence_manager.h @@ -40,7 +40,16 @@ private: }; using Fence = std::shared_ptr<InnerFence>; -using GenericFenceManager = VideoCommon::FenceManager<Fence, TextureCache, BufferCache, QueryCache>; +struct FenceManagerParams { + using FenceType = Fence; + using BufferCacheType = BufferCache; + using TextureCacheType = TextureCache; + using QueryCacheType = QueryCache; + + static constexpr bool HAS_ASYNC_CHECK = true; +}; + +using GenericFenceManager = VideoCommon::FenceManager<FenceManagerParams>; class FenceManager final : public GenericFenceManager { public: diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 985cc3203..a318d643e 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -696,6 +696,13 @@ std::unique_ptr<ComputePipeline> PipelineCache::CreateComputePipeline( std::unique_ptr<ComputePipeline> PipelineCache::CreateComputePipeline( ShaderPools& pools, const ComputePipelineCacheKey& key, Shader::Environment& env, PipelineStatistics* statistics, bool build_in_parallel) try { + // TODO: Remove this when Intel fixes their shader compiler. + // https://github.com/IGCIT/Intel-GPU-Community-Issue-Tracker-IGCIT/issues/159 + if (device.GetDriverID() == VK_DRIVER_ID_INTEL_PROPRIETARY_WINDOWS) { + LOG_ERROR(Render_Vulkan, "Skipping 0x{:016x}", key.Hash()); + return nullptr; + } + LOG_INFO(Render_Vulkan, "0x{:016x}", key.Hash()); Shader::Maxwell::Flow::CFG cfg{env, pools.flow_block, env.StartAddress()}; diff --git a/src/video_core/renderer_vulkan/vk_present_manager.cpp b/src/video_core/renderer_vulkan/vk_present_manager.cpp new file mode 100644 index 000000000..c49583013 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_present_manager.cpp @@ -0,0 +1,457 @@ +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include "common/microprofile.h" +#include "common/settings.h" +#include "common/thread.h" +#include "video_core/renderer_vulkan/vk_present_manager.h" +#include "video_core/renderer_vulkan/vk_scheduler.h" +#include "video_core/renderer_vulkan/vk_swapchain.h" +#include "video_core/vulkan_common/vulkan_device.h" + +namespace Vulkan { + +MICROPROFILE_DEFINE(Vulkan_WaitPresent, "Vulkan", "Wait For Present", MP_RGB(128, 128, 128)); +MICROPROFILE_DEFINE(Vulkan_CopyToSwapchain, "Vulkan", "Copy to swapchain", MP_RGB(192, 255, 192)); + +namespace { + +bool CanBlitToSwapchain(const vk::PhysicalDevice& physical_device, VkFormat format) { + const VkFormatProperties props{physical_device.GetFormatProperties(format)}; + return (props.optimalTilingFeatures & VK_FORMAT_FEATURE_BLIT_DST_BIT); +} + +[[nodiscard]] VkImageSubresourceLayers MakeImageSubresourceLayers() { + return VkImageSubresourceLayers{ + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .mipLevel = 0, + .baseArrayLayer = 0, + .layerCount = 1, + }; +} + +[[nodiscard]] VkImageBlit MakeImageBlit(s32 frame_width, s32 frame_height, s32 swapchain_width, + s32 swapchain_height) { + return VkImageBlit{ + .srcSubresource = MakeImageSubresourceLayers(), + .srcOffsets = + { + { + .x = 0, + .y = 0, + .z = 0, + }, + { + .x = frame_width, + .y = frame_height, + .z = 1, + }, + }, + .dstSubresource = MakeImageSubresourceLayers(), + .dstOffsets = + { + { + .x = 0, + .y = 0, + .z = 0, + }, + { + .x = swapchain_width, + .y = swapchain_height, + .z = 1, + }, + }, + }; +} + +[[nodiscard]] VkImageCopy MakeImageCopy(u32 frame_width, u32 frame_height, u32 swapchain_width, + u32 swapchain_height) { + return VkImageCopy{ + .srcSubresource = MakeImageSubresourceLayers(), + .srcOffset = + { + .x = 0, + .y = 0, + .z = 0, + }, + .dstSubresource = MakeImageSubresourceLayers(), + .dstOffset = + { + .x = 0, + .y = 0, + .z = 0, + }, + .extent = + { + .width = std::min(frame_width, swapchain_width), + .height = std::min(frame_height, swapchain_height), + .depth = 1, + }, + }; +} + +} // Anonymous namespace + +PresentManager::PresentManager(Core::Frontend::EmuWindow& render_window_, const Device& device_, + MemoryAllocator& memory_allocator_, Scheduler& scheduler_, + Swapchain& swapchain_) + : render_window{render_window_}, device{device_}, + memory_allocator{memory_allocator_}, scheduler{scheduler_}, swapchain{swapchain_}, + blit_supported{CanBlitToSwapchain(device.GetPhysical(), swapchain.GetImageViewFormat())}, + use_present_thread{Settings::values.async_presentation.GetValue()}, + image_count{swapchain.GetImageCount()} { + + auto& dld = device.GetLogical(); + cmdpool = dld.CreateCommandPool({ + .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, + .pNext = nullptr, + .flags = + VK_COMMAND_POOL_CREATE_TRANSIENT_BIT | VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, + .queueFamilyIndex = device.GetGraphicsFamily(), + }); + auto cmdbuffers = cmdpool.Allocate(image_count); + + frames.resize(image_count); + for (u32 i = 0; i < frames.size(); i++) { + Frame& frame = frames[i]; + frame.cmdbuf = vk::CommandBuffer{cmdbuffers[i], device.GetDispatchLoader()}; + frame.render_ready = dld.CreateSemaphore({ + .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + }); + frame.present_done = dld.CreateFence({ + .sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO, + .pNext = nullptr, + .flags = VK_FENCE_CREATE_SIGNALED_BIT, + }); + free_queue.push(&frame); + } + + if (use_present_thread) { + present_thread = std::jthread([this](std::stop_token token) { PresentThread(token); }); + } +} + +PresentManager::~PresentManager() = default; + +Frame* PresentManager::GetRenderFrame() { + MICROPROFILE_SCOPE(Vulkan_WaitPresent); + + // Wait for free presentation frames + std::unique_lock lock{free_mutex}; + free_cv.wait(lock, [this] { return !free_queue.empty(); }); + + // Take the frame from the queue + Frame* frame = free_queue.front(); + free_queue.pop(); + + // Wait for the presentation to be finished so all frame resources are free + frame->present_done.Wait(); + frame->present_done.Reset(); + + return frame; +} + +void PresentManager::Present(Frame* frame) { + if (!use_present_thread) { + scheduler.WaitWorker(); + CopyToSwapchain(frame); + free_queue.push(frame); + return; + } + + scheduler.Record([this, frame](vk::CommandBuffer) { + std::unique_lock lock{queue_mutex}; + present_queue.push(frame); + frame_cv.notify_one(); + }); +} + +void PresentManager::RecreateFrame(Frame* frame, u32 width, u32 height, bool is_srgb, + VkFormat image_view_format, VkRenderPass rd) { + auto& dld = device.GetLogical(); + + frame->width = width; + frame->height = height; + frame->is_srgb = is_srgb; + + frame->image = dld.CreateImage({ + .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, + .pNext = nullptr, + .flags = VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT, + .imageType = VK_IMAGE_TYPE_2D, + .format = swapchain.GetImageFormat(), + .extent = + { + .width = width, + .height = height, + .depth = 1, + }, + .mipLevels = 1, + .arrayLayers = 1, + .samples = VK_SAMPLE_COUNT_1_BIT, + .tiling = VK_IMAGE_TILING_OPTIMAL, + .usage = VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, + }); + + frame->image_commit = memory_allocator.Commit(frame->image, MemoryUsage::DeviceLocal); + + frame->image_view = dld.CreateImageView({ + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .image = *frame->image, + .viewType = VK_IMAGE_VIEW_TYPE_2D, + .format = image_view_format, + .components = + { + .r = VK_COMPONENT_SWIZZLE_IDENTITY, + .g = VK_COMPONENT_SWIZZLE_IDENTITY, + .b = VK_COMPONENT_SWIZZLE_IDENTITY, + .a = VK_COMPONENT_SWIZZLE_IDENTITY, + }, + .subresourceRange = + { + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .baseMipLevel = 0, + .levelCount = 1, + .baseArrayLayer = 0, + .layerCount = 1, + }, + }); + + const VkImageView image_view{*frame->image_view}; + frame->framebuffer = dld.CreateFramebuffer({ + .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .renderPass = rd, + .attachmentCount = 1, + .pAttachments = &image_view, + .width = width, + .height = height, + .layers = 1, + }); +} + +void PresentManager::WaitPresent() { + if (!use_present_thread) { + return; + } + + // Wait for the present queue to be empty + { + std::unique_lock queue_lock{queue_mutex}; + frame_cv.wait(queue_lock, [this] { return present_queue.empty(); }); + } + + // The above condition will be satisfied when the last frame is taken from the queue. + // To ensure that frame has been presented as well take hold of the swapchain + // mutex. + std::scoped_lock swapchain_lock{swapchain_mutex}; +} + +void PresentManager::PresentThread(std::stop_token token) { + Common::SetCurrentThreadName("VulkanPresent"); + while (!token.stop_requested()) { + std::unique_lock lock{queue_mutex}; + + // Wait for presentation frames + Common::CondvarWait(frame_cv, lock, token, [this] { return !present_queue.empty(); }); + if (token.stop_requested()) { + return; + } + + // Take the frame and notify anyone waiting + Frame* frame = present_queue.front(); + present_queue.pop(); + frame_cv.notify_one(); + + // By exchanging the lock ownership we take the swapchain lock + // before the queue lock goes out of scope. This way the swapchain + // lock in WaitPresent is guaranteed to occur after here. + std::exchange(lock, std::unique_lock{swapchain_mutex}); + + CopyToSwapchain(frame); + + // Free the frame for reuse + std::scoped_lock fl{free_mutex}; + free_queue.push(frame); + free_cv.notify_one(); + } +} + +void PresentManager::CopyToSwapchain(Frame* frame) { + MICROPROFILE_SCOPE(Vulkan_CopyToSwapchain); + + const auto recreate_swapchain = [&] { + swapchain.Create(frame->width, frame->height, frame->is_srgb); + image_count = swapchain.GetImageCount(); + }; + + // If the size or colorspace of the incoming frames has changed, recreate the swapchain + // to account for that. + const bool srgb_changed = swapchain.NeedsRecreation(frame->is_srgb); + const bool size_changed = + swapchain.GetWidth() != frame->width || swapchain.GetHeight() != frame->height; + if (srgb_changed || size_changed) { + recreate_swapchain(); + } + + while (swapchain.AcquireNextImage()) { + recreate_swapchain(); + } + + const vk::CommandBuffer cmdbuf{frame->cmdbuf}; + cmdbuf.Begin({ + .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, + .pNext = nullptr, + .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT, + .pInheritanceInfo = nullptr, + }); + + const VkImage image{swapchain.CurrentImage()}; + const VkExtent2D extent = swapchain.GetExtent(); + const std::array pre_barriers{ + VkImageMemoryBarrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = 0, + .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .oldLayout = VK_IMAGE_LAYOUT_UNDEFINED, + .newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = image, + .subresourceRange{ + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .baseMipLevel = 0, + .levelCount = 1, + .baseArrayLayer = 0, + .layerCount = VK_REMAINING_ARRAY_LAYERS, + }, + }, + VkImageMemoryBarrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT, + .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT, + .oldLayout = VK_IMAGE_LAYOUT_GENERAL, + .newLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = *frame->image, + .subresourceRange{ + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .baseMipLevel = 0, + .levelCount = 1, + .baseArrayLayer = 0, + .layerCount = VK_REMAINING_ARRAY_LAYERS, + }, + }, + }; + const std::array post_barriers{ + VkImageMemoryBarrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT, + .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + .newLayout = VK_IMAGE_LAYOUT_PRESENT_SRC_KHR, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = image, + .subresourceRange{ + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .baseMipLevel = 0, + .levelCount = 1, + .baseArrayLayer = 0, + .layerCount = VK_REMAINING_ARRAY_LAYERS, + }, + }, + VkImageMemoryBarrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFER_READ_BIT, + .dstAccessMask = VK_ACCESS_MEMORY_WRITE_BIT, + .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, + .newLayout = VK_IMAGE_LAYOUT_GENERAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = *frame->image, + .subresourceRange{ + .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, + .baseMipLevel = 0, + .levelCount = 1, + .baseArrayLayer = 0, + .layerCount = VK_REMAINING_ARRAY_LAYERS, + }, + }, + }; + + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, {}, + {}, {}, pre_barriers); + + if (blit_supported) { + cmdbuf.BlitImage(*frame->image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, image, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + MakeImageBlit(frame->width, frame->height, extent.width, extent.height), + VK_FILTER_LINEAR); + } else { + cmdbuf.CopyImage(*frame->image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, image, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + MakeImageCopy(frame->width, frame->height, extent.width, extent.height)); + } + + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT, {}, + {}, {}, post_barriers); + + cmdbuf.End(); + + const VkSemaphore present_semaphore = swapchain.CurrentPresentSemaphore(); + const VkSemaphore render_semaphore = swapchain.CurrentRenderSemaphore(); + const std::array wait_semaphores = {present_semaphore, *frame->render_ready}; + + static constexpr std::array<VkPipelineStageFlags, 2> wait_stage_masks{ + VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, + VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + }; + + const VkSubmitInfo submit_info{ + .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO, + .pNext = nullptr, + .waitSemaphoreCount = 2U, + .pWaitSemaphores = wait_semaphores.data(), + .pWaitDstStageMask = wait_stage_masks.data(), + .commandBufferCount = 1, + .pCommandBuffers = cmdbuf.address(), + .signalSemaphoreCount = 1U, + .pSignalSemaphores = &render_semaphore, + }; + + // Submit the image copy/blit to the swapchain + { + std::scoped_lock lock{scheduler.submit_mutex}; + switch (const VkResult result = + device.GetGraphicsQueue().Submit(submit_info, *frame->present_done)) { + case VK_SUCCESS: + break; + case VK_ERROR_DEVICE_LOST: + device.ReportLoss(); + [[fallthrough]]; + default: + vk::Check(result); + break; + } + } + + // Present + swapchain.Present(render_semaphore); +} + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_present_manager.h b/src/video_core/renderer_vulkan/vk_present_manager.h new file mode 100644 index 000000000..420a775e2 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_present_manager.h @@ -0,0 +1,83 @@ +// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#pragma once + +#include <condition_variable> +#include <mutex> +#include <queue> + +#include "common/common_types.h" +#include "common/polyfill_thread.h" +#include "video_core/vulkan_common/vulkan_memory_allocator.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" + +namespace Core::Frontend { +class EmuWindow; +} // namespace Core::Frontend + +namespace Vulkan { + +class Device; +class Scheduler; +class Swapchain; + +struct Frame { + u32 width; + u32 height; + bool is_srgb; + vk::Image image; + vk::ImageView image_view; + vk::Framebuffer framebuffer; + MemoryCommit image_commit; + vk::CommandBuffer cmdbuf; + vk::Semaphore render_ready; + vk::Fence present_done; +}; + +class PresentManager { +public: + PresentManager(Core::Frontend::EmuWindow& render_window, const Device& device, + MemoryAllocator& memory_allocator, Scheduler& scheduler, Swapchain& swapchain); + ~PresentManager(); + + /// Returns the last used presentation frame + Frame* GetRenderFrame(); + + /// Pushes a frame for presentation + void Present(Frame* frame); + + /// Recreates the present frame to match the provided parameters + void RecreateFrame(Frame* frame, u32 width, u32 height, bool is_srgb, + VkFormat image_view_format, VkRenderPass rd); + + /// Waits for the present thread to finish presenting all queued frames. + void WaitPresent(); + +private: + void PresentThread(std::stop_token token); + + void CopyToSwapchain(Frame* frame); + +private: + Core::Frontend::EmuWindow& render_window; + const Device& device; + MemoryAllocator& memory_allocator; + Scheduler& scheduler; + Swapchain& swapchain; + vk::CommandPool cmdpool; + std::vector<Frame> frames; + std::queue<Frame*> present_queue; + std::queue<Frame*> free_queue; + std::condition_variable_any frame_cv; + std::condition_variable free_cv; + std::mutex swapchain_mutex; + std::mutex queue_mutex; + std::mutex free_mutex; + std::jthread present_thread; + bool blit_supported; + bool use_present_thread; + std::size_t image_count; +}; + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_query_cache.cpp b/src/video_core/renderer_vulkan/vk_query_cache.cpp index 929c8ece6..d67490449 100644 --- a/src/video_core/renderer_vulkan/vk_query_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_query_cache.cpp @@ -66,9 +66,10 @@ void QueryPool::Reserve(std::pair<VkQueryPool, u32> query) { } } -QueryCache::QueryCache(VideoCore::RasterizerInterface& rasterizer_, const Device& device_, +QueryCache::QueryCache(VideoCore::RasterizerInterface& rasterizer_, + Core::Memory::Memory& cpu_memory_, const Device& device_, Scheduler& scheduler_) - : QueryCacheBase{rasterizer_}, device{device_}, scheduler{scheduler_}, + : QueryCacheBase{rasterizer_, cpu_memory_}, device{device_}, scheduler{scheduler_}, query_pools{ QueryPool{device_, scheduler_, QueryType::SamplesPassed}, } {} @@ -98,8 +99,10 @@ HostCounter::HostCounter(QueryCache& cache_, std::shared_ptr<HostCounter> depend query{cache_.AllocateQuery(type_)}, tick{cache_.GetScheduler().CurrentTick()} { const vk::Device* logical = &cache.GetDevice().GetLogical(); cache.GetScheduler().Record([logical, query = query](vk::CommandBuffer cmdbuf) { + const bool use_precise = Settings::IsGPULevelHigh(); logical->ResetQueryPool(query.first, query.second, 1); - cmdbuf.BeginQuery(query.first, query.second, VK_QUERY_CONTROL_PRECISE_BIT); + cmdbuf.BeginQuery(query.first, query.second, + use_precise ? VK_QUERY_CONTROL_PRECISE_BIT : 0); }); } @@ -112,8 +115,10 @@ void HostCounter::EndQuery() { [query = query](vk::CommandBuffer cmdbuf) { cmdbuf.EndQuery(query.first, query.second); }); } -u64 HostCounter::BlockingQuery() const { - cache.GetScheduler().Wait(tick); +u64 HostCounter::BlockingQuery(bool async) const { + if (!async) { + cache.GetScheduler().Wait(tick); + } u64 data; const VkResult query_result = cache.GetDevice().GetLogical().GetQueryResults( query.first, query.second, 1, sizeof(data), &data, sizeof(data), diff --git a/src/video_core/renderer_vulkan/vk_query_cache.h b/src/video_core/renderer_vulkan/vk_query_cache.h index 26762ee09..c1b9552eb 100644 --- a/src/video_core/renderer_vulkan/vk_query_cache.h +++ b/src/video_core/renderer_vulkan/vk_query_cache.h @@ -52,7 +52,8 @@ private: class QueryCache final : public VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream, HostCounter> { public: - explicit QueryCache(VideoCore::RasterizerInterface& rasterizer_, const Device& device_, + explicit QueryCache(VideoCore::RasterizerInterface& rasterizer_, + Core::Memory::Memory& cpu_memory_, const Device& device_, Scheduler& scheduler_); ~QueryCache(); @@ -83,7 +84,7 @@ public: void EndQuery(); private: - u64 BlockingQuery() const override; + u64 BlockingQuery(bool async = false) const override; QueryCache& cache; const VideoCore::QueryType type; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 2559a3aa7..d1489fc95 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -172,7 +172,8 @@ RasterizerVulkan::RasterizerVulkan(Core::Frontend::EmuWindow& emu_window_, Tegra buffer_cache(*this, cpu_memory_, buffer_cache_runtime), pipeline_cache(*this, device, scheduler, descriptor_pool, update_descriptor_queue, render_pass_cache, buffer_cache, texture_cache, gpu.ShaderNotify()), - query_cache{*this, device, scheduler}, accelerate_dma(buffer_cache, texture_cache, scheduler), + query_cache{*this, cpu_memory_, device, scheduler}, + accelerate_dma(buffer_cache, texture_cache, scheduler), fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache, device, scheduler), wfi_event(device.GetLogical().CreateEvent()) { scheduler.SetQueryCache(query_cache); @@ -675,7 +676,8 @@ bool RasterizerVulkan::AccelerateConditionalRendering() { const GPUVAddr condition_address{maxwell3d->regs.render_enable.Address()}; Maxwell::ReportSemaphore::Compare cmp; if (gpu_memory->IsMemoryDirty(condition_address, sizeof(cmp), - VideoCommon::CacheType::BufferCache)) { + VideoCommon::CacheType::BufferCache | + VideoCommon::CacheType::QueryCache)) { return true; } return false; diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index 057e16967..80455ec08 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp @@ -46,10 +46,11 @@ Scheduler::Scheduler(const Device& device_, StateTracker& state_tracker_) Scheduler::~Scheduler() = default; -void Scheduler::Flush(VkSemaphore signal_semaphore, VkSemaphore wait_semaphore) { +u64 Scheduler::Flush(VkSemaphore signal_semaphore, VkSemaphore wait_semaphore) { // When flushing, we only send data to the worker thread; no waiting is necessary. - SubmitExecution(signal_semaphore, wait_semaphore); + const u64 signal_value = SubmitExecution(signal_semaphore, wait_semaphore); AllocateNewContext(); + return signal_value; } void Scheduler::Finish(VkSemaphore signal_semaphore, VkSemaphore wait_semaphore) { @@ -205,7 +206,7 @@ void Scheduler::AllocateWorkerCommandBuffer() { }); } -void Scheduler::SubmitExecution(VkSemaphore signal_semaphore, VkSemaphore wait_semaphore) { +u64 Scheduler::SubmitExecution(VkSemaphore signal_semaphore, VkSemaphore wait_semaphore) { EndPendingOperations(); InvalidateState(); @@ -217,6 +218,7 @@ void Scheduler::SubmitExecution(VkSemaphore signal_semaphore, VkSemaphore wait_s on_submit(); } + std::scoped_lock lock{submit_mutex}; switch (const VkResult result = master_semaphore->SubmitQueue( cmdbuf, signal_semaphore, wait_semaphore, signal_value)) { case VK_SUCCESS: @@ -231,6 +233,7 @@ void Scheduler::SubmitExecution(VkSemaphore signal_semaphore, VkSemaphore wait_s }); chunk->MarkSubmit(); DispatchWork(); + return signal_value; } void Scheduler::AllocateNewContext() { diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h index 8d75ce987..475c682eb 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_scheduler.h @@ -34,7 +34,7 @@ public: ~Scheduler(); /// Sends the current execution context to the GPU. - void Flush(VkSemaphore signal_semaphore = nullptr, VkSemaphore wait_semaphore = nullptr); + u64 Flush(VkSemaphore signal_semaphore = nullptr, VkSemaphore wait_semaphore = nullptr); /// Sends the current execution context to the GPU and waits for it to complete. void Finish(VkSemaphore signal_semaphore = nullptr, VkSemaphore wait_semaphore = nullptr); @@ -106,6 +106,8 @@ public: return *master_semaphore; } + std::mutex submit_mutex; + private: class Command { public: @@ -201,7 +203,7 @@ private: void AllocateWorkerCommandBuffer(); - void SubmitExecution(VkSemaphore signal_semaphore, VkSemaphore wait_semaphore); + u64 SubmitExecution(VkSemaphore signal_semaphore, VkSemaphore wait_semaphore); void AllocateNewContext(); diff --git a/src/video_core/renderer_vulkan/vk_swapchain.cpp b/src/video_core/renderer_vulkan/vk_swapchain.cpp index b1465e35c..23bbea7f1 100644 --- a/src/video_core/renderer_vulkan/vk_swapchain.cpp +++ b/src/video_core/renderer_vulkan/vk_swapchain.cpp @@ -99,18 +99,16 @@ void Swapchain::Create(u32 width_, u32 height_, bool srgb) { return; } - device.GetLogical().WaitIdle(); Destroy(); CreateSwapchain(capabilities, srgb); CreateSemaphores(); - CreateImageViews(); resource_ticks.clear(); resource_ticks.resize(image_count); } -void Swapchain::AcquireNextImage() { +bool Swapchain::AcquireNextImage() { const VkResult result = device.GetLogical().AcquireNextImageKHR( *swapchain, std::numeric_limits<u64>::max(), *present_semaphores[frame_index], VK_NULL_HANDLE, &image_index); @@ -127,8 +125,11 @@ void Swapchain::AcquireNextImage() { LOG_ERROR(Render_Vulkan, "vkAcquireNextImageKHR returned {}", vk::ToString(result)); break; } + scheduler.Wait(resource_ticks[image_index]); resource_ticks[image_index] = scheduler.CurrentTick(); + + return is_suboptimal || is_outdated; } void Swapchain::Present(VkSemaphore render_semaphore) { @@ -143,6 +144,7 @@ void Swapchain::Present(VkSemaphore render_semaphore) { .pImageIndices = &image_index, .pResults = nullptr, }; + std::scoped_lock lock{scheduler.submit_mutex}; switch (const VkResult result = present_queue.Present(present_info)) { case VK_SUCCESS: break; @@ -168,7 +170,7 @@ void Swapchain::CreateSwapchain(const VkSurfaceCapabilitiesKHR& capabilities, bo const auto present_modes{physical_device.GetSurfacePresentModesKHR(surface)}; const VkCompositeAlphaFlagBitsKHR alpha_flags{ChooseAlphaFlags(capabilities)}; - const VkSurfaceFormatKHR surface_format{ChooseSwapSurfaceFormat(formats)}; + surface_format = ChooseSwapSurfaceFormat(formats); present_mode = ChooseSwapPresentMode(present_modes); u32 requested_image_count{capabilities.minImageCount + 1}; @@ -193,7 +195,7 @@ void Swapchain::CreateSwapchain(const VkSurfaceCapabilitiesKHR& capabilities, bo .imageColorSpace = surface_format.colorSpace, .imageExtent = {}, .imageArrayLayers = 1, - .imageUsage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT, + .imageUsage = VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT, .imageSharingMode = VK_SHARING_MODE_EXCLUSIVE, .queueFamilyIndexCount = 0, .pQueueFamilyIndices = nullptr, @@ -241,45 +243,14 @@ void Swapchain::CreateSemaphores() { present_semaphores.resize(image_count); std::ranges::generate(present_semaphores, [this] { return device.GetLogical().CreateSemaphore(); }); -} - -void Swapchain::CreateImageViews() { - VkImageViewCreateInfo ci{ - .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, - .pNext = nullptr, - .flags = 0, - .image = {}, - .viewType = VK_IMAGE_VIEW_TYPE_2D, - .format = image_view_format, - .components = - { - .r = VK_COMPONENT_SWIZZLE_IDENTITY, - .g = VK_COMPONENT_SWIZZLE_IDENTITY, - .b = VK_COMPONENT_SWIZZLE_IDENTITY, - .a = VK_COMPONENT_SWIZZLE_IDENTITY, - }, - .subresourceRange = - { - .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT, - .baseMipLevel = 0, - .levelCount = 1, - .baseArrayLayer = 0, - .layerCount = 1, - }, - }; - - image_views.resize(image_count); - for (std::size_t i = 0; i < image_count; i++) { - ci.image = images[i]; - image_views[i] = device.GetLogical().CreateImageView(ci); - } + render_semaphores.resize(image_count); + std::ranges::generate(render_semaphores, + [this] { return device.GetLogical().CreateSemaphore(); }); } void Swapchain::Destroy() { frame_index = 0; present_semaphores.clear(); - framebuffers.clear(); - image_views.clear(); swapchain.reset(); } diff --git a/src/video_core/renderer_vulkan/vk_swapchain.h b/src/video_core/renderer_vulkan/vk_swapchain.h index caf1ff32b..419742586 100644 --- a/src/video_core/renderer_vulkan/vk_swapchain.h +++ b/src/video_core/renderer_vulkan/vk_swapchain.h @@ -27,7 +27,7 @@ public: void Create(u32 width, u32 height, bool srgb); /// Acquires the next image in the swapchain, waits as needed. - void AcquireNextImage(); + bool AcquireNextImage(); /// Presents the rendered image to the swapchain. void Present(VkSemaphore render_semaphore); @@ -52,6 +52,11 @@ public: return is_suboptimal; } + /// Returns true when the swapchain format is in the srgb color space + bool IsSrgb() const { + return current_srgb; + } + VkExtent2D GetSize() const { return extent; } @@ -64,22 +69,34 @@ public: return image_index; } + std::size_t GetFrameIndex() const { + return frame_index; + } + VkImage GetImageIndex(std::size_t index) const { return images[index]; } - VkImageView GetImageViewIndex(std::size_t index) const { - return *image_views[index]; + VkImage CurrentImage() const { + return images[image_index]; } VkFormat GetImageViewFormat() const { return image_view_format; } + VkFormat GetImageFormat() const { + return surface_format.format; + } + VkSemaphore CurrentPresentSemaphore() const { return *present_semaphores[frame_index]; } + VkSemaphore CurrentRenderSemaphore() const { + return *render_semaphores[frame_index]; + } + u32 GetWidth() const { return width; } @@ -88,6 +105,10 @@ public: return height; } + VkExtent2D GetExtent() const { + return extent; + } + private: void CreateSwapchain(const VkSurfaceCapabilitiesKHR& capabilities, bool srgb); void CreateSemaphores(); @@ -107,10 +128,9 @@ private: std::size_t image_count{}; std::vector<VkImage> images; - std::vector<vk::ImageView> image_views; - std::vector<vk::Framebuffer> framebuffers; std::vector<u64> resource_ticks; std::vector<vk::Semaphore> present_semaphores; + std::vector<vk::Semaphore> render_semaphores; u32 width; u32 height; @@ -121,6 +141,7 @@ private: VkFormat image_view_format{}; VkExtent2D extent{}; VkPresentModeKHR present_mode{}; + VkSurfaceFormatKHR surface_format{}; bool current_srgb{}; bool current_fps_unlocked{}; diff --git a/src/video_core/renderer_vulkan/vk_update_descriptor.cpp b/src/video_core/renderer_vulkan/vk_update_descriptor.cpp index 009dab0b6..0630ebda5 100644 --- a/src/video_core/renderer_vulkan/vk_update_descriptor.cpp +++ b/src/video_core/renderer_vulkan/vk_update_descriptor.cpp @@ -14,13 +14,18 @@ namespace Vulkan { UpdateDescriptorQueue::UpdateDescriptorQueue(const Device& device_, Scheduler& scheduler_) : device{device_}, scheduler{scheduler_} { + payload_start = payload.data(); payload_cursor = payload.data(); } UpdateDescriptorQueue::~UpdateDescriptorQueue() = default; void UpdateDescriptorQueue::TickFrame() { - payload_cursor = payload.data(); + if (++frame_index >= FRAMES_IN_FLIGHT) { + frame_index = 0; + } + payload_start = payload.data() + frame_index * FRAME_PAYLOAD_SIZE; + payload_cursor = payload_start; } void UpdateDescriptorQueue::Acquire() { @@ -28,10 +33,10 @@ void UpdateDescriptorQueue::Acquire() { // This is the maximum number of entries a single draw call might use. static constexpr size_t MIN_ENTRIES = 0x400; - if (std::distance(payload.data(), payload_cursor) + MIN_ENTRIES >= payload.max_size()) { + if (std::distance(payload_start, payload_cursor) + MIN_ENTRIES >= FRAME_PAYLOAD_SIZE) { LOG_WARNING(Render_Vulkan, "Payload overflow, waiting for worker thread"); scheduler.WaitWorker(); - payload_cursor = payload.data(); + payload_cursor = payload_start; } upload_start = payload_cursor; } diff --git a/src/video_core/renderer_vulkan/vk_update_descriptor.h b/src/video_core/renderer_vulkan/vk_update_descriptor.h index 625bcc809..1c1a7020b 100644 --- a/src/video_core/renderer_vulkan/vk_update_descriptor.h +++ b/src/video_core/renderer_vulkan/vk_update_descriptor.h @@ -29,6 +29,12 @@ struct DescriptorUpdateEntry { }; class UpdateDescriptorQueue final { + // This should be plenty for the vast majority of cases. Most desktop platforms only + // provide up to 3 swapchain images. + static constexpr size_t FRAMES_IN_FLIGHT = 5; + static constexpr size_t FRAME_PAYLOAD_SIZE = 0x10000; + static constexpr size_t PAYLOAD_SIZE = FRAME_PAYLOAD_SIZE * FRAMES_IN_FLIGHT; + public: explicit UpdateDescriptorQueue(const Device& device_, Scheduler& scheduler_); ~UpdateDescriptorQueue(); @@ -73,9 +79,11 @@ private: const Device& device; Scheduler& scheduler; + size_t frame_index{0}; DescriptorUpdateEntry* payload_cursor = nullptr; + DescriptorUpdateEntry* payload_start = nullptr; const DescriptorUpdateEntry* upload_start = nullptr; - std::array<DescriptorUpdateEntry, 0x10000> payload; + std::array<DescriptorUpdateEntry, PAYLOAD_SIZE> payload; }; } // namespace Vulkan diff --git a/src/video_core/shader_cache.cpp b/src/video_core/shader_cache.cpp index d9482371b..c5213875b 100644 --- a/src/video_core/shader_cache.cpp +++ b/src/video_core/shader_cache.cpp @@ -228,14 +228,14 @@ const ShaderInfo* ShaderCache::MakeShaderInfo(GenericEnvironment& env, VAddr cpu auto info = std::make_unique<ShaderInfo>(); if (const std::optional<u64> cached_hash{env.Analyze()}) { info->unique_hash = *cached_hash; - info->size_bytes = env.CachedSize(); + info->size_bytes = env.CachedSizeBytes(); } else { // Slow path, not really hit on commercial games // Build a control flow graph to get the real shader size Shader::ObjectPool<Shader::Maxwell::Flow::Block> flow_block; Shader::Maxwell::Flow::CFG cfg{env, flow_block, env.StartAddress()}; info->unique_hash = env.CalculateHash(); - info->size_bytes = env.ReadSize(); + info->size_bytes = env.ReadSizeBytes(); } const size_t size_bytes{info->size_bytes}; const ShaderInfo* const result{info.get()}; diff --git a/src/video_core/shader_environment.cpp b/src/video_core/shader_environment.cpp index 574760f80..c7cb56243 100644 --- a/src/video_core/shader_environment.cpp +++ b/src/video_core/shader_environment.cpp @@ -170,15 +170,19 @@ std::optional<u64> GenericEnvironment::Analyze() { void GenericEnvironment::SetCachedSize(size_t size_bytes) { cached_lowest = start_address; cached_highest = start_address + static_cast<u32>(size_bytes); - code.resize(CachedSize()); + code.resize(CachedSizeWords()); gpu_memory->ReadBlock(program_base + cached_lowest, code.data(), code.size() * sizeof(u64)); } -size_t GenericEnvironment::CachedSize() const noexcept { - return cached_highest - cached_lowest + INST_SIZE; +size_t GenericEnvironment::CachedSizeWords() const noexcept { + return CachedSizeBytes() / INST_SIZE; } -size_t GenericEnvironment::ReadSize() const noexcept { +size_t GenericEnvironment::CachedSizeBytes() const noexcept { + return static_cast<size_t>(cached_highest) - cached_lowest + INST_SIZE; +} + +size_t GenericEnvironment::ReadSizeBytes() const noexcept { return read_highest - read_lowest + INST_SIZE; } @@ -187,7 +191,7 @@ bool GenericEnvironment::CanBeSerialized() const noexcept { } u64 GenericEnvironment::CalculateHash() const { - const size_t size{ReadSize()}; + const size_t size{ReadSizeBytes()}; const auto data{std::make_unique<char[]>(size)}; gpu_memory->ReadBlock(program_base + read_lowest, data.get(), size); return Common::CityHash64(data.get(), size); @@ -198,7 +202,7 @@ void GenericEnvironment::Dump(u64 hash) { } void GenericEnvironment::Serialize(std::ofstream& file) const { - const u64 code_size{static_cast<u64>(CachedSize())}; + const u64 code_size{static_cast<u64>(CachedSizeBytes())}; const u64 num_texture_types{static_cast<u64>(texture_types.size())}; const u64 num_texture_pixel_formats{static_cast<u64>(texture_pixel_formats.size())}; const u64 num_cbuf_values{static_cast<u64>(cbuf_values.size())}; diff --git a/src/video_core/shader_environment.h b/src/video_core/shader_environment.h index d75987a52..a0f61cbda 100644 --- a/src/video_core/shader_environment.h +++ b/src/video_core/shader_environment.h @@ -48,9 +48,11 @@ public: void SetCachedSize(size_t size_bytes); - [[nodiscard]] size_t CachedSize() const noexcept; + [[nodiscard]] size_t CachedSizeWords() const noexcept; - [[nodiscard]] size_t ReadSize() const noexcept; + [[nodiscard]] size_t CachedSizeBytes() const noexcept; + + [[nodiscard]] size_t ReadSizeBytes() const noexcept; [[nodiscard]] bool CanBeSerialized() const noexcept; diff --git a/src/video_core/surface.cpp b/src/video_core/surface.cpp index 1a76d4178..cb51529e4 100644 --- a/src/video_core/surface.cpp +++ b/src/video_core/surface.cpp @@ -250,10 +250,13 @@ bool IsPixelFormatASTC(PixelFormat format) { case PixelFormat::ASTC_2D_6X6_UNORM: case PixelFormat::ASTC_2D_6X6_SRGB: case PixelFormat::ASTC_2D_10X6_UNORM: + case PixelFormat::ASTC_2D_10X6_SRGB: case PixelFormat::ASTC_2D_10X5_UNORM: case PixelFormat::ASTC_2D_10X5_SRGB: case PixelFormat::ASTC_2D_10X10_UNORM: case PixelFormat::ASTC_2D_10X10_SRGB: + case PixelFormat::ASTC_2D_12X10_UNORM: + case PixelFormat::ASTC_2D_12X10_SRGB: case PixelFormat::ASTC_2D_12X12_UNORM: case PixelFormat::ASTC_2D_12X12_SRGB: case PixelFormat::ASTC_2D_8X6_UNORM: @@ -279,11 +282,13 @@ bool IsPixelFormatSRGB(PixelFormat format) { case PixelFormat::ASTC_2D_8X5_SRGB: case PixelFormat::ASTC_2D_5X4_SRGB: case PixelFormat::ASTC_2D_5X5_SRGB: + case PixelFormat::ASTC_2D_10X6_SRGB: case PixelFormat::ASTC_2D_10X8_SRGB: case PixelFormat::ASTC_2D_6X6_SRGB: case PixelFormat::ASTC_2D_10X5_SRGB: case PixelFormat::ASTC_2D_10X10_SRGB: case PixelFormat::ASTC_2D_12X12_SRGB: + case PixelFormat::ASTC_2D_12X10_SRGB: case PixelFormat::ASTC_2D_8X6_SRGB: case PixelFormat::ASTC_2D_6X5_SRGB: return true; diff --git a/src/video_core/surface.h b/src/video_core/surface.h index 44b79af20..0225d3287 100644 --- a/src/video_core/surface.h +++ b/src/video_core/surface.h @@ -95,10 +95,13 @@ enum class PixelFormat { ASTC_2D_6X6_UNORM, ASTC_2D_6X6_SRGB, ASTC_2D_10X6_UNORM, + ASTC_2D_10X6_SRGB, ASTC_2D_10X5_UNORM, ASTC_2D_10X5_SRGB, ASTC_2D_10X10_UNORM, ASTC_2D_10X10_SRGB, + ASTC_2D_12X10_UNORM, + ASTC_2D_12X10_SRGB, ASTC_2D_12X12_UNORM, ASTC_2D_12X12_SRGB, ASTC_2D_8X6_UNORM, @@ -232,10 +235,13 @@ constexpr std::array<u8, MaxPixelFormat> BLOCK_WIDTH_TABLE = {{ 6, // ASTC_2D_6X6_UNORM 6, // ASTC_2D_6X6_SRGB 10, // ASTC_2D_10X6_UNORM + 10, // ASTC_2D_10X6_SRGB 10, // ASTC_2D_10X5_UNORM 10, // ASTC_2D_10X5_SRGB 10, // ASTC_2D_10X10_UNORM 10, // ASTC_2D_10X10_SRGB + 12, // ASTC_2D_12X10_UNORM + 12, // ASTC_2D_12X10_SRGB 12, // ASTC_2D_12X12_UNORM 12, // ASTC_2D_12X12_SRGB 8, // ASTC_2D_8X6_UNORM @@ -338,10 +344,13 @@ constexpr std::array<u8, MaxPixelFormat> BLOCK_HEIGHT_TABLE = {{ 6, // ASTC_2D_6X6_UNORM 6, // ASTC_2D_6X6_SRGB 6, // ASTC_2D_10X6_UNORM + 6, // ASTC_2D_10X6_SRGB 5, // ASTC_2D_10X5_UNORM 5, // ASTC_2D_10X5_SRGB 10, // ASTC_2D_10X10_UNORM 10, // ASTC_2D_10X10_SRGB + 10, // ASTC_2D_12X10_UNORM + 10, // ASTC_2D_12X10_SRGB 12, // ASTC_2D_12X12_UNORM 12, // ASTC_2D_12X12_SRGB 6, // ASTC_2D_8X6_UNORM @@ -444,10 +453,13 @@ constexpr std::array<u8, MaxPixelFormat> BITS_PER_BLOCK_TABLE = {{ 128, // ASTC_2D_6X6_UNORM 128, // ASTC_2D_6X6_SRGB 128, // ASTC_2D_10X6_UNORM + 128, // ASTC_2D_10X6_SRGB 128, // ASTC_2D_10X5_UNORM 128, // ASTC_2D_10X5_SRGB 128, // ASTC_2D_10X10_UNORM 128, // ASTC_2D_10X10_SRGB + 128, // ASTC_2D_12X10_UNORM + 128, // ASTC_2D_12X10_SRGB 128, // ASTC_2D_12X12_UNORM 128, // ASTC_2D_12X12_SRGB 128, // ASTC_2D_8X6_UNORM diff --git a/src/video_core/texture_cache/format_lookup_table.cpp b/src/video_core/texture_cache/format_lookup_table.cpp index 5fc2b2fec..11ced6c38 100644 --- a/src/video_core/texture_cache/format_lookup_table.cpp +++ b/src/video_core/texture_cache/format_lookup_table.cpp @@ -210,6 +210,8 @@ PixelFormat PixelFormatFromTextureInfo(TextureFormat format, ComponentType red, return PixelFormat::ASTC_2D_6X6_SRGB; case Hash(TextureFormat::ASTC_2D_10X6, UNORM, LINEAR): return PixelFormat::ASTC_2D_10X6_UNORM; + case Hash(TextureFormat::ASTC_2D_10X6, UNORM, SRGB): + return PixelFormat::ASTC_2D_10X6_SRGB; case Hash(TextureFormat::ASTC_2D_10X5, UNORM, LINEAR): return PixelFormat::ASTC_2D_10X5_UNORM; case Hash(TextureFormat::ASTC_2D_10X5, UNORM, SRGB): @@ -218,6 +220,10 @@ PixelFormat PixelFormatFromTextureInfo(TextureFormat format, ComponentType red, return PixelFormat::ASTC_2D_10X10_UNORM; case Hash(TextureFormat::ASTC_2D_10X10, UNORM, SRGB): return PixelFormat::ASTC_2D_10X10_SRGB; + case Hash(TextureFormat::ASTC_2D_12X10, UNORM, LINEAR): + return PixelFormat::ASTC_2D_12X10_UNORM; + case Hash(TextureFormat::ASTC_2D_12X10, UNORM, SRGB): + return PixelFormat::ASTC_2D_12X10_SRGB; case Hash(TextureFormat::ASTC_2D_12X12, UNORM, LINEAR): return PixelFormat::ASTC_2D_12X12_UNORM; case Hash(TextureFormat::ASTC_2D_12X12, UNORM, SRGB): diff --git a/src/video_core/texture_cache/formatter.h b/src/video_core/texture_cache/formatter.h index f1f0a057b..b97147797 100644 --- a/src/video_core/texture_cache/formatter.h +++ b/src/video_core/texture_cache/formatter.h @@ -179,6 +179,8 @@ struct fmt::formatter<VideoCore::Surface::PixelFormat> : fmt::formatter<fmt::str return "ASTC_2D_6X6_SRGB"; case PixelFormat::ASTC_2D_10X6_UNORM: return "ASTC_2D_10X6_UNORM"; + case PixelFormat::ASTC_2D_10X6_SRGB: + return "ASTC_2D_10X6_SRGB"; case PixelFormat::ASTC_2D_10X5_UNORM: return "ASTC_2D_10X5_UNORM"; case PixelFormat::ASTC_2D_10X5_SRGB: @@ -187,6 +189,10 @@ struct fmt::formatter<VideoCore::Surface::PixelFormat> : fmt::formatter<fmt::str return "ASTC_2D_10X10_UNORM"; case PixelFormat::ASTC_2D_10X10_SRGB: return "ASTC_2D_10X10_SRGB"; + case PixelFormat::ASTC_2D_12X10_UNORM: + return "ASTC_2D_12X10_UNORM"; + case PixelFormat::ASTC_2D_12X10_SRGB: + return "ASTC_2D_12X10_SRGB"; case PixelFormat::ASTC_2D_12X12_UNORM: return "ASTC_2D_12X12_UNORM"; case PixelFormat::ASTC_2D_12X12_SRGB: diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index e601f8446..f335009d0 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -888,7 +888,7 @@ void TextureCache<P>::DownloadImageIntoBuffer(typename TextureCache<P>::Image* i buffer, download_map.buffer, }; - std::array buffer_offsets{ + std::array<u64, 2> buffer_offsets{ buffer_offset, download_map.offset, }; diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp index 6f288b3f8..6ffca2af2 100644 --- a/src/video_core/vulkan_common/vulkan_device.cpp +++ b/src/video_core/vulkan_common/vulkan_device.cpp @@ -617,7 +617,9 @@ bool Device::ShouldBoostClocks() const { const bool is_steam_deck = vendor_id == 0x1002 && device_id == 0x163F; - return validated_driver && !is_steam_deck; + const bool is_debugging = this->HasDebuggingToolAttached(); + + return validated_driver && !is_steam_deck && !is_debugging; } bool Device::GetSuitability(bool requires_swapchain) { diff --git a/src/yuzu/configuration/config.cpp b/src/yuzu/configuration/config.cpp index bb731276e..0131f63e7 100644 --- a/src/yuzu/configuration/config.cpp +++ b/src/yuzu/configuration/config.cpp @@ -497,7 +497,7 @@ void Config::ReadCoreValues() { qt_config->beginGroup(QStringLiteral("Core")); ReadGlobalSetting(Settings::values.use_multi_core); - ReadGlobalSetting(Settings::values.use_extended_memory_layout); + ReadGlobalSetting(Settings::values.use_unsafe_extended_memory_layout); qt_config->endGroup(); } @@ -692,6 +692,7 @@ void Config::ReadRendererValues() { qt_config->beginGroup(QStringLiteral("Renderer")); ReadGlobalSetting(Settings::values.renderer_backend); + ReadGlobalSetting(Settings::values.async_presentation); ReadGlobalSetting(Settings::values.renderer_force_max_clock); ReadGlobalSetting(Settings::values.vulkan_device); ReadGlobalSetting(Settings::values.fullscreen_mode); @@ -712,7 +713,6 @@ void Config::ReadRendererValues() { ReadGlobalSetting(Settings::values.shader_backend); ReadGlobalSetting(Settings::values.use_asynchronous_shaders); ReadGlobalSetting(Settings::values.use_fast_gpu_time); - ReadGlobalSetting(Settings::values.use_pessimistic_flushes); ReadGlobalSetting(Settings::values.use_vulkan_driver_pipeline_cache); ReadGlobalSetting(Settings::values.bg_red); ReadGlobalSetting(Settings::values.bg_green); @@ -1161,7 +1161,7 @@ void Config::SaveCoreValues() { qt_config->beginGroup(QStringLiteral("Core")); WriteGlobalSetting(Settings::values.use_multi_core); - WriteGlobalSetting(Settings::values.use_extended_memory_layout); + WriteGlobalSetting(Settings::values.use_unsafe_extended_memory_layout); qt_config->endGroup(); } @@ -1313,6 +1313,7 @@ void Config::SaveRendererValues() { static_cast<u32>(Settings::values.renderer_backend.GetValue(global)), static_cast<u32>(Settings::values.renderer_backend.GetDefault()), Settings::values.renderer_backend.UsingGlobal()); + WriteGlobalSetting(Settings::values.async_presentation); WriteGlobalSetting(Settings::values.renderer_force_max_clock); WriteGlobalSetting(Settings::values.vulkan_device); WriteSetting(QString::fromStdString(Settings::values.fullscreen_mode.GetLabel()), @@ -1357,7 +1358,6 @@ void Config::SaveRendererValues() { Settings::values.shader_backend.UsingGlobal()); WriteGlobalSetting(Settings::values.use_asynchronous_shaders); WriteGlobalSetting(Settings::values.use_fast_gpu_time); - WriteGlobalSetting(Settings::values.use_pessimistic_flushes); WriteGlobalSetting(Settings::values.use_vulkan_driver_pipeline_cache); WriteGlobalSetting(Settings::values.bg_red); WriteGlobalSetting(Settings::values.bg_green); diff --git a/src/yuzu/configuration/configure_general.cpp b/src/yuzu/configuration/configure_general.cpp index 207bcdc4d..26258d744 100644 --- a/src/yuzu/configuration/configure_general.cpp +++ b/src/yuzu/configuration/configure_general.cpp @@ -35,9 +35,6 @@ void ConfigureGeneral::SetConfiguration() { ui->use_multi_core->setEnabled(runtime_lock); ui->use_multi_core->setChecked(Settings::values.use_multi_core.GetValue()); - ui->use_extended_memory_layout->setEnabled(runtime_lock); - ui->use_extended_memory_layout->setChecked( - Settings::values.use_extended_memory_layout.GetValue()); ui->toggle_check_exit->setChecked(UISettings::values.confirm_before_closing.GetValue()); ui->toggle_user_on_boot->setChecked(UISettings::values.select_user_on_boot.GetValue()); @@ -79,9 +76,6 @@ void ConfigureGeneral::ResetDefaults() { void ConfigureGeneral::ApplyConfiguration() { ConfigurationShared::ApplyPerGameSetting(&Settings::values.use_multi_core, ui->use_multi_core, use_multi_core); - ConfigurationShared::ApplyPerGameSetting(&Settings::values.use_extended_memory_layout, - ui->use_extended_memory_layout, - use_extended_memory_layout); if (Settings::IsConfiguringGlobal()) { UISettings::values.confirm_before_closing = ui->toggle_check_exit->isChecked(); @@ -141,9 +135,6 @@ void ConfigureGeneral::SetupPerGameUI() { Settings::values.use_speed_limit, use_speed_limit); ConfigurationShared::SetColoredTristate(ui->use_multi_core, Settings::values.use_multi_core, use_multi_core); - ConfigurationShared::SetColoredTristate(ui->use_extended_memory_layout, - Settings::values.use_extended_memory_layout, - use_extended_memory_layout); connect(ui->toggle_speed_limit, &QCheckBox::clicked, ui->speed_limit, [this]() { ui->speed_limit->setEnabled(ui->toggle_speed_limit->isChecked() && diff --git a/src/yuzu/configuration/configure_general.h b/src/yuzu/configuration/configure_general.h index a090c1a3f..7ff63f425 100644 --- a/src/yuzu/configuration/configure_general.h +++ b/src/yuzu/configuration/configure_general.h @@ -47,7 +47,6 @@ private: ConfigurationShared::CheckState use_speed_limit; ConfigurationShared::CheckState use_multi_core; - ConfigurationShared::CheckState use_extended_memory_layout; const Core::System& system; }; diff --git a/src/yuzu/configuration/configure_general.ui b/src/yuzu/configuration/configure_general.ui index add110bb0..986a1625b 100644 --- a/src/yuzu/configuration/configure_general.ui +++ b/src/yuzu/configuration/configure_general.ui @@ -62,13 +62,6 @@ </widget> </item> <item> - <widget class="QCheckBox" name="use_extended_memory_layout"> - <property name="text"> - <string>Extended memory layout (8GB DRAM)</string> - </property> - </widget> - </item> - <item> <widget class="QCheckBox" name="toggle_check_exit"> <property name="text"> <string>Confirm exit while emulation is running</string> diff --git a/src/yuzu/configuration/configure_graphics_advanced.cpp b/src/yuzu/configuration/configure_graphics_advanced.cpp index 59fb1b334..ddda79983 100644 --- a/src/yuzu/configuration/configure_graphics_advanced.cpp +++ b/src/yuzu/configuration/configure_graphics_advanced.cpp @@ -22,17 +22,18 @@ ConfigureGraphicsAdvanced::~ConfigureGraphicsAdvanced() = default; void ConfigureGraphicsAdvanced::SetConfiguration() { const bool runtime_lock = !system.IsPoweredOn(); ui->use_vsync->setEnabled(runtime_lock); + ui->async_present->setEnabled(runtime_lock); ui->renderer_force_max_clock->setEnabled(runtime_lock); ui->async_astc->setEnabled(runtime_lock); ui->use_asynchronous_shaders->setEnabled(runtime_lock); ui->anisotropic_filtering_combobox->setEnabled(runtime_lock); + ui->async_present->setChecked(Settings::values.async_presentation.GetValue()); ui->renderer_force_max_clock->setChecked(Settings::values.renderer_force_max_clock.GetValue()); ui->use_vsync->setChecked(Settings::values.use_vsync.GetValue()); ui->async_astc->setChecked(Settings::values.async_astc.GetValue()); ui->use_asynchronous_shaders->setChecked(Settings::values.use_asynchronous_shaders.GetValue()); ui->use_fast_gpu_time->setChecked(Settings::values.use_fast_gpu_time.GetValue()); - ui->use_pessimistic_flushes->setChecked(Settings::values.use_pessimistic_flushes.GetValue()); ui->use_vulkan_driver_pipeline_cache->setChecked( Settings::values.use_vulkan_driver_pipeline_cache.GetValue()); @@ -54,6 +55,8 @@ void ConfigureGraphicsAdvanced::SetConfiguration() { void ConfigureGraphicsAdvanced::ApplyConfiguration() { ConfigurationShared::ApplyPerGameSetting(&Settings::values.gpu_accuracy, ui->gpu_accuracy); + ConfigurationShared::ApplyPerGameSetting(&Settings::values.async_presentation, + ui->async_present, async_present); ConfigurationShared::ApplyPerGameSetting(&Settings::values.renderer_force_max_clock, ui->renderer_force_max_clock, renderer_force_max_clock); @@ -67,8 +70,6 @@ void ConfigureGraphicsAdvanced::ApplyConfiguration() { use_asynchronous_shaders); ConfigurationShared::ApplyPerGameSetting(&Settings::values.use_fast_gpu_time, ui->use_fast_gpu_time, use_fast_gpu_time); - ConfigurationShared::ApplyPerGameSetting(&Settings::values.use_pessimistic_flushes, - ui->use_pessimistic_flushes, use_pessimistic_flushes); ConfigurationShared::ApplyPerGameSetting(&Settings::values.use_vulkan_driver_pipeline_cache, ui->use_vulkan_driver_pipeline_cache, use_vulkan_driver_pipeline_cache); @@ -90,6 +91,7 @@ void ConfigureGraphicsAdvanced::SetupPerGameUI() { // Disable if not global (only happens during game) if (Settings::IsConfiguringGlobal()) { ui->gpu_accuracy->setEnabled(Settings::values.gpu_accuracy.UsingGlobal()); + ui->async_present->setEnabled(Settings::values.async_presentation.UsingGlobal()); ui->renderer_force_max_clock->setEnabled( Settings::values.renderer_force_max_clock.UsingGlobal()); ui->use_vsync->setEnabled(Settings::values.use_vsync.UsingGlobal()); @@ -97,8 +99,6 @@ void ConfigureGraphicsAdvanced::SetupPerGameUI() { ui->use_asynchronous_shaders->setEnabled( Settings::values.use_asynchronous_shaders.UsingGlobal()); ui->use_fast_gpu_time->setEnabled(Settings::values.use_fast_gpu_time.UsingGlobal()); - ui->use_pessimistic_flushes->setEnabled( - Settings::values.use_pessimistic_flushes.UsingGlobal()); ui->use_vulkan_driver_pipeline_cache->setEnabled( Settings::values.use_vulkan_driver_pipeline_cache.UsingGlobal()); ui->anisotropic_filtering_combobox->setEnabled( @@ -107,6 +107,8 @@ void ConfigureGraphicsAdvanced::SetupPerGameUI() { return; } + ConfigurationShared::SetColoredTristate(ui->async_present, Settings::values.async_presentation, + async_present); ConfigurationShared::SetColoredTristate(ui->renderer_force_max_clock, Settings::values.renderer_force_max_clock, renderer_force_max_clock); @@ -118,9 +120,6 @@ void ConfigureGraphicsAdvanced::SetupPerGameUI() { use_asynchronous_shaders); ConfigurationShared::SetColoredTristate(ui->use_fast_gpu_time, Settings::values.use_fast_gpu_time, use_fast_gpu_time); - ConfigurationShared::SetColoredTristate(ui->use_pessimistic_flushes, - Settings::values.use_pessimistic_flushes, - use_pessimistic_flushes); ConfigurationShared::SetColoredTristate(ui->use_vulkan_driver_pipeline_cache, Settings::values.use_vulkan_driver_pipeline_cache, use_vulkan_driver_pipeline_cache); diff --git a/src/yuzu/configuration/configure_graphics_advanced.h b/src/yuzu/configuration/configure_graphics_advanced.h index bf1b04749..ff5060957 100644 --- a/src/yuzu/configuration/configure_graphics_advanced.h +++ b/src/yuzu/configuration/configure_graphics_advanced.h @@ -36,12 +36,12 @@ private: std::unique_ptr<Ui::ConfigureGraphicsAdvanced> ui; + ConfigurationShared::CheckState async_present; ConfigurationShared::CheckState renderer_force_max_clock; ConfigurationShared::CheckState use_vsync; ConfigurationShared::CheckState async_astc; ConfigurationShared::CheckState use_asynchronous_shaders; ConfigurationShared::CheckState use_fast_gpu_time; - ConfigurationShared::CheckState use_pessimistic_flushes; ConfigurationShared::CheckState use_vulkan_driver_pipeline_cache; const Core::System& system; diff --git a/src/yuzu/configuration/configure_graphics_advanced.ui b/src/yuzu/configuration/configure_graphics_advanced.ui index a7dbdc18c..1234f695c 100644 --- a/src/yuzu/configuration/configure_graphics_advanced.ui +++ b/src/yuzu/configuration/configure_graphics_advanced.ui @@ -7,7 +7,7 @@ <x>0</x> <y>0</y> <width>404</width> - <height>321</height> + <height>376</height> </rect> </property> <property name="windowTitle"> @@ -70,6 +70,13 @@ </widget> </item> <item> + <widget class="QCheckBox" name="async_present"> + <property name="text"> + <string>Enable asynchronous presentation (Vulkan only)</string> + </property> + </widget> + </item> + <item> <widget class="QCheckBox" name="renderer_force_max_clock"> <property name="toolTip"> <string>Runs work in the background while waiting for graphics commands to keep the GPU from lowering its clock speed.</string> @@ -112,7 +119,7 @@ <item> <widget class="QCheckBox" name="use_fast_gpu_time"> <property name="toolTip"> - <string>Enables Fast GPU Time. This option will force most games to run at their highest native resolution.</string> + <string>Enables Fast GPU Time. This option will force most games to run at their highest native resolution.</string> </property> <property name="text"> <string>Use Fast GPU Time (Hack)</string> @@ -120,19 +127,9 @@ </widget> </item> <item> - <widget class="QCheckBox" name="use_pessimistic_flushes"> - <property name="toolTip"> - <string>Enables pessimistic buffer flushes. This option will force unmodified buffers to be flushed, which can cost performance.</string> - </property> - <property name="text"> - <string>Use pessimistic buffer flushes (Hack)</string> - </property> - </widget> - </item> - <item> <widget class="QCheckBox" name="use_vulkan_driver_pipeline_cache"> <property name="toolTip"> - <string>Enables GPU vendor-specific pipeline cache. This option can improve shader loading time significantly in cases where the Vulkan driver does not store pipeline cache files internally.</string> + <string>Enables GPU vendor-specific pipeline cache. This option can improve shader loading time significantly in cases where the Vulkan driver does not store pipeline cache files internally.</string> </property> <property name="text"> <string>Use Vulkan pipeline cache</string> diff --git a/src/yuzu/configuration/configure_system.cpp b/src/yuzu/configuration/configure_system.cpp index 6af34f793..286ccc5cd 100644 --- a/src/yuzu/configuration/configure_system.cpp +++ b/src/yuzu/configuration/configure_system.cpp @@ -111,6 +111,9 @@ void ConfigureSystem::SetConfiguration() { ui->custom_rtc_edit->setDateTime(QDateTime::fromSecsSinceEpoch(rtc_time)); ui->device_name_edit->setText( QString::fromUtf8(Settings::values.device_name.GetValue().c_str())); + ui->use_unsafe_extended_memory_layout->setEnabled(enabled); + ui->use_unsafe_extended_memory_layout->setChecked( + Settings::values.use_unsafe_extended_memory_layout.GetValue()); if (Settings::IsConfiguringGlobal()) { ui->combo_language->setCurrentIndex(Settings::values.language_index.GetValue()); @@ -160,6 +163,9 @@ void ConfigureSystem::ApplyConfiguration() { ConfigurationShared::ApplyPerGameSetting(&Settings::values.region_index, ui->combo_region); ConfigurationShared::ApplyPerGameSetting(&Settings::values.time_zone_index, ui->combo_time_zone); + ConfigurationShared::ApplyPerGameSetting(&Settings::values.use_unsafe_extended_memory_layout, + ui->use_unsafe_extended_memory_layout, + use_unsafe_extended_memory_layout); if (Settings::IsConfiguringGlobal()) { // Guard if during game and set to game-specific value @@ -215,6 +221,10 @@ void ConfigureSystem::SetupPerGameUI() { Settings::values.rng_seed.GetValue().has_value(), Settings::values.rng_seed.GetValue(true).has_value(), use_rng_seed); + ConfigurationShared::SetColoredTristate(ui->use_unsafe_extended_memory_layout, + Settings::values.use_unsafe_extended_memory_layout, + use_unsafe_extended_memory_layout); + ui->custom_rtc_checkbox->setVisible(false); ui->custom_rtc_edit->setVisible(false); } diff --git a/src/yuzu/configuration/configure_system.h b/src/yuzu/configuration/configure_system.h index ec28724a1..ce1a91601 100644 --- a/src/yuzu/configuration/configure_system.h +++ b/src/yuzu/configuration/configure_system.h @@ -41,6 +41,7 @@ private: bool enabled = false; ConfigurationShared::CheckState use_rng_seed; + ConfigurationShared::CheckState use_unsafe_extended_memory_layout; Core::System& system; }; diff --git a/src/yuzu/configuration/configure_system.ui b/src/yuzu/configuration/configure_system.ui index 9e7bc3b93..e0caecd5e 100644 --- a/src/yuzu/configuration/configure_system.ui +++ b/src/yuzu/configuration/configure_system.ui @@ -478,6 +478,13 @@ </property> </widget> </item> + <item row="7" column="0"> + <widget class="QCheckBox" name="use_unsafe_extended_memory_layout"> + <property name="text"> + <string>Unsafe extended memory layout (8GB DRAM)</string> + </property> + </widget> + </item> </layout> </item> </layout> diff --git a/src/yuzu/main.cpp b/src/yuzu/main.cpp index b79409a68..ba9eece1d 100644 --- a/src/yuzu/main.cpp +++ b/src/yuzu/main.cpp @@ -27,6 +27,7 @@ #include "configuration/configure_input.h" #include "configuration/configure_per_game.h" #include "configuration/configure_tas.h" +#include "core/file_sys/romfs_factory.h" #include "core/file_sys/vfs.h" #include "core/file_sys/vfs_real.h" #include "core/frontend/applets/cabinet.h" @@ -4171,6 +4172,8 @@ void GMainWindow::OnReinitializeKeys(ReinitializeKeyBehavior behavior) { } Core::Crypto::KeyManager& keys = Core::Crypto::KeyManager::Instance(); + bool all_keys_present{true}; + if (keys.BaseDeriveNecessary()) { Core::Crypto::PartitionDataManager pdm{vfs->OpenDirectory("", FileSys::Mode::Read)}; @@ -4195,6 +4198,7 @@ void GMainWindow::OnReinitializeKeys(ReinitializeKeyBehavior behavior) { errors += tr(" - Missing PRODINFO"); } if (!errors.isEmpty()) { + all_keys_present = false; QMessageBox::warning( this, tr("Derivation Components Missing"), tr("Encryption keys are missing. " @@ -4222,11 +4226,40 @@ void GMainWindow::OnReinitializeKeys(ReinitializeKeyBehavior behavior) { system->GetFileSystemController().CreateFactories(*vfs); + if (all_keys_present && !this->CheckSystemArchiveDecryption()) { + LOG_WARNING(Frontend, "Mii model decryption failed"); + QMessageBox::warning( + this, tr("System Archive Decryption Failed"), + tr("Encryption keys failed to decrypt firmware. " + "<br>Please follow <a href='https://yuzu-emu.org/help/quickstart/'>the yuzu " + "quickstart guide</a> to get all your keys, firmware and " + "games.")); + } + if (behavior == ReinitializeKeyBehavior::Warning) { game_list->PopulateAsync(UISettings::values.game_dirs); } } +bool GMainWindow::CheckSystemArchiveDecryption() { + constexpr u64 MiiModelId = 0x0100000000000802; + + auto bis_system = system->GetFileSystemController().GetSystemNANDContents(); + if (!bis_system) { + // Not having system BIS files is not an error. + return true; + } + + auto mii_nca = bis_system->GetEntry(MiiModelId, FileSys::ContentRecordType::Data); + if (!mii_nca) { + // Not having the Mii model is not an error. + return true; + } + + // Return whether we are able to decrypt the RomFS of the Mii model. + return mii_nca->GetRomFS().get() != nullptr; +} + std::optional<u64> GMainWindow::SelectRomFSDumpTarget(const FileSys::ContentProvider& installed, u64 program_id) { const auto dlc_entries = diff --git a/src/yuzu/main.h b/src/yuzu/main.h index 8b5c1d747..3bbc31ada 100644 --- a/src/yuzu/main.h +++ b/src/yuzu/main.h @@ -392,6 +392,7 @@ private: void LoadTranslation(); void OpenPerGameConfiguration(u64 title_id, const std::string& file_name); bool CheckDarkMode(); + bool CheckSystemArchiveDecryption(); QString GetTasStateDescription() const; bool CreateShortcut(const std::string& shortcut_path, const std::string& title, diff --git a/src/yuzu_cmd/config.cpp b/src/yuzu_cmd/config.cpp index 464da3231..605280949 100644 --- a/src/yuzu_cmd/config.cpp +++ b/src/yuzu_cmd/config.cpp @@ -274,7 +274,7 @@ void Config::ReadValues() { // Core ReadSetting("Core", Settings::values.use_multi_core); - ReadSetting("Core", Settings::values.use_extended_memory_layout); + ReadSetting("Core", Settings::values.use_unsafe_extended_memory_layout); // Cpu ReadSetting("Cpu", Settings::values.cpu_accuracy); @@ -300,6 +300,7 @@ void Config::ReadValues() { // Renderer ReadSetting("Renderer", Settings::values.renderer_backend); + ReadSetting("Renderer", Settings::values.async_presentation); ReadSetting("Renderer", Settings::values.renderer_force_max_clock); ReadSetting("Renderer", Settings::values.renderer_debug); ReadSetting("Renderer", Settings::values.renderer_shader_feedback); @@ -326,7 +327,6 @@ void Config::ReadValues() { ReadSetting("Renderer", Settings::values.accelerate_astc); ReadSetting("Renderer", Settings::values.async_astc); ReadSetting("Renderer", Settings::values.use_fast_gpu_time); - ReadSetting("Renderer", Settings::values.use_pessimistic_flushes); ReadSetting("Renderer", Settings::values.use_vulkan_driver_pipeline_cache); ReadSetting("Renderer", Settings::values.bg_red); diff --git a/src/yuzu_cmd/default_ini.h b/src/yuzu_cmd/default_ini.h index 209cfc28a..db6fba922 100644 --- a/src/yuzu_cmd/default_ini.h +++ b/src/yuzu_cmd/default_ini.h @@ -163,9 +163,9 @@ keyboard_enabled = # 0: Disabled, 1 (default): Enabled use_multi_core = -# Enable extended guest system memory layout (8GB DRAM) +# Enable unsafe extended guest system memory layout (8GB DRAM) # 0 (default): Disabled, 1: Enabled -use_extended_memory_layout = +use_unsafe_extended_memory_layout = [Cpu] # Adjusts various optimizations. @@ -264,6 +264,10 @@ cpuopt_unsafe_ignore_global_monitor = # 0: OpenGL, 1 (default): Vulkan backend = +# Whether to enable asynchronous presentation (Vulkan only) +# 0 (default): Off, 1: On +async_presentation = + # Enable graphics API debugging mode. # 0 (default): Disabled, 1: Enabled debug = @@ -370,10 +374,6 @@ use_asynchronous_gpu_emulation = # 0: Off, 1 (default): On use_fast_gpu_time = -# Force unmodified buffers to be flushed, which can cost performance. -# 0: Off (default), 1: On -use_pessimistic_flushes = - # Whether to use garbage collection or not for GPU caches. # 0 (default): Off, 1: On use_caches_gc = |