diff options
Diffstat (limited to 'src/video_core')
27 files changed, 358 insertions, 286 deletions
diff --git a/src/video_core/buffer_cache/buffer_base.h b/src/video_core/buffer_cache/buffer_base.h index c3318095c..be2113f5a 100644 --- a/src/video_core/buffer_cache/buffer_base.h +++ b/src/video_core/buffer_cache/buffer_base.h @@ -261,16 +261,6 @@ public: stream_score += score; } - /// Sets the new frame tick - void SetFrameTick(u64 new_frame_tick) noexcept { - frame_tick = new_frame_tick; - } - - /// Returns the new frame tick - [[nodiscard]] u64 FrameTick() const noexcept { - return frame_tick; - } - /// Returns the likeliness of this being a stream buffer [[nodiscard]] int StreamScore() const noexcept { return stream_score; @@ -307,6 +297,14 @@ public: return words.size_bytes; } + size_t getLRUID() const noexcept { + return lru_id; + } + + void setLRUID(size_t lru_id_) { + lru_id = lru_id_; + } + private: template <Type type> u64* Array() noexcept { @@ -603,9 +601,9 @@ private: RasterizerInterface* rasterizer = nullptr; VAddr cpu_addr = 0; Words words; - u64 frame_tick = 0; BufferFlagBits flags{}; int stream_score = 0; + size_t lru_id = SIZE_MAX; }; } // namespace VideoCommon diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 3b43554f9..7bfd57369 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -20,6 +20,7 @@ #include "common/common_types.h" #include "common/div_ceil.h" #include "common/literals.h" +#include "common/lru_cache.h" #include "common/microprofile.h" #include "common/scope_exit.h" #include "common/settings.h" @@ -330,7 +331,7 @@ private: template <bool insert> void ChangeRegister(BufferId buffer_id); - void TouchBuffer(Buffer& buffer) const noexcept; + void TouchBuffer(Buffer& buffer, BufferId buffer_id) noexcept; bool SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size); @@ -428,7 +429,11 @@ private: size_t immediate_buffer_capacity = 0; std::unique_ptr<u8[]> immediate_buffer_alloc; - typename SlotVector<Buffer>::Iterator deletion_iterator; + struct LRUItemParams { + using ObjectType = BufferId; + using TickType = u64; + }; + Common::LeastRecentlyUsedCache<LRUItemParams> lru_cache; u64 frame_tick = 0; u64 total_used_memory = 0; @@ -445,7 +450,6 @@ BufferCache<P>::BufferCache(VideoCore::RasterizerInterface& rasterizer_, kepler_compute{kepler_compute_}, gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_} { // Ensure the first slot is used for the null buffer void(slot_buffers.insert(runtime, NullBufferParams{})); - deletion_iterator = slot_buffers.end(); common_ranges.clear(); } @@ -454,20 +458,17 @@ void BufferCache<P>::RunGarbageCollector() { const bool aggressive_gc = total_used_memory >= CRITICAL_MEMORY; const u64 ticks_to_destroy = aggressive_gc ? 60 : 120; int num_iterations = aggressive_gc ? 64 : 32; - for (; num_iterations > 0; --num_iterations) { - if (deletion_iterator == slot_buffers.end()) { - deletion_iterator = slot_buffers.begin(); - } - ++deletion_iterator; - if (deletion_iterator == slot_buffers.end()) { - break; - } - const auto [buffer_id, buffer] = *deletion_iterator; - if (buffer->FrameTick() + ticks_to_destroy < frame_tick) { - DownloadBufferMemory(*buffer); - DeleteBuffer(buffer_id); + const auto clean_up = [this, &num_iterations](BufferId buffer_id) { + if (num_iterations == 0) { + return true; } - } + --num_iterations; + auto& buffer = slot_buffers[buffer_id]; + DownloadBufferMemory(buffer); + DeleteBuffer(buffer_id); + return false; + }; + lru_cache.ForEachItemBelow(frame_tick - ticks_to_destroy, clean_up); } template <class P> @@ -485,7 +486,7 @@ void BufferCache<P>::TickFrame() { const bool skip_preferred = hits * 256 < shots * 251; uniform_buffer_skip_cache_size = skip_preferred ? DEFAULT_SKIP_CACHE_SIZE : 0; - if (Settings::values.use_caches_gc.GetValue() && total_used_memory >= EXPECTED_MEMORY) { + if (total_used_memory >= EXPECTED_MEMORY) { RunGarbageCollector(); } ++frame_tick; @@ -954,7 +955,7 @@ bool BufferCache<P>::IsRegionCpuModified(VAddr addr, size_t size) { template <class P> void BufferCache<P>::BindHostIndexBuffer() { Buffer& buffer = slot_buffers[index_buffer.buffer_id]; - TouchBuffer(buffer); + TouchBuffer(buffer, index_buffer.buffer_id); const u32 offset = buffer.Offset(index_buffer.cpu_addr); const u32 size = index_buffer.size; SynchronizeBuffer(buffer, index_buffer.cpu_addr, size); @@ -975,7 +976,7 @@ void BufferCache<P>::BindHostVertexBuffers() { for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) { const Binding& binding = vertex_buffers[index]; Buffer& buffer = slot_buffers[binding.buffer_id]; - TouchBuffer(buffer); + TouchBuffer(buffer, binding.buffer_id); SynchronizeBuffer(buffer, binding.cpu_addr, binding.size); if (!flags[Dirty::VertexBuffer0 + index]) { continue; @@ -1011,7 +1012,7 @@ void BufferCache<P>::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 const VAddr cpu_addr = binding.cpu_addr; const u32 size = std::min(binding.size, (*uniform_buffer_sizes)[stage][index]); Buffer& buffer = slot_buffers[binding.buffer_id]; - TouchBuffer(buffer); + TouchBuffer(buffer, binding.buffer_id); const bool use_fast_buffer = binding.buffer_id != NULL_BUFFER_ID && size <= uniform_buffer_skip_cache_size && !buffer.IsRegionGpuModified(cpu_addr, size); @@ -1083,7 +1084,7 @@ void BufferCache<P>::BindHostGraphicsStorageBuffers(size_t stage) { ForEachEnabledBit(enabled_storage_buffers[stage], [&](u32 index) { const Binding& binding = storage_buffers[stage][index]; Buffer& buffer = slot_buffers[binding.buffer_id]; - TouchBuffer(buffer); + TouchBuffer(buffer, binding.buffer_id); const u32 size = binding.size; SynchronizeBuffer(buffer, binding.cpu_addr, size); @@ -1128,7 +1129,7 @@ void BufferCache<P>::BindHostTransformFeedbackBuffers() { for (u32 index = 0; index < NUM_TRANSFORM_FEEDBACK_BUFFERS; ++index) { const Binding& binding = transform_feedback_buffers[index]; Buffer& buffer = slot_buffers[binding.buffer_id]; - TouchBuffer(buffer); + TouchBuffer(buffer, binding.buffer_id); const u32 size = binding.size; SynchronizeBuffer(buffer, binding.cpu_addr, size); @@ -1148,7 +1149,7 @@ void BufferCache<P>::BindHostComputeUniformBuffers() { ForEachEnabledBit(enabled_compute_uniform_buffer_mask, [&](u32 index) { const Binding& binding = compute_uniform_buffers[index]; Buffer& buffer = slot_buffers[binding.buffer_id]; - TouchBuffer(buffer); + TouchBuffer(buffer, binding.buffer_id); const u32 size = std::min(binding.size, (*compute_uniform_buffer_sizes)[index]); SynchronizeBuffer(buffer, binding.cpu_addr, size); @@ -1168,7 +1169,7 @@ void BufferCache<P>::BindHostComputeStorageBuffers() { ForEachEnabledBit(enabled_compute_storage_buffers, [&](u32 index) { const Binding& binding = compute_storage_buffers[index]; Buffer& buffer = slot_buffers[binding.buffer_id]; - TouchBuffer(buffer); + TouchBuffer(buffer, binding.buffer_id); const u32 size = binding.size; SynchronizeBuffer(buffer, binding.cpu_addr, size); @@ -1513,11 +1514,11 @@ BufferId BufferCache<P>::CreateBuffer(VAddr cpu_addr, u32 wanted_size) { const OverlapResult overlap = ResolveOverlaps(cpu_addr, wanted_size); const u32 size = static_cast<u32>(overlap.end - overlap.begin); const BufferId new_buffer_id = slot_buffers.insert(runtime, rasterizer, overlap.begin, size); - TouchBuffer(slot_buffers[new_buffer_id]); for (const BufferId overlap_id : overlap.ids) { JoinOverlap(new_buffer_id, overlap_id, !overlap.has_stream_leap); } Register(new_buffer_id); + TouchBuffer(slot_buffers[new_buffer_id], new_buffer_id); return new_buffer_id; } @@ -1534,12 +1535,14 @@ void BufferCache<P>::Unregister(BufferId buffer_id) { template <class P> template <bool insert> void BufferCache<P>::ChangeRegister(BufferId buffer_id) { - const Buffer& buffer = slot_buffers[buffer_id]; + Buffer& buffer = slot_buffers[buffer_id]; const auto size = buffer.SizeBytes(); if (insert) { total_used_memory += Common::AlignUp(size, 1024); + buffer.setLRUID(lru_cache.Insert(buffer_id, frame_tick)); } else { total_used_memory -= Common::AlignUp(size, 1024); + lru_cache.Free(buffer.getLRUID()); } const VAddr cpu_addr_begin = buffer.CpuAddr(); const VAddr cpu_addr_end = cpu_addr_begin + size; @@ -1555,8 +1558,10 @@ void BufferCache<P>::ChangeRegister(BufferId buffer_id) { } template <class P> -void BufferCache<P>::TouchBuffer(Buffer& buffer) const noexcept { - buffer.SetFrameTick(frame_tick); +void BufferCache<P>::TouchBuffer(Buffer& buffer, BufferId buffer_id) noexcept { + if (buffer_id != NULL_BUFFER_ID) { + lru_cache.Touch(buffer.getLRUID(), frame_tick); + } } template <class P> diff --git a/src/video_core/command_classes/codecs/vp9.cpp b/src/video_core/command_classes/codecs/vp9.cpp index 70030066a..d7e749485 100644 --- a/src/video_core/command_classes/codecs/vp9.cpp +++ b/src/video_core/command_classes/codecs/vp9.cpp @@ -742,6 +742,7 @@ VpxBitStreamWriter VP9::ComposeUncompressedHeader() { uncomp_writer.WriteDeltaQ(current_frame_info.uv_dc_delta_q); uncomp_writer.WriteDeltaQ(current_frame_info.uv_ac_delta_q); + ASSERT(!current_frame_info.segment_enabled); uncomp_writer.WriteBit(false); // Segmentation enabled (TODO). const s32 min_tile_cols_log2 = CalcMinLog2TileCols(current_frame_info.frame_size.width); diff --git a/src/video_core/command_classes/codecs/vp9_types.h b/src/video_core/command_classes/codecs/vp9_types.h index 87eafdb03..3b1ed4b3a 100644 --- a/src/video_core/command_classes/codecs/vp9_types.h +++ b/src/video_core/command_classes/codecs/vp9_types.h @@ -22,7 +22,7 @@ struct Vp9FrameDimensions { }; static_assert(sizeof(Vp9FrameDimensions) == 0x8, "Vp9 Vp9FrameDimensions is an invalid size"); -enum FrameFlags : u32 { +enum class FrameFlags : u32 { IsKeyFrame = 1 << 0, LastFrameIsKeyFrame = 1 << 1, FrameSizeChanged = 1 << 2, @@ -30,6 +30,7 @@ enum FrameFlags : u32 { LastShowFrame = 1 << 4, IntraOnly = 1 << 5, }; +DECLARE_ENUM_FLAG_OPERATORS(FrameFlags) enum class TxSize { Tx4x4 = 0, // 4x4 transform @@ -92,44 +93,34 @@ struct Vp9EntropyProbs { static_assert(sizeof(Vp9EntropyProbs) == 0x7B4, "Vp9EntropyProbs is an invalid size"); struct Vp9PictureInfo { - bool is_key_frame; - bool intra_only; - bool last_frame_was_key; - bool frame_size_changed; - bool error_resilient_mode; - bool last_frame_shown; - bool show_frame; + u32 bitstream_size; + std::array<u64, 4> frame_offsets; std::array<s8, 4> ref_frame_sign_bias; s32 base_q_index; s32 y_dc_delta_q; s32 uv_dc_delta_q; s32 uv_ac_delta_q; - bool lossless; s32 transform_mode; - bool allow_high_precision_mv; s32 interp_filter; s32 reference_mode; - s8 comp_fixed_ref; - std::array<s8, 2> comp_var_ref; s32 log2_tile_cols; s32 log2_tile_rows; - bool segment_enabled; - bool segment_map_update; - bool segment_map_temporal_update; - s32 segment_abs_delta; - std::array<u32, 8> segment_feature_enable; - std::array<std::array<s16, 4>, 8> segment_feature_data; - bool mode_ref_delta_enabled; - bool use_prev_in_find_mv_refs; std::array<s8, 4> ref_deltas; std::array<s8, 2> mode_deltas; Vp9EntropyProbs entropy; Vp9FrameDimensions frame_size; u8 first_level; u8 sharpness_level; - u32 bitstream_size; - std::array<u64, 4> frame_offsets; - std::array<bool, 4> refresh_frame; + bool is_key_frame; + bool intra_only; + bool last_frame_was_key; + bool error_resilient_mode; + bool last_frame_shown; + bool show_frame; + bool lossless; + bool allow_high_precision_mv; + bool segment_enabled; + bool mode_ref_delta_enabled; }; struct Vp9FrameContainer { @@ -145,7 +136,7 @@ struct PictureInfo { Vp9FrameDimensions golden_frame_size; ///< 0x50 Vp9FrameDimensions alt_frame_size; ///< 0x58 Vp9FrameDimensions current_frame_size; ///< 0x60 - u32 vp9_flags; ///< 0x68 + FrameFlags vp9_flags; ///< 0x68 std::array<s8, 4> ref_frame_sign_bias; ///< 0x6C u8 first_level; ///< 0x70 u8 sharpness_level; ///< 0x71 @@ -158,60 +149,43 @@ struct PictureInfo { u8 allow_high_precision_mv; ///< 0x78 u8 interp_filter; ///< 0x79 u8 reference_mode; ///< 0x7A - s8 comp_fixed_ref; ///< 0x7B - std::array<s8, 2> comp_var_ref; ///< 0x7C + INSERT_PADDING_BYTES_NOINIT(3); ///< 0x7B u8 log2_tile_cols; ///< 0x7E u8 log2_tile_rows; ///< 0x7F Segmentation segmentation; ///< 0x80 LoopFilter loop_filter; ///< 0xE4 - INSERT_PADDING_BYTES_NOINIT(5); ///< 0xEB - u32 surface_params; ///< 0xF0 - INSERT_PADDING_WORDS_NOINIT(3); ///< 0xF4 + INSERT_PADDING_BYTES_NOINIT(21); ///< 0xEB [[nodiscard]] Vp9PictureInfo Convert() const { return { - .is_key_frame = (vp9_flags & FrameFlags::IsKeyFrame) != 0, - .intra_only = (vp9_flags & FrameFlags::IntraOnly) != 0, - .last_frame_was_key = (vp9_flags & FrameFlags::LastFrameIsKeyFrame) != 0, - .frame_size_changed = (vp9_flags & FrameFlags::FrameSizeChanged) != 0, - .error_resilient_mode = (vp9_flags & FrameFlags::ErrorResilientMode) != 0, - .last_frame_shown = (vp9_flags & FrameFlags::LastShowFrame) != 0, - .show_frame = true, + .bitstream_size = bitstream_size, + .frame_offsets{}, .ref_frame_sign_bias = ref_frame_sign_bias, .base_q_index = base_q_index, .y_dc_delta_q = y_dc_delta_q, .uv_dc_delta_q = uv_dc_delta_q, .uv_ac_delta_q = uv_ac_delta_q, - .lossless = lossless != 0, .transform_mode = tx_mode, - .allow_high_precision_mv = allow_high_precision_mv != 0, .interp_filter = interp_filter, .reference_mode = reference_mode, - .comp_fixed_ref = comp_fixed_ref, - .comp_var_ref = comp_var_ref, .log2_tile_cols = log2_tile_cols, .log2_tile_rows = log2_tile_rows, - .segment_enabled = segmentation.enabled != 0, - .segment_map_update = segmentation.update_map != 0, - .segment_map_temporal_update = segmentation.temporal_update != 0, - .segment_abs_delta = segmentation.abs_delta, - .segment_feature_enable = segmentation.feature_mask, - .segment_feature_data = segmentation.feature_data, - .mode_ref_delta_enabled = loop_filter.mode_ref_delta_enabled != 0, - .use_prev_in_find_mv_refs = !(vp9_flags == (FrameFlags::ErrorResilientMode)) && - !(vp9_flags == (FrameFlags::FrameSizeChanged)) && - !(vp9_flags == (FrameFlags::IntraOnly)) && - (vp9_flags == (FrameFlags::LastShowFrame)) && - !(vp9_flags == (FrameFlags::LastFrameIsKeyFrame)), .ref_deltas = loop_filter.ref_deltas, .mode_deltas = loop_filter.mode_deltas, .entropy{}, .frame_size = current_frame_size, .first_level = first_level, .sharpness_level = sharpness_level, - .bitstream_size = bitstream_size, - .frame_offsets{}, - .refresh_frame{}, + .is_key_frame = True(vp9_flags & FrameFlags::IsKeyFrame), + .intra_only = True(vp9_flags & FrameFlags::IntraOnly), + .last_frame_was_key = True(vp9_flags & FrameFlags::LastFrameIsKeyFrame), + .error_resilient_mode = True(vp9_flags & FrameFlags::ErrorResilientMode), + .last_frame_shown = True(vp9_flags & FrameFlags::LastShowFrame), + .show_frame = true, + .lossless = lossless != 0, + .allow_high_precision_mv = allow_high_precision_mv != 0, + .segment_enabled = segmentation.enabled != 0, + .mode_ref_delta_enabled = loop_filter.mode_ref_delta_enabled != 0, }; } }; @@ -316,7 +290,6 @@ ASSERT_POSITION(last_frame_size, 0x48); ASSERT_POSITION(first_level, 0x70); ASSERT_POSITION(segmentation, 0x80); ASSERT_POSITION(loop_filter, 0xE4); -ASSERT_POSITION(surface_params, 0xF0); #undef ASSERT_POSITION #define ASSERT_POSITION(field_name, position) \ diff --git a/src/video_core/command_classes/vic.cpp b/src/video_core/command_classes/vic.cpp index d5e77941c..0ee07f398 100644 --- a/src/video_core/command_classes/vic.cpp +++ b/src/video_core/command_classes/vic.cpp @@ -96,12 +96,11 @@ void Vic::Execute() { if (!converted_frame_buffer) { converted_frame_buffer = AVMallocPtr{static_cast<u8*>(av_malloc(linear_size)), av_free}; } - - const int converted_stride{frame->width * 4}; + const std::array<int, 4> converted_stride{frame->width * 4, frame->height * 4, 0, 0}; u8* const converted_frame_buf_addr{converted_frame_buffer.get()}; sws_scale(scaler_ctx, frame->data, frame->linesize, 0, frame->height, - &converted_frame_buf_addr, &converted_stride); + &converted_frame_buf_addr, converted_stride.data()); const u32 blk_kind = static_cast<u32>(config.block_linear_kind); if (blk_kind != 0) { diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index 1aa43523a..7f4ca6282 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -475,10 +475,10 @@ public: // These values are used by Nouveau and some games. AddGL = 0x8006, - SubtractGL = 0x8007, - ReverseSubtractGL = 0x8008, - MinGL = 0x800a, - MaxGL = 0x800b + MinGL = 0x8007, + MaxGL = 0x8008, + SubtractGL = 0x800a, + ReverseSubtractGL = 0x800b }; enum class Factor : u32 { diff --git a/src/video_core/macro/macro_jit_x64.h b/src/video_core/macro/macro_jit_x64.h index 7f50ac2f8..d03d480b4 100644 --- a/src/video_core/macro/macro_jit_x64.h +++ b/src/video_core/macro/macro_jit_x64.h @@ -6,7 +6,7 @@ #include <array> #include <bitset> -#include <xbyak.h> +#include <xbyak/xbyak.h> #include "common/bit_field.h" #include "common/common_types.h" #include "common/x64/xbyak_abi.h" diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp index 882eff880..c60ed6453 100644 --- a/src/video_core/memory_manager.cpp +++ b/src/video_core/memory_manager.cpp @@ -463,6 +463,7 @@ std::vector<std::pair<GPUVAddr, std::size_t>> MemoryManager::GetSubmappedRange( ++page_index; page_offset = 0; remaining_size -= num_bytes; + old_page_addr = page_addr; } split(); return result; diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h index ee992aed4..de9e41659 100644 --- a/src/video_core/renderer_opengl/gl_device.h +++ b/src/video_core/renderer_opengl/gl_device.h @@ -156,6 +156,10 @@ public: return shader_backend; } + bool IsAmd() const { + return vendor_name == "ATI Technologies Inc."; + } + private: static bool TestVariableAoffi(); static bool TestPreciseBug(); diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 1f4dda17e..b0e14182e 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -219,6 +219,7 @@ ShaderCache::ShaderCache(RasterizerOpenGL& rasterizer_, Core::Frontend::EmuWindo host_info{ .support_float16 = false, .support_int64 = device.HasShaderInt64(), + .needs_demote_reorder = device.IsAmd(), } { if (use_asynchronous_shaders) { workers = CreateWorkers(); diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp index 7c9b0d6db..9ff0a28cd 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp +++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp @@ -164,7 +164,8 @@ void RendererVulkan::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { blit_screen.Recreate(); } const VkSemaphore render_semaphore = blit_screen.DrawToSwapchain(*framebuffer, use_accelerated); - scheduler.Flush(render_semaphore); + const VkSemaphore present_semaphore = swapchain.CurrentPresentSemaphore(); + scheduler.Flush(render_semaphore, present_semaphore); scheduler.WaitWorker(); swapchain.Present(render_semaphore); diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.cpp b/src/video_core/renderer_vulkan/vk_blit_screen.cpp index 5c43b8acf..888bc7392 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.cpp +++ b/src/video_core/renderer_vulkan/vk_blit_screen.cpp @@ -159,11 +159,13 @@ VkSemaphore VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, const VAddr framebuffer_addr = framebuffer.address + framebuffer.offset; const u8* const host_ptr = cpu_memory.GetPointer(framebuffer_addr); - const size_t size_bytes = GetSizeInBytes(framebuffer); // TODO(Rodrigo): Read this from HLE constexpr u32 block_height_log2 = 4; const u32 bytes_per_pixel = GetBytesPerPixel(framebuffer); + const u64 size_bytes{Tegra::Texture::CalculateSize(true, bytes_per_pixel, + framebuffer.stride, framebuffer.height, + 1, block_height_log2, 0)}; Tegra::Texture::UnswizzleTexture( mapped_span.subspan(image_offset, size_bytes), std::span(host_ptr, size_bytes), bytes_per_pixel, framebuffer.width, framebuffer.height, 1, block_height_log2, 0); @@ -356,7 +358,7 @@ void VKBlitScreen::CreateDescriptorPool() { void VKBlitScreen::CreateRenderPass() { const VkAttachmentDescription color_attachment{ .flags = 0, - .format = swapchain.GetImageFormat(), + .format = swapchain.GetImageViewFormat(), .samples = VK_SAMPLE_COUNT_1_BIT, .loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR, .storeOp = VK_ATTACHMENT_STORE_OP_STORE, diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index a37ca1fdf..31bfbcb06 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -281,7 +281,7 @@ PipelineCache::PipelineCache(RasterizerVulkan& rasterizer_, Tegra::Engines::Maxw .supported_spirv = device.IsKhrSpirv1_4Supported() ? 0x00010400U : 0x00010000U, .unified_descriptor_binding = true, .support_descriptor_aliasing = true, - .support_int8 = true, + .support_int8 = device.IsInt8Supported(), .support_int16 = device.IsShaderInt16Supported(), .support_int64 = device.IsShaderInt64Supported(), .support_vertex_instance_id = false, @@ -325,6 +325,8 @@ PipelineCache::PipelineCache(RasterizerVulkan& rasterizer_, Tegra::Engines::Maxw host_info = Shader::HostTranslateInfo{ .support_float16 = device.IsFloat16Supported(), .support_int64 = device.IsShaderInt64Supported(), + .needs_demote_reorder = driver_id == VK_DRIVER_ID_AMD_PROPRIETARY_KHR || + driver_id == VK_DRIVER_ID_AMD_OPEN_SOURCE_KHR, }; } diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 3ac18ea54..841a6b846 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -228,9 +228,7 @@ void RasterizerVulkan::Clear() { }; const u32 color_attachment = regs.clear_buffers.RT; - const auto attachment_aspect_mask = framebuffer->ImageRanges()[color_attachment].aspectMask; - const bool is_color_rt = (attachment_aspect_mask & VK_IMAGE_ASPECT_COLOR_BIT) != 0; - if (use_color && is_color_rt) { + if (use_color && framebuffer->HasAspectColorBit(color_attachment)) { VkClearValue clear_value; std::memcpy(clear_value.color.float32, regs.clear_color, sizeof(regs.clear_color)); @@ -248,12 +246,15 @@ void RasterizerVulkan::Clear() { return; } VkImageAspectFlags aspect_flags = 0; - if (use_depth) { + if (use_depth && framebuffer->HasAspectDepthBit()) { aspect_flags |= VK_IMAGE_ASPECT_DEPTH_BIT; } - if (use_stencil) { + if (use_stencil && framebuffer->HasAspectStencilBit()) { aspect_flags |= VK_IMAGE_ASPECT_STENCIL_BIT; } + if (aspect_flags == 0) { + return; + } scheduler.Record([clear_depth = regs.clear_depth, clear_stencil = regs.clear_stencil, clear_rect, aspect_flags](vk::CommandBuffer cmdbuf) { VkClearAttachment attachment; diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index 4840962de..1d438787a 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp @@ -55,14 +55,14 @@ VKScheduler::~VKScheduler() { worker_thread.join(); } -void VKScheduler::Flush(VkSemaphore semaphore) { - SubmitExecution(semaphore); +void VKScheduler::Flush(VkSemaphore signal_semaphore, VkSemaphore wait_semaphore) { + SubmitExecution(signal_semaphore, wait_semaphore); AllocateNewContext(); } -void VKScheduler::Finish(VkSemaphore semaphore) { +void VKScheduler::Finish(VkSemaphore signal_semaphore, VkSemaphore wait_semaphore) { const u64 presubmit_tick = CurrentTick(); - SubmitExecution(semaphore); + SubmitExecution(signal_semaphore, wait_semaphore); WaitWorker(); Wait(presubmit_tick); AllocateNewContext(); @@ -171,37 +171,41 @@ void VKScheduler::AllocateWorkerCommandBuffer() { }); } -void VKScheduler::SubmitExecution(VkSemaphore semaphore) { +void VKScheduler::SubmitExecution(VkSemaphore signal_semaphore, VkSemaphore wait_semaphore) { EndPendingOperations(); InvalidateState(); const u64 signal_value = master_semaphore->NextTick(); - Record([semaphore, signal_value, this](vk::CommandBuffer cmdbuf) { + Record([signal_semaphore, wait_semaphore, signal_value, this](vk::CommandBuffer cmdbuf) { cmdbuf.End(); - - const u32 num_signal_semaphores = semaphore ? 2U : 1U; - - const u64 wait_value = signal_value - 1; - const VkPipelineStageFlags wait_stage_mask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT; - const VkSemaphore timeline_semaphore = master_semaphore->Handle(); + + const u32 num_signal_semaphores = signal_semaphore ? 2U : 1U; const std::array signal_values{signal_value, u64(0)}; - const std::array signal_semaphores{timeline_semaphore, semaphore}; + const std::array signal_semaphores{timeline_semaphore, signal_semaphore}; + + const u32 num_wait_semaphores = wait_semaphore ? 2U : 1U; + const std::array wait_values{signal_value - 1, u64(1)}; + const std::array wait_semaphores{timeline_semaphore, wait_semaphore}; + static constexpr std::array<VkPipelineStageFlags, 2> wait_stage_masks{ + VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, + }; const VkTimelineSemaphoreSubmitInfoKHR timeline_si{ .sType = VK_STRUCTURE_TYPE_TIMELINE_SEMAPHORE_SUBMIT_INFO_KHR, .pNext = nullptr, - .waitSemaphoreValueCount = 1, - .pWaitSemaphoreValues = &wait_value, + .waitSemaphoreValueCount = num_wait_semaphores, + .pWaitSemaphoreValues = wait_values.data(), .signalSemaphoreValueCount = num_signal_semaphores, .pSignalSemaphoreValues = signal_values.data(), }; const VkSubmitInfo submit_info{ .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO, .pNext = &timeline_si, - .waitSemaphoreCount = 1, - .pWaitSemaphores = &timeline_semaphore, - .pWaitDstStageMask = &wait_stage_mask, + .waitSemaphoreCount = num_wait_semaphores, + .pWaitSemaphores = wait_semaphores.data(), + .pWaitDstStageMask = wait_stage_masks.data(), .commandBufferCount = 1, .pCommandBuffers = cmdbuf.address(), .signalSemaphoreCount = num_signal_semaphores, diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h index cf39a2363..759ed5a48 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_scheduler.h @@ -34,10 +34,10 @@ public: ~VKScheduler(); /// Sends the current execution context to the GPU. - void Flush(VkSemaphore semaphore = nullptr); + void Flush(VkSemaphore signal_semaphore = nullptr, VkSemaphore wait_semaphore = nullptr); /// Sends the current execution context to the GPU and waits for it to complete. - void Finish(VkSemaphore semaphore = nullptr); + void Finish(VkSemaphore signal_semaphore = nullptr, VkSemaphore wait_semaphore = nullptr); /// Waits for the worker thread to finish executing everything. After this function returns it's /// safe to touch worker resources. @@ -191,7 +191,7 @@ private: void AllocateWorkerCommandBuffer(); - void SubmitExecution(VkSemaphore semaphore); + void SubmitExecution(VkSemaphore signal_semaphore, VkSemaphore wait_semaphore); void AllocateNewContext(); diff --git a/src/video_core/renderer_vulkan/vk_swapchain.cpp b/src/video_core/renderer_vulkan/vk_swapchain.cpp index d990eefba..aadf03cb0 100644 --- a/src/video_core/renderer_vulkan/vk_swapchain.cpp +++ b/src/video_core/renderer_vulkan/vk_swapchain.cpp @@ -20,16 +20,15 @@ namespace Vulkan { namespace { -VkSurfaceFormatKHR ChooseSwapSurfaceFormat(vk::Span<VkSurfaceFormatKHR> formats, bool srgb) { +VkSurfaceFormatKHR ChooseSwapSurfaceFormat(vk::Span<VkSurfaceFormatKHR> formats) { if (formats.size() == 1 && formats[0].format == VK_FORMAT_UNDEFINED) { VkSurfaceFormatKHR format; format.format = VK_FORMAT_B8G8R8A8_UNORM; format.colorSpace = VK_COLOR_SPACE_SRGB_NONLINEAR_KHR; return format; } - const auto& found = std::find_if(formats.begin(), formats.end(), [srgb](const auto& format) { - const auto request_format = srgb ? VK_FORMAT_B8G8R8A8_SRGB : VK_FORMAT_B8G8R8A8_UNORM; - return format.format == request_format && + const auto& found = std::find_if(formats.begin(), formats.end(), [](const auto& format) { + return format.format == VK_FORMAT_B8G8R8A8_UNORM && format.colorSpace == VK_COLOR_SPACE_SRGB_NONLINEAR_KHR; }); return found != formats.end() ? *found : formats[0]; @@ -107,14 +106,12 @@ void VKSwapchain::AcquireNextImage() { } void VKSwapchain::Present(VkSemaphore render_semaphore) { - const VkSemaphore present_semaphore{*present_semaphores[frame_index]}; - const std::array<VkSemaphore, 2> semaphores{present_semaphore, render_semaphore}; const auto present_queue{device.GetPresentQueue()}; const VkPresentInfoKHR present_info{ .sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR, .pNext = nullptr, - .waitSemaphoreCount = render_semaphore ? 2U : 1U, - .pWaitSemaphores = semaphores.data(), + .waitSemaphoreCount = render_semaphore ? 1U : 0U, + .pWaitSemaphores = &render_semaphore, .swapchainCount = 1, .pSwapchains = swapchain.address(), .pImageIndices = &image_index, @@ -145,7 +142,7 @@ void VKSwapchain::CreateSwapchain(const VkSurfaceCapabilitiesKHR& capabilities, const auto formats{physical_device.GetSurfaceFormatsKHR(surface)}; const auto present_modes{physical_device.GetSurfacePresentModesKHR(surface)}; - const VkSurfaceFormatKHR surface_format{ChooseSwapSurfaceFormat(formats, srgb)}; + const VkSurfaceFormatKHR surface_format{ChooseSwapSurfaceFormat(formats)}; const VkPresentModeKHR present_mode{ChooseSwapPresentMode(present_modes)}; u32 requested_image_count{capabilities.minImageCount + 1}; @@ -180,6 +177,17 @@ void VKSwapchain::CreateSwapchain(const VkSurfaceCapabilitiesKHR& capabilities, swapchain_ci.queueFamilyIndexCount = static_cast<u32>(queue_indices.size()); swapchain_ci.pQueueFamilyIndices = queue_indices.data(); } + static constexpr std::array view_formats{VK_FORMAT_B8G8R8A8_UNORM, VK_FORMAT_B8G8R8A8_SRGB}; + VkImageFormatListCreateInfo format_list{ + .sType = VK_STRUCTURE_TYPE_IMAGE_FORMAT_LIST_CREATE_INFO_KHR, + .pNext = nullptr, + .viewFormatCount = static_cast<u32>(view_formats.size()), + .pViewFormats = view_formats.data(), + }; + if (device.IsKhrSwapchainMutableFormatEnabled()) { + format_list.pNext = std::exchange(swapchain_ci.pNext, &format_list); + swapchain_ci.flags |= VK_SWAPCHAIN_CREATE_MUTABLE_FORMAT_BIT_KHR; + } // Request the size again to reduce the possibility of a TOCTOU race condition. const auto updated_capabilities = physical_device.GetSurfaceCapabilitiesKHR(surface); swapchain_ci.imageExtent = ChooseSwapExtent(updated_capabilities, width, height); @@ -191,7 +199,7 @@ void VKSwapchain::CreateSwapchain(const VkSurfaceCapabilitiesKHR& capabilities, images = swapchain.GetImages(); image_count = static_cast<u32>(images.size()); - image_format = surface_format.format; + image_view_format = srgb ? VK_FORMAT_B8G8R8A8_SRGB : VK_FORMAT_B8G8R8A8_UNORM; } void VKSwapchain::CreateSemaphores() { @@ -207,7 +215,7 @@ void VKSwapchain::CreateImageViews() { .flags = 0, .image = {}, .viewType = VK_IMAGE_VIEW_TYPE_2D, - .format = image_format, + .format = image_view_format, .components = { .r = VK_COMPONENT_SWIZZLE_IDENTITY, diff --git a/src/video_core/renderer_vulkan/vk_swapchain.h b/src/video_core/renderer_vulkan/vk_swapchain.h index 35c2cdc14..5bce41e21 100644 --- a/src/video_core/renderer_vulkan/vk_swapchain.h +++ b/src/video_core/renderer_vulkan/vk_swapchain.h @@ -68,8 +68,12 @@ public: return *image_views[index]; } - VkFormat GetImageFormat() const { - return image_format; + VkFormat GetImageViewFormat() const { + return image_view_format; + } + + VkSemaphore CurrentPresentSemaphore() const { + return *present_semaphores[frame_index]; } private: @@ -96,7 +100,7 @@ private: u32 image_index{}; u32 frame_index{}; - VkFormat image_format{}; + VkFormat image_view_format{}; VkExtent2D extent{}; bool current_srgb{}; diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index 8f4df7122..ff979a7ac 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -1186,9 +1186,12 @@ Framebuffer::Framebuffer(TextureCacheRuntime& runtime, std::span<ImageView*, NUM renderpass_key.depth_format = depth_buffer->format; num_layers = std::max(num_layers, depth_buffer->range.extent.layers); images[num_images] = depth_buffer->ImageHandle(); - image_ranges[num_images] = MakeSubresourceRange(depth_buffer); + const VkImageSubresourceRange subresource_range = MakeSubresourceRange(depth_buffer); + image_ranges[num_images] = subresource_range; samples = depth_buffer->Samples(); ++num_images; + has_depth = (subresource_range.aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) != 0; + has_stencil = (subresource_range.aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) != 0; } else { renderpass_key.depth_format = PixelFormat::Invalid; } diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h index 5fe6b7ba3..6d5a68bfe 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.h +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -232,6 +232,18 @@ public: return image_ranges; } + [[nodiscard]] bool HasAspectColorBit(size_t index) const noexcept { + return (image_ranges.at(index).aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) != 0; + } + + [[nodiscard]] bool HasAspectDepthBit() const noexcept { + return has_depth; + } + + [[nodiscard]] bool HasAspectStencilBit() const noexcept { + return has_stencil; + } + private: vk::Framebuffer framebuffer; VkRenderPass renderpass{}; @@ -241,6 +253,8 @@ private: u32 num_images = 0; std::array<VkImage, 9> images{}; std::array<VkImageSubresourceRange, 9> image_ranges{}; + bool has_depth{}; + bool has_stencil{}; }; struct TextureCacheParams { diff --git a/src/video_core/texture_cache/image_base.h b/src/video_core/texture_cache/image_base.h index ff1feda9b..0c17a791b 100644 --- a/src/video_core/texture_cache/image_base.h +++ b/src/video_core/texture_cache/image_base.h @@ -80,7 +80,7 @@ struct ImageBase { VAddr cpu_addr_end = 0; u64 modification_tick = 0; - u64 frame_tick = 0; + size_t lru_index = SIZE_MAX; std::array<u32, MAX_MIP_LEVELS> mip_level_offsets{}; diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index a087498ff..24b809242 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -5,7 +5,6 @@ #pragma once #include "common/alignment.h" -#include "common/settings.h" #include "video_core/dirty_flags.h" #include "video_core/texture_cache/samples_helper.h" #include "video_core/texture_cache/texture_cache_base.h" @@ -43,8 +42,6 @@ TextureCache<P>::TextureCache(Runtime& runtime_, VideoCore::RasterizerInterface& void(slot_image_views.insert(runtime, NullImageParams{})); void(slot_samplers.insert(runtime, sampler_descriptor)); - deletion_iterator = slot_images.begin(); - if constexpr (HAS_DEVICE_MEMORY_INFO) { const auto device_memory = runtime.GetDeviceLocalMemory(); const u64 possible_expected_memory = (device_memory * 3) / 10; @@ -64,70 +61,38 @@ template <class P> void TextureCache<P>::RunGarbageCollector() { const bool high_priority_mode = total_used_memory >= expected_memory; const bool aggressive_mode = total_used_memory >= critical_memory; - const u64 ticks_to_destroy = high_priority_mode ? 60 : 100; - int num_iterations = aggressive_mode ? 256 : (high_priority_mode ? 128 : 64); - for (; num_iterations > 0; --num_iterations) { - if (deletion_iterator == slot_images.end()) { - deletion_iterator = slot_images.begin(); - if (deletion_iterator == slot_images.end()) { - break; - } + const u64 ticks_to_destroy = aggressive_mode ? 10ULL : high_priority_mode ? 25ULL : 100ULL; + size_t num_iterations = aggressive_mode ? 10000 : (high_priority_mode ? 100 : 5); + const auto clean_up = [this, &num_iterations, high_priority_mode](ImageId image_id) { + if (num_iterations == 0) { + return true; } - auto [image_id, image_tmp] = *deletion_iterator; - Image* image = image_tmp; // fix clang error. - const bool is_alias = True(image->flags & ImageFlagBits::Alias); - const bool is_bad_overlap = True(image->flags & ImageFlagBits::BadOverlap); - const bool must_download = image->IsSafeDownload(); - bool should_care = is_bad_overlap || is_alias || (high_priority_mode && !must_download); - const u64 ticks_needed = - is_bad_overlap - ? ticks_to_destroy >> 4 - : ((should_care && aggressive_mode) ? ticks_to_destroy >> 1 : ticks_to_destroy); - should_care |= aggressive_mode; - if (should_care && image->frame_tick + ticks_needed < frame_tick) { - if (is_bad_overlap) { - const bool overlap_check = std::ranges::all_of( - image->overlapping_images, [&, image](const ImageId& overlap_id) { - auto& overlap = slot_images[overlap_id]; - return overlap.frame_tick >= image->frame_tick; - }); - if (!overlap_check) { - ++deletion_iterator; - continue; - } - } - if (!is_bad_overlap && must_download) { - const bool alias_check = std::ranges::none_of( - image->aliased_images, [&, image](const AliasedImage& alias) { - auto& alias_image = slot_images[alias.id]; - return (alias_image.frame_tick < image->frame_tick) || - (alias_image.modification_tick < image->modification_tick); - }); - - if (alias_check) { - auto map = runtime.DownloadStagingBuffer(image->unswizzled_size_bytes); - const auto copies = FullDownloadCopies(image->info); - image->DownloadMemory(map, copies); - runtime.Finish(); - SwizzleImage(gpu_memory, image->gpu_addr, image->info, copies, map.mapped_span); - } - } - if (True(image->flags & ImageFlagBits::Tracked)) { - UntrackImage(*image, image_id); - } - UnregisterImage(image_id); - DeleteImage(image_id); - if (is_bad_overlap) { - ++num_iterations; - } + --num_iterations; + auto& image = slot_images[image_id]; + const bool must_download = image.IsSafeDownload(); + if (!high_priority_mode && must_download) { + return false; } - ++deletion_iterator; - } + if (must_download) { + auto map = runtime.DownloadStagingBuffer(image.unswizzled_size_bytes); + const auto copies = FullDownloadCopies(image.info); + image.DownloadMemory(map, copies); + runtime.Finish(); + SwizzleImage(gpu_memory, image.gpu_addr, image.info, copies, map.mapped_span); + } + if (True(image.flags & ImageFlagBits::Tracked)) { + UntrackImage(image, image_id); + } + UnregisterImage(image_id); + DeleteImage(image_id); + return false; + }; + lru_cache.ForEachItemBelow(frame_tick - ticks_to_destroy, clean_up); } template <class P> void TextureCache<P>::TickFrame() { - if (Settings::values.use_caches_gc.GetValue() && total_used_memory > minimum_memory) { + if (total_used_memory > minimum_memory) { RunGarbageCollector(); } sentenced_images.Tick(); @@ -1078,6 +1043,8 @@ void TextureCache<P>::RegisterImage(ImageId image_id) { tentative_size = EstimatedDecompressedSize(tentative_size, image.info.format); } total_used_memory += Common::AlignUp(tentative_size, 1024); + image.lru_index = lru_cache.Insert(image_id, frame_tick); + ForEachGPUPage(image.gpu_addr, image.guest_size_bytes, [this, image_id](u64 page) { gpu_page_table[page].push_back(image_id); }); if (False(image.flags & ImageFlagBits::Sparse)) { @@ -1115,6 +1082,7 @@ void TextureCache<P>::UnregisterImage(ImageId image_id) { tentative_size = EstimatedDecompressedSize(tentative_size, image.info.format); } total_used_memory -= Common::AlignUp(tentative_size, 1024); + lru_cache.Free(image.lru_index); const auto& clear_page_table = [this, image_id]( u64 page, @@ -1384,7 +1352,7 @@ void TextureCache<P>::PrepareImage(ImageId image_id, bool is_modification, bool if (is_modification) { MarkModification(image); } - image.frame_tick = frame_tick; + lru_cache.Touch(image.lru_index, frame_tick); } template <class P> diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h index e4ae351cb..d7528ed24 100644 --- a/src/video_core/texture_cache/texture_cache_base.h +++ b/src/video_core/texture_cache/texture_cache_base.h @@ -14,6 +14,7 @@ #include "common/common_types.h" #include "common/literals.h" +#include "common/lru_cache.h" #include "video_core/compatible_formats.h" #include "video_core/delayed_destruction_ring.h" #include "video_core/engines/fermi_2d.h" @@ -370,6 +371,12 @@ private: std::vector<ImageId> uncommitted_downloads; std::queue<std::vector<ImageId>> committed_downloads; + struct LRUItemParams { + using ObjectType = ImageId; + using TickType = u64; + }; + Common::LeastRecentlyUsedCache<LRUItemParams> lru_cache; + static constexpr size_t TICKS_TO_DESTROY = 6; DelayedDestructionRing<Image, TICKS_TO_DESTROY> sentenced_images; DelayedDestructionRing<ImageView, TICKS_TO_DESTROY> sentenced_image_view; @@ -379,7 +386,6 @@ private: u64 modification_tick = 0; u64 frame_tick = 0; - typename SlotVector<Image>::Iterator deletion_iterator; }; } // namespace VideoCommon diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp index c32ae956a..24e943e4c 100644 --- a/src/video_core/textures/decoders.cpp +++ b/src/video_core/textures/decoders.cpp @@ -63,14 +63,6 @@ void SwizzleImpl(std::span<u8> output, std::span<const u8> input, u32 width, u32 const u32 unswizzled_offset = slice * pitch * height + line * pitch + column * BYTES_PER_PIXEL; - if (const auto offset = (TO_LINEAR ? unswizzled_offset : swizzled_offset); - offset >= input.size()) { - // TODO(Rodrigo): This is an out of bounds access that should never happen. To - // avoid crashing the emulator, break. - ASSERT_MSG(false, "offset {} exceeds input size {}!", offset, input.size()); - break; - } - u8* const dst = &output[TO_LINEAR ? swizzled_offset : unswizzled_offset]; const u8* const src = &input[TO_LINEAR ? unswizzled_offset : swizzled_offset]; @@ -84,56 +76,31 @@ template <bool TO_LINEAR> void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width, u32 height, u32 depth, u32 block_height, u32 block_depth, u32 stride_alignment) { switch (bytes_per_pixel) { - case 1: - return SwizzleImpl<TO_LINEAR, 1>(output, input, width, height, depth, block_height, - block_depth, stride_alignment); - case 2: - return SwizzleImpl<TO_LINEAR, 2>(output, input, width, height, depth, block_height, - block_depth, stride_alignment); - case 3: - return SwizzleImpl<TO_LINEAR, 3>(output, input, width, height, depth, block_height, - block_depth, stride_alignment); - case 4: - return SwizzleImpl<TO_LINEAR, 4>(output, input, width, height, depth, block_height, - block_depth, stride_alignment); - case 6: - return SwizzleImpl<TO_LINEAR, 6>(output, input, width, height, depth, block_height, +#define BPP_CASE(x) \ + case x: \ + return SwizzleImpl<TO_LINEAR, x>(output, input, width, height, depth, block_height, \ block_depth, stride_alignment); - case 8: - return SwizzleImpl<TO_LINEAR, 8>(output, input, width, height, depth, block_height, - block_depth, stride_alignment); - case 12: - return SwizzleImpl<TO_LINEAR, 12>(output, input, width, height, depth, block_height, - block_depth, stride_alignment); - case 16: - return SwizzleImpl<TO_LINEAR, 16>(output, input, width, height, depth, block_height, - block_depth, stride_alignment); + BPP_CASE(1) + BPP_CASE(2) + BPP_CASE(3) + BPP_CASE(4) + BPP_CASE(6) + BPP_CASE(8) + BPP_CASE(12) + BPP_CASE(16) +#undef BPP_CASE default: UNREACHABLE_MSG("Invalid bytes_per_pixel={}", bytes_per_pixel); } } -} // Anonymous namespace - -void UnswizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, - u32 width, u32 height, u32 depth, u32 block_height, u32 block_depth, - u32 stride_alignment) { - Swizzle<false>(output, input, bytes_per_pixel, width, height, depth, block_height, block_depth, - stride_alignment); -} - -void SwizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width, - u32 height, u32 depth, u32 block_height, u32 block_depth, - u32 stride_alignment) { - Swizzle<true>(output, input, bytes_per_pixel, width, height, depth, block_height, block_depth, - stride_alignment); -} +template <u32 BYTES_PER_PIXEL> void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width, - u32 bytes_per_pixel, u8* swizzled_data, const u8* unswizzled_data, - u32 block_height_bit, u32 offset_x, u32 offset_y) { + u8* swizzled_data, const u8* unswizzled_data, u32 block_height_bit, + u32 offset_x, u32 offset_y) { const u32 block_height = 1U << block_height_bit; const u32 image_width_in_gobs = - (swizzled_width * bytes_per_pixel + (GOB_SIZE_X - 1)) / GOB_SIZE_X; + (swizzled_width * BYTES_PER_PIXEL + (GOB_SIZE_X - 1)) / GOB_SIZE_X; for (u32 line = 0; line < subrect_height; ++line) { const u32 dst_y = line + offset_y; const u32 gob_address_y = @@ -143,20 +110,21 @@ void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 for (u32 x = 0; x < subrect_width; ++x) { const u32 dst_x = x + offset_x; const u32 gob_address = - gob_address_y + (dst_x * bytes_per_pixel / GOB_SIZE_X) * GOB_SIZE * block_height; - const u32 swizzled_offset = gob_address + table[(dst_x * bytes_per_pixel) % GOB_SIZE_X]; - const u32 unswizzled_offset = line * source_pitch + x * bytes_per_pixel; + gob_address_y + (dst_x * BYTES_PER_PIXEL / GOB_SIZE_X) * GOB_SIZE * block_height; + const u32 swizzled_offset = gob_address + table[(dst_x * BYTES_PER_PIXEL) % GOB_SIZE_X]; + const u32 unswizzled_offset = line * source_pitch + x * BYTES_PER_PIXEL; const u8* const source_line = unswizzled_data + unswizzled_offset; u8* const dest_addr = swizzled_data + swizzled_offset; - std::memcpy(dest_addr, source_line, bytes_per_pixel); + std::memcpy(dest_addr, source_line, BYTES_PER_PIXEL); } } } -void UnswizzleSubrect(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 bytes_per_pixel, - u32 block_height, u32 origin_x, u32 origin_y, u8* output, const u8* input) { - const u32 stride = width * bytes_per_pixel; +template <u32 BYTES_PER_PIXEL> +void UnswizzleSubrect(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 block_height, + u32 origin_x, u32 origin_y, u8* output, const u8* input) { + const u32 stride = width * BYTES_PER_PIXEL; const u32 gobs_in_x = (stride + GOB_SIZE_X - 1) / GOB_SIZE_X; const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block_height); @@ -171,24 +139,25 @@ void UnswizzleSubrect(u32 line_length_in, u32 line_count, u32 pitch, u32 width, const u32 src_offset_y = (block_y >> block_height) * block_size + ((block_y & block_height_mask) << GOB_SIZE_SHIFT); for (u32 column = 0; column < line_length_in; ++column) { - const u32 src_x = (column + origin_x) * bytes_per_pixel; + const u32 src_x = (column + origin_x) * BYTES_PER_PIXEL; const u32 src_offset_x = (src_x >> GOB_SIZE_X_SHIFT) << x_shift; const u32 swizzled_offset = src_offset_y + src_offset_x + table[src_x % GOB_SIZE_X]; - const u32 unswizzled_offset = line * pitch + column * bytes_per_pixel; + const u32 unswizzled_offset = line * pitch + column * BYTES_PER_PIXEL; - std::memcpy(output + unswizzled_offset, input + swizzled_offset, bytes_per_pixel); + std::memcpy(output + unswizzled_offset, input + swizzled_offset, BYTES_PER_PIXEL); } } } +template <u32 BYTES_PER_PIXEL> void SwizzleSliceToVoxel(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 height, - u32 bytes_per_pixel, u32 block_height, u32 block_depth, u32 origin_x, - u32 origin_y, u8* output, const u8* input) { + u32 block_height, u32 block_depth, u32 origin_x, u32 origin_y, u8* output, + const u8* input) { UNIMPLEMENTED_IF(origin_x > 0); UNIMPLEMENTED_IF(origin_y > 0); - const u32 stride = width * bytes_per_pixel; + const u32 stride = width * BYTES_PER_PIXEL; const u32 gobs_in_x = (stride + GOB_SIZE_X - 1) / GOB_SIZE_X; const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block_height + block_depth); @@ -203,11 +172,93 @@ void SwizzleSliceToVoxel(u32 line_length_in, u32 line_count, u32 pitch, u32 widt for (u32 x = 0; x < line_length_in; ++x) { const u32 dst_offset = ((x / GOB_SIZE_X) << x_shift) + dst_offset_y + table[x % GOB_SIZE_X]; - const u32 src_offset = x * bytes_per_pixel + line * pitch; - std::memcpy(output + dst_offset, input + src_offset, bytes_per_pixel); + const u32 src_offset = x * BYTES_PER_PIXEL + line * pitch; + std::memcpy(output + dst_offset, input + src_offset, BYTES_PER_PIXEL); } } } +} // Anonymous namespace + +void UnswizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, + u32 width, u32 height, u32 depth, u32 block_height, u32 block_depth, + u32 stride_alignment) { + Swizzle<false>(output, input, bytes_per_pixel, width, height, depth, block_height, block_depth, + stride_alignment); +} + +void SwizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width, + u32 height, u32 depth, u32 block_height, u32 block_depth, + u32 stride_alignment) { + Swizzle<true>(output, input, bytes_per_pixel, width, height, depth, block_height, block_depth, + stride_alignment); +} + +void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width, + u32 bytes_per_pixel, u8* swizzled_data, const u8* unswizzled_data, + u32 block_height_bit, u32 offset_x, u32 offset_y) { + switch (bytes_per_pixel) { +#define BPP_CASE(x) \ + case x: \ + return SwizzleSubrect<x>(subrect_width, subrect_height, source_pitch, swizzled_width, \ + swizzled_data, unswizzled_data, block_height_bit, offset_x, \ + offset_y); + BPP_CASE(1) + BPP_CASE(2) + BPP_CASE(3) + BPP_CASE(4) + BPP_CASE(6) + BPP_CASE(8) + BPP_CASE(12) + BPP_CASE(16) +#undef BPP_CASE + default: + UNREACHABLE_MSG("Invalid bytes_per_pixel={}", bytes_per_pixel); + } +} + +void UnswizzleSubrect(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 bytes_per_pixel, + u32 block_height, u32 origin_x, u32 origin_y, u8* output, const u8* input) { + switch (bytes_per_pixel) { +#define BPP_CASE(x) \ + case x: \ + return UnswizzleSubrect<x>(line_length_in, line_count, pitch, width, block_height, \ + origin_x, origin_y, output, input); + BPP_CASE(1) + BPP_CASE(2) + BPP_CASE(3) + BPP_CASE(4) + BPP_CASE(6) + BPP_CASE(8) + BPP_CASE(12) + BPP_CASE(16) +#undef BPP_CASE + default: + UNREACHABLE_MSG("Invalid bytes_per_pixel={}", bytes_per_pixel); + } +} + +void SwizzleSliceToVoxel(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 height, + u32 bytes_per_pixel, u32 block_height, u32 block_depth, u32 origin_x, + u32 origin_y, u8* output, const u8* input) { + switch (bytes_per_pixel) { +#define BPP_CASE(x) \ + case x: \ + return SwizzleSliceToVoxel<x>(line_length_in, line_count, pitch, width, height, \ + block_height, block_depth, origin_x, origin_y, output, \ + input); + BPP_CASE(1) + BPP_CASE(2) + BPP_CASE(3) + BPP_CASE(4) + BPP_CASE(6) + BPP_CASE(8) + BPP_CASE(12) + BPP_CASE(16) +#undef BPP_CASE + default: + UNREACHABLE_MSG("Invalid bytes_per_pixel={}", bytes_per_pixel); + } +} void SwizzleKepler(const u32 width, const u32 height, const u32 dst_x, const u32 dst_y, const u32 block_height_bit, const std::size_t copy_size, const u8* source_data, @@ -228,7 +279,7 @@ void SwizzleKepler(const u32 width, const u32 height, const u32 dst_x, const u32 u8* dest_addr = swizzle_data + swizzled_offset; count++; - std::memcpy(dest_addr, source_line, 1); + *dest_addr = *source_line; } } } diff --git a/src/video_core/textures/texture.h b/src/video_core/textures/texture.h index 1a9399455..7994cb859 100644 --- a/src/video_core/textures/texture.h +++ b/src/video_core/textures/texture.h @@ -159,7 +159,7 @@ static_assert(sizeof(TextureHandle) == 4, "TextureHandle has wrong size"); return {raw, raw}; } else { const Tegra::Texture::TextureHandle handle{raw}; - return {handle.tic_id, via_header_index ? handle.tic_id : handle.tsc_id}; + return {handle.tic_id, handle.tsc_id}; } } diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp index 8e56a89e1..24821c1a3 100644 --- a/src/video_core/vulkan_common/vulkan_device.cpp +++ b/src/video_core/vulkan_common/vulkan_device.cpp @@ -368,18 +368,21 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR }; SetNext(next, demote); - VkPhysicalDeviceFloat16Int8FeaturesKHR float16_int8; - if (is_float16_supported) { - float16_int8 = { + if (is_int8_supported || is_float16_supported) { + VkPhysicalDeviceFloat16Int8FeaturesKHR float16_int8{ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FLOAT16_INT8_FEATURES_KHR, .pNext = nullptr, - .shaderFloat16 = true, - .shaderInt8 = false, + .shaderFloat16 = is_float16_supported, + .shaderInt8 = is_int8_supported, }; SetNext(next, float16_int8); - } else { + } + if (!is_float16_supported) { LOG_INFO(Render_Vulkan, "Device doesn't support float16 natively"); } + if (!is_int8_supported) { + LOG_INFO(Render_Vulkan, "Device doesn't support int8 natively"); + } if (!nv_viewport_swizzle) { LOG_INFO(Render_Vulkan, "Device doesn't support viewport swizzles"); @@ -836,6 +839,8 @@ std::vector<const char*> Device::LoadExtensions(bool requires_surface) { bool has_khr_shader_float16_int8{}; bool has_khr_workgroup_memory_explicit_layout{}; bool has_khr_pipeline_executable_properties{}; + bool has_khr_image_format_list{}; + bool has_khr_swapchain_mutable_format{}; bool has_ext_subgroup_size_control{}; bool has_ext_transform_feedback{}; bool has_ext_custom_border_color{}; @@ -885,6 +890,9 @@ std::vector<const char*> Device::LoadExtensions(bool requires_surface) { test(has_ext_shader_atomic_int64, VK_KHR_SHADER_ATOMIC_INT64_EXTENSION_NAME, false); test(has_khr_workgroup_memory_explicit_layout, VK_KHR_WORKGROUP_MEMORY_EXPLICIT_LAYOUT_EXTENSION_NAME, false); + test(has_khr_image_format_list, VK_KHR_IMAGE_FORMAT_LIST_EXTENSION_NAME, false); + test(has_khr_swapchain_mutable_format, VK_KHR_SWAPCHAIN_MUTABLE_FORMAT_EXTENSION_NAME, + false); test(has_ext_line_rasterization, VK_EXT_LINE_RASTERIZATION_EXTENSION_NAME, false); if (Settings::values.enable_nsight_aftermath) { test(nv_device_diagnostics_config, VK_NV_DEVICE_DIAGNOSTICS_CONFIG_EXTENSION_NAME, @@ -909,6 +917,7 @@ std::vector<const char*> Device::LoadExtensions(bool requires_surface) { physical.GetFeatures2KHR(features); is_float16_supported = float16_int8_features.shaderFloat16; + is_int8_supported = float16_int8_features.shaderInt8; extensions.push_back(VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME); } if (has_ext_subgroup_size_control) { @@ -1062,6 +1071,11 @@ std::vector<const char*> Device::LoadExtensions(bool requires_surface) { khr_pipeline_executable_properties = true; } } + if (has_khr_image_format_list && has_khr_swapchain_mutable_format) { + extensions.push_back(VK_KHR_IMAGE_FORMAT_LIST_EXTENSION_NAME); + extensions.push_back(VK_KHR_SWAPCHAIN_MUTABLE_FORMAT_EXTENSION_NAME); + khr_swapchain_mutable_format = true; + } if (khr_push_descriptor) { VkPhysicalDevicePushDescriptorPropertiesKHR push_descriptor; push_descriptor.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PUSH_DESCRIPTOR_PROPERTIES_KHR; diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h index c19f40746..5599c38c5 100644 --- a/src/video_core/vulkan_common/vulkan_device.h +++ b/src/video_core/vulkan_common/vulkan_device.h @@ -139,11 +139,16 @@ public: return is_optimal_astc_supported; } - /// Returns true if the device supports float16 natively + /// Returns true if the device supports float16 natively. bool IsFloat16Supported() const { return is_float16_supported; } + /// Returns true if the device supports int8 natively. + bool IsInt8Supported() const { + return is_int8_supported; + } + /// Returns true if the device warp size can potentially be bigger than guest's warp size. bool IsWarpSizePotentiallyBiggerThanGuest() const { return is_warp_potentially_bigger; @@ -219,6 +224,11 @@ public: return khr_pipeline_executable_properties; } + /// Returns true if VK_KHR_swapchain_mutable_format is enabled. + bool IsKhrSwapchainMutableFormatEnabled() const { + return khr_swapchain_mutable_format; + } + /// Returns true if the device supports VK_KHR_workgroup_memory_explicit_layout. bool IsKhrWorkgroupMemoryExplicitLayoutSupported() const { return khr_workgroup_memory_explicit_layout; @@ -367,7 +377,8 @@ private: u64 device_access_memory{}; ///< Total size of device local memory in bytes. u32 max_push_descriptors{}; ///< Maximum number of push descriptors bool is_optimal_astc_supported{}; ///< Support for native ASTC. - bool is_float16_supported{}; ///< Support for float16 arithmetics. + bool is_float16_supported{}; ///< Support for float16 arithmetic. + bool is_int8_supported{}; ///< Support for int8 arithmetic. bool is_warp_potentially_bigger{}; ///< Host warp size can be bigger than guest. bool is_formatless_image_load_supported{}; ///< Support for shader image read without format. bool is_depth_bounds_supported{}; ///< Support for depth bounds. @@ -384,6 +395,7 @@ private: bool khr_workgroup_memory_explicit_layout{}; ///< Support for explicit workgroup layouts. bool khr_push_descriptor{}; ///< Support for VK_KHR_push_descritor. bool khr_pipeline_executable_properties{}; ///< Support for executable properties. + bool khr_swapchain_mutable_format{}; ///< Support for VK_KHR_swapchain_mutable_format. bool ext_index_type_uint8{}; ///< Support for VK_EXT_index_type_uint8. bool ext_sampler_filter_minmax{}; ///< Support for VK_EXT_sampler_filter_minmax. bool ext_depth_range_unrestricted{}; ///< Support for VK_EXT_depth_range_unrestricted. |