author | Levi <L3ehunin@gmail.com> | 2021-01-11 06:09:56 +0100
---|---|---
committer | Levi <L3ehunin@gmail.com> | 2021-01-11 06:09:56 +0100
commit | 7a3c884e39fccfbb498b855080bffabc9ce2e7f1 (patch) |
tree | 5056f9406dec188439cb0deb87603498243a9412 /src/video_core |
parent | More forgetting... duh (diff) |
parent | Merge pull request #5229 from Morph1984/fullscreen-opt (diff) |
Diffstat (limited to 'src/video_core')
262 files changed, 16506 insertions, 10235 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index da9e9fdda..f7b9d7f86 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -5,8 +5,27 @@ add_library(video_core STATIC buffer_cache/buffer_cache.h buffer_cache/map_interval.cpp buffer_cache/map_interval.h + cdma_pusher.cpp + cdma_pusher.h + command_classes/codecs/codec.cpp + command_classes/codecs/codec.h + command_classes/codecs/h264.cpp + command_classes/codecs/h264.h + command_classes/codecs/vp9.cpp + command_classes/codecs/vp9.h + command_classes/codecs/vp9_types.h + command_classes/host1x.cpp + command_classes/host1x.h + command_classes/nvdec.cpp + command_classes/nvdec.h + command_classes/nvdec_common.h + command_classes/sync_manager.cpp + command_classes/sync_manager.h + command_classes/vic.cpp + command_classes/vic.h compatible_formats.cpp compatible_formats.h + delayed_destruction_ring.h dirty_flags.cpp dirty_flags.h dma_pusher.cpp @@ -29,6 +48,7 @@ add_library(video_core STATIC engines/shader_bytecode.h engines/shader_header.h engines/shader_type.h + framebuffer_config.h macro/macro.cpp macro/macro.h macro/macro_hle.cpp @@ -40,10 +60,6 @@ add_library(video_core STATIC fence_manager.h gpu.cpp gpu.h - gpu_asynch.cpp - gpu_asynch.h - gpu_synch.cpp - gpu_synch.h gpu_thread.cpp gpu_thread.h guest_driver.cpp @@ -66,14 +82,10 @@ add_library(video_core STATIC renderer_opengl/gl_device.h renderer_opengl/gl_fence_manager.cpp renderer_opengl/gl_fence_manager.h - renderer_opengl/gl_framebuffer_cache.cpp - renderer_opengl/gl_framebuffer_cache.h renderer_opengl/gl_rasterizer.cpp renderer_opengl/gl_rasterizer.h renderer_opengl/gl_resource_manager.cpp renderer_opengl/gl_resource_manager.h - renderer_opengl/gl_sampler_cache.cpp - renderer_opengl/gl_sampler_cache.h renderer_opengl/gl_shader_cache.cpp renderer_opengl/gl_shader_cache.h renderer_opengl/gl_shader_decompiler.cpp @@ -95,10 +107,62 @@ add_library(video_core STATIC renderer_opengl/maxwell_to_gl.h renderer_opengl/renderer_opengl.cpp renderer_opengl/renderer_opengl.h - renderer_opengl/utils.cpp - renderer_opengl/utils.h - sampler_cache.cpp - sampler_cache.h + renderer_opengl/util_shaders.cpp + renderer_opengl/util_shaders.h + renderer_vulkan/blit_image.cpp + renderer_vulkan/blit_image.h + renderer_vulkan/fixed_pipeline_state.cpp + renderer_vulkan/fixed_pipeline_state.h + renderer_vulkan/maxwell_to_vk.cpp + renderer_vulkan/maxwell_to_vk.h + renderer_vulkan/renderer_vulkan.h + renderer_vulkan/renderer_vulkan.cpp + renderer_vulkan/vk_blit_screen.cpp + renderer_vulkan/vk_blit_screen.h + renderer_vulkan/vk_buffer_cache.cpp + renderer_vulkan/vk_buffer_cache.h + renderer_vulkan/vk_command_pool.cpp + renderer_vulkan/vk_command_pool.h + renderer_vulkan/vk_compute_pass.cpp + renderer_vulkan/vk_compute_pass.h + renderer_vulkan/vk_compute_pipeline.cpp + renderer_vulkan/vk_compute_pipeline.h + renderer_vulkan/vk_descriptor_pool.cpp + renderer_vulkan/vk_descriptor_pool.h + renderer_vulkan/vk_fence_manager.cpp + renderer_vulkan/vk_fence_manager.h + renderer_vulkan/vk_graphics_pipeline.cpp + renderer_vulkan/vk_graphics_pipeline.h + renderer_vulkan/vk_master_semaphore.cpp + renderer_vulkan/vk_master_semaphore.h + renderer_vulkan/vk_memory_manager.cpp + renderer_vulkan/vk_memory_manager.h + renderer_vulkan/vk_pipeline_cache.cpp + renderer_vulkan/vk_pipeline_cache.h + renderer_vulkan/vk_query_cache.cpp + renderer_vulkan/vk_query_cache.h + renderer_vulkan/vk_rasterizer.cpp + renderer_vulkan/vk_rasterizer.h + renderer_vulkan/vk_resource_pool.cpp + 
renderer_vulkan/vk_resource_pool.h + renderer_vulkan/vk_scheduler.cpp + renderer_vulkan/vk_scheduler.h + renderer_vulkan/vk_shader_decompiler.cpp + renderer_vulkan/vk_shader_decompiler.h + renderer_vulkan/vk_shader_util.cpp + renderer_vulkan/vk_shader_util.h + renderer_vulkan/vk_staging_buffer_pool.cpp + renderer_vulkan/vk_staging_buffer_pool.h + renderer_vulkan/vk_state_tracker.cpp + renderer_vulkan/vk_state_tracker.h + renderer_vulkan/vk_stream_buffer.cpp + renderer_vulkan/vk_stream_buffer.h + renderer_vulkan/vk_swapchain.cpp + renderer_vulkan/vk_swapchain.h + renderer_vulkan/vk_texture_cache.cpp + renderer_vulkan/vk_texture_cache.h + renderer_vulkan/vk_update_descriptor.cpp + renderer_vulkan/vk_update_descriptor.h shader_cache.h shader_notify.cpp shader_notify.h @@ -155,109 +219,71 @@ add_library(video_core STATIC shader/transform_feedback.h surface.cpp surface.h + texture_cache/accelerated_swizzle.cpp + texture_cache/accelerated_swizzle.h + texture_cache/decode_bc4.cpp + texture_cache/decode_bc4.h + texture_cache/descriptor_table.h + texture_cache/formatter.cpp + texture_cache/formatter.h texture_cache/format_lookup_table.cpp texture_cache/format_lookup_table.h - texture_cache/surface_base.cpp - texture_cache/surface_base.h - texture_cache/surface_params.cpp - texture_cache/surface_params.h - texture_cache/surface_view.cpp - texture_cache/surface_view.h + texture_cache/image_base.cpp + texture_cache/image_base.h + texture_cache/image_info.cpp + texture_cache/image_info.h + texture_cache/image_view_base.cpp + texture_cache/image_view_base.h + texture_cache/image_view_info.cpp + texture_cache/image_view_info.h + texture_cache/render_targets.h + texture_cache/samples_helper.h + texture_cache/slot_vector.h texture_cache/texture_cache.h + texture_cache/types.h + texture_cache/util.cpp + texture_cache/util.h textures/astc.cpp textures/astc.h - textures/convert.cpp - textures/convert.h textures/decoders.cpp textures/decoders.h textures/texture.cpp textures/texture.h video_core.cpp video_core.h + vulkan_common/vulkan_debug_callback.cpp + vulkan_common/vulkan_debug_callback.h + vulkan_common/vulkan_device.cpp + vulkan_common/vulkan_device.h + vulkan_common/vulkan_instance.cpp + vulkan_common/vulkan_instance.h + vulkan_common/vulkan_library.cpp + vulkan_common/vulkan_library.h + vulkan_common/vulkan_surface.cpp + vulkan_common/vulkan_surface.h + vulkan_common/vulkan_wrapper.cpp + vulkan_common/vulkan_wrapper.h + vulkan_common/nsight_aftermath_tracker.cpp + vulkan_common/nsight_aftermath_tracker.h ) -if (ENABLE_VULKAN) - target_sources(video_core PRIVATE - renderer_vulkan/fixed_pipeline_state.cpp - renderer_vulkan/fixed_pipeline_state.h - renderer_vulkan/maxwell_to_vk.cpp - renderer_vulkan/maxwell_to_vk.h - renderer_vulkan/nsight_aftermath_tracker.cpp - renderer_vulkan/nsight_aftermath_tracker.h - renderer_vulkan/renderer_vulkan.h - renderer_vulkan/renderer_vulkan.cpp - renderer_vulkan/vk_blit_screen.cpp - renderer_vulkan/vk_blit_screen.h - renderer_vulkan/vk_buffer_cache.cpp - renderer_vulkan/vk_buffer_cache.h - renderer_vulkan/vk_command_pool.cpp - renderer_vulkan/vk_command_pool.h - renderer_vulkan/vk_compute_pass.cpp - renderer_vulkan/vk_compute_pass.h - renderer_vulkan/vk_compute_pipeline.cpp - renderer_vulkan/vk_compute_pipeline.h - renderer_vulkan/vk_descriptor_pool.cpp - renderer_vulkan/vk_descriptor_pool.h - renderer_vulkan/vk_device.cpp - renderer_vulkan/vk_device.h - renderer_vulkan/vk_fence_manager.cpp - renderer_vulkan/vk_fence_manager.h - renderer_vulkan/vk_graphics_pipeline.cpp - 
renderer_vulkan/vk_graphics_pipeline.h - renderer_vulkan/vk_image.cpp - renderer_vulkan/vk_image.h - renderer_vulkan/vk_master_semaphore.cpp - renderer_vulkan/vk_master_semaphore.h - renderer_vulkan/vk_memory_manager.cpp - renderer_vulkan/vk_memory_manager.h - renderer_vulkan/vk_pipeline_cache.cpp - renderer_vulkan/vk_pipeline_cache.h - renderer_vulkan/vk_query_cache.cpp - renderer_vulkan/vk_query_cache.h - renderer_vulkan/vk_rasterizer.cpp - renderer_vulkan/vk_rasterizer.h - renderer_vulkan/vk_renderpass_cache.cpp - renderer_vulkan/vk_renderpass_cache.h - renderer_vulkan/vk_resource_pool.cpp - renderer_vulkan/vk_resource_pool.h - renderer_vulkan/vk_sampler_cache.cpp - renderer_vulkan/vk_sampler_cache.h - renderer_vulkan/vk_scheduler.cpp - renderer_vulkan/vk_scheduler.h - renderer_vulkan/vk_shader_decompiler.cpp - renderer_vulkan/vk_shader_decompiler.h - renderer_vulkan/vk_shader_util.cpp - renderer_vulkan/vk_shader_util.h - renderer_vulkan/vk_staging_buffer_pool.cpp - renderer_vulkan/vk_staging_buffer_pool.h - renderer_vulkan/vk_state_tracker.cpp - renderer_vulkan/vk_state_tracker.h - renderer_vulkan/vk_stream_buffer.cpp - renderer_vulkan/vk_stream_buffer.h - renderer_vulkan/vk_swapchain.cpp - renderer_vulkan/vk_swapchain.h - renderer_vulkan/vk_texture_cache.cpp - renderer_vulkan/vk_texture_cache.h - renderer_vulkan/vk_update_descriptor.cpp - renderer_vulkan/vk_update_descriptor.h - renderer_vulkan/wrapper.cpp - renderer_vulkan/wrapper.h - ) -endif() - create_target_directory_groups(video_core) target_link_libraries(video_core PUBLIC common core) target_link_libraries(video_core PRIVATE glad xbyak) +if (MSVC) + target_include_directories(video_core PRIVATE ${FFMPEG_INCLUDE_DIR}) + target_link_libraries(video_core PUBLIC ${FFMPEG_LIBRARY_DIR}/swscale.lib ${FFMPEG_LIBRARY_DIR}/avcodec.lib ${FFMPEG_LIBRARY_DIR}/avutil.lib) +else() + target_include_directories(video_core PRIVATE ${FFMPEG_INCLUDE_DIR}) + target_link_libraries(video_core PRIVATE ${FFMPEG_LIBRARIES}) +endif() + add_dependencies(video_core host_shaders) target_include_directories(video_core PRIVATE ${HOST_SHADERS_INCLUDE}) - -if (ENABLE_VULKAN) - target_include_directories(video_core PRIVATE sirit ../../externals/Vulkan-Headers/include) - target_compile_definitions(video_core PRIVATE HAS_VULKAN) - target_link_libraries(video_core PRIVATE sirit) -endif() +target_include_directories(video_core PRIVATE sirit ../../externals/Vulkan-Headers/include) +target_link_libraries(video_core PRIVATE sirit) if (ENABLE_NSIGHT_AFTERMATH) if (NOT DEFINED ENV{NSIGHT_AFTERMATH_SDK}) @@ -271,7 +297,27 @@ if (ENABLE_NSIGHT_AFTERMATH) endif() if (MSVC) - target_compile_options(video_core PRIVATE /we4267) + target_compile_options(video_core PRIVATE + /we4267 # 'var' : conversion from 'size_t' to 'type', possible loss of data + /we4456 # Declaration of 'identifier' hides previous local declaration + /we4457 # Declaration of 'identifier' hides function parameter + /we4458 # Declaration of 'identifier' hides class member + /we4459 # Declaration of 'identifier' hides global declaration + /we4715 # 'function' : not all control paths return a value + ) else() - target_compile_options(video_core PRIVATE -Werror=conversion -Wno-error=sign-conversion -Werror=switch) + target_compile_options(video_core PRIVATE + -Werror=conversion + -Wno-error=sign-conversion + -Werror=pessimizing-move + -Werror=redundant-move + -Werror=shadow + -Werror=switch + -Werror=type-limits + -Werror=unused-variable + + $<$<CXX_COMPILER_ID:GNU>:-Werror=class-memaccess> + 
$<$<CXX_COMPILER_ID:GNU>:-Werror=unused-but-set-parameter> + $<$<CXX_COMPILER_ID:GNU>:-Werror=unused-but-set-variable> + ) endif() diff --git a/src/video_core/buffer_cache/buffer_block.h b/src/video_core/buffer_cache/buffer_block.h index e64170e66..e9306194a 100644 --- a/src/video_core/buffer_cache/buffer_block.h +++ b/src/video_core/buffer_cache/buffer_block.h @@ -4,34 +4,29 @@ #pragma once -#include <unordered_set> -#include <utility> - -#include "common/alignment.h" #include "common/common_types.h" -#include "video_core/gpu.h" namespace VideoCommon { class BufferBlock { public: - bool Overlaps(VAddr start, VAddr end) const { + [[nodiscard]] bool Overlaps(VAddr start, VAddr end) const { return (cpu_addr < end) && (cpu_addr_end > start); } - bool IsInside(VAddr other_start, VAddr other_end) const { + [[nodiscard]] bool IsInside(VAddr other_start, VAddr other_end) const { return cpu_addr <= other_start && other_end <= cpu_addr_end; } - std::size_t Offset(VAddr in_addr) const { + [[nodiscard]] std::size_t Offset(VAddr in_addr) const { return static_cast<std::size_t>(in_addr - cpu_addr); } - VAddr CpuAddr() const { + [[nodiscard]] VAddr CpuAddr() const { return cpu_addr; } - VAddr CpuAddrEnd() const { + [[nodiscard]] VAddr CpuAddrEnd() const { return cpu_addr_end; } @@ -40,11 +35,11 @@ public: cpu_addr_end = new_addr + size; } - std::size_t Size() const { + [[nodiscard]] std::size_t Size() const { return size; } - u64 Epoch() const { + [[nodiscard]] u64 Epoch() const { return epoch; } diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index e7edd733f..83b9ee871 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -118,20 +118,17 @@ public: /// Prepares the buffer cache for data uploading /// @param max_size Maximum number of bytes that will be uploaded /// @return True when a stream buffer invalidation was required, false otherwise - bool Map(std::size_t max_size) { + void Map(std::size_t max_size) { std::lock_guard lock{mutex}; - bool invalidated; - std::tie(buffer_ptr, buffer_offset_base, invalidated) = stream_buffer->Map(max_size, 4); + std::tie(buffer_ptr, buffer_offset_base) = stream_buffer.Map(max_size, 4); buffer_offset = buffer_offset_base; - - return invalidated; } /// Finishes the upload stream void Unmap() { std::lock_guard lock{mutex}; - stream_buffer->Unmap(buffer_offset - buffer_offset_base); + stream_buffer.Unmap(buffer_offset - buffer_offset_base); } /// Function called at the end of each frame, inteded for deferred operations @@ -261,9 +258,9 @@ public: protected: explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_, Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_, - std::unique_ptr<StreamBuffer> stream_buffer_) + StreamBuffer& stream_buffer_) : rasterizer{rasterizer_}, gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_}, - stream_buffer{std::move(stream_buffer_)}, stream_buffer_handle{stream_buffer->Handle()} {} + stream_buffer{stream_buffer_} {} ~BufferCache() = default; @@ -441,7 +438,7 @@ private: buffer_ptr += size; buffer_offset += size; - return BufferInfo{stream_buffer->Handle(), uploaded_offset, stream_buffer->Address()}; + return BufferInfo{stream_buffer.Handle(), uploaded_offset, stream_buffer.Address()}; } void AlignBuffer(std::size_t alignment) { @@ -545,7 +542,7 @@ private: bool IsRegionWritten(VAddr start, VAddr end) const { const u64 page_end = end >> WRITE_PAGE_BIT; for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= 
page_end; ++page_start) { - if (written_pages.count(page_start) > 0) { + if (written_pages.contains(page_start)) { return true; } } @@ -567,9 +564,7 @@ private: VideoCore::RasterizerInterface& rasterizer; Tegra::MemoryManager& gpu_memory; Core::Memory::Memory& cpu_memory; - - std::unique_ptr<StreamBuffer> stream_buffer; - BufferType stream_buffer_handle; + StreamBuffer& stream_buffer; u8* buffer_ptr = nullptr; u64 buffer_offset = 0; diff --git a/src/video_core/buffer_cache/map_interval.h b/src/video_core/buffer_cache/map_interval.h index fe0bcd1d8..ef974b08a 100644 --- a/src/video_core/buffer_cache/map_interval.h +++ b/src/video_core/buffer_cache/map_interval.h @@ -84,9 +84,10 @@ private: void FillFreeList(Chunk& chunk); std::vector<MapInterval*> free_list; - std::unique_ptr<Chunk>* new_chunk = &first_chunk.next; Chunk first_chunk; + + std::unique_ptr<Chunk>* new_chunk = &first_chunk.next; }; } // namespace VideoCommon diff --git a/src/video_core/cdma_pusher.cpp b/src/video_core/cdma_pusher.cpp new file mode 100644 index 000000000..94679d5d1 --- /dev/null +++ b/src/video_core/cdma_pusher.cpp @@ -0,0 +1,170 @@ +// MIT License +// +// Copyright (c) Ryujinx Team and Contributors +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and +// associated documentation files (the "Software"), to deal in the Software without restriction, +// including without limitation the rights to use, copy, modify, merge, publish, distribute, +// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT +// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
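As an editor's aside before the implementation: CDmaPusher::Step(), defined just below, decodes each Host1x channel command word by its top nibble and then routes the following data words according to the mode's mask/count/offset fields. Here is a minimal sketch of that bit layout — the struct and function names are invented for illustration and are not code from the patch:

```cpp
#include <cstdint>

// Field layout of a Host1x channel command word, as consumed by CDmaPusher::Step().
struct DecodedChCommand {
    std::uint32_t submission_mode; // bits 28-31: ChSubmissionMode
    std::uint32_t offset;          // bits 16-27: target register offset
    std::uint32_t payload;         // low bits: mask, count or immediate data
};

constexpr DecodedChCommand DecodeChWord(std::uint32_t value) {
    return {
        .submission_mode = (value >> 28) & 0xf,
        .offset = (value >> 16) & 0xfff,
        .payload = value & 0xffff, // SetClass instead splits this into a 6-bit
                                   // mask (bits 0-5) and a 10-bit class id (bits 6-15)
    };
}
```

For Mask submissions, Step() walks the set bits of the payload with CountTrailingZeroes32 and writes each data word to `offset + bit_index`; Immediate submissions carry their data in the low 12 bits of the command word itself.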
+// + +#include "command_classes/host1x.h" +#include "command_classes/nvdec.h" +#include "command_classes/vic.h" +#include "common/bit_util.h" +#include "video_core/cdma_pusher.h" +#include "video_core/command_classes/nvdec_common.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/gpu.h" +#include "video_core/memory_manager.h" + +namespace Tegra { +CDmaPusher::CDmaPusher(GPU& gpu_) + : gpu{gpu_}, nvdec_processor(std::make_shared<Nvdec>(gpu)), + vic_processor(std::make_unique<Vic>(gpu, nvdec_processor)), + host1x_processor(std::make_unique<Host1x>(gpu)), + sync_manager(std::make_unique<SyncptIncrManager>(gpu)) {} + +CDmaPusher::~CDmaPusher() = default; + +void CDmaPusher::Push(ChCommandHeaderList&& entries) { + cdma_queue.push(std::move(entries)); +} + +void CDmaPusher::DispatchCalls() { + while (!cdma_queue.empty()) { + Step(); + } +} + +void CDmaPusher::Step() { + const auto entries{cdma_queue.front()}; + cdma_queue.pop(); + + std::vector<u32> values(entries.size()); + std::memcpy(values.data(), entries.data(), entries.size() * sizeof(u32)); + + for (const u32 value : values) { + if (mask != 0) { + const u32 lbs = Common::CountTrailingZeroes32(mask); + mask &= ~(1U << lbs); + ExecuteCommand(static_cast<u32>(offset + lbs), value); + continue; + } else if (count != 0) { + --count; + ExecuteCommand(static_cast<u32>(offset), value); + if (incrementing) { + ++offset; + } + continue; + } + const auto mode = static_cast<ChSubmissionMode>((value >> 28) & 0xf); + switch (mode) { + case ChSubmissionMode::SetClass: { + mask = value & 0x3f; + offset = (value >> 16) & 0xfff; + current_class = static_cast<ChClassId>((value >> 6) & 0x3ff); + break; + } + case ChSubmissionMode::Incrementing: + case ChSubmissionMode::NonIncrementing: + count = value & 0xffff; + offset = (value >> 16) & 0xfff; + incrementing = mode == ChSubmissionMode::Incrementing; + break; + case ChSubmissionMode::Mask: + mask = value & 0xffff; + offset = (value >> 16) & 0xfff; + break; + case ChSubmissionMode::Immediate: { + const u32 data = value & 0xfff; + offset = (value >> 16) & 0xfff; + ExecuteCommand(static_cast<u32>(offset), data); + break; + } + default: + UNIMPLEMENTED_MSG("ChSubmission mode {} is not implemented!", static_cast<u32>(mode)); + break; + } + } +} + +void CDmaPusher::ExecuteCommand(u32 state_offset, u32 data) { + switch (current_class) { + case ChClassId::NvDec: + ThiStateWrite(nvdec_thi_state, state_offset, {data}); + switch (static_cast<ThiMethod>(state_offset)) { + case ThiMethod::IncSyncpt: { + LOG_DEBUG(Service_NVDRV, "NVDEC Class IncSyncpt Method"); + const auto syncpoint_id = static_cast<u32>(data & 0xFF); + const auto cond = static_cast<u32>((data >> 8) & 0xFF); + if (cond == 0) { + sync_manager->Increment(syncpoint_id); + } else { + sync_manager->SignalDone( + sync_manager->IncrementWhenDone(static_cast<u32>(current_class), syncpoint_id)); + } + break; + } + case ThiMethod::SetMethod1: + LOG_DEBUG(Service_NVDRV, "NVDEC method 0x{:X}", + static_cast<u32>(nvdec_thi_state.method_0)); + nvdec_processor->ProcessMethod(static_cast<Nvdec::Method>(nvdec_thi_state.method_0), + {data}); + break; + default: + break; + } + break; + case ChClassId::GraphicsVic: + ThiStateWrite(vic_thi_state, static_cast<u32>(state_offset), {data}); + switch (static_cast<ThiMethod>(state_offset)) { + case ThiMethod::IncSyncpt: { + LOG_DEBUG(Service_NVDRV, "VIC Class IncSyncpt Method"); + const auto syncpoint_id = static_cast<u32>(data & 0xFF); + const auto cond = static_cast<u32>((data >> 8) & 0xFF); + if (cond == 0) { 
+ sync_manager->Increment(syncpoint_id); + } else { + sync_manager->SignalDone( + sync_manager->IncrementWhenDone(static_cast<u32>(current_class), syncpoint_id)); + } + break; + } + case ThiMethod::SetMethod1: + LOG_DEBUG(Service_NVDRV, "VIC method 0x{:X}, Args=({})", + static_cast<u32>(vic_thi_state.method_0), data); + vic_processor->ProcessMethod(static_cast<Vic::Method>(vic_thi_state.method_0), {data}); + break; + default: + break; + } + break; + case ChClassId::Host1x: + // This device is mainly for syncpoint synchronization + LOG_DEBUG(Service_NVDRV, "Host1X Class Method"); + host1x_processor->ProcessMethod(static_cast<Host1x::Method>(state_offset), {data}); + break; + default: + UNIMPLEMENTED_MSG("Current class not implemented {:X}", static_cast<u32>(current_class)); + break; + } +} + +void CDmaPusher::ThiStateWrite(ThiRegisters& state, u32 state_offset, + const std::vector<u32>& arguments) { + u8* const state_offset_ptr = reinterpret_cast<u8*>(&state) + sizeof(u32) * state_offset; + std::memcpy(state_offset_ptr, arguments.data(), sizeof(u32) * arguments.size()); +} + +} // namespace Tegra diff --git a/src/video_core/cdma_pusher.h b/src/video_core/cdma_pusher.h new file mode 100644 index 000000000..8ca70b6dd --- /dev/null +++ b/src/video_core/cdma_pusher.h @@ -0,0 +1,136 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <memory> +#include <unordered_map> +#include <vector> +#include <queue> + +#include "common/bit_field.h" +#include "common/common_types.h" +#include "video_core/command_classes/sync_manager.h" + +namespace Tegra { + +class GPU; +class Nvdec; +class Vic; +class Host1x; + +enum class ChSubmissionMode : u32 { + SetClass = 0, + Incrementing = 1, + NonIncrementing = 2, + Mask = 3, + Immediate = 4, + Restart = 5, + Gather = 6, +}; + +enum class ChClassId : u32 { + NoClass = 0x0, + Host1x = 0x1, + VideoEncodeMpeg = 0x20, + VideoEncodeNvEnc = 0x21, + VideoStreamingVi = 0x30, + VideoStreamingIsp = 0x32, + VideoStreamingIspB = 0x34, + VideoStreamingViI2c = 0x36, + GraphicsVic = 0x5d, + Graphics3D = 0x60, + GraphicsGpu = 0x61, + Tsec = 0xe0, + TsecB = 0xe1, + NvJpg = 0xc0, + NvDec = 0xf0 +}; + +enum class ChMethod : u32 { + Empty = 0, + SetMethod = 0x10, + SetData = 0x11, +}; + +union ChCommandHeader { + u32 raw; + BitField<0, 16, u32> value; + BitField<16, 12, ChMethod> method_offset; + BitField<28, 4, ChSubmissionMode> submission_mode; +}; +static_assert(sizeof(ChCommandHeader) == sizeof(u32), "ChCommand header is an invalid size"); + +struct ChCommand { + ChClassId class_id{}; + int method_offset{}; + std::vector<u32> arguments; +}; + +using ChCommandHeaderList = std::vector<ChCommandHeader>; +using ChCommandList = std::vector<ChCommand>; + +struct ThiRegisters { + u32_le increment_syncpt{}; + INSERT_PADDING_WORDS(1); + u32_le increment_syncpt_error{}; + u32_le ctx_switch_incremement_syncpt{}; + INSERT_PADDING_WORDS(4); + u32_le ctx_switch{}; + INSERT_PADDING_WORDS(1); + u32_le ctx_syncpt_eof{}; + INSERT_PADDING_WORDS(5); + u32_le method_0{}; + u32_le method_1{}; + INSERT_PADDING_WORDS(12); + u32_le int_status{}; + u32_le int_mask{}; +}; + +enum class ThiMethod : u32 { + IncSyncpt = offsetof(ThiRegisters, increment_syncpt) / sizeof(u32), + SetMethod0 = offsetof(ThiRegisters, method_0) / sizeof(u32), + SetMethod1 = offsetof(ThiRegisters, method_1) / sizeof(u32), +}; + +class CDmaPusher { +public: + explicit CDmaPusher(GPU& gpu_); + ~CDmaPusher(); + + /// Push NVDEC 
command buffer entries into queue + void Push(ChCommandHeaderList&& entries); + + /// Process queued command buffer entries + void DispatchCalls(); + + /// Process one queue element + void Step(); + + /// Invoke command class devices to execute the command based on the current state + void ExecuteCommand(u32 state_offset, u32 data); + +private: + /// Write arguments value to the ThiRegisters member at the specified offset + void ThiStateWrite(ThiRegisters& state, u32 state_offset, const std::vector<u32>& arguments); + + GPU& gpu; + std::shared_ptr<Tegra::Nvdec> nvdec_processor; + std::unique_ptr<Tegra::Vic> vic_processor; + std::unique_ptr<Tegra::Host1x> host1x_processor; + std::unique_ptr<SyncptIncrManager> sync_manager; + ChClassId current_class{}; + ThiRegisters vic_thi_state{}; + ThiRegisters nvdec_thi_state{}; + + s32 count{}; + s32 offset{}; + s32 mask{}; + bool incrementing{}; + + // Queue of command lists to be processed + std::queue<ChCommandHeaderList> cdma_queue; +}; + +} // namespace Tegra diff --git a/src/video_core/command_classes/codecs/codec.cpp b/src/video_core/command_classes/codecs/codec.cpp new file mode 100644 index 000000000..39bc923a5 --- /dev/null +++ b/src/video_core/command_classes/codecs/codec.cpp @@ -0,0 +1,129 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <cstring> +#include <fstream> +#include <vector> +#include "common/assert.h" +#include "video_core/command_classes/codecs/codec.h" +#include "video_core/command_classes/codecs/h264.h" +#include "video_core/command_classes/codecs/vp9.h" +#include "video_core/gpu.h" +#include "video_core/memory_manager.h" + +extern "C" { +#include <libavutil/opt.h> +} + +namespace Tegra { + +void AVFrameDeleter(AVFrame* ptr) { + av_frame_unref(ptr); + av_free(ptr); +} + +Codec::Codec(GPU& gpu_) + : gpu(gpu_), h264_decoder(std::make_unique<Decoder::H264>(gpu)), + vp9_decoder(std::make_unique<Decoder::VP9>(gpu)) {} + +Codec::~Codec() { + if (!initialized) { + return; + } + // Free libav memory + AVFrame* av_frame{nullptr}; + avcodec_send_packet(av_codec_ctx, nullptr); + av_frame = av_frame_alloc(); + avcodec_receive_frame(av_codec_ctx, av_frame); + avcodec_flush_buffers(av_codec_ctx); + + av_frame_unref(av_frame); + av_free(av_frame); + avcodec_close(av_codec_ctx); +} + +void Codec::SetTargetCodec(NvdecCommon::VideoCodec codec) { + LOG_INFO(Service_NVDRV, "NVDEC video codec initialized to {}", codec); + current_codec = codec; +} + +void Codec::StateWrite(u32 offset, u64 arguments) { + u8* const state_offset = reinterpret_cast<u8*>(&state) + offset * sizeof(u64); + std::memcpy(state_offset, &arguments, sizeof(u64)); +} + +void Codec::Decode() { + bool is_first_frame = false; + + if (!initialized) { + if (current_codec == NvdecCommon::VideoCodec::H264) { + av_codec = avcodec_find_decoder(AV_CODEC_ID_H264); + } else if (current_codec == NvdecCommon::VideoCodec::Vp9) { + av_codec = avcodec_find_decoder(AV_CODEC_ID_VP9); + } else { + LOG_ERROR(Service_NVDRV, "Unknown video codec {}", current_codec); + return; + } + + av_codec_ctx = avcodec_alloc_context3(av_codec); + av_opt_set(av_codec_ctx->priv_data, "tune", "zerolatency", 0); + + // TODO(ameerj): libavcodec gpu hw acceleration + + const auto av_error = avcodec_open2(av_codec_ctx, av_codec, nullptr); + if (av_error < 0) { + LOG_ERROR(Service_NVDRV, "avcodec_open2() Failed."); + avcodec_close(av_codec_ctx); + return; + } + initialized = true; + is_first_frame = true; + } + bool 
vp9_hidden_frame = false; + + AVPacket packet{}; + av_init_packet(&packet); + std::vector<u8> frame_data; + + if (current_codec == NvdecCommon::VideoCodec::H264) { + frame_data = h264_decoder->ComposeFrameHeader(state, is_first_frame); + } else if (current_codec == NvdecCommon::VideoCodec::Vp9) { + frame_data = vp9_decoder->ComposeFrameHeader(state); + vp9_hidden_frame = vp9_decoder->WasFrameHidden(); + } + + packet.data = frame_data.data(); + packet.size = static_cast<int>(frame_data.size()); + + avcodec_send_packet(av_codec_ctx, &packet); + + if (!vp9_hidden_frame) { + // Only receive/store visible frames + AVFramePtr frame = AVFramePtr{av_frame_alloc(), AVFrameDeleter}; + avcodec_receive_frame(av_codec_ctx, frame.get()); + av_frames.push(std::move(frame)); + // Limit queue to 10 frames. Workaround for ZLA decode and queue spam + if (av_frames.size() > 10) { + av_frames.pop(); + } + } +} + +AVFramePtr Codec::GetCurrentFrame() { + // Sometimes VIC will request more frames than have been decoded. + // in this case, return a nullptr and don't overwrite previous frame data + if (av_frames.empty()) { + return AVFramePtr{nullptr, AVFrameDeleter}; + } + + AVFramePtr frame = std::move(av_frames.front()); + av_frames.pop(); + return frame; +} + +NvdecCommon::VideoCodec Codec::GetCurrentCodec() const { + return current_codec; +} + +} // namespace Tegra diff --git a/src/video_core/command_classes/codecs/codec.h b/src/video_core/command_classes/codecs/codec.h new file mode 100644 index 000000000..8a2a6c360 --- /dev/null +++ b/src/video_core/command_classes/codecs/codec.h @@ -0,0 +1,70 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <memory> +#include <queue> +#include "common/common_types.h" +#include "video_core/command_classes/nvdec_common.h" + +extern "C" { +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wconversion" +#endif +#include <libavcodec/avcodec.h> +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif +} + +namespace Tegra { +class GPU; +struct VicRegisters; + +void AVFrameDeleter(AVFrame* ptr); +using AVFramePtr = std::unique_ptr<AVFrame, decltype(&AVFrameDeleter)>; + +namespace Decoder { +class H264; +class VP9; +} // namespace Decoder + +class Codec { +public: + explicit Codec(GPU& gpu); + ~Codec(); + + /// Sets NVDEC video stream codec + void SetTargetCodec(NvdecCommon::VideoCodec codec); + + /// Populate NvdecRegisters state with argument value at the provided offset + void StateWrite(u32 offset, u64 arguments); + + /// Call decoders to construct headers, decode AVFrame with ffmpeg + void Decode(); + + /// Returns next decoded frame + [[nodiscard]] AVFramePtr GetCurrentFrame(); + + /// Returns the value of current_codec + [[nodiscard]] NvdecCommon::VideoCodec GetCurrentCodec() const; + +private: + bool initialized{}; + NvdecCommon::VideoCodec current_codec{NvdecCommon::VideoCodec::None}; + + AVCodec* av_codec{nullptr}; + AVCodecContext* av_codec_ctx{nullptr}; + + GPU& gpu; + std::unique_ptr<Decoder::H264> h264_decoder; + std::unique_ptr<Decoder::VP9> vp9_decoder; + + NvdecCommon::NvdecRegisters state{}; + std::queue<AVFramePtr> av_frames{}; +}; + +} // namespace Tegra diff --git a/src/video_core/command_classes/codecs/h264.cpp b/src/video_core/command_classes/codecs/h264.cpp new file mode 100644 index 000000000..65bbeac78 --- /dev/null +++ 
b/src/video_core/command_classes/codecs/h264.cpp @@ -0,0 +1,293 @@ +// MIT License +// +// Copyright (c) Ryujinx Team and Contributors +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and +// associated documentation files (the "Software"), to deal in the Software without restriction, +// including without limitation the rights to use, copy, modify, merge, publish, distribute, +// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT +// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// + +#include <array> +#include "common/bit_util.h" +#include "video_core/command_classes/codecs/h264.h" +#include "video_core/gpu.h" +#include "video_core/memory_manager.h" + +namespace Tegra::Decoder { +namespace { +// ZigZag LUTs from libavcodec. +constexpr std::array<u8, 64> zig_zag_direct{ + 0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32, 25, 18, 11, 4, 5, 12, 19, 26, 33, 40, 48, + 41, 34, 27, 20, 13, 6, 7, 14, 21, 28, 35, 42, 49, 56, 57, 50, 43, 36, 29, 22, 15, 23, + 30, 37, 44, 51, 58, 59, 52, 45, 38, 31, 39, 46, 53, 60, 61, 54, 47, 55, 62, 63, +}; + +constexpr std::array<u8, 16> zig_zag_scan{ + 0 + 0 * 4, 1 + 0 * 4, 0 + 1 * 4, 0 + 2 * 4, 1 + 1 * 4, 2 + 0 * 4, 3 + 0 * 4, 2 + 1 * 4, + 1 + 2 * 4, 0 + 3 * 4, 1 + 3 * 4, 2 + 2 * 4, 3 + 1 * 4, 3 + 2 * 4, 2 + 3 * 4, 3 + 3 * 4, +}; +} // Anonymous namespace + +H264::H264(GPU& gpu_) : gpu(gpu_) {} + +H264::~H264() = default; + +const std::vector<u8>& H264::ComposeFrameHeader(const NvdecCommon::NvdecRegisters& state, + bool is_first_frame) { + H264DecoderContext context{}; + gpu.MemoryManager().ReadBlock(state.picture_info_offset, &context, sizeof(H264DecoderContext)); + + const s32 frame_number = static_cast<s32>((context.h264_parameter_set.flags >> 46) & 0x1ffff); + if (!is_first_frame && frame_number != 0) { + frame.resize(context.frame_data_size); + + gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset, frame.data(), frame.size()); + } else { + /// Encode header + H264BitWriter writer{}; + writer.WriteU(1, 24); + writer.WriteU(0, 1); + writer.WriteU(3, 2); + writer.WriteU(7, 5); + writer.WriteU(100, 8); + writer.WriteU(0, 8); + writer.WriteU(31, 8); + writer.WriteUe(0); + const auto chroma_format_idc = + static_cast<u32>((context.h264_parameter_set.flags >> 12) & 3); + writer.WriteUe(chroma_format_idc); + if (chroma_format_idc == 3) { + writer.WriteBit(false); + } + + writer.WriteUe(0); + writer.WriteUe(0); + writer.WriteBit(false); // QpprimeYZeroTransformBypassFlag + writer.WriteBit(false); // Scaling matrix present flag + + const auto order_cnt_type = static_cast<u32>((context.h264_parameter_set.flags >> 14) & 3); + writer.WriteUe(static_cast<u32>((context.h264_parameter_set.flags >> 8) & 0xf)); + writer.WriteUe(order_cnt_type); + if (order_cnt_type == 0) { + writer.WriteUe(context.h264_parameter_set.log2_max_pic_order_cnt); + } else if 
(order_cnt_type == 1) { + writer.WriteBit(context.h264_parameter_set.delta_pic_order_always_zero_flag != 0); + + writer.WriteSe(0); + writer.WriteSe(0); + writer.WriteUe(0); + } + + const s32 pic_height = context.h264_parameter_set.pic_height_in_map_units / + (context.h264_parameter_set.frame_mbs_only_flag ? 1 : 2); + + writer.WriteUe(16); + writer.WriteBit(false); + writer.WriteUe(context.h264_parameter_set.pic_width_in_mbs - 1); + writer.WriteUe(pic_height - 1); + writer.WriteBit(context.h264_parameter_set.frame_mbs_only_flag != 0); + + if (!context.h264_parameter_set.frame_mbs_only_flag) { + writer.WriteBit(((context.h264_parameter_set.flags >> 0) & 1) != 0); + } + + writer.WriteBit(((context.h264_parameter_set.flags >> 1) & 1) != 0); + writer.WriteBit(false); // Frame cropping flag + writer.WriteBit(false); // VUI parameter present flag + + writer.End(); + + // H264 PPS + writer.WriteU(1, 24); + writer.WriteU(0, 1); + writer.WriteU(3, 2); + writer.WriteU(8, 5); + + writer.WriteUe(0); + writer.WriteUe(0); + + writer.WriteBit(context.h264_parameter_set.entropy_coding_mode_flag != 0); + writer.WriteBit(false); + writer.WriteUe(0); + writer.WriteUe(context.h264_parameter_set.num_refidx_l0_default_active); + writer.WriteUe(context.h264_parameter_set.num_refidx_l1_default_active); + writer.WriteBit(((context.h264_parameter_set.flags >> 2) & 1) != 0); + writer.WriteU(static_cast<s32>((context.h264_parameter_set.flags >> 32) & 0x3), 2); + s32 pic_init_qp = static_cast<s32>((context.h264_parameter_set.flags >> 16) & 0x3f); + pic_init_qp = (pic_init_qp << 26) >> 26; + writer.WriteSe(pic_init_qp); + writer.WriteSe(0); + s32 chroma_qp_index_offset = + static_cast<s32>((context.h264_parameter_set.flags >> 22) & 0x1f); + chroma_qp_index_offset = (chroma_qp_index_offset << 27) >> 27; + + writer.WriteSe(chroma_qp_index_offset); + writer.WriteBit(context.h264_parameter_set.deblocking_filter_control_flag != 0); + writer.WriteBit(((context.h264_parameter_set.flags >> 3) & 1) != 0); + writer.WriteBit(context.h264_parameter_set.redundant_pic_count_flag != 0); + writer.WriteBit(context.h264_parameter_set.transform_8x8_mode_flag != 0); + + writer.WriteBit(true); + + for (s32 index = 0; index < 6; index++) { + writer.WriteBit(true); + const auto matrix_x4 = + std::vector<u8>(context.scaling_matrix_4.begin(), context.scaling_matrix_4.end()); + writer.WriteScalingList(matrix_x4, index * 16, 16); + } + + if (context.h264_parameter_set.transform_8x8_mode_flag) { + for (s32 index = 0; index < 2; index++) { + writer.WriteBit(true); + const auto matrix_x8 = std::vector<u8>(context.scaling_matrix_8.begin(), + context.scaling_matrix_8.end()); + + writer.WriteScalingList(matrix_x8, index * 64, 64); + } + } + + s32 chroma_qp_index_offset2 = + static_cast<s32>((context.h264_parameter_set.flags >> 27) & 0x1f); + chroma_qp_index_offset2 = (chroma_qp_index_offset2 << 27) >> 27; + + writer.WriteSe(chroma_qp_index_offset2); + + writer.End(); + + const auto& encoded_header = writer.GetByteArray(); + frame.resize(encoded_header.size() + context.frame_data_size); + std::memcpy(frame.data(), encoded_header.data(), encoded_header.size()); + + gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset, + frame.data() + encoded_header.size(), + context.frame_data_size); + } + + return frame; +} + +H264BitWriter::H264BitWriter() = default; + +H264BitWriter::~H264BitWriter() = default; + +void H264BitWriter::WriteU(s32 value, s32 value_sz) { + WriteBits(value, value_sz); +} + +void H264BitWriter::WriteSe(s32 value) { + 
WriteExpGolombCodedInt(value); +} + +void H264BitWriter::WriteUe(u32 value) { + WriteExpGolombCodedUInt(value); +} + +void H264BitWriter::End() { + WriteBit(true); + Flush(); +} + +void H264BitWriter::WriteBit(bool state) { + WriteBits(state ? 1 : 0, 1); +} + +void H264BitWriter::WriteScalingList(const std::vector<u8>& list, s32 start, s32 count) { + std::vector<u8> scan(count); + if (count == 16) { + std::memcpy(scan.data(), zig_zag_scan.data(), scan.size()); + } else { + std::memcpy(scan.data(), zig_zag_direct.data(), scan.size()); + } + u8 last_scale = 8; + + for (s32 index = 0; index < count; index++) { + const u8 value = list[start + scan[index]]; + const s32 delta_scale = static_cast<s32>(value - last_scale); + + WriteSe(delta_scale); + + last_scale = value; + } +} + +std::vector<u8>& H264BitWriter::GetByteArray() { + return byte_array; +} + +const std::vector<u8>& H264BitWriter::GetByteArray() const { + return byte_array; +} + +void H264BitWriter::WriteBits(s32 value, s32 bit_count) { + s32 value_pos = 0; + + s32 remaining = bit_count; + + while (remaining > 0) { + s32 copy_size = remaining; + + const s32 free_bits = GetFreeBufferBits(); + + if (copy_size > free_bits) { + copy_size = free_bits; + } + + const s32 mask = (1 << copy_size) - 1; + + const s32 src_shift = (bit_count - value_pos) - copy_size; + const s32 dst_shift = (buffer_size - buffer_pos) - copy_size; + + buffer |= ((value >> src_shift) & mask) << dst_shift; + + value_pos += copy_size; + buffer_pos += copy_size; + remaining -= copy_size; + } +} + +void H264BitWriter::WriteExpGolombCodedInt(s32 value) { + const s32 sign = value <= 0 ? 0 : 1; + if (value < 0) { + value = -value; + } + value = (value << 1) - sign; + WriteExpGolombCodedUInt(value); +} + +void H264BitWriter::WriteExpGolombCodedUInt(u32 value) { + const s32 size = 32 - Common::CountLeadingZeroes32(static_cast<s32>(value + 1)); + WriteBits(1, size); + + value -= (1U << (size - 1)) - 1; + WriteBits(static_cast<s32>(value), size - 1); +} + +s32 H264BitWriter::GetFreeBufferBits() { + if (buffer_pos == buffer_size) { + Flush(); + } + + return buffer_size - buffer_pos; +} + +void H264BitWriter::Flush() { + if (buffer_pos == 0) { + return; + } + byte_array.push_back(static_cast<u8>(buffer)); + + buffer = 0; + buffer_pos = 0; +} +} // namespace Tegra::Decoder diff --git a/src/video_core/command_classes/codecs/h264.h b/src/video_core/command_classes/codecs/h264.h new file mode 100644 index 000000000..0f3a1d9f3 --- /dev/null +++ b/src/video_core/command_classes/codecs/h264.h @@ -0,0 +1,118 @@ +// MIT License +// +// Copyright (c) Ryujinx Team and Contributors +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and +// associated documentation files (the "Software"), to deal in the Software without restriction, +// including without limitation the rights to use, copy, modify, merge, publish, distribute, +// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT +// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. 
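A worked sketch of the Exp-Golomb encoding that WriteExpGolombCodedUInt above implements (H.264 clause 9.1): ue(v) is written as size-1 zero bits, a one bit, then the size-1 low bits of v+1, where size is the bit width of v+1. The standalone function below is an editor's restatement for inspection only — the name and std::string return are not part of the patch:

```cpp
#include <bit>
#include <cstdint>
#include <string>

// Renders ue(value) as a string of '0'/'1' characters. Assumes
// value < 0xffffffff so that value + 1 does not wrap.
std::string ExpGolombUe(std::uint32_t value) {
    const int size = static_cast<int>(std::bit_width(value + 1)); // value == 3 -> size == 3
    std::string bits(static_cast<std::size_t>(size - 1), '0');    // size-1 leading zeros
    bits.push_back('1');
    // The remaining size-1 bits are value - (2^(size-1) - 1), i.e. the low
    // bits of value + 1 -- exactly what WriteExpGolombCodedUInt writes.
    const std::uint32_t rest = value - ((1u << (size - 1)) - 1u);
    for (int bit = size - 2; bit >= 0; --bit) {
        bits.push_back(((rest >> bit) & 1u) ? '1' : '0');
    }
    return bits; // ExpGolombUe(0) == "1", ExpGolombUe(3) == "00100"
}
```

WriteExpGolombCodedInt maps a signed value onto this scheme first: positive v becomes 2v-1 and non-positive v becomes -2v, matching se(v) in the specification.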
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// + +#pragma once + +#include <vector> +#include "common/common_funcs.h" +#include "common/common_types.h" +#include "video_core/command_classes/nvdec_common.h" + +namespace Tegra { +class GPU; +namespace Decoder { + +class H264BitWriter { +public: + H264BitWriter(); + ~H264BitWriter(); + + /// The following Write methods are based on clause 9.1 in the H.264 specification. + /// WriteSe and WriteUe write in the Exp-Golomb-coded syntax + void WriteU(s32 value, s32 value_sz); + void WriteSe(s32 value); + void WriteUe(u32 value); + + /// Finalize the bitstream + void End(); + + /// append a bit to the stream, equivalent value to the state parameter + void WriteBit(bool state); + + /// Based on section 7.3.2.1.1.1 and Table 7-4 in the H.264 specification + /// Writes the scaling matrices of the sream + void WriteScalingList(const std::vector<u8>& list, s32 start, s32 count); + + /// Return the bitstream as a vector. + [[nodiscard]] std::vector<u8>& GetByteArray(); + [[nodiscard]] const std::vector<u8>& GetByteArray() const; + +private: + void WriteBits(s32 value, s32 bit_count); + void WriteExpGolombCodedInt(s32 value); + void WriteExpGolombCodedUInt(u32 value); + [[nodiscard]] s32 GetFreeBufferBits(); + void Flush(); + + s32 buffer_size{8}; + + s32 buffer{}; + s32 buffer_pos{}; + std::vector<u8> byte_array; +}; + +class H264 { +public: + explicit H264(GPU& gpu); + ~H264(); + + /// Compose the H264 header of the frame for FFmpeg decoding + [[nodiscard]] const std::vector<u8>& ComposeFrameHeader( + const NvdecCommon::NvdecRegisters& state, bool is_first_frame = false); + +private: + struct H264ParameterSet { + u32 log2_max_pic_order_cnt{}; + u32 delta_pic_order_always_zero_flag{}; + u32 frame_mbs_only_flag{}; + u32 pic_width_in_mbs{}; + u32 pic_height_in_map_units{}; + INSERT_PADDING_WORDS(1); + u32 entropy_coding_mode_flag{}; + u32 bottom_field_pic_order_flag{}; + u32 num_refidx_l0_default_active{}; + u32 num_refidx_l1_default_active{}; + u32 deblocking_filter_control_flag{}; + u32 redundant_pic_count_flag{}; + u32 transform_8x8_mode_flag{}; + INSERT_PADDING_WORDS(9); + u64 flags{}; + u32 frame_number{}; + u32 frame_number2{}; + }; + static_assert(sizeof(H264ParameterSet) == 0x68, "H264ParameterSet is an invalid size"); + + struct H264DecoderContext { + INSERT_PADDING_BYTES(0x48); + u32 frame_data_size{}; + INSERT_PADDING_BYTES(0xc); + H264ParameterSet h264_parameter_set{}; + INSERT_PADDING_BYTES(0x100); + std::array<u8, 0x60> scaling_matrix_4; + std::array<u8, 0x80> scaling_matrix_8; + }; + static_assert(sizeof(H264DecoderContext) == 0x2a0, "H264DecoderContext is an invalid size"); + + std::vector<u8> frame; + GPU& gpu; +}; + +} // namespace Decoder +} // namespace Tegra diff --git a/src/video_core/command_classes/codecs/vp9.cpp b/src/video_core/command_classes/codecs/vp9.cpp new file mode 100644 index 000000000..59e586695 --- /dev/null +++ b/src/video_core/command_classes/codecs/vp9.cpp @@ -0,0 +1,989 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
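One more editor's note on the H.264 parameter structs above: ComposeFrameHeader unpacks most SPS/PPS fields from the packed 64-bit `flags` word, and the quantization offsets come out as narrow signed fields via the `(x << 26) >> 26` pattern. A hedged restatement of that trick — the helper name is invented, and the shift semantics assume C++20:

```cpp
#include <cstdint>

// Extracts a 6-bit field from `flags` at `shift` and sign-extends it, as
// ComposeFrameHeader does for pic_init_qp (the 5-bit chroma QP offsets use
// the same trick with << 27 >> 27). Both shifts are well-defined in C++20.
constexpr std::int32_t SignExtend6(std::uint64_t flags, int shift) {
    const auto raw = static_cast<std::int32_t>((flags >> shift) & 0x3f);
    return (raw << 26) >> 26; // arithmetic right shift pulls bit 5 down as the sign
}
// Example: a raw field of 0b110000 (48) decodes to -16, which is then
// written to the PPS with WriteSe.
```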
+ +#include <cstring> // for std::memcpy +#include <numeric> +#include "video_core/command_classes/codecs/vp9.h" +#include "video_core/gpu.h" +#include "video_core/memory_manager.h" + +namespace Tegra::Decoder { +namespace { +// Default compressed header probabilities once frame context resets +constexpr Vp9EntropyProbs default_probs{ + .y_mode_prob{ + 65, 32, 18, 144, 162, 194, 41, 51, 98, 132, 68, 18, 165, 217, 196, 45, 40, 78, + 173, 80, 19, 176, 240, 193, 64, 35, 46, 221, 135, 38, 194, 248, 121, 96, 85, 29, + }, + .partition_prob{ + 199, 122, 141, 0, 147, 63, 159, 0, 148, 133, 118, 0, 121, 104, 114, 0, + 174, 73, 87, 0, 92, 41, 83, 0, 82, 99, 50, 0, 53, 39, 39, 0, + 177, 58, 59, 0, 68, 26, 63, 0, 52, 79, 25, 0, 17, 14, 12, 0, + 222, 34, 30, 0, 72, 16, 44, 0, 58, 32, 12, 0, 10, 7, 6, 0, + }, + .coef_probs{ + 195, 29, 183, 84, 49, 136, 8, 42, 71, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 31, 107, 169, 35, 99, 159, 17, 82, 140, 8, 66, 114, 2, 44, 76, 1, 19, 32, + 40, 132, 201, 29, 114, 187, 13, 91, 157, 7, 75, 127, 3, 58, 95, 1, 28, 47, + 69, 142, 221, 42, 122, 201, 15, 91, 159, 6, 67, 121, 1, 42, 77, 1, 17, 31, + 102, 148, 228, 67, 117, 204, 17, 82, 154, 6, 59, 114, 2, 39, 75, 1, 15, 29, + 156, 57, 233, 119, 57, 212, 58, 48, 163, 29, 40, 124, 12, 30, 81, 3, 12, 31, + 191, 107, 226, 124, 117, 204, 25, 99, 155, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 29, 148, 210, 37, 126, 194, 8, 93, 157, 2, 68, 118, 1, 39, 69, 1, 17, 33, + 41, 151, 213, 27, 123, 193, 3, 82, 144, 1, 58, 105, 1, 32, 60, 1, 13, 26, + 59, 159, 220, 23, 126, 198, 4, 88, 151, 1, 66, 114, 1, 38, 71, 1, 18, 34, + 114, 136, 232, 51, 114, 207, 11, 83, 155, 3, 56, 105, 1, 33, 65, 1, 17, 34, + 149, 65, 234, 121, 57, 215, 61, 49, 166, 28, 36, 114, 12, 25, 76, 3, 16, 42, + 214, 49, 220, 132, 63, 188, 42, 65, 137, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 85, 137, 221, 104, 131, 216, 49, 111, 192, 21, 87, 155, 2, 49, 87, 1, 16, 28, + 89, 163, 230, 90, 137, 220, 29, 100, 183, 10, 70, 135, 2, 42, 81, 1, 17, 33, + 108, 167, 237, 55, 133, 222, 15, 97, 179, 4, 72, 135, 1, 45, 85, 1, 19, 38, + 124, 146, 240, 66, 124, 224, 17, 88, 175, 4, 58, 122, 1, 36, 75, 1, 18, 37, + 141, 79, 241, 126, 70, 227, 66, 58, 182, 30, 44, 136, 12, 34, 96, 2, 20, 47, + 229, 99, 249, 143, 111, 235, 46, 109, 192, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 82, 158, 236, 94, 146, 224, 25, 117, 191, 9, 87, 149, 3, 56, 99, 1, 33, 57, + 83, 167, 237, 68, 145, 222, 10, 103, 177, 2, 72, 131, 1, 41, 79, 1, 20, 39, + 99, 167, 239, 47, 141, 224, 10, 104, 178, 2, 73, 133, 1, 44, 85, 1, 22, 47, + 127, 145, 243, 71, 129, 228, 17, 93, 177, 3, 61, 124, 1, 41, 84, 1, 21, 52, + 157, 78, 244, 140, 72, 231, 69, 58, 184, 31, 44, 137, 14, 38, 105, 8, 23, 61, + 125, 34, 187, 52, 41, 133, 6, 31, 56, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 37, 109, 153, 51, 102, 147, 23, 87, 128, 8, 67, 101, 1, 41, 63, 1, 19, 29, + 31, 154, 185, 17, 127, 175, 6, 96, 145, 2, 73, 114, 1, 51, 82, 1, 28, 45, + 23, 163, 200, 10, 131, 185, 2, 93, 148, 1, 67, 111, 1, 41, 69, 1, 14, 24, + 29, 176, 217, 12, 145, 201, 3, 101, 156, 1, 69, 111, 1, 39, 63, 1, 14, 23, + 57, 192, 233, 25, 154, 215, 6, 109, 167, 3, 78, 118, 1, 48, 69, 1, 21, 29, + 202, 105, 245, 108, 106, 216, 18, 90, 144, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 33, 172, 219, 64, 149, 206, 14, 117, 177, 5, 90, 141, 2, 61, 95, 1, 37, 57, + 33, 179, 220, 11, 140, 198, 1, 89, 148, 1, 60, 104, 1, 33, 57, 1, 12, 21, + 30, 181, 221, 8, 141, 198, 1, 87, 145, 1, 58, 100, 1, 31, 55, 1, 12, 20, + 32, 186, 224, 7, 142, 198, 1, 86, 143, 1, 58, 100, 1, 31, 55, 1, 12, 22, + 57, 192, 227, 20, 143, 204, 3, 96, 154, 1, 68, 112, 1, 42, 69, 1, 
19, 32, + 212, 35, 215, 113, 47, 169, 29, 48, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 74, 129, 203, 106, 120, 203, 49, 107, 178, 19, 84, 144, 4, 50, 84, 1, 15, 25, + 71, 172, 217, 44, 141, 209, 15, 102, 173, 6, 76, 133, 2, 51, 89, 1, 24, 42, + 64, 185, 231, 31, 148, 216, 8, 103, 175, 3, 74, 131, 1, 46, 81, 1, 18, 30, + 65, 196, 235, 25, 157, 221, 5, 105, 174, 1, 67, 120, 1, 38, 69, 1, 15, 30, + 65, 204, 238, 30, 156, 224, 7, 107, 177, 2, 70, 124, 1, 42, 73, 1, 18, 34, + 225, 86, 251, 144, 104, 235, 42, 99, 181, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 85, 175, 239, 112, 165, 229, 29, 136, 200, 12, 103, 162, 6, 77, 123, 2, 53, 84, + 75, 183, 239, 30, 155, 221, 3, 106, 171, 1, 74, 128, 1, 44, 76, 1, 17, 28, + 73, 185, 240, 27, 159, 222, 2, 107, 172, 1, 75, 127, 1, 42, 73, 1, 17, 29, + 62, 190, 238, 21, 159, 222, 2, 107, 172, 1, 72, 122, 1, 40, 71, 1, 18, 32, + 61, 199, 240, 27, 161, 226, 4, 113, 180, 1, 76, 129, 1, 46, 80, 1, 23, 41, + 7, 27, 153, 5, 30, 95, 1, 16, 30, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 50, 75, 127, 57, 75, 124, 27, 67, 108, 10, 54, 86, 1, 33, 52, 1, 12, 18, + 43, 125, 151, 26, 108, 148, 7, 83, 122, 2, 59, 89, 1, 38, 60, 1, 17, 27, + 23, 144, 163, 13, 112, 154, 2, 75, 117, 1, 50, 81, 1, 31, 51, 1, 14, 23, + 18, 162, 185, 6, 123, 171, 1, 78, 125, 1, 51, 86, 1, 31, 54, 1, 14, 23, + 15, 199, 227, 3, 150, 204, 1, 91, 146, 1, 55, 95, 1, 30, 53, 1, 11, 20, + 19, 55, 240, 19, 59, 196, 3, 52, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 41, 166, 207, 104, 153, 199, 31, 123, 181, 14, 101, 152, 5, 72, 106, 1, 36, 52, + 35, 176, 211, 12, 131, 190, 2, 88, 144, 1, 60, 101, 1, 36, 60, 1, 16, 28, + 28, 183, 213, 8, 134, 191, 1, 86, 142, 1, 56, 96, 1, 30, 53, 1, 12, 20, + 20, 190, 215, 4, 135, 192, 1, 84, 139, 1, 53, 91, 1, 28, 49, 1, 11, 20, + 13, 196, 216, 2, 137, 192, 1, 86, 143, 1, 57, 99, 1, 32, 56, 1, 13, 24, + 211, 29, 217, 96, 47, 156, 22, 43, 87, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 78, 120, 193, 111, 116, 186, 46, 102, 164, 15, 80, 128, 2, 49, 76, 1, 18, 28, + 71, 161, 203, 42, 132, 192, 10, 98, 150, 3, 69, 109, 1, 44, 70, 1, 18, 29, + 57, 186, 211, 30, 140, 196, 4, 93, 146, 1, 62, 102, 1, 38, 65, 1, 16, 27, + 47, 199, 217, 14, 145, 196, 1, 88, 142, 1, 57, 98, 1, 36, 62, 1, 15, 26, + 26, 219, 229, 5, 155, 207, 1, 94, 151, 1, 60, 104, 1, 36, 62, 1, 16, 28, + 233, 29, 248, 146, 47, 220, 43, 52, 140, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 100, 163, 232, 179, 161, 222, 63, 142, 204, 37, 113, 174, 26, 89, 137, 18, 68, 97, + 85, 181, 230, 32, 146, 209, 7, 100, 164, 3, 71, 121, 1, 45, 77, 1, 18, 30, + 65, 187, 230, 20, 148, 207, 2, 97, 159, 1, 68, 116, 1, 40, 70, 1, 14, 29, + 40, 194, 227, 8, 147, 204, 1, 94, 155, 1, 65, 112, 1, 39, 66, 1, 14, 26, + 16, 208, 228, 3, 151, 207, 1, 98, 160, 1, 67, 117, 1, 41, 74, 1, 17, 31, + 17, 38, 140, 7, 34, 80, 1, 17, 29, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 37, 75, 128, 41, 76, 128, 26, 66, 116, 12, 52, 94, 2, 32, 55, 1, 10, 16, + 50, 127, 154, 37, 109, 152, 16, 82, 121, 5, 59, 85, 1, 35, 54, 1, 13, 20, + 40, 142, 167, 17, 110, 157, 2, 71, 112, 1, 44, 72, 1, 27, 45, 1, 11, 17, + 30, 175, 188, 9, 124, 169, 1, 74, 116, 1, 48, 78, 1, 30, 49, 1, 11, 18, + 10, 222, 223, 2, 150, 194, 1, 83, 128, 1, 48, 79, 1, 27, 45, 1, 11, 17, + 36, 41, 235, 29, 36, 193, 10, 27, 111, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 85, 165, 222, 177, 162, 215, 110, 135, 195, 57, 113, 168, 23, 83, 120, 10, 49, 61, + 85, 190, 223, 36, 139, 200, 5, 90, 146, 1, 60, 103, 1, 38, 65, 1, 18, 30, + 72, 202, 223, 23, 141, 199, 2, 86, 140, 1, 56, 97, 1, 36, 61, 1, 16, 27, + 55, 218, 225, 13, 145, 200, 1, 86, 141, 1, 57, 99, 1, 35, 61, 1, 13, 22, + 15, 235, 212, 
1, 132, 184, 1, 84, 139, 1, 57, 97, 1, 34, 56, 1, 14, 23, + 181, 21, 201, 61, 37, 123, 10, 38, 71, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 47, 106, 172, 95, 104, 173, 42, 93, 159, 18, 77, 131, 4, 50, 81, 1, 17, 23, + 62, 147, 199, 44, 130, 189, 28, 102, 154, 18, 75, 115, 2, 44, 65, 1, 12, 19, + 55, 153, 210, 24, 130, 194, 3, 93, 146, 1, 61, 97, 1, 31, 50, 1, 10, 16, + 49, 186, 223, 17, 148, 204, 1, 96, 142, 1, 53, 83, 1, 26, 44, 1, 11, 17, + 13, 217, 212, 2, 136, 180, 1, 78, 124, 1, 50, 83, 1, 29, 49, 1, 14, 23, + 197, 13, 247, 82, 17, 222, 25, 17, 162, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 126, 186, 247, 234, 191, 243, 176, 177, 234, 104, 158, 220, 66, 128, 186, 55, 90, 137, + 111, 197, 242, 46, 158, 219, 9, 104, 171, 2, 65, 125, 1, 44, 80, 1, 17, 91, + 104, 208, 245, 39, 168, 224, 3, 109, 162, 1, 79, 124, 1, 50, 102, 1, 43, 102, + 84, 220, 246, 31, 177, 231, 2, 115, 180, 1, 79, 134, 1, 55, 77, 1, 60, 79, + 43, 243, 240, 8, 180, 217, 1, 115, 166, 1, 84, 121, 1, 51, 67, 1, 16, 6, + }, + .switchable_interp_prob{235, 162, 36, 255, 34, 3, 149, 144}, + .inter_mode_prob{ + 2, 173, 34, 0, 7, 145, 85, 0, 7, 166, 63, 0, 7, 94, + 66, 0, 8, 64, 46, 0, 17, 81, 31, 0, 25, 29, 30, 0, + }, + .intra_inter_prob{9, 102, 187, 225}, + .comp_inter_prob{9, 102, 187, 225, 0}, + .single_ref_prob{33, 16, 77, 74, 142, 142, 172, 170, 238, 247}, + .comp_ref_prob{50, 126, 123, 221, 226}, + .tx_32x32_prob{3, 136, 37, 5, 52, 13}, + .tx_16x16_prob{20, 152, 15, 101}, + .tx_8x8_prob{100, 66}, + .skip_probs{192, 128, 64}, + .joints{32, 64, 96}, + .sign{128, 128}, + .classes{ + 224, 144, 192, 168, 192, 176, 192, 198, 198, 245, + 216, 128, 176, 160, 176, 176, 192, 198, 198, 208, + }, + .class_0{216, 208}, + .prob_bits{ + 136, 140, 148, 160, 176, 192, 224, 234, 234, 240, + 136, 140, 148, 160, 176, 192, 224, 234, 234, 240, + }, + .class_0_fr{128, 128, 64, 96, 112, 64, 128, 128, 64, 96, 112, 64}, + .fr{64, 96, 64, 64, 96, 64}, + .class_0_hp{160, 160}, + .high_precision{128, 128}, +}; + +constexpr std::array<s32, 256> norm_lut{ + 0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +constexpr std::array<s32, 254> map_lut{ + 20, 21, 22, 23, 24, 25, 0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, + 1, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 2, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 3, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, + 73, 4, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 5, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 6, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, + 108, 109, 7, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 8, 122, 123, 124, + 125, 126, 127, 128, 129, 130, 131, 132, 133, 9, 134, 135, 136, 137, 138, 139, 140, 141, 142, + 143, 144, 145, 10, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 11, 158, 159, + 160, 161, 162, 163, 164, 165, 166, 167, 
168, 169, 12, 170, 171, 172, 173, 174, 175, 176, 177, + 178, 179, 180, 181, 13, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 14, 194, + 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 15, 206, 207, 208, 209, 210, 211, 212, + 213, 214, 215, 216, 217, 16, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 17, + 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 18, 242, 243, 244, 245, 246, 247, + 248, 249, 250, 251, 252, 253, 19, +}; + +// 6.2.14 Tile size calculation + +[[nodiscard]] s32 CalcMinLog2TileCols(s32 frame_width) { + const s32 sb64_cols = (frame_width + 63) / 64; + s32 min_log2 = 0; + + while ((64 << min_log2) < sb64_cols) { + min_log2++; + } + + return min_log2; +} + +[[nodiscard]] s32 CalcMaxLog2TileCols(s32 frame_width) { + const s32 sb64_cols = (frame_width + 63) / 64; + s32 max_log2 = 1; + + while ((sb64_cols >> max_log2) >= 4) { + max_log2++; + } + + return max_log2 - 1; +} + +// Recenters probability. Based on section 6.3.6 of VP9 Specification +[[nodiscard]] s32 RecenterNonNeg(s32 new_prob, s32 old_prob) { + if (new_prob > old_prob * 2) { + return new_prob; + } + + if (new_prob >= old_prob) { + return (new_prob - old_prob) * 2; + } + + return (old_prob - new_prob) * 2 - 1; +} + +// Adjusts old_prob depending on new_prob. Based on section 6.3.5 of VP9 Specification +[[nodiscard]] s32 RemapProbability(s32 new_prob, s32 old_prob) { + new_prob--; + old_prob--; + + std::size_t index{}; + + if (old_prob * 2 <= 0xff) { + index = static_cast<std::size_t>(std::max(0, RecenterNonNeg(new_prob, old_prob) - 1)); + } else { + index = static_cast<std::size_t>( + std::max(0, RecenterNonNeg(0xff - 1 - new_prob, 0xff - 1 - old_prob) - 1)); + } + + return map_lut[index]; +} +} // Anonymous namespace + +VP9::VP9(GPU& gpu_) : gpu{gpu_} {} + +VP9::~VP9() = default; + +void VP9::WriteProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob) { + const bool update = new_prob != old_prob; + + writer.Write(update, diff_update_probability); + + if (update) { + WriteProbabilityDelta(writer, new_prob, old_prob); + } +} +template <typename T, std::size_t N> +void VP9::WriteProbabilityUpdate(VpxRangeEncoder& writer, const std::array<T, N>& new_prob, + const std::array<T, N>& old_prob) { + for (std::size_t offset = 0; offset < new_prob.size(); ++offset) { + WriteProbabilityUpdate(writer, new_prob[offset], old_prob[offset]); + } +} + +template <typename T, std::size_t N> +void VP9::WriteProbabilityUpdateAligned4(VpxRangeEncoder& writer, const std::array<T, N>& new_prob, + const std::array<T, N>& old_prob) { + for (std::size_t offset = 0; offset < new_prob.size(); offset += 4) { + WriteProbabilityUpdate(writer, new_prob[offset + 0], old_prob[offset + 0]); + WriteProbabilityUpdate(writer, new_prob[offset + 1], old_prob[offset + 1]); + WriteProbabilityUpdate(writer, new_prob[offset + 2], old_prob[offset + 2]); + } +} + +void VP9::WriteProbabilityDelta(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob) { + const int delta = RemapProbability(new_prob, old_prob); + + EncodeTermSubExp(writer, delta); +} + +void VP9::EncodeTermSubExp(VpxRangeEncoder& writer, s32 value) { + if (WriteLessThan(writer, value, 16)) { + writer.Write(value, 4); + } else if (WriteLessThan(writer, value, 32)) { + writer.Write(value - 16, 4); + } else if (WriteLessThan(writer, value, 64)) { + writer.Write(value - 32, 5); + } else { + value -= 64; + + constexpr s32 size = 8; + + const s32 mask = (1 << size) - 191; + + const s32 delta = value - mask; + + if (delta < 0) { + 
+            writer.Write(value, size - 1);
+        } else {
+            writer.Write(delta / 2 + mask, size - 1);
+            writer.Write(delta & 1, 1);
+        }
+    }
+}
+
+bool VP9::WriteLessThan(VpxRangeEncoder& writer, s32 value, s32 test) {
+    const bool is_lt = value < test;
+    writer.Write(!is_lt);
+    return is_lt;
+}
+
+void VP9::WriteCoefProbabilityUpdate(VpxRangeEncoder& writer, s32 tx_mode,
+                                     const std::array<u8, 1728>& new_prob,
+                                     const std::array<u8, 1728>& old_prob) {
+    constexpr u32 block_bytes = 2 * 2 * 6 * 6 * 3;
+
+    const auto needs_update = [&](u32 base_index) {
+        return !std::equal(new_prob.begin() + base_index,
+                           new_prob.begin() + base_index + block_bytes,
+                           old_prob.begin() + base_index);
+    };
+
+    for (u32 block_index = 0; block_index < 4; block_index++) {
+        const u32 base_index = block_index * block_bytes;
+        const bool update = needs_update(base_index);
+        writer.Write(update);
+
+        if (update) {
+            u32 index = base_index;
+            for (s32 i = 0; i < 2; i++) {
+                for (s32 j = 0; j < 2; j++) {
+                    for (s32 k = 0; k < 6; k++) {
+                        for (s32 l = 0; l < 6; l++) {
+                            if (k != 0 || l < 3) {
+                                WriteProbabilityUpdate(writer, new_prob[index + 0],
+                                                       old_prob[index + 0]);
+                                WriteProbabilityUpdate(writer, new_prob[index + 1],
+                                                       old_prob[index + 1]);
+                                WriteProbabilityUpdate(writer, new_prob[index + 2],
+                                                       old_prob[index + 2]);
+                            }
+                            index += 3;
+                        }
+                    }
+                }
+            }
+        }
+        if (block_index == static_cast<u32>(tx_mode)) {
+            break;
+        }
+    }
+}
+
+void VP9::WriteMvProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob) {
+    const bool update = new_prob != old_prob;
+    writer.Write(update, diff_update_probability);
+
+    if (update) {
+        writer.Write(new_prob >> 1, 7);
+    }
+}
+
+Vp9PictureInfo VP9::GetVp9PictureInfo(const NvdecCommon::NvdecRegisters& state) {
+    PictureInfo picture_info{};
+    gpu.MemoryManager().ReadBlock(state.picture_info_offset, &picture_info, sizeof(PictureInfo));
+    Vp9PictureInfo vp9_info = picture_info.Convert();
+
+    InsertEntropy(state.vp9_entropy_probs_offset, vp9_info.entropy);
+
+    // surface_luma_offset[0:3] holds the luma addresses of the reference frames in the
+    // following order: last, golden, altref, current. It may be worthwhile to track the
+    // updates done here to avoid buffering frame data needed for reference frame updating
+    // in the header composition.
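+    // [Editor's note] These offsets are later compared against next_frame.info.frame_offsets
+    // in ComposeUncompressedHeader() to detect reference refreshes; e.g. a change in
+    // frame_offsets[1] indicates that the golden slot was rewritten by the guest decoder.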
+ std::memcpy(vp9_info.frame_offsets.data(), state.surface_luma_offset.data(), 4 * sizeof(u64)); + + return vp9_info; +} + +void VP9::InsertEntropy(u64 offset, Vp9EntropyProbs& dst) { + EntropyProbs entropy{}; + gpu.MemoryManager().ReadBlock(offset, &entropy, sizeof(EntropyProbs)); + entropy.Convert(dst); +} + +Vp9FrameContainer VP9::GetCurrentFrame(const NvdecCommon::NvdecRegisters& state) { + Vp9FrameContainer current_frame{}; + { + gpu.SyncGuestHost(); + current_frame.info = GetVp9PictureInfo(state); + current_frame.bit_stream.resize(current_frame.info.bitstream_size); + gpu.MemoryManager().ReadBlock(state.frame_bitstream_offset, current_frame.bit_stream.data(), + current_frame.info.bitstream_size); + } + // Buffer two frames, saving the last show frame info + if (!next_next_frame.bit_stream.empty()) { + Vp9FrameContainer temp{ + .info = current_frame.info, + .bit_stream = std::move(current_frame.bit_stream), + }; + next_next_frame.info.show_frame = current_frame.info.last_frame_shown; + current_frame.info = next_next_frame.info; + current_frame.bit_stream = std::move(next_next_frame.bit_stream); + next_next_frame = std::move(temp); + + if (!next_frame.bit_stream.empty()) { + Vp9FrameContainer temp2{ + .info = current_frame.info, + .bit_stream = std::move(current_frame.bit_stream), + }; + next_frame.info.show_frame = current_frame.info.last_frame_shown; + current_frame.info = next_frame.info; + current_frame.bit_stream = std::move(next_frame.bit_stream); + next_frame = std::move(temp2); + } else { + next_frame.info = current_frame.info; + next_frame.bit_stream = std::move(current_frame.bit_stream); + } + } else { + next_next_frame.info = current_frame.info; + next_next_frame.bit_stream = std::move(current_frame.bit_stream); + } + return current_frame; +} + +std::vector<u8> VP9::ComposeCompressedHeader() { + VpxRangeEncoder writer{}; + const bool update_probs = current_frame_info.show_frame && !current_frame_info.is_key_frame; + if (!current_frame_info.lossless) { + if (static_cast<u32>(current_frame_info.transform_mode) >= 3) { + writer.Write(3, 2); + writer.Write(current_frame_info.transform_mode == 4); + } else { + writer.Write(current_frame_info.transform_mode, 2); + } + } + + if (current_frame_info.transform_mode == 4) { + // tx_mode_probs() in the spec + WriteProbabilityUpdate(writer, current_frame_info.entropy.tx_8x8_prob, + prev_frame_probs.tx_8x8_prob); + WriteProbabilityUpdate(writer, current_frame_info.entropy.tx_16x16_prob, + prev_frame_probs.tx_16x16_prob); + WriteProbabilityUpdate(writer, current_frame_info.entropy.tx_32x32_prob, + prev_frame_probs.tx_32x32_prob); + if (update_probs) { + prev_frame_probs.tx_8x8_prob = current_frame_info.entropy.tx_8x8_prob; + prev_frame_probs.tx_16x16_prob = current_frame_info.entropy.tx_16x16_prob; + prev_frame_probs.tx_32x32_prob = current_frame_info.entropy.tx_32x32_prob; + } + } + // read_coef_probs() in the spec + WriteCoefProbabilityUpdate(writer, current_frame_info.transform_mode, + current_frame_info.entropy.coef_probs, prev_frame_probs.coef_probs); + // read_skip_probs() in the spec + WriteProbabilityUpdate(writer, current_frame_info.entropy.skip_probs, + prev_frame_probs.skip_probs); + + if (update_probs) { + prev_frame_probs.coef_probs = current_frame_info.entropy.coef_probs; + prev_frame_probs.skip_probs = current_frame_info.entropy.skip_probs; + } + + if (!current_frame_info.intra_only) { + // read_inter_probs() in the spec + WriteProbabilityUpdateAligned4(writer, current_frame_info.entropy.inter_mode_prob, + 
prev_frame_probs.inter_mode_prob); + + if (current_frame_info.interp_filter == 4) { + // read_interp_filter_probs() in the spec + WriteProbabilityUpdate(writer, current_frame_info.entropy.switchable_interp_prob, + prev_frame_probs.switchable_interp_prob); + if (update_probs) { + prev_frame_probs.switchable_interp_prob = + current_frame_info.entropy.switchable_interp_prob; + } + } + + // read_is_inter_probs() in the spec + WriteProbabilityUpdate(writer, current_frame_info.entropy.intra_inter_prob, + prev_frame_probs.intra_inter_prob); + + // frame_reference_mode() in the spec + if ((current_frame_info.ref_frame_sign_bias[1] & 1) != + (current_frame_info.ref_frame_sign_bias[2] & 1) || + (current_frame_info.ref_frame_sign_bias[1] & 1) != + (current_frame_info.ref_frame_sign_bias[3] & 1)) { + if (current_frame_info.reference_mode >= 1) { + writer.Write(1, 1); + writer.Write(current_frame_info.reference_mode == 2); + } else { + writer.Write(0, 1); + } + } + + // frame_reference_mode_probs() in the spec + if (current_frame_info.reference_mode == 2) { + WriteProbabilityUpdate(writer, current_frame_info.entropy.comp_inter_prob, + prev_frame_probs.comp_inter_prob); + if (update_probs) { + prev_frame_probs.comp_inter_prob = current_frame_info.entropy.comp_inter_prob; + } + } + + if (current_frame_info.reference_mode != 1) { + WriteProbabilityUpdate(writer, current_frame_info.entropy.single_ref_prob, + prev_frame_probs.single_ref_prob); + if (update_probs) { + prev_frame_probs.single_ref_prob = current_frame_info.entropy.single_ref_prob; + } + } + + if (current_frame_info.reference_mode != 0) { + WriteProbabilityUpdate(writer, current_frame_info.entropy.comp_ref_prob, + prev_frame_probs.comp_ref_prob); + if (update_probs) { + prev_frame_probs.comp_ref_prob = current_frame_info.entropy.comp_ref_prob; + } + } + + // read_y_mode_probs + for (std::size_t index = 0; index < current_frame_info.entropy.y_mode_prob.size(); + ++index) { + WriteProbabilityUpdate(writer, current_frame_info.entropy.y_mode_prob[index], + prev_frame_probs.y_mode_prob[index]); + } + + // read_partition_probs + WriteProbabilityUpdateAligned4(writer, current_frame_info.entropy.partition_prob, + prev_frame_probs.partition_prob); + + // mv_probs + for (s32 i = 0; i < 3; i++) { + WriteMvProbabilityUpdate(writer, current_frame_info.entropy.joints[i], + prev_frame_probs.joints[i]); + } + if (update_probs) { + prev_frame_probs.inter_mode_prob = current_frame_info.entropy.inter_mode_prob; + prev_frame_probs.intra_inter_prob = current_frame_info.entropy.intra_inter_prob; + prev_frame_probs.y_mode_prob = current_frame_info.entropy.y_mode_prob; + prev_frame_probs.partition_prob = current_frame_info.entropy.partition_prob; + prev_frame_probs.joints = current_frame_info.entropy.joints; + } + + for (s32 i = 0; i < 2; i++) { + WriteMvProbabilityUpdate(writer, current_frame_info.entropy.sign[i], + prev_frame_probs.sign[i]); + for (s32 j = 0; j < 10; j++) { + const int index = i * 10 + j; + WriteMvProbabilityUpdate(writer, current_frame_info.entropy.classes[index], + prev_frame_probs.classes[index]); + } + WriteMvProbabilityUpdate(writer, current_frame_info.entropy.class_0[i], + prev_frame_probs.class_0[i]); + + for (s32 j = 0; j < 10; j++) { + const int index = i * 10 + j; + WriteMvProbabilityUpdate(writer, current_frame_info.entropy.prob_bits[index], + prev_frame_probs.prob_bits[index]); + } + } + + for (s32 i = 0; i < 2; i++) { + for (s32 j = 0; j < 2; j++) { + for (s32 k = 0; k < 3; k++) { + const int index = i * 2 * 3 + j * 3 + k; + 
                    WriteMvProbabilityUpdate(writer, current_frame_info.entropy.class_0_fr[index],
+                                             prev_frame_probs.class_0_fr[index]);
+                }
+            }
+
+            for (s32 j = 0; j < 3; j++) {
+                const int index = i * 3 + j;
+                WriteMvProbabilityUpdate(writer, current_frame_info.entropy.fr[index],
+                                         prev_frame_probs.fr[index]);
+            }
+        }
+
+        if (current_frame_info.allow_high_precision_mv) {
+            for (s32 index = 0; index < 2; index++) {
+                WriteMvProbabilityUpdate(writer, current_frame_info.entropy.class_0_hp[index],
+                                         prev_frame_probs.class_0_hp[index]);
+                WriteMvProbabilityUpdate(writer, current_frame_info.entropy.high_precision[index],
+                                         prev_frame_probs.high_precision[index]);
+            }
+        }
+
+        // Save the previous probabilities
+        if (update_probs) {
+            prev_frame_probs.sign = current_frame_info.entropy.sign;
+            prev_frame_probs.classes = current_frame_info.entropy.classes;
+            prev_frame_probs.class_0 = current_frame_info.entropy.class_0;
+            prev_frame_probs.prob_bits = current_frame_info.entropy.prob_bits;
+            prev_frame_probs.class_0_fr = current_frame_info.entropy.class_0_fr;
+            prev_frame_probs.fr = current_frame_info.entropy.fr;
+            prev_frame_probs.class_0_hp = current_frame_info.entropy.class_0_hp;
+            prev_frame_probs.high_precision = current_frame_info.entropy.high_precision;
+        }
+    }
+    writer.End();
+    return writer.GetBuffer();
+}
+
+VpxBitStreamWriter VP9::ComposeUncompressedHeader() {
+    VpxBitStreamWriter uncomp_writer{};
+
+    uncomp_writer.WriteU(2, 2);                                       // Frame marker.
+    uncomp_writer.WriteU(0, 2);                                       // Profile.
+    uncomp_writer.WriteBit(false);                                    // Show existing frame.
+    uncomp_writer.WriteBit(!current_frame_info.is_key_frame);        // Is key frame?
+    uncomp_writer.WriteBit(current_frame_info.show_frame);           // Show frame?
+    uncomp_writer.WriteBit(current_frame_info.error_resilient_mode); // Error resilient mode.
+
+    if (current_frame_info.is_key_frame) {
+        uncomp_writer.WriteU(frame_sync_code, 24);
+        uncomp_writer.WriteU(0, 3); // Color space.
+        uncomp_writer.WriteU(0, 1); // Color range.
+        uncomp_writer.WriteU(current_frame_info.frame_size.width - 1, 16);
+        uncomp_writer.WriteU(current_frame_info.frame_size.height - 1, 16);
+        uncomp_writer.WriteBit(false); // Render and frame size different.
+
+        // Reset context
+        prev_frame_probs = default_probs;
+        swap_next_golden = false;
+        loop_filter_ref_deltas.fill(0);
+        loop_filter_mode_deltas.fill(0);
+
+        // Allow frame offsets to stabilize before checking for golden frames
+        grace_period = 4;
+
+        // On key frames, all frame slots are set to the current frame,
+        // so the value of the selected slot doesn't really matter.
+        frame_ctxs.fill({current_frame_number, false, default_probs});
+
+        // Intra-only, meaning the frame can be recreated with no other references
+        current_frame_info.intra_only = true;
+
+    } else {
+
+        if (!current_frame_info.show_frame) {
+            uncomp_writer.WriteBit(current_frame_info.intra_only);
+            if (!current_frame_info.last_frame_was_key) {
+                swap_next_golden = !swap_next_golden;
+            }
+        } else {
+            current_frame_info.intra_only = false;
+        }
+        if (!current_frame_info.error_resilient_mode) {
+            uncomp_writer.WriteU(0, 2); // Reset frame context.
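+            // [Editor's note] A spec-based reading (not verified against hardware): writing 0
+            // selects "do not reset" for reset_frame_context, so the probabilities saved in
+            // frame_ctxs stay valid from frame to frame.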
+        }
+
+        // Last, Golden, Altref frames
+        std::array<s32, 3> ref_frame_index{0, 1, 2};
+
+        // Set when the next frame is hidden; in that case the altref and golden
+        // references are swapped.
+        if (swap_next_golden) {
+            ref_frame_index = std::array<s32, 3>{0, 2, 1};
+        }
+
+        // Update the last frame slot
+        u64 refresh_frame_flags = 1;
+
+        // The golden frame may refresh; this is detected when the next golden frame offset changes
+        bool golden_refresh = false;
+        if (grace_period <= 0) {
+            for (s32 index = 1; index < 3; ++index) {
+                if (current_frame_info.frame_offsets[index] !=
+                    next_frame.info.frame_offsets[index]) {
+                    current_frame_info.refresh_frame[index] = true;
+                    golden_refresh = true;
+                    grace_period = 3;
+                }
+            }
+        }
+
+        if (current_frame_info.show_frame &&
+            (!next_frame.info.show_frame || next_frame.info.is_key_frame)) {
+            // Update golden frame
+            refresh_frame_flags = swap_next_golden ? 2 : 4;
+        }
+
+        if (!current_frame_info.show_frame) {
+            // Update altref
+            refresh_frame_flags = swap_next_golden ? 2 : 4;
+        } else if (golden_refresh) {
+            refresh_frame_flags = 3;
+        }
+
+        if (current_frame_info.intra_only) {
+            uncomp_writer.WriteU(frame_sync_code, 24);
+            uncomp_writer.WriteU(static_cast<s32>(refresh_frame_flags), 8);
+            uncomp_writer.WriteU(current_frame_info.frame_size.width - 1, 16);
+            uncomp_writer.WriteU(current_frame_info.frame_size.height - 1, 16);
+            uncomp_writer.WriteBit(false); // Render and frame size different.
+        } else {
+            uncomp_writer.WriteU(static_cast<s32>(refresh_frame_flags), 8);
+
+            for (s32 index = 1; index < 4; index++) {
+                uncomp_writer.WriteU(ref_frame_index[index - 1], 3);
+                uncomp_writer.WriteU(current_frame_info.ref_frame_sign_bias[index], 1);
+            }
+
+            uncomp_writer.WriteBit(true);  // Frame size with refs.
+            uncomp_writer.WriteBit(false); // Render and frame size different.
+            uncomp_writer.WriteBit(current_frame_info.allow_high_precision_mv);
+            uncomp_writer.WriteBit(current_frame_info.interp_filter == 4);
+
+            if (current_frame_info.interp_filter != 4) {
+                uncomp_writer.WriteU(current_frame_info.interp_filter, 2);
+            }
+        }
+    }
+
+    if (!current_frame_info.error_resilient_mode) {
+        uncomp_writer.WriteBit(true); // Refresh frame context. TODO: determine where this flag comes from.
+        uncomp_writer.WriteBit(true); // Frame parallel decoding mode.
+    }
+
+    int frame_ctx_idx = 0;
+    if (!current_frame_info.show_frame) {
+        frame_ctx_idx = 1;
+    }
+
+    uncomp_writer.WriteU(frame_ctx_idx, 2); // Frame context index.
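+    // [Editor's note] Hidden frames appear to be steered to context slot 1 and shown frames
+    // to slot 0, so a hidden frame's updates do not clobber the context the next shown
+    // frame diffs against.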
+ prev_frame_probs = + frame_ctxs[frame_ctx_idx].probs; // reference probabilities for compressed header + frame_ctxs[frame_ctx_idx] = {current_frame_number, false, current_frame_info.entropy}; + + uncomp_writer.WriteU(current_frame_info.first_level, 6); + uncomp_writer.WriteU(current_frame_info.sharpness_level, 3); + uncomp_writer.WriteBit(current_frame_info.mode_ref_delta_enabled); + + if (current_frame_info.mode_ref_delta_enabled) { + // check if ref deltas are different, update accordingly + std::array<bool, 4> update_loop_filter_ref_deltas; + std::array<bool, 2> update_loop_filter_mode_deltas; + + bool loop_filter_delta_update = false; + + for (std::size_t index = 0; index < current_frame_info.ref_deltas.size(); index++) { + const s8 old_deltas = loop_filter_ref_deltas[index]; + const s8 new_deltas = current_frame_info.ref_deltas[index]; + const bool differing_delta = old_deltas != new_deltas; + + update_loop_filter_ref_deltas[index] = differing_delta; + loop_filter_delta_update |= differing_delta; + } + + for (std::size_t index = 0; index < current_frame_info.mode_deltas.size(); index++) { + const s8 old_deltas = loop_filter_mode_deltas[index]; + const s8 new_deltas = current_frame_info.mode_deltas[index]; + const bool differing_delta = old_deltas != new_deltas; + + update_loop_filter_mode_deltas[index] = differing_delta; + loop_filter_delta_update |= differing_delta; + } + + uncomp_writer.WriteBit(loop_filter_delta_update); + + if (loop_filter_delta_update) { + for (std::size_t index = 0; index < current_frame_info.ref_deltas.size(); index++) { + uncomp_writer.WriteBit(update_loop_filter_ref_deltas[index]); + + if (update_loop_filter_ref_deltas[index]) { + uncomp_writer.WriteS(current_frame_info.ref_deltas[index], 6); + } + } + + for (std::size_t index = 0; index < current_frame_info.mode_deltas.size(); index++) { + uncomp_writer.WriteBit(update_loop_filter_mode_deltas[index]); + + if (update_loop_filter_mode_deltas[index]) { + uncomp_writer.WriteS(current_frame_info.mode_deltas[index], 6); + } + } + // save new deltas + loop_filter_ref_deltas = current_frame_info.ref_deltas; + loop_filter_mode_deltas = current_frame_info.mode_deltas; + } + } + + uncomp_writer.WriteU(current_frame_info.base_q_index, 8); + + uncomp_writer.WriteDeltaQ(current_frame_info.y_dc_delta_q); + uncomp_writer.WriteDeltaQ(current_frame_info.uv_dc_delta_q); + uncomp_writer.WriteDeltaQ(current_frame_info.uv_ac_delta_q); + + uncomp_writer.WriteBit(false); // Segmentation enabled (TODO). + + const s32 min_tile_cols_log2 = CalcMinLog2TileCols(current_frame_info.frame_size.width); + const s32 max_tile_cols_log2 = CalcMaxLog2TileCols(current_frame_info.frame_size.width); + + const s32 tile_cols_log2_diff = current_frame_info.log2_tile_cols - min_tile_cols_log2; + const s32 tile_cols_log2_inc_mask = (1 << tile_cols_log2_diff) - 1; + + // If it's less than the maximum, we need to add an extra 0 on the bitstream + // to indicate that it should stop reading. 
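+    // [Editor's worked example] With min_log2 = 0, log2_tile_cols = 2 and a larger maximum:
+    // diff = 2, mask = 0b11, and WriteU(0b110, 3) emits two 1-bits (two increments) followed
+    // by the terminating 0-bit.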
+ if (current_frame_info.log2_tile_cols < max_tile_cols_log2) { + uncomp_writer.WriteU(tile_cols_log2_inc_mask << 1, tile_cols_log2_diff + 1); + } else { + uncomp_writer.WriteU(tile_cols_log2_inc_mask, tile_cols_log2_diff); + } + + const bool tile_rows_log2_is_nonzero = current_frame_info.log2_tile_rows != 0; + + uncomp_writer.WriteBit(tile_rows_log2_is_nonzero); + + if (tile_rows_log2_is_nonzero) { + uncomp_writer.WriteBit(current_frame_info.log2_tile_rows > 1); + } + + return uncomp_writer; +} + +const std::vector<u8>& VP9::ComposeFrameHeader(const NvdecCommon::NvdecRegisters& state) { + std::vector<u8> bitstream; + { + Vp9FrameContainer curr_frame = GetCurrentFrame(state); + current_frame_info = curr_frame.info; + bitstream = std::move(curr_frame.bit_stream); + } + + // The uncompressed header routine sets PrevProb parameters needed for the compressed header + auto uncomp_writer = ComposeUncompressedHeader(); + std::vector<u8> compressed_header = ComposeCompressedHeader(); + + uncomp_writer.WriteU(static_cast<s32>(compressed_header.size()), 16); + uncomp_writer.Flush(); + std::vector<u8> uncompressed_header = uncomp_writer.GetByteArray(); + + // Write headers and frame to buffer + frame.resize(uncompressed_header.size() + compressed_header.size() + bitstream.size()); + std::memcpy(frame.data(), uncompressed_header.data(), uncompressed_header.size()); + std::memcpy(frame.data() + uncompressed_header.size(), compressed_header.data(), + compressed_header.size()); + std::memcpy(frame.data() + uncompressed_header.size() + compressed_header.size(), + bitstream.data(), bitstream.size()); + + // keep track of frame number + current_frame_number++; + grace_period--; + + // don't display hidden frames + hidden = !current_frame_info.show_frame; + return frame; +} + +VpxRangeEncoder::VpxRangeEncoder() { + Write(false); +} + +VpxRangeEncoder::~VpxRangeEncoder() = default; + +void VpxRangeEncoder::Write(s32 value, s32 value_size) { + for (s32 bit = value_size - 1; bit >= 0; bit--) { + Write(((value >> bit) & 1) != 0); + } +} + +void VpxRangeEncoder::Write(bool bit) { + Write(bit, half_probability); +} + +void VpxRangeEncoder::Write(bool bit, s32 probability) { + u32 local_range = range; + const u32 split = 1 + (((local_range - 1) * static_cast<u32>(probability)) >> 8); + local_range = split; + + if (bit) { + low_value += split; + local_range = range - split; + } + + s32 shift = norm_lut[local_range]; + local_range <<= shift; + count += shift; + + if (count >= 0) { + const s32 offset = shift - count; + + if (((low_value << (offset - 1)) >> 31) != 0) { + const s32 current_pos = static_cast<s32>(base_stream.GetPosition()); + base_stream.Seek(-1, Common::SeekOrigin::FromCurrentPos); + while (PeekByte() == 0xff) { + base_stream.WriteByte(0); + + base_stream.Seek(-2, Common::SeekOrigin::FromCurrentPos); + } + base_stream.WriteByte(static_cast<u8>((PeekByte() + 1))); + base_stream.Seek(current_pos, Common::SeekOrigin::SetOrigin); + } + base_stream.WriteByte(static_cast<u8>((low_value >> (24 - offset)))); + + low_value <<= offset; + shift = count; + low_value &= 0xffffff; + count -= 8; + } + + low_value <<= shift; + range = local_range; +} + +void VpxRangeEncoder::End() { + for (std::size_t index = 0; index < 32; ++index) { + Write(false); + } +} + +u8 VpxRangeEncoder::PeekByte() { + const u8 value = base_stream.ReadByte(); + base_stream.Seek(-1, Common::SeekOrigin::FromCurrentPos); + + return value; +} + +VpxBitStreamWriter::VpxBitStreamWriter() = default; + +VpxBitStreamWriter::~VpxBitStreamWriter() = 
default;
+
+void VpxBitStreamWriter::WriteU(u32 value, u32 value_size) {
+    WriteBits(value, value_size);
+}
+
+void VpxBitStreamWriter::WriteS(s32 value, u32 value_size) {
+    const bool sign = value < 0;
+    if (sign) {
+        value = -value;
+    }
+
+    WriteBits(static_cast<u32>(value << 1) | (sign ? 1 : 0), value_size + 1);
+}
+
+void VpxBitStreamWriter::WriteDeltaQ(u32 value) {
+    const bool delta_coded = value != 0;
+    WriteBit(delta_coded);
+
+    if (delta_coded) {
+        WriteBits(value, 4);
+    }
+}
+
+void VpxBitStreamWriter::WriteBits(u32 value, u32 bit_count) {
+    s32 value_pos = 0;
+    s32 remaining = bit_count;
+
+    while (remaining > 0) {
+        s32 copy_size = remaining;
+
+        const s32 free = GetFreeBufferBits();
+
+        if (copy_size > free) {
+            copy_size = free;
+        }
+
+        const s32 mask = (1 << copy_size) - 1;
+
+        const s32 src_shift = (bit_count - value_pos) - copy_size;
+        const s32 dst_shift = (buffer_size - buffer_pos) - copy_size;
+
+        buffer |= ((value >> src_shift) & mask) << dst_shift;
+
+        value_pos += copy_size;
+        buffer_pos += copy_size;
+        remaining -= copy_size;
+    }
+}
+
+void VpxBitStreamWriter::WriteBit(bool state) {
+    WriteBits(state ? 1 : 0, 1);
+}
+
+s32 VpxBitStreamWriter::GetFreeBufferBits() {
+    if (buffer_pos == buffer_size) {
+        Flush();
+    }
+
+    return buffer_size - buffer_pos;
+}
+
+void VpxBitStreamWriter::Flush() {
+    if (buffer_pos == 0) {
+        return;
+    }
+    byte_array.push_back(static_cast<u8>(buffer));
+    buffer = 0;
+    buffer_pos = 0;
+}
+
+std::vector<u8>& VpxBitStreamWriter::GetByteArray() {
+    return byte_array;
+}
+
+const std::vector<u8>& VpxBitStreamWriter::GetByteArray() const {
+    return byte_array;
+}
+
+} // namespace Tegra::Decoder
diff --git a/src/video_core/command_classes/codecs/vp9.h b/src/video_core/command_classes/codecs/vp9.h
new file mode 100644
index 000000000..8396c8105
--- /dev/null
+++ b/src/video_core/command_classes/codecs/vp9.h
@@ -0,0 +1,197 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <vector>
+
+#include "common/common_types.h"
+#include "common/stream.h"
+#include "video_core/command_classes/codecs/vp9_types.h"
+#include "video_core/command_classes/nvdec_common.h"
+
+namespace Tegra {
+class GPU;
+enum class FrameType { KeyFrame = 0, InterFrame = 1 };
+namespace Decoder {
+
+/// The VpxRangeEncoder and VpxBitStreamWriter classes are used to compose the
+/// VP9 header bitstreams.
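+///
+/// A minimal usage sketch (editor's illustration, not part of the original sources):
+///
+///   VpxRangeEncoder enc;
+///   enc.Write(true, 252);  // one flag bit, coded with probability weight 252
+///   enc.Write(3, 2);       // two raw bits at half probability
+///   enc.End();
+///   const std::vector<u8>& bytes = enc.GetBuffer();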
+ +class VpxRangeEncoder { +public: + VpxRangeEncoder(); + ~VpxRangeEncoder(); + + VpxRangeEncoder(const VpxRangeEncoder&) = delete; + VpxRangeEncoder& operator=(const VpxRangeEncoder&) = delete; + + VpxRangeEncoder(VpxRangeEncoder&&) = default; + VpxRangeEncoder& operator=(VpxRangeEncoder&&) = default; + + /// Writes the rightmost value_size bits from value into the stream + void Write(s32 value, s32 value_size); + + /// Writes a single bit with half probability + void Write(bool bit); + + /// Writes a bit to the base_stream encoded with probability + void Write(bool bit, s32 probability); + + /// Signal the end of the bitstream + void End(); + + [[nodiscard]] std::vector<u8>& GetBuffer() { + return base_stream.GetBuffer(); + } + + [[nodiscard]] const std::vector<u8>& GetBuffer() const { + return base_stream.GetBuffer(); + } + +private: + u8 PeekByte(); + Common::Stream base_stream{}; + u32 low_value{}; + u32 range{0xff}; + s32 count{-24}; + s32 half_probability{128}; +}; + +class VpxBitStreamWriter { +public: + VpxBitStreamWriter(); + ~VpxBitStreamWriter(); + + VpxBitStreamWriter(const VpxBitStreamWriter&) = delete; + VpxBitStreamWriter& operator=(const VpxBitStreamWriter&) = delete; + + VpxBitStreamWriter(VpxBitStreamWriter&&) = default; + VpxBitStreamWriter& operator=(VpxBitStreamWriter&&) = default; + + /// Write an unsigned integer value + void WriteU(u32 value, u32 value_size); + + /// Write a signed integer value + void WriteS(s32 value, u32 value_size); + + /// Based on 6.2.10 of VP9 Spec, writes a delta coded value + void WriteDeltaQ(u32 value); + + /// Write a single bit. + void WriteBit(bool state); + + /// Pushes current buffer into buffer_array, resets buffer + void Flush(); + + /// Returns byte_array + [[nodiscard]] std::vector<u8>& GetByteArray(); + + /// Returns const byte_array + [[nodiscard]] const std::vector<u8>& GetByteArray() const; + +private: + /// Write bit_count bits from value into buffer + void WriteBits(u32 value, u32 bit_count); + + /// Gets next available position in buffer, invokes Flush() if buffer is full + s32 GetFreeBufferBits(); + + s32 buffer_size{8}; + + s32 buffer{}; + s32 buffer_pos{}; + std::vector<u8> byte_array; +}; + +class VP9 { +public: + explicit VP9(GPU& gpu_); + ~VP9(); + + VP9(const VP9&) = delete; + VP9& operator=(const VP9&) = delete; + + VP9(VP9&&) = default; + VP9& operator=(VP9&&) = delete; + + /// Composes the VP9 frame from the GPU state information. Based on the official VP9 spec + /// documentation + [[nodiscard]] const std::vector<u8>& ComposeFrameHeader( + const NvdecCommon::NvdecRegisters& state); + + /// Returns true if the most recent frame was a hidden frame. 
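+    /// (A frame is hidden when its show_frame flag is unset; ComposeFrameHeader() updates
+    /// this after each composed frame.)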
+    [[nodiscard]] bool WasFrameHidden() const {
+        return hidden;
+    }
+
+private:
+    /// Generates compressed header probability updates in the bitstream writer
+    template <typename T, std::size_t N>
+    void WriteProbabilityUpdate(VpxRangeEncoder& writer, const std::array<T, N>& new_prob,
+                                const std::array<T, N>& old_prob);
+
+    /// Generates compressed header probability updates in the bitstream writer
+    /// If probs are not equal, WriteProbabilityDelta is invoked
+    void WriteProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob);
+
+    /// Generates compressed header probability deltas in the bitstream writer
+    void WriteProbabilityDelta(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob);
+
+    /// Inverse of 6.3.4 Decode term subexp
+    void EncodeTermSubExp(VpxRangeEncoder& writer, s32 value);
+
+    /// Writes whether the value is less than the test value
+    bool WriteLessThan(VpxRangeEncoder& writer, s32 value, s32 test);
+
+    /// Writes probability updates for the Coef probabilities
+    void WriteCoefProbabilityUpdate(VpxRangeEncoder& writer, s32 tx_mode,
+                                    const std::array<u8, 1728>& new_prob,
+                                    const std::array<u8, 1728>& old_prob);
+
+    /// Write probabilities for 4-byte aligned structures
+    template <typename T, std::size_t N>
+    void WriteProbabilityUpdateAligned4(VpxRangeEncoder& writer, const std::array<T, N>& new_prob,
+                                        const std::array<T, N>& old_prob);
+
+    /// Write motion vector probability updates. 6.3.17 in the spec
+    void WriteMvProbabilityUpdate(VpxRangeEncoder& writer, u8 new_prob, u8 old_prob);
+
+    /// Returns VP9 information from NVDEC provided offset and size
+    [[nodiscard]] Vp9PictureInfo GetVp9PictureInfo(const NvdecCommon::NvdecRegisters& state);
+
+    /// Read and convert NVDEC provided entropy probs to Vp9EntropyProbs struct
+    void InsertEntropy(u64 offset, Vp9EntropyProbs& dst);
+
+    /// Returns frame to be decoded after buffering
+    [[nodiscard]] Vp9FrameContainer GetCurrentFrame(const NvdecCommon::NvdecRegisters& state);
+
+    /// Use NVDEC provided information to compose the headers for the current frame
+    [[nodiscard]] std::vector<u8> ComposeCompressedHeader();
+    [[nodiscard]] VpxBitStreamWriter ComposeUncompressedHeader();
+
+    GPU& gpu;
+    std::vector<u8> frame;
+
+    std::array<s8, 4> loop_filter_ref_deltas{};
+    std::array<s8, 2> loop_filter_mode_deltas{};
+
+    bool hidden = false;
+    s64 current_frame_number = -2; // since we buffer 2 frames
+    s32 grace_period = 6;          // frame offsets need to stabilize
+    std::array<FrameContexts, 4> frame_ctxs{};
+    Vp9FrameContainer next_frame{};
+    Vp9FrameContainer next_next_frame{};
+    bool swap_next_golden{};
+
+    Vp9PictureInfo current_frame_info{};
+    Vp9EntropyProbs prev_frame_probs{};
+
+    s32 diff_update_probability = 252;
+    s32 frame_sync_code = 0x498342;
+};
+
+} // namespace Decoder
+} // namespace Tegra
diff --git a/src/video_core/command_classes/codecs/vp9_types.h b/src/video_core/command_classes/codecs/vp9_types.h
new file mode 100644
index 000000000..139501a1c
--- /dev/null
+++ b/src/video_core/command_classes/codecs/vp9_types.h
@@ -0,0 +1,302 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+ +#pragma once + +#include <array> +#include <cstring> +#include <vector> +#include "common/common_funcs.h" +#include "common/common_types.h" + +namespace Tegra { +class GPU; + +namespace Decoder { +struct Vp9FrameDimensions { + s16 width{}; + s16 height{}; + s16 luma_pitch{}; + s16 chroma_pitch{}; +}; +static_assert(sizeof(Vp9FrameDimensions) == 0x8, "Vp9 Vp9FrameDimensions is an invalid size"); + +enum FrameFlags : u32 { + IsKeyFrame = 1 << 0, + LastFrameIsKeyFrame = 1 << 1, + FrameSizeChanged = 1 << 2, + ErrorResilientMode = 1 << 3, + LastShowFrame = 1 << 4, + IntraOnly = 1 << 5, +}; + +enum class TxSize { + Tx4x4 = 0, // 4x4 transform + Tx8x8 = 1, // 8x8 transform + Tx16x16 = 2, // 16x16 transform + Tx32x32 = 3, // 32x32 transform + TxSizes = 4 +}; + +enum class TxMode { + Only4X4 = 0, // Only 4x4 transform used + Allow8X8 = 1, // Allow block transform size up to 8x8 + Allow16X16 = 2, // Allow block transform size up to 16x16 + Allow32X32 = 3, // Allow block transform size up to 32x32 + TxModeSelect = 4, // Transform specified for each block + TxModes = 5 +}; + +struct Segmentation { + u8 enabled{}; + u8 update_map{}; + u8 temporal_update{}; + u8 abs_delta{}; + std::array<u32, 8> feature_mask{}; + std::array<std::array<s16, 4>, 8> feature_data{}; +}; +static_assert(sizeof(Segmentation) == 0x64, "Segmentation is an invalid size"); + +struct LoopFilter { + u8 mode_ref_delta_enabled{}; + std::array<s8, 4> ref_deltas{}; + std::array<s8, 2> mode_deltas{}; +}; +static_assert(sizeof(LoopFilter) == 0x7, "LoopFilter is an invalid size"); + +struct Vp9EntropyProbs { + std::array<u8, 36> y_mode_prob{}; + std::array<u8, 64> partition_prob{}; + std::array<u8, 1728> coef_probs{}; + std::array<u8, 8> switchable_interp_prob{}; + std::array<u8, 28> inter_mode_prob{}; + std::array<u8, 4> intra_inter_prob{}; + std::array<u8, 5> comp_inter_prob{}; + std::array<u8, 10> single_ref_prob{}; + std::array<u8, 5> comp_ref_prob{}; + std::array<u8, 6> tx_32x32_prob{}; + std::array<u8, 4> tx_16x16_prob{}; + std::array<u8, 2> tx_8x8_prob{}; + std::array<u8, 3> skip_probs{}; + std::array<u8, 3> joints{}; + std::array<u8, 2> sign{}; + std::array<u8, 20> classes{}; + std::array<u8, 2> class_0{}; + std::array<u8, 20> prob_bits{}; + std::array<u8, 12> class_0_fr{}; + std::array<u8, 6> fr{}; + std::array<u8, 2> class_0_hp{}; + std::array<u8, 2> high_precision{}; +}; +static_assert(sizeof(Vp9EntropyProbs) == 0x7B4, "Vp9EntropyProbs is an invalid size"); + +struct Vp9PictureInfo { + bool is_key_frame{}; + bool intra_only{}; + bool last_frame_was_key{}; + bool frame_size_changed{}; + bool error_resilient_mode{}; + bool last_frame_shown{}; + bool show_frame{}; + std::array<s8, 4> ref_frame_sign_bias{}; + s32 base_q_index{}; + s32 y_dc_delta_q{}; + s32 uv_dc_delta_q{}; + s32 uv_ac_delta_q{}; + bool lossless{}; + s32 transform_mode{}; + bool allow_high_precision_mv{}; + s32 interp_filter{}; + s32 reference_mode{}; + s8 comp_fixed_ref{}; + std::array<s8, 2> comp_var_ref{}; + s32 log2_tile_cols{}; + s32 log2_tile_rows{}; + bool segment_enabled{}; + bool segment_map_update{}; + bool segment_map_temporal_update{}; + s32 segment_abs_delta{}; + std::array<u32, 8> segment_feature_enable{}; + std::array<std::array<s16, 4>, 8> segment_feature_data{}; + bool mode_ref_delta_enabled{}; + bool use_prev_in_find_mv_refs{}; + std::array<s8, 4> ref_deltas{}; + std::array<s8, 2> mode_deltas{}; + Vp9EntropyProbs entropy{}; + Vp9FrameDimensions frame_size{}; + u8 first_level{}; + u8 sharpness_level{}; + u32 bitstream_size{}; + std::array<u64, 4> 
frame_offsets{}; + std::array<bool, 4> refresh_frame{}; +}; + +struct Vp9FrameContainer { + Vp9PictureInfo info{}; + std::vector<u8> bit_stream; +}; + +struct PictureInfo { + INSERT_PADDING_WORDS(12); + u32 bitstream_size{}; + INSERT_PADDING_WORDS(5); + Vp9FrameDimensions last_frame_size{}; + Vp9FrameDimensions golden_frame_size{}; + Vp9FrameDimensions alt_frame_size{}; + Vp9FrameDimensions current_frame_size{}; + u32 vp9_flags{}; + std::array<s8, 4> ref_frame_sign_bias{}; + u8 first_level{}; + u8 sharpness_level{}; + u8 base_q_index{}; + u8 y_dc_delta_q{}; + u8 uv_ac_delta_q{}; + u8 uv_dc_delta_q{}; + u8 lossless{}; + u8 tx_mode{}; + u8 allow_high_precision_mv{}; + u8 interp_filter{}; + u8 reference_mode{}; + s8 comp_fixed_ref{}; + std::array<s8, 2> comp_var_ref{}; + u8 log2_tile_cols{}; + u8 log2_tile_rows{}; + Segmentation segmentation{}; + LoopFilter loop_filter{}; + INSERT_PADDING_BYTES(5); + u32 surface_params{}; + INSERT_PADDING_WORDS(3); + + [[nodiscard]] Vp9PictureInfo Convert() const { + return { + .is_key_frame = (vp9_flags & FrameFlags::IsKeyFrame) != 0, + .intra_only = (vp9_flags & FrameFlags::IntraOnly) != 0, + .last_frame_was_key = (vp9_flags & FrameFlags::LastFrameIsKeyFrame) != 0, + .frame_size_changed = (vp9_flags & FrameFlags::FrameSizeChanged) != 0, + .error_resilient_mode = (vp9_flags & FrameFlags::ErrorResilientMode) != 0, + .last_frame_shown = (vp9_flags & FrameFlags::LastShowFrame) != 0, + .ref_frame_sign_bias = ref_frame_sign_bias, + .base_q_index = base_q_index, + .y_dc_delta_q = y_dc_delta_q, + .uv_dc_delta_q = uv_dc_delta_q, + .uv_ac_delta_q = uv_ac_delta_q, + .lossless = lossless != 0, + .transform_mode = tx_mode, + .allow_high_precision_mv = allow_high_precision_mv != 0, + .interp_filter = interp_filter, + .reference_mode = reference_mode, + .comp_fixed_ref = comp_fixed_ref, + .comp_var_ref = comp_var_ref, + .log2_tile_cols = log2_tile_cols, + .log2_tile_rows = log2_tile_rows, + .segment_enabled = segmentation.enabled != 0, + .segment_map_update = segmentation.update_map != 0, + .segment_map_temporal_update = segmentation.temporal_update != 0, + .segment_abs_delta = segmentation.abs_delta, + .segment_feature_enable = segmentation.feature_mask, + .segment_feature_data = segmentation.feature_data, + .mode_ref_delta_enabled = loop_filter.mode_ref_delta_enabled != 0, + .use_prev_in_find_mv_refs = !(vp9_flags == (FrameFlags::ErrorResilientMode)) && + !(vp9_flags == (FrameFlags::FrameSizeChanged)) && + !(vp9_flags == (FrameFlags::IntraOnly)) && + (vp9_flags == (FrameFlags::LastShowFrame)) && + !(vp9_flags == (FrameFlags::LastFrameIsKeyFrame)), + .ref_deltas = loop_filter.ref_deltas, + .mode_deltas = loop_filter.mode_deltas, + .frame_size = current_frame_size, + .first_level = first_level, + .sharpness_level = sharpness_level, + .bitstream_size = bitstream_size, + }; + } +}; +static_assert(sizeof(PictureInfo) == 0x100, "PictureInfo is an invalid size"); + +struct EntropyProbs { + INSERT_PADDING_BYTES(1024); + std::array<u8, 28> inter_mode_prob{}; + std::array<u8, 4> intra_inter_prob{}; + INSERT_PADDING_BYTES(80); + std::array<u8, 2> tx_8x8_prob{}; + std::array<u8, 4> tx_16x16_prob{}; + std::array<u8, 6> tx_32x32_prob{}; + std::array<u8, 4> y_mode_prob_e8{}; + std::array<std::array<u8, 8>, 4> y_mode_prob_e0e7{}; + INSERT_PADDING_BYTES(64); + std::array<u8, 64> partition_prob{}; + INSERT_PADDING_BYTES(10); + std::array<u8, 8> switchable_interp_prob{}; + std::array<u8, 5> comp_inter_prob{}; + std::array<u8, 3> skip_probs{}; + INSERT_PADDING_BYTES(1); + std::array<u8, 3> 
joints{}; + std::array<u8, 2> sign{}; + std::array<u8, 2> class_0{}; + std::array<u8, 6> fr{}; + std::array<u8, 2> class_0_hp{}; + std::array<u8, 2> high_precision{}; + std::array<u8, 20> classes{}; + std::array<u8, 12> class_0_fr{}; + std::array<u8, 20> pred_bits{}; + std::array<u8, 10> single_ref_prob{}; + std::array<u8, 5> comp_ref_prob{}; + INSERT_PADDING_BYTES(17); + std::array<u8, 2304> coef_probs{}; + + void Convert(Vp9EntropyProbs& fc) { + fc.inter_mode_prob = inter_mode_prob; + fc.intra_inter_prob = intra_inter_prob; + fc.tx_8x8_prob = tx_8x8_prob; + fc.tx_16x16_prob = tx_16x16_prob; + fc.tx_32x32_prob = tx_32x32_prob; + + for (std::size_t i = 0; i < 4; i++) { + for (std::size_t j = 0; j < 9; j++) { + fc.y_mode_prob[j + 9 * i] = j < 8 ? y_mode_prob_e0e7[i][j] : y_mode_prob_e8[i]; + } + } + + fc.partition_prob = partition_prob; + fc.switchable_interp_prob = switchable_interp_prob; + fc.comp_inter_prob = comp_inter_prob; + fc.skip_probs = skip_probs; + fc.joints = joints; + fc.sign = sign; + fc.class_0 = class_0; + fc.fr = fr; + fc.class_0_hp = class_0_hp; + fc.high_precision = high_precision; + fc.classes = classes; + fc.class_0_fr = class_0_fr; + fc.prob_bits = pred_bits; + fc.single_ref_prob = single_ref_prob; + fc.comp_ref_prob = comp_ref_prob; + + // Skip the 4th element as it goes unused + for (std::size_t i = 0; i < coef_probs.size(); i += 4) { + const std::size_t j = i - i / 4; + fc.coef_probs[j] = coef_probs[i]; + fc.coef_probs[j + 1] = coef_probs[i + 1]; + fc.coef_probs[j + 2] = coef_probs[i + 2]; + } + } +}; +static_assert(sizeof(EntropyProbs) == 0xEA0, "EntropyProbs is an invalid size"); + +enum class Ref { Last, Golden, AltRef }; + +struct RefPoolElement { + s64 frame{}; + Ref ref{}; + bool refresh{}; +}; + +struct FrameContexts { + s64 from{}; + bool adapted{}; + Vp9EntropyProbs probs{}; +}; + +}; // namespace Decoder +}; // namespace Tegra diff --git a/src/video_core/command_classes/host1x.cpp b/src/video_core/command_classes/host1x.cpp new file mode 100644 index 000000000..b12494528 --- /dev/null +++ b/src/video_core/command_classes/host1x.cpp @@ -0,0 +1,30 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include "common/assert.h" +#include "video_core/command_classes/host1x.h" +#include "video_core/gpu.h" + +Tegra::Host1x::Host1x(GPU& gpu_) : gpu(gpu_) {} + +Tegra::Host1x::~Host1x() = default; + +void Tegra::Host1x::ProcessMethod(Method method, u32 argument) { + switch (method) { + case Method::LoadSyncptPayload32: + syncpoint_value = argument; + break; + case Method::WaitSyncpt: + case Method::WaitSyncpt32: + Execute(argument); + break; + default: + UNIMPLEMENTED_MSG("Host1x method 0x{:X}", static_cast<u32>(method)); + break; + } +} + +void Tegra::Host1x::Execute(u32 data) { + gpu.WaitFence(data, syncpoint_value); +} diff --git a/src/video_core/command_classes/host1x.h b/src/video_core/command_classes/host1x.h new file mode 100644 index 000000000..7e94799dd --- /dev/null +++ b/src/video_core/command_classes/host1x.h @@ -0,0 +1,37 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+
+#pragma once
+
+#include <vector>
+#include "common/common_funcs.h"
+#include "common/common_types.h"
+
+namespace Tegra {
+class GPU;
+class Nvdec;
+
+class Host1x {
+public:
+    enum class Method : u32 {
+        WaitSyncpt = 0x8,
+        LoadSyncptPayload32 = 0x4e,
+        WaitSyncpt32 = 0x50,
+    };
+
+    explicit Host1x(GPU& gpu);
+    ~Host1x();
+
+    /// Writes the method into the state, invoking Execute() when encountered
+    void ProcessMethod(Method method, u32 argument);
+
+private:
+    /// For Host1x, execute is waiting on a syncpoint previously written into the state
+    void Execute(u32 data);
+
+    u32 syncpoint_value{};
+    GPU& gpu;
+};
+
+} // namespace Tegra
diff --git a/src/video_core/command_classes/nvdec.cpp b/src/video_core/command_classes/nvdec.cpp
new file mode 100644
index 000000000..79e1f4e13
--- /dev/null
+++ b/src/video_core/command_classes/nvdec.cpp
@@ -0,0 +1,48 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/assert.h"
+#include "video_core/command_classes/nvdec.h"
+#include "video_core/gpu.h"
+
+namespace Tegra {
+
+Nvdec::Nvdec(GPU& gpu_) : gpu(gpu_), codec(std::make_unique<Codec>(gpu)) {}
+
+Nvdec::~Nvdec() = default;
+
+void Nvdec::ProcessMethod(Method method, const std::vector<u32>& arguments) {
+    if (method == Method::SetVideoCodec) {
+        codec->StateWrite(static_cast<u32>(method), arguments[0]);
+    } else {
+        codec->StateWrite(static_cast<u32>(method), static_cast<u64>(arguments[0]) << 8);
+    }
+
+    switch (method) {
+    case Method::SetVideoCodec:
+        codec->SetTargetCodec(static_cast<NvdecCommon::VideoCodec>(arguments[0]));
+        break;
+    case Method::Execute:
+        Execute();
+        break;
+    }
+}
+
+AVFramePtr Nvdec::GetFrame() {
+    return codec->GetCurrentFrame();
+}
+
+void Nvdec::Execute() {
+    switch (codec->GetCurrentCodec()) {
+    case NvdecCommon::VideoCodec::H264:
+    case NvdecCommon::VideoCodec::Vp9:
+        codec->Decode();
+        break;
+    default:
+        UNIMPLEMENTED_MSG("Unknown codec {}", static_cast<u32>(codec->GetCurrentCodec()));
+        break;
+    }
+}
+
+} // namespace Tegra
diff --git a/src/video_core/command_classes/nvdec.h b/src/video_core/command_classes/nvdec.h
new file mode 100644
index 000000000..e4877c533
--- /dev/null
+++ b/src/video_core/command_classes/nvdec.h
@@ -0,0 +1,38 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <memory>
+#include <vector>
+#include "common/common_types.h"
+#include "video_core/command_classes/codecs/codec.h"
+
+namespace Tegra {
+class GPU;
+
+class Nvdec {
+public:
+    enum class Method : u32 {
+        SetVideoCodec = 0x80,
+        Execute = 0xc0,
+    };
+
+    explicit Nvdec(GPU& gpu);
+    ~Nvdec();
+
+    /// Writes the method into the state, invoking Execute() when encountered
+    void ProcessMethod(Method method, const std::vector<u32>& arguments);
+
+    /// Return the most recently decoded frame
+    [[nodiscard]] AVFramePtr GetFrame();
+
+private:
+    /// Invoke codec to decode a frame
+    void Execute();
+
+    GPU& gpu;
+    std::unique_ptr<Codec> codec;
+};
+} // namespace Tegra
diff --git a/src/video_core/command_classes/nvdec_common.h b/src/video_core/command_classes/nvdec_common.h
new file mode 100644
index 000000000..01b5e086d
--- /dev/null
+++ b/src/video_core/command_classes/nvdec_common.h
@@ -0,0 +1,48 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+ +#pragma once + +#include "common/common_funcs.h" +#include "common/common_types.h" + +namespace Tegra::NvdecCommon { + +struct NvdecRegisters { + INSERT_PADDING_WORDS(256); + u64 set_codec_id{}; + INSERT_PADDING_WORDS(254); + u64 set_platform_id{}; + u64 picture_info_offset{}; + u64 frame_bitstream_offset{}; + u64 frame_number{}; + u64 h264_slice_data_offsets{}; + u64 h264_mv_dump_offset{}; + INSERT_PADDING_WORDS(6); + u64 frame_stats_offset{}; + u64 h264_last_surface_luma_offset{}; + u64 h264_last_surface_chroma_offset{}; + std::array<u64, 17> surface_luma_offset{}; + std::array<u64, 17> surface_chroma_offset{}; + INSERT_PADDING_WORDS(132); + u64 vp9_entropy_probs_offset{}; + u64 vp9_backward_updates_offset{}; + u64 vp9_last_frame_segmap_offset{}; + u64 vp9_curr_frame_segmap_offset{}; + INSERT_PADDING_WORDS(2); + u64 vp9_last_frame_mvs_offset{}; + u64 vp9_curr_frame_mvs_offset{}; + INSERT_PADDING_WORDS(2); +}; +static_assert(sizeof(NvdecRegisters) == (0xBC0), "NvdecRegisters is incorrect size"); + +enum class VideoCodec : u32 { + None = 0x0, + H264 = 0x3, + Vp8 = 0x5, + H265 = 0x7, + Vp9 = 0x9, +}; + +} // namespace Tegra::NvdecCommon diff --git a/src/video_core/command_classes/sync_manager.cpp b/src/video_core/command_classes/sync_manager.cpp new file mode 100644 index 000000000..19dc9e0ab --- /dev/null +++ b/src/video_core/command_classes/sync_manager.cpp @@ -0,0 +1,60 @@ +// MIT License +// +// Copyright (c) Ryujinx Team and Contributors +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and +// associated documentation files (the "Software"), to deal in the Software without restriction, +// including without limitation the rights to use, copy, modify, merge, publish, distribute, +// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or +// substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT +// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+//
+
+#include <algorithm>
+#include "sync_manager.h"
+#include "video_core/gpu.h"
+
+namespace Tegra {
+SyncptIncrManager::SyncptIncrManager(GPU& gpu_) : gpu(gpu_) {}
+SyncptIncrManager::~SyncptIncrManager() = default;
+
+void SyncptIncrManager::Increment(u32 id) {
+    increments.emplace_back(0, 0, id, true);
+    IncrementAllDone();
+}
+
+u32 SyncptIncrManager::IncrementWhenDone(u32 class_id, u32 id) {
+    const u32 handle = current_id++;
+    increments.emplace_back(handle, class_id, id);
+    return handle;
+}
+
+void SyncptIncrManager::SignalDone(u32 handle) {
+    const auto done_incr =
+        std::find_if(increments.begin(), increments.end(),
+                     [handle](const SyncptIncr& incr) { return incr.id == handle; });
+    if (done_incr != increments.cend()) {
+        done_incr->complete = true;
+    }
+    IncrementAllDone();
+}
+
+void SyncptIncrManager::IncrementAllDone() {
+    std::size_t done_count = 0;
+    for (; done_count < increments.size(); ++done_count) {
+        if (!increments[done_count].complete) {
+            break;
+        }
+        gpu.IncrementSyncPoint(increments[done_count].syncpt_id);
+    }
+    increments.erase(increments.begin(), increments.begin() + done_count);
+}
+} // namespace Tegra
diff --git a/src/video_core/command_classes/sync_manager.h b/src/video_core/command_classes/sync_manager.h
new file mode 100644
index 000000000..2c321ec58
--- /dev/null
+++ b/src/video_core/command_classes/sync_manager.h
@@ -0,0 +1,64 @@
+// MIT License
+//
+// Copyright (c) Ryujinx Team and Contributors
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+// associated documentation files (the "Software"), to deal in the Software without restriction,
+// including without limitation the rights to use, copy, modify, merge, publish, distribute,
+// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or
+// substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+//
+
+#pragma once
+
+#include <mutex>
+#include <vector>
+#include "common/common_types.h"
+
+namespace Tegra {
+class GPU;
+struct SyncptIncr {
+    u32 id;
+    u32 class_id;
+    u32 syncpt_id;
+    bool complete;
+
+    SyncptIncr(u32 id_, u32 class_id_, u32 syncpt_id_, bool done = false)
+        : id(id_), class_id(class_id_), syncpt_id(syncpt_id_), complete(done) {}
+};
+
+class SyncptIncrManager {
+public:
+    explicit SyncptIncrManager(GPU& gpu);
+    ~SyncptIncrManager();
+
+    /// Add a completed syncpoint increment for the given id, then flush all finished increments
+    void Increment(u32 id);
+
+    /// Returns a handle to increment later
+    u32 IncrementWhenDone(u32 class_id, u32 id);
+
+    /// Marks the increment tied to the given handle as complete and flushes finished increments
+    void SignalDone(u32 handle);
+
+    /// Increment all sequential pending increments that are already done.
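+    /// [Editor's note] Completion is strictly in submission order: an increment signalled
+    /// out of order waits until every earlier increment in the queue has completed.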
+ void IncrementAllDone(); + +private: + std::vector<SyncptIncr> increments; + std::mutex increment_lock; + u32 current_id{}; + + GPU& gpu; +}; + +} // namespace Tegra diff --git a/src/video_core/command_classes/vic.cpp b/src/video_core/command_classes/vic.cpp new file mode 100644 index 000000000..55e632346 --- /dev/null +++ b/src/video_core/command_classes/vic.cpp @@ -0,0 +1,175 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <array> +#include "common/assert.h" +#include "video_core/command_classes/nvdec.h" +#include "video_core/command_classes/vic.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/gpu.h" +#include "video_core/memory_manager.h" +#include "video_core/textures/decoders.h" + +extern "C" { +#include <libswscale/swscale.h> +} + +namespace Tegra { + +Vic::Vic(GPU& gpu_, std::shared_ptr<Nvdec> nvdec_processor_) + : gpu(gpu_), nvdec_processor(std::move(nvdec_processor_)) {} +Vic::~Vic() = default; + +void Vic::VicStateWrite(u32 offset, u32 arguments) { + u8* const state_offset = reinterpret_cast<u8*>(&vic_state) + offset * sizeof(u32); + std::memcpy(state_offset, &arguments, sizeof(u32)); +} + +void Vic::ProcessMethod(Method method, const std::vector<u32>& arguments) { + LOG_DEBUG(HW_GPU, "Vic method 0x{:X}", method); + VicStateWrite(static_cast<u32>(method), arguments[0]); + const u64 arg = static_cast<u64>(arguments[0]) << 8; + switch (method) { + case Method::Execute: + Execute(); + break; + case Method::SetConfigStructOffset: + config_struct_address = arg; + break; + case Method::SetOutputSurfaceLumaOffset: + output_surface_luma_address = arg; + break; + case Method::SetOutputSurfaceChromaUOffset: + output_surface_chroma_u_address = arg; + break; + case Method::SetOutputSurfaceChromaVOffset: + output_surface_chroma_v_address = arg; + break; + default: + break; + } +} + +void Vic::Execute() { + if (output_surface_luma_address == 0) { + LOG_ERROR(Service_NVDRV, "VIC Luma address not set. Received 0x{:X}", + vic_state.output_surface.luma_offset); + return; + } + const VicConfig config{gpu.MemoryManager().Read<u64>(config_struct_address + 0x20)}; + const AVFramePtr frame_ptr = nvdec_processor->GetFrame(); + const auto* frame = frame_ptr.get(); + if (!frame || frame->width == 0 || frame->height == 0) { + return; + } + const VideoPixelFormat pixel_format = + static_cast<VideoPixelFormat>(config.pixel_format.Value()); + switch (pixel_format) { + case VideoPixelFormat::BGRA8: + case VideoPixelFormat::RGBA8: { + LOG_TRACE(Service_NVDRV, "Writing RGB Frame"); + + if (scaler_ctx == nullptr || frame->width != scaler_width || + frame->height != scaler_height) { + const AVPixelFormat target_format = + (pixel_format == VideoPixelFormat::RGBA8) ? 
AV_PIX_FMT_RGBA : AV_PIX_FMT_BGRA; + + sws_freeContext(scaler_ctx); + scaler_ctx = nullptr; + + // FFmpeg returns all frames in YUV420, convert it into expected format + scaler_ctx = + sws_getContext(frame->width, frame->height, AV_PIX_FMT_YUV420P, frame->width, + frame->height, target_format, 0, nullptr, nullptr, nullptr); + + scaler_width = frame->width; + scaler_height = frame->height; + } + // Get Converted frame + const std::size_t linear_size = frame->width * frame->height * 4; + + using AVMallocPtr = std::unique_ptr<u8, decltype(&av_free)>; + AVMallocPtr converted_frame_buffer{static_cast<u8*>(av_malloc(linear_size)), av_free}; + + const int converted_stride{frame->width * 4}; + u8* const converted_frame_buf_addr{converted_frame_buffer.get()}; + + sws_scale(scaler_ctx, frame->data, frame->linesize, 0, frame->height, + &converted_frame_buf_addr, &converted_stride); + + const u32 blk_kind = static_cast<u32>(config.block_linear_kind); + if (blk_kind != 0) { + // swizzle pitch linear to block linear + const u32 block_height = static_cast<u32>(config.block_linear_height_log2); + const auto size = Tegra::Texture::CalculateSize(true, 4, frame->width, frame->height, 1, + block_height, 0); + std::vector<u8> swizzled_data(size); + Tegra::Texture::SwizzleSubrect(frame->width, frame->height, frame->width * 4, + frame->width, 4, swizzled_data.data(), + converted_frame_buffer.get(), block_height, 0, 0); + + gpu.MemoryManager().WriteBlock(output_surface_luma_address, swizzled_data.data(), size); + gpu.Maxwell3D().OnMemoryWrite(); + } else { + // send pitch linear frame + gpu.MemoryManager().WriteBlock(output_surface_luma_address, converted_frame_buf_addr, + linear_size); + gpu.Maxwell3D().OnMemoryWrite(); + } + break; + } + case VideoPixelFormat::Yuv420: { + LOG_TRACE(Service_NVDRV, "Writing YUV420 Frame"); + + const std::size_t surface_width = config.surface_width_minus1 + 1; + const std::size_t surface_height = config.surface_height_minus1 + 1; + const std::size_t half_width = surface_width / 2; + const std::size_t half_height = config.surface_height_minus1 / 2; + const std::size_t aligned_width = (surface_width + 0xff) & ~0xff; + + const auto* luma_ptr = frame->data[0]; + const auto* chroma_b_ptr = frame->data[1]; + const auto* chroma_r_ptr = frame->data[2]; + const auto stride = frame->linesize[0]; + const auto half_stride = frame->linesize[1]; + + std::vector<u8> luma_buffer(aligned_width * surface_height); + std::vector<u8> chroma_buffer(aligned_width * half_height); + + // Populate luma buffer + for (std::size_t y = 0; y < surface_height - 1; ++y) { + std::size_t src = y * stride; + std::size_t dst = y * aligned_width; + + std::size_t size = surface_width; + + for (std::size_t offset = 0; offset < size; ++offset) { + luma_buffer[dst + offset] = luma_ptr[src + offset]; + } + } + gpu.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(), + luma_buffer.size()); + + // Populate chroma buffer from both channels with interleaving. 
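+        // [Editor's note] The result is a semi-planar (NV12-style) layout: each chroma row
+        // holds U0 V0 U1 V1 ..., interleaved from FFmpeg's separate U and V planes.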
+ for (std::size_t y = 0; y < half_height; ++y) { + std::size_t src = y * half_stride; + std::size_t dst = y * aligned_width; + + for (std::size_t x = 0; x < half_width; ++x) { + chroma_buffer[dst + x * 2] = chroma_b_ptr[src + x]; + chroma_buffer[dst + x * 2 + 1] = chroma_r_ptr[src + x]; + } + } + gpu.MemoryManager().WriteBlock(output_surface_chroma_u_address, chroma_buffer.data(), + chroma_buffer.size()); + gpu.Maxwell3D().OnMemoryWrite(); + break; + } + default: + UNIMPLEMENTED_MSG("Unknown video pixel format {}", config.pixel_format.Value()); + break; + } +} + +} // namespace Tegra diff --git a/src/video_core/command_classes/vic.h b/src/video_core/command_classes/vic.h new file mode 100644 index 000000000..8c4e284a1 --- /dev/null +++ b/src/video_core/command_classes/vic.h @@ -0,0 +1,110 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <memory> +#include <vector> +#include "common/bit_field.h" +#include "common/common_types.h" + +struct SwsContext; + +namespace Tegra { +class GPU; +class Nvdec; + +struct PlaneOffsets { + u32 luma_offset{}; + u32 chroma_u_offset{}; + u32 chroma_v_offset{}; +}; + +struct VicRegisters { + INSERT_PADDING_WORDS(64); + u32 nop{}; + INSERT_PADDING_WORDS(15); + u32 pm_trigger{}; + INSERT_PADDING_WORDS(47); + u32 set_application_id{}; + u32 set_watchdog_timer{}; + INSERT_PADDING_WORDS(17); + u32 context_save_area{}; + u32 context_switch{}; + INSERT_PADDING_WORDS(43); + u32 execute{}; + INSERT_PADDING_WORDS(63); + std::array<std::array<PlaneOffsets, 8>, 8> surfacex_slots{}; + u32 picture_index{}; + u32 control_params{}; + u32 config_struct_offset{}; + u32 filter_struct_offset{}; + u32 palette_offset{}; + u32 hist_offset{}; + u32 context_id{}; + u32 fce_ucode_size{}; + PlaneOffsets output_surface{}; + u32 fce_ucode_offset{}; + INSERT_PADDING_WORDS(4); + std::array<u32, 8> slot_context_id{}; + INSERT_PADDING_WORDS(16); +}; +static_assert(sizeof(VicRegisters) == 0x7A0, "VicRegisters is an invalid size"); + +class Vic { +public: + enum class Method : u32 { + Execute = 0xc0, + SetControlParams = 0x1c1, + SetConfigStructOffset = 0x1c2, + SetOutputSurfaceLumaOffset = 0x1c8, + SetOutputSurfaceChromaUOffset = 0x1c9, + SetOutputSurfaceChromaVOffset = 0x1ca + }; + + explicit Vic(GPU& gpu, std::shared_ptr<Nvdec> nvdec_processor); + ~Vic(); + + /// Write to the device state. 
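+    /// Methods are word offsets into VicRegisters; in vic.cpp, offset-type arguments are
+    /// shifted left by 8 to reconstruct the full GPU addresses they reference.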
+ void ProcessMethod(Method method, const std::vector<u32>& arguments); + +private: + void Execute(); + + void VicStateWrite(u32 offset, u32 arguments); + VicRegisters vic_state{}; + + enum class VideoPixelFormat : u64_le { + RGBA8 = 0x1f, + BGRA8 = 0x20, + Yuv420 = 0x44, + }; + + union VicConfig { + u64_le raw{}; + BitField<0, 7, u64_le> pixel_format; + BitField<7, 2, u64_le> chroma_loc_horiz; + BitField<9, 2, u64_le> chroma_loc_vert; + BitField<11, 4, u64_le> block_linear_kind; + BitField<15, 4, u64_le> block_linear_height_log2; + BitField<19, 3, u64_le> reserved0; + BitField<22, 10, u64_le> reserved1; + BitField<32, 14, u64_le> surface_width_minus1; + BitField<46, 14, u64_le> surface_height_minus1; + }; + + GPU& gpu; + std::shared_ptr<Tegra::Nvdec> nvdec_processor; + + GPUVAddr config_struct_address{}; + GPUVAddr output_surface_luma_address{}; + GPUVAddr output_surface_chroma_u_address{}; + GPUVAddr output_surface_chroma_v_address{}; + + SwsContext* scaler_ctx{}; + s32 scaler_width{}; + s32 scaler_height{}; +}; + +} // namespace Tegra diff --git a/src/video_core/compatible_formats.cpp b/src/video_core/compatible_formats.cpp index b06c32c84..acf2668dc 100644 --- a/src/video_core/compatible_formats.cpp +++ b/src/video_core/compatible_formats.cpp @@ -3,33 +3,33 @@ // Refer to the license.txt file included. #include <array> -#include <bitset> #include <cstddef> +#include "common/common_types.h" #include "video_core/compatible_formats.h" #include "video_core/surface.h" namespace VideoCore::Surface { - namespace { +using Table = std::array<std::array<u64, 2>, MaxPixelFormat>; // Compatibility table taken from Table 3.X.2 in: // https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_texture_view.txt -constexpr std::array VIEW_CLASS_128_BITS = { +constexpr std::array VIEW_CLASS_128_BITS{ PixelFormat::R32G32B32A32_FLOAT, PixelFormat::R32G32B32A32_UINT, PixelFormat::R32G32B32A32_SINT, }; -constexpr std::array VIEW_CLASS_96_BITS = { +constexpr std::array VIEW_CLASS_96_BITS{ PixelFormat::R32G32B32_FLOAT, }; // Missing formats: // PixelFormat::RGB32UI, // PixelFormat::RGB32I, -constexpr std::array VIEW_CLASS_64_BITS = { +constexpr std::array VIEW_CLASS_64_BITS{ PixelFormat::R32G32_FLOAT, PixelFormat::R32G32_UINT, PixelFormat::R32G32_SINT, PixelFormat::R16G16B16A16_FLOAT, PixelFormat::R16G16B16A16_UNORM, PixelFormat::R16G16B16A16_SNORM, @@ -38,7 +38,7 @@ constexpr std::array VIEW_CLASS_64_BITS = { // TODO: How should we handle 48 bits? -constexpr std::array VIEW_CLASS_32_BITS = { +constexpr std::array VIEW_CLASS_32_BITS{ PixelFormat::R16G16_FLOAT, PixelFormat::B10G11R11_FLOAT, PixelFormat::R32_FLOAT, PixelFormat::A2B10G10R10_UNORM, PixelFormat::R16G16_UINT, PixelFormat::R32_UINT, PixelFormat::R16G16_SINT, PixelFormat::R32_SINT, PixelFormat::A8B8G8R8_UNORM, @@ -50,43 +50,105 @@ constexpr std::array VIEW_CLASS_32_BITS = { // TODO: How should we handle 24 bits? 
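Each VIEW_CLASS_* array in this file is one compatibility class lifted from the ARB_texture_view table: any two formats in the same class may alias the same texture memory through a view, so a cache can reinterpret a surface without copying it. These classes feed the bit-packed table queried by the IsViewCompatible/IsCopyCompatible functions introduced further down in this diff. A short usage sketch; the CanAlias wrapper is illustrative, the queried API is the one added here:

    #include "video_core/compatible_formats.h"
    #include "video_core/surface.h"

    using VideoCore::Surface::PixelFormat;

    // A texture cache may alias an existing allocation only when the two
    // formats share a view class, or are identical on drivers with broken views.
    bool CanAlias(PixelFormat existing, PixelFormat requested, bool broken_views) {
        return VideoCore::Surface::IsViewCompatible(existing, requested, broken_views);
    }

    // R32G32B32A32_FLOAT and R32G32B32A32_UINT sit together in VIEW_CLASS_128_BITS,
    // so CanAlias(..., false) returns true for that pair.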
-constexpr std::array VIEW_CLASS_16_BITS = { +constexpr std::array VIEW_CLASS_16_BITS{ PixelFormat::R16_FLOAT, PixelFormat::R8G8_UINT, PixelFormat::R16_UINT, PixelFormat::R16_SINT, PixelFormat::R8G8_UNORM, PixelFormat::R16_UNORM, PixelFormat::R8G8_SNORM, PixelFormat::R16_SNORM, PixelFormat::R8G8_SINT, }; -constexpr std::array VIEW_CLASS_8_BITS = { +constexpr std::array VIEW_CLASS_8_BITS{ PixelFormat::R8_UINT, PixelFormat::R8_UNORM, PixelFormat::R8_SINT, PixelFormat::R8_SNORM, }; -constexpr std::array VIEW_CLASS_RGTC1_RED = { +constexpr std::array VIEW_CLASS_RGTC1_RED{ PixelFormat::BC4_UNORM, PixelFormat::BC4_SNORM, }; -constexpr std::array VIEW_CLASS_RGTC2_RG = { +constexpr std::array VIEW_CLASS_RGTC2_RG{ PixelFormat::BC5_UNORM, PixelFormat::BC5_SNORM, }; -constexpr std::array VIEW_CLASS_BPTC_UNORM = { +constexpr std::array VIEW_CLASS_BPTC_UNORM{ PixelFormat::BC7_UNORM, PixelFormat::BC7_SRGB, }; -constexpr std::array VIEW_CLASS_BPTC_FLOAT = { +constexpr std::array VIEW_CLASS_BPTC_FLOAT{ PixelFormat::BC6H_SFLOAT, PixelFormat::BC6H_UFLOAT, }; +constexpr std::array VIEW_CLASS_ASTC_4x4_RGBA{ + PixelFormat::ASTC_2D_4X4_UNORM, + PixelFormat::ASTC_2D_4X4_SRGB, +}; + +constexpr std::array VIEW_CLASS_ASTC_5x4_RGBA{ + PixelFormat::ASTC_2D_5X4_UNORM, + PixelFormat::ASTC_2D_5X4_SRGB, +}; + +constexpr std::array VIEW_CLASS_ASTC_5x5_RGBA{ + PixelFormat::ASTC_2D_5X5_UNORM, + PixelFormat::ASTC_2D_5X5_SRGB, +}; + +constexpr std::array VIEW_CLASS_ASTC_6x5_RGBA{ + PixelFormat::ASTC_2D_6X5_UNORM, + PixelFormat::ASTC_2D_6X5_SRGB, +}; + +constexpr std::array VIEW_CLASS_ASTC_6x6_RGBA{ + PixelFormat::ASTC_2D_6X6_UNORM, + PixelFormat::ASTC_2D_6X6_SRGB, +}; + +constexpr std::array VIEW_CLASS_ASTC_8x5_RGBA{ + PixelFormat::ASTC_2D_8X5_UNORM, + PixelFormat::ASTC_2D_8X5_SRGB, +}; + +constexpr std::array VIEW_CLASS_ASTC_8x8_RGBA{ + PixelFormat::ASTC_2D_8X8_UNORM, + PixelFormat::ASTC_2D_8X8_SRGB, +}; + +// Missing formats: +// PixelFormat::ASTC_2D_10X5_UNORM +// PixelFormat::ASTC_2D_10X5_SRGB + +// Missing formats: +// PixelFormat::ASTC_2D_10X6_UNORM +// PixelFormat::ASTC_2D_10X6_SRGB + +constexpr std::array VIEW_CLASS_ASTC_10x8_RGBA{ + PixelFormat::ASTC_2D_10X8_UNORM, + PixelFormat::ASTC_2D_10X8_SRGB, +}; + +constexpr std::array VIEW_CLASS_ASTC_10x10_RGBA{ + PixelFormat::ASTC_2D_10X10_UNORM, + PixelFormat::ASTC_2D_10X10_SRGB, +}; + +// Missing formats +// ASTC_2D_12X10_UNORM, +// ASTC_2D_12X10_SRGB, + +constexpr std::array VIEW_CLASS_ASTC_12x12_RGBA{ + PixelFormat::ASTC_2D_12X12_UNORM, + PixelFormat::ASTC_2D_12X12_SRGB, +}; + // Compatibility table taken from Table 4.X.1 in: // https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_copy_image.txt -constexpr std::array COPY_CLASS_128_BITS = { +constexpr std::array COPY_CLASS_128_BITS{ PixelFormat::R32G32B32A32_UINT, PixelFormat::R32G32B32A32_FLOAT, PixelFormat::R32G32B32A32_SINT, PixelFormat::BC2_UNORM, PixelFormat::BC2_SRGB, PixelFormat::BC3_UNORM, PixelFormat::BC3_SRGB, PixelFormat::BC5_UNORM, PixelFormat::BC5_SNORM, @@ -97,7 +159,7 @@ constexpr std::array COPY_CLASS_128_BITS = { // PixelFormat::RGBA32I // COMPRESSED_RG_RGTC2 -constexpr std::array COPY_CLASS_64_BITS = { +constexpr std::array COPY_CLASS_64_BITS{ PixelFormat::R16G16B16A16_FLOAT, PixelFormat::R16G16B16A16_UINT, PixelFormat::R16G16B16A16_UNORM, PixelFormat::R16G16B16A16_SNORM, PixelFormat::R16G16B16A16_SINT, PixelFormat::R32G32_UINT, @@ -110,32 +172,36 @@ constexpr std::array COPY_CLASS_64_BITS = { // COMPRESSED_RGBA_S3TC_DXT1_EXT // COMPRESSED_SIGNED_RED_RGTC1 -void Enable(FormatCompatibility::Table& 
compatiblity, size_t format_a, size_t format_b) { - compatiblity[format_a][format_b] = true; - compatiblity[format_b][format_a] = true; +constexpr void Enable(Table& table, size_t format_a, size_t format_b) { + table[format_a][format_b / 64] |= u64(1) << (format_b % 64); + table[format_b][format_a / 64] |= u64(1) << (format_a % 64); } -void Enable(FormatCompatibility::Table& compatibility, PixelFormat format_a, PixelFormat format_b) { - Enable(compatibility, static_cast<size_t>(format_a), static_cast<size_t>(format_b)); +constexpr void Enable(Table& table, PixelFormat format_a, PixelFormat format_b) { + Enable(table, static_cast<size_t>(format_a), static_cast<size_t>(format_b)); } template <typename Range> -void EnableRange(FormatCompatibility::Table& compatibility, const Range& range) { +constexpr void EnableRange(Table& table, const Range& range) { for (auto it_a = range.begin(); it_a != range.end(); ++it_a) { for (auto it_b = it_a; it_b != range.end(); ++it_b) { - Enable(compatibility, *it_a, *it_b); + Enable(table, *it_a, *it_b); } } } -} // Anonymous namespace +constexpr bool IsSupported(const Table& table, PixelFormat format_a, PixelFormat format_b) { + const size_t a = static_cast<size_t>(format_a); + const size_t b = static_cast<size_t>(format_b); + return ((table[a][b / 64] >> (b % 64)) & 1) != 0; +} -FormatCompatibility::FormatCompatibility() { +constexpr Table MakeViewTable() { + Table view{}; for (size_t i = 0; i < MaxPixelFormat; ++i) { // Identity is allowed Enable(view, i, i); } - EnableRange(view, VIEW_CLASS_128_BITS); EnableRange(view, VIEW_CLASS_96_BITS); EnableRange(view, VIEW_CLASS_64_BITS); @@ -146,10 +212,39 @@ FormatCompatibility::FormatCompatibility() { EnableRange(view, VIEW_CLASS_RGTC2_RG); EnableRange(view, VIEW_CLASS_BPTC_UNORM); EnableRange(view, VIEW_CLASS_BPTC_FLOAT); + EnableRange(view, VIEW_CLASS_ASTC_4x4_RGBA); + EnableRange(view, VIEW_CLASS_ASTC_5x4_RGBA); + EnableRange(view, VIEW_CLASS_ASTC_5x5_RGBA); + EnableRange(view, VIEW_CLASS_ASTC_6x5_RGBA); + EnableRange(view, VIEW_CLASS_ASTC_6x6_RGBA); + EnableRange(view, VIEW_CLASS_ASTC_8x5_RGBA); + EnableRange(view, VIEW_CLASS_ASTC_8x8_RGBA); + EnableRange(view, VIEW_CLASS_ASTC_10x8_RGBA); + EnableRange(view, VIEW_CLASS_ASTC_10x10_RGBA); + EnableRange(view, VIEW_CLASS_ASTC_12x12_RGBA); + return view; +} - copy = view; +constexpr Table MakeCopyTable() { + Table copy = MakeViewTable(); EnableRange(copy, COPY_CLASS_128_BITS); EnableRange(copy, COPY_CLASS_64_BITS); + return copy; +} +} // Anonymous namespace + +bool IsViewCompatible(PixelFormat format_a, PixelFormat format_b, bool broken_views) { + if (broken_views) { + // If format views are broken, only accept formats that are identical. 
+ return format_a == format_b; + } + static constexpr Table TABLE = MakeViewTable(); + return IsSupported(TABLE, format_a, format_b); +} + +bool IsCopyCompatible(PixelFormat format_a, PixelFormat format_b) { + static constexpr Table TABLE = MakeCopyTable(); + return IsSupported(TABLE, format_a, format_b); } } // namespace VideoCore::Surface diff --git a/src/video_core/compatible_formats.h b/src/video_core/compatible_formats.h index 51766349b..9a0522988 100644 --- a/src/video_core/compatible_formats.h +++ b/src/video_core/compatible_formats.h @@ -4,31 +4,12 @@ #pragma once -#include <array> -#include <bitset> -#include <cstddef> - #include "video_core/surface.h" namespace VideoCore::Surface { -class FormatCompatibility { -public: - using Table = std::array<std::bitset<MaxPixelFormat>, MaxPixelFormat>; - - explicit FormatCompatibility(); - - bool TestView(PixelFormat format_a, PixelFormat format_b) const noexcept { - return view[static_cast<size_t>(format_a)][static_cast<size_t>(format_b)]; - } - - bool TestCopy(PixelFormat format_a, PixelFormat format_b) const noexcept { - return copy[static_cast<size_t>(format_a)][static_cast<size_t>(format_b)]; - } +bool IsViewCompatible(PixelFormat format_a, PixelFormat format_b, bool broken_views); -private: - Table view; - Table copy; -}; +bool IsCopyCompatible(PixelFormat format_a, PixelFormat format_b); } // namespace VideoCore::Surface diff --git a/src/video_core/delayed_destruction_ring.h b/src/video_core/delayed_destruction_ring.h new file mode 100644 index 000000000..4f1d29c04 --- /dev/null +++ b/src/video_core/delayed_destruction_ring.h @@ -0,0 +1,32 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <array> +#include <cstddef> +#include <utility> +#include <vector> + +namespace VideoCommon { + +/// Container to push objects to be destroyed a few ticks in the future +template <typename T, size_t TICKS_TO_DESTROY> +class DelayedDestructionRing { +public: + void Tick() { + index = (index + 1) % TICKS_TO_DESTROY; + elements[index].clear(); + } + + void Push(T&& object) { + elements[index].push_back(std::move(object)); + } + +private: + size_t index = 0; + std::array<std::vector<T>, TICKS_TO_DESTROY> elements; +}; + +} // namespace VideoCommon diff --git a/src/video_core/dirty_flags.cpp b/src/video_core/dirty_flags.cpp index e16075993..b1eaac00c 100644 --- a/src/video_core/dirty_flags.cpp +++ b/src/video_core/dirty_flags.cpp @@ -9,13 +9,16 @@ #include "video_core/dirty_flags.h" #define OFF(field_name) MAXWELL3D_REG_INDEX(field_name) -#define NUM(field_name) (sizeof(::Tegra::Engines::Maxwell3D::Regs::field_name) / sizeof(u32)) +#define NUM(field_name) (sizeof(::Tegra::Engines::Maxwell3D::Regs::field_name) / (sizeof(u32))) namespace VideoCommon::Dirty { using Tegra::Engines::Maxwell3D; void SetupDirtyRenderTargets(Tegra::Engines::Maxwell3D::DirtyState::Tables& tables) { + FillBlock(tables[0], OFF(tic), NUM(tic), Descriptors); + FillBlock(tables[0], OFF(tsc), NUM(tsc), Descriptors); + static constexpr std::size_t num_per_rt = NUM(rt[0]); static constexpr std::size_t begin = OFF(rt); static constexpr std::size_t num = num_per_rt * Maxwell3D::Regs::NumRenderTargets; @@ -23,6 +26,10 @@ void SetupDirtyRenderTargets(Tegra::Engines::Maxwell3D::DirtyState::Tables& tabl FillBlock(tables[0], begin + rt * num_per_rt, num_per_rt, ColorBuffer0 + rt); } FillBlock(tables[1], begin, num, RenderTargets); + FillBlock(tables[0], OFF(render_area), 
                              NUM(render_area), RenderTargets);
+
+    tables[0][OFF(rt_control)] = RenderTargets;
+    tables[1][OFF(rt_control)] = RenderTargetControl;
 
     static constexpr std::array zeta_flags{ZetaBuffer, RenderTargets};
     for (std::size_t i = 0; i < std::size(zeta_flags); ++i) {
diff --git a/src/video_core/dirty_flags.h b/src/video_core/dirty_flags.h
index 3f6c1d83a..875527ddd 100644
--- a/src/video_core/dirty_flags.h
+++ b/src/video_core/dirty_flags.h
@@ -16,7 +16,10 @@ namespace VideoCommon::Dirty {
 enum : u8 {
     NullEntry = 0,
 
+    Descriptors,
+    RenderTargets,
+    RenderTargetControl,
     ColorBuffer0,
     ColorBuffer1,
     ColorBuffer2,
diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp
index f2f96ac33..2c8b20024 100644
--- a/src/video_core/dma_pusher.cpp
+++ b/src/video_core/dma_pusher.cpp
@@ -2,6 +2,7 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include "common/cityhash.h"
 #include "common/microprofile.h"
 #include "core/core.h"
 #include "core/memory.h"
@@ -12,7 +13,7 @@
 
 namespace Tegra {
 
-DmaPusher::DmaPusher(Core::System& system, GPU& gpu) : gpu{gpu}, system{system} {}
+DmaPusher::DmaPusher(Core::System& system_, GPU& gpu_) : gpu{gpu_}, system{system_} {}
 
 DmaPusher::~DmaPusher() = default;
 
@@ -45,32 +46,41 @@ bool DmaPusher::Step() {
         return false;
     }
 
-    const CommandList& command_list{dma_pushbuffer.front()};
-    ASSERT_OR_EXECUTE(!command_list.empty(), {
-        // Somehow the command_list is empty, in order to avoid a crash
-        // We ignore it and assume its size is 0.
-        dma_pushbuffer.pop();
-        dma_pushbuffer_subindex = 0;
-        return true;
-    });
-    const CommandListHeader command_list_header{command_list[dma_pushbuffer_subindex++]};
-    const GPUVAddr dma_get = command_list_header.addr;
-
-    if (dma_pushbuffer_subindex >= command_list.size()) {
-        // We've gone through the current list, remove it from the queue
-        dma_pushbuffer.pop();
-        dma_pushbuffer_subindex = 0;
-    }
+    CommandList& command_list{dma_pushbuffer.front()};
 
-    if (command_list_header.size == 0) {
-        return true;
-    }
+    ASSERT_OR_EXECUTE(
+        command_list.command_lists.size() || command_list.prefetch_command_list.size(), {
+            // Somehow the command_list is empty; to avoid a crash,
+            // we ignore it and assume its size is 0.
+            dma_pushbuffer.pop();
+            dma_pushbuffer_subindex = 0;
+            return true;
+        });
+
+    if (command_list.prefetch_command_list.size()) {
+        // Prefetched command list from nvdrv, used for things like synchronization
+        command_headers = std::move(command_list.prefetch_command_list);
+        dma_pushbuffer.pop();
+    } else {
+        const CommandListHeader command_list_header{
+            command_list.command_lists[dma_pushbuffer_subindex++]};
+        const GPUVAddr dma_get = command_list_header.addr;
+
+        if (dma_pushbuffer_subindex >= command_list.command_lists.size()) {
+            // We've gone through the current list, remove it from the queue
+            dma_pushbuffer.pop();
+            dma_pushbuffer_subindex = 0;
+        }
 
-    // Push buffer non-empty, read a word
-    command_headers.resize(command_list_header.size);
-    gpu.MemoryManager().ReadBlockUnsafe(dma_get, command_headers.data(),
-                                        command_list_header.size * sizeof(u32));
+        if (command_list_header.size == 0) {
+            return true;
+        }
+        // Push buffer non-empty, read a word
+        command_headers.resize(command_list_header.size);
+        gpu.MemoryManager().ReadBlockUnsafe(dma_get, command_headers.data(),
+                                            command_list_header.size * sizeof(u32));
+    }
 
     for (std::size_t index = 0; index < command_headers.size();) {
         const CommandHeader& command_header = command_headers[index];
@@ -142,7 +152,12 @@ void DmaPusher::SetState(const CommandHeader& command_header) {
 
 void DmaPusher::CallMethod(u32 argument) const {
     if (dma_state.method < non_puller_methods) {
-        gpu.CallMethod({dma_state.method, argument, dma_state.subchannel, dma_state.method_count});
+        gpu.CallMethod(GPU::MethodCall{
+            dma_state.method,
+            argument,
+            dma_state.subchannel,
+            dma_state.method_count,
+        });
     } else {
         subchannels[dma_state.subchannel]->CallMethod(dma_state.method, argument,
                                                       dma_state.is_last_call);
diff --git a/src/video_core/dma_pusher.h b/src/video_core/dma_pusher.h
index efa90d170..19f286fa7 100644
--- a/src/video_core/dma_pusher.h
+++ b/src/video_core/dma_pusher.h
@@ -18,6 +18,8 @@ class System;
 
 namespace Tegra {
 
+class GPU;
+
 enum class SubmissionMode : u32 {
     IncreasingOld = 0,
     Increasing = 1,
@@ -27,6 +29,31 @@ enum class SubmissionMode : u32 {
     IncreaseOnce = 5
};
 
+// Note that, traditionally, methods are treated as 4-byte addressable locations, and hence
+// their numbers are written down multiplied by 4 in docs. Here we do not multiply by 4,
+// so the values you see in docs may be 4 times the values used here.
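Put differently, the FIFO addresses methods as word (4-byte) indices while hardware documentation lists byte offsets, so a method number in this enum is a quarter of its documented offset: RefCnt below is method 0x14, which docs would list as 0x50, and likewise Unk28 is method 0xA, byte offset 0x28. A hedged sketch of the relationship, in the context of this header's types, together with building a pushbuffer entry via the BuildCommandHeader helper defined a few lines below (the conversion function itself is illustrative):

    // Word-index method number -> byte offset as found in hardware docs.
    constexpr u32 MethodToByteOffset(BufferMethods method) {
        return static_cast<u32>(method) * 4; // e.g. BufferMethods::RefCnt: 0x14 -> 0x50
    }

    // A pushbuffer entry that writes one argument to SemaphoreSequence:
    const CommandHeader header =
        BuildCommandHeader(BufferMethods::SemaphoreSequence, 1, SubmissionMode::Increasing);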
+enum class BufferMethods : u32 { + BindObject = 0x0, + Nop = 0x2, + SemaphoreAddressHigh = 0x4, + SemaphoreAddressLow = 0x5, + SemaphoreSequence = 0x6, + SemaphoreTrigger = 0x7, + NotifyIntr = 0x8, + WrcacheFlush = 0x9, + Unk28 = 0xA, + UnkCacheFlush = 0xB, + RefCnt = 0x14, + SemaphoreAcquire = 0x1A, + SemaphoreRelease = 0x1B, + FenceValue = 0x1C, + FenceAction = 0x1D, + WaitForInterrupt = 0x1E, + Unk7c = 0x1F, + Yield = 0x20, + NonPullerMethods = 0x40, +}; + struct CommandListHeader { union { u64 raw; @@ -49,9 +76,23 @@ union CommandHeader { static_assert(std::is_standard_layout_v<CommandHeader>, "CommandHeader is not standard layout"); static_assert(sizeof(CommandHeader) == sizeof(u32), "CommandHeader has incorrect size!"); -class GPU; +inline CommandHeader BuildCommandHeader(BufferMethods method, u32 arg_count, SubmissionMode mode) { + CommandHeader result{}; + result.method.Assign(static_cast<u32>(method)); + result.arg_count.Assign(arg_count); + result.mode.Assign(mode); + return result; +} -using CommandList = std::vector<Tegra::CommandListHeader>; +struct CommandList final { + CommandList() = default; + explicit CommandList(std::size_t size) : command_lists(size) {} + explicit CommandList(std::vector<CommandHeader>&& prefetch_command_list_) + : prefetch_command_list{std::move(prefetch_command_list_)} {} + + std::vector<CommandListHeader> command_lists; + std::vector<CommandHeader> prefetch_command_list; +}; /** * The DmaPusher class implements DMA submission to FIFOs, providing an area of memory that the @@ -60,9 +101,9 @@ using CommandList = std::vector<Tegra::CommandListHeader>; * See https://envytools.readthedocs.io/en/latest/hw/fifo/dma-pusher.html#fifo-dma-pusher for * details on this implementation. */ -class DmaPusher { +class DmaPusher final { public: - explicit DmaPusher(Core::System& system, GPU& gpu); + explicit DmaPusher(Core::System& system_, GPU& gpu_); ~DmaPusher(); void Push(CommandList&& entries) { @@ -71,7 +112,7 @@ public: void DispatchCalls(); - void BindSubchannel(Tegra::Engines::EngineInterface* engine, u32 subchannel_id) { + void BindSubchannel(Engines::EngineInterface* engine, u32 subchannel_id) { subchannels[subchannel_id] = engine; } @@ -104,7 +145,7 @@ private: bool ib_enable{true}; ///< IB mode enabled - std::array<Tegra::Engines::EngineInterface*, max_subchannels> subchannels{}; + std::array<Engines::EngineInterface*, max_subchannels> subchannels{}; GPU& gpu; Core::System& system; diff --git a/src/video_core/engines/engine_upload.cpp b/src/video_core/engines/engine_upload.cpp index d44ad0cd8..71d7e1473 100644 --- a/src/video_core/engines/engine_upload.cpp +++ b/src/video_core/engines/engine_upload.cpp @@ -11,16 +11,16 @@ namespace Tegra::Engines::Upload { -State::State(MemoryManager& memory_manager, Registers& regs) - : regs{regs}, memory_manager{memory_manager} {} +State::State(MemoryManager& memory_manager_, Registers& regs_) + : regs{regs_}, memory_manager{memory_manager_} {} State::~State() = default; -void State::ProcessExec(const bool is_linear) { +void State::ProcessExec(const bool is_linear_) { write_offset = 0; copy_size = regs.line_length_in * regs.line_count; inner_buffer.resize(copy_size); - this->is_linear = is_linear; + is_linear = is_linear_; } void State::ProcessData(const u32 data, const bool is_last_call) { diff --git a/src/video_core/engines/engine_upload.h b/src/video_core/engines/engine_upload.h index 462da419e..1c7f1effa 100644 --- a/src/video_core/engines/engine_upload.h +++ b/src/video_core/engines/engine_upload.h @@ -54,10 
+54,10 @@ struct Registers { class State { public: - State(MemoryManager& memory_manager, Registers& regs); + explicit State(MemoryManager& memory_manager_, Registers& regs_); ~State(); - void ProcessExec(bool is_linear); + void ProcessExec(bool is_linear_); void ProcessData(u32 data, bool is_last_call); private: diff --git a/src/video_core/engines/fermi_2d.cpp b/src/video_core/engines/fermi_2d.cpp index 9409c4075..a01d334ad 100644 --- a/src/video_core/engines/fermi_2d.cpp +++ b/src/video_core/engines/fermi_2d.cpp @@ -10,7 +10,11 @@ namespace Tegra::Engines { -Fermi2D::Fermi2D() = default; +Fermi2D::Fermi2D() { + // Nvidia's OpenGL driver seems to assume these values + regs.src.depth = 1; + regs.dst.depth = 1; +} Fermi2D::~Fermi2D() = default; @@ -21,79 +25,43 @@ void Fermi2D::BindRasterizer(VideoCore::RasterizerInterface& rasterizer_) { void Fermi2D::CallMethod(u32 method, u32 method_argument, bool is_last_call) { ASSERT_MSG(method < Regs::NUM_REGS, "Invalid Fermi2D register, increase the size of the Regs structure"); - regs.reg_array[method] = method_argument; - switch (method) { - // Trigger the surface copy on the last register write. This is blit_src_y, but this is 64-bit, - // so trigger on the second 32-bit write. - case FERMI2D_REG_INDEX(blit_src_y) + 1: { - HandleSurfaceCopy(); - break; - } + if (method == FERMI2D_REG_INDEX(pixels_from_memory.src_y0) + 1) { + Blit(); } } void Fermi2D::CallMultiMethod(u32 method, const u32* base_start, u32 amount, u32 methods_pending) { - for (std::size_t i = 0; i < amount; i++) { - CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1); + for (u32 i = 0; i < amount; ++i) { + CallMethod(method, base_start[i], methods_pending - i <= 1); } } -static std::pair<u32, u32> DelimitLine(u32 src_1, u32 src_2, u32 dst_1, u32 dst_2, u32 src_line) { - const u32 line_a = src_2 - src_1; - const u32 line_b = dst_2 - dst_1; - const u32 excess = std::max<s32>(0, line_a - src_line + src_1); - return {line_b - (excess * line_b) / line_a, excess}; -} - -void Fermi2D::HandleSurfaceCopy() { - LOG_DEBUG(HW_GPU, "Requested a surface copy with operation {}", - static_cast<u32>(regs.operation)); +void Fermi2D::Blit() { + LOG_DEBUG(HW_GPU, "called. source address=0x{:x}, destination address=0x{:x}", + regs.src.Address(), regs.dst.Address()); - // TODO(Subv): Only raw copies are implemented. 
- ASSERT(regs.operation == Operation::SrcCopy); + UNIMPLEMENTED_IF_MSG(regs.operation != Operation::SrcCopy, "Operation is not copy"); + UNIMPLEMENTED_IF_MSG(regs.src.layer != 0, "Source layer is not zero"); + UNIMPLEMENTED_IF_MSG(regs.dst.layer != 0, "Destination layer is not zero"); + UNIMPLEMENTED_IF_MSG(regs.src.depth != 1, "Source depth is not one"); + UNIMPLEMENTED_IF_MSG(regs.clip_enable != 0, "Clipped blit enabled"); - const u32 src_blit_x1{static_cast<u32>(regs.blit_src_x >> 32)}; - const u32 src_blit_y1{static_cast<u32>(regs.blit_src_y >> 32)}; - u32 src_blit_x2, src_blit_y2; - if (regs.blit_control.origin == Origin::Corner) { - src_blit_x2 = - static_cast<u32>((regs.blit_src_x + (regs.blit_du_dx * regs.blit_dst_width)) >> 32); - src_blit_y2 = - static_cast<u32>((regs.blit_src_y + (regs.blit_dv_dy * regs.blit_dst_height)) >> 32); - } else { - src_blit_x2 = static_cast<u32>((regs.blit_src_x >> 32) + regs.blit_dst_width); - src_blit_y2 = static_cast<u32>((regs.blit_src_y >> 32) + regs.blit_dst_height); - } - u32 dst_blit_x2 = regs.blit_dst_x + regs.blit_dst_width; - u32 dst_blit_y2 = regs.blit_dst_y + regs.blit_dst_height; - const auto [new_dst_w, src_excess_x] = - DelimitLine(src_blit_x1, src_blit_x2, regs.blit_dst_x, dst_blit_x2, regs.src.width); - const auto [new_dst_h, src_excess_y] = - DelimitLine(src_blit_y1, src_blit_y2, regs.blit_dst_y, dst_blit_y2, regs.src.height); - dst_blit_x2 = new_dst_w + regs.blit_dst_x; - src_blit_x2 = src_blit_x2 - src_excess_x; - dst_blit_y2 = new_dst_h + regs.blit_dst_y; - src_blit_y2 = src_blit_y2 - src_excess_y; - const auto [new_src_w, dst_excess_x] = - DelimitLine(regs.blit_dst_x, dst_blit_x2, src_blit_x1, src_blit_x2, regs.dst.width); - const auto [new_src_h, dst_excess_y] = - DelimitLine(regs.blit_dst_y, dst_blit_y2, src_blit_y1, src_blit_y2, regs.dst.height); - src_blit_x2 = new_src_w + src_blit_x1; - dst_blit_x2 = dst_blit_x2 - dst_excess_x; - src_blit_y2 = new_src_h + src_blit_y1; - dst_blit_y2 = dst_blit_y2 - dst_excess_y; - const Common::Rectangle<u32> src_rect{src_blit_x1, src_blit_y1, src_blit_x2, src_blit_y2}; - const Common::Rectangle<u32> dst_rect{regs.blit_dst_x, regs.blit_dst_y, dst_blit_x2, - dst_blit_y2}; - const Config copy_config{ + const auto& args = regs.pixels_from_memory; + const Config config{ .operation = regs.operation, - .filter = regs.blit_control.filter, - .src_rect = src_rect, - .dst_rect = dst_rect, + .filter = args.sample_mode.filter, + .dst_x0 = args.dst_x0, + .dst_y0 = args.dst_y0, + .dst_x1 = args.dst_x0 + args.dst_width, + .dst_y1 = args.dst_y0 + args.dst_height, + .src_x0 = static_cast<s32>(args.src_x0 >> 32), + .src_y0 = static_cast<s32>(args.src_y0 >> 32), + .src_x1 = static_cast<s32>((args.du_dx * args.dst_width + args.src_x0) >> 32), + .src_y1 = static_cast<s32>((args.dv_dy * args.dst_height + args.src_y0) >> 32), }; - if (!rasterizer->AccelerateSurfaceCopy(regs.src, regs.dst, copy_config)) { + if (!rasterizer->AccelerateSurfaceCopy(regs.src, regs.dst, config)) { UNIMPLEMENTED(); } } diff --git a/src/video_core/engines/fermi_2d.h b/src/video_core/engines/fermi_2d.h index 0909709ec..81522988e 100644 --- a/src/video_core/engines/fermi_2d.h +++ b/src/video_core/engines/fermi_2d.h @@ -53,8 +53,8 @@ public: }; enum class Filter : u32 { - PointSample = 0, // Nearest - Linear = 1, + Point = 0, + Bilinear = 1, }; enum class Operation : u32 { @@ -67,88 +67,235 @@ public: BlendPremult = 6, }; - struct Regs { - static constexpr std::size_t NUM_REGS = 0x258; + enum class MemoryLayout : u32 { + BlockLinear = 0, + 
Pitch = 1, + }; - struct Surface { - RenderTargetFormat format; - BitField<0, 1, u32> linear; - union { - BitField<0, 4, u32> block_width; - BitField<4, 4, u32> block_height; - BitField<8, 4, u32> block_depth; - }; - u32 depth; - u32 layer; - u32 pitch; - u32 width; - u32 height; - u32 address_high; - u32 address_low; - - GPUVAddr Address() const { - return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | - address_low); - } - - u32 BlockWidth() const { - return block_width.Value(); - } - - u32 BlockHeight() const { - return block_height.Value(); - } - - u32 BlockDepth() const { - return block_depth.Value(); - } - }; - static_assert(sizeof(Surface) == 0x28, "Surface has incorrect size"); + enum class CpuIndexWrap : u32 { + Wrap = 0, + NoWrap = 1, + }; + struct Surface { + RenderTargetFormat format; + MemoryLayout linear; union { - struct { - INSERT_UNION_PADDING_WORDS(0x80); + BitField<0, 4, u32> block_width; + BitField<4, 4, u32> block_height; + BitField<8, 4, u32> block_depth; + }; + u32 depth; + u32 layer; + u32 pitch; + u32 width; + u32 height; + u32 addr_upper; + u32 addr_lower; + + [[nodiscard]] constexpr GPUVAddr Address() const noexcept { + return (static_cast<GPUVAddr>(addr_upper) << 32) | static_cast<GPUVAddr>(addr_lower); + } + }; + static_assert(sizeof(Surface) == 0x28, "Surface has incorrect size"); - Surface dst; + enum class SectorPromotion : u32 { + NoPromotion = 0, + PromoteTo2V = 1, + PromoteTo2H = 2, + PromoteTo4 = 3, + }; + + enum class NumTpcs : u32 { + All = 0, + One = 1, + }; - INSERT_UNION_PADDING_WORDS(2); + enum class RenderEnableMode : u32 { + False = 0, + True = 1, + Conditional = 2, + RenderIfEqual = 3, + RenderIfNotEqual = 4, + }; - Surface src; + enum class ColorKeyFormat : u32 { + A16R56G6B5 = 0, + A1R5G55B5 = 1, + A8R8G8B8 = 2, + A2R10G10B10 = 3, + Y8 = 4, + Y16 = 5, + Y32 = 6, + }; - INSERT_UNION_PADDING_WORDS(0x15); + union Beta4 { + BitField<0, 8, u32> b; + BitField<8, 8, u32> g; + BitField<16, 8, u32> r; + BitField<24, 8, u32> a; + }; - Operation operation; + struct Point { + u32 x; + u32 y; + }; - INSERT_UNION_PADDING_WORDS(0x177); + enum class PatternSelect : u32 { + MonoChrome8x8 = 0, + MonoChrome64x1 = 1, + MonoChrome1x64 = 2, + Color = 3, + }; + enum class NotifyType : u32 { + WriteOnly = 0, + WriteThenAwaken = 1, + }; + + enum class MonochromePatternColorFormat : u32 { + A8X8R8G6B5 = 0, + A1R5G5B5 = 1, + A8R8G8B8 = 2, + A8Y8 = 3, + A8X8Y16 = 4, + Y32 = 5, + }; + + enum class MonochromePatternFormat : u32 { + CGA6_M1 = 0, + LE_M1 = 1, + }; + + union Regs { + static constexpr std::size_t NUM_REGS = 0x258; + struct { + u32 object; + INSERT_UNION_PADDING_WORDS(0x3F); + u32 no_operation; + NotifyType notify; + INSERT_UNION_PADDING_WORDS(0x2); + u32 wait_for_idle; + INSERT_UNION_PADDING_WORDS(0xB); + u32 pm_trigger; + INSERT_UNION_PADDING_WORDS(0xF); + u32 context_dma_notify; + u32 dst_context_dma; + u32 src_context_dma; + u32 semaphore_context_dma; + INSERT_UNION_PADDING_WORDS(0x1C); + Surface dst; + CpuIndexWrap pixels_from_cpu_index_wrap; + u32 kind2d_check_enable; + Surface src; + SectorPromotion pixels_from_memory_sector_promotion; + INSERT_UNION_PADDING_WORDS(0x1); + NumTpcs num_tpcs; + u32 render_enable_addr_upper; + u32 render_enable_addr_lower; + RenderEnableMode render_enable_mode; + INSERT_UNION_PADDING_WORDS(0x4); + u32 clip_x0; + u32 clip_y0; + u32 clip_width; + u32 clip_height; + BitField<0, 1, u32> clip_enable; + BitField<0, 3, ColorKeyFormat> color_key_format; + u32 color_key; + BitField<0, 1, u32> color_key_enable; + 
BitField<0, 8, u32> rop; + u32 beta1; + Beta4 beta4; + Operation operation; + union { + BitField<0, 6, u32> x; + BitField<8, 6, u32> y; + } pattern_offset; + BitField<0, 2, PatternSelect> pattern_select; + INSERT_UNION_PADDING_WORDS(0xC); + struct { + BitField<0, 3, MonochromePatternColorFormat> color_format; + BitField<0, 1, MonochromePatternFormat> format; + u32 color0; + u32 color1; + u32 pattern0; + u32 pattern1; + } monochrome_pattern; + struct { + std::array<u32, 0x40> X8R8G8B8; + std::array<u32, 0x20> R5G6B5; + std::array<u32, 0x20> X1R5G5B5; + std::array<u32, 0x10> Y8; + } color_pattern; + INSERT_UNION_PADDING_WORDS(0x10); + struct { + u32 prim_mode; + u32 prim_color_format; + u32 prim_color; + u32 line_tie_break_bits; + INSERT_UNION_PADDING_WORDS(0x14); + u32 prim_point_xy; + INSERT_UNION_PADDING_WORDS(0x7); + std::array<Point, 0x40> prim_point; + } render_solid; + struct { + u32 data_type; + u32 color_format; + u32 index_format; + u32 mono_format; + u32 wrap; + u32 color0; + u32 color1; + u32 mono_opacity; + INSERT_UNION_PADDING_WORDS(0x6); + u32 src_width; + u32 src_height; + u32 dx_du_frac; + u32 dx_du_int; + u32 dx_dv_frac; + u32 dy_dv_int; + u32 dst_x0_frac; + u32 dst_x0_int; + u32 dst_y0_frac; + u32 dst_y0_int; + u32 data; + } pixels_from_cpu; + INSERT_UNION_PADDING_WORDS(0x3); + u32 big_endian_control; + INSERT_UNION_PADDING_WORDS(0x3); + struct { + BitField<0, 3, u32> block_shape; + BitField<0, 5, u32> corral_size; + BitField<0, 1, u32> safe_overlap; union { - u32 raw; BitField<0, 1, Origin> origin; BitField<4, 1, Filter> filter; - } blit_control; - + } sample_mode; INSERT_UNION_PADDING_WORDS(0x8); - - u32 blit_dst_x; - u32 blit_dst_y; - u32 blit_dst_width; - u32 blit_dst_height; - u64 blit_du_dx; - u64 blit_dv_dy; - u64 blit_src_x; - u64 blit_src_y; - - INSERT_UNION_PADDING_WORDS(0x21); - }; - std::array<u32, NUM_REGS> reg_array; + s32 dst_x0; + s32 dst_y0; + s32 dst_width; + s32 dst_height; + s64 du_dx; + s64 dv_dy; + s64 src_x0; + s64 src_y0; + } pixels_from_memory; }; + std::array<u32, NUM_REGS> reg_array; } regs{}; struct Config { - Operation operation{}; - Filter filter{}; - Common::Rectangle<u32> src_rect; - Common::Rectangle<u32> dst_rect; + Operation operation; + Filter filter; + s32 dst_x0; + s32 dst_y0; + s32 dst_x1; + s32 dst_y1; + s32 src_x0; + s32 src_y0; + s32 src_x1; + s32 src_y1; }; private: @@ -156,25 +303,49 @@ private: /// Performs the copy from the source surface to the destination surface as configured in the /// registers. 
- void HandleSurfaceCopy(); + void Blit(); }; #define ASSERT_REG_POSITION(field_name, position) \ - static_assert(offsetof(Fermi2D::Regs, field_name) == position * 4, \ + static_assert(offsetof(Fermi2D::Regs, field_name) == position, \ "Field " #field_name " has invalid position") -ASSERT_REG_POSITION(dst, 0x80); -ASSERT_REG_POSITION(src, 0x8C); -ASSERT_REG_POSITION(operation, 0xAB); -ASSERT_REG_POSITION(blit_control, 0x223); -ASSERT_REG_POSITION(blit_dst_x, 0x22c); -ASSERT_REG_POSITION(blit_dst_y, 0x22d); -ASSERT_REG_POSITION(blit_dst_width, 0x22e); -ASSERT_REG_POSITION(blit_dst_height, 0x22f); -ASSERT_REG_POSITION(blit_du_dx, 0x230); -ASSERT_REG_POSITION(blit_dv_dy, 0x232); -ASSERT_REG_POSITION(blit_src_x, 0x234); -ASSERT_REG_POSITION(blit_src_y, 0x236); +ASSERT_REG_POSITION(object, 0x0); +ASSERT_REG_POSITION(no_operation, 0x100); +ASSERT_REG_POSITION(notify, 0x104); +ASSERT_REG_POSITION(wait_for_idle, 0x110); +ASSERT_REG_POSITION(pm_trigger, 0x140); +ASSERT_REG_POSITION(context_dma_notify, 0x180); +ASSERT_REG_POSITION(dst_context_dma, 0x184); +ASSERT_REG_POSITION(src_context_dma, 0x188); +ASSERT_REG_POSITION(semaphore_context_dma, 0x18C); +ASSERT_REG_POSITION(dst, 0x200); +ASSERT_REG_POSITION(pixels_from_cpu_index_wrap, 0x228); +ASSERT_REG_POSITION(kind2d_check_enable, 0x22C); +ASSERT_REG_POSITION(src, 0x230); +ASSERT_REG_POSITION(pixels_from_memory_sector_promotion, 0x258); +ASSERT_REG_POSITION(num_tpcs, 0x260); +ASSERT_REG_POSITION(render_enable_addr_upper, 0x264); +ASSERT_REG_POSITION(render_enable_addr_lower, 0x268); +ASSERT_REG_POSITION(clip_x0, 0x280); +ASSERT_REG_POSITION(clip_y0, 0x284); +ASSERT_REG_POSITION(clip_width, 0x288); +ASSERT_REG_POSITION(clip_height, 0x28c); +ASSERT_REG_POSITION(clip_enable, 0x290); +ASSERT_REG_POSITION(color_key_format, 0x294); +ASSERT_REG_POSITION(color_key, 0x298); +ASSERT_REG_POSITION(rop, 0x2A0); +ASSERT_REG_POSITION(beta1, 0x2A4); +ASSERT_REG_POSITION(beta4, 0x2A8); +ASSERT_REG_POSITION(operation, 0x2AC); +ASSERT_REG_POSITION(pattern_offset, 0x2B0); +ASSERT_REG_POSITION(pattern_select, 0x2B4); +ASSERT_REG_POSITION(monochrome_pattern, 0x2E8); +ASSERT_REG_POSITION(color_pattern, 0x300); +ASSERT_REG_POSITION(render_solid, 0x580); +ASSERT_REG_POSITION(pixels_from_cpu, 0x800); +ASSERT_REG_POSITION(big_endian_control, 0x870); +ASSERT_REG_POSITION(pixels_from_memory, 0x880); #undef ASSERT_REG_POSITION diff --git a/src/video_core/engines/kepler_compute.cpp b/src/video_core/engines/kepler_compute.cpp index 898370739..ba387506e 100644 --- a/src/video_core/engines/kepler_compute.cpp +++ b/src/video_core/engines/kepler_compute.cpp @@ -58,24 +58,6 @@ void KeplerCompute::CallMultiMethod(u32 method, const u32* base_start, u32 amoun } } -Texture::FullTextureInfo KeplerCompute::GetTexture(std::size_t offset) const { - const std::bitset<8> cbuf_mask = launch_description.const_buffer_enable_mask.Value(); - ASSERT(cbuf_mask[regs.tex_cb_index]); - - const auto& texinfo = launch_description.const_buffer_config[regs.tex_cb_index]; - ASSERT(texinfo.Address() != 0); - - const GPUVAddr address = texinfo.Address() + offset * sizeof(Texture::TextureHandle); - ASSERT(address < texinfo.Address() + texinfo.size); - - const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(address)}; - return GetTextureInfo(tex_handle); -} - -Texture::FullTextureInfo KeplerCompute::GetTextureInfo(Texture::TextureHandle tex_handle) const { - return Texture::FullTextureInfo{GetTICEntry(tex_handle.tic_id), GetTSCEntry(tex_handle.tsc_id)}; -} - u32 
KeplerCompute::AccessConstBuffer32(ShaderType stage, u64 const_buffer, u64 offset) const { ASSERT(stage == ShaderType::Compute); const auto& buffer = launch_description.const_buffer_config[const_buffer]; @@ -98,9 +80,11 @@ SamplerDescriptor KeplerCompute::AccessBindlessSampler(ShaderType stage, u64 con SamplerDescriptor KeplerCompute::AccessSampler(u32 handle) const { const Texture::TextureHandle tex_handle{handle}; - const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle); - SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic); - result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value()); + const Texture::TICEntry tic = GetTICEntry(tex_handle.tic_id); + const Texture::TSCEntry tsc = GetTSCEntry(tex_handle.tsc_id); + + SamplerDescriptor result = SamplerDescriptor::FromTIC(tic); + result.is_shadow.Assign(tsc.depth_compare_enabled.Value()); return result; } diff --git a/src/video_core/engines/kepler_compute.h b/src/video_core/engines/kepler_compute.h index 7f2500aab..51a041202 100644 --- a/src/video_core/engines/kepler_compute.h +++ b/src/video_core/engines/kepler_compute.h @@ -209,11 +209,6 @@ public: void CallMultiMethod(u32 method, const u32* base_start, u32 amount, u32 methods_pending) override; - Texture::FullTextureInfo GetTexture(std::size_t offset) const; - - /// Given a texture handle, returns the TSC and TIC entries. - Texture::FullTextureInfo GetTextureInfo(Texture::TextureHandle tex_handle) const; - u32 AccessConstBuffer32(ShaderType stage, u64 const_buffer, u64 offset) const override; SamplerDescriptor AccessBoundSampler(ShaderType stage, u64 offset) const override; diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp index dc71b2eec..9911140e9 100644 --- a/src/video_core/engines/kepler_memory.cpp +++ b/src/video_core/engines/kepler_memory.cpp @@ -14,8 +14,8 @@ namespace Tegra::Engines { -KeplerMemory::KeplerMemory(Core::System& system, MemoryManager& memory_manager) - : system{system}, upload_state{memory_manager, regs.upload} {} +KeplerMemory::KeplerMemory(Core::System& system_, MemoryManager& memory_manager) + : system{system_}, upload_state{memory_manager, regs.upload} {} KeplerMemory::~KeplerMemory() = default; diff --git a/src/video_core/engines/kepler_memory.h b/src/video_core/engines/kepler_memory.h index 5b7f71a00..62483589e 100644 --- a/src/video_core/engines/kepler_memory.h +++ b/src/video_core/engines/kepler_memory.h @@ -35,7 +35,7 @@ namespace Tegra::Engines { class KeplerMemory final : public EngineInterface { public: - KeplerMemory(Core::System& system, MemoryManager& memory_manager); + explicit KeplerMemory(Core::System& system_, MemoryManager& memory_manager); ~KeplerMemory(); /// Write the value to the register identified by method. diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 57ebc785f..9be651e24 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -2,7 +2,6 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -#include <cinttypes> #include <cstring> #include <optional> #include "common/assert.h" @@ -124,6 +123,116 @@ void Maxwell3D::InitializeRegisterDefaults() { mme_inline[MAXWELL3D_REG_INDEX(index_array.count)] = true; } +void Maxwell3D::ProcessMacro(u32 method, const u32* base_start, u32 amount, bool is_last_call) { + if (executing_macro == 0) { + // A macro call must begin by writing the macro method's register, not its argument. 
+ ASSERT_MSG((method % 2) == 0, + "Can't start macro execution by writing to the ARGS register"); + executing_macro = method; + } + + macro_params.insert(macro_params.end(), base_start, base_start + amount); + + // Call the macro when there are no more parameters in the command buffer + if (is_last_call) { + CallMacroMethod(executing_macro, macro_params); + macro_params.clear(); + } +} + +u32 Maxwell3D::ProcessShadowRam(u32 method, u32 argument) { + // Keep track of the register value in shadow_state when requested. + const auto control = shadow_state.shadow_ram_control; + if (control == Regs::ShadowRamControl::Track || + control == Regs::ShadowRamControl::TrackWithFilter) { + shadow_state.reg_array[method] = argument; + return argument; + } + if (control == Regs::ShadowRamControl::Replay) { + return shadow_state.reg_array[method]; + } + return argument; +} + +void Maxwell3D::ProcessDirtyRegisters(u32 method, u32 argument) { + if (regs.reg_array[method] == argument) { + return; + } + regs.reg_array[method] = argument; + + for (const auto& table : dirty.tables) { + dirty.flags[table[method]] = true; + } +} + +void Maxwell3D::ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argument, + bool is_last_call) { + switch (method) { + case MAXWELL3D_REG_INDEX(wait_for_idle): + return rasterizer->WaitForIdle(); + case MAXWELL3D_REG_INDEX(shadow_ram_control): + shadow_state.shadow_ram_control = static_cast<Regs::ShadowRamControl>(nonshadow_argument); + return; + case MAXWELL3D_REG_INDEX(macros.data): + return macro_engine->AddCode(regs.macros.upload_address, argument); + case MAXWELL3D_REG_INDEX(macros.bind): + return ProcessMacroBind(argument); + case MAXWELL3D_REG_INDEX(firmware[4]): + return ProcessFirmwareCall4(); + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[0]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[1]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[2]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[3]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[4]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[5]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[6]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[7]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[8]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[9]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[10]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[11]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[12]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[13]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[14]): + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[15]): + return StartCBData(method); + case MAXWELL3D_REG_INDEX(cb_bind[0]): + return ProcessCBBind(0); + case MAXWELL3D_REG_INDEX(cb_bind[1]): + return ProcessCBBind(1); + case MAXWELL3D_REG_INDEX(cb_bind[2]): + return ProcessCBBind(2); + case MAXWELL3D_REG_INDEX(cb_bind[3]): + return ProcessCBBind(3); + case MAXWELL3D_REG_INDEX(cb_bind[4]): + return ProcessCBBind(4); + case MAXWELL3D_REG_INDEX(draw.vertex_end_gl): + return DrawArrays(); + case MAXWELL3D_REG_INDEX(clear_buffers): + return ProcessClearBuffers(); + case MAXWELL3D_REG_INDEX(query.query_get): + return ProcessQueryGet(); + case MAXWELL3D_REG_INDEX(condition.mode): + return ProcessQueryCondition(); + case MAXWELL3D_REG_INDEX(counter_reset): + return ProcessCounterReset(); + case MAXWELL3D_REG_INDEX(sync_info): + return ProcessSyncPoint(); + case MAXWELL3D_REG_INDEX(exec_upload): + return upload_state.ProcessExec(regs.exec_upload.linear != 0); + case MAXWELL3D_REG_INDEX(data_upload): + 
upload_state.ProcessData(argument, is_last_call); + if (is_last_call) { + OnMemoryWrite(); + } + return; + case MAXWELL3D_REG_INDEX(fragment_barrier): + return rasterizer->FragmentBarrier(); + case MAXWELL3D_REG_INDEX(tiled_cache_barrier): + return rasterizer->TiledCacheBarrier(); + } +} + void Maxwell3D::CallMacroMethod(u32 method, const std::vector<u32>& parameters) { // Reset the current macro. executing_macro = 0; @@ -157,142 +266,16 @@ void Maxwell3D::CallMethod(u32 method, u32 method_argument, bool is_last_call) { // Methods after 0xE00 are special, they're actually triggers for some microcode that was // uploaded to the GPU during initialization. if (method >= MacroRegistersStart) { - // We're trying to execute a macro - if (executing_macro == 0) { - // A macro call must begin by writing the macro method's register, not its argument. - ASSERT_MSG((method % 2) == 0, - "Can't start macro execution by writing to the ARGS register"); - executing_macro = method; - } - - macro_params.push_back(method_argument); - - // Call the macro when there are no more parameters in the command buffer - if (is_last_call) { - CallMacroMethod(executing_macro, macro_params); - macro_params.clear(); - } + ProcessMacro(method, &method_argument, 1, is_last_call); return; } ASSERT_MSG(method < Regs::NUM_REGS, "Invalid Maxwell3D register, increase the size of the Regs structure"); - u32 arg = method_argument; - // Keep track of the register value in shadow_state when requested. - if (shadow_state.shadow_ram_control == Regs::ShadowRamControl::Track || - shadow_state.shadow_ram_control == Regs::ShadowRamControl::TrackWithFilter) { - shadow_state.reg_array[method] = arg; - } else if (shadow_state.shadow_ram_control == Regs::ShadowRamControl::Replay) { - arg = shadow_state.reg_array[method]; - } - - if (regs.reg_array[method] != arg) { - regs.reg_array[method] = arg; - - for (const auto& table : dirty.tables) { - dirty.flags[table[method]] = true; - } - } - - switch (method) { - case MAXWELL3D_REG_INDEX(wait_for_idle): { - rasterizer->WaitForIdle(); - break; - } - case MAXWELL3D_REG_INDEX(shadow_ram_control): { - shadow_state.shadow_ram_control = static_cast<Regs::ShadowRamControl>(method_argument); - break; - } - case MAXWELL3D_REG_INDEX(macros.data): { - macro_engine->AddCode(regs.macros.upload_address, arg); - break; - } - case MAXWELL3D_REG_INDEX(macros.bind): { - ProcessMacroBind(arg); - break; - } - case MAXWELL3D_REG_INDEX(firmware[4]): { - ProcessFirmwareCall4(); - break; - } - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[0]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[1]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[2]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[3]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[4]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[5]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[6]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[7]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[8]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[9]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[10]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[11]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[12]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[13]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[14]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[15]): { - StartCBData(method); - break; - } - case MAXWELL3D_REG_INDEX(cb_bind[0]): { - ProcessCBBind(0); - break; - } - case MAXWELL3D_REG_INDEX(cb_bind[1]): { - ProcessCBBind(1); - break; - } - case 
MAXWELL3D_REG_INDEX(cb_bind[2]): { - ProcessCBBind(2); - break; - } - case MAXWELL3D_REG_INDEX(cb_bind[3]): { - ProcessCBBind(3); - break; - } - case MAXWELL3D_REG_INDEX(cb_bind[4]): { - ProcessCBBind(4); - break; - } - case MAXWELL3D_REG_INDEX(draw.vertex_end_gl): { - DrawArrays(); - break; - } - case MAXWELL3D_REG_INDEX(clear_buffers): { - ProcessClearBuffers(); - break; - } - case MAXWELL3D_REG_INDEX(query.query_get): { - ProcessQueryGet(); - break; - } - case MAXWELL3D_REG_INDEX(condition.mode): { - ProcessQueryCondition(); - break; - } - case MAXWELL3D_REG_INDEX(counter_reset): { - ProcessCounterReset(); - break; - } - case MAXWELL3D_REG_INDEX(sync_info): { - ProcessSyncPoint(); - break; - } - case MAXWELL3D_REG_INDEX(exec_upload): { - upload_state.ProcessExec(regs.exec_upload.linear != 0); - break; - } - case MAXWELL3D_REG_INDEX(data_upload): { - upload_state.ProcessData(arg, is_last_call); - if (is_last_call) { - OnMemoryWrite(); - } - break; - } - default: - break; - } + const u32 argument = ProcessShadowRam(method, method_argument); + ProcessDirtyRegisters(method, argument); + ProcessMethodCall(method, argument, method_argument, is_last_call); } void Maxwell3D::CallMultiMethod(u32 method, const u32* base_start, u32 amount, @@ -300,23 +283,7 @@ void Maxwell3D::CallMultiMethod(u32 method, const u32* base_start, u32 amount, // Methods after 0xE00 are special, they're actually triggers for some microcode that was // uploaded to the GPU during initialization. if (method >= MacroRegistersStart) { - // We're trying to execute a macro - if (executing_macro == 0) { - // A macro call must begin by writing the macro method's register, not its argument. - ASSERT_MSG((method % 2) == 0, - "Can't start macro execution by writing to the ARGS register"); - executing_macro = method; - } - - for (std::size_t i = 0; i < amount; i++) { - macro_params.push_back(base_start[i]); - } - - // Call the macro when there are no more parameters in the command buffer - if (amount == methods_pending) { - CallMacroMethod(executing_macro, macro_params); - macro_params.clear(); - } + ProcessMacro(method, base_start, amount, amount == methods_pending); return; } switch (method) { @@ -335,15 +302,14 @@ void Maxwell3D::CallMultiMethod(u32 method, const u32* base_start, u32 amount, case MAXWELL3D_REG_INDEX(const_buffer.cb_data[12]): case MAXWELL3D_REG_INDEX(const_buffer.cb_data[13]): case MAXWELL3D_REG_INDEX(const_buffer.cb_data[14]): - case MAXWELL3D_REG_INDEX(const_buffer.cb_data[15]): { + case MAXWELL3D_REG_INDEX(const_buffer.cb_data[15]): ProcessCBMultiData(method, base_start, amount); break; - } - default: { + default: for (std::size_t i = 0; i < amount; i++) { CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1); } - } + break; } } @@ -396,7 +362,7 @@ void Maxwell3D::CallMethodFromMME(u32 method, u32 method_argument) { } void Maxwell3D::FlushMMEInlineDraw() { - LOG_TRACE(HW_GPU, "called, topology={}, count={}", static_cast<u32>(regs.draw.topology.Value()), + LOG_TRACE(HW_GPU, "called, topology={}, count={}", regs.draw.topology.Value(), regs.vertex_buffer.count); ASSERT_MSG(!(regs.index_array.count && regs.vertex_buffer.count), "Both indexed and direct?"); ASSERT(mme_draw.instance_count == mme_draw.gl_end_count); @@ -541,8 +507,7 @@ void Maxwell3D::ProcessCounterReset() { rasterizer->ResetCounter(QueryType::SamplesPassed); break; default: - LOG_DEBUG(Render_OpenGL, "Unimplemented counter reset={}", - static_cast<int>(regs.counter_reset)); + LOG_DEBUG(Render_OpenGL, "Unimplemented counter 
reset={}", regs.counter_reset); break; } } @@ -557,7 +522,7 @@ void Maxwell3D::ProcessSyncPoint() { } void Maxwell3D::DrawArrays() { - LOG_TRACE(HW_GPU, "called, topology={}, count={}", static_cast<u32>(regs.draw.topology.Value()), + LOG_TRACE(HW_GPU, "called, topology={}, count={}", regs.draw.topology.Value(), regs.vertex_buffer.count); ASSERT_MSG(!(regs.index_array.count && regs.vertex_buffer.count), "Both indexed and direct?"); @@ -595,12 +560,12 @@ std::optional<u64> Maxwell3D::GetQueryResult() { return 0; case Regs::QuerySelect::SamplesPassed: // Deferred. - rasterizer->Query(regs.query.QueryAddress(), VideoCore::QueryType::SamplesPassed, + rasterizer->Query(regs.query.QueryAddress(), QueryType::SamplesPassed, system.GPU().GetTicks()); return std::nullopt; default: LOG_DEBUG(HW_GPU, "Unimplemented query select type {}", - static_cast<u32>(regs.query.query_get.select.Value())); + regs.query.query_get.select.Value()); return 1; } } @@ -677,7 +642,7 @@ void Maxwell3D::FinishCBData() { } Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const { - const GPUVAddr tic_address_gpu{regs.tic.TICAddress() + tic_index * sizeof(Texture::TICEntry)}; + const GPUVAddr tic_address_gpu{regs.tic.Address() + tic_index * sizeof(Texture::TICEntry)}; Texture::TICEntry tic_entry; memory_manager.ReadBlockUnsafe(tic_address_gpu, &tic_entry, sizeof(Texture::TICEntry)); @@ -686,43 +651,19 @@ Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const { } Texture::TSCEntry Maxwell3D::GetTSCEntry(u32 tsc_index) const { - const GPUVAddr tsc_address_gpu{regs.tsc.TSCAddress() + tsc_index * sizeof(Texture::TSCEntry)}; + const GPUVAddr tsc_address_gpu{regs.tsc.Address() + tsc_index * sizeof(Texture::TSCEntry)}; Texture::TSCEntry tsc_entry; memory_manager.ReadBlockUnsafe(tsc_address_gpu, &tsc_entry, sizeof(Texture::TSCEntry)); return tsc_entry; } -Texture::FullTextureInfo Maxwell3D::GetTextureInfo(Texture::TextureHandle tex_handle) const { - return Texture::FullTextureInfo{GetTICEntry(tex_handle.tic_id), GetTSCEntry(tex_handle.tsc_id)}; -} - -Texture::FullTextureInfo Maxwell3D::GetStageTexture(ShaderType stage, std::size_t offset) const { - const auto stage_index = static_cast<std::size_t>(stage); - const auto& shader = state.shader_stages[stage_index]; - const auto& tex_info_buffer = shader.const_buffers[regs.tex_cb_index]; - ASSERT(tex_info_buffer.enabled && tex_info_buffer.address != 0); - - const GPUVAddr tex_info_address = - tex_info_buffer.address + offset * sizeof(Texture::TextureHandle); - - ASSERT(tex_info_address < tex_info_buffer.address + tex_info_buffer.size); - - const Texture::TextureHandle tex_handle{memory_manager.Read<u32>(tex_info_address)}; - - return GetTextureInfo(tex_handle); -} - u32 Maxwell3D::GetRegisterValue(u32 method) const { ASSERT_MSG(method < Regs::NUM_REGS, "Invalid Maxwell3D register"); return regs.reg_array[method]; } void Maxwell3D::ProcessClearBuffers() { - ASSERT(regs.clear_buffers.R == regs.clear_buffers.G && - regs.clear_buffers.R == regs.clear_buffers.B && - regs.clear_buffers.R == regs.clear_buffers.A); - rasterizer->Clear(); } @@ -730,9 +671,7 @@ u32 Maxwell3D::AccessConstBuffer32(ShaderType stage, u64 const_buffer, u64 offse ASSERT(stage != ShaderType::Compute); const auto& shader_stage = state.shader_stages[static_cast<std::size_t>(stage)]; const auto& buffer = shader_stage.const_buffers[const_buffer]; - u32 result; - std::memcpy(&result, memory_manager.GetPointer(buffer.address + offset), sizeof(u32)); - return result; + return memory_manager.Read<u32>(buffer.address 
+ offset); } SamplerDescriptor Maxwell3D::AccessBoundSampler(ShaderType stage, u64 offset) const { @@ -750,9 +689,11 @@ SamplerDescriptor Maxwell3D::AccessBindlessSampler(ShaderType stage, u64 const_b SamplerDescriptor Maxwell3D::AccessSampler(u32 handle) const { const Texture::TextureHandle tex_handle{handle}; - const Texture::FullTextureInfo tex_info = GetTextureInfo(tex_handle); - SamplerDescriptor result = SamplerDescriptor::FromTIC(tex_info.tic); - result.is_shadow.Assign(tex_info.tsc.depth_compare_enabled.Value()); + const Texture::TICEntry tic = GetTICEntry(tex_handle.tic_id); + const Texture::TSCEntry tsc = GetTSCEntry(tex_handle.tsc_id); + + SamplerDescriptor result = SamplerDescriptor::FromTIC(tic); + result.is_shadow.Assign(tsc.depth_compare_enabled.Value()); return result; } diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index bc289c55d..bf9e07c9b 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -438,16 +438,6 @@ public: DecrWrapOGL = 0x8508, }; - enum class MemoryLayout : u32 { - Linear = 0, - BlockLinear = 1, - }; - - enum class InvMemoryLayout : u32 { - BlockLinear = 0, - Linear = 1, - }; - enum class CounterReset : u32 { SampleCnt = 0x01, Unk02 = 0x02, @@ -589,21 +579,31 @@ public: NegativeW = 7, }; + enum class SamplerIndex : u32 { + Independently = 0, + ViaHeaderIndex = 1, + }; + + struct TileMode { + union { + BitField<0, 4, u32> block_width; + BitField<4, 4, u32> block_height; + BitField<8, 4, u32> block_depth; + BitField<12, 1, u32> is_pitch_linear; + BitField<16, 1, u32> is_3d; + }; + }; + static_assert(sizeof(TileMode) == 4); + struct RenderTargetConfig { u32 address_high; u32 address_low; u32 width; u32 height; Tegra::RenderTargetFormat format; + TileMode tile_mode; union { - BitField<0, 3, u32> block_width; - BitField<4, 3, u32> block_height; - BitField<8, 3, u32> block_depth; - BitField<12, 1, InvMemoryLayout> type; - BitField<16, 1, u32> is_3d; - } memory_layout; - union { - BitField<0, 16, u32> layers; + BitField<0, 16, u32> depth; BitField<16, 1, u32> volume; }; u32 layer_stride; @@ -755,7 +755,11 @@ public: u32 data_upload; - INSERT_UNION_PADDING_WORDS(0x44); + INSERT_UNION_PADDING_WORDS(0x16); + + u32 force_early_fragment_tests; + + INSERT_UNION_PADDING_WORDS(0x2D); struct { union { @@ -828,7 +832,11 @@ public: u32 patch_vertices; - INSERT_UNION_PADDING_WORDS(0xC); + INSERT_UNION_PADDING_WORDS(0x4); + + u32 fragment_barrier; + + INSERT_UNION_PADDING_WORDS(0x7); std::array<ScissorTest, NumViewports> scissor_test; @@ -838,7 +846,15 @@ public: u32 stencil_back_mask; u32 stencil_back_func_mask; - INSERT_UNION_PADDING_WORDS(0xC); + INSERT_UNION_PADDING_WORDS(0x5); + + u32 invalidate_texture_data_cache; + + INSERT_UNION_PADDING_WORDS(0x1); + + u32 tiled_cache_barrier; + + INSERT_UNION_PADDING_WORDS(0x4); u32 color_mask_common; @@ -862,12 +878,7 @@ public: u32 address_high; u32 address_low; Tegra::DepthFormat format; - union { - BitField<0, 4, u32> block_width; - BitField<4, 4, u32> block_height; - BitField<8, 4, u32> block_depth; - BitField<20, 1, InvMemoryLayout> type; - } memory_layout; + TileMode tile_mode; u32 layer_stride; GPUVAddr Address() const { @@ -876,7 +887,18 @@ public: } } zeta; - INSERT_UNION_PADDING_WORDS(0x41); + struct { + union { + BitField<0, 16, u32> x; + BitField<16, 16, u32> width; + }; + union { + BitField<0, 16, u32> y; + BitField<16, 16, u32> height; + }; + } render_area; + + INSERT_UNION_PADDING_WORDS(0x3F); union { BitField<0, 4, u32> stencil; @@ -917,7 
+939,7 @@ public: BitField<25, 3, u32> map_7; }; - u32 GetMap(std::size_t index) const { + u32 Map(std::size_t index) const { const std::array<u32, NumRenderTargets> maps{map_0, map_1, map_2, map_3, map_4, map_5, map_6, map_7}; ASSERT(index < maps.size()); @@ -930,11 +952,13 @@ public: u32 zeta_width; u32 zeta_height; union { - BitField<0, 16, u32> zeta_layers; + BitField<0, 16, u32> zeta_depth; BitField<16, 1, u32> zeta_volume; }; - INSERT_UNION_PADDING_WORDS(0x26); + SamplerIndex sampler_index; + + INSERT_UNION_PADDING_WORDS(0x25); u32 depth_test_enable; @@ -960,6 +984,7 @@ public: float b; float a; } blend_color; + INSERT_UNION_PADDING_WORDS(0x4); struct { @@ -997,7 +1022,12 @@ public: float line_width_smooth; float line_width_aliased; - INSERT_UNION_PADDING_WORDS(0x1F); + INSERT_UNION_PADDING_WORDS(0x1B); + + u32 invalidate_sampler_cache_no_wfi; + u32 invalidate_texture_header_cache_no_wfi; + + INSERT_UNION_PADDING_WORDS(0x2); u32 vb_element_base; u32 vb_base_instance; @@ -1041,13 +1071,13 @@ public: } condition; struct { - u32 tsc_address_high; - u32 tsc_address_low; - u32 tsc_limit; + u32 address_high; + u32 address_low; + u32 limit; - GPUVAddr TSCAddress() const { - return static_cast<GPUVAddr>( - (static_cast<GPUVAddr>(tsc_address_high) << 32) | tsc_address_low); + GPUVAddr Address() const { + return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | + address_low); } } tsc; @@ -1058,13 +1088,13 @@ public: u32 line_smooth_enable; struct { - u32 tic_address_high; - u32 tic_address_low; - u32 tic_limit; + u32 address_high; + u32 address_low; + u32 limit; - GPUVAddr TICAddress() const { - return static_cast<GPUVAddr>( - (static_cast<GPUVAddr>(tic_address_high) << 32) | tic_address_low); + GPUVAddr Address() const { + return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | + address_low); } } tic; @@ -1393,12 +1423,6 @@ public: void FlushMMEInlineDraw(); - /// Given a texture handle, returns the TSC and TIC entries. - Texture::FullTextureInfo GetTextureInfo(Texture::TextureHandle tex_handle) const; - - /// Returns the texture information for a specific texture in a specific shader stage. - Texture::FullTextureInfo GetStageTexture(ShaderType stage, std::size_t offset) const; - u32 AccessConstBuffer32(ShaderType stage, u64 const_buffer, u64 offset) const override; SamplerDescriptor AccessBoundSampler(ShaderType stage, u64 offset) const override; @@ -1461,38 +1485,13 @@ public: private: void InitializeRegisterDefaults(); - Core::System& system; - MemoryManager& memory_manager; - - VideoCore::RasterizerInterface* rasterizer = nullptr; - - /// Start offsets of each macro in macro_memory - std::array<u32, 0x80> macro_positions = {}; - - std::array<bool, Regs::NUM_REGS> mme_inline{}; - - /// Macro method that is currently being executed / being fed parameters. - u32 executing_macro = 0; - /// Parameters that have been submitted to the macro call so far. - std::vector<u32> macro_params; - - /// Interpreter for the macro codes uploaded to the GPU. 
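Aside: the TileMode union added to maxwell_3d.h a few hunks above packs a render target's block-linear parameters into a single register word. A minimal standalone sketch of the same decoding with plain shifts follows; the log2-of-GOBs reading of the block dimensions is the usual Maxwell block-linear convention and is an assumption here, and the sample value is made up.

    #include <cstdint>
    #include <cstdio>

    struct DecodedTileMode {
        uint32_t block_width;  // log2 of GOBs per block in X (assumed)
        uint32_t block_height; // log2 of GOBs per block in Y (assumed)
        uint32_t block_depth;  // log2 of GOBs per block in Z (assumed)
        bool is_pitch_linear;
        bool is_3d;
    };

    constexpr DecodedTileMode DecodeTileMode(uint32_t raw) {
        return {
            (raw >> 0) & 0xF,
            (raw >> 4) & 0xF,
            (raw >> 8) & 0xF,
            ((raw >> 12) & 1) != 0,
            ((raw >> 16) & 1) != 0,
        };
    }

    int main() {
        constexpr DecodedTileMode mode = DecodeTileMode(0x00000040); // made-up value
        std::printf("block_height=%u (%u GOBs)\n", mode.block_height, 1u << mode.block_height);
    }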
- std::unique_ptr<MacroEngine> macro_engine; - - static constexpr u32 null_cb_data = 0xFFFFFFFF; - struct { - std::array<std::array<u32, 0x4000>, 16> buffer; - u32 current{null_cb_data}; - u32 id{null_cb_data}; - u32 start_pos{}; - u32 counter{}; - } cb_data_state; + void ProcessMacro(u32 method, const u32* base_start, u32 amount, bool is_last_call); - Upload::State upload_state; + u32 ProcessShadowRam(u32 method, u32 argument); - bool execute_on{true}; + void ProcessDirtyRegisters(u32 method, u32 argument); - std::array<u8, Regs::NUM_REGS> dirty_pointers{}; + void ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argument, bool is_last_call); /// Retrieves information about a specific TIC entry from the TIC buffer. Texture::TICEntry GetTICEntry(u32 tic_index) const; @@ -1502,8 +1501,8 @@ private: /** * Call a macro on this engine. + * * @param method Method to call - * @param num_parameters Number of arguments * @param parameters Arguments to the method call */ void CallMacroMethod(u32 method, const std::vector<u32>& parameters); @@ -1552,6 +1551,38 @@ private: /// Returns a query's value or an empty object if the value will be deferred through a cache. std::optional<u64> GetQueryResult(); + + Core::System& system; + MemoryManager& memory_manager; + + VideoCore::RasterizerInterface* rasterizer = nullptr; + + /// Start offsets of each macro in macro_memory + std::array<u32, 0x80> macro_positions{}; + + std::array<bool, Regs::NUM_REGS> mme_inline{}; + + /// Macro method that is currently being executed / being fed parameters. + u32 executing_macro = 0; + /// Parameters that have been submitted to the macro call so far. + std::vector<u32> macro_params; + + /// Interpreter for the macro codes uploaded to the GPU. + std::unique_ptr<MacroEngine> macro_engine; + + static constexpr u32 null_cb_data = 0xFFFFFFFF; + struct CBDataState { + std::array<std::array<u32, 0x4000>, 16> buffer; + u32 current{null_cb_data}; + u32 id{null_cb_data}; + u32 start_pos{}; + u32 counter{}; + }; + CBDataState cb_data_state; + + Upload::State upload_state; + + bool execute_on{true}; }; #define ASSERT_REG_POSITION(field_name, position) \ @@ -1564,6 +1595,7 @@ ASSERT_REG_POSITION(shadow_ram_control, 0x49); ASSERT_REG_POSITION(upload, 0x60); ASSERT_REG_POSITION(exec_upload, 0x6C); ASSERT_REG_POSITION(data_upload, 0x6D); +ASSERT_REG_POSITION(force_early_fragment_tests, 0x84); ASSERT_REG_POSITION(sync_info, 0xB2); ASSERT_REG_POSITION(tess_mode, 0xC8); ASSERT_REG_POSITION(tess_level_outer, 0xC9); @@ -1586,10 +1618,13 @@ ASSERT_REG_POSITION(polygon_offset_point_enable, 0x370); ASSERT_REG_POSITION(polygon_offset_line_enable, 0x371); ASSERT_REG_POSITION(polygon_offset_fill_enable, 0x372); ASSERT_REG_POSITION(patch_vertices, 0x373); +ASSERT_REG_POSITION(fragment_barrier, 0x378); ASSERT_REG_POSITION(scissor_test, 0x380); ASSERT_REG_POSITION(stencil_back_func_ref, 0x3D5); ASSERT_REG_POSITION(stencil_back_mask, 0x3D6); ASSERT_REG_POSITION(stencil_back_func_mask, 0x3D7); +ASSERT_REG_POSITION(invalidate_texture_data_cache, 0x3DD); +ASSERT_REG_POSITION(tiled_cache_barrier, 0x3DF); ASSERT_REG_POSITION(color_mask_common, 0x3E4); ASSERT_REG_POSITION(depth_bounds, 0x3E7); ASSERT_REG_POSITION(rt_separate_frag_data, 0x3EB); @@ -1597,6 +1632,7 @@ ASSERT_REG_POSITION(multisample_raster_enable, 0x3ED); ASSERT_REG_POSITION(multisample_raster_samples, 0x3EE); ASSERT_REG_POSITION(multisample_sample_mask, 0x3EF); ASSERT_REG_POSITION(zeta, 0x3F8); +ASSERT_REG_POSITION(render_area, 0x3FD); ASSERT_REG_POSITION(clear_flags, 0x43E); 
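Aside on the ASSERT_REG_POSITION list running through here: the macro body is cut off in this excerpt, so the following expansion is an assumption rather than the verbatim yuzu definition. The idea is to pin each field of Regs to its documented word offset at compile time; since every method register is one 32-bit word, the register index is multiplied by 4 to get a byte offset.

    #include <cstddef> // offsetof

    #define ASSERT_REG_POSITION(field_name, position)                              \
        static_assert(offsetof(Maxwell3D::Regs, field_name) == (position) * 4,     \
                      "Field " #field_name " has invalid position")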
ASSERT_REG_POSITION(fill_rectangle, 0x44F); ASSERT_REG_POSITION(vertex_attrib_format, 0x458); @@ -1605,7 +1641,8 @@ ASSERT_REG_POSITION(multisample_coverage_to_color, 0x47E); ASSERT_REG_POSITION(rt_control, 0x487); ASSERT_REG_POSITION(zeta_width, 0x48a); ASSERT_REG_POSITION(zeta_height, 0x48b); -ASSERT_REG_POSITION(zeta_layers, 0x48c); +ASSERT_REG_POSITION(zeta_depth, 0x48c); +ASSERT_REG_POSITION(sampler_index, 0x48D); ASSERT_REG_POSITION(depth_test_enable, 0x4B3); ASSERT_REG_POSITION(independent_blend_enable, 0x4B9); ASSERT_REG_POSITION(depth_write_enabled, 0x4BA); @@ -1629,6 +1666,8 @@ ASSERT_REG_POSITION(frag_color_clamp, 0x4EA); ASSERT_REG_POSITION(screen_y_control, 0x4EB); ASSERT_REG_POSITION(line_width_smooth, 0x4EC); ASSERT_REG_POSITION(line_width_aliased, 0x4ED); +ASSERT_REG_POSITION(invalidate_sampler_cache_no_wfi, 0x509); +ASSERT_REG_POSITION(invalidate_texture_header_cache_no_wfi, 0x50A); ASSERT_REG_POSITION(vb_element_base, 0x50D); ASSERT_REG_POSITION(vb_base_instance, 0x50E); ASSERT_REG_POSITION(clip_distance_enabled, 0x544); diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index e88290754..ba750748c 100644 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp @@ -16,8 +16,10 @@ namespace Tegra::Engines { using namespace Texture; -MaxwellDMA::MaxwellDMA(Core::System& system, MemoryManager& memory_manager) - : system{system}, memory_manager{memory_manager} {} +MaxwellDMA::MaxwellDMA(Core::System& system_, MemoryManager& memory_manager_) + : system{system_}, memory_manager{memory_manager_} {} + +MaxwellDMA::~MaxwellDMA() = default; void MaxwellDMA::CallMethod(u32 method, u32 method_argument, bool is_last_call) { ASSERT_MSG(method < NUM_REGS, "Invalid MaxwellDMA register"); @@ -94,6 +96,7 @@ void MaxwellDMA::CopyPitchToPitch() { } void MaxwellDMA::CopyBlockLinearToPitch() { + UNIMPLEMENTED_IF(regs.src_params.block_size.width != 0); UNIMPLEMENTED_IF(regs.src_params.block_size.depth != 0); UNIMPLEMENTED_IF(regs.src_params.layer != 0); @@ -114,8 +117,6 @@ void MaxwellDMA::CopyBlockLinearToPitch() { const u32 block_depth = src_params.block_size.depth; const size_t src_size = CalculateSize(true, bytes_per_pixel, width, height, depth, block_height, block_depth); - const size_t src_layer_size = - CalculateSize(true, bytes_per_pixel, width, height, 1, block_height, block_depth); if (read_buffer.size() < src_size) { read_buffer.resize(src_size); @@ -135,6 +136,8 @@ void MaxwellDMA::CopyBlockLinearToPitch() { } void MaxwellDMA::CopyPitchToBlockLinear() { + UNIMPLEMENTED_IF_MSG(regs.dst_params.block_size.width != 0, "Block width is not one"); + const auto& dst_params = regs.dst_params; const u32 bytes_per_pixel = regs.pitch_in / regs.line_length_in; const u32 width = dst_params.width; diff --git a/src/video_core/engines/maxwell_dma.h b/src/video_core/engines/maxwell_dma.h index 50f445efc..3c59eeb13 100644 --- a/src/video_core/engines/maxwell_dma.h +++ b/src/video_core/engines/maxwell_dma.h @@ -72,11 +72,13 @@ public: struct RenderEnable { enum class Mode : u32 { - FALSE = 0, - TRUE = 1, - CONDITIONAL = 2, - RENDER_IF_EQUAL = 3, - RENDER_IF_NOT_EQUAL = 4, + // Note: This uses Pascal case in order to avoid the identifiers + // FALSE and TRUE, which are reserved on Darwin. 
+ False = 0, + True = 1, + Conditional = 2, + RenderIfEqual = 3, + RenderIfNotEqual = 4, }; PackedGPUVAddr address; @@ -185,8 +187,8 @@ public: }; static_assert(sizeof(RemapConst) == 12); - explicit MaxwellDMA(Core::System& system, MemoryManager& memory_manager); - ~MaxwellDMA() = default; + explicit MaxwellDMA(Core::System& system_, MemoryManager& memory_manager_); + ~MaxwellDMA(); /// Write the value to the register identified by method. void CallMethod(u32 method, u32 method_argument, bool is_last_call) override; diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h index d374b73cf..8b45f1b62 100644 --- a/src/video_core/engines/shader_bytecode.h +++ b/src/video_core/engines/shader_bytecode.h @@ -32,31 +32,31 @@ struct Register { constexpr Register() = default; - constexpr Register(u64 value) : value(value) {} + constexpr Register(u64 value_) : value(value_) {} - constexpr operator u64() const { + [[nodiscard]] constexpr operator u64() const { return value; } template <typename T> - constexpr u64 operator-(const T& oth) const { + [[nodiscard]] constexpr u64 operator-(const T& oth) const { return value - oth; } template <typename T> - constexpr u64 operator&(const T& oth) const { + [[nodiscard]] constexpr u64 operator&(const T& oth) const { return value & oth; } - constexpr u64 operator&(const Register& oth) const { + [[nodiscard]] constexpr u64 operator&(const Register& oth) const { return value & oth.value; } - constexpr u64 operator~() const { + [[nodiscard]] constexpr u64 operator~() const { return ~value; } - u64 GetSwizzledIndex(u64 elem) const { + [[nodiscard]] u64 GetSwizzledIndex(u64 elem) const { elem = (value + elem) & 3; return (value & ~3) + elem; } @@ -75,7 +75,7 @@ enum class AttributeSize : u64 { union Attribute { Attribute() = default; - constexpr explicit Attribute(u64 value) : value(value) {} + constexpr explicit Attribute(u64 value_) : value(value_) {} enum class Index : u64 { LayerViewportPointSize = 6, @@ -107,7 +107,7 @@ union Attribute { BitField<31, 1, u64> patch; BitField<47, 3, AttributeSize> size; - bool IsPhysical() const { + [[nodiscard]] bool IsPhysical() const { return patch == 0 && element == 0 && static_cast<u64>(index.Value()) == 0; } } fmt20; @@ -124,7 +124,7 @@ union Attribute { union Sampler { Sampler() = default; - constexpr explicit Sampler(u64 value) : value(value) {} + constexpr explicit Sampler(u64 value_) : value(value_) {} enum class Index : u64 { Sampler_0 = 8, @@ -137,7 +137,7 @@ union Sampler { union Image { Image() = default; - constexpr explicit Image(u64 value) : value{value} {} + constexpr explicit Image(u64 value_) : value{value_} {} BitField<36, 13, u64> index; u64 value; @@ -505,14 +505,14 @@ struct IpaMode { IpaInterpMode interpolation_mode; IpaSampleMode sampling_mode; - bool operator==(const IpaMode& a) const { + [[nodiscard]] bool operator==(const IpaMode& a) const { return std::tie(interpolation_mode, sampling_mode) == std::tie(a.interpolation_mode, a.sampling_mode); } - bool operator!=(const IpaMode& a) const { + [[nodiscard]] bool operator!=(const IpaMode& a) const { return !operator==(a); } - bool operator<(const IpaMode& a) const { + [[nodiscard]] bool operator<(const IpaMode& a) const { return std::tie(interpolation_mode, sampling_mode) < std::tie(a.interpolation_mode, a.sampling_mode); } @@ -658,10 +658,10 @@ union Instruction { return *this; } - constexpr Instruction(u64 value) : value{value} {} + constexpr Instruction(u64 value_) : value{value_} {} constexpr Instruction(const 
Instruction& instr) : value(instr.value) {} - constexpr bool Bit(u64 offset) const { + [[nodiscard]] constexpr bool Bit(u64 offset) const { return ((value >> offset) & 1) != 0; } @@ -746,34 +746,34 @@ union Instruction { BitField<28, 8, u64> imm_lut28; BitField<48, 8, u64> imm_lut48; - u32 GetImmLut28() const { + [[nodiscard]] u32 GetImmLut28() const { return static_cast<u32>(imm_lut28); } - u32 GetImmLut48() const { + [[nodiscard]] u32 GetImmLut48() const { return static_cast<u32>(imm_lut48); } } lop3; - u16 GetImm20_16() const { + [[nodiscard]] u16 GetImm20_16() const { return static_cast<u16>(imm20_16); } - u32 GetImm20_19() const { + [[nodiscard]] u32 GetImm20_19() const { u32 imm{static_cast<u32>(imm20_19)}; imm <<= 12; imm |= negate_imm ? 0x80000000 : 0; return imm; } - u32 GetImm20_32() const { + [[nodiscard]] u32 GetImm20_32() const { return static_cast<u32>(imm20_32); } - s32 GetSignedImm20_20() const { - u32 immediate = static_cast<u32>(imm20_19 | (negate_imm << 19)); + [[nodiscard]] s32 GetSignedImm20_20() const { + const auto immediate = static_cast<u32>(imm20_19 | (negate_imm << 19)); // Sign extend the 20-bit value. - u32 mask = 1U << (20 - 1); + const auto mask = 1U << (20 - 1); return static_cast<s32>((immediate ^ mask) - mask); } } alu; @@ -857,7 +857,7 @@ union Instruction { BitField<56, 1, u64> second_negate; BitField<30, 9, u64> second; - u32 PackImmediates() const { + [[nodiscard]] u32 PackImmediates() const { // Immediates are half floats shifted. constexpr u32 imm_shift = 6; return static_cast<u32>((first << imm_shift) | (second << (16 + imm_shift))); @@ -1033,7 +1033,7 @@ union Instruction { BitField<28, 2, AtomicType> type; BitField<30, 22, s64> offset; - s32 GetImmediateOffset() const { + [[nodiscard]] s32 GetImmediateOffset() const { return static_cast<s32>(offset << 2); } } atoms; @@ -1215,7 +1215,7 @@ union Instruction { BitField<39, 4, u64> rounding; // H0, H1 extract for F16 missing BitField<41, 1, u64> selector; // Guessed as some games set it, TODO: reverse this value - F2fRoundingOp GetRoundingMode() const { + [[nodiscard]] F2fRoundingOp GetRoundingMode() const { constexpr u64 rounding_mask = 0x0B; return static_cast<F2fRoundingOp>(rounding.Value() & rounding_mask); } @@ -1239,15 +1239,15 @@ union Instruction { BitField<54, 1, u64> aoffi_flag; BitField<55, 3, TextureProcessMode> process_mode; - bool IsComponentEnabled(std::size_t component) const { - return ((1ull << component) & component_mask) != 0; + [[nodiscard]] bool IsComponentEnabled(std::size_t component) const { + return ((1ULL << component) & component_mask) != 0; } - TextureProcessMode GetTextureProcessMode() const { + [[nodiscard]] TextureProcessMode GetTextureProcessMode() const { return process_mode; } - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::DC: return dc_flag != 0; @@ -1271,15 +1271,15 @@ union Instruction { BitField<36, 1, u64> aoffi_flag; BitField<37, 3, TextureProcessMode> process_mode; - bool IsComponentEnabled(std::size_t component) const { + [[nodiscard]] bool IsComponentEnabled(std::size_t component) const { return ((1ULL << component) & component_mask) != 0; } - TextureProcessMode GetTextureProcessMode() const { + [[nodiscard]] TextureProcessMode GetTextureProcessMode() const { return process_mode; } - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::DC: return dc_flag != 0; 
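Aside: GetSignedImm20_20 in the alu block above relies on the classic (x ^ mask) - mask sign-extension idiom. A self-contained illustration for 20-bit immediates (standalone code, not part of the commit):

    #include <cstdint>

    constexpr int32_t SignExtend20(uint32_t immediate) {
        constexpr uint32_t mask = 1U << (20 - 1); // sign bit of a 20-bit value
        // XOR flips the sign bit into place, the subtraction propagates it upward.
        return static_cast<int32_t>((immediate ^ mask) - mask);
    }

    static_assert(SignExtend20(0x000FFFFF) == -1);
    static_assert(SignExtend20(0x0007FFFF) == 524287);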
@@ -1299,7 +1299,7 @@ union Instruction { BitField<31, 4, u64> component_mask; BitField<49, 1, u64> nodep_flag; - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::NODEP: return nodep_flag != 0; @@ -1309,7 +1309,7 @@ union Instruction { return false; } - bool IsComponentEnabled(std::size_t component) const { + [[nodiscard]] bool IsComponentEnabled(std::size_t component) const { return ((1ULL << component) & component_mask) != 0; } } txq; @@ -1321,11 +1321,11 @@ union Instruction { BitField<35, 1, u64> ndv_flag; BitField<49, 1, u64> nodep_flag; - bool IsComponentEnabled(std::size_t component) const { - return ((1ull << component) & component_mask) != 0; + [[nodiscard]] bool IsComponentEnabled(std::size_t component) const { + return ((1ULL << component) & component_mask) != 0; } - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::NDV: return (ndv_flag != 0); @@ -1347,7 +1347,7 @@ union Instruction { BitField<54, 2, u64> offset_mode; BitField<56, 2, u64> component; - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::NDV: return ndv_flag != 0; @@ -1373,7 +1373,7 @@ union Instruction { BitField<33, 2, u64> offset_mode; BitField<37, 2, u64> component; - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::NDV: return ndv_flag != 0; @@ -1399,7 +1399,7 @@ union Instruction { BitField<52, 2, u64> component; BitField<55, 1, u64> fp16_flag; - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::DC: return dc_flag != 0; @@ -1422,24 +1422,27 @@ union Instruction { BitField<53, 4, u64> texture_info; BitField<59, 1, u64> fp32_flag; - TextureType GetTextureType() const { + [[nodiscard]] TextureType GetTextureType() const { // The TEXS instruction has a weird encoding for the texture type. - if (texture_info == 0) + if (texture_info == 0) { return TextureType::Texture1D; - if (texture_info >= 1 && texture_info <= 9) + } + if (texture_info >= 1 && texture_info <= 9) { return TextureType::Texture2D; - if (texture_info >= 10 && texture_info <= 11) + } + if (texture_info >= 10 && texture_info <= 11) { return TextureType::Texture3D; - if (texture_info >= 12 && texture_info <= 13) + } + if (texture_info >= 12 && texture_info <= 13) { return TextureType::TextureCube; + } - LOG_CRITICAL(HW_GPU, "Unhandled texture_info: {}", - static_cast<u32>(texture_info.Value())); + LOG_CRITICAL(HW_GPU, "Unhandled texture_info: {}", texture_info.Value()); UNREACHABLE(); return TextureType::Texture1D; } - TextureProcessMode GetTextureProcessMode() const { + [[nodiscard]] TextureProcessMode GetTextureProcessMode() const { switch (texture_info) { case 0: case 2: @@ -1458,7 +1461,7 @@ union Instruction { return TextureProcessMode::None; } - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::DC: return (texture_info >= 4 && texture_info <= 6) || texture_info == 9; @@ -1470,16 +1473,16 @@ union Instruction { return false; } - bool IsArrayTexture() const { + [[nodiscard]] bool IsArrayTexture() const { // TEXS only supports Texture2D arrays. 
return texture_info >= 7 && texture_info <= 9; } - bool HasTwoDestinations() const { + [[nodiscard]] bool HasTwoDestinations() const { return gpr28.Value() != Register::ZeroIndex; } - bool IsComponentEnabled(std::size_t component) const { + [[nodiscard]] bool IsComponentEnabled(std::size_t component) const { static constexpr std::array<std::array<u32, 8>, 4> mask_lut{{ {}, {0x1, 0x2, 0x4, 0x8, 0x3, 0x9, 0xa, 0xc}, @@ -1506,7 +1509,7 @@ union Instruction { BitField<54, 1, u64> cl; BitField<55, 1, u64> process_mode; - TextureProcessMode GetTextureProcessMode() const { + [[nodiscard]] TextureProcessMode GetTextureProcessMode() const { return process_mode == 0 ? TextureProcessMode::LZ : TextureProcessMode::LL; } } tld; @@ -1516,7 +1519,7 @@ union Instruction { BitField<53, 4, u64> texture_info; BitField<59, 1, u64> fp32_flag; - TextureType GetTextureType() const { + [[nodiscard]] TextureType GetTextureType() const { // The TLDS instruction has a weird encoding for the texture type. if (texture_info <= 1) { return TextureType::Texture1D; @@ -1529,19 +1532,19 @@ union Instruction { return TextureType::Texture3D; } - LOG_CRITICAL(HW_GPU, "Unhandled texture_info: {}", - static_cast<u32>(texture_info.Value())); + LOG_CRITICAL(HW_GPU, "Unhandled texture_info: {}", texture_info.Value()); UNREACHABLE(); return TextureType::Texture1D; } - TextureProcessMode GetTextureProcessMode() const { - if (texture_info == 1 || texture_info == 5 || texture_info == 12) + [[nodiscard]] TextureProcessMode GetTextureProcessMode() const { + if (texture_info == 1 || texture_info == 5 || texture_info == 12) { return TextureProcessMode::LL; + } return TextureProcessMode::LZ; } - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::AOFFI: return texture_info == 12 || texture_info == 4; @@ -1555,7 +1558,7 @@ union Instruction { return false; } - bool IsArrayTexture() const { + [[nodiscard]] bool IsArrayTexture() const { // TEXS only supports Texture2D arrays. 
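// Aside (standalone sketch, not commit code): the texture opcodes above all
// share one write-mask convention: component i of the result is written iff
// bit i of component_mask is set. TEXS merely looks its mask up in mask_lut
// first; the test itself is always the same bit probe:
//
//     #include <cstddef>
//     #include <cstdint>
//
//     constexpr bool IsComponentEnabled(uint64_t component_mask, std::size_t component) {
//         return ((uint64_t{1} << component) & component_mask) != 0;
//     }
//
//     // A mask of 0b0101 writes R and B only; 0b1111 writes all four channels.
//     static_assert(IsComponentEnabled(0b0101, 0) && !IsComponentEnabled(0b0101, 1));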
return texture_info == 8; } @@ -1567,7 +1570,7 @@ union Instruction { BitField<35, 1, u64> aoffi_flag; BitField<49, 1, u64> nodep_flag; - bool UsesMiscMode(TextureMiscMode mode) const { + [[nodiscard]] bool UsesMiscMode(TextureMiscMode mode) const { switch (mode) { case TextureMiscMode::AOFFI: return aoffi_flag != 0; @@ -1591,7 +1594,7 @@ union Instruction { BitField<20, 3, StoreType> store_data_layout; BitField<20, 4, u64> component_mask_selector; - bool IsComponentEnabled(std::size_t component) const { + [[nodiscard]] bool IsComponentEnabled(std::size_t component) const { ASSERT(mode == SurfaceDataMode::P); constexpr u8 R = 0b0001; constexpr u8 G = 0b0010; @@ -1604,7 +1607,7 @@ union Instruction { return std::bitset<4>{mask.at(component_mask_selector)}.test(component); } - StoreType GetStoreDataLayout() const { + [[nodiscard]] StoreType GetStoreDataLayout() const { ASSERT(mode == SurfaceDataMode::D_BA); return store_data_layout; } @@ -1622,14 +1625,15 @@ union Instruction { BitField<20, 24, u64> target; BitField<5, 1, u64> constant_buffer; - s32 GetBranchTarget() const { + [[nodiscard]] s32 GetBranchTarget() const { // Sign extend the branch target offset - u32 mask = 1U << (24 - 1); - u32 value = static_cast<u32>(target); + const auto mask = 1U << (24 - 1); + const auto target_value = static_cast<u32>(target); + constexpr auto instruction_size = static_cast<s32>(sizeof(Instruction)); + // The branch offset is relative to the next instruction and is stored in bytes, so // divide it by the size of an instruction and add 1 to it. - return static_cast<s32>((value ^ mask) - mask) / static_cast<s32>(sizeof(Instruction)) + - 1; + return static_cast<s32>((target_value ^ mask) - mask) / instruction_size + 1; } } bra; @@ -1637,14 +1641,15 @@ union Instruction { BitField<20, 24, u64> target; BitField<5, 1, u64> constant_buffer; - s32 GetBranchExtend() const { + [[nodiscard]] s32 GetBranchExtend() const { // Sign extend the branch target offset - u32 mask = 1U << (24 - 1); - u32 value = static_cast<u32>(target); + const auto mask = 1U << (24 - 1); + const auto target_value = static_cast<u32>(target); + constexpr auto instruction_size = static_cast<s32>(sizeof(Instruction)); + // The branch offset is relative to the next instruction and is stored in bytes, so // divide it by the size of an instruction and add 1 to it. 
- return static_cast<s32>((value ^ mask) - mask) / static_cast<s32>(sizeof(Instruction)) + - 1; + return static_cast<s32>((target_value ^ mask) - mask) / instruction_size + 1; } } brx; @@ -1697,7 +1702,7 @@ union Instruction { BitField<50, 1, u64> is_op_b_register; BitField<51, 3, VmnmxOperation> operation; - VmnmxType SourceFormatA() const { + [[nodiscard]] VmnmxType SourceFormatA() const { switch (src_format_a) { case 0b11: return VmnmxType::Bits32; @@ -1708,7 +1713,7 @@ union Instruction { } } - VmnmxType SourceFormatB() const { + [[nodiscard]] VmnmxType SourceFormatB() const { switch (src_format_b) { case 0b11: return VmnmxType::Bits32; @@ -1739,7 +1744,7 @@ union Instruction { BitField<20, 14, u64> shifted_offset; BitField<34, 5, u64> index; - u64 GetOffset() const { + [[nodiscard]] u64 GetOffset() const { return shifted_offset * 4; } } cbuf34; @@ -1748,7 +1753,7 @@ union Instruction { BitField<20, 16, s64> offset; BitField<36, 5, u64> index; - s64 GetOffset() const { + [[nodiscard]] s64 GetOffset() const { return offset; } } cbuf36; @@ -1893,6 +1898,7 @@ public: ICMP_IMM, FCMP_RR, FCMP_RC, + FCMP_IMMR, MUFU, // Multi-Function Operator RRO_C, // Range Reduction Operator RRO_R, @@ -1996,29 +2002,29 @@ public: /// Returns whether an opcode has an execution predicate field or not (ie, whether it can be /// conditionally executed). - static bool IsPredicatedInstruction(Id opcode) { + [[nodiscard]] static bool IsPredicatedInstruction(Id opcode) { // TODO(Subv): Add the rest of unpredicated instructions. return opcode != Id::SSY && opcode != Id::PBK; } class Matcher { public: - constexpr Matcher(const char* const name, u16 mask, u16 expected, Id id, Type type) - : name{name}, mask{mask}, expected{expected}, id{id}, type{type} {} + constexpr Matcher(const char* const name_, u16 mask_, u16 expected_, Id id_, Type type_) + : name{name_}, mask{mask_}, expected{expected_}, id{id_}, type{type_} {} - constexpr const char* GetName() const { + [[nodiscard]] constexpr const char* GetName() const { return name; } - constexpr u16 GetMask() const { + [[nodiscard]] constexpr u16 GetMask() const { return mask; } - constexpr Id GetId() const { + [[nodiscard]] constexpr Id GetId() const { return id; } - constexpr Type GetType() const { + [[nodiscard]] constexpr Type GetType() const { return type; } @@ -2027,7 +2033,7 @@ public: * @param instruction The instruction to test * @returns true if the given instruction matches. */ - constexpr bool Matches(u16 instruction) const { + [[nodiscard]] constexpr bool Matches(u16 instruction) const { return (instruction & mask) == expected; } @@ -2039,7 +2045,8 @@ public: Type type; }; - static std::optional<std::reference_wrapper<const Matcher>> Decode(Instruction instr) { + using DecodeResult = std::optional<std::reference_wrapper<const Matcher>>; + [[nodiscard]] static DecodeResult Decode(Instruction instr) { static const auto table{GetDecodeTable()}; const auto matches_instruction = [instr](const auto& matcher) { @@ -2061,7 +2068,7 @@ private: * A '0' in a bitstring indicates that a zero must be present at that bit position. * A '1' in a bitstring indicates that a one must be present at that bit position. 
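Aside: the bitstring convention documented above, and implemented by GetMaskAndExpect just below, compiles a pattern such as "111000110011----" into a (mask, expected) pair for 16-bit opcode matching. A hypothetical standalone sketch of that compilation and the match test:

    #include <cstddef>
    #include <cstdint>
    #include <utility>

    constexpr std::pair<uint16_t, uint16_t> MaskAndExpect(const char* bitstring) {
        uint16_t mask = 0;
        uint16_t expected = 0;
        for (std::size_t i = 0; i < 16; ++i) {
            const auto bit = static_cast<uint16_t>(1U << (15 - i));
            if (bitstring[i] == '0') {
                mask |= bit; // this bit must be 0
            } else if (bitstring[i] == '1') {
                mask |= bit; // this bit must be 1
                expected |= bit;
            } // '-' leaves the bit unconstrained
        }
        return {mask, expected};
    }

    constexpr bool Matches(uint16_t instruction, uint16_t mask, uint16_t expected) {
        return (instruction & mask) == expected;
    }

    // The KIL pattern from the decode table:
    static_assert(Matches(0b1110001100110000, MaskAndExpect("111000110011----").first,
                          MaskAndExpect("111000110011----").second));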
*/ - static constexpr auto GetMaskAndExpect(const char* const bitstring) { + [[nodiscard]] static constexpr auto GetMaskAndExpect(const char* const bitstring) { u16 mask = 0, expect = 0; for (std::size_t i = 0; i < opcode_bitsize; i++) { const std::size_t bit_position = opcode_bitsize - i - 1; @@ -2083,14 +2090,14 @@ private: public: /// Creates a matcher that can match and parse instructions based on bitstring. - static constexpr auto GetMatcher(const char* const bitstring, Id op, Type type, - const char* const name) { + [[nodiscard]] static constexpr auto GetMatcher(const char* const bitstring, Id op, + Type type, const char* const name) { const auto [mask, expected] = GetMaskAndExpect(bitstring); return Matcher(name, mask, expected, op, type); } }; - static std::vector<Matcher> GetDecodeTable() { + [[nodiscard]] static std::vector<Matcher> GetDecodeTable() { std::vector<Matcher> table = { #define INST(bitstring, op, type, name) Detail::GetMatcher(bitstring, op, type, name) INST("111000110011----", Id::KIL, Type::Flow, "KIL"), @@ -2205,6 +2212,7 @@ private: INST("0111110-0-------", Id::HSET2_IMM, Type::HalfSet, "HSET2_IMM"), INST("010110111010----", Id::FCMP_RR, Type::Arithmetic, "FCMP_RR"), INST("010010111010----", Id::FCMP_RC, Type::Arithmetic, "FCMP_RC"), + INST("0011011-1010----", Id::FCMP_IMMR, Type::Arithmetic, "FCMP_IMMR"), INST("0101000010000---", Id::MUFU, Type::Arithmetic, "MUFU"), INST("0100110010010---", Id::RRO_C, Type::Arithmetic, "RRO_C"), INST("0101110010010---", Id::RRO_R, Type::Arithmetic, "RRO_R"), diff --git a/src/video_core/engines/shader_header.h b/src/video_core/engines/shader_header.h index 72e2a33d5..ceec05459 100644 --- a/src/video_core/engines/shader_header.h +++ b/src/video_core/engines/shader_header.h @@ -41,30 +41,30 @@ struct Header { BitField<26, 1, u32> does_load_or_store; BitField<27, 1, u32> does_fp64; BitField<28, 4, u32> stream_out_mask; - } common0{}; + } common0; union { BitField<0, 24, u32> shader_local_memory_low_size; BitField<24, 8, u32> per_patch_attribute_count; - } common1{}; + } common1; union { BitField<0, 24, u32> shader_local_memory_high_size; BitField<24, 8, u32> threads_per_input_primitive; - } common2{}; + } common2; union { BitField<0, 24, u32> shader_local_memory_crs_size; BitField<24, 4, OutputTopology> output_topology; BitField<28, 4, u32> reserved; - } common3{}; + } common3; union { BitField<0, 12, u32> max_output_vertices; BitField<12, 8, u32> store_req_start; // NOTE: not used by geometry shaders. BitField<20, 4, u32> reserved; BitField<24, 8, u32> store_req_end; // NOTE: not used by geometry shaders. 
-    } common4{};
+    } common4;
 
     union {
         struct {
@@ -145,7 +145,7 @@ struct Header {
             }
         } ps;
 
-        std::array<u32, 0xF> raw{};
+        std::array<u32, 0xF> raw;
     };
 
     u64 GetLocalMemorySize() const {
@@ -153,7 +153,6 @@ struct Header {
                                (common2.shader_local_memory_high_size << 24));
     }
 };
-
 static_assert(sizeof(Header) == 0x50, "Incorrect structure size");
 
 } // namespace Tegra::Shader
diff --git a/src/video_core/fence_manager.h b/src/video_core/fence_manager.h
index de6991ef6..3512283ff 100644
--- a/src/video_core/fence_manager.h
+++ b/src/video_core/fence_manager.h
@@ -9,6 +9,7 @@
 #include "common/common_types.h"
 #include "core/core.h"
+#include "video_core/delayed_destruction_ring.h"
 #include "video_core/gpu.h"
 #include "video_core/memory_manager.h"
 #include "video_core/rasterizer_interface.h"
@@ -17,11 +18,11 @@ namespace VideoCommon {
 
 class FenceBase {
 public:
-    FenceBase(u32 payload, bool is_stubbed)
-        : address{}, payload{payload}, is_semaphore{false}, is_stubbed{is_stubbed} {}
+    explicit FenceBase(u32 payload_, bool is_stubbed_)
+        : address{}, payload{payload_}, is_semaphore{false}, is_stubbed{is_stubbed_} {}
 
-    FenceBase(GPUVAddr address, u32 payload, bool is_stubbed)
-        : address{address}, payload{payload}, is_semaphore{true}, is_stubbed{is_stubbed} {}
+    explicit FenceBase(GPUVAddr address_, u32 payload_, bool is_stubbed_)
+        : address{address_}, payload{payload_}, is_semaphore{true}, is_stubbed{is_stubbed_} {}
 
     GPUVAddr GetAddress() const {
         return address;
@@ -47,6 +48,11 @@ protected:
 template <typename TFence, typename TTextureCache, typename TTBufferCache, typename TQueryCache>
 class FenceManager {
 public:
+    /// Notify the fence manager about a new frame
+    void TickFrame() {
+        delayed_destruction_ring.Tick();
+    }
+
     void SignalSemaphore(GPUVAddr addr, u32 value) {
         TryReleasePendingFences();
         const bool should_flush = ShouldFlush();
@@ -86,7 +92,7 @@ public:
             } else {
                 gpu.IncrementSyncPoint(current_fence->GetPayload());
             }
-            fences.pop();
+            PopFence();
         }
     }
 
@@ -132,7 +138,7 @@ private:
             } else {
                 gpu.IncrementSyncPoint(current_fence->GetPayload());
             }
-            fences.pop();
+            PopFence();
         }
     }
 
@@ -158,7 +164,14 @@ private:
         query_cache.CommitAsyncFlushes();
     }
 
+    void PopFence() {
+        delayed_destruction_ring.Push(std::move(fences.front()));
+        fences.pop();
+    }
+
     std::queue<TFence> fences;
+
+    DelayedDestructionRing<TFence, 6> delayed_destruction_ring;
 };
 
 } // namespace VideoCommon
diff --git a/src/video_core/framebuffer_config.h b/src/video_core/framebuffer_config.h
new file mode 100644
index 000000000..b86c3a757
--- /dev/null
+++ b/src/video_core/framebuffer_config.h
@@ -0,0 +1,31 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
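Aside on the fence_manager.h hunk above: delayed_destruction_ring.h itself is not part of this excerpt, so the following is only a sketch of the idea behind DelayedDestructionRing<TFence, 6>. Fences popped from the queue are parked for several TickFrame() calls before they are actually destroyed, so the GPU can no longer be referencing them when the destructor runs.

    #include <array>
    #include <cstddef>
    #include <utility>
    #include <vector>

    template <typename T, std::size_t N>
    class DelayedDestructionRing {
    public:
        // Advance one frame; everything pushed N frames ago is destroyed here.
        void Tick() {
            index = (index + 1) % N;
            elements[index].clear();
        }

        void Push(T&& object) {
            elements[index].push_back(std::move(object));
        }

    private:
        std::size_t index = 0;
        std::array<std::vector<T>, N> elements{};
    };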
+
+#pragma once
+
+namespace Tegra {
+
+/**
+ * Struct describing framebuffer configuration
+ */
+struct FramebufferConfig {
+    enum class PixelFormat : u32 {
+        A8B8G8R8_UNORM = 1,
+        RGB565_UNORM = 4,
+        B8G8R8A8_UNORM = 5,
+    };
+
+    VAddr address{};
+    u32 offset{};
+    u32 width{};
+    u32 height{};
+    u32 stride{};
+    PixelFormat pixel_format{};
+
+    using TransformFlags = Service::NVFlinger::BufferQueue::BufferTransformFlags;
+    TransformFlags transform_flags{};
+    Common::Rectangle<int> crop_rect;
+};
+
+} // namespace Tegra
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index 4bb9256e9..6ab06775f 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -10,6 +10,7 @@
 #include "core/core_timing.h"
 #include "core/core_timing_util.h"
 #include "core/frontend/emu_window.h"
+#include "core/hardware_interrupt_manager.h"
 #include "core/memory.h"
 #include "core/settings.h"
 #include "video_core/engines/fermi_2d.h"
@@ -27,15 +28,17 @@ namespace Tegra {
 
 MICROPROFILE_DEFINE(GPU_wait, "GPU", "Wait for the GPU", MP_RGB(128, 128, 192));
 
-GPU::GPU(Core::System& system_, bool is_async_)
+GPU::GPU(Core::System& system_, bool is_async_, bool use_nvdec_)
     : system{system_}, memory_manager{std::make_unique<Tegra::MemoryManager>(system)},
       dma_pusher{std::make_unique<Tegra::DmaPusher>(system, *this)},
+      cdma_pusher{std::make_unique<Tegra::CDmaPusher>(*this)}, use_nvdec{use_nvdec_},
       maxwell_3d{std::make_unique<Engines::Maxwell3D>(system, *memory_manager)},
       fermi_2d{std::make_unique<Engines::Fermi2D>()},
       kepler_compute{std::make_unique<Engines::KeplerCompute>(system, *memory_manager)},
       maxwell_dma{std::make_unique<Engines::MaxwellDMA>(system, *memory_manager)},
       kepler_memory{std::make_unique<Engines::KeplerMemory>(system, *memory_manager)},
-      shader_notify{std::make_unique<VideoCore::ShaderNotify>()}, is_async{is_async_} {}
+      shader_notify{std::make_unique<VideoCore::ShaderNotify>()}, is_async{is_async_},
+      gpu_thread{system_, is_async_} {}
 
 GPU::~GPU() = default;
 
@@ -77,31 +80,46 @@ DmaPusher& GPU::DmaPusher() {
     return *dma_pusher;
 }
 
+Tegra::CDmaPusher& GPU::CDmaPusher() {
+    return *cdma_pusher;
+}
+
 const DmaPusher& GPU::DmaPusher() const {
     return *dma_pusher;
 }
 
+const Tegra::CDmaPusher& GPU::CDmaPusher() const {
+    return *cdma_pusher;
+}
+
 void GPU::WaitFence(u32 syncpoint_id, u32 value) {
     // Synced GPU, is always in sync
     if (!is_async) {
         return;
     }
+    if (syncpoint_id == UINT32_MAX) {
+        // TODO: Research what this does.
+ LOG_ERROR(HW_GPU, "Waiting for syncpoint -1 not implemented"); + return; + } MICROPROFILE_SCOPE(GPU_wait); std::unique_lock lock{sync_mutex}; - sync_cv.wait(lock, [=, this] { return syncpoints[syncpoint_id].load() >= value; }); + sync_cv.wait(lock, [=, this] { return syncpoints.at(syncpoint_id).load() >= value; }); } void GPU::IncrementSyncPoint(const u32 syncpoint_id) { - syncpoints[syncpoint_id]++; + auto& syncpoint = syncpoints.at(syncpoint_id); + syncpoint++; std::lock_guard lock{sync_mutex}; sync_cv.notify_all(); - if (!syncpt_interrupts[syncpoint_id].empty()) { - u32 value = syncpoints[syncpoint_id].load(); - auto it = syncpt_interrupts[syncpoint_id].begin(); - while (it != syncpt_interrupts[syncpoint_id].end()) { + auto& interrupt = syncpt_interrupts.at(syncpoint_id); + if (!interrupt.empty()) { + u32 value = syncpoint.load(); + auto it = interrupt.begin(); + while (it != interrupt.end()) { if (value >= *it) { TriggerCpuInterrupt(syncpoint_id, *it); - it = syncpt_interrupts[syncpoint_id].erase(it); + it = interrupt.erase(it); continue; } it++; @@ -110,22 +128,22 @@ void GPU::IncrementSyncPoint(const u32 syncpoint_id) { } u32 GPU::GetSyncpointValue(const u32 syncpoint_id) const { - return syncpoints[syncpoint_id].load(); + return syncpoints.at(syncpoint_id).load(); } void GPU::RegisterSyncptInterrupt(const u32 syncpoint_id, const u32 value) { - auto& interrupt = syncpt_interrupts[syncpoint_id]; + auto& interrupt = syncpt_interrupts.at(syncpoint_id); bool contains = std::any_of(interrupt.begin(), interrupt.end(), [value](u32 in_value) { return in_value == value; }); if (contains) { return; } - syncpt_interrupts[syncpoint_id].emplace_back(value); + interrupt.emplace_back(value); } bool GPU::CancelSyncptInterrupt(const u32 syncpoint_id, const u32 value) { std::lock_guard lock{sync_mutex}; - auto& interrupt = syncpt_interrupts[syncpoint_id]; + auto& interrupt = syncpt_interrupts.at(syncpoint_id); const auto iter = std::find_if(interrupt.begin(), interrupt.end(), [value](u32 interrupt_value) { return value == interrupt_value; }); @@ -182,34 +200,6 @@ void GPU::SyncGuestHost() { renderer->Rasterizer().SyncGuestHost(); } -void GPU::OnCommandListEnd() { - renderer->Rasterizer().ReleaseFences(); -} -// Note that, traditionally, methods are treated as 4-byte addressable locations, and hence -// their numbers are written down multiplied by 4 in Docs. Here we are not multiply by 4. -// So the values you see in docs might be multiplied by 4. 
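Aside: WaitFence and IncrementSyncPoint above implement a classic monitor around an atomic counter, with the same increment-then-notify ordering. A reduced standalone model of that handshake:

    #include <atomic>
    #include <condition_variable>
    #include <cstdint>
    #include <mutex>

    struct Syncpoint {
        std::atomic<uint32_t> counter{0};
        std::mutex mutex;
        std::condition_variable cv;

        void Wait(uint32_t value) {
            std::unique_lock lock{mutex};
            cv.wait(lock, [&] { return counter.load() >= value; });
        }

        void Increment() {
            ++counter;
            std::lock_guard lock{mutex}; // pairs with the waiter's lock
            cv.notify_all();
        }
    };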
-enum class BufferMethods { - BindObject = 0x0, - Nop = 0x2, - SemaphoreAddressHigh = 0x4, - SemaphoreAddressLow = 0x5, - SemaphoreSequence = 0x6, - SemaphoreTrigger = 0x7, - NotifyIntr = 0x8, - WrcacheFlush = 0x9, - Unk28 = 0xA, - UnkCacheFlush = 0xB, - RefCnt = 0x14, - SemaphoreAcquire = 0x1A, - SemaphoreRelease = 0x1B, - FenceValue = 0x1C, - FenceAction = 0x1D, - Unk78 = 0x1E, - Unk7c = 0x1F, - Yield = 0x20, - NonPullerMethods = 0x40, -}; - enum class GpuSemaphoreOperation { AcquireEqual = 0x1, WriteLong = 0x2, @@ -240,8 +230,12 @@ void GPU::CallMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 CallEngineMultiMethod(method, subchannel, base_start, amount, methods_pending); } else { for (std::size_t i = 0; i < amount; i++) { - CallPullerMethod( - {method, base_start[i], subchannel, methods_pending - static_cast<u32>(i)}); + CallPullerMethod(MethodCall{ + method, + base_start[i], + subchannel, + methods_pending - static_cast<u32>(i), + }); } } } @@ -268,7 +262,12 @@ void GPU::CallPullerMethod(const MethodCall& method_call) { case BufferMethods::UnkCacheFlush: case BufferMethods::WrcacheFlush: case BufferMethods::FenceValue: + break; case BufferMethods::FenceAction: + ProcessFenceActionMethod(); + break; + case BufferMethods::WaitForInterrupt: + ProcessWaitForInterruptMethod(); break; case BufferMethods::SemaphoreTrigger: { ProcessSemaphoreTriggerMethod(); @@ -298,8 +297,7 @@ void GPU::CallPullerMethod(const MethodCall& method_call) { break; } default: - LOG_ERROR(HW_GPU, "Special puller engine method {:X} not implemented", - static_cast<u32>(method)); + LOG_ERROR(HW_GPU, "Special puller engine method {:X} not implemented", method); break; } } @@ -378,10 +376,28 @@ void GPU::ProcessBindMethod(const MethodCall& method_call) { dma_pusher->BindSubchannel(kepler_memory.get(), method_call.subchannel); break; default: - UNIMPLEMENTED_MSG("Unimplemented engine {:04X}", static_cast<u32>(engine_id)); + UNIMPLEMENTED_MSG("Unimplemented engine {:04X}", engine_id); + } +} + +void GPU::ProcessFenceActionMethod() { + switch (regs.fence_action.op) { + case FenceOperation::Acquire: + WaitFence(regs.fence_action.syncpoint_id, regs.fence_value); + break; + case FenceOperation::Increment: + IncrementSyncPoint(regs.fence_action.syncpoint_id); + break; + default: + UNIMPLEMENTED_MSG("Unimplemented operation {}", regs.fence_action.op.Value()); } } +void GPU::ProcessWaitForInterruptMethod() { + // TODO(bunnei) ImplementMe + LOG_WARNING(HW_GPU, "(STUBBED) called"); +} + void GPU::ProcessSemaphoreTriggerMethod() { const auto semaphoreOperationMask = 0xF; const auto op = @@ -443,4 +459,75 @@ void GPU::ProcessSemaphoreAcquire() { } } +void GPU::Start() { + gpu_thread.StartThread(*renderer, renderer->Context(), *dma_pusher, *cdma_pusher); + cpu_context = renderer->GetRenderWindow().CreateSharedContext(); + cpu_context->MakeCurrent(); +} + +void GPU::ObtainContext() { + cpu_context->MakeCurrent(); +} + +void GPU::ReleaseContext() { + cpu_context->DoneCurrent(); +} + +void GPU::PushGPUEntries(Tegra::CommandList&& entries) { + gpu_thread.SubmitList(std::move(entries)); +} + +void GPU::PushCommandBuffer(Tegra::ChCommandHeaderList& entries) { + if (!use_nvdec) { + return; + } + // This condition fires when a video stream ends, clear all intermediary data + if (entries[0].raw == 0xDEADB33F) { + cdma_pusher.reset(); + return; + } + if (!cdma_pusher) { + cdma_pusher = std::make_unique<Tegra::CDmaPusher>(*this); + } + + // SubmitCommandBuffer would make the nvdec operations async, this is not currently 
working + // TODO(ameerj): RE proper async nvdec operation + // gpu_thread.SubmitCommandBuffer(std::move(entries)); + + cdma_pusher->Push(std::move(entries)); + cdma_pusher->DispatchCalls(); +} + +void GPU::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { + gpu_thread.SwapBuffers(framebuffer); +} + +void GPU::FlushRegion(VAddr addr, u64 size) { + gpu_thread.FlushRegion(addr, size); +} + +void GPU::InvalidateRegion(VAddr addr, u64 size) { + gpu_thread.InvalidateRegion(addr, size); +} + +void GPU::FlushAndInvalidateRegion(VAddr addr, u64 size) { + gpu_thread.FlushAndInvalidateRegion(addr, size); +} + +void GPU::TriggerCpuInterrupt(const u32 syncpoint_id, const u32 value) const { + auto& interrupt_manager = system.InterruptManager(); + interrupt_manager.GPUInterruptSyncpt(syncpoint_id, value); +} + +void GPU::WaitIdle() const { + gpu_thread.WaitIdle(); +} + +void GPU::OnCommandListEnd() { + if (is_async) { + // This command only applies to asynchronous GPU mode + gpu_thread.OnCommandListEnd(); + } +} + } // namespace Tegra diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index 2d15d1c6f..d81e38680 100644 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -13,14 +13,17 @@ #include "common/common_types.h" #include "core/hle/service/nvdrv/nvdata.h" #include "core/hle/service/nvflinger/buffer_queue.h" +#include "video_core/cdma_pusher.h" #include "video_core/dma_pusher.h" +#include "video_core/framebuffer_config.h" +#include "video_core/gpu_thread.h" using CacheAddr = std::uintptr_t; -inline CacheAddr ToCacheAddr(const void* host_ptr) { +[[nodiscard]] inline CacheAddr ToCacheAddr(const void* host_ptr) { return reinterpret_cast<CacheAddr>(host_ptr); } -inline u8* FromCacheAddr(CacheAddr cache_addr) { +[[nodiscard]] inline u8* FromCacheAddr(CacheAddr cache_addr) { return reinterpret_cast<u8*>(cache_addr); } @@ -100,28 +103,6 @@ enum class DepthFormat : u32 { struct CommandListHeader; class DebugContext; -/** - * Struct describing framebuffer configuration - */ -struct FramebufferConfig { - enum class PixelFormat : u32 { - A8B8G8R8_UNORM = 1, - RGB565_UNORM = 4, - B8G8R8A8_UNORM = 5, - }; - - VAddr address; - u32 offset; - u32 width; - u32 height; - u32 stride; - PixelFormat pixel_format; - - using TransformFlags = Service::NVFlinger::BufferQueue::BufferTransformFlags; - TransformFlags transform_flags; - Common::Rectangle<int> crop_rect; -}; - namespace Engines { class Fermi2D; class Maxwell3D; @@ -140,7 +121,7 @@ enum class EngineID { class MemoryManager; -class GPU { +class GPU final { public: struct MethodCall { u32 method{}; @@ -148,17 +129,17 @@ public: u32 subchannel{}; u32 method_count{}; - bool IsLastCall() const { + explicit MethodCall(u32 method_, u32 argument_, u32 subchannel_ = 0, u32 method_count_ = 0) + : method(method_), argument(argument_), subchannel(subchannel_), + method_count(method_count_) {} + + [[nodiscard]] bool IsLastCall() const { return method_count <= 1; } - - MethodCall(u32 method, u32 argument, u32 subchannel = 0, u32 method_count = 0) - : method(method), argument(argument), subchannel(subchannel), - method_count(method_count) {} }; - explicit GPU(Core::System& system, bool is_async); - virtual ~GPU(); + explicit GPU(Core::System& system_, bool is_async_, bool use_nvdec_); + ~GPU(); /// Binds a renderer to the GPU. void BindRenderer(std::unique_ptr<VideoCore::RendererBase> renderer); @@ -175,13 +156,13 @@ public: /// Synchronizes CPU writes with Host GPU memory. void SyncGuestHost(); /// Signal the ending of command list. 
- virtual void OnCommandListEnd(); + void OnCommandListEnd(); /// Request a host GPU memory flush from the CPU. - u64 RequestFlush(VAddr addr, std::size_t size); + [[nodiscard]] u64 RequestFlush(VAddr addr, std::size_t size); /// Obtains current flush request fence id. - u64 CurrentFlushRequestFence() const { + [[nodiscard]] u64 CurrentFlushRequestFence() const { return current_flush_fence.load(std::memory_order_relaxed); } @@ -189,68 +170,100 @@ public: void TickWork(); /// Returns a reference to the Maxwell3D GPU engine. - Engines::Maxwell3D& Maxwell3D(); + [[nodiscard]] Engines::Maxwell3D& Maxwell3D(); /// Returns a const reference to the Maxwell3D GPU engine. - const Engines::Maxwell3D& Maxwell3D() const; + [[nodiscard]] const Engines::Maxwell3D& Maxwell3D() const; /// Returns a reference to the KeplerCompute GPU engine. - Engines::KeplerCompute& KeplerCompute(); + [[nodiscard]] Engines::KeplerCompute& KeplerCompute(); /// Returns a reference to the KeplerCompute GPU engine. - const Engines::KeplerCompute& KeplerCompute() const; + [[nodiscard]] const Engines::KeplerCompute& KeplerCompute() const; /// Returns a reference to the GPU memory manager. - Tegra::MemoryManager& MemoryManager(); + [[nodiscard]] Tegra::MemoryManager& MemoryManager(); /// Returns a const reference to the GPU memory manager. - const Tegra::MemoryManager& MemoryManager() const; + [[nodiscard]] const Tegra::MemoryManager& MemoryManager() const; /// Returns a reference to the GPU DMA pusher. - Tegra::DmaPusher& DmaPusher(); + [[nodiscard]] Tegra::DmaPusher& DmaPusher(); + + /// Returns a const reference to the GPU DMA pusher. + [[nodiscard]] const Tegra::DmaPusher& DmaPusher() const; + + /// Returns a reference to the GPU CDMA pusher. + [[nodiscard]] Tegra::CDmaPusher& CDmaPusher(); + + /// Returns a const reference to the GPU CDMA pusher. + [[nodiscard]] const Tegra::CDmaPusher& CDmaPusher() const; - VideoCore::RendererBase& Renderer() { + /// Returns a reference to the underlying renderer. + [[nodiscard]] VideoCore::RendererBase& Renderer() { return *renderer; } - const VideoCore::RendererBase& Renderer() const { + /// Returns a const reference to the underlying renderer. + [[nodiscard]] const VideoCore::RendererBase& Renderer() const { return *renderer; } - VideoCore::ShaderNotify& ShaderNotify() { + /// Returns a reference to the shader notifier. + [[nodiscard]] VideoCore::ShaderNotify& ShaderNotify() { return *shader_notify; } - const VideoCore::ShaderNotify& ShaderNotify() const { + /// Returns a const reference to the shader notifier. + [[nodiscard]] const VideoCore::ShaderNotify& ShaderNotify() const { return *shader_notify; } // Waits for the GPU to finish working - virtual void WaitIdle() const = 0; + void WaitIdle() const; /// Allows the CPU/NvFlinger to wait on the GPU before presenting a frame. 
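Aside: RequestFlush and CurrentFlushRequestFence above suggest a ticket pattern: the caller receives a monotonically increasing fence for its flush request and later compares that ticket against a completed counter. A sketch under that assumption (all names here are hypothetical, not yuzu API):

    #include <atomic>
    #include <cstddef>
    #include <cstdint>
    #include <deque>
    #include <mutex>

    class FlushTickets {
    public:
        uint64_t Request(uint64_t addr, std::size_t size) {
            std::scoped_lock lock{mutex};
            requests.push_back({++last_issued, addr, size});
            return last_issued;
        }

        bool IsComplete(uint64_t fence) const {
            return completed.load(std::memory_order_relaxed) >= fence;
        }

    private:
        struct Request {
            uint64_t fence;
            uint64_t addr;
            std::size_t size;
        };
        std::mutex mutex;
        std::deque<Request> requests;
        uint64_t last_issued = 0;
        std::atomic<uint64_t> completed{0};
    };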
void WaitFence(u32 syncpoint_id, u32 value); void IncrementSyncPoint(u32 syncpoint_id); - u32 GetSyncpointValue(u32 syncpoint_id) const; + [[nodiscard]] u32 GetSyncpointValue(u32 syncpoint_id) const; void RegisterSyncptInterrupt(u32 syncpoint_id, u32 value); - bool CancelSyncptInterrupt(u32 syncpoint_id, u32 value); + [[nodiscard]] bool CancelSyncptInterrupt(u32 syncpoint_id, u32 value); - u64 GetTicks() const; + [[nodiscard]] u64 GetTicks() const; - std::unique_lock<std::mutex> LockSync() { + [[nodiscard]] std::unique_lock<std::mutex> LockSync() { return std::unique_lock{sync_mutex}; } - bool IsAsync() const { + [[nodiscard]] bool IsAsync() const { return is_async; } - /// Returns a const reference to the GPU DMA pusher. - const Tegra::DmaPusher& DmaPusher() const; + [[nodiscard]] bool UseNvdec() const { + return use_nvdec; + } + + enum class FenceOperation : u32 { + Acquire = 0, + Increment = 1, + }; + + union FenceAction { + u32 raw; + BitField<0, 1, FenceOperation> op; + BitField<8, 24, u32> syncpoint_id; + + [[nodiscard]] static CommandHeader Build(FenceOperation op, u32 syncpoint_id) { + FenceAction result{}; + result.op.Assign(op); + result.syncpoint_id.Assign(syncpoint_id); + return {result.raw}; + } + }; struct Regs { static constexpr size_t NUM_REGS = 0x40; @@ -262,7 +275,7 @@ public: u32 address_high; u32 address_low; - GPUVAddr SemaphoreAddress() const { + [[nodiscard]] GPUVAddr SemaphoreAddress() const { return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | address_low); } @@ -280,10 +293,7 @@ public: u32 semaphore_acquire; u32 semaphore_release; u32 fence_value; - union { - BitField<4, 4, u32> operation; - BitField<8, 8, u32> id; - } fence_action; + FenceAction fence_action; INSERT_UNION_PADDING_WORDS(0xE2); // Puller state @@ -300,34 +310,39 @@ public: /// Performs any additional setup necessary in order to begin GPU emulation. /// This can be used to launch any necessary threads and register any necessary /// core timing events. 
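Aside: FenceAction::Build above packs an operation and a syncpoint id into one command word via BitField (op in bit 0, syncpoint id in bits 8..31). The equivalent with plain bit arithmetic, as a standalone sketch:

    #include <cstdint>

    enum class FenceOperation : uint32_t { Acquire = 0, Increment = 1 };

    constexpr uint32_t BuildFenceAction(FenceOperation op, uint32_t syncpoint_id) {
        return static_cast<uint32_t>(op) | // bit 0: operation
               (syncpoint_id << 8);        // bits 8..31: syncpoint id
    }

    static_assert(BuildFenceAction(FenceOperation::Increment, 9) == 0x901);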
- virtual void Start() = 0; + void Start(); /// Obtain the CPU Context - virtual void ObtainContext() = 0; + void ObtainContext(); /// Release the CPU Context - virtual void ReleaseContext() = 0; + void ReleaseContext(); /// Push GPU command entries to be processed - virtual void PushGPUEntries(Tegra::CommandList&& entries) = 0; + void PushGPUEntries(Tegra::CommandList&& entries); + + /// Push GPU command buffer entries to be processed + void PushCommandBuffer(Tegra::ChCommandHeaderList& entries); /// Swap buffers (render frame) - virtual void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) = 0; + void SwapBuffers(const Tegra::FramebufferConfig* framebuffer); /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory - virtual void FlushRegion(VAddr addr, u64 size) = 0; + void FlushRegion(VAddr addr, u64 size); /// Notify rasterizer that any caches of the specified region should be invalidated - virtual void InvalidateRegion(VAddr addr, u64 size) = 0; + void InvalidateRegion(VAddr addr, u64 size); /// Notify rasterizer that any caches of the specified region should be flushed and invalidated - virtual void FlushAndInvalidateRegion(VAddr addr, u64 size) = 0; + void FlushAndInvalidateRegion(VAddr addr, u64 size); protected: - virtual void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const = 0; + void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const; private: void ProcessBindMethod(const MethodCall& method_call); + void ProcessFenceActionMethod(); + void ProcessWaitForInterruptMethod(); void ProcessSemaphoreTriggerMethod(); void ProcessSemaphoreRelease(); void ProcessSemaphoreAcquire(); @@ -343,13 +358,15 @@ private: u32 methods_pending); /// Determines where the method should be executed. - bool ExecuteMethodOnEngine(u32 method); + [[nodiscard]] bool ExecuteMethodOnEngine(u32 method); protected: Core::System& system; std::unique_ptr<Tegra::MemoryManager> memory_manager; std::unique_ptr<Tegra::DmaPusher> dma_pusher; + std::unique_ptr<Tegra::CDmaPusher> cdma_pusher; std::unique_ptr<VideoCore::RendererBase> renderer; + const bool use_nvdec; private: /// Mapping of command subchannels to their bound engine ids @@ -372,12 +389,13 @@ private: std::array<std::list<u32>, Service::Nvidia::MaxSyncPoints> syncpt_interrupts; std::mutex sync_mutex; + std::mutex device_mutex; std::condition_variable sync_cv; struct FlushRequest { - FlushRequest(u64 fence, VAddr addr, std::size_t size) - : fence{fence}, addr{addr}, size{size} {} + explicit FlushRequest(u64 fence_, VAddr addr_, std::size_t size_) + : fence{fence_}, addr{addr_}, size{size_} {} u64 fence; VAddr addr; std::size_t size; @@ -389,6 +407,9 @@ private: std::mutex flush_request_mutex; const bool is_async; + + VideoCommon::GPUThread::ThreadManager gpu_thread; + std::unique_ptr<Core::Frontend::GraphicsContext> cpu_context; }; #define ASSERT_REG_POSITION(field_name, position) \ diff --git a/src/video_core/gpu_asynch.cpp b/src/video_core/gpu_asynch.cpp deleted file mode 100644 index 70a3d5738..000000000 --- a/src/video_core/gpu_asynch.cpp +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. 
- -#include "core/core.h" -#include "core/hardware_interrupt_manager.h" -#include "video_core/gpu_asynch.h" -#include "video_core/gpu_thread.h" -#include "video_core/renderer_base.h" - -namespace VideoCommon { - -GPUAsynch::GPUAsynch(Core::System& system) : GPU{system, true}, gpu_thread{system} {} - -GPUAsynch::~GPUAsynch() = default; - -void GPUAsynch::Start() { - gpu_thread.StartThread(*renderer, renderer->Context(), *dma_pusher); - cpu_context = renderer->GetRenderWindow().CreateSharedContext(); - cpu_context->MakeCurrent(); -} - -void GPUAsynch::ObtainContext() { - cpu_context->MakeCurrent(); -} - -void GPUAsynch::ReleaseContext() { - cpu_context->DoneCurrent(); -} - -void GPUAsynch::PushGPUEntries(Tegra::CommandList&& entries) { - gpu_thread.SubmitList(std::move(entries)); -} - -void GPUAsynch::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { - gpu_thread.SwapBuffers(framebuffer); -} - -void GPUAsynch::FlushRegion(VAddr addr, u64 size) { - gpu_thread.FlushRegion(addr, size); -} - -void GPUAsynch::InvalidateRegion(VAddr addr, u64 size) { - gpu_thread.InvalidateRegion(addr, size); -} - -void GPUAsynch::FlushAndInvalidateRegion(VAddr addr, u64 size) { - gpu_thread.FlushAndInvalidateRegion(addr, size); -} - -void GPUAsynch::TriggerCpuInterrupt(const u32 syncpoint_id, const u32 value) const { - auto& interrupt_manager = system.InterruptManager(); - interrupt_manager.GPUInterruptSyncpt(syncpoint_id, value); -} - -void GPUAsynch::WaitIdle() const { - gpu_thread.WaitIdle(); -} - -void GPUAsynch::OnCommandListEnd() { - gpu_thread.OnCommandListEnd(); -} - -} // namespace VideoCommon diff --git a/src/video_core/gpu_asynch.h b/src/video_core/gpu_asynch.h deleted file mode 100644 index f89c855a5..000000000 --- a/src/video_core/gpu_asynch.h +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include "video_core/gpu.h" -#include "video_core/gpu_thread.h" - -namespace Core::Frontend { -class GraphicsContext; -} - -namespace VideoCore { -class RendererBase; -} // namespace VideoCore - -namespace VideoCommon { - -/// Implementation of GPU interface that runs the GPU asynchronously -class GPUAsynch final : public Tegra::GPU { -public: - explicit GPUAsynch(Core::System& system); - ~GPUAsynch() override; - - void Start() override; - void ObtainContext() override; - void ReleaseContext() override; - void PushGPUEntries(Tegra::CommandList&& entries) override; - void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override; - void FlushRegion(VAddr addr, u64 size) override; - void InvalidateRegion(VAddr addr, u64 size) override; - void FlushAndInvalidateRegion(VAddr addr, u64 size) override; - void WaitIdle() const override; - - void OnCommandListEnd() override; - -protected: - void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const override; - -private: - GPUThread::ThreadManager gpu_thread; - std::unique_ptr<Core::Frontend::GraphicsContext> cpu_context; -}; - -} // namespace VideoCommon diff --git a/src/video_core/gpu_synch.cpp b/src/video_core/gpu_synch.cpp deleted file mode 100644 index 1ca47ddef..000000000 --- a/src/video_core/gpu_synch.cpp +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. 
- -#include "video_core/gpu_synch.h" -#include "video_core/renderer_base.h" - -namespace VideoCommon { - -GPUSynch::GPUSynch(Core::System& system) : GPU{system, false} {} - -GPUSynch::~GPUSynch() = default; - -void GPUSynch::Start() {} - -void GPUSynch::ObtainContext() { - renderer->Context().MakeCurrent(); -} - -void GPUSynch::ReleaseContext() { - renderer->Context().DoneCurrent(); -} - -void GPUSynch::PushGPUEntries(Tegra::CommandList&& entries) { - dma_pusher->Push(std::move(entries)); - dma_pusher->DispatchCalls(); -} - -void GPUSynch::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { - renderer->SwapBuffers(framebuffer); -} - -void GPUSynch::FlushRegion(VAddr addr, u64 size) { - renderer->Rasterizer().FlushRegion(addr, size); -} - -void GPUSynch::InvalidateRegion(VAddr addr, u64 size) { - renderer->Rasterizer().InvalidateRegion(addr, size); -} - -void GPUSynch::FlushAndInvalidateRegion(VAddr addr, u64 size) { - renderer->Rasterizer().FlushAndInvalidateRegion(addr, size); -} - -} // namespace VideoCommon diff --git a/src/video_core/gpu_synch.h b/src/video_core/gpu_synch.h deleted file mode 100644 index 297258cb1..000000000 --- a/src/video_core/gpu_synch.h +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include "video_core/gpu.h" - -namespace Core::Frontend { -class GraphicsContext; -} - -namespace VideoCore { -class RendererBase; -} // namespace VideoCore - -namespace VideoCommon { - -/// Implementation of GPU interface that runs the GPU synchronously -class GPUSynch final : public Tegra::GPU { -public: - explicit GPUSynch(Core::System& system); - ~GPUSynch() override; - - void Start() override; - void ObtainContext() override; - void ReleaseContext() override; - void PushGPUEntries(Tegra::CommandList&& entries) override; - void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override; - void FlushRegion(VAddr addr, u64 size) override; - void InvalidateRegion(VAddr addr, u64 size) override; - void FlushAndInvalidateRegion(VAddr addr, u64 size) override; - void WaitIdle() const override {} - -protected: - void TriggerCpuInterrupt([[maybe_unused]] u32 syncpoint_id, - [[maybe_unused]] u32 value) const override {} -}; - -} // namespace VideoCommon diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp index bf761abf2..7e490bcc3 100644 --- a/src/video_core/gpu_thread.cpp +++ b/src/video_core/gpu_thread.cpp @@ -4,6 +4,7 @@ #include "common/assert.h" #include "common/microprofile.h" +#include "common/scope_exit.h" #include "common/thread.h" #include "core/core.h" #include "core/frontend/emu_window.h" @@ -18,9 +19,11 @@ namespace VideoCommon::GPUThread { /// Runs the GPU thread static void RunThread(Core::System& system, VideoCore::RendererBase& renderer, Core::Frontend::GraphicsContext& context, Tegra::DmaPusher& dma_pusher, - SynchState& state) { + SynchState& state, Tegra::CDmaPusher& cdma_pusher) { std::string name = "yuzu:GPU"; MicroProfileOnThreadCreate(name.c_str()); + SCOPE_EXIT({ MicroProfileOnThreadExit(); }); + Common::SetCurrentThreadName(name.c_str()); Common::SetCurrentThreadPriority(Common::ThreadPriority::High); system.RegisterHostThread(); @@ -39,19 +42,23 @@ static void RunThread(Core::System& system, VideoCore::RendererBase& renderer, CommandDataContainer next; while (state.is_running) { next = state.queue.PopWait(); - if (const auto submit_list = std::get_if<SubmitListCommand>(&next.data)) { + if (auto* 
submit_list = std::get_if<SubmitListCommand>(&next.data)) { dma_pusher.Push(std::move(submit_list->entries)); dma_pusher.DispatchCalls(); - } else if (const auto data = std::get_if<SwapBuffersCommand>(&next.data)) { + } else if (auto* command_list = std::get_if<SubmitChCommandEntries>(&next.data)) { + // NVDEC + cdma_pusher.Push(std::move(command_list->entries)); + cdma_pusher.DispatchCalls(); + } else if (const auto* data = std::get_if<SwapBuffersCommand>(&next.data)) { renderer.SwapBuffers(data->framebuffer ? &*data->framebuffer : nullptr); } else if (std::holds_alternative<OnCommandListEndCommand>(next.data)) { renderer.Rasterizer().ReleaseFences(); } else if (std::holds_alternative<GPUTickCommand>(next.data)) { system.GPU().TickWork(); - } else if (const auto data = std::get_if<FlushRegionCommand>(&next.data)) { - renderer.Rasterizer().FlushRegion(data->addr, data->size); - } else if (const auto data = std::get_if<InvalidateRegionCommand>(&next.data)) { - renderer.Rasterizer().OnCPUWrite(data->addr, data->size); + } else if (const auto* flush = std::get_if<FlushRegionCommand>(&next.data)) { + renderer.Rasterizer().FlushRegion(flush->addr, flush->size); + } else if (const auto* invalidate = std::get_if<InvalidateRegionCommand>(&next.data)) { + renderer.Rasterizer().OnCPUWrite(invalidate->addr, invalidate->size); } else if (std::holds_alternative<EndProcessingCommand>(next.data)) { return; } else { @@ -61,7 +68,8 @@ static void RunThread(Core::System& system, VideoCore::RendererBase& renderer, } } -ThreadManager::ThreadManager(Core::System& system) : system{system} {} +ThreadManager::ThreadManager(Core::System& system_, bool is_async_) + : system{system_}, is_async{is_async_} {} ThreadManager::~ThreadManager() { if (!thread.joinable()) { @@ -75,33 +83,48 @@ ThreadManager::~ThreadManager() { void ThreadManager::StartThread(VideoCore::RendererBase& renderer, Core::Frontend::GraphicsContext& context, - Tegra::DmaPusher& dma_pusher) { - thread = std::thread{RunThread, std::ref(system), std::ref(renderer), - std::ref(context), std::ref(dma_pusher), std::ref(state)}; + Tegra::DmaPusher& dma_pusher, Tegra::CDmaPusher& cdma_pusher) { + thread = std::thread(RunThread, std::ref(system), std::ref(renderer), std::ref(context), + std::ref(dma_pusher), std::ref(state), std::ref(cdma_pusher)); } void ThreadManager::SubmitList(Tegra::CommandList&& entries) { PushCommand(SubmitListCommand(std::move(entries))); } +void ThreadManager::SubmitCommandBuffer(Tegra::ChCommandHeaderList&& entries) { + PushCommand(SubmitChCommandEntries(std::move(entries))); +} + void ThreadManager::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { PushCommand(SwapBuffersCommand(framebuffer ? std::make_optional(*framebuffer) : std::nullopt)); } void ThreadManager::FlushRegion(VAddr addr, u64 size) { - if (!Settings::IsGPULevelHigh()) { + if (!is_async) { + // Always flush with synchronous GPU mode PushCommand(FlushRegionCommand(addr, size)); return; } - if (!Settings::IsGPULevelExtreme()) { - return; - } - if (system.Renderer().Rasterizer().MustFlushRegion(addr, size)) { + + // Asynchronous GPU mode + switch (Settings::values.gpu_accuracy.GetValue()) { + case Settings::GPUAccuracy::Normal: + PushCommand(FlushRegionCommand(addr, size)); + break; + case Settings::GPUAccuracy::High: + // TODO(bunnei): Is this right? 
Preserving existing behavior for now + break; + case Settings::GPUAccuracy::Extreme: { auto& gpu = system.GPU(); u64 fence = gpu.RequestFlush(addr, size); PushCommand(GPUTickCommand()); while (fence > gpu.CurrentFlushRequestFence()) { } + break; + } + default: + UNIMPLEMENTED_MSG("Unsupported gpu_accuracy {}", Settings::values.gpu_accuracy.GetValue()); } } @@ -115,7 +138,8 @@ void ThreadManager::FlushAndInvalidateRegion(VAddr addr, u64 size) { } void ThreadManager::WaitIdle() const { - while (state.last_fence > state.signaled_fence.load(std::memory_order_relaxed)) { + while (state.last_fence > state.signaled_fence.load(std::memory_order_relaxed) && + system.IsPoweredOn()) { } } @@ -126,6 +150,12 @@ void ThreadManager::OnCommandListEnd() { u64 ThreadManager::PushCommand(CommandData&& command_data) { const u64 fence{++state.last_fence}; state.queue.Push(CommandDataContainer(std::move(command_data), fence)); + + if (!is_async) { + // In synchronous GPU mode, block the caller until the command has executed + WaitIdle(); + } + return fence; } diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h index 5a28335d6..2775629e7 100644 --- a/src/video_core/gpu_thread.h +++ b/src/video_core/gpu_thread.h @@ -10,8 +10,9 @@ #include <optional> #include <thread> #include <variant> + #include "common/threadsafe_queue.h" -#include "video_core/gpu.h" +#include "video_core/framebuffer_config.h" namespace Tegra { struct FramebufferConfig; @@ -25,6 +26,10 @@ class GraphicsContext; class System; } // namespace Core +namespace VideoCore { +class RendererBase; +} // namespace VideoCore + namespace VideoCommon::GPUThread { /// Command to signal to the GPU thread that processing has ended @@ -32,22 +37,30 @@ struct EndProcessingCommand final {}; /// Command to signal to the GPU thread that a command list is ready for processing struct SubmitListCommand final { - explicit SubmitListCommand(Tegra::CommandList&& entries) : entries{std::move(entries)} {} + explicit SubmitListCommand(Tegra::CommandList&& entries_) : entries{std::move(entries_)} {} Tegra::CommandList entries; }; +/// Command to signal to the GPU thread that a cdma command list is ready for processing +struct SubmitChCommandEntries final { + explicit SubmitChCommandEntries(Tegra::ChCommandHeaderList&& entries_) + : entries{std::move(entries_)} {} + + Tegra::ChCommandHeaderList entries; +}; + /// Command to signal to the GPU thread that a swap buffers is pending struct SwapBuffersCommand final { - explicit SwapBuffersCommand(std::optional<const Tegra::FramebufferConfig> framebuffer) - : framebuffer{std::move(framebuffer)} {} + explicit SwapBuffersCommand(std::optional<const Tegra::FramebufferConfig> framebuffer_) + : framebuffer{std::move(framebuffer_)} {} std::optional<Tegra::FramebufferConfig> framebuffer; }; /// Command to signal to the GPU thread to flush a region struct FlushRegionCommand final { - explicit constexpr FlushRegionCommand(VAddr addr, u64 size) : addr{addr}, size{size} {} + explicit constexpr FlushRegionCommand(VAddr addr_, u64 size_) : addr{addr_}, size{size_} {} VAddr addr; u64 size; @@ -55,7 +68,7 @@ struct FlushRegionCommand final { /// Command to signal to the GPU thread to invalidate a region struct InvalidateRegionCommand final { - explicit constexpr InvalidateRegionCommand(VAddr addr, u64 size) : addr{addr}, size{size} {} + explicit constexpr InvalidateRegionCommand(VAddr addr_, u64 size_) : addr{addr_}, size{size_} {} VAddr addr; u64 size; @@ -63,8 +76,8 @@ struct InvalidateRegionCommand final { /// Command to 
signal to the GPU thread to flush and invalidate a region struct FlushAndInvalidateRegionCommand final { - explicit constexpr FlushAndInvalidateRegionCommand(VAddr addr, u64 size) - : addr{addr}, size{size} {} + explicit constexpr FlushAndInvalidateRegionCommand(VAddr addr_, u64 size_) + : addr{addr_}, size{size_} {} VAddr addr; u64 size; @@ -77,15 +90,15 @@ struct OnCommandListEndCommand final {}; struct GPUTickCommand final {}; using CommandData = - std::variant<EndProcessingCommand, SubmitListCommand, SwapBuffersCommand, FlushRegionCommand, - InvalidateRegionCommand, FlushAndInvalidateRegionCommand, OnCommandListEndCommand, - GPUTickCommand>; + std::variant<EndProcessingCommand, SubmitListCommand, SubmitChCommandEntries, + SwapBuffersCommand, FlushRegionCommand, InvalidateRegionCommand, + FlushAndInvalidateRegionCommand, OnCommandListEndCommand, GPUTickCommand>; struct CommandDataContainer { CommandDataContainer() = default; - CommandDataContainer(CommandData&& data, u64 next_fence) - : data{std::move(data)}, fence{next_fence} {} + explicit CommandDataContainer(CommandData&& data_, u64 next_fence_) + : data{std::move(data_)}, fence{next_fence_} {} CommandData data; u64 fence{}; @@ -104,16 +117,19 @@ struct SynchState final { /// Class used to manage the GPU thread class ThreadManager final { public: - explicit ThreadManager(Core::System& system); + explicit ThreadManager(Core::System& system_, bool is_async_); ~ThreadManager(); /// Creates and starts the GPU thread. void StartThread(VideoCore::RendererBase& renderer, Core::Frontend::GraphicsContext& context, - Tegra::DmaPusher& dma_pusher); + Tegra::DmaPusher& dma_pusher, Tegra::CDmaPusher& cdma_pusher); /// Push GPU command entries to be processed void SubmitList(Tegra::CommandList&& entries); + /// Push GPU CDMA command buffer entries to be processed + void SubmitCommandBuffer(Tegra::ChCommandHeaderList&& entries); + /// Swap buffers (render frame) void SwapBuffers(const Tegra::FramebufferConfig* framebuffer); @@ -135,11 +151,11 @@ private: /// Pushes a command to be executed by the GPU thread u64 PushCommand(CommandData&& command_data); -private: SynchState state; Core::System& system; std::thread thread; std::thread::id thread_id; + const bool is_async; }; } // namespace VideoCommon::GPUThread diff --git a/src/video_core/guest_driver.h b/src/video_core/guest_driver.h index 99450777e..21e569ba1 100644 --- a/src/video_core/guest_driver.h +++ b/src/video_core/guest_driver.h @@ -19,8 +19,8 @@ namespace VideoCore { class GuestDriverProfile { public: explicit GuestDriverProfile() = default; - explicit GuestDriverProfile(std::optional<u32> texture_handler_size) - : texture_handler_size{texture_handler_size} {} + explicit GuestDriverProfile(std::optional<u32> texture_handler_size_) + : texture_handler_size{texture_handler_size_} {} void DeduceTextureHandlerSize(std::vector<u32> bound_offsets); diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt index aa62363a7..4c7399d5a 100644 --- a/src/video_core/host_shaders/CMakeLists.txt +++ b/src/video_core/host_shaders/CMakeLists.txt @@ -1,18 +1,29 @@ set(SHADER_FILES + block_linear_unswizzle_2d.comp + block_linear_unswizzle_3d.comp + convert_depth_to_float.frag + convert_float_to_depth.frag + full_screen_triangle.vert + opengl_copy_bc4.comp opengl_present.frag opengl_present.vert + pitch_unswizzle.comp + vulkan_blit_color_float.frag + vulkan_blit_depth_stencil.frag + vulkan_present.frag + vulkan_present.vert + vulkan_quad_array.comp + 
vulkan_quad_indexed.comp + vulkan_uint8.comp ) -set(SHADER_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/include) -set(HOST_SHADERS_INCLUDE ${SHADER_INCLUDE} PARENT_SCOPE) +find_program(GLSLANGVALIDATOR "glslangValidator" REQUIRED) + +set(GLSL_FLAGS "") +set(SHADER_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/include) set(SHADER_DIR ${SHADER_INCLUDE}/video_core/host_shaders) -add_custom_command( - OUTPUT - ${SHADER_DIR} - COMMAND - ${CMAKE_COMMAND} -E make_directory ${SHADER_DIR} -) +set(HOST_SHADERS_INCLUDE ${SHADER_INCLUDE} PARENT_SCOPE) set(INPUT_FILE ${CMAKE_CURRENT_SOURCE_DIR}/source_shader.h.in) set(HEADER_GENERATOR ${CMAKE_CURRENT_SOURCE_DIR}/StringShaderHeader.cmake) @@ -20,19 +31,36 @@ set(HEADER_GENERATOR ${CMAKE_CURRENT_SOURCE_DIR}/StringShaderHeader.cmake) foreach(FILENAME IN ITEMS ${SHADER_FILES}) string(REPLACE "." "_" SHADER_NAME ${FILENAME}) set(SOURCE_FILE ${CMAKE_CURRENT_SOURCE_DIR}/${FILENAME}) - set(HEADER_FILE ${SHADER_DIR}/${SHADER_NAME}.h) - add_custom_command( - OUTPUT - ${HEADER_FILE} - COMMAND - ${CMAKE_COMMAND} -P ${HEADER_GENERATOR} ${SOURCE_FILE} ${HEADER_FILE} ${INPUT_FILE} - MAIN_DEPENDENCY - ${SOURCE_FILE} - DEPENDS - ${HEADER_GENERATOR} - ${INPUT_FILE} - ) - set(SHADER_HEADERS ${SHADER_HEADERS} ${HEADER_FILE}) + # Skip generating source headers on Vulkan exclusive files + if (NOT ${FILENAME} MATCHES "vulkan.*") + set(SOURCE_HEADER_FILE ${SHADER_DIR}/${SHADER_NAME}.h) + add_custom_command( + OUTPUT + ${SOURCE_HEADER_FILE} + COMMAND + ${CMAKE_COMMAND} -P ${HEADER_GENERATOR} ${SOURCE_FILE} ${SOURCE_HEADER_FILE} ${INPUT_FILE} + MAIN_DEPENDENCY + ${SOURCE_FILE} + DEPENDS + ${INPUT_FILE} + # HEADER_GENERATOR should be included here but msbuild seems to assume it's always modified + ) + set(SHADER_HEADERS ${SHADER_HEADERS} ${SOURCE_HEADER_FILE}) + endif() + # Skip compiling to SPIR-V OpenGL exclusive files + if (NOT ${FILENAME} MATCHES "opengl.*") + string(TOUPPER ${SHADER_NAME}_SPV SPIRV_VARIABLE_NAME) + set(SPIRV_HEADER_FILE ${SHADER_DIR}/${SHADER_NAME}_spv.h) + add_custom_command( + OUTPUT + ${SPIRV_HEADER_FILE} + COMMAND + ${GLSLANGVALIDATOR} -V ${GLSL_FLAGS} --variable-name ${SPIRV_VARIABLE_NAME} -o ${SPIRV_HEADER_FILE} ${SOURCE_FILE} + MAIN_DEPENDENCY + ${SOURCE_FILE} + ) + set(SHADER_HEADERS ${SHADER_HEADERS} ${SPIRV_HEADER_FILE}) + endif() endforeach() add_custom_target(host_shaders diff --git a/src/video_core/host_shaders/StringShaderHeader.cmake b/src/video_core/host_shaders/StringShaderHeader.cmake index 368bce0ed..c0fc49768 100644 --- a/src/video_core/host_shaders/StringShaderHeader.cmake +++ b/src/video_core/host_shaders/StringShaderHeader.cmake @@ -8,4 +8,6 @@ string(TOUPPER ${CONTENTS_NAME} CONTENTS_NAME) file(READ ${SOURCE_FILE} CONTENTS) +get_filename_component(OUTPUT_DIR ${HEADER_FILE} DIRECTORY) +make_directory(${OUTPUT_DIR}) configure_file(${INPUT_FILE} ${HEADER_FILE} @ONLY) diff --git a/src/video_core/host_shaders/block_linear_unswizzle_2d.comp b/src/video_core/host_shaders/block_linear_unswizzle_2d.comp new file mode 100644 index 000000000..a131be79e --- /dev/null +++ b/src/video_core/host_shaders/block_linear_unswizzle_2d.comp @@ -0,0 +1,122 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
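The host-shader build above now emits two kinds of headers per GLSL file: string headers for the OpenGL backend, and, for every file not matching opengl.*, a SPIR-V header produced by glslangValidator -V with --variable-name, which defines the compiled words as a uint32_t array named after the uppercased <shader>_SPV symbol. A minimal consumer for the full_screen_triangle.vert output of these rules (a sketch; error handling elided):

    #include <vulkan/vulkan.h>
    #include "video_core/host_shaders/full_screen_triangle_vert_spv.h"

    // sizeof() gives the byte size because the header defines a plain C array.
    VkShaderModule MakeFullScreenTriangleVert(VkDevice device) {
        VkShaderModuleCreateInfo ci{};
        ci.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO;
        ci.codeSize = sizeof(FULL_SCREEN_TRIANGLE_VERT_SPV);
        ci.pCode = FULL_SCREEN_TRIANGLE_VERT_SPV;
        VkShaderModule shader_module{};
        vkCreateShaderModule(device, &ci, nullptr, &shader_module);
        return shader_module;
    }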
+ +#version 430 + +#ifdef VULKAN + +#extension GL_EXT_shader_16bit_storage : require +#extension GL_EXT_shader_8bit_storage : require +#define HAS_EXTENDED_TYPES 1 +#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants { +#define END_PUSH_CONSTANTS }; +#define UNIFORM(n) +#define BINDING_SWIZZLE_BUFFER 0 +#define BINDING_INPUT_BUFFER 1 +#define BINDING_OUTPUT_IMAGE 2 + +#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv + +#extension GL_NV_gpu_shader5 : enable +#ifdef GL_NV_gpu_shader5 +#define HAS_EXTENDED_TYPES 1 +#else +#define HAS_EXTENDED_TYPES 0 +#endif +#define BEGIN_PUSH_CONSTANTS +#define END_PUSH_CONSTANTS +#define UNIFORM(n) layout (location = n) uniform +#define BINDING_SWIZZLE_BUFFER 0 +#define BINDING_INPUT_BUFFER 1 +#define BINDING_OUTPUT_IMAGE 0 + +#endif + +BEGIN_PUSH_CONSTANTS +UNIFORM(0) uvec3 origin; +UNIFORM(1) ivec3 destination; +UNIFORM(2) uint bytes_per_block_log2; +UNIFORM(3) uint layer_stride; +UNIFORM(4) uint block_size; +UNIFORM(5) uint x_shift; +UNIFORM(6) uint block_height; +UNIFORM(7) uint block_height_mask; +END_PUSH_CONSTANTS + +layout(binding = BINDING_SWIZZLE_BUFFER, std430) readonly buffer SwizzleTable { + uint swizzle_table[]; +}; + +#if HAS_EXTENDED_TYPES +layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU8 { uint8_t u8data[]; }; +layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU16 { uint16_t u16data[]; }; +#endif +layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU32 { uint u32data[]; }; +layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU64 { uvec2 u64data[]; }; +layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU128 { uvec4 u128data[]; }; + +layout(binding = BINDING_OUTPUT_IMAGE) uniform writeonly uimage2DArray output_image; + +layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in; + +const uint GOB_SIZE_X = 64; +const uint GOB_SIZE_Y = 8; +const uint GOB_SIZE_Z = 1; +const uint GOB_SIZE = GOB_SIZE_X * GOB_SIZE_Y * GOB_SIZE_Z; + +const uint GOB_SIZE_X_SHIFT = 6; +const uint GOB_SIZE_Y_SHIFT = 3; +const uint GOB_SIZE_Z_SHIFT = 0; +const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHIFT; + +const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1, GOB_SIZE_Y - 1); + +uint SwizzleOffset(uvec2 pos) { + pos = pos & SWIZZLE_MASK; + return swizzle_table[pos.y * 64 + pos.x]; +} + +uvec4 ReadTexel(uint offset) { + switch (bytes_per_block_log2) { +#if HAS_EXTENDED_TYPES + case 0: + return uvec4(u8data[offset], 0, 0, 0); + case 1: + return uvec4(u16data[offset / 2], 0, 0, 0); +#else + case 0: + return uvec4(bitfieldExtract(u32data[offset / 4], int((offset * 8) & 24), 8), 0, 0, 0); + case 1: + return uvec4(bitfieldExtract(u32data[offset / 4], int((offset * 8) & 16), 16), 0, 0, 0); +#endif + case 2: + return uvec4(u32data[offset / 4], 0, 0, 0); + case 3: + return uvec4(u64data[offset / 8], 0, 0); + case 4: + return u128data[offset / 16]; + } + return uvec4(0); +} + +void main() { + uvec3 pos = gl_GlobalInvocationID + origin; + pos.x <<= bytes_per_block_log2; + + // Read as soon as possible due to its latency + const uint swizzle = SwizzleOffset(pos.xy); + + const uint block_y = pos.y >> GOB_SIZE_Y_SHIFT; + + uint offset = 0; + offset += pos.z * layer_stride; + offset += (block_y >> block_height) * block_size; + offset += (block_y & block_height_mask) << GOB_SIZE_SHIFT; + offset += (pos.x >> GOB_SIZE_X_SHIFT) << x_shift; + offset += swizzle; + + const uvec4 texel = ReadTexel(offset); + const ivec3 coord = ivec3(gl_GlobalInvocationID) + 
destination; + imageStore(output_image, coord, texel); +} diff --git a/src/video_core/host_shaders/block_linear_unswizzle_3d.comp b/src/video_core/host_shaders/block_linear_unswizzle_3d.comp new file mode 100644 index 000000000..bb6872e6b --- /dev/null +++ b/src/video_core/host_shaders/block_linear_unswizzle_3d.comp @@ -0,0 +1,125 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#version 430 + +#ifdef VULKAN + +#extension GL_EXT_shader_16bit_storage : require +#extension GL_EXT_shader_8bit_storage : require +#define HAS_EXTENDED_TYPES 1 +#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants { +#define END_PUSH_CONSTANTS }; +#define UNIFORM(n) +#define BINDING_SWIZZLE_BUFFER 0 +#define BINDING_INPUT_BUFFER 1 +#define BINDING_OUTPUT_IMAGE 2 + +#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv + +#extension GL_NV_gpu_shader5 : enable +#ifdef GL_NV_gpu_shader5 +#define HAS_EXTENDED_TYPES 1 +#else +#define HAS_EXTENDED_TYPES 0 +#endif +#define BEGIN_PUSH_CONSTANTS +#define END_PUSH_CONSTANTS +#define UNIFORM(n) layout (location = n) uniform +#define BINDING_SWIZZLE_BUFFER 0 +#define BINDING_INPUT_BUFFER 1 +#define BINDING_OUTPUT_IMAGE 0 + +#endif + +BEGIN_PUSH_CONSTANTS +UNIFORM(0) uvec3 origin; +UNIFORM(1) ivec3 destination; +UNIFORM(2) uint bytes_per_block_log2; +UNIFORM(3) uint slice_size; +UNIFORM(4) uint block_size; +UNIFORM(5) uint x_shift; +UNIFORM(6) uint block_height; +UNIFORM(7) uint block_height_mask; +UNIFORM(8) uint block_depth; +UNIFORM(9) uint block_depth_mask; +END_PUSH_CONSTANTS + +layout(binding = BINDING_SWIZZLE_BUFFER, std430) readonly buffer SwizzleTable { + uint swizzle_table[]; +}; + +#if HAS_EXTENDED_TYPES +layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU8 { uint8_t u8data[]; }; +layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU16 { uint16_t u16data[]; }; +#endif +layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU32 { uint u32data[]; }; +layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU64 { uvec2 u64data[]; }; +layout(binding = BINDING_INPUT_BUFFER, std430) buffer InputBufferU128 { uvec4 u128data[]; }; + +layout(binding = BINDING_OUTPUT_IMAGE) uniform writeonly uimage3D output_image; + +layout(local_size_x = 16, local_size_y = 8, local_size_z = 8) in; + +const uint GOB_SIZE_X = 64; +const uint GOB_SIZE_Y = 8; +const uint GOB_SIZE_Z = 1; +const uint GOB_SIZE = GOB_SIZE_X * GOB_SIZE_Y * GOB_SIZE_Z; + +const uint GOB_SIZE_X_SHIFT = 6; +const uint GOB_SIZE_Y_SHIFT = 3; +const uint GOB_SIZE_Z_SHIFT = 0; +const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHIFT; + +const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1, GOB_SIZE_Y - 1); + +uint SwizzleOffset(uvec2 pos) { + pos = pos & SWIZZLE_MASK; + return swizzle_table[pos.y * 64 + pos.x]; +} + +uvec4 ReadTexel(uint offset) { + switch (bytes_per_block_log2) { +#if HAS_EXTENDED_TYPES + case 0: + return uvec4(u8data[offset], 0, 0, 0); + case 1: + return uvec4(u16data[offset / 2], 0, 0, 0); +#else + case 0: + return uvec4(bitfieldExtract(u32data[offset / 4], int((offset * 8) & 24), 8), 0, 0, 0); + case 1: + return uvec4(bitfieldExtract(u32data[offset / 4], int((offset * 8) & 16), 16), 0, 0, 0); +#endif + case 2: + return uvec4(u32data[offset / 4], 0, 0, 0); + case 3: + return uvec4(u64data[offset / 8], 0, 0); + case 4: + return u128data[offset / 16]; + } + return uvec4(0); +} + +void main() { + uvec3 pos = gl_GlobalInvocationID + origin; + pos.x 
<<= bytes_per_block_log2; + + // Read as soon as possible due to its latency + const uint swizzle = SwizzleOffset(pos.xy); + + const uint block_y = pos.y >> GOB_SIZE_Y_SHIFT; + + uint offset = 0; + offset += (pos.z >> block_depth) * slice_size; + offset += (pos.z & block_depth_mask) << (GOB_SIZE_SHIFT + block_height); + offset += (block_y >> block_height) * block_size; + offset += (block_y & block_height_mask) << GOB_SIZE_SHIFT; + offset += (pos.x >> GOB_SIZE_X_SHIFT) << x_shift; + offset += swizzle; + + const uvec4 texel = ReadTexel(offset); + const ivec3 coord = ivec3(gl_GlobalInvocationID) + destination; + imageStore(output_image, coord, texel); +} diff --git a/src/video_core/host_shaders/convert_depth_to_float.frag b/src/video_core/host_shaders/convert_depth_to_float.frag new file mode 100644 index 000000000..624c58509 --- /dev/null +++ b/src/video_core/host_shaders/convert_depth_to_float.frag @@ -0,0 +1,13 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#version 450 + +layout(binding = 0) uniform sampler2D depth_texture; +layout(location = 0) out float output_color; + +void main() { + ivec2 coord = ivec2(gl_FragCoord.xy); + output_color = texelFetch(depth_texture, coord, 0).r; +} diff --git a/src/video_core/host_shaders/convert_float_to_depth.frag b/src/video_core/host_shaders/convert_float_to_depth.frag new file mode 100644 index 000000000..d86c795f4 --- /dev/null +++ b/src/video_core/host_shaders/convert_float_to_depth.frag @@ -0,0 +1,13 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#version 450 + +layout(binding = 0) uniform sampler2D color_texture; + +void main() { + ivec2 coord = ivec2(gl_FragCoord.xy); + float color = texelFetch(color_texture, coord, 0).r; + gl_FragDepth = color; +} diff --git a/src/video_core/host_shaders/full_screen_triangle.vert b/src/video_core/host_shaders/full_screen_triangle.vert new file mode 100644 index 000000000..452ad6502 --- /dev/null +++ b/src/video_core/host_shaders/full_screen_triangle.vert @@ -0,0 +1,29 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#version 450 + +#ifdef VULKAN +#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants { +#define END_PUSH_CONSTANTS }; +#define UNIFORM(n) +#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv +#define BEGIN_PUSH_CONSTANTS +#define END_PUSH_CONSTANTS +#define UNIFORM(n) layout (location = n) uniform +#endif + +BEGIN_PUSH_CONSTANTS +UNIFORM(0) vec2 tex_scale; +UNIFORM(1) vec2 tex_offset; +END_PUSH_CONSTANTS + +layout(location = 0) out vec2 texcoord; + +void main() { + float x = float((gl_VertexIndex & 1) << 2); + float y = float((gl_VertexIndex & 2) << 1); + gl_Position = vec4(x - 1.0, y - 1.0, 0.0, 1.0); + texcoord = fma(vec2(x, y) / 2.0, tex_scale, tex_offset); +} diff --git a/src/video_core/host_shaders/opengl_copy_bc4.comp b/src/video_core/host_shaders/opengl_copy_bc4.comp new file mode 100644 index 000000000..7b8e20fbe --- /dev/null +++ b/src/video_core/host_shaders/opengl_copy_bc4.comp @@ -0,0 +1,70 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
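Both unswizzle shaders above compute the same block-linear address: an intra-GOB swizzle looked up from the 64x8 table, plus block offsets derived from block_height (and block_depth in the 3D variant). A CPU mirror of the 2D case is handy for validating the shader; a sketch whose parameters follow the push constants above, with x already scaled to bytes:

    #include <cstdint>

    uint32_t BlockLinearOffset2D(uint32_t x_bytes, uint32_t y, uint32_t z,
                                 uint32_t layer_stride, uint32_t block_size,
                                 uint32_t block_height, uint32_t block_height_mask,
                                 uint32_t x_shift, const uint32_t* swizzle_table) {
        constexpr uint32_t GOB_SIZE_X_SHIFT = 6; // 64-byte GOB rows
        constexpr uint32_t GOB_SIZE_Y_SHIFT = 3; // 8 rows per GOB
        constexpr uint32_t GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT;
        const uint32_t swizzle = swizzle_table[(y % 8) * 64 + (x_bytes % 64)];
        const uint32_t block_y = y >> GOB_SIZE_Y_SHIFT;
        uint32_t offset = z * layer_stride;
        offset += (block_y >> block_height) * block_size;          // row of blocks
        offset += (block_y & block_height_mask) << GOB_SIZE_SHIFT; // GOB within block
        offset += (x_bytes >> GOB_SIZE_X_SHIFT) << x_shift;        // block column
        return offset + swizzle;
    }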
+ +#version 430 core +#extension GL_ARB_gpu_shader_int64 : require + +layout (local_size_x = 4, local_size_y = 4) in; + +layout(binding = 0, rg32ui) readonly uniform uimage3D bc4_input; +layout(binding = 1, rgba8ui) writeonly uniform uimage3D bc4_output; + +layout(location = 0) uniform uvec3 src_offset; +layout(location = 1) uniform uvec3 dst_offset; + +// https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_texture_compression_rgtc.txt +uint DecompressBlock(uint64_t bits, uvec2 coord) { + const uint code_offset = 16 + 3 * (4 * coord.y + coord.x); + const uint code = uint(bits >> code_offset) & 7; + const uint red0 = uint(bits >> 0) & 0xff; + const uint red1 = uint(bits >> 8) & 0xff; + if (red0 > red1) { + switch (code) { + case 0: + return red0; + case 1: + return red1; + case 2: + return (6 * red0 + 1 * red1) / 7; + case 3: + return (5 * red0 + 2 * red1) / 7; + case 4: + return (4 * red0 + 3 * red1) / 7; + case 5: + return (3 * red0 + 4 * red1) / 7; + case 6: + return (2 * red0 + 5 * red1) / 7; + case 7: + return (1 * red0 + 6 * red1) / 7; + } + } else { + switch (code) { + case 0: + return red0; + case 1: + return red1; + case 2: + return (4 * red0 + 1 * red1) / 5; + case 3: + return (3 * red0 + 2 * red1) / 5; + case 4: + return (2 * red0 + 3 * red1) / 5; + case 5: + return (1 * red0 + 4 * red1) / 5; + case 6: + return 0; + case 7: + return 0xff; + } + } + return 0; +} + +void main() { + uvec2 packed_bits = imageLoad(bc4_input, ivec3(gl_WorkGroupID + src_offset)).rg; + uint64_t bits = packUint2x32(packed_bits); + uint red = DecompressBlock(bits, gl_LocalInvocationID.xy); + uvec4 color = uvec4(red & 0xff, 0, 0, 0xff); + imageStore(bc4_output, ivec3(gl_GlobalInvocationID + dst_offset), color); +} diff --git a/src/video_core/host_shaders/opengl_present.frag b/src/video_core/host_shaders/opengl_present.frag index 8a4cb024b..84b818227 100644 --- a/src/video_core/host_shaders/opengl_present.frag +++ b/src/video_core/host_shaders/opengl_present.frag @@ -1,3 +1,7 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + #version 430 core layout (location = 0) in vec2 frag_tex_coord; diff --git a/src/video_core/host_shaders/opengl_present.vert b/src/video_core/host_shaders/opengl_present.vert index 2235d31a4..c3b5adbba 100644 --- a/src/video_core/host_shaders/opengl_present.vert +++ b/src/video_core/host_shaders/opengl_present.vert @@ -1,3 +1,7 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + #version 430 core out gl_PerVertex { diff --git a/src/video_core/host_shaders/pitch_unswizzle.comp b/src/video_core/host_shaders/pitch_unswizzle.comp new file mode 100644 index 000000000..cb48ec170 --- /dev/null +++ b/src/video_core/host_shaders/pitch_unswizzle.comp @@ -0,0 +1,86 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
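opengl_copy_bc4.comp above expands BC4 blocks in the standard RGTC way: an 8-entry palette selected by red0/red1 ordering, with a 3-bit code per texel. A CPU reference of the same palette math, useful for testing the shader (a sketch, not part of the commit):

    #include <cstdint>

    uint8_t DecodeBC4Texel(uint64_t bits, uint32_t x, uint32_t y) {
        const uint32_t code = static_cast<uint32_t>(bits >> (16 + 3 * (4 * y + x))) & 7;
        const uint32_t red0 = static_cast<uint32_t>(bits) & 0xff;
        const uint32_t red1 = static_cast<uint32_t>(bits >> 8) & 0xff;
        if (code == 0) return static_cast<uint8_t>(red0);
        if (code == 1) return static_cast<uint8_t>(red1);
        if (red0 > red1) {
            // Six interpolated values between the endpoints.
            return static_cast<uint8_t>(((8 - code) * red0 + (code - 1) * red1) / 7);
        }
        if (code == 6) return 0x00;
        if (code == 7) return 0xff;
        // Four interpolated values plus explicit 0 and 255.
        return static_cast<uint8_t>(((6 - code) * red0 + (code - 1) * red1) / 5);
    }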
+ +#version 430 + +#ifdef VULKAN + +#extension GL_EXT_shader_16bit_storage : require +#extension GL_EXT_shader_8bit_storage : require +#define HAS_EXTENDED_TYPES 1 +#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants { +#define END_PUSH_CONSTANTS }; +#define UNIFORM(n) +#define BINDING_INPUT_BUFFER 0 +#define BINDING_OUTPUT_IMAGE 1 + +#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv + +#extension GL_NV_gpu_shader5 : enable +#ifdef GL_NV_gpu_shader5 +#define HAS_EXTENDED_TYPES 1 +#else +#define HAS_EXTENDED_TYPES 0 +#endif +#define BEGIN_PUSH_CONSTANTS +#define END_PUSH_CONSTANTS +#define UNIFORM(n) layout (location = n) uniform +#define BINDING_INPUT_BUFFER 0 +#define BINDING_OUTPUT_IMAGE 0 + +#endif + +BEGIN_PUSH_CONSTANTS +UNIFORM(0) uvec2 origin; +UNIFORM(1) ivec2 destination; +UNIFORM(2) uint bytes_per_block; +UNIFORM(3) uint pitch; +END_PUSH_CONSTANTS + +#if HAS_EXTENDED_TYPES +layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU8 { uint8_t u8data[]; }; +layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU16 { uint16_t u16data[]; }; +#endif +layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU32 { uint u32data[]; }; +layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU64 { uvec2 u64data[]; }; +layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU128 { uvec4 u128data[]; }; + +layout(binding = BINDING_OUTPUT_IMAGE) writeonly uniform uimage2D output_image; + +layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in; + +uvec4 ReadTexel(uint offset) { + switch (bytes_per_block) { +#if HAS_EXTENDED_TYPES + case 1: + return uvec4(u8data[offset], 0, 0, 0); + case 2: + return uvec4(u16data[offset / 2], 0, 0, 0); +#else + case 1: + return uvec4(bitfieldExtract(u32data[offset / 4], int((offset * 8) & 24), 8), 0, 0, 0); + case 2: + return uvec4(bitfieldExtract(u32data[offset / 4], int((offset * 8) & 16), 16), 0, 0, 0); +#endif + case 4: + return uvec4(u32data[offset / 4], 0, 0, 0); + case 8: + return uvec4(u64data[offset / 8], 0, 0); + case 16: + return u128data[offset / 16]; + } + return uvec4(0); +} + +void main() { + uvec2 pos = gl_GlobalInvocationID.xy + origin; + + uint offset = 0; + offset += pos.x * bytes_per_block; + offset += pos.y * pitch; + + const uvec4 texel = ReadTexel(offset); + const ivec2 coord = ivec2(gl_GlobalInvocationID.xy) + destination; + imageStore(output_image, coord, texel); +} diff --git a/src/video_core/host_shaders/vulkan_blit_color_float.frag b/src/video_core/host_shaders/vulkan_blit_color_float.frag new file mode 100644 index 000000000..4a6aae410 --- /dev/null +++ b/src/video_core/host_shaders/vulkan_blit_color_float.frag @@ -0,0 +1,14 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#version 450 + +layout(binding = 0) uniform sampler2D tex; + +layout(location = 0) in vec2 texcoord; +layout(location = 0) out vec4 color; + +void main() { + color = textureLod(tex, texcoord, 0); +} diff --git a/src/video_core/host_shaders/vulkan_blit_depth_stencil.frag b/src/video_core/host_shaders/vulkan_blit_depth_stencil.frag new file mode 100644 index 000000000..19bb23a5a --- /dev/null +++ b/src/video_core/host_shaders/vulkan_blit_depth_stencil.frag @@ -0,0 +1,16 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
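pitch_unswizzle.comp above handles the trivial layout: pitch-linear surfaces, where a texel's byte offset is just y * pitch + x * bytes_per_block. The equivalent CPU-side formula, for contrast with the block-linear helper sketched earlier:

    #include <cstddef>
    #include <cstdint>

    // Pitch-linear: rows are laid out consecutively with a fixed byte stride.
    inline std::size_t PitchLinearOffset(uint32_t x, uint32_t y,
                                         uint32_t bytes_per_block, uint32_t pitch) {
        return std::size_t{y} * pitch + std::size_t{x} * bytes_per_block;
    }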
+ +#version 450 +#extension GL_ARB_shader_stencil_export : require + +layout(binding = 0) uniform sampler2D depth_tex; +layout(binding = 1) uniform isampler2D stencil_tex; + +layout(location = 0) in vec2 texcoord; + +void main() { + gl_FragDepth = textureLod(depth_tex, texcoord, 0).r; + gl_FragStencilRefARB = textureLod(stencil_tex, texcoord, 0).r; +} diff --git a/src/video_core/renderer_vulkan/shaders/blit.frag b/src/video_core/host_shaders/vulkan_present.frag index a06ecd24a..0979ff3e6 100644 --- a/src/video_core/renderer_vulkan/shaders/blit.frag +++ b/src/video_core/host_shaders/vulkan_present.frag @@ -2,15 +2,6 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -/* - * Build instructions: - * $ glslangValidator -V $THIS_FILE -o output.spv - * $ spirv-opt -O --strip-debug output.spv -o optimized.spv - * $ xxd -i optimized.spv - * - * Then copy that bytecode to the C++ file - */ - #version 460 core layout (location = 0) in vec2 frag_tex_coord; diff --git a/src/video_core/renderer_vulkan/shaders/blit.vert b/src/video_core/host_shaders/vulkan_present.vert index c64d9235a..00b868958 100644 --- a/src/video_core/renderer_vulkan/shaders/blit.vert +++ b/src/video_core/host_shaders/vulkan_present.vert @@ -2,15 +2,6 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -/* - * Build instructions: - * $ glslangValidator -V $THIS_FILE -o output.spv - * $ spirv-opt -O --strip-debug output.spv -o optimized.spv - * $ xxd -i optimized.spv - * - * Then copy that bytecode to the C++ file - */ - #version 460 core layout (location = 0) in vec2 vert_position; diff --git a/src/video_core/renderer_vulkan/shaders/quad_array.comp b/src/video_core/host_shaders/vulkan_quad_array.comp index 5a5703308..212f4e998 100644 --- a/src/video_core/renderer_vulkan/shaders/quad_array.comp +++ b/src/video_core/host_shaders/vulkan_quad_array.comp @@ -2,15 +2,6 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -/* - * Build instructions: - * $ glslangValidator -V $THIS_FILE -o output.spv - * $ spirv-opt -O --strip-debug output.spv -o optimized.spv - * $ xxd -i optimized.spv - * - * Then copy that bytecode to the C++ file - */ - #version 460 core layout (local_size_x = 1024) in; diff --git a/src/video_core/renderer_vulkan/shaders/quad_indexed.comp b/src/video_core/host_shaders/vulkan_quad_indexed.comp index 5a472ba9b..8655591d0 100644 --- a/src/video_core/renderer_vulkan/shaders/quad_indexed.comp +++ b/src/video_core/host_shaders/vulkan_quad_indexed.comp @@ -2,15 +2,6 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -/* - * Build instructions: - * $ glslangValidator -V quad_indexed.comp -o output.spv - * $ spirv-opt -O --strip-debug output.spv -o optimized.spv - * $ xxd -i optimized.spv - * - * Then copy that bytecode to the C++ file - */ - #version 460 core layout (local_size_x = 1024) in; diff --git a/src/video_core/renderer_vulkan/shaders/uint8.comp b/src/video_core/host_shaders/vulkan_uint8.comp index a320f3ae0..ad74d7af9 100644 --- a/src/video_core/renderer_vulkan/shaders/uint8.comp +++ b/src/video_core/host_shaders/vulkan_uint8.comp @@ -2,15 +2,6 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. 
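vulkan_blit_depth_stencil.frag above writes gl_FragStencilRefARB, which only translates to valid SPIR-V when the device exposes VK_EXT_shader_stencil_export, so the Vulkan backend has to check for that extension before taking this blit path. A sketch of the device query (the extension macro is the real Vulkan one; the surrounding helper is illustrative):

    #include <algorithm>
    #include <cstring>
    #include <vector>
    #include <vulkan/vulkan.h>

    bool SupportsStencilExport(const std::vector<VkExtensionProperties>& extensions) {
        return std::any_of(extensions.begin(), extensions.end(),
                           [](const VkExtensionProperties& prop) {
                               return std::strcmp(prop.extensionName,
                                   VK_EXT_SHADER_STENCIL_EXPORT_EXTENSION_NAME) == 0;
                           });
    }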
-/* - * Build instructions: - * $ glslangValidator -V $THIS_FILE -o output.spv - * $ spirv-opt -O --strip-debug output.spv -o optimized.spv - * $ xxd -i optimized.spv - * - * Then copy that bytecode to the C++ file - */ - #version 460 core #extension GL_EXT_shader_16bit_storage : require #extension GL_EXT_shader_8bit_storage : require diff --git a/src/video_core/macro/macro_hle.cpp b/src/video_core/macro/macro_hle.cpp index df00b57df..70ac7c620 100644 --- a/src/video_core/macro/macro_hle.cpp +++ b/src/video_core/macro/macro_hle.cpp @@ -85,7 +85,7 @@ constexpr std::array<std::pair<u64, HLEFunction>, 3> hle_funcs{{ {0x0217920100488FF7, &HLE_0217920100488FF7}, }}; -HLEMacro::HLEMacro(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {} +HLEMacro::HLEMacro(Engines::Maxwell3D& maxwell3d_) : maxwell3d{maxwell3d_} {} HLEMacro::~HLEMacro() = default; std::optional<std::unique_ptr<CachedMacro>> HLEMacro::GetHLEProgram(u64 hash) const { @@ -99,8 +99,8 @@ std::optional<std::unique_ptr<CachedMacro>> HLEMacro::GetHLEProgram(u64 hash) co HLEMacroImpl::~HLEMacroImpl() = default; -HLEMacroImpl::HLEMacroImpl(Engines::Maxwell3D& maxwell3d, HLEFunction func) - : maxwell3d(maxwell3d), func(func) {} +HLEMacroImpl::HLEMacroImpl(Engines::Maxwell3D& maxwell3d_, HLEFunction func_) + : maxwell3d{maxwell3d_}, func{func_} {} void HLEMacroImpl::Execute(const std::vector<u32>& parameters, u32 method) { func(maxwell3d, parameters); diff --git a/src/video_core/macro/macro_hle.h b/src/video_core/macro/macro_hle.h index 37af875a0..cb3bd1600 100644 --- a/src/video_core/macro/macro_hle.h +++ b/src/video_core/macro/macro_hle.h @@ -20,7 +20,7 @@ using HLEFunction = void (*)(Engines::Maxwell3D& maxwell3d, const std::vector<u3 class HLEMacro { public: - explicit HLEMacro(Engines::Maxwell3D& maxwell3d); + explicit HLEMacro(Engines::Maxwell3D& maxwell3d_); ~HLEMacro(); std::optional<std::unique_ptr<CachedMacro>> GetHLEProgram(u64 hash) const; diff --git a/src/video_core/macro/macro_interpreter.cpp b/src/video_core/macro/macro_interpreter.cpp index bd01fd1f2..8da26fd59 100644 --- a/src/video_core/macro/macro_interpreter.cpp +++ b/src/video_core/macro/macro_interpreter.cpp @@ -11,29 +11,29 @@ MICROPROFILE_DEFINE(MacroInterp, "GPU", "Execute macro interpreter", MP_RGB(128, 128, 192)); namespace Tegra { -MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d) - : MacroEngine::MacroEngine(maxwell3d), maxwell3d(maxwell3d) {} +MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d_) + : MacroEngine{maxwell3d_}, maxwell3d{maxwell3d_} {} std::unique_ptr<CachedMacro> MacroInterpreter::Compile(const std::vector<u32>& code) { return std::make_unique<MacroInterpreterImpl>(maxwell3d, code); } -MacroInterpreterImpl::MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d, - const std::vector<u32>& code) - : maxwell3d(maxwell3d), code(code) {} +MacroInterpreterImpl::MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d_, + const std::vector<u32>& code_) + : maxwell3d{maxwell3d_}, code{code_} {} -void MacroInterpreterImpl::Execute(const std::vector<u32>& parameters, u32 method) { +void MacroInterpreterImpl::Execute(const std::vector<u32>& params, u32 method) { MICROPROFILE_SCOPE(MacroInterp); Reset(); - registers[1] = parameters[0]; - num_parameters = parameters.size(); + registers[1] = params[0]; + num_parameters = params.size(); if (num_parameters > parameters_capacity) { parameters_capacity = num_parameters; - this->parameters = std::make_unique<u32[]>(num_parameters); + parameters = std::make_unique<u32[]>(num_parameters); } - 
std::memcpy(this->parameters.get(), parameters.data(), num_parameters * sizeof(u32)); + std::memcpy(parameters.get(), params.data(), num_parameters * sizeof(u32)); // Execute the code until we hit an exit condition. bool keep_executing = true; @@ -133,8 +133,7 @@ bool MacroInterpreterImpl::Step(bool is_delay_slot) { break; } default: - UNIMPLEMENTED_MSG("Unimplemented macro operation {}", - static_cast<u32>(opcode.operation.Value())); + UNIMPLEMENTED_MSG("Unimplemented macro operation {}", opcode.operation.Value()); } // An instruction with the Exit flag will not actually @@ -182,7 +181,7 @@ u32 MacroInterpreterImpl::GetALUResult(Macro::ALUOperation operation, u32 src_a, return ~(src_a & src_b); default: - UNIMPLEMENTED_MSG("Unimplemented ALU operation {}", static_cast<u32>(operation)); + UNIMPLEMENTED_MSG("Unimplemented ALU operation {}", operation); return 0; } } @@ -230,7 +229,7 @@ void MacroInterpreterImpl::ProcessResult(Macro::ResultOperation operation, u32 r Send((result >> 12) & 0b111111); break; default: - UNIMPLEMENTED_MSG("Unimplemented result operation {}", static_cast<u32>(operation)); + UNIMPLEMENTED_MSG("Unimplemented result operation {}", operation); } } diff --git a/src/video_core/macro/macro_interpreter.h b/src/video_core/macro/macro_interpreter.h index 90217fc89..d50c619ce 100644 --- a/src/video_core/macro/macro_interpreter.h +++ b/src/video_core/macro/macro_interpreter.h @@ -17,7 +17,7 @@ class Maxwell3D; class MacroInterpreter final : public MacroEngine { public: - explicit MacroInterpreter(Engines::Maxwell3D& maxwell3d); + explicit MacroInterpreter(Engines::Maxwell3D& maxwell3d_); protected: std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) override; @@ -28,8 +28,8 @@ private: class MacroInterpreterImpl : public CachedMacro { public: - MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code); - void Execute(const std::vector<u32>& parameters, u32 method) override; + explicit MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d_, const std::vector<u32>& code_); + void Execute(const std::vector<u32>& params, u32 method) override; private: /// Resets the execution engine state, zeroing registers, etc. @@ -38,9 +38,9 @@ private: /** * Executes a single macro instruction located at the current program counter. Returns whether * the interpreter should keep running. - * @param offset Offset to start execution at. + * * @param is_delay_slot Whether the current step is being executed due to a delay slot in a - * previous instruction. + * previous instruction. 
*/ bool Step(bool is_delay_slot); diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp index 954b87515..c6b2b2109 100644 --- a/src/video_core/macro/macro_jit_x64.cpp +++ b/src/video_core/macro/macro_jit_x64.cpp @@ -28,15 +28,15 @@ static const std::bitset<32> PERSISTENT_REGISTERS = Common::X64::BuildRegSet({ BRANCH_HOLDER, }); -MacroJITx64::MacroJITx64(Engines::Maxwell3D& maxwell3d) - : MacroEngine::MacroEngine(maxwell3d), maxwell3d(maxwell3d) {} +MacroJITx64::MacroJITx64(Engines::Maxwell3D& maxwell3d_) + : MacroEngine{maxwell3d_}, maxwell3d{maxwell3d_} {} std::unique_ptr<CachedMacro> MacroJITx64::Compile(const std::vector<u32>& code) { return std::make_unique<MacroJITx64Impl>(maxwell3d, code); } -MacroJITx64Impl::MacroJITx64Impl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code) - : Xbyak::CodeGenerator(MAX_CODE_SIZE), code(code), maxwell3d(maxwell3d) { +MacroJITx64Impl::MacroJITx64Impl(Engines::Maxwell3D& maxwell3d_, const std::vector<u32>& code_) + : CodeGenerator{MAX_CODE_SIZE}, code{code_}, maxwell3d{maxwell3d_} { Compile(); } @@ -165,8 +165,7 @@ void MacroJITx64Impl::Compile_ALU(Macro::Opcode opcode) { } break; default: - UNIMPLEMENTED_MSG("Unimplemented ALU operation {}", - static_cast<std::size_t>(opcode.alu_operation.Value())); + UNIMPLEMENTED_MSG("Unimplemented ALU operation {}", opcode.alu_operation.Value()); break; } Compile_ProcessResult(opcode.result_operation, opcode.dst); @@ -553,15 +552,15 @@ Xbyak::Reg32 MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg32 dst) { } void MacroJITx64Impl::Compile_ProcessResult(Macro::ResultOperation operation, u32 reg) { - const auto SetRegister = [this](u32 reg, const Xbyak::Reg32& result) { + const auto SetRegister = [this](u32 reg_index, const Xbyak::Reg32& result) { // Register 0 is supposed to always return 0. NOP is implemented as a store to the zero // register. 
- if (reg == 0) { + if (reg_index == 0) { return; } - mov(dword[STATE + offsetof(JITState, registers) + reg * sizeof(u32)], result); + mov(dword[STATE + offsetof(JITState, registers) + reg_index * sizeof(u32)], result); }; - const auto SetMethodAddress = [this](const Xbyak::Reg32& reg) { mov(METHOD_ADDRESS, reg); }; + const auto SetMethodAddress = [this](const Xbyak::Reg32& reg32) { mov(METHOD_ADDRESS, reg32); }; switch (operation) { case Macro::ResultOperation::IgnoreAndFetch: @@ -604,7 +603,7 @@ void MacroJITx64Impl::Compile_ProcessResult(Macro::ResultOperation operation, u3 Compile_Send(RESULT); break; default: - UNIMPLEMENTED_MSG("Unimplemented macro operation {}", static_cast<std::size_t>(operation)); + UNIMPLEMENTED_MSG("Unimplemented macro operation {}", operation); } } diff --git a/src/video_core/macro/macro_jit_x64.h b/src/video_core/macro/macro_jit_x64.h index a180e7428..7f50ac2f8 100644 --- a/src/video_core/macro/macro_jit_x64.h +++ b/src/video_core/macro/macro_jit_x64.h @@ -23,7 +23,7 @@ constexpr size_t MAX_CODE_SIZE = 0x10000; class MacroJITx64 final : public MacroEngine { public: - explicit MacroJITx64(Engines::Maxwell3D& maxwell3d); + explicit MacroJITx64(Engines::Maxwell3D& maxwell3d_); protected: std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) override; @@ -34,7 +34,7 @@ private: class MacroJITx64Impl : public Xbyak::CodeGenerator, public CachedMacro { public: - MacroJITx64Impl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code); + explicit MacroJITx64Impl(Engines::Maxwell3D& maxwell3d_, const std::vector<u32>& code_); ~MacroJITx64Impl(); void Execute(const std::vector<u32>& parameters, u32 method) override; diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp index 02cf53d15..65feff588 100644 --- a/src/video_core/memory_manager.cpp +++ b/src/video_core/memory_manager.cpp @@ -11,6 +11,7 @@ #include "video_core/gpu.h" #include "video_core/memory_manager.h" #include "video_core/rasterizer_interface.h" +#include "video_core/renderer_base.h" namespace Tegra { @@ -44,13 +45,22 @@ GPUVAddr MemoryManager::MapAllocate(VAddr cpu_addr, std::size_t size, std::size_ return Map(cpu_addr, *FindFreeRange(size, align), size); } +GPUVAddr MemoryManager::MapAllocate32(VAddr cpu_addr, std::size_t size) { + const std::optional<GPUVAddr> gpu_addr = FindFreeRange(size, 1, true); + ASSERT(gpu_addr); + return Map(cpu_addr, *gpu_addr, size); +} + void MemoryManager::Unmap(GPUVAddr gpu_addr, std::size_t size) { if (!size) { return; } // Flush and invalidate through the GPU interface, to be asynchronous if possible. - system.GPU().FlushAndInvalidateRegion(*GpuToCpuAddress(gpu_addr), size); + const std::optional<VAddr> cpu_addr = GpuToCpuAddress(gpu_addr); + ASSERT(cpu_addr); + + rasterizer->UnmapMemory(*cpu_addr, size); UpdateRange(gpu_addr, PageEntry::State::Unmapped, size); } @@ -108,7 +118,8 @@ void MemoryManager::SetPageEntry(GPUVAddr gpu_addr, PageEntry page_entry, std::s page_table[PageEntryIndex(gpu_addr)] = page_entry; } -std::optional<GPUVAddr> MemoryManager::FindFreeRange(std::size_t size, std::size_t align) const { +std::optional<GPUVAddr> MemoryManager::FindFreeRange(std::size_t size, std::size_t align, + bool start_32bit_address) const { if (!align) { align = page_size; } else { @@ -116,7 +127,7 @@ std::optional<GPUVAddr> MemoryManager::FindFreeRange(std::size_t size, std::size } u64 available_size{}; - GPUVAddr gpu_addr{address_space_start}; + GPUVAddr gpu_addr{start_32bit_address ? 
address_space_start_low : address_space_start}; while (gpu_addr + available_size < address_space_size) { if (GetPageEntry(gpu_addr + available_size).IsUnmapped()) { available_size += page_size; diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h index 53c8d122a..c35e57689 100644 --- a/src/video_core/memory_manager.h +++ b/src/video_core/memory_manager.h @@ -28,7 +28,7 @@ public: }; constexpr PageEntry() = default; - constexpr PageEntry(State state) : state{state} {} + constexpr PageEntry(State state_) : state{state_} {} constexpr PageEntry(VAddr addr) : state{static_cast<State>(addr >> ShiftBits)} {} [[nodiscard]] constexpr bool IsUnmapped() const { @@ -68,7 +68,7 @@ static_assert(sizeof(PageEntry) == 4, "PageEntry is too large"); class MemoryManager final { public: - explicit MemoryManager(Core::System& system); + explicit MemoryManager(Core::System& system_); ~MemoryManager(); /// Binds a renderer to the memory manager. @@ -116,6 +116,7 @@ public: [[nodiscard]] GPUVAddr Map(VAddr cpu_addr, GPUVAddr gpu_addr, std::size_t size); [[nodiscard]] GPUVAddr MapAllocate(VAddr cpu_addr, std::size_t size, std::size_t align); + [[nodiscard]] GPUVAddr MapAllocate32(VAddr cpu_addr, std::size_t size); [[nodiscard]] std::optional<GPUVAddr> AllocateFixed(GPUVAddr gpu_addr, std::size_t size); [[nodiscard]] GPUVAddr Allocate(std::size_t size, std::size_t align); void Unmap(GPUVAddr gpu_addr, std::size_t size); @@ -124,7 +125,8 @@ private: [[nodiscard]] PageEntry GetPageEntry(GPUVAddr gpu_addr) const; void SetPageEntry(GPUVAddr gpu_addr, PageEntry page_entry, std::size_t size = page_size); GPUVAddr UpdateRange(GPUVAddr gpu_addr, PageEntry page_entry, std::size_t size); - [[nodiscard]] std::optional<GPUVAddr> FindFreeRange(std::size_t size, std::size_t align) const; + [[nodiscard]] std::optional<GPUVAddr> FindFreeRange(std::size_t size, std::size_t align, + bool start_32bit_address = false) const; void TryLockPage(PageEntry page_entry, std::size_t size); void TryUnlockPage(PageEntry page_entry, std::size_t size); @@ -135,6 +137,7 @@ private: static constexpr u64 address_space_size = 1ULL << 40; static constexpr u64 address_space_start = 1ULL << 32; + static constexpr u64 address_space_start_low = 1ULL << 16; static constexpr u64 page_bits{16}; static constexpr u64 page_size{1 << page_bits}; static constexpr u64 page_mask{page_size - 1}; diff --git a/src/video_core/morton.cpp b/src/video_core/morton.cpp index 9da9fb4ff..e69de29bb 100644 --- a/src/video_core/morton.cpp +++ b/src/video_core/morton.cpp @@ -1,250 +0,0 @@ -// Copyright 2018 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include <array> -#include <cstring> -#include "common/assert.h" -#include "common/common_types.h" -#include "video_core/morton.h" -#include "video_core/surface.h" -#include "video_core/textures/decoders.h" - -namespace VideoCore { - -using Surface::GetBytesPerPixel; -using Surface::PixelFormat; - -using MortonCopyFn = void (*)(u32, u32, u32, u32, u32, u32, u8*, u8*); -using ConversionArray = std::array<MortonCopyFn, Surface::MaxPixelFormat>; - -template <bool morton_to_linear, PixelFormat format> -static void MortonCopy(u32 stride, u32 block_height, u32 height, u32 block_depth, u32 depth, - u32 tile_width_spacing, u8* buffer, u8* addr) { - constexpr u32 bytes_per_pixel = GetBytesPerPixel(format); - - // With the BCn formats (DXT and DXN), each 4x4 tile is swizzled instead of just individual - // pixel values. 
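Alongside the rasterizer-driven Unmap() above, the memory manager grows a second allocation window: MapAllocate32() starts its free-range search at address_space_start_low (1 << 16) instead of the regular base at 1 << 32, so the mappings it hands out sit at the bottom of the GPU address space. This is presumably for the new NVDEC/VIC command classes, whose command arguments carry 32-bit addresses; that rationale is an inference, not stated by the diff. Usage shape, following the header above:

    // Hypothetical call site: map a guest buffer where a 32-bit command
    // argument can address it (the fit is expected, not enforced here).
    const GPUVAddr gpu_addr = memory_manager.MapAllocate32(cpu_addr, size);
    ASSERT(gpu_addr + size <= (1ULL << 32));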
- constexpr u32 tile_size_x{GetDefaultBlockWidth(format)}; - constexpr u32 tile_size_y{GetDefaultBlockHeight(format)}; - - if constexpr (morton_to_linear) { - Tegra::Texture::UnswizzleTexture(buffer, addr, tile_size_x, tile_size_y, bytes_per_pixel, - stride, height, depth, block_height, block_depth, - tile_width_spacing); - } else { - Tegra::Texture::CopySwizzledData((stride + tile_size_x - 1) / tile_size_x, - (height + tile_size_y - 1) / tile_size_y, depth, - bytes_per_pixel, bytes_per_pixel, addr, buffer, false, - block_height, block_depth, tile_width_spacing); - } -} - -static constexpr ConversionArray morton_to_linear_fns = { - MortonCopy<true, PixelFormat::A8B8G8R8_UNORM>, - MortonCopy<true, PixelFormat::A8B8G8R8_SNORM>, - MortonCopy<true, PixelFormat::A8B8G8R8_SINT>, - MortonCopy<true, PixelFormat::A8B8G8R8_UINT>, - MortonCopy<true, PixelFormat::R5G6B5_UNORM>, - MortonCopy<true, PixelFormat::B5G6R5_UNORM>, - MortonCopy<true, PixelFormat::A1R5G5B5_UNORM>, - MortonCopy<true, PixelFormat::A2B10G10R10_UNORM>, - MortonCopy<true, PixelFormat::A2B10G10R10_UINT>, - MortonCopy<true, PixelFormat::A1B5G5R5_UNORM>, - MortonCopy<true, PixelFormat::R8_UNORM>, - MortonCopy<true, PixelFormat::R8_SNORM>, - MortonCopy<true, PixelFormat::R8_SINT>, - MortonCopy<true, PixelFormat::R8_UINT>, - MortonCopy<true, PixelFormat::R16G16B16A16_FLOAT>, - MortonCopy<true, PixelFormat::R16G16B16A16_UNORM>, - MortonCopy<true, PixelFormat::R16G16B16A16_SNORM>, - MortonCopy<true, PixelFormat::R16G16B16A16_SINT>, - MortonCopy<true, PixelFormat::R16G16B16A16_UINT>, - MortonCopy<true, PixelFormat::B10G11R11_FLOAT>, - MortonCopy<true, PixelFormat::R32G32B32A32_UINT>, - MortonCopy<true, PixelFormat::BC1_RGBA_UNORM>, - MortonCopy<true, PixelFormat::BC2_UNORM>, - MortonCopy<true, PixelFormat::BC3_UNORM>, - MortonCopy<true, PixelFormat::BC4_UNORM>, - MortonCopy<true, PixelFormat::BC4_SNORM>, - MortonCopy<true, PixelFormat::BC5_UNORM>, - MortonCopy<true, PixelFormat::BC5_SNORM>, - MortonCopy<true, PixelFormat::BC7_UNORM>, - MortonCopy<true, PixelFormat::BC6H_UFLOAT>, - MortonCopy<true, PixelFormat::BC6H_SFLOAT>, - MortonCopy<true, PixelFormat::ASTC_2D_4X4_UNORM>, - MortonCopy<true, PixelFormat::B8G8R8A8_UNORM>, - MortonCopy<true, PixelFormat::R32G32B32A32_FLOAT>, - MortonCopy<true, PixelFormat::R32G32B32A32_SINT>, - MortonCopy<true, PixelFormat::R32G32_FLOAT>, - MortonCopy<true, PixelFormat::R32G32_SINT>, - MortonCopy<true, PixelFormat::R32_FLOAT>, - MortonCopy<true, PixelFormat::R16_FLOAT>, - MortonCopy<true, PixelFormat::R16_UNORM>, - MortonCopy<true, PixelFormat::R16_SNORM>, - MortonCopy<true, PixelFormat::R16_UINT>, - MortonCopy<true, PixelFormat::R16_SINT>, - MortonCopy<true, PixelFormat::R16G16_UNORM>, - MortonCopy<true, PixelFormat::R16G16_FLOAT>, - MortonCopy<true, PixelFormat::R16G16_UINT>, - MortonCopy<true, PixelFormat::R16G16_SINT>, - MortonCopy<true, PixelFormat::R16G16_SNORM>, - MortonCopy<true, PixelFormat::R32G32B32_FLOAT>, - MortonCopy<true, PixelFormat::A8B8G8R8_SRGB>, - MortonCopy<true, PixelFormat::R8G8_UNORM>, - MortonCopy<true, PixelFormat::R8G8_SNORM>, - MortonCopy<true, PixelFormat::R8G8_SINT>, - MortonCopy<true, PixelFormat::R8G8_UINT>, - MortonCopy<true, PixelFormat::R32G32_UINT>, - MortonCopy<true, PixelFormat::R16G16B16X16_FLOAT>, - MortonCopy<true, PixelFormat::R32_UINT>, - MortonCopy<true, PixelFormat::R32_SINT>, - MortonCopy<true, PixelFormat::ASTC_2D_8X8_UNORM>, - MortonCopy<true, PixelFormat::ASTC_2D_8X5_UNORM>, - MortonCopy<true, PixelFormat::ASTC_2D_5X4_UNORM>, - MortonCopy<true, 
PixelFormat::B8G8R8A8_SRGB>,
-    MortonCopy<true, PixelFormat::BC1_RGBA_SRGB>,
-    MortonCopy<true, PixelFormat::BC2_SRGB>,
-    MortonCopy<true, PixelFormat::BC3_SRGB>,
-    MortonCopy<true, PixelFormat::BC7_SRGB>,
-    MortonCopy<true, PixelFormat::A4B4G4R4_UNORM>,
-    MortonCopy<true, PixelFormat::ASTC_2D_4X4_SRGB>,
-    MortonCopy<true, PixelFormat::ASTC_2D_8X8_SRGB>,
-    MortonCopy<true, PixelFormat::ASTC_2D_8X5_SRGB>,
-    MortonCopy<true, PixelFormat::ASTC_2D_5X4_SRGB>,
-    MortonCopy<true, PixelFormat::ASTC_2D_5X5_UNORM>,
-    MortonCopy<true, PixelFormat::ASTC_2D_5X5_SRGB>,
-    MortonCopy<true, PixelFormat::ASTC_2D_10X8_UNORM>,
-    MortonCopy<true, PixelFormat::ASTC_2D_10X8_SRGB>,
-    MortonCopy<true, PixelFormat::ASTC_2D_6X6_UNORM>,
-    MortonCopy<true, PixelFormat::ASTC_2D_6X6_SRGB>,
-    MortonCopy<true, PixelFormat::ASTC_2D_10X10_UNORM>,
-    MortonCopy<true, PixelFormat::ASTC_2D_10X10_SRGB>,
-    MortonCopy<true, PixelFormat::ASTC_2D_12X12_UNORM>,
-    MortonCopy<true, PixelFormat::ASTC_2D_12X12_SRGB>,
-    MortonCopy<true, PixelFormat::ASTC_2D_8X6_UNORM>,
-    MortonCopy<true, PixelFormat::ASTC_2D_8X6_SRGB>,
-    MortonCopy<true, PixelFormat::ASTC_2D_6X5_UNORM>,
-    MortonCopy<true, PixelFormat::ASTC_2D_6X5_SRGB>,
-    MortonCopy<true, PixelFormat::E5B9G9R9_FLOAT>,
-    MortonCopy<true, PixelFormat::D32_FLOAT>,
-    MortonCopy<true, PixelFormat::D16_UNORM>,
-    MortonCopy<true, PixelFormat::D24_UNORM_S8_UINT>,
-    MortonCopy<true, PixelFormat::S8_UINT_D24_UNORM>,
-    MortonCopy<true, PixelFormat::D32_FLOAT_S8_UINT>,
-};
-
-static constexpr ConversionArray linear_to_morton_fns = {
-    MortonCopy<false, PixelFormat::A8B8G8R8_UNORM>,
-    MortonCopy<false, PixelFormat::A8B8G8R8_SNORM>,
-    MortonCopy<false, PixelFormat::A8B8G8R8_SINT>,
-    MortonCopy<false, PixelFormat::A8B8G8R8_UINT>,
-    MortonCopy<false, PixelFormat::R5G6B5_UNORM>,
-    MortonCopy<false, PixelFormat::B5G6R5_UNORM>,
-    MortonCopy<false, PixelFormat::A1R5G5B5_UNORM>,
-    MortonCopy<false, PixelFormat::A2B10G10R10_UNORM>,
-    MortonCopy<false, PixelFormat::A2B10G10R10_UINT>,
-    MortonCopy<false, PixelFormat::A1B5G5R5_UNORM>,
-    MortonCopy<false, PixelFormat::R8_UNORM>,
-    MortonCopy<false, PixelFormat::R8_SNORM>,
-    MortonCopy<false, PixelFormat::R8_SINT>,
-    MortonCopy<false, PixelFormat::R8_UINT>,
-    MortonCopy<false, PixelFormat::R16G16B16A16_FLOAT>,
-    MortonCopy<false, PixelFormat::R16G16B16A16_SNORM>,
-    MortonCopy<false, PixelFormat::R16G16B16A16_SINT>,
-    MortonCopy<false, PixelFormat::R16G16B16A16_UNORM>,
-    MortonCopy<false, PixelFormat::R16G16B16A16_UINT>,
-    MortonCopy<false, PixelFormat::B10G11R11_FLOAT>,
-    MortonCopy<false, PixelFormat::R32G32B32A32_UINT>,
-    MortonCopy<false, PixelFormat::BC1_RGBA_UNORM>,
-    MortonCopy<false, PixelFormat::BC2_UNORM>,
-    MortonCopy<false, PixelFormat::BC3_UNORM>,
-    MortonCopy<false, PixelFormat::BC4_UNORM>,
-    MortonCopy<false, PixelFormat::BC4_SNORM>,
-    MortonCopy<false, PixelFormat::BC5_UNORM>,
-    MortonCopy<false, PixelFormat::BC5_SNORM>,
-    MortonCopy<false, PixelFormat::BC7_UNORM>,
-    MortonCopy<false, PixelFormat::BC6H_UFLOAT>,
-    MortonCopy<false, PixelFormat::BC6H_SFLOAT>,
-    // TODO(Subv): Swizzling ASTC formats are not supported
-    nullptr,
-    MortonCopy<false, PixelFormat::B8G8R8A8_UNORM>,
-    MortonCopy<false, PixelFormat::R32G32B32A32_FLOAT>,
-    MortonCopy<false, PixelFormat::R32G32B32A32_SINT>,
-    MortonCopy<false, PixelFormat::R32G32_FLOAT>,
-    MortonCopy<false, PixelFormat::R32G32_SINT>,
-    MortonCopy<false, PixelFormat::R32_FLOAT>,
-    MortonCopy<false, PixelFormat::R16_FLOAT>,
-    MortonCopy<false, PixelFormat::R16_UNORM>,
-    MortonCopy<false, PixelFormat::R16_SNORM>,
-    MortonCopy<false, PixelFormat::R16_UINT>,
-    MortonCopy<false, PixelFormat::R16_SINT>,
-    MortonCopy<false, PixelFormat::R16G16_UNORM>,
-    MortonCopy<false, PixelFormat::R16G16_FLOAT>,
-    MortonCopy<false, PixelFormat::R16G16_UINT>,
-    MortonCopy<false, PixelFormat::R16G16_SINT>,
-    MortonCopy<false, PixelFormat::R16G16_SNORM>,
-    MortonCopy<false, PixelFormat::R32G32B32_FLOAT>,
-    MortonCopy<false, PixelFormat::A8B8G8R8_SRGB>,
-    MortonCopy<false, PixelFormat::R8G8_UNORM>,
-    MortonCopy<false, PixelFormat::R8G8_SNORM>,
-    MortonCopy<false, PixelFormat::R8G8_SINT>,
-    MortonCopy<false, PixelFormat::R8G8_UINT>,
-    MortonCopy<false, PixelFormat::R32G32_UINT>,
-    MortonCopy<false, PixelFormat::R16G16B16X16_FLOAT>,
-    MortonCopy<false, PixelFormat::R32_UINT>,
-    MortonCopy<false, PixelFormat::R32_SINT>,
-    nullptr,
-    nullptr,
-    nullptr,
-    MortonCopy<false, PixelFormat::B8G8R8A8_SRGB>,
-    MortonCopy<false, PixelFormat::BC1_RGBA_SRGB>,
-    MortonCopy<false, PixelFormat::BC2_SRGB>,
-    MortonCopy<false, PixelFormat::BC3_SRGB>,
-    MortonCopy<false, PixelFormat::BC7_SRGB>,
-    MortonCopy<false, PixelFormat::A4B4G4R4_UNORM>,
-    nullptr,
-    nullptr,
-    nullptr,
-    nullptr,
-    nullptr,
-    nullptr,
-    nullptr,
-    nullptr,
-    nullptr,
-    nullptr,
-    nullptr,
-    nullptr,
-    nullptr,
-    nullptr,
-    nullptr,
-    nullptr,
-    nullptr,
-    nullptr,
-    MortonCopy<false, PixelFormat::E5B9G9R9_FLOAT>,
-    MortonCopy<false, PixelFormat::D32_FLOAT>,
-    MortonCopy<false, PixelFormat::D16_UNORM>,
-    MortonCopy<false, PixelFormat::D24_UNORM_S8_UINT>,
-    MortonCopy<false, PixelFormat::S8_UINT_D24_UNORM>,
-    MortonCopy<false, PixelFormat::D32_FLOAT_S8_UINT>,
-};
-
-static MortonCopyFn GetSwizzleFunction(MortonSwizzleMode mode, Surface::PixelFormat format) {
-    switch (mode) {
-    case MortonSwizzleMode::MortonToLinear:
-        return morton_to_linear_fns[static_cast<std::size_t>(format)];
-    case MortonSwizzleMode::LinearToMorton:
-        return linear_to_morton_fns[static_cast<std::size_t>(format)];
-    }
-    UNREACHABLE();
-    return morton_to_linear_fns[static_cast<std::size_t>(format)];
-}
-
-void MortonSwizzle(MortonSwizzleMode mode, Surface::PixelFormat format, u32 stride,
-                   u32 block_height, u32 height, u32 block_depth, u32 depth,
-                   u32 tile_width_spacing, u8* buffer, u8* addr) {
-    GetSwizzleFunction(mode, format)(stride, block_height, height, block_depth, depth,
-                                     tile_width_spacing, buffer, addr);
-}
-
-} // namespace VideoCore
diff --git a/src/video_core/morton.h b/src/video_core/morton.h
index b714a7e3f..e69de29bb 100644
--- a/src/video_core/morton.h
+++ b/src/video_core/morton.h
@@ -1,18 +0,0 @@
-// Copyright 2018 yuzu Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
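[Editor's note] The two ConversionArray tables deleted above give every PixelFormat a pre-instantiated MortonCopy specialization, so GetSwizzleFunction can dispatch with a single array index; this whole path is removed because the new texture cache swizzles through other means. A minimal sketch of the enum-indexed function-table pattern (Format, SwizzleFn and the copy body are illustrative, not yuzu API):

#include <array>
#include <cstddef>

enum class Format : std::size_t { R8, R16, MaxFormat };

using SwizzleFn = void (*)(const unsigned char* src, unsigned char* dst);

// One template instantiation per format, collected in a constexpr table.
template <Format F>
void SwizzleImpl(const unsigned char* src, unsigned char* dst) {
    dst[0] = src[0]; // a real per-format tiled copy would go here
}

constexpr std::array<SwizzleFn, static_cast<std::size_t>(Format::MaxFormat)> TABLE{
    SwizzleImpl<Format::R8>,
    SwizzleImpl<Format::R16>,
};

SwizzleFn GetSwizzler(Format format) {
    // O(1) dispatch: the enum value indexes the table directly.
    return TABLE[static_cast<std::size_t>(format)];
}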
- -#pragma once - -#include "common/common_types.h" -#include "video_core/surface.h" - -namespace VideoCore { - -enum class MortonSwizzleMode { MortonToLinear, LinearToMorton }; - -void MortonSwizzle(MortonSwizzleMode mode, VideoCore::Surface::PixelFormat format, u32 stride, - u32 block_height, u32 height, u32 block_depth, u32 depth, u32 tile_width_spacing, - u8* buffer, u8* addr); - -} // namespace VideoCore diff --git a/src/video_core/query_cache.h b/src/video_core/query_cache.h index fc54ca0ef..203f2af05 100644 --- a/src/video_core/query_cache.h +++ b/src/video_core/query_cache.h @@ -28,8 +28,8 @@ namespace VideoCommon { template <class QueryCache, class HostCounter> class CounterStreamBase { public: - explicit CounterStreamBase(QueryCache& cache, VideoCore::QueryType type) - : cache{cache}, type{type} {} + explicit CounterStreamBase(QueryCache& cache_, VideoCore::QueryType type_) + : cache{cache_}, type{type_} {} /// Updates the state of the stream, enabling or disabling as needed. void Update(bool enabled) { @@ -334,8 +334,8 @@ private: template <class HostCounter> class CachedQueryBase { public: - explicit CachedQueryBase(VAddr cpu_addr, u8* host_ptr) - : cpu_addr{cpu_addr}, host_ptr{host_ptr} {} + explicit CachedQueryBase(VAddr cpu_addr_, u8* host_ptr_) + : cpu_addr{cpu_addr_}, host_ptr{host_ptr_} {} virtual ~CachedQueryBase() = default; CachedQueryBase(CachedQueryBase&&) noexcept = default; diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h index b3e0919f8..0cb0f387d 100644 --- a/src/video_core/rasterizer_interface.h +++ b/src/video_core/rasterizer_interface.h @@ -32,7 +32,7 @@ using DiskResourceLoadCallback = std::function<void(LoadCallbackStage, std::size class RasterizerInterface { public: - virtual ~RasterizerInterface() {} + virtual ~RasterizerInterface() = default; /// Dispatches a draw invocation virtual void Draw(bool is_indexed, bool is_instanced) = 0; @@ -76,6 +76,9 @@ public: /// Sync memory between guest and host. virtual void SyncGuestHost() = 0; + /// Unmap memory range + virtual void UnmapMemory(VAddr addr, u64 size) = 0; + /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory /// and invalidated virtual void FlushAndInvalidateRegion(VAddr addr, u64 size) = 0; @@ -83,6 +86,12 @@ public: /// Notify the host renderer to wait for previous primitive and compute operations. virtual void WaitForIdle() = 0; + /// Notify the host renderer to wait for reads and writes to render targets and flush caches. + virtual void FragmentBarrier() = 0; + + /// Notify the host renderer to make available previous render target writes. + virtual void TiledCacheBarrier() = 0; + /// Notify the rasterizer to send all written commands to the host GPU. 
virtual void FlushCommands() = 0; @@ -90,15 +99,15 @@ public: virtual void TickFrame() = 0; /// Attempt to use a faster method to perform a surface copy - virtual bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src, - const Tegra::Engines::Fermi2D::Regs::Surface& dst, - const Tegra::Engines::Fermi2D::Config& copy_config) { + [[nodiscard]] virtual bool AccelerateSurfaceCopy( + const Tegra::Engines::Fermi2D::Surface& src, const Tegra::Engines::Fermi2D::Surface& dst, + const Tegra::Engines::Fermi2D::Config& copy_config) { return false; } /// Attempt to use a faster method to display the framebuffer to screen - virtual bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr, - u32 pixel_stride) { + [[nodiscard]] virtual bool AccelerateDisplay(const Tegra::FramebufferConfig& config, + VAddr framebuffer_addr, u32 pixel_stride) { return false; } @@ -110,12 +119,12 @@ public: const DiskResourceLoadCallback& callback) {} /// Grant access to the Guest Driver Profile for recording/obtaining info on the guest driver. - GuestDriverProfile& AccessGuestDriverProfile() { + [[nodiscard]] GuestDriverProfile& AccessGuestDriverProfile() { return guest_driver_profile; } /// Grant access to the Guest Driver Profile for recording/obtaining info on the guest driver. - const GuestDriverProfile& AccessGuestDriverProfile() const { + [[nodiscard]] const GuestDriverProfile& AccessGuestDriverProfile() const { return guest_driver_profile; } diff --git a/src/video_core/renderer_base.h b/src/video_core/renderer_base.h index 5c650808b..51dde8eb5 100644 --- a/src/video_core/renderer_base.h +++ b/src/video_core/renderer_base.h @@ -38,7 +38,7 @@ public: virtual ~RendererBase(); /// Initialize the renderer - virtual bool Init() = 0; + [[nodiscard]] virtual bool Init() = 0; /// Shutdown the renderer virtual void ShutDown() = 0; @@ -49,43 +49,43 @@ public: // Getter/setter functions: // ------------------------ - f32 GetCurrentFPS() const { + [[nodiscard]] f32 GetCurrentFPS() const { return m_current_fps; } - int GetCurrentFrame() const { + [[nodiscard]] int GetCurrentFrame() const { return m_current_frame; } - RasterizerInterface& Rasterizer() { + [[nodiscard]] RasterizerInterface& Rasterizer() { return *rasterizer; } - const RasterizerInterface& Rasterizer() const { + [[nodiscard]] const RasterizerInterface& Rasterizer() const { return *rasterizer; } - Core::Frontend::GraphicsContext& Context() { + [[nodiscard]] Core::Frontend::GraphicsContext& Context() { return *context; } - const Core::Frontend::GraphicsContext& Context() const { + [[nodiscard]] const Core::Frontend::GraphicsContext& Context() const { return *context; } - Core::Frontend::EmuWindow& GetRenderWindow() { + [[nodiscard]] Core::Frontend::EmuWindow& GetRenderWindow() { return render_window; } - const Core::Frontend::EmuWindow& GetRenderWindow() const { + [[nodiscard]] const Core::Frontend::EmuWindow& GetRenderWindow() const { return render_window; } - RendererSettings& Settings() { + [[nodiscard]] RendererSettings& Settings() { return renderer_settings; } - const RendererSettings& Settings() const { + [[nodiscard]] const RendererSettings& Settings() const { return renderer_settings; } diff --git a/src/video_core/renderer_opengl/gl_arb_decompiler.cpp b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp index b7e9ed2e9..3e4d88c30 100644 --- a/src/video_core/renderer_opengl/gl_arb_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_arb_decompiler.cpp @@ -39,8 +39,8 @@ using Operation = const OperationNode&; 
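[Editor's note] The rasterizer and renderer interfaces above gain [[nodiscard]] on accessors and on results such as Init() and AccelerateDisplay(); with it, silently dropping a success flag becomes a compiler diagnostic rather than a latent bug. A tiny self-contained illustration:

[[nodiscard]] bool Init() { return true; } // stand-in for a renderer's Init()

int main() {
    Init();                // compilers warn: discarding a [[nodiscard]] result
    return Init() ? 0 : 1; // fine: the result is consumed
}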
constexpr std::array INTERNAL_FLAG_NAMES = {"ZERO", "SIGN", "CARRY", "OVERFLOW"}; char Swizzle(std::size_t component) { - ASSERT(component < 4); - return component["xyzw"]; + static constexpr std::string_view SWIZZLE{"xyzw"}; + return SWIZZLE.at(component); } constexpr bool IsGenericAttribute(Attribute::Index index) { @@ -71,7 +71,7 @@ std::string_view GetInputFlags(PixelImap attribute) { case PixelImap::Unused: break; } - UNIMPLEMENTED_MSG("Unknown attribute usage index={}", static_cast<int>(attribute)); + UNIMPLEMENTED_MSG("Unknown attribute usage index={}", attribute); return {}; } @@ -123,7 +123,7 @@ std::string_view PrimitiveDescription(Tegra::Engines::Maxwell3D::Regs::Primitive case Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology::TriangleStripAdjacency: return "TRIANGLES_ADJACENCY"; default: - UNIMPLEMENTED_MSG("topology={}", static_cast<int>(topology)); + UNIMPLEMENTED_MSG("topology={}", topology); return "POINTS"; } } @@ -137,7 +137,7 @@ std::string_view TopologyName(Tegra::Shader::OutputTopology topology) { case Tegra::Shader::OutputTopology::TriangleStrip: return "TRIANGLE_STRIP"; default: - UNIMPLEMENTED_MSG("Unknown output topology: {}", static_cast<u32>(topology)); + UNIMPLEMENTED_MSG("Unknown output topology: {}", topology); return "points"; } } @@ -187,8 +187,8 @@ std::string TextureType(const MetaTexture& meta) { class ARBDecompiler final { public: - explicit ARBDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry, - ShaderType stage, std::string_view identifier); + explicit ARBDecompiler(const Device& device_, const ShaderIR& ir_, const Registry& registry_, + ShaderType stage_, std::string_view identifier); std::string Code() const { return shader_source; @@ -224,7 +224,7 @@ private: std::string Visit(const Node& node); - std::pair<std::string, std::size_t> BuildCoords(Operation); + std::tuple<std::string, std::string, std::size_t> BuildCoords(Operation); std::string BuildAoffi(Operation); std::string GlobalMemoryPointer(const GmemNode& gmem); void Exit(); @@ -376,9 +376,11 @@ private: std::string temporary = AllocTemporary(); std::string address; std::string_view opname; + bool robust = false; if (const auto gmem = std::get_if<GmemNode>(&*operation[0])) { address = GlobalMemoryPointer(*gmem); opname = "ATOM"; + robust = true; } else if (const auto smem = std::get_if<SmemNode>(&*operation[0])) { address = fmt::format("shared_mem[{}]", Visit(smem->GetAddress())); opname = "ATOMS"; @@ -386,7 +388,15 @@ private: UNREACHABLE(); return "{0, 0, 0, 0}"; } + if (robust) { + AddLine("IF NE.x;"); + } AddLine("{}.{}.{} {}, {}, {};", opname, op, type, temporary, Visit(operation[1]), address); + if (robust) { + AddLine("ELSE;"); + AddLine("MOV.S {}, 0;", temporary); + AddLine("ENDIF;"); + } return temporary; } @@ -792,9 +802,9 @@ private: }; }; -ARBDecompiler::ARBDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry, - ShaderType stage, std::string_view identifier) - : device{device}, ir{ir}, registry{registry}, stage{stage} { +ARBDecompiler::ARBDecompiler(const Device& device_, const ShaderIR& ir_, const Registry& registry_, + ShaderType stage_, std::string_view identifier) + : device{device_}, ir{ir_}, registry{registry_}, stage{stage_} { DefineGlobalMemory(); AddLine("TEMP RC;"); @@ -980,10 +990,9 @@ void ARBDecompiler::DeclareLocalMemory() { } void ARBDecompiler::DeclareGlobalMemory() { - const std::size_t num_entries = ir.GetGlobalMemory().size(); + const size_t num_entries = ir.GetGlobalMemory().size(); if (num_entries > 0) { - 
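[Editor's note] On the Swizzle() change at the top of this file: the old `return component["xyzw"];` is legal C++ (it is just `"xyzw"[component]` written backwards) but performs no range check once the ASSERT compiles out. Routing the lookup through std::string_view::at makes the out-of-range case throw instead of reading past the literal:

#include <cstddef>
#include <cstdio>
#include <stdexcept>
#include <string_view>

char Swizzle(std::size_t component) {
    static constexpr std::string_view SWIZZLE{"xyzw"};
    return SWIZZLE.at(component); // throws instead of reading out of bounds
}

int main() {
    std::printf("%c\n", Swizzle(2)); // prints 'z'
    try {
        Swizzle(4); // the old indexing would silently read past "xyzw"
    } catch (const std::out_of_range&) {
        std::puts("out-of-range swizzle rejected");
    }
}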
const std::size_t num_vectors = Common::AlignUp(num_entries, 2) / 2; - AddLine("PARAM c[{}] = {{ program.local[0..{}] }};", num_vectors, num_vectors - 1); + AddLine("PARAM c[{}] = {{ program.local[0..{}] }};", num_entries, num_entries - 1); } } @@ -1125,44 +1134,44 @@ void ARBDecompiler::VisitAST(const ASTNode& node) { for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) { VisitAST(current); } - } else if (const auto ast = std::get_if<ASTIfThen>(&*node->GetInnerData())) { - const std::string condition = VisitExpression(ast->condition); + } else if (const auto if_then = std::get_if<ASTIfThen>(&*node->GetInnerData())) { + const std::string condition = VisitExpression(if_then->condition); ResetTemporaries(); AddLine("MOVC.U RC.x, {};", condition); AddLine("IF NE.x;"); - for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) { + for (ASTNode current = if_then->nodes.GetFirst(); current; current = current->GetNext()) { VisitAST(current); } AddLine("ENDIF;"); - } else if (const auto ast = std::get_if<ASTIfElse>(&*node->GetInnerData())) { + } else if (const auto if_else = std::get_if<ASTIfElse>(&*node->GetInnerData())) { AddLine("ELSE;"); - for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) { + for (ASTNode current = if_else->nodes.GetFirst(); current; current = current->GetNext()) { VisitAST(current); } - } else if (const auto ast = std::get_if<ASTBlockDecoded>(&*node->GetInnerData())) { - VisitBlock(ast->nodes); - } else if (const auto ast = std::get_if<ASTVarSet>(&*node->GetInnerData())) { - AddLine("MOV.U F{}, {};", ast->index, VisitExpression(ast->condition)); + } else if (const auto decoded = std::get_if<ASTBlockDecoded>(&*node->GetInnerData())) { + VisitBlock(decoded->nodes); + } else if (const auto var_set = std::get_if<ASTVarSet>(&*node->GetInnerData())) { + AddLine("MOV.U F{}, {};", var_set->index, VisitExpression(var_set->condition)); ResetTemporaries(); - } else if (const auto ast = std::get_if<ASTDoWhile>(&*node->GetInnerData())) { - const std::string condition = VisitExpression(ast->condition); + } else if (const auto do_while = std::get_if<ASTDoWhile>(&*node->GetInnerData())) { + const std::string condition = VisitExpression(do_while->condition); ResetTemporaries(); AddLine("REP;"); - for (ASTNode current = ast->nodes.GetFirst(); current; current = current->GetNext()) { + for (ASTNode current = do_while->nodes.GetFirst(); current; current = current->GetNext()) { VisitAST(current); } AddLine("MOVC.U RC.x, {};", condition); AddLine("BRK (NE.x);"); AddLine("ENDREP;"); - } else if (const auto ast = std::get_if<ASTReturn>(&*node->GetInnerData())) { - const bool is_true = ExprIsTrue(ast->condition); + } else if (const auto ast_return = std::get_if<ASTReturn>(&*node->GetInnerData())) { + const bool is_true = ExprIsTrue(ast_return->condition); if (!is_true) { - AddLine("MOVC.U RC.x, {};", VisitExpression(ast->condition)); + AddLine("MOVC.U RC.x, {};", VisitExpression(ast_return->condition)); AddLine("IF NE.x;"); ResetTemporaries(); } - if (ast->kills) { + if (ast_return->kills) { AddLine("KIL TR;"); } else { Exit(); @@ -1170,11 +1179,11 @@ void ARBDecompiler::VisitAST(const ASTNode& node) { if (!is_true) { AddLine("ENDIF;"); } - } else if (const auto ast = std::get_if<ASTBreak>(&*node->GetInnerData())) { - if (ExprIsTrue(ast->condition)) { + } else if (const auto ast_break = std::get_if<ASTBreak>(&*node->GetInnerData())) { + if (ExprIsTrue(ast_break->condition)) { AddLine("BRK;"); } else { - 
AddLine("MOVC.U RC.x, {};", VisitExpression(ast->condition)); + AddLine("MOVC.U RC.x, {};", VisitExpression(ast_break->condition)); AddLine("BRK (NE.x);"); ResetTemporaries(); } @@ -1342,7 +1351,7 @@ std::string ARBDecompiler::Visit(const Node& node) { GetGenericAttributeIndex(index), swizzle); } } - UNIMPLEMENTED_MSG("Unimplemented input attribute={}", static_cast<int>(index)); + UNIMPLEMENTED_MSG("Unimplemented input attribute={}", index); break; } return "{0, 0, 0, 0}.x"; @@ -1363,7 +1372,8 @@ std::string ARBDecompiler::Visit(const Node& node) { if (const auto gmem = std::get_if<GmemNode>(&*node)) { std::string temporary = AllocTemporary(); - AddLine("LOAD.U32 {}, {};", temporary, GlobalMemoryPointer(*gmem)); + AddLine("MOV {}, 0;", temporary); + AddLine("LOAD.U32 {} (NE.x), {};", temporary, GlobalMemoryPointer(*gmem)); return temporary; } @@ -1406,12 +1416,12 @@ std::string ARBDecompiler::Visit(const Node& node) { return {}; } -std::pair<std::string, std::size_t> ARBDecompiler::BuildCoords(Operation operation) { +std::tuple<std::string, std::string, std::size_t> ARBDecompiler::BuildCoords(Operation operation) { const auto& meta = std::get<MetaTexture>(operation.GetMeta()); UNIMPLEMENTED_IF(meta.sampler.is_indexed); - UNIMPLEMENTED_IF(meta.sampler.is_shadow && meta.sampler.is_array && - meta.sampler.type == Tegra::Shader::TextureType::TextureCube); + const bool is_extended = meta.sampler.is_shadow && meta.sampler.is_array && + meta.sampler.type == Tegra::Shader::TextureType::TextureCube; const std::size_t count = operation.GetOperandsCount(); std::string temporary = AllocVectorTemporary(); std::size_t i = 0; @@ -1419,12 +1429,21 @@ std::pair<std::string, std::size_t> ARBDecompiler::BuildCoords(Operation operati AddLine("MOV.F {}.{}, {};", temporary, Swizzle(i), Visit(operation[i])); } if (meta.sampler.is_array) { - AddLine("I2F.S {}.{}, {};", temporary, Swizzle(i++), Visit(meta.array)); + AddLine("I2F.S {}.{}, {};", temporary, Swizzle(i), Visit(meta.array)); + ++i; } if (meta.sampler.is_shadow) { - AddLine("MOV.F {}.{}, {};", temporary, Swizzle(i++), Visit(meta.depth_compare)); + std::string compare = Visit(meta.depth_compare); + if (is_extended) { + ASSERT(i == 4); + std::string extra_coord = AllocVectorTemporary(); + AddLine("MOV.F {}.x, {};", extra_coord, compare); + return {fmt::format("{}, {}", temporary, extra_coord), extra_coord, 0}; + } + AddLine("MOV.F {}.{}, {};", temporary, Swizzle(i), compare); + ++i; } - return {std::move(temporary), i}; + return {temporary, temporary, i}; } std::string ARBDecompiler::BuildAoffi(Operation operation) { @@ -1441,18 +1460,21 @@ std::string ARBDecompiler::BuildAoffi(Operation operation) { } std::string ARBDecompiler::GlobalMemoryPointer(const GmemNode& gmem) { + // Read a bindless SSBO, return its address and set CC accordingly + // address = c[binding].xy + // length = c[binding].z const u32 binding = global_memory_names.at(gmem.GetDescriptor()); - const char result_swizzle = binding % 2 == 0 ? 
'x' : 'y'; const std::string pointer = AllocLongVectorTemporary(); std::string temporary = AllocTemporary(); - const u32 local_index = binding / 2; - AddLine("PK64.U {}, c[{}];", pointer, local_index); + AddLine("PK64.U {}, c[{}];", pointer, binding); AddLine("SUB.U {}, {}, {};", temporary, Visit(gmem.GetRealAddress()), Visit(gmem.GetBaseAddress())); AddLine("CVT.U64.U32 {}.z, {};", pointer, temporary); - AddLine("ADD.U64 {}.x, {}.{}, {}.z;", pointer, pointer, result_swizzle, pointer); + AddLine("ADD.U64 {}.x, {}.x, {}.z;", pointer, pointer, pointer); + // Compare offset to length and set CC + AddLine("SLT.U.CC RC.x, {}, c[{}].z;", temporary, binding); return fmt::format("{}.x", pointer); } @@ -1463,9 +1485,7 @@ void ARBDecompiler::Exit() { } const auto safe_get_register = [this](u32 reg) -> std::string { - // TODO(Rodrigo): Replace with contains once C++20 releases - const auto& used_registers = ir.GetRegisters(); - if (used_registers.find(reg) != used_registers.end()) { + if (ir.GetRegisters().contains(reg)) { return fmt::format("R{}.x", reg); } return "{0, 0, 0, 0}.x"; @@ -1552,7 +1572,9 @@ std::string ARBDecompiler::Assign(Operation operation) { ResetTemporaries(); return {}; } else if (const auto gmem = std::get_if<GmemNode>(&*dest)) { + AddLine("IF NE.x;"); AddLine("STORE.U32 {}, {};", Visit(src), GlobalMemoryPointer(*gmem)); + AddLine("ENDIF;"); ResetTemporaries(); return {}; } else { @@ -1844,7 +1866,7 @@ std::string ARBDecompiler::LogicalAddCarry(Operation operation) { std::string ARBDecompiler::Texture(Operation operation) { const auto& meta = std::get<MetaTexture>(operation.GetMeta()); const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index; - const auto [temporary, swizzle] = BuildCoords(operation); + const auto [coords, temporary, swizzle] = BuildCoords(operation); std::string_view opcode = "TEX"; std::string extra; @@ -1873,7 +1895,7 @@ std::string ARBDecompiler::Texture(Operation operation) { } } - AddLine("{}.F {}, {},{} texture[{}], {}{};", opcode, temporary, temporary, extra, sampler_id, + AddLine("{}.F {}, {},{} texture[{}], {}{};", opcode, temporary, coords, extra, sampler_id, TextureType(meta), BuildAoffi(operation)); AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element)); return fmt::format("{}.x", temporary); @@ -1882,7 +1904,7 @@ std::string ARBDecompiler::Texture(Operation operation) { std::string ARBDecompiler::TextureGather(Operation operation) { const auto& meta = std::get<MetaTexture>(operation.GetMeta()); const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index; - const auto [temporary, swizzle] = BuildCoords(operation); + const auto [coords, temporary, swizzle] = BuildCoords(operation); std::string comp; if (!meta.sampler.is_shadow) { @@ -1892,7 +1914,7 @@ std::string ARBDecompiler::TextureGather(Operation operation) { AddLine("TXG.F {}, {}, texture[{}]{}, {}{};", temporary, temporary, sampler_id, comp, TextureType(meta), BuildAoffi(operation)); - AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element)); + AddLine("MOV.U {}.x, {}.{};", temporary, coords, Swizzle(meta.element)); return fmt::format("{}.x", temporary); } @@ -1930,13 +1952,13 @@ std::string ARBDecompiler::TextureQueryLod(Operation operation) { std::string ARBDecompiler::TexelFetch(Operation operation) { const auto& meta = std::get<MetaTexture>(operation.GetMeta()); const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index; - const auto [temporary, swizzle] = BuildCoords(operation); + 
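[Editor's note] The GlobalMemoryPointer, Visit and Assign changes in these hunks implement bounds-checked (robust) SSBO access in the ARB decompiler: c[binding].xy holds the buffer's 64-bit GPU address, c[binding].z its length, SLT.U.CC sets the condition-code register from an offset-versus-length compare, and loads, stores and atomics are then predicated on NE.x. A hedged sketch of an emitter producing that sequence (the EmitRobustLoad helper and the register names are illustrative; fmt is already a dependency of this file):

#include <string>
#include <fmt/format.h>

// Append a bounds-checked global-memory read to 'code'.
// c[binding].xy = base address, c[binding].z = buffer length in bytes.
void EmitRobustLoad(std::string& code, int binding, const std::string& offset) {
    code += fmt::format("PK64.U pointer, c[{}];\n", binding);              // unpack base address
    code += fmt::format("SLT.U.CC RC.x, {}, c[{}].z;\n", offset, binding); // CC.x = offset < length
    code += fmt::format("CVT.U64.U32 pointer.z, {};\n", offset);           // widen the offset
    code += "ADD.U64 pointer.x, pointer.x, pointer.z;\n";                  // final address
    code += "MOV result, 0;\n";                                           // value returned for OOB reads
    code += "LOAD.U32 result (NE.x), pointer.x;\n";                       // predicated load
}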
const auto [coords, temporary, swizzle] = BuildCoords(operation); if (!meta.sampler.is_buffer) { ASSERT(swizzle < 4); AddLine("MOV.F {}.w, {};", temporary, Visit(meta.lod)); } - AddLine("TXF.F {}, {}, texture[{}], {}{};", temporary, temporary, sampler_id, TextureType(meta), + AddLine("TXF.F {}, {}, texture[{}], {}{};", temporary, coords, sampler_id, TextureType(meta), BuildAoffi(operation)); AddLine("MOV.U {}.x, {}.{};", temporary, temporary, Swizzle(meta.element)); return fmt::format("{}.x", temporary); @@ -1947,7 +1969,7 @@ std::string ARBDecompiler::TextureGradient(Operation operation) { const u32 sampler_id = device.GetBaseBindings(stage).sampler + meta.sampler.index; const std::string ddx = AllocVectorTemporary(); const std::string ddy = AllocVectorTemporary(); - const std::string coord = BuildCoords(operation).first; + const std::string coord = std::get<1>(BuildCoords(operation)); const std::size_t num_components = meta.derivates.size() / 2; for (std::size_t index = 0; index < num_components; ++index) { diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp index b1c4cd62f..5772cad87 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp @@ -22,11 +22,11 @@ using Maxwell = Tegra::Engines::Maxwell3D::Regs; MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128)); -Buffer::Buffer(const Device& device, VAddr cpu_addr, std::size_t size) - : VideoCommon::BufferBlock{cpu_addr, size} { +Buffer::Buffer(const Device& device_, VAddr cpu_addr_, std::size_t size_) + : BufferBlock{cpu_addr_, size_} { gl_buffer.Create(); - glNamedBufferData(gl_buffer.handle, static_cast<GLsizeiptr>(size), nullptr, GL_DYNAMIC_DRAW); - if (device.UseAssemblyShaders() || device.HasVertexBufferUnifiedMemory()) { + glNamedBufferData(gl_buffer.handle, static_cast<GLsizeiptr>(size_), nullptr, GL_DYNAMIC_DRAW); + if (device_.UseAssemblyShaders() || device_.HasVertexBufferUnifiedMemory()) { glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_WRITE); glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address); } @@ -34,14 +34,14 @@ Buffer::Buffer(const Device& device, VAddr cpu_addr, std::size_t size) Buffer::~Buffer() = default; -void Buffer::Upload(std::size_t offset, std::size_t size, const u8* data) { - glNamedBufferSubData(Handle(), static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size), - data); +void Buffer::Upload(std::size_t offset, std::size_t data_size, const u8* data) { + glNamedBufferSubData(Handle(), static_cast<GLintptr>(offset), + static_cast<GLsizeiptr>(data_size), data); } -void Buffer::Download(std::size_t offset, std::size_t size, u8* data) { +void Buffer::Download(std::size_t offset, std::size_t data_size, u8* data) { MICROPROFILE_SCOPE(OpenGL_Buffer_Download); - const GLsizeiptr gl_size = static_cast<GLsizeiptr>(size); + const GLsizeiptr gl_size = static_cast<GLsizeiptr>(data_size); const GLintptr gl_offset = static_cast<GLintptr>(offset); if (read_buffer.handle == 0) { read_buffer.Create(); @@ -54,17 +54,16 @@ void Buffer::Download(std::size_t offset, std::size_t size, u8* data) { } void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset, - std::size_t size) { + std::size_t copy_size) { glCopyNamedBufferSubData(src.Handle(), Handle(), static_cast<GLintptr>(src_offset), - static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(size)); + 
static_cast<GLintptr>(dst_offset), static_cast<GLsizeiptr>(copy_size)); } -OGLBufferCache::OGLBufferCache(VideoCore::RasterizerInterface& rasterizer, - Tegra::MemoryManager& gpu_memory, Core::Memory::Memory& cpu_memory, - const Device& device_, std::size_t stream_size) - : GenericBufferCache{rasterizer, gpu_memory, cpu_memory, - std::make_unique<OGLStreamBuffer>(device_, stream_size, true)}, - device{device_} { +OGLBufferCache::OGLBufferCache(VideoCore::RasterizerInterface& rasterizer_, + Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_, + const Device& device_, OGLStreamBuffer& stream_buffer_, + StateTracker& state_tracker) + : GenericBufferCache{rasterizer_, gpu_memory_, cpu_memory_, stream_buffer_}, device{device_} { if (!device.HasFastBufferSubData()) { return; } diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h index f75b32e31..17ee90316 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.h +++ b/src/video_core/renderer_opengl/gl_buffer_cache.h @@ -22,18 +22,19 @@ namespace OpenGL { class Device; class OGLStreamBuffer; class RasterizerOpenGL; +class StateTracker; class Buffer : public VideoCommon::BufferBlock { public: - explicit Buffer(const Device& device, VAddr cpu_addr, std::size_t size); + explicit Buffer(const Device& device_, VAddr cpu_addr_, std::size_t size_); ~Buffer(); - void Upload(std::size_t offset, std::size_t size, const u8* data); + void Upload(std::size_t offset, std::size_t data_size, const u8* data); - void Download(std::size_t offset, std::size_t size, u8* data); + void Download(std::size_t offset, std::size_t data_size, u8* data); void CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset, - std::size_t size); + std::size_t copy_size); GLuint Handle() const noexcept { return gl_buffer.handle; @@ -54,7 +55,8 @@ class OGLBufferCache final : public GenericBufferCache { public: explicit OGLBufferCache(VideoCore::RasterizerInterface& rasterizer, Tegra::MemoryManager& gpu_memory, Core::Memory::Memory& cpu_memory, - const Device& device, std::size_t stream_size); + const Device& device, OGLStreamBuffer& stream_buffer, + StateTracker& state_tracker); ~OGLBufferCache(); BufferInfo GetEmptyBuffer(std::size_t) override; diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp index e7d95149f..81b71edfb 100644 --- a/src/video_core/renderer_opengl/gl_device.cpp +++ b/src/video_core/renderer_opengl/gl_device.cpp @@ -5,9 +5,11 @@ #include <algorithm> #include <array> #include <cstddef> +#include <cstdlib> #include <cstring> #include <limits> #include <optional> +#include <span> #include <vector> #include <glad/glad.h> @@ -27,27 +29,29 @@ constexpr u32 ReservedUniformBlocks = 1; constexpr u32 NumStages = 5; -constexpr std::array LimitUBOs = { +constexpr std::array LIMIT_UBOS = { GL_MAX_VERTEX_UNIFORM_BLOCKS, GL_MAX_TESS_CONTROL_UNIFORM_BLOCKS, GL_MAX_TESS_EVALUATION_UNIFORM_BLOCKS, GL_MAX_GEOMETRY_UNIFORM_BLOCKS, - GL_MAX_FRAGMENT_UNIFORM_BLOCKS, GL_MAX_COMPUTE_UNIFORM_BLOCKS}; - -constexpr std::array LimitSSBOs = { + GL_MAX_FRAGMENT_UNIFORM_BLOCKS, GL_MAX_COMPUTE_UNIFORM_BLOCKS, +}; +constexpr std::array LIMIT_SSBOS = { GL_MAX_VERTEX_SHADER_STORAGE_BLOCKS, GL_MAX_TESS_CONTROL_SHADER_STORAGE_BLOCKS, GL_MAX_TESS_EVALUATION_SHADER_STORAGE_BLOCKS, GL_MAX_GEOMETRY_SHADER_STORAGE_BLOCKS, - GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS, GL_MAX_COMPUTE_SHADER_STORAGE_BLOCKS}; - -constexpr std::array LimitSamplers = 
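[Editor's note] Buffer::Download above reads data back through a persistent read_buffer instead of mapping the live buffer, so in-flight GPU work keeps its storage untouched. A standalone sketch of that DSA staging pattern (assumes a current GL 4.5 context; error handling omitted):

#include <cstddef>
#include <glad/glad.h>

// Read back 'size' bytes from 'src' without touching its storage directly.
void DownloadBuffer(GLuint src, std::size_t offset, std::size_t size, void* out) {
    GLuint staging = 0;
    glCreateBuffers(1, &staging);
    glNamedBufferData(staging, static_cast<GLsizeiptr>(size), nullptr, GL_STREAM_READ);
    // Server-side copy keeps the source usable by in-flight draws.
    glCopyNamedBufferSubData(src, staging, static_cast<GLintptr>(offset), 0,
                             static_cast<GLsizeiptr>(size));
    glGetNamedBufferSubData(staging, 0, static_cast<GLsizeiptr>(size), out);
    glDeleteBuffers(1, &staging);
}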
{GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS, - GL_MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS, - GL_MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS, - GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS, - GL_MAX_TEXTURE_IMAGE_UNITS, - GL_MAX_COMPUTE_TEXTURE_IMAGE_UNITS}; - -constexpr std::array LimitImages = { + GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS, GL_MAX_COMPUTE_SHADER_STORAGE_BLOCKS, +}; +constexpr std::array LIMIT_SAMPLERS = { + GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS, + GL_MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS, + GL_MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS, + GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS, + GL_MAX_TEXTURE_IMAGE_UNITS, + GL_MAX_COMPUTE_TEXTURE_IMAGE_UNITS, +}; +constexpr std::array LIMIT_IMAGES = { GL_MAX_VERTEX_IMAGE_UNIFORMS, GL_MAX_TESS_CONTROL_IMAGE_UNIFORMS, GL_MAX_TESS_EVALUATION_IMAGE_UNIFORMS, GL_MAX_GEOMETRY_IMAGE_UNIFORMS, - GL_MAX_FRAGMENT_IMAGE_UNIFORMS, GL_MAX_COMPUTE_IMAGE_UNIFORMS}; + GL_MAX_FRAGMENT_IMAGE_UNIFORMS, GL_MAX_COMPUTE_IMAGE_UNIFORMS, +}; template <typename T> T GetInteger(GLenum pname) { @@ -76,8 +80,8 @@ std::vector<std::string_view> GetExtensions() { return extensions; } -bool HasExtension(const std::vector<std::string_view>& images, std::string_view extension) { - return std::find(images.begin(), images.end(), extension) != images.end(); +bool HasExtension(std::span<const std::string_view> extensions, std::string_view extension) { + return std::ranges::find(extensions, extension) != extensions.end(); } u32 Extract(u32& base, u32& num, u32 amount, std::optional<GLenum> limit = {}) { @@ -91,8 +95,8 @@ u32 Extract(u32& base, u32& num, u32 amount, std::optional<GLenum> limit = {}) { std::array<u32, Tegra::Engines::MaxShaderTypes> BuildMaxUniformBuffers() noexcept { std::array<u32, Tegra::Engines::MaxShaderTypes> max; - std::transform(LimitUBOs.begin(), LimitUBOs.end(), max.begin(), - [](GLenum pname) { return GetInteger<u32>(pname); }); + std::ranges::transform(LIMIT_UBOS, max.begin(), + [](GLenum pname) { return GetInteger<u32>(pname); }); return max; } @@ -115,9 +119,10 @@ std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindin for (std::size_t i = 0; i < NumStages; ++i) { const std::size_t stage = stage_swizzle[i]; bindings[stage] = { - Extract(base_ubo, num_ubos, total_ubos / NumStages, LimitUBOs[stage]), - Extract(base_ssbo, num_ssbos, total_ssbos / NumStages, LimitSSBOs[stage]), - Extract(base_samplers, num_samplers, total_samplers / NumStages, LimitSamplers[stage])}; + Extract(base_ubo, num_ubos, total_ubos / NumStages, LIMIT_UBOS[stage]), + Extract(base_ssbo, num_ssbos, total_ssbos / NumStages, LIMIT_SSBOS[stage]), + Extract(base_samplers, num_samplers, total_samplers / NumStages, + LIMIT_SAMPLERS[stage])}; } u32 num_images = GetInteger<u32>(GL_MAX_IMAGE_UNITS); @@ -130,7 +135,7 @@ std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindin // Reserve at least 4 image bindings on the fragment stage. bindings[4].image = - Extract(base_images, num_images, std::max(4U, num_images / NumStages), LimitImages[4]); + Extract(base_images, num_images, std::max(4U, num_images / NumStages), LIMIT_IMAGES[4]); // This is guaranteed to be at least 1. const u32 total_extracted_images = num_images / (NumStages - 1); @@ -142,7 +147,7 @@ std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindin continue; } bindings[stage].image = - Extract(base_images, num_images, total_extracted_images, LimitImages[stage]); + Extract(base_images, num_images, total_extracted_images, LIMIT_IMAGES[stage]); } // Compute doesn't care about any of this. 
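[Editor's note] The renamed LIMIT_* tables feed BuildMaxUniformBuffers and BuildBaseBindings: each GL_MAX_* token is queried once and the results decide how UBO, SSBO and sampler slots are split across the five shader stages. The core query pattern, reduced to two pnames for brevity:

#include <algorithm>
#include <array>
#include <glad/glad.h>

template <typename T>
T GetInteger(GLenum pname) {
    GLint value = 0;
    glGetIntegerv(pname, &value);
    return static_cast<T>(value);
}

std::array<unsigned, 2> QueryStageLimits() {
    static constexpr std::array PNAMES{GL_MAX_VERTEX_UNIFORM_BLOCKS,
                                       GL_MAX_FRAGMENT_UNIFORM_BLOCKS};
    std::array<unsigned, 2> limits{};
    // One GL query per pname, results written in table order.
    std::ranges::transform(PNAMES, limits.begin(),
                           [](GLenum pname) { return GetInteger<unsigned>(pname); });
    return limits;
}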
@@ -188,17 +193,22 @@ bool IsASTCSupported() { return true; } +[[nodiscard]] bool IsDebugToolAttached(std::span<const std::string_view> extensions) { + const bool nsight = std::getenv("NVTX_INJECTION64_PATH") || std::getenv("NSIGHT_LAUNCHED"); + return nsight || HasExtension(extensions, "GL_EXT_debug_tool"); +} + } // Anonymous namespace Device::Device() : max_uniform_buffers{BuildMaxUniformBuffers()}, base_bindings{BuildBaseBindings()} { const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR)); - const std::string_view renderer = reinterpret_cast<const char*>(glGetString(GL_RENDERER)); const std::string_view version = reinterpret_cast<const char*>(glGetString(GL_VERSION)); const std::vector extensions = GetExtensions(); const bool is_nvidia = vendor == "NVIDIA Corporation"; const bool is_amd = vendor == "ATI Technologies Inc."; + const bool is_intel = vendor == "Intel"; bool disable_fast_buffer_sub_data = false; if (is_nvidia && version == "4.6.0 NVIDIA 443.24") { @@ -207,9 +217,8 @@ Device::Device() "Beta driver 443.24 is known to have issues. There might be performance issues."); disable_fast_buffer_sub_data = true; } - - uniform_buffer_alignment = GetInteger<std::size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT); - shader_storage_alignment = GetInteger<std::size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT); + uniform_buffer_alignment = GetInteger<size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT); + shader_storage_alignment = GetInteger<size_t>(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT); max_vertex_attributes = GetInteger<u32>(GL_MAX_VERTEX_ATTRIBS); max_varyings = GetInteger<u32>(GL_MAX_VARYING_VECTORS); max_compute_shared_memory_size = GetInteger<u32>(GL_MAX_COMPUTE_SHARED_MEMORY_SIZE); @@ -223,8 +232,10 @@ Device::Device() has_variable_aoffi = TestVariableAoffi(); has_component_indexing_bug = is_amd; has_precise_bug = TestPreciseBug(); + has_broken_texture_view_formats = is_amd || is_intel; has_nv_viewport_array2 = GLAD_GL_NV_viewport_array2; has_vertex_buffer_unified_memory = GLAD_GL_NV_vertex_buffer_unified_memory; + has_debugging_tool_attached = IsDebugToolAttached(extensions); // At the moment of writing this, only Nvidia's driver optimizes BufferSubData on exclusive // uniform buffers as "push constants" @@ -239,6 +250,8 @@ Device::Device() LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", has_variable_aoffi); LOG_INFO(Render_OpenGL, "Renderer_ComponentIndexingBug: {}", has_component_indexing_bug); LOG_INFO(Render_OpenGL, "Renderer_PreciseBug: {}", has_precise_bug); + LOG_INFO(Render_OpenGL, "Renderer_BrokenTextureViewFormats: {}", + has_broken_texture_view_formats); if (Settings::values.use_assembly_shaders.GetValue() && !use_assembly_shaders) { LOG_ERROR(Render_OpenGL, "Assembly shaders enabled but not supported"); diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h index 8a4b6b9fc..3e79d1e37 100644 --- a/src/video_core/renderer_opengl/gl_device.h +++ b/src/video_core/renderer_opengl/gl_device.h @@ -36,11 +36,11 @@ public: return GetBaseBindings(static_cast<std::size_t>(shader_type)); } - std::size_t GetUniformBufferAlignment() const { + size_t GetUniformBufferAlignment() const { return uniform_buffer_alignment; } - std::size_t GetShaderStorageBufferAlignment() const { + size_t GetShaderStorageBufferAlignment() const { return shader_storage_alignment; } @@ -96,6 +96,10 @@ public: return has_precise_bug; } + bool HasBrokenTextureViewFormats() const { + return has_broken_texture_view_formats; + } + bool 
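[Editor's note] IsDebugToolAttached, added above, lets the device report has_debugging_tool_attached when yuzu runs under Nsight (which injects NVTX_INJECTION64_PATH or NSIGHT_LAUNCHED into the process environment) or under a driver exposing GL_EXT_debug_tool. The same logic as a standalone function:

#include <algorithm>
#include <cstdlib>
#include <span>
#include <string_view>

bool HasExtension(std::span<const std::string_view> extensions, std::string_view name) {
    return std::ranges::find(extensions, name) != extensions.end();
}

bool IsDebugToolAttached(std::span<const std::string_view> extensions) {
    // Nsight injects these variables into the target process.
    const bool nsight = std::getenv("NVTX_INJECTION64_PATH") || std::getenv("NSIGHT_LAUNCHED");
    return nsight || HasExtension(extensions, "GL_EXT_debug_tool");
}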
HasFastBufferSubData() const { return has_fast_buffer_sub_data; } @@ -104,6 +108,10 @@ public: return has_nv_viewport_array2; } + bool HasDebuggingToolAttached() const { + return has_debugging_tool_attached; + } + bool UseAssemblyShaders() const { return use_assembly_shaders; } @@ -118,8 +126,8 @@ private: std::array<u32, Tegra::Engines::MaxShaderTypes> max_uniform_buffers{}; std::array<BaseBindings, Tegra::Engines::MaxShaderTypes> base_bindings{}; - std::size_t uniform_buffer_alignment{}; - std::size_t shader_storage_alignment{}; + size_t uniform_buffer_alignment{}; + size_t shader_storage_alignment{}; u32 max_vertex_attributes{}; u32 max_varyings{}; u32 max_compute_shared_memory_size{}; @@ -133,8 +141,10 @@ private: bool has_variable_aoffi{}; bool has_component_indexing_bug{}; bool has_precise_bug{}; + bool has_broken_texture_view_formats{}; bool has_fast_buffer_sub_data{}; bool has_nv_viewport_array2{}; + bool has_debugging_tool_attached{}; bool use_assembly_shaders{}; bool use_asynchronous_shaders{}; }; diff --git a/src/video_core/renderer_opengl/gl_fence_manager.cpp b/src/video_core/renderer_opengl/gl_fence_manager.cpp index b532fdcc2..3e9c922f5 100644 --- a/src/video_core/renderer_opengl/gl_fence_manager.cpp +++ b/src/video_core/renderer_opengl/gl_fence_manager.cpp @@ -11,10 +11,10 @@ namespace OpenGL { -GLInnerFence::GLInnerFence(u32 payload, bool is_stubbed) : FenceBase(payload, is_stubbed) {} +GLInnerFence::GLInnerFence(u32 payload_, bool is_stubbed_) : FenceBase{payload_, is_stubbed_} {} -GLInnerFence::GLInnerFence(GPUVAddr address, u32 payload, bool is_stubbed) - : FenceBase(address, payload, is_stubbed) {} +GLInnerFence::GLInnerFence(GPUVAddr address_, u32 payload_, bool is_stubbed_) + : FenceBase{address_, payload_, is_stubbed_} {} GLInnerFence::~GLInnerFence() = default; @@ -45,10 +45,10 @@ void GLInnerFence::Wait() { glClientWaitSync(sync_object.handle, 0, GL_TIMEOUT_IGNORED); } -FenceManagerOpenGL::FenceManagerOpenGL(VideoCore::RasterizerInterface& rasterizer, Tegra::GPU& gpu, - TextureCacheOpenGL& texture_cache, - OGLBufferCache& buffer_cache, QueryCache& query_cache) - : GenericFenceManager{rasterizer, gpu, texture_cache, buffer_cache, query_cache} {} +FenceManagerOpenGL::FenceManagerOpenGL(VideoCore::RasterizerInterface& rasterizer_, + Tegra::GPU& gpu_, TextureCache& texture_cache_, + OGLBufferCache& buffer_cache_, QueryCache& query_cache_) + : GenericFenceManager{rasterizer_, gpu_, texture_cache_, buffer_cache_, query_cache_} {} Fence FenceManagerOpenGL::CreateFence(u32 value, bool is_stubbed) { return std::make_shared<GLInnerFence>(value, is_stubbed); diff --git a/src/video_core/renderer_opengl/gl_fence_manager.h b/src/video_core/renderer_opengl/gl_fence_manager.h index da1dcdace..30dbee613 100644 --- a/src/video_core/renderer_opengl/gl_fence_manager.h +++ b/src/video_core/renderer_opengl/gl_fence_manager.h @@ -17,8 +17,8 @@ namespace OpenGL { class GLInnerFence : public VideoCommon::FenceBase { public: - GLInnerFence(u32 payload, bool is_stubbed); - GLInnerFence(GPUVAddr address, u32 payload, bool is_stubbed); + explicit GLInnerFence(u32 payload_, bool is_stubbed_); + explicit GLInnerFence(GPUVAddr address_, u32 payload_, bool is_stubbed_); ~GLInnerFence(); void Queue(); @@ -33,13 +33,13 @@ private: using Fence = std::shared_ptr<GLInnerFence>; using GenericFenceManager = - VideoCommon::FenceManager<Fence, TextureCacheOpenGL, OGLBufferCache, QueryCache>; + VideoCommon::FenceManager<Fence, TextureCache, OGLBufferCache, QueryCache>; class FenceManagerOpenGL final : 
public GenericFenceManager { public: - explicit FenceManagerOpenGL(VideoCore::RasterizerInterface& rasterizer, Tegra::GPU& gpu, - TextureCacheOpenGL& texture_cache, OGLBufferCache& buffer_cache, - QueryCache& query_cache); + explicit FenceManagerOpenGL(VideoCore::RasterizerInterface& rasterizer_, Tegra::GPU& gpu_, + TextureCache& texture_cache_, OGLBufferCache& buffer_cache_, + QueryCache& query_cache_); protected: Fence CreateFence(u32 value, bool is_stubbed) override; diff --git a/src/video_core/renderer_opengl/gl_framebuffer_cache.cpp b/src/video_core/renderer_opengl/gl_framebuffer_cache.cpp deleted file mode 100644 index b8a512cb6..000000000 --- a/src/video_core/renderer_opengl/gl_framebuffer_cache.cpp +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include <tuple> -#include <unordered_map> -#include <utility> - -#include <glad/glad.h> - -#include "common/common_types.h" -#include "video_core/engines/maxwell_3d.h" -#include "video_core/renderer_opengl/gl_framebuffer_cache.h" - -namespace OpenGL { - -using Maxwell = Tegra::Engines::Maxwell3D::Regs; -using VideoCore::Surface::SurfaceType; - -FramebufferCacheOpenGL::FramebufferCacheOpenGL() = default; - -FramebufferCacheOpenGL::~FramebufferCacheOpenGL() = default; - -GLuint FramebufferCacheOpenGL::GetFramebuffer(const FramebufferCacheKey& key) { - const auto [entry, is_cache_miss] = cache.try_emplace(key); - auto& framebuffer{entry->second}; - if (is_cache_miss) { - framebuffer = CreateFramebuffer(key); - } - return framebuffer.handle; -} - -OGLFramebuffer FramebufferCacheOpenGL::CreateFramebuffer(const FramebufferCacheKey& key) { - OGLFramebuffer framebuffer; - framebuffer.Create(); - - // TODO(Rodrigo): Use DSA here after Nvidia fixes their framebuffer DSA bugs. - glBindFramebuffer(GL_DRAW_FRAMEBUFFER, framebuffer.handle); - - if (key.zeta) { - const bool stencil = key.zeta->GetSurfaceParams().type == SurfaceType::DepthStencil; - const GLenum attach_target = stencil ? 
GL_DEPTH_STENCIL_ATTACHMENT : GL_DEPTH_ATTACHMENT; - key.zeta->Attach(attach_target, GL_DRAW_FRAMEBUFFER); - } - - std::size_t num_buffers = 0; - std::array<GLenum, Maxwell::NumRenderTargets> targets; - - for (std::size_t index = 0; index < Maxwell::NumRenderTargets; ++index) { - if (!key.colors[index]) { - targets[index] = GL_NONE; - continue; - } - const GLenum attach_target = GL_COLOR_ATTACHMENT0 + static_cast<GLenum>(index); - key.colors[index]->Attach(attach_target, GL_DRAW_FRAMEBUFFER); - - const u32 attachment = (key.color_attachments >> (BitsPerAttachment * index)) & 0b1111; - targets[index] = GL_COLOR_ATTACHMENT0 + attachment; - num_buffers = index + 1; - } - - if (num_buffers > 0) { - glDrawBuffers(static_cast<GLsizei>(num_buffers), std::data(targets)); - } else { - glDrawBuffer(GL_NONE); - } - - return framebuffer; -} - -std::size_t FramebufferCacheKey::Hash() const noexcept { - std::size_t hash = std::hash<View>{}(zeta); - for (const auto& color : colors) { - hash ^= std::hash<View>{}(color); - } - hash ^= static_cast<std::size_t>(color_attachments) << 16; - return hash; -} - -bool FramebufferCacheKey::operator==(const FramebufferCacheKey& rhs) const noexcept { - return std::tie(colors, zeta, color_attachments) == - std::tie(rhs.colors, rhs.zeta, rhs.color_attachments); -} - -} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_framebuffer_cache.h b/src/video_core/renderer_opengl/gl_framebuffer_cache.h deleted file mode 100644 index 8f698fee0..000000000 --- a/src/video_core/renderer_opengl/gl_framebuffer_cache.h +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include <array> -#include <cstddef> -#include <unordered_map> - -#include <glad/glad.h> - -#include "common/common_types.h" -#include "video_core/engines/maxwell_3d.h" -#include "video_core/renderer_opengl/gl_resource_manager.h" -#include "video_core/renderer_opengl/gl_texture_cache.h" - -namespace OpenGL { - -constexpr std::size_t BitsPerAttachment = 4; - -struct FramebufferCacheKey { - View zeta; - std::array<View, Tegra::Engines::Maxwell3D::Regs::NumRenderTargets> colors; - u32 color_attachments = 0; - - std::size_t Hash() const noexcept; - - bool operator==(const FramebufferCacheKey& rhs) const noexcept; - - bool operator!=(const FramebufferCacheKey& rhs) const noexcept { - return !operator==(rhs); - } - - void SetAttachment(std::size_t index, u32 attachment) { - color_attachments |= attachment << (BitsPerAttachment * index); - } -}; - -} // namespace OpenGL - -namespace std { - -template <> -struct hash<OpenGL::FramebufferCacheKey> { - std::size_t operator()(const OpenGL::FramebufferCacheKey& k) const noexcept { - return k.Hash(); - } -}; - -} // namespace std - -namespace OpenGL { - -class FramebufferCacheOpenGL { -public: - FramebufferCacheOpenGL(); - ~FramebufferCacheOpenGL(); - - GLuint GetFramebuffer(const FramebufferCacheKey& key); - -private: - OGLFramebuffer CreateFramebuffer(const FramebufferCacheKey& key); - - std::unordered_map<FramebufferCacheKey, OGLFramebuffer> cache; -}; - -} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_query_cache.cpp b/src/video_core/renderer_opengl/gl_query_cache.cpp index 1a3d9720e..acebbf5f4 100644 --- a/src/video_core/renderer_opengl/gl_query_cache.cpp +++ b/src/video_core/renderer_opengl/gl_query_cache.cpp @@ -30,11 +30,9 @@ constexpr GLenum GetTarget(VideoCore::QueryType type) { } // Anonymous namespace 
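[Editor's note] The framebuffer cache deleted above (superseded by the new texture cache's own framebuffer handling) is a textbook keyed-object cache: try_emplace probes the map exactly once, and only a miss pays for framebuffer creation and attachment setup. The reusable shape of that pattern (ObjectCache and the factory callback are illustrative):

#include <unordered_map>

template <typename Key, typename Value, typename Hash>
class ObjectCache {
public:
    template <typename Factory>
    Value& Get(const Key& key, Factory&& make) {
        // Single probe: 'is_miss' is true when the key was just inserted.
        const auto [it, is_miss] = cache.try_emplace(key);
        if (is_miss) {
            it->second = make(key); // build the object only on a cache miss
        }
        return it->second;
    }

private:
    std::unordered_map<Key, Value, Hash> cache;
};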
-QueryCache::QueryCache(RasterizerOpenGL& rasterizer, Tegra::Engines::Maxwell3D& maxwell3d, - Tegra::MemoryManager& gpu_memory) - : VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream, HostCounter>( - rasterizer, maxwell3d, gpu_memory), - gl_rasterizer{rasterizer} {} +QueryCache::QueryCache(RasterizerOpenGL& rasterizer_, Tegra::Engines::Maxwell3D& maxwell3d_, + Tegra::MemoryManager& gpu_memory_) + : QueryCacheBase(rasterizer_, maxwell3d_, gpu_memory_), gl_rasterizer{rasterizer_} {} QueryCache::~QueryCache() = default; @@ -59,10 +57,11 @@ bool QueryCache::AnyCommandQueued() const noexcept { return gl_rasterizer.AnyCommandQueued(); } -HostCounter::HostCounter(QueryCache& cache, std::shared_ptr<HostCounter> dependency, - VideoCore::QueryType type) - : VideoCommon::HostCounterBase<QueryCache, HostCounter>{std::move(dependency)}, cache{cache}, - type{type}, query{cache.AllocateQuery(type)} { +HostCounter::HostCounter(QueryCache& cache_, std::shared_ptr<HostCounter> dependency_, + VideoCore::QueryType type_) + : HostCounterBase{std::move(dependency_)}, cache{cache_}, type{type_}, query{ + cache.AllocateQuery( + type)} { glBeginQuery(GetTarget(type), query.handle); } @@ -86,13 +85,14 @@ u64 HostCounter::BlockingQuery() const { return static_cast<u64>(value); } -CachedQuery::CachedQuery(QueryCache& cache, VideoCore::QueryType type, VAddr cpu_addr, u8* host_ptr) - : VideoCommon::CachedQueryBase<HostCounter>{cpu_addr, host_ptr}, cache{&cache}, type{type} {} +CachedQuery::CachedQuery(QueryCache& cache_, VideoCore::QueryType type_, VAddr cpu_addr_, + u8* host_ptr_) + : CachedQueryBase{cpu_addr_, host_ptr_}, cache{&cache_}, type{type_} {} CachedQuery::~CachedQuery() = default; CachedQuery::CachedQuery(CachedQuery&& rhs) noexcept - : VideoCommon::CachedQueryBase<HostCounter>(std::move(rhs)), cache{rhs.cache}, type{rhs.type} {} + : CachedQueryBase(std::move(rhs)), cache{rhs.cache}, type{rhs.type} {} CachedQuery& CachedQuery::operator=(CachedQuery&& rhs) noexcept { cache = rhs.cache; diff --git a/src/video_core/renderer_opengl/gl_query_cache.h b/src/video_core/renderer_opengl/gl_query_cache.h index 82cac51ee..7bbe5cfe9 100644 --- a/src/video_core/renderer_opengl/gl_query_cache.h +++ b/src/video_core/renderer_opengl/gl_query_cache.h @@ -29,8 +29,8 @@ using CounterStream = VideoCommon::CounterStreamBase<QueryCache, HostCounter>; class QueryCache final : public VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream, HostCounter> { public: - explicit QueryCache(RasterizerOpenGL& rasterizer, Tegra::Engines::Maxwell3D& maxwell3d, - Tegra::MemoryManager& gpu_memory); + explicit QueryCache(RasterizerOpenGL& rasterizer_, Tegra::Engines::Maxwell3D& maxwell3d_, + Tegra::MemoryManager& gpu_memory_); ~QueryCache(); OGLQuery AllocateQuery(VideoCore::QueryType type); @@ -46,8 +46,8 @@ private: class HostCounter final : public VideoCommon::HostCounterBase<QueryCache, HostCounter> { public: - explicit HostCounter(QueryCache& cache, std::shared_ptr<HostCounter> dependency, - VideoCore::QueryType type); + explicit HostCounter(QueryCache& cache_, std::shared_ptr<HostCounter> dependency_, + VideoCore::QueryType type_); ~HostCounter(); void EndQuery(); @@ -62,8 +62,8 @@ private: class CachedQuery final : public VideoCommon::CachedQueryBase<HostCounter> { public: - explicit CachedQuery(QueryCache& cache, VideoCore::QueryType type, VAddr cpu_addr, - u8* host_ptr); + explicit CachedQuery(QueryCache& cache_, VideoCore::QueryType type_, VAddr cpu_addr_, + u8* host_ptr_); ~CachedQuery() override; 
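[Editor's note] HostCounter above begins its GL query in the constructor and resolves it on demand, so counting is active exactly while the counter object lives. A compact sketch of that begin-early, resolve-late idea (ScopedQuery is illustrative; the blocking read is the same glGetQueryObject path the diff uses):

#include <glad/glad.h>

class ScopedQuery {
public:
    explicit ScopedQuery(GLenum target_) : target{target_} {
        glCreateQueries(target, 1, &handle);
        glBeginQuery(target, handle); // counting starts as soon as the counter exists
    }
    ~ScopedQuery() {
        glDeleteQueries(1, &handle);
    }
    // Call before destruction to end the query and block for its value.
    GLuint64 Finish() {
        glEndQuery(target);
        GLuint64 value = 0;
        glGetQueryObjectui64v(handle, GL_QUERY_RESULT, &value); // blocks until ready
        return value;
    }

private:
    GLenum target;
    GLuint handle = 0;
};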
CachedQuery(CachedQuery&& rhs) noexcept; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index bbb2eb17c..8aa63d329 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -25,12 +25,15 @@ #include "video_core/engines/maxwell_3d.h" #include "video_core/engines/shader_type.h" #include "video_core/memory_manager.h" +#include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_query_cache.h" #include "video_core/renderer_opengl/gl_rasterizer.h" #include "video_core/renderer_opengl/gl_shader_cache.h" +#include "video_core/renderer_opengl/gl_texture_cache.h" #include "video_core/renderer_opengl/maxwell_to_gl.h" #include "video_core/renderer_opengl/renderer_opengl.h" #include "video_core/shader_cache.h" +#include "video_core/texture_cache/texture_cache.h" namespace OpenGL { @@ -55,18 +58,32 @@ MICROPROFILE_DEFINE(OpenGL_PrimitiveAssembly, "OpenGL", "Prim Asmbl", MP_RGB(255 namespace { -constexpr std::size_t NUM_CONST_BUFFERS_PER_STAGE = 18; -constexpr std::size_t NUM_CONST_BUFFERS_BYTES_PER_STAGE = +constexpr size_t NUM_CONST_BUFFERS_PER_STAGE = 18; +constexpr size_t NUM_CONST_BUFFERS_BYTES_PER_STAGE = NUM_CONST_BUFFERS_PER_STAGE * Maxwell::MaxConstBufferSize; -constexpr std::size_t TOTAL_CONST_BUFFER_BYTES = +constexpr size_t TOTAL_CONST_BUFFER_BYTES = NUM_CONST_BUFFERS_BYTES_PER_STAGE * Maxwell::MaxShaderStage; -constexpr std::size_t NUM_SUPPORTED_VERTEX_ATTRIBUTES = 16; -constexpr std::size_t NUM_SUPPORTED_VERTEX_BINDINGS = 16; +constexpr size_t NUM_SUPPORTED_VERTEX_ATTRIBUTES = 16; +constexpr size_t NUM_SUPPORTED_VERTEX_BINDINGS = 16; + +constexpr size_t MAX_TEXTURES = 192; +constexpr size_t MAX_IMAGES = 48; + +struct TextureHandle { + constexpr TextureHandle(u32 data, bool via_header_index) { + const Tegra::Texture::TextureHandle handle{data}; + image = handle.tic_id; + sampler = via_header_index ? 
image : handle.tsc_id.Value(); + } + + u32 image; + u32 sampler; +}; template <typename Engine, typename Entry> -Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry, - ShaderType shader_type, std::size_t index = 0) { +TextureHandle GetTextureInfo(const Engine& engine, bool via_header_index, const Entry& entry, + ShaderType shader_type, size_t index = 0) { if constexpr (std::is_same_v<Entry, SamplerEntry>) { if (entry.is_separated) { const u32 buffer_1 = entry.buffer; @@ -75,21 +92,16 @@ Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry const u32 offset_2 = entry.secondary_offset; const u32 handle_1 = engine.AccessConstBuffer32(shader_type, buffer_1, offset_1); const u32 handle_2 = engine.AccessConstBuffer32(shader_type, buffer_2, offset_2); - return engine.GetTextureInfo(handle_1 | handle_2); + return TextureHandle(handle_1 | handle_2, via_header_index); } } if (entry.is_bindless) { - const u32 handle = engine.AccessConstBuffer32(shader_type, entry.buffer, entry.offset); - return engine.GetTextureInfo(handle); - } - - const auto& gpu_profile = engine.AccessGuestDriverProfile(); - const u32 offset = entry.offset + static_cast<u32>(index * gpu_profile.GetTextureHandlerSize()); - if constexpr (std::is_same_v<Engine, Tegra::Engines::Maxwell3D>) { - return engine.GetStageTexture(shader_type, offset); - } else { - return engine.GetTexture(offset); + const u32 raw = engine.AccessConstBuffer32(shader_type, entry.buffer, entry.offset); + return TextureHandle(raw, via_header_index); } + const u32 buffer = engine.GetBoundBuffer(); + const u64 offset = (entry.offset + index) * sizeof(u32); + return TextureHandle(engine.AccessConstBuffer32(shader_type, buffer, offset), via_header_index); } std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer, @@ -97,7 +109,6 @@ std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer, if (!entry.IsIndirect()) { return entry.GetSize(); } - if (buffer.size > Maxwell::MaxConstBufferSize) { LOG_WARNING(Render_OpenGL, "Indirect constbuffer size {} exceeds maximum {}", buffer.size, Maxwell::MaxConstBufferSize); @@ -131,7 +142,7 @@ std::pair<GLint, GLint> TransformFeedbackEnum(u8 location) { case 43: return {GL_BACK_SECONDARY_COLOR_NV, 0}; } - UNIMPLEMENTED_MSG("index={}", static_cast<int>(index)); + UNIMPLEMENTED_MSG("index={}", index); return {GL_POSITION, 0}; } @@ -139,35 +150,68 @@ void oglEnable(GLenum cap, bool state) { (state ? glEnable : glDisable)(cap); } -void UpdateBindlessPointers(GLenum target, GLuint64EXT* pointers, std::size_t num_entries) { - if (num_entries == 0) { +void UpdateBindlessSSBOs(GLenum target, const BindlessSSBO* ssbos, size_t num_ssbos) { + if (num_ssbos == 0) { return; } - if (num_entries % 2 == 1) { - pointers[num_entries] = 0; + glProgramLocalParametersI4uivNV(target, 0, static_cast<GLsizei>(num_ssbos), + reinterpret_cast<const GLuint*>(ssbos)); +} + +ImageViewType ImageViewTypeFromEntry(const SamplerEntry& entry) { + if (entry.is_buffer) { + return ImageViewType::Buffer; + } + switch (entry.type) { + case Tegra::Shader::TextureType::Texture1D: + return entry.is_array ? ImageViewType::e1DArray : ImageViewType::e1D; + case Tegra::Shader::TextureType::Texture2D: + return entry.is_array ? ImageViewType::e2DArray : ImageViewType::e2D; + case Tegra::Shader::TextureType::Texture3D: + return ImageViewType::e3D; + case Tegra::Shader::TextureType::TextureCube: + return entry.is_array ? 
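[Editor's note] GetTextureInfo no longer asks the engine to decode handles; it reads the raw 32-bit word from the const buffer and the new TextureHandle struct splits it, reusing the TIC (image descriptor) index as the sampler index when the GPU runs in via-header-index mode. A sketch of the unpacking; the 20/12 bit split matches yuzu's Tegra::Texture::TextureHandle bitfields but treat the widths as an assumption of this sketch:

#include <cstdint>

struct UnpackedHandle {
    std::uint32_t image;   // TIC index
    std::uint32_t sampler; // TSC index
};

constexpr UnpackedHandle Unpack(std::uint32_t raw, bool via_header_index) {
    const std::uint32_t tic = raw & 0xFFFFF;       // bits [0, 20): image descriptor
    const std::uint32_t tsc = (raw >> 20) & 0xFFF; // bits [20, 32): sampler descriptor
    return {tic, via_header_index ? tic : tsc};
}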
ImageViewType::CubeArray : ImageViewType::Cube; + } + UNREACHABLE(); + return ImageViewType::e2D; +} + +ImageViewType ImageViewTypeFromEntry(const ImageEntry& entry) { + switch (entry.type) { + case Tegra::Shader::ImageType::Texture1D: + return ImageViewType::e1D; + case Tegra::Shader::ImageType::Texture1DArray: + return ImageViewType::e1DArray; + case Tegra::Shader::ImageType::Texture2D: + return ImageViewType::e2D; + case Tegra::Shader::ImageType::Texture2DArray: + return ImageViewType::e2DArray; + case Tegra::Shader::ImageType::Texture3D: + return ImageViewType::e3D; + case Tegra::Shader::ImageType::TextureBuffer: + return ImageViewType::Buffer; } - const GLsizei num_vectors = static_cast<GLsizei>((num_entries + 1) / 2); - glProgramLocalParametersI4uivNV(target, 0, num_vectors, - reinterpret_cast<const GLuint*>(pointers)); + UNREACHABLE(); + return ImageViewType::e2D; } } // Anonymous namespace -RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window, Tegra::GPU& gpu_, - Core::Memory::Memory& cpu_memory, const Device& device_, +RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window_, Tegra::GPU& gpu_, + Core::Memory::Memory& cpu_memory_, const Device& device_, ScreenInfo& screen_info_, ProgramManager& program_manager_, StateTracker& state_tracker_) - : RasterizerAccelerated{cpu_memory}, gpu(gpu_), maxwell3d(gpu.Maxwell3D()), + : RasterizerAccelerated(cpu_memory_), gpu(gpu_), maxwell3d(gpu.Maxwell3D()), kepler_compute(gpu.KeplerCompute()), gpu_memory(gpu.MemoryManager()), device(device_), screen_info(screen_info_), program_manager(program_manager_), state_tracker(state_tracker_), - texture_cache(*this, maxwell3d, gpu_memory, device, state_tracker), - shader_cache(*this, emu_window, gpu, maxwell3d, kepler_compute, gpu_memory, device), + stream_buffer(device, state_tracker), + texture_cache_runtime(device, program_manager, state_tracker), + texture_cache(texture_cache_runtime, *this, maxwell3d, kepler_compute, gpu_memory), + shader_cache(*this, emu_window_, gpu, maxwell3d, kepler_compute, gpu_memory, device), query_cache(*this, maxwell3d, gpu_memory), - buffer_cache(*this, gpu_memory, cpu_memory, device, STREAM_BUFFER_SIZE), + buffer_cache(*this, gpu_memory, cpu_memory_, device, stream_buffer, state_tracker), fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache), - async_shaders(emu_window) { - CheckExtensions(); - + async_shaders(emu_window_) { unified_uniform_buffer.Create(); glNamedBufferStorage(unified_uniform_buffer.handle, TOTAL_CONST_BUFFER_BYTES, nullptr, 0); @@ -178,7 +222,6 @@ RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window, Tegra: nullptr, 0); } } - if (device.UseAsynchronousShaders()) { async_shaders.AllocateWorkers(); } @@ -190,14 +233,6 @@ RasterizerOpenGL::~RasterizerOpenGL() { } } -void RasterizerOpenGL::CheckExtensions() { - if (!GLAD_GL_ARB_texture_filter_anisotropic && !GLAD_GL_EXT_texture_filter_anisotropic) { - LOG_WARNING( - Render_OpenGL, - "Anisotropic filter is not supported! 
This can cause graphical issues in some games."); - } -} - void RasterizerOpenGL::SetupVertexFormat() { auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::VertexFormats]) { @@ -320,10 +355,16 @@ GLintptr RasterizerOpenGL::SetupIndexBuffer() { return info.offset; } -void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { +void RasterizerOpenGL::SetupShaders() { MICROPROFILE_SCOPE(OpenGL_Shader); u32 clip_distances = 0; + std::array<Shader*, Maxwell::MaxShaderStage> shaders{}; + image_view_indices.clear(); + sampler_handles.clear(); + + texture_cache.SynchronizeGraphicsDescriptors(); + for (std::size_t index = 0; index < Maxwell::MaxShaderProgram; ++index) { const auto& shader_config = maxwell3d.regs.shader_config[index]; const auto program{static_cast<Maxwell::ShaderProgram>(index)}; @@ -342,7 +383,6 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { } continue; } - // Currently these stages are not supported in the OpenGL backend. // TODO(Blinkhawk): Port tessellation shaders from Vulkan to OpenGL if (program == Maxwell::ShaderProgram::TesselationControl || @@ -351,7 +391,6 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { } Shader* const shader = shader_cache.GetStageProgram(program, async_shaders); - const GLuint program_handle = shader->IsBuilt() ? shader->GetHandle() : 0; switch (program) { case Maxwell::ShaderProgram::VertexA: @@ -367,14 +406,17 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { default: UNIMPLEMENTED_MSG("Unimplemented shader index={}, enable={}, offset=0x{:08X}", index, shader_config.enable.Value(), shader_config.offset); + break; } // Stage indices are 0 - 5 - const std::size_t stage = index == 0 ? 0 : index - 1; + const size_t stage = index == 0 ? 0 : index - 1; + shaders[stage] = shader; + SetupDrawConstBuffers(stage, shader); SetupDrawGlobalMemory(stage, shader); - SetupDrawTextures(stage, shader); - SetupDrawImages(stage, shader); + SetupDrawTextures(shader, stage); + SetupDrawImages(shader, stage); // Workaround for Intel drivers.
// When a clip distance is enabled but not set in the shader it crops parts of the screen @@ -388,9 +430,23 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { ++index; } } - SyncClipEnabled(clip_distances); maxwell3d.dirty.flags[Dirty::Shaders] = false; + + const std::span indices_span(image_view_indices.data(), image_view_indices.size()); + texture_cache.FillGraphicsImageViews(indices_span, image_view_ids); + + size_t image_view_index = 0; + size_t texture_index = 0; + size_t image_index = 0; + for (size_t stage = 0; stage < Maxwell::MaxShaderStage; ++stage) { + const Shader* const shader = shaders[stage]; + if (shader) { + const auto base = device.GetBaseBindings(stage); + BindTextures(shader->GetEntries(), base.sampler, base.image, image_view_index, + texture_index, image_index); + } + } } std::size_t RasterizerOpenGL::CalculateVertexArraysSize() const { @@ -421,98 +477,6 @@ void RasterizerOpenGL::LoadDiskResources(u64 title_id, const std::atomic_bool& s shader_cache.LoadDiskCache(title_id, stop_loading, callback); } -void RasterizerOpenGL::ConfigureFramebuffers() { - MICROPROFILE_SCOPE(OpenGL_Framebuffer); - if (!maxwell3d.dirty.flags[VideoCommon::Dirty::RenderTargets]) { - return; - } - maxwell3d.dirty.flags[VideoCommon::Dirty::RenderTargets] = false; - - texture_cache.GuardRenderTargets(true); - - View depth_surface = texture_cache.GetDepthBufferSurface(true); - - const auto& regs = maxwell3d.regs; - UNIMPLEMENTED_IF(regs.rt_separate_frag_data == 0); - - // Bind the framebuffer surfaces - FramebufferCacheKey key; - const auto colors_count = static_cast<std::size_t>(regs.rt_control.count); - for (std::size_t index = 0; index < colors_count; ++index) { - View color_surface{texture_cache.GetColorBufferSurface(index, true)}; - if (!color_surface) { - continue; - } - // Assume that a surface will be written to if it is used as a framebuffer, even - // if the shader doesn't actually write to it. - texture_cache.MarkColorBufferInUse(index); - - key.SetAttachment(index, regs.rt_control.GetMap(index)); - key.colors[index] = std::move(color_surface); - } - - if (depth_surface) { - // Assume that a surface will be written to if it is used as a framebuffer, even if - // the shader doesn't actually write to it. - texture_cache.MarkDepthBufferInUse(); - key.zeta = std::move(depth_surface); - } - - texture_cache.GuardRenderTargets(false); - - glBindFramebuffer(GL_DRAW_FRAMEBUFFER, framebuffer_cache.GetFramebuffer(key)); -} - -void RasterizerOpenGL::ConfigureClearFramebuffer(bool using_color, bool using_depth_stencil) { - const auto& regs = maxwell3d.regs; - - texture_cache.GuardRenderTargets(true); - View color_surface; - - if (using_color) { - // Determine if we have to preserve the contents. - // First we have to make sure all clear masks are enabled. - bool preserve_contents = !regs.clear_buffers.R || !regs.clear_buffers.G || - !regs.clear_buffers.B || !regs.clear_buffers.A; - const std::size_t index = regs.clear_buffers.RT; - if (regs.clear_flags.scissor) { - // Then we have to confirm scissor testing clears the whole image. 
- const auto& scissor = regs.scissor_test[0]; - preserve_contents |= scissor.min_x > 0; - preserve_contents |= scissor.min_y > 0; - preserve_contents |= scissor.max_x < regs.rt[index].width; - preserve_contents |= scissor.max_y < regs.rt[index].height; - } - - color_surface = texture_cache.GetColorBufferSurface(index, preserve_contents); - texture_cache.MarkColorBufferInUse(index); - } - - View depth_surface; - if (using_depth_stencil) { - bool preserve_contents = false; - if (regs.clear_flags.scissor) { - // For depth stencil clears we only have to confirm scissor test covers the whole image. - const auto& scissor = regs.scissor_test[0]; - preserve_contents |= scissor.min_x > 0; - preserve_contents |= scissor.min_y > 0; - preserve_contents |= scissor.max_x < regs.zeta_width; - preserve_contents |= scissor.max_y < regs.zeta_height; - } - - depth_surface = texture_cache.GetDepthBufferSurface(preserve_contents); - texture_cache.MarkDepthBufferInUse(); - } - texture_cache.GuardRenderTargets(false); - - FramebufferCacheKey key; - key.colors[0] = std::move(color_surface); - key.zeta = std::move(depth_surface); - - state_tracker.NotifyFramebuffer(); - glBindFramebuffer(GL_DRAW_FRAMEBUFFER, framebuffer_cache.GetFramebuffer(key)); -} - void RasterizerOpenGL::Clear() { if (!maxwell3d.ShouldExecute()) { return; @@ -527,8 +491,9 @@ void RasterizerOpenGL::Clear() { regs.clear_buffers.A) { use_color = true; - state_tracker.NotifyColorMask0(); - glColorMaski(0, regs.clear_buffers.R != 0, regs.clear_buffers.G != 0, + const GLuint index = regs.clear_buffers.RT; + state_tracker.NotifyColorMask(index); + glColorMaski(index, regs.clear_buffers.R != 0, regs.clear_buffers.G != 0, regs.clear_buffers.B != 0, regs.clear_buffers.A != 0); // TODO(Rodrigo): Determine if clamping is used on clears @@ -561,15 +526,17 @@ void RasterizerOpenGL::Clear() { state_tracker.NotifyScissor0(); glDisablei(GL_SCISSOR_TEST, 0); } - UNIMPLEMENTED_IF(regs.clear_flags.viewport); - ConfigureClearFramebuffer(use_color, use_depth || use_stencil); + { + auto lock = texture_cache.AcquireLock(); + texture_cache.UpdateRenderTargets(true); + state_tracker.BindFramebuffer(texture_cache.GetFramebuffer()->Handle()); + } if (use_color) { - glClearBufferfv(GL_COLOR, 0, regs.clear_color); + glClearBufferfv(GL_COLOR, regs.clear_buffers.RT, regs.clear_color); } - if (use_depth && use_stencil) { glClearBufferfi(GL_DEPTH_STENCIL, 0, regs.clear_depth, regs.clear_stencil); } else if (use_depth) { @@ -626,16 +593,7 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); // Prepare the vertex array. - const bool invalidated = buffer_cache.Map(buffer_size); - - if (invalidated) { - // When the stream buffer has been invalidated, we have to consider vertex buffers as dirty - auto& dirty = maxwell3d.dirty.flags; - dirty[Dirty::VertexBuffers] = true; - for (int index = Dirty::VertexBuffer0; index <= Dirty::VertexBuffer31; ++index) { - dirty[index] = true; - } - } + buffer_cache.Map(buffer_size); // Prepare vertex array format. SetupVertexFormat(); @@ -659,22 +617,16 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { } // Setup shaders and their used resources. 
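// As of this change, shader setup runs under the texture cache lock acquired in Draw(): SetupShaders() only gathers sampler handles and image-view indices, and FillGraphicsImageViews()/BindTextures() later resolve and bind them in a single batch instead of binding one texture unit at a time.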
- texture_cache.GuardSamplers(true); - const GLenum primitive_mode = MaxwellToGL::PrimitiveTopology(maxwell3d.regs.draw.topology); - SetupShaders(primitive_mode); - texture_cache.GuardSamplers(false); - - ConfigureFramebuffers(); + auto lock = texture_cache.AcquireLock(); + SetupShaders(); // Signal the buffer cache that we are not going to upload more things. buffer_cache.Unmap(); - + texture_cache.UpdateRenderTargets(false); + state_tracker.BindFramebuffer(texture_cache.GetFramebuffer()->Handle()); program_manager.BindGraphicsPipeline(); - if (texture_cache.TextureBarrier()) { - glTextureBarrier(); - } - + const GLenum primitive_mode = MaxwellToGL::PrimitiveTopology(maxwell3d.regs.draw.topology); BeginTransformFeedback(primitive_mode); const GLuint base_instance = static_cast<GLuint>(maxwell3d.regs.vb_base_instance); @@ -726,15 +678,13 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { buffer_cache.Acquire(); current_cbuf = 0; - auto kernel = shader_cache.GetComputeKernel(code_addr); - program_manager.BindCompute(kernel->GetHandle()); + Shader* const kernel = shader_cache.GetComputeKernel(code_addr); - SetupComputeTextures(kernel); - SetupComputeImages(kernel); + auto lock = texture_cache.AcquireLock(); + BindComputeTextures(kernel); - const std::size_t buffer_size = - Tegra::Engines::KeplerCompute::NumConstBuffers * - (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); + const size_t buffer_size = Tegra::Engines::KeplerCompute::NumConstBuffers * + (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); buffer_cache.Map(buffer_size); SetupComputeConstBuffers(kernel); @@ -743,7 +693,6 @@ void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { buffer_cache.Unmap(); const auto& launch_desc = kepler_compute.launch_description; - program_manager.BindCompute(kernel->GetHandle()); glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z); ++num_queued_commands; } @@ -764,7 +713,10 @@ void RasterizerOpenGL::FlushRegion(VAddr addr, u64 size) { if (addr == 0 || size == 0) { return; } - texture_cache.FlushRegion(addr, size); + { + auto lock = texture_cache.AcquireLock(); + texture_cache.DownloadMemory(addr, size); + } buffer_cache.FlushRegion(addr, size); query_cache.FlushRegion(addr, size); } @@ -773,7 +725,8 @@ bool RasterizerOpenGL::MustFlushRegion(VAddr addr, u64 size) { if (!Settings::IsGPULevelHigh()) { return buffer_cache.MustFlushRegion(addr, size); } - return texture_cache.MustFlushRegion(addr, size) || buffer_cache.MustFlushRegion(addr, size); + return texture_cache.IsRegionGpuModified(addr, size) || + buffer_cache.MustFlushRegion(addr, size); } void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) { @@ -781,7 +734,10 @@ void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) { if (addr == 0 || size == 0) { return; } - texture_cache.InvalidateRegion(addr, size); + { + auto lock = texture_cache.AcquireLock(); + texture_cache.WriteMemory(addr, size); + } shader_cache.InvalidateRegion(addr, size); buffer_cache.InvalidateRegion(addr, size); query_cache.InvalidateRegion(addr, size); @@ -792,18 +748,29 @@ void RasterizerOpenGL::OnCPUWrite(VAddr addr, u64 size) { if (addr == 0 || size == 0) { return; } - texture_cache.OnCPUWrite(addr, size); + { + auto lock = texture_cache.AcquireLock(); + texture_cache.WriteMemory(addr, size); + } shader_cache.OnCPUWrite(addr, size); buffer_cache.OnCPUWrite(addr, size); } void RasterizerOpenGL::SyncGuestHost() { MICROPROFILE_SCOPE(OpenGL_CacheManagement); - 
texture_cache.SyncGuestHost(); buffer_cache.SyncGuestHost(); shader_cache.SyncGuestHost(); } +void RasterizerOpenGL::UnmapMemory(VAddr addr, u64 size) { + { + auto lock = texture_cache.AcquireLock(); + texture_cache.UnmapMemory(addr, size); + } + buffer_cache.OnCPUWrite(addr, size); + shader_cache.OnCPUWrite(addr, size); +} + void RasterizerOpenGL::SignalSemaphore(GPUVAddr addr, u32 value) { if (!gpu.IsAsync()) { gpu_memory.Write<u32>(addr, value); @@ -845,6 +812,14 @@ void RasterizerOpenGL::WaitForIdle() { GL_SHADER_STORAGE_BARRIER_BIT | GL_QUERY_BUFFER_BARRIER_BIT); } +void RasterizerOpenGL::FragmentBarrier() { + glMemoryBarrier(GL_FRAMEBUFFER_BARRIER_BIT); +} + +void RasterizerOpenGL::TiledCacheBarrier() { + glTextureBarrier(); +} + void RasterizerOpenGL::FlushCommands() { // Only flush when we have commands queued to OpenGL. if (num_queued_commands == 0) { @@ -858,53 +833,103 @@ void RasterizerOpenGL::TickFrame() { // Ticking a frame means that buffers will be swapped, calling glFlush implicitly. num_queued_commands = 0; + fence_manager.TickFrame(); buffer_cache.TickFrame(); + { + auto lock = texture_cache.AcquireLock(); + texture_cache.TickFrame(); + } } -bool RasterizerOpenGL::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src, - const Tegra::Engines::Fermi2D::Regs::Surface& dst, +bool RasterizerOpenGL::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src, + const Tegra::Engines::Fermi2D::Surface& dst, const Tegra::Engines::Fermi2D::Config& copy_config) { MICROPROFILE_SCOPE(OpenGL_Blits); - texture_cache.DoFermiCopy(src, dst, copy_config); + auto lock = texture_cache.AcquireLock(); + texture_cache.BlitImage(dst, src, copy_config); return true; } bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr, u32 pixel_stride) { - if (!framebuffer_addr) { - return {}; + if (framebuffer_addr == 0) { + return false; } - MICROPROFILE_SCOPE(OpenGL_CacheManagement); - const auto surface{texture_cache.TryFindFramebufferSurface(framebuffer_addr)}; - if (!surface) { - return {}; + auto lock = texture_cache.AcquireLock(); + ImageView* const image_view{texture_cache.TryFindFramebufferImageView(framebuffer_addr)}; + if (!image_view) { + return false; } - // Verify that the cached surface is the same size and format as the requested framebuffer - const auto& params{surface->GetSurfaceParams()}; - const auto& pixel_format{ - VideoCore::Surface::PixelFormatFromGPUPixelFormat(config.pixel_format)}; - ASSERT_MSG(params.width == config.width, "Framebuffer width is different"); - ASSERT_MSG(params.height == config.height, "Framebuffer height is different"); + // ASSERT_MSG(image_view->size.width == config.width, "Framebuffer width is different"); + // ASSERT_MSG(image_view->size.height == config.height, "Framebuffer height is different"); - if (params.pixel_format != pixel_format) { - LOG_DEBUG(Render_OpenGL, "Framebuffer pixel_format is different"); - } + screen_info.display_texture = image_view->Handle(ImageViewType::e2D); + screen_info.display_srgb = VideoCore::Surface::IsPixelFormatSRGB(image_view->format); + return true; +} - screen_info.display_texture = surface->GetTexture(); - screen_info.display_srgb = surface->GetSurfaceParams().srgb_conversion; +void RasterizerOpenGL::BindComputeTextures(Shader* kernel) { + image_view_indices.clear(); + sampler_handles.clear(); - return true; + texture_cache.SynchronizeComputeDescriptors(); + + SetupComputeTextures(kernel); + SetupComputeImages(kernel); + + const std::span 
indices_span(image_view_indices.data(), image_view_indices.size()); + texture_cache.FillComputeImageViews(indices_span, image_view_ids); + + program_manager.BindCompute(kernel->GetHandle()); + size_t image_view_index = 0; + size_t texture_index = 0; + size_t image_index = 0; + BindTextures(kernel->GetEntries(), 0, 0, image_view_index, texture_index, image_index); +} + +void RasterizerOpenGL::BindTextures(const ShaderEntries& entries, GLuint base_texture, + GLuint base_image, size_t& image_view_index, + size_t& texture_index, size_t& image_index) { + const GLuint* const samplers = sampler_handles.data() + texture_index; + const GLuint* const textures = texture_handles.data() + texture_index; + const GLuint* const images = image_handles.data() + image_index; + + const size_t num_samplers = entries.samplers.size(); + for (const auto& sampler : entries.samplers) { + for (size_t i = 0; i < sampler.size; ++i) { + const ImageViewId image_view_id = image_view_ids[image_view_index++]; + const ImageView& image_view = texture_cache.GetImageView(image_view_id); + const GLuint handle = image_view.Handle(ImageViewTypeFromEntry(sampler)); + texture_handles[texture_index++] = handle; + } + } + const size_t num_images = entries.images.size(); + for (size_t unit = 0; unit < num_images; ++unit) { + // TODO: Mark as modified + const ImageViewId image_view_id = image_view_ids[image_view_index++]; + const ImageView& image_view = texture_cache.GetImageView(image_view_id); + const GLuint handle = image_view.Handle(ImageViewTypeFromEntry(entries.images[unit])); + image_handles[image_index] = handle; + ++image_index; + } + if (num_samplers > 0) { + glBindSamplers(base_texture, static_cast<GLsizei>(num_samplers), samplers); + glBindTextures(base_texture, static_cast<GLsizei>(num_samplers), textures); + } + if (num_images > 0) { + glBindImageTextures(base_image, static_cast<GLsizei>(num_images), images); + } } void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, Shader* shader) { - static constexpr std::array PARAMETER_LUT = { - GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV, + static constexpr std::array PARAMETER_LUT{ + GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_EVALUATION_PROGRAM_PARAMETER_BUFFER_NV, GL_GEOMETRY_PROGRAM_PARAMETER_BUFFER_NV, - GL_FRAGMENT_PROGRAM_PARAMETER_BUFFER_NV}; - + GL_FRAGMENT_PROGRAM_PARAMETER_BUFFER_NV, + }; MICROPROFILE_SCOPE(OpenGL_UBO); const auto& stages = maxwell3d.state.shader_stages; const auto& shader_stage = stages[stage_index]; @@ -1003,12 +1028,11 @@ void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, Shader* sh GL_VERTEX_PROGRAM_NV, GL_TESS_CONTROL_PROGRAM_NV, GL_TESS_EVALUATION_PROGRAM_NV, GL_GEOMETRY_PROGRAM_NV, GL_FRAGMENT_PROGRAM_NV, }; - const auto& cbufs{maxwell3d.state.shader_stages[stage_index]}; const auto& entries{shader->GetEntries().global_memory_entries}; - std::array<GLuint64EXT, 32> pointers; - ASSERT(entries.size() < pointers.size()); + std::array<BindlessSSBO, 32> ssbos; + ASSERT(entries.size() < ssbos.size()); const bool assembly_shaders = device.UseAssemblyShaders(); u32 binding = assembly_shaders ? 
0 : device.GetBaseBindings(stage_index).shader_storage_buffer; @@ -1016,11 +1040,11 @@ void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, Shader* sh const GPUVAddr addr{cbufs.const_buffers[entry.cbuf_index].address + entry.cbuf_offset}; const GPUVAddr gpu_addr{gpu_memory.Read<u64>(addr)}; const u32 size{gpu_memory.Read<u32>(addr + 8)}; - SetupGlobalMemory(binding, entry, gpu_addr, size, &pointers[binding]); + SetupGlobalMemory(binding, entry, gpu_addr, size, &ssbos[binding]); ++binding; } if (assembly_shaders) { - UpdateBindlessPointers(TARGET_LUT[stage_index], pointers.data(), entries.size()); + UpdateBindlessSSBOs(TARGET_LUT[stage_index], ssbos.data(), entries.size()); } } @@ -1028,106 +1052,85 @@ void RasterizerOpenGL::SetupComputeGlobalMemory(Shader* kernel) { const auto& cbufs{kepler_compute.launch_description.const_buffer_config}; const auto& entries{kernel->GetEntries().global_memory_entries}; - std::array<GLuint64EXT, 32> pointers; - ASSERT(entries.size() < pointers.size()); + std::array<BindlessSSBO, 32> ssbos; + ASSERT(entries.size() < ssbos.size()); u32 binding = 0; for (const auto& entry : entries) { const GPUVAddr addr{cbufs[entry.cbuf_index].Address() + entry.cbuf_offset}; const GPUVAddr gpu_addr{gpu_memory.Read<u64>(addr)}; const u32 size{gpu_memory.Read<u32>(addr + 8)}; - SetupGlobalMemory(binding, entry, gpu_addr, size, &pointers[binding]); + SetupGlobalMemory(binding, entry, gpu_addr, size, &ssbos[binding]); ++binding; } if (device.UseAssemblyShaders()) { - UpdateBindlessPointers(GL_COMPUTE_PROGRAM_NV, pointers.data(), entries.size()); + UpdateBindlessSSBOs(GL_COMPUTE_PROGRAM_NV, ssbos.data(), ssbos.size()); } } void RasterizerOpenGL::SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry, - GPUVAddr gpu_addr, std::size_t size, - GLuint64EXT* pointer) { - const std::size_t alignment{device.GetShaderStorageBufferAlignment()}; + GPUVAddr gpu_addr, size_t size, BindlessSSBO* ssbo) { + const size_t alignment{device.GetShaderStorageBufferAlignment()}; const auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.is_written); if (device.UseAssemblyShaders()) { - *pointer = info.address + info.offset; + *ssbo = BindlessSSBO{ + .address = static_cast<GLuint64EXT>(info.address + info.offset), + .length = static_cast<GLsizei>(size), + .padding = 0, + }; } else { glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, info.handle, info.offset, static_cast<GLsizeiptr>(size)); } } -void RasterizerOpenGL::SetupDrawTextures(std::size_t stage_index, Shader* shader) { - MICROPROFILE_SCOPE(OpenGL_Texture); - u32 binding = device.GetBaseBindings(stage_index).sampler; +void RasterizerOpenGL::SetupDrawTextures(const Shader* shader, size_t stage_index) { + const bool via_header_index = + maxwell3d.regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex; for (const auto& entry : shader->GetEntries().samplers) { const auto shader_type = static_cast<ShaderType>(stage_index); - for (std::size_t i = 0; i < entry.size; ++i) { - const auto texture = GetTextureInfo(maxwell3d, entry, shader_type, i); - SetupTexture(binding++, texture, entry); + for (size_t index = 0; index < entry.size; ++index) { + const auto handle = + GetTextureInfo(maxwell3d, via_header_index, entry, shader_type, index); + const Sampler* const sampler = texture_cache.GetGraphicsSampler(handle.sampler); + sampler_handles.push_back(sampler->Handle()); + image_view_indices.push_back(handle.image); } } } -void RasterizerOpenGL::SetupComputeTextures(Shader* kernel) { - 
MICROPROFILE_SCOPE(OpenGL_Texture); - u32 binding = 0; +void RasterizerOpenGL::SetupComputeTextures(const Shader* kernel) { + const bool via_header_index = kepler_compute.launch_description.linked_tsc; for (const auto& entry : kernel->GetEntries().samplers) { - for (std::size_t i = 0; i < entry.size; ++i) { - const auto texture = GetTextureInfo(kepler_compute, entry, ShaderType::Compute, i); - SetupTexture(binding++, texture, entry); + for (size_t i = 0; i < entry.size; ++i) { + const auto handle = + GetTextureInfo(kepler_compute, via_header_index, entry, ShaderType::Compute, i); + const Sampler* const sampler = texture_cache.GetComputeSampler(handle.sampler); + sampler_handles.push_back(sampler->Handle()); + image_view_indices.push_back(handle.image); } } } -void RasterizerOpenGL::SetupTexture(u32 binding, const Tegra::Texture::FullTextureInfo& texture, - const SamplerEntry& entry) { - const auto view = texture_cache.GetTextureSurface(texture.tic, entry); - if (!view) { - // Can occur when texture addr is null or its memory is unmapped/invalid - glBindSampler(binding, 0); - glBindTextureUnit(binding, 0); - return; - } - const GLuint handle = view->GetTexture(texture.tic.x_source, texture.tic.y_source, - texture.tic.z_source, texture.tic.w_source); - glBindTextureUnit(binding, handle); - if (!view->GetSurfaceParams().IsBuffer()) { - glBindSampler(binding, sampler_cache.GetSampler(texture.tsc)); - } -} - -void RasterizerOpenGL::SetupDrawImages(std::size_t stage_index, Shader* shader) { - u32 binding = device.GetBaseBindings(stage_index).image; +void RasterizerOpenGL::SetupDrawImages(const Shader* shader, size_t stage_index) { + const bool via_header_index = + maxwell3d.regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex; for (const auto& entry : shader->GetEntries().images) { const auto shader_type = static_cast<ShaderType>(stage_index); - const auto tic = GetTextureInfo(maxwell3d, entry, shader_type).tic; - SetupImage(binding++, tic, entry); + const auto handle = GetTextureInfo(maxwell3d, via_header_index, entry, shader_type); + image_view_indices.push_back(handle.image); } } -void RasterizerOpenGL::SetupComputeImages(Shader* shader) { - u32 binding = 0; +void RasterizerOpenGL::SetupComputeImages(const Shader* shader) { + const bool via_header_index = kepler_compute.launch_description.linked_tsc; for (const auto& entry : shader->GetEntries().images) { - const auto tic = GetTextureInfo(kepler_compute, entry, ShaderType::Compute).tic; - SetupImage(binding++, tic, entry); + const auto handle = + GetTextureInfo(kepler_compute, via_header_index, entry, ShaderType::Compute); + image_view_indices.push_back(handle.image); } } -void RasterizerOpenGL::SetupImage(u32 binding, const Tegra::Texture::TICEntry& tic, - const ImageEntry& entry) { - const auto view = texture_cache.GetImageSurface(tic, entry); - if (!view) { - glBindImageTexture(binding, 0, 0, GL_FALSE, 0, GL_READ_ONLY, GL_R8); - return; - } - if (entry.is_written) { - view->MarkAsModified(texture_cache.Tick()); - } - const GLuint handle = view->GetTexture(tic.x_source, tic.y_source, tic.z_source, tic.w_source); - glBindImageTexture(binding, handle, 0, GL_TRUE, 0, GL_READ_WRITE, view->GetFormat()); -} - void RasterizerOpenGL::SyncViewport() { auto& flags = maxwell3d.dirty.flags; const auto& regs = maxwell3d.regs; @@ -1157,7 +1160,7 @@ void RasterizerOpenGL::SyncViewport() { flags[Dirty::ClipControl] = false; bool flip_y = false; - if (regs.viewport_transform[0].scale_y < 0.0) { + if (regs.viewport_transform[0].scale_y < 0.0f) { 
flip_y = !flip_y; } if (regs.screen_y_control.y_negate != 0) { @@ -1527,17 +1530,9 @@ void RasterizerOpenGL::SyncPointState() { flags[Dirty::PointSize] = false; oglEnable(GL_POINT_SPRITE, maxwell3d.regs.point_sprite_enable); + oglEnable(GL_PROGRAM_POINT_SIZE, maxwell3d.regs.vp_point_size.enable); - if (maxwell3d.regs.vp_point_size.enable) { - // By definition of GL_POINT_SIZE, it only matters if GL_PROGRAM_POINT_SIZE is disabled. - glEnable(GL_PROGRAM_POINT_SIZE); - return; - } - - // Limit the point size to 1 since nouveau sometimes sets a point size of 0 (and that's invalid - // in OpenGL). glPointSize(std::max(1.0f, maxwell3d.regs.point_size)); - glDisable(GL_PROGRAM_POINT_SIZE); } void RasterizerOpenGL::SyncLineState() { @@ -1580,10 +1575,6 @@ void RasterizerOpenGL::SyncAlphaTest() { flags[Dirty::AlphaTest] = false; const auto& regs = maxwell3d.regs; - if (regs.alpha_test_enabled && regs.rt_control.count > 1) { - LOG_WARNING(Render_OpenGL, "Alpha testing with more than one render target is not tested"); - } - if (regs.alpha_test_enabled) { glEnable(GL_ALPHA_TEST); glAlphaFunc(MaxwellToGL::ComparisonOp(regs.alpha_test_func), regs.alpha_test_ref); diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index f451404b2..82e03e677 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -7,12 +7,13 @@ #include <array> #include <atomic> #include <cstddef> -#include <map> #include <memory> #include <optional> #include <tuple> #include <utility> +#include <boost/container/static_vector.hpp> + #include <glad/glad.h> #include "common/common_types.h" @@ -23,16 +24,14 @@ #include "video_core/renderer_opengl/gl_buffer_cache.h" #include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_fence_manager.h" -#include "video_core/renderer_opengl/gl_framebuffer_cache.h" #include "video_core/renderer_opengl/gl_query_cache.h" #include "video_core/renderer_opengl/gl_resource_manager.h" -#include "video_core/renderer_opengl/gl_sampler_cache.h" #include "video_core/renderer_opengl/gl_shader_cache.h" #include "video_core/renderer_opengl/gl_shader_decompiler.h" #include "video_core/renderer_opengl/gl_shader_manager.h" #include "video_core/renderer_opengl/gl_state_tracker.h" +#include "video_core/renderer_opengl/gl_stream_buffer.h" #include "video_core/renderer_opengl/gl_texture_cache.h" -#include "video_core/renderer_opengl/utils.h" #include "video_core/shader/async_shaders.h" #include "video_core/textures/texture.h" @@ -51,14 +50,21 @@ class MemoryManager; namespace OpenGL { struct ScreenInfo; -struct DrawParameters; +struct ShaderEntries; + +struct BindlessSSBO { + GLuint64EXT address; + GLsizei length; + GLsizei padding; +}; +static_assert(sizeof(BindlessSSBO) * CHAR_BIT == 128); class RasterizerOpenGL : public VideoCore::RasterizerAccelerated { public: - explicit RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window, Tegra::GPU& gpu, - Core::Memory::Memory& cpu_memory, const Device& device, - ScreenInfo& screen_info, ProgramManager& program_manager, - StateTracker& state_tracker); + explicit RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window_, Tegra::GPU& gpu_, + Core::Memory::Memory& cpu_memory_, const Device& device_, + ScreenInfo& screen_info_, ProgramManager& program_manager_, + StateTracker& state_tracker_); ~RasterizerOpenGL() override; void Draw(bool is_indexed, bool is_instanced) override; @@ -72,15 +78,18 @@ public: void InvalidateRegion(VAddr addr, u64 
size) override; void OnCPUWrite(VAddr addr, u64 size) override; void SyncGuestHost() override; + void UnmapMemory(VAddr addr, u64 size) override; void SignalSemaphore(GPUVAddr addr, u32 value) override; void SignalSyncPoint(u32 value) override; void ReleaseFences() override; void FlushAndInvalidateRegion(VAddr addr, u64 size) override; void WaitForIdle() override; + void FragmentBarrier() override; + void TiledCacheBarrier() override; void FlushCommands() override; void TickFrame() override; - bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src, - const Tegra::Engines::Fermi2D::Regs::Surface& dst, + bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src, + const Tegra::Engines::Fermi2D::Surface& dst, const Tegra::Engines::Fermi2D::Config& copy_config) override; bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr, u32 pixel_stride) override; @@ -101,11 +110,14 @@ public: } private: - /// Configures the color and depth framebuffer states. - void ConfigureFramebuffers(); + static constexpr size_t MAX_TEXTURES = 192; + static constexpr size_t MAX_IMAGES = 48; + static constexpr size_t MAX_IMAGE_VIEWS = MAX_TEXTURES + MAX_IMAGES; + + void BindComputeTextures(Shader* kernel); - /// Configures the color and depth framebuffer for clearing. - void ConfigureClearFramebuffer(bool using_color, bool using_depth_stencil); + void BindTextures(const ShaderEntries& entries, GLuint base_texture, GLuint base_image, + size_t& image_view_index, size_t& texture_index, size_t& image_index); /// Configures the current constbuffers to use for the draw command. void SetupDrawConstBuffers(std::size_t stage_index, Shader* shader); @@ -126,26 +138,19 @@ private: /// Configures a global memory buffer. void SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry, GPUVAddr gpu_addr, - std::size_t size, GLuint64EXT* pointer); + size_t size, BindlessSSBO* ssbo); /// Configures the current textures to use for the draw command. - void SetupDrawTextures(std::size_t stage_index, Shader* shader); + void SetupDrawTextures(const Shader* shader, size_t stage_index); /// Configures the textures used in a compute shader. - void SetupComputeTextures(Shader* kernel); - - /// Configures a texture. - void SetupTexture(u32 binding, const Tegra::Texture::FullTextureInfo& texture, - const SamplerEntry& entry); + void SetupComputeTextures(const Shader* kernel); /// Configures images in a graphics shader. - void SetupDrawImages(std::size_t stage_index, Shader* shader); + void SetupDrawImages(const Shader* shader, size_t stage_index); /// Configures images in a compute shader. - void SetupComputeImages(Shader* shader); - - /// Configures an image. 
- void SetupImage(u32 binding, const Tegra::Texture::TICEntry& tic, const ImageEntry& entry); + void SetupComputeImages(const Shader* shader); /// Syncs the viewport and depth range to match the guest state void SyncViewport(); @@ -220,9 +225,6 @@ private: /// End a transform feedback void EndTransformFeedback(); - /// Check for extension that are not strictly required but are needed for correct emulation - void CheckExtensions(); - std::size_t CalculateVertexArraysSize() const; std::size_t CalculateIndexBufferSize() const; @@ -235,7 +237,7 @@ private: GLintptr SetupIndexBuffer(); - void SetupShaders(GLenum primitive_mode); + void SetupShaders(); Tegra::GPU& gpu; Tegra::Engines::Maxwell3D& maxwell3d; @@ -247,19 +249,21 @@ private: ProgramManager& program_manager; StateTracker& state_tracker; - TextureCacheOpenGL texture_cache; + OGLStreamBuffer stream_buffer; + TextureCacheRuntime texture_cache_runtime; + TextureCache texture_cache; ShaderCacheOpenGL shader_cache; - SamplerCacheOpenGL sampler_cache; - FramebufferCacheOpenGL framebuffer_cache; QueryCache query_cache; OGLBufferCache buffer_cache; FenceManagerOpenGL fence_manager; VideoCommon::Shader::AsyncShaders async_shaders; - static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024; - - GLint vertex_binding = 0; + boost::container::static_vector<u32, MAX_IMAGE_VIEWS> image_view_indices; + std::array<ImageViewId, MAX_IMAGE_VIEWS> image_view_ids; + boost::container::static_vector<GLuint, MAX_TEXTURES> sampler_handles; + std::array<GLuint, MAX_TEXTURES> texture_handles; + std::array<GLuint, MAX_IMAGES> image_handles; std::array<OGLBuffer, Tegra::Engines::Maxwell3D::Regs::NumTransformFeedbackBuffers> transform_feedback_buffers; @@ -273,7 +277,7 @@ private: std::size_t current_cbuf = 0; OGLBuffer unified_uniform_buffer; - /// Number of commands queued to the OpenGL driver. Reseted on flush. + /// Number of commands queued to the OpenGL driver. Reset on flush. std::size_t num_queued_commands = 0; u32 last_clip_distance_mask = 0; diff --git a/src/video_core/renderer_opengl/gl_resource_manager.cpp b/src/video_core/renderer_opengl/gl_resource_manager.cpp index 0ebcec427..0e34a0f20 100644 --- a/src/video_core/renderer_opengl/gl_resource_manager.cpp +++ b/src/video_core/renderer_opengl/gl_resource_manager.cpp @@ -71,7 +71,7 @@ void OGLSampler::Create() { return; MICROPROFILE_SCOPE(OpenGL_ResourceCreation); - glGenSamplers(1, &handle); + glCreateSamplers(1, &handle); } void OGLSampler::Release() { diff --git a/src/video_core/renderer_opengl/gl_sampler_cache.cpp b/src/video_core/renderer_opengl/gl_sampler_cache.cpp deleted file mode 100644 index 5c174879a..000000000 --- a/src/video_core/renderer_opengl/gl_sampler_cache.cpp +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included.
- -#include "common/logging/log.h" -#include "video_core/renderer_opengl/gl_resource_manager.h" -#include "video_core/renderer_opengl/gl_sampler_cache.h" -#include "video_core/renderer_opengl/maxwell_to_gl.h" - -namespace OpenGL { - -SamplerCacheOpenGL::SamplerCacheOpenGL() = default; - -SamplerCacheOpenGL::~SamplerCacheOpenGL() = default; - -OGLSampler SamplerCacheOpenGL::CreateSampler(const Tegra::Texture::TSCEntry& tsc) const { - OGLSampler sampler; - sampler.Create(); - - const GLuint sampler_id{sampler.handle}; - glSamplerParameteri( - sampler_id, GL_TEXTURE_MAG_FILTER, - MaxwellToGL::TextureFilterMode(tsc.mag_filter, Tegra::Texture::TextureMipmapFilter::None)); - glSamplerParameteri(sampler_id, GL_TEXTURE_MIN_FILTER, - MaxwellToGL::TextureFilterMode(tsc.min_filter, tsc.mipmap_filter)); - glSamplerParameteri(sampler_id, GL_TEXTURE_WRAP_S, MaxwellToGL::WrapMode(tsc.wrap_u)); - glSamplerParameteri(sampler_id, GL_TEXTURE_WRAP_T, MaxwellToGL::WrapMode(tsc.wrap_v)); - glSamplerParameteri(sampler_id, GL_TEXTURE_WRAP_R, MaxwellToGL::WrapMode(tsc.wrap_p)); - glSamplerParameteri(sampler_id, GL_TEXTURE_COMPARE_MODE, - tsc.depth_compare_enabled == 1 ? GL_COMPARE_REF_TO_TEXTURE : GL_NONE); - glSamplerParameteri(sampler_id, GL_TEXTURE_COMPARE_FUNC, - MaxwellToGL::DepthCompareFunc(tsc.depth_compare_func)); - glSamplerParameterfv(sampler_id, GL_TEXTURE_BORDER_COLOR, tsc.GetBorderColor().data()); - glSamplerParameterf(sampler_id, GL_TEXTURE_MIN_LOD, tsc.GetMinLod()); - glSamplerParameterf(sampler_id, GL_TEXTURE_MAX_LOD, tsc.GetMaxLod()); - glSamplerParameterf(sampler_id, GL_TEXTURE_LOD_BIAS, tsc.GetLodBias()); - if (GLAD_GL_ARB_texture_filter_anisotropic) { - glSamplerParameterf(sampler_id, GL_TEXTURE_MAX_ANISOTROPY, tsc.GetMaxAnisotropy()); - } else if (GLAD_GL_EXT_texture_filter_anisotropic) { - glSamplerParameterf(sampler_id, GL_TEXTURE_MAX_ANISOTROPY_EXT, tsc.GetMaxAnisotropy()); - } else { - LOG_WARNING(Render_OpenGL, "Anisotropy not supported by host GPU driver"); - } - - return sampler; -} - -GLuint SamplerCacheOpenGL::ToSamplerType(const OGLSampler& sampler) const { - return sampler.handle; -} - -} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_sampler_cache.h b/src/video_core/renderer_opengl/gl_sampler_cache.h deleted file mode 100644 index 34ee37f00..000000000 --- a/src/video_core/renderer_opengl/gl_sampler_cache.h +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. 
- -#pragma once - -#include <glad/glad.h> - -#include "video_core/renderer_opengl/gl_resource_manager.h" -#include "video_core/sampler_cache.h" - -namespace OpenGL { - -class SamplerCacheOpenGL final : public VideoCommon::SamplerCache<GLuint, OGLSampler> { -public: - explicit SamplerCacheOpenGL(); - ~SamplerCacheOpenGL(); - -protected: - OGLSampler CreateSampler(const Tegra::Texture::TSCEntry& tsc) const override; - - GLuint ToSamplerType(const OGLSampler& sampler) const override; -}; - -} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index bd56bed0c..d4841fdb7 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -27,7 +27,6 @@ #include "video_core/renderer_opengl/gl_shader_decompiler.h" #include "video_core/renderer_opengl/gl_shader_disk_cache.h" #include "video_core/renderer_opengl/gl_state_tracker.h" -#include "video_core/renderer_opengl/utils.h" #include "video_core/shader/memory_util.h" #include "video_core/shader/registry.h" #include "video_core/shader/shader_ir.h" @@ -198,10 +197,10 @@ ProgramSharedPtr BuildShader(const Device& device, ShaderType shader_type, u64 u return program; } -Shader::Shader(std::shared_ptr<VideoCommon::Shader::Registry> registry_, ShaderEntries entries_, - ProgramSharedPtr program_, bool is_built) +Shader::Shader(std::shared_ptr<Registry> registry_, ShaderEntries entries_, + ProgramSharedPtr program_, bool is_built_) : registry{std::move(registry_)}, entries{std::move(entries_)}, program{std::move(program_)}, - is_built(is_built) { + is_built{is_built_} { handle = program->assembly_program.handle; if (handle == 0) { handle = program->source_program.handle; @@ -318,14 +317,13 @@ std::unique_ptr<Shader> Shader::CreateFromCache(const ShaderParameters& params, precompiled_shader.registry, precompiled_shader.entries, precompiled_shader.program)); } -ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, +ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer_, Core::Frontend::EmuWindow& emu_window_, Tegra::GPU& gpu_, Tegra::Engines::Maxwell3D& maxwell3d_, Tegra::Engines::KeplerCompute& kepler_compute_, Tegra::MemoryManager& gpu_memory_, const Device& device_) - : VideoCommon::ShaderCache<Shader>{rasterizer}, emu_window{emu_window_}, gpu{gpu_}, - gpu_memory{gpu_memory_}, maxwell3d{maxwell3d_}, - kepler_compute{kepler_compute_}, device{device_} {} + : ShaderCache{rasterizer_}, emu_window{emu_window_}, gpu{gpu_}, gpu_memory{gpu_memory_}, + maxwell3d{maxwell3d_}, kepler_compute{kepler_compute_}, device{device_} {} ShaderCacheOpenGL::~ShaderCacheOpenGL() = default; @@ -460,7 +458,7 @@ void ShaderCacheOpenGL::LoadDiskCache(u64 title_id, const std::atomic_bool& stop ProgramSharedPtr ShaderCacheOpenGL::GeneratePrecompiledProgram( const ShaderDiskCacheEntry& entry, const ShaderDiskCachePrecompiled& precompiled_entry, const std::unordered_set<GLenum>& supported_formats) { - if (supported_formats.find(precompiled_entry.binary_format) == supported_formats.end()) { + if (!supported_formats.contains(precompiled_entry.binary_format)) { LOG_INFO(Render_OpenGL, "Precompiled cache entry with unsupported format, removing"); return {}; } diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h index 1708af06a..2aed0697e 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_cache.h @@ -108,7 
+108,7 @@ public: private: explicit Shader(std::shared_ptr<VideoCommon::Shader::Registry> registry, ShaderEntries entries, - ProgramSharedPtr program, bool is_built = true); + ProgramSharedPtr program, bool is_built_ = true); std::shared_ptr<VideoCommon::Shader::Registry> registry; ShaderEntries entries; @@ -119,10 +119,11 @@ private: class ShaderCacheOpenGL final : public VideoCommon::ShaderCache<Shader> { public: - explicit ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::Frontend::EmuWindow& emu_window, - Tegra::GPU& gpu, Tegra::Engines::Maxwell3D& maxwell3d, - Tegra::Engines::KeplerCompute& kepler_compute, - Tegra::MemoryManager& gpu_memory, const Device& device); + explicit ShaderCacheOpenGL(RasterizerOpenGL& rasterizer_, + Core::Frontend::EmuWindow& emu_window_, Tegra::GPU& gpu, + Tegra::Engines::Maxwell3D& maxwell3d_, + Tegra::Engines::KeplerCompute& kepler_compute_, + Tegra::MemoryManager& gpu_memory_, const Device& device_); ~ShaderCacheOpenGL() override; /// Loads disk cache for the current game diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index bbb8fb095..2e1fa252d 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -38,11 +38,9 @@ using Tegra::Shader::IpaSampleMode; using Tegra::Shader::PixelImap; using Tegra::Shader::Register; using Tegra::Shader::TextureType; -using VideoCommon::Shader::BuildTransformFeedback; -using VideoCommon::Shader::Registry; -using namespace std::string_literals; using namespace VideoCommon::Shader; +using namespace std::string_literals; using Maxwell = Tegra::Engines::Maxwell3D::Regs; using Operation = const OperationNode&; @@ -131,7 +129,7 @@ private: class Expression final { public: - Expression(std::string code, Type type) : code{std::move(code)}, type{type} { + Expression(std::string code_, Type type_) : code{std::move(code_)}, type{type_} { ASSERT(type != Type::Void); } Expression() : type{Type::Void} {} @@ -148,8 +146,8 @@ public: ASSERT(type == Type::Void); } - std::string As(Type type) const { - switch (type) { + std::string As(Type type_) const { + switch (type_) { case Type::Bool: return AsBool(); case Type::Bool2: @@ -316,7 +314,7 @@ std::pair<const char*, u32> GetPrimitiveDescription(Maxwell::PrimitiveTopology t case Maxwell::PrimitiveTopology::TriangleStripAdjacency: return {"triangles_adjacency", 6}; default: - UNIMPLEMENTED_MSG("topology={}", static_cast<int>(topology)); + UNIMPLEMENTED_MSG("topology={}", topology); return {"points", 1}; } } @@ -342,7 +340,7 @@ std::string GetTopologyName(Tegra::Shader::OutputTopology topology) { case Tegra::Shader::OutputTopology::TriangleStrip: return "triangle_strip"; default: - UNIMPLEMENTED_MSG("Unknown output topology: {}", static_cast<u32>(topology)); + UNIMPLEMENTED_MSG("Unknown output topology: {}", topology); return "points"; } } @@ -418,11 +416,12 @@ struct GenericVaryingDescription { class GLSLDecompiler final { public: - explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry, - ShaderType stage, std::string_view identifier, std::string_view suffix) - : device{device}, ir{ir}, registry{registry}, stage{stage}, identifier{identifier}, - suffix{suffix}, header{ir.GetHeader()}, use_unified_uniforms{ - UseUnifiedUniforms(device, ir, stage)} { + explicit GLSLDecompiler(const Device& device_, const ShaderIR& ir_, const Registry& registry_, + ShaderType stage_, std::string_view identifier_, + 
std::string_view suffix_) + : device{device_}, ir{ir_}, registry{registry_}, stage{stage_}, identifier{identifier_}, + suffix{suffix_}, header{ir.GetHeader()}, use_unified_uniforms{ + UseUnifiedUniforms(device_, ir_, stage_)} { if (stage != ShaderType::Compute) { transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo()); } @@ -744,7 +743,7 @@ private: case PixelImap::Unused: break; } - UNIMPLEMENTED_MSG("Unknown attribute usage index={}", static_cast<int>(attribute)); + UNIMPLEMENTED_MSG("Unknown attribute usage index={}", attribute); return {}; } @@ -777,16 +776,16 @@ private: name = "gs_" + name + "[]"; } - std::string suffix; + std::string suffix_; if (stage == ShaderType::Fragment) { const auto input_mode{header.ps.GetPixelImap(location)}; if (input_mode == PixelImap::Unused) { return; } - suffix = GetInputFlags(input_mode); + suffix_ = GetInputFlags(input_mode); } - code.AddLine("layout (location = {}) {} in vec4 {};", location, suffix, name); + code.AddLine("layout (location = {}) {} in vec4 {};", location, suffix_, name); } void DeclareOutputAttributes() { @@ -877,7 +876,7 @@ private: } u32 binding = device.GetBaseBindings(stage).uniform_buffer; - for (const auto [index, info] : ir.GetConstantBuffers()) { + for (const auto& [index, info] : ir.GetConstantBuffers()) { const u32 num_elements = Common::AlignUp(info.GetSize(), 4) / 4; const u32 size = info.IsIndirect() ? MAX_CONSTBUFFER_ELEMENTS : num_elements; code.AddLine("layout (std140, binding = {}) uniform {} {{", binding++, @@ -1251,7 +1250,7 @@ private: } break; } - UNIMPLEMENTED_MSG("Unhandled input attribute: {}", static_cast<u32>(attribute)); + UNIMPLEMENTED_MSG("Unhandled input attribute: {}", attribute); return {"0", Type::Int}; } @@ -1331,7 +1330,7 @@ private: GetSwizzle(element)), Type::Float}}; } - UNIMPLEMENTED_MSG("Unhandled output attribute: {}", static_cast<u32>(attribute)); + UNIMPLEMENTED_MSG("Unhandled output attribute: {}", attribute); return std::nullopt; } } @@ -2056,15 +2055,19 @@ private: } Expression Texture(Operation operation) { - const auto meta = std::get_if<MetaTexture>(&operation.GetMeta()); - ASSERT(meta); - - std::string expr = GenerateTexture( - operation, "", {TextureOffset{}, TextureArgument{Type::Float, meta->bias}}); - if (meta->sampler.is_shadow) { - expr = "vec4(" + expr + ')'; + const auto meta = std::get<MetaTexture>(operation.GetMeta()); + const bool separate_dc = meta.sampler.type == TextureType::TextureCube && + meta.sampler.is_array && meta.sampler.is_shadow; + // TODO: Replace this with an array and make GenerateTexture use C++20 std::span + const std::vector<TextureIR> extras{ + TextureOffset{}, + TextureArgument{Type::Float, meta.bias}, + }; + std::string expr = GenerateTexture(operation, "", extras, separate_dc); + if (meta.sampler.is_shadow) { + expr = fmt::format("vec4({})", expr); } - return {expr + GetSwizzle(meta->element), Type::Float}; + return {expr + GetSwizzle(meta.element), Type::Float}; } Expression TextureLod(Operation operation) { @@ -2096,13 +2099,13 @@ private: const auto type = meta.sampler.is_shadow ? 
Type::Float : Type::Int; const bool separate_dc = meta.sampler.is_shadow; - std::vector<TextureIR> ir; + std::vector<TextureIR> ir_; if (meta.sampler.is_shadow) { - ir = {TextureOffset{}}; + ir_ = {TextureOffset{}}; } else { - ir = {TextureOffset{}, TextureArgument{type, meta.component}}; + ir_ = {TextureOffset{}, TextureArgument{type, meta.component}}; } - return {GenerateTexture(operation, "Gather", ir, separate_dc) + GetSwizzle(meta.element), + return {GenerateTexture(operation, "Gather", ir_, separate_dc) + GetSwizzle(meta.element), Type::Float}; } @@ -2748,11 +2751,11 @@ private: } } - std::string GetSampler(const Sampler& sampler) const { + std::string GetSampler(const SamplerEntry& sampler) const { return AppendSuffix(sampler.index, "sampler"); } - std::string GetImage(const Image& image) const { + std::string GetImage(const ImageEntry& image) const { return AppendSuffix(image.index, "image"); } @@ -2797,7 +2800,7 @@ std::string GetFlowVariable(u32 index) { class ExprDecompiler { public: - explicit ExprDecompiler(GLSLDecompiler& decomp) : decomp{decomp} {} + explicit ExprDecompiler(GLSLDecompiler& decomp_) : decomp{decomp_} {} void operator()(const ExprAnd& expr) { inner += '('; @@ -2852,7 +2855,7 @@ private: class ASTDecompiler { public: - explicit ASTDecompiler(GLSLDecompiler& decomp) : decomp{decomp} {} + explicit ASTDecompiler(GLSLDecompiler& decomp_) : decomp{decomp_} {} void operator()(const ASTProgram& ast) { ASTNode current = ast.nodes.GetFirst(); diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.h b/src/video_core/renderer_opengl/gl_shader_decompiler.h index 451c9689a..be68994bb 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.h +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h @@ -20,13 +20,13 @@ namespace OpenGL { class Device; using Maxwell = Tegra::Engines::Maxwell3D::Regs; -using SamplerEntry = VideoCommon::Shader::Sampler; -using ImageEntry = VideoCommon::Shader::Image; +using SamplerEntry = VideoCommon::Shader::SamplerEntry; +using ImageEntry = VideoCommon::Shader::ImageEntry; class ConstBufferEntry : public VideoCommon::Shader::ConstBuffer { public: - explicit ConstBufferEntry(u32 max_offset, bool is_indirect, u32 index) - : VideoCommon::Shader::ConstBuffer{max_offset, is_indirect}, index{index} {} + explicit ConstBufferEntry(u32 max_offset_, bool is_indirect_, u32 index_) + : ConstBuffer{max_offset_, is_indirect_}, index{index_} {} u32 GetIndex() const { return index; @@ -37,10 +37,10 @@ private: }; struct GlobalMemoryEntry { - constexpr explicit GlobalMemoryEntry(u32 cbuf_index, u32 cbuf_offset, bool is_read, - bool is_written) - : cbuf_index{cbuf_index}, cbuf_offset{cbuf_offset}, is_read{is_read}, is_written{ - is_written} {} + constexpr explicit GlobalMemoryEntry(u32 cbuf_index_, u32 cbuf_offset_, bool is_read_, + bool is_written_) + : cbuf_index{cbuf_index_}, cbuf_offset{cbuf_offset_}, is_read{is_read_}, is_written{ + is_written_} {} u32 cbuf_index = 0; u32 cbuf_offset = 0; diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp index 166ee34e1..955b2abc4 100644 --- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp @@ -317,8 +317,7 @@ std::optional<std::vector<ShaderDiskCachePrecompiled>> ShaderDiskCacheOpenGL::Lo return std::nullopt; } } - - return std::move(entries); + return entries; } void ShaderDiskCacheOpenGL::InvalidateTransferable() { @@ -344,7 +343,7 @@ void 
ShaderDiskCacheOpenGL::SaveEntry(const ShaderDiskCacheEntry& entry) { } const u64 id = entry.unique_identifier; - if (stored_transferable.find(id) != stored_transferable.end()) { + if (stored_transferable.contains(id)) { // The shader already exists return; } diff --git a/src/video_core/renderer_opengl/gl_shader_manager.cpp b/src/video_core/renderer_opengl/gl_shader_manager.cpp index 691c6c79b..553e6e8d6 100644 --- a/src/video_core/renderer_opengl/gl_shader_manager.cpp +++ b/src/video_core/renderer_opengl/gl_shader_manager.cpp @@ -83,6 +83,21 @@ void ProgramManager::RestoreGuestPipeline() { } } +void ProgramManager::BindHostCompute(GLuint program) { + if (use_assembly_programs) { + glDisable(GL_COMPUTE_PROGRAM_NV); + } + glUseProgram(program); + is_graphics_bound = false; +} + +void ProgramManager::RestoreGuestCompute() { + if (use_assembly_programs) { + glEnable(GL_COMPUTE_PROGRAM_NV); + glUseProgram(0); + } +} + void ProgramManager::UseVertexShader(GLuint program) { if (use_assembly_programs) { BindProgram(GL_VERTEX_PROGRAM_NV, program, current_state.vertex, vertex_enabled); diff --git a/src/video_core/renderer_opengl/gl_shader_manager.h b/src/video_core/renderer_opengl/gl_shader_manager.h index 950e0dfcb..ad42cce74 100644 --- a/src/video_core/renderer_opengl/gl_shader_manager.h +++ b/src/video_core/renderer_opengl/gl_shader_manager.h @@ -45,6 +45,12 @@ public: /// Rewinds BindHostPipeline state changes. void RestoreGuestPipeline(); + /// Binds an OpenGL GLSL program object unsynchronized with the guest state. + void BindHostCompute(GLuint program); + + /// Rewinds BindHostCompute state changes. + void RestoreGuestCompute(); + void UseVertexShader(GLuint program); void UseGeometryShader(GLuint program); void UseFragmentShader(GLuint program); diff --git a/src/video_core/renderer_opengl/gl_state_tracker.cpp b/src/video_core/renderer_opengl/gl_state_tracker.cpp index 6bcf831f2..60e6fa39f 100644 --- a/src/video_core/renderer_opengl/gl_state_tracker.cpp +++ b/src/video_core/renderer_opengl/gl_state_tracker.cpp @@ -13,7 +13,7 @@ #include "video_core/renderer_opengl/gl_state_tracker.h" #define OFF(field_name) MAXWELL3D_REG_INDEX(field_name) -#define NUM(field_name) (sizeof(Maxwell3D::Regs::field_name) / sizeof(u32)) +#define NUM(field_name) (sizeof(Maxwell3D::Regs::field_name) / (sizeof(u32))) namespace OpenGL { @@ -249,4 +249,11 @@ StateTracker::StateTracker(Tegra::GPU& gpu) : flags{gpu.Maxwell3D().dirty.flags} } } +void StateTracker::InvalidateStreamBuffer() { + flags[Dirty::VertexBuffers] = true; + for (int index = Dirty::VertexBuffer0; index <= Dirty::VertexBuffer31; ++index) { + flags[index] = true; + } +} + } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_state_tracker.h b/src/video_core/renderer_opengl/gl_state_tracker.h index 9d127548f..574615d3c 100644 --- a/src/video_core/renderer_opengl/gl_state_tracker.h +++ b/src/video_core/renderer_opengl/gl_state_tracker.h @@ -92,6 +92,8 @@ class StateTracker { public: explicit StateTracker(Tegra::GPU& gpu); + void InvalidateStreamBuffer(); + void BindIndexBuffer(GLuint new_index_buffer) { if (index_buffer == new_index_buffer) { return; @@ -100,6 +102,14 @@ public: glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, new_index_buffer); } + void BindFramebuffer(GLuint new_framebuffer) { + if (framebuffer == new_framebuffer) { + return; + } + framebuffer = new_framebuffer; + glBindFramebuffer(GL_DRAW_FRAMEBUFFER, framebuffer); + } + void NotifyScreenDrawVertexArray() { flags[OpenGL::Dirty::VertexFormats] = true; 
flags[OpenGL::Dirty::VertexFormat0 + 0] = true; @@ -129,9 +139,9 @@ public: flags[OpenGL::Dirty::Scissor0] = true; } - void NotifyColorMask0() { + void NotifyColorMask(size_t index) { flags[OpenGL::Dirty::ColorMasks] = true; - flags[OpenGL::Dirty::ColorMask0] = true; + flags[OpenGL::Dirty::ColorMask0 + index] = true; } void NotifyBlend0() { @@ -190,6 +200,7 @@ public: private: Tegra::Engines::Maxwell3D::DirtyState::Flags& flags; + GLuint framebuffer = 0; GLuint index_buffer = 0; }; diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.cpp b/src/video_core/renderer_opengl/gl_stream_buffer.cpp index 887995cf4..e0819cdf2 100644 --- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp +++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp @@ -9,6 +9,7 @@ #include "common/assert.h" #include "common/microprofile.h" #include "video_core/renderer_opengl/gl_device.h" +#include "video_core/renderer_opengl/gl_state_tracker.h" #include "video_core/renderer_opengl/gl_stream_buffer.h" MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning", @@ -16,24 +17,14 @@ MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning", namespace OpenGL { -OGLStreamBuffer::OGLStreamBuffer(const Device& device, GLsizeiptr size, bool vertex_data_usage) - : buffer_size(size) { +OGLStreamBuffer::OGLStreamBuffer(const Device& device, StateTracker& state_tracker_) + : state_tracker{state_tracker_} { gl_buffer.Create(); - GLsizeiptr allocate_size = size; - if (vertex_data_usage) { - // On AMD GPU there is a strange crash in indexed drawing. The crash happens when the buffer - // read position is near the end and is an out-of-bound access to the vertex buffer. This is - // probably a bug in the driver and is related to the usage of vec3<byte> attributes in the - // vertex array. Doubling the allocation size for the vertex buffer seems to avoid the - // crash. 
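// The rewritten stream buffer below uses a single fixed allocation (BUFFER_SIZE, 256 MiB, twice the old 128 MiB STREAM_BUFFER_SIZE), which appears to make this doubling workaround unnecessary.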
- allocate_size *= 2; - } - static constexpr GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT; - glNamedBufferStorage(gl_buffer.handle, allocate_size, nullptr, flags); + glNamedBufferStorage(gl_buffer.handle, BUFFER_SIZE, nullptr, flags); mapped_ptr = static_cast<u8*>( - glMapNamedBufferRange(gl_buffer.handle, 0, buffer_size, flags | GL_MAP_FLUSH_EXPLICIT_BIT)); + glMapNamedBufferRange(gl_buffer.handle, 0, BUFFER_SIZE, flags | GL_MAP_FLUSH_EXPLICIT_BIT)); if (device.UseAssemblyShaders() || device.HasVertexBufferUnifiedMemory()) { glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_ONLY); @@ -46,25 +37,24 @@ OGLStreamBuffer::~OGLStreamBuffer() { gl_buffer.Release(); } -std::tuple<u8*, GLintptr, bool> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr alignment) { - ASSERT(size <= buffer_size); - ASSERT(alignment <= buffer_size); +std::pair<u8*, GLintptr> OGLStreamBuffer::Map(GLsizeiptr size, GLintptr alignment) { + ASSERT(size <= BUFFER_SIZE); + ASSERT(alignment <= BUFFER_SIZE); mapped_size = size; if (alignment > 0) { buffer_pos = Common::AlignUp<std::size_t>(buffer_pos, alignment); } - bool invalidate = false; - if (buffer_pos + size > buffer_size) { + if (buffer_pos + size > BUFFER_SIZE) { MICROPROFILE_SCOPE(OpenGL_StreamBuffer); glInvalidateBufferData(gl_buffer.handle); + state_tracker.InvalidateStreamBuffer(); buffer_pos = 0; - invalidate = true; } - return std::make_tuple(mapped_ptr + buffer_pos, buffer_pos, invalidate); + return std::make_pair(mapped_ptr + buffer_pos, buffer_pos); } void OGLStreamBuffer::Unmap(GLsizeiptr size) { diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.h b/src/video_core/renderer_opengl/gl_stream_buffer.h index 307a67113..dd9cf67eb 100644 --- a/src/video_core/renderer_opengl/gl_stream_buffer.h +++ b/src/video_core/renderer_opengl/gl_stream_buffer.h @@ -4,29 +4,31 @@ #pragma once -#include <tuple> +#include <utility> + #include <glad/glad.h> + #include "common/common_types.h" #include "video_core/renderer_opengl/gl_resource_manager.h" namespace OpenGL { class Device; +class StateTracker; class OGLStreamBuffer : private NonCopyable { public: - explicit OGLStreamBuffer(const Device& device, GLsizeiptr size, bool vertex_data_usage); + explicit OGLStreamBuffer(const Device& device, StateTracker& state_tracker_); ~OGLStreamBuffer(); /* * Allocates a linear chunk of memory in the GPU buffer with at least "size" bytes * and the optional alignment requirement. * If the buffer is full, the whole buffer is reallocated which invalidates old chunks. - * The return values are the pointer to the new chunk, the offset within the buffer, - * and the invalidation flag for previous chunks. + * The return values are the pointer to the new chunk, and the offset within the buffer. * The actual used size must be specified on unmapping the chunk. 
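* When the write position would overflow, Map() now invalidates the buffer contents and calls state_tracker.InvalidateStreamBuffer() so every vertex buffer binding is flagged dirty and re-emitted.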
*/ - std::tuple<u8*, GLintptr, bool> Map(GLsizeiptr size, GLintptr alignment = 0); + std::pair<u8*, GLintptr> Map(GLsizeiptr size, GLintptr alignment = 0); void Unmap(GLsizeiptr size); @@ -39,15 +41,18 @@ public: } GLsizeiptr Size() const noexcept { - return buffer_size; + return BUFFER_SIZE; } private: + static constexpr GLsizeiptr BUFFER_SIZE = 256 * 1024 * 1024; + + StateTracker& state_tracker; + OGLBuffer gl_buffer; GLuint64EXT gpu_address = 0; GLintptr buffer_pos = 0; - GLsizeiptr buffer_size = 0; GLsizeiptr mapped_size = 0; u8* mapped_ptr = nullptr; }; diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index a863ef218..546cb6d00 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -2,37 +2,60 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -#include "common/assert.h" -#include "common/bit_util.h" -#include "common/common_types.h" -#include "common/microprofile.h" -#include "common/scope_exit.h" -#include "core/core.h" -#include "video_core/morton.h" -#include "video_core/renderer_opengl/gl_resource_manager.h" +#include <algorithm> +#include <array> +#include <bit> +#include <string> + +#include <glad/glad.h> + +#include "video_core/renderer_opengl/gl_device.h" +#include "video_core/renderer_opengl/gl_shader_manager.h" #include "video_core/renderer_opengl/gl_state_tracker.h" #include "video_core/renderer_opengl/gl_texture_cache.h" -#include "video_core/renderer_opengl/utils.h" -#include "video_core/texture_cache/surface_base.h" +#include "video_core/renderer_opengl/maxwell_to_gl.h" +#include "video_core/renderer_opengl/util_shaders.h" +#include "video_core/surface.h" +#include "video_core/texture_cache/format_lookup_table.h" +#include "video_core/texture_cache/samples_helper.h" #include "video_core/texture_cache/texture_cache.h" -#include "video_core/textures/convert.h" -#include "video_core/textures/texture.h" +#include "video_core/textures/decoders.h" namespace OpenGL { -using Tegra::Texture::SwizzleSource; -using VideoCore::MortonSwizzleMode; +namespace { +using Tegra::Texture::SwizzleSource; +using Tegra::Texture::TextureMipmapFilter; +using Tegra::Texture::TextureType; +using Tegra::Texture::TICEntry; +using Tegra::Texture::TSCEntry; +using VideoCommon::CalculateLevelStrideAlignment; +using VideoCommon::ImageCopy; +using VideoCommon::ImageFlagBits; +using VideoCommon::ImageType; +using VideoCommon::NUM_RT; +using VideoCommon::SamplesLog2; +using VideoCommon::SwizzleParameters; +using VideoCore::Surface::BytesPerBlock; +using VideoCore::Surface::IsPixelFormatASTC; +using VideoCore::Surface::IsPixelFormatSRGB; +using VideoCore::Surface::MaxPixelFormat; using VideoCore::Surface::PixelFormat; -using VideoCore::Surface::SurfaceTarget; using VideoCore::Surface::SurfaceType; -MICROPROFILE_DEFINE(OpenGL_Texture_Upload, "OpenGL", "Texture Upload", MP_RGB(128, 192, 128)); -MICROPROFILE_DEFINE(OpenGL_Texture_Download, "OpenGL", "Texture Download", MP_RGB(128, 192, 128)); -MICROPROFILE_DEFINE(OpenGL_Texture_Buffer_Copy, "OpenGL", "Texture Buffer Copy", - MP_RGB(128, 192, 128)); +struct CopyOrigin { + GLint level; + GLint x; + GLint y; + GLint z; +}; -namespace { +struct CopyRegion { + GLsizei width; + GLsizei height; + GLsizei depth; +}; struct FormatTuple { GLenum internal_format; @@ -40,7 +63,7 @@ struct FormatTuple { GLenum type = GL_NONE; }; -constexpr std::array<FormatTuple, 
VideoCore::Surface::MaxPixelFormat> tex_format_tuples = {{ +constexpr std::array<FormatTuple, MaxPixelFormat> FORMAT_TABLE = {{ {GL_RGBA8, GL_RGBA, GL_UNSIGNED_INT_8_8_8_8_REV}, // A8B8G8R8_UNORM {GL_RGBA8_SNORM, GL_RGBA, GL_BYTE}, // A8B8G8R8_SNORM {GL_RGBA8I, GL_RGBA_INTEGER, GL_BYTE}, // A8B8G8R8_SINT @@ -103,72 +126,113 @@ constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> tex_format {GL_COMPRESSED_RGBA_ASTC_8x5_KHR}, // ASTC_2D_8X5_UNORM {GL_COMPRESSED_RGBA_ASTC_5x4_KHR}, // ASTC_2D_5X4_UNORM {GL_SRGB8_ALPHA8, GL_BGRA, GL_UNSIGNED_BYTE}, // B8G8R8A8_UNORM - // Compressed sRGB formats - {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT}, // BC1_RGBA_SRGB - {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT}, // BC2_SRGB - {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT}, // BC3_SRGB - {GL_COMPRESSED_SRGB_ALPHA_BPTC_UNORM}, // BC7_SRGB - {GL_RGBA4, GL_RGBA, GL_UNSIGNED_SHORT_4_4_4_4_REV}, // A4B4G4R4_UNORM - {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_4x4_KHR}, // ASTC_2D_4X4_SRGB - {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x8_KHR}, // ASTC_2D_8X8_SRGB - {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x5_KHR}, // ASTC_2D_8X5_SRGB - {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_5x4_KHR}, // ASTC_2D_5X4_SRGB - {GL_COMPRESSED_RGBA_ASTC_5x5_KHR}, // ASTC_2D_5X5_UNORM - {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_5x5_KHR}, // ASTC_2D_5X5_SRGB - {GL_COMPRESSED_RGBA_ASTC_10x8_KHR}, // ASTC_2D_10X8_UNORM - {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x8_KHR}, // ASTC_2D_10X8_SRGB - {GL_COMPRESSED_RGBA_ASTC_6x6_KHR}, // ASTC_2D_6X6_UNORM - {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_6x6_KHR}, // ASTC_2D_6X6_SRGB - {GL_COMPRESSED_RGBA_ASTC_10x10_KHR}, // ASTC_2D_10X10_UNORM - {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x10_KHR}, // ASTC_2D_10X10_SRGB - {GL_COMPRESSED_RGBA_ASTC_12x12_KHR}, // ASTC_2D_12X12_UNORM - {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x12_KHR}, // ASTC_2D_12X12_SRGB - {GL_COMPRESSED_RGBA_ASTC_8x6_KHR}, // ASTC_2D_8X6_UNORM - {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x6_KHR}, // ASTC_2D_8X6_SRGB - {GL_COMPRESSED_RGBA_ASTC_6x5_KHR}, // ASTC_2D_6X5_UNORM - {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_6x5_KHR}, // ASTC_2D_6X5_SRGB - {GL_RGB9_E5, GL_RGB, GL_UNSIGNED_INT_5_9_9_9_REV}, // E5B9G9R9_FLOAT - - // Depth formats - {GL_DEPTH_COMPONENT32F, GL_DEPTH_COMPONENT, GL_FLOAT}, // D32_FLOAT - {GL_DEPTH_COMPONENT16, GL_DEPTH_COMPONENT, GL_UNSIGNED_SHORT}, // D16_UNORM - - // DepthStencil formats - {GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8}, // D24_UNORM_S8_UINT - {GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8}, // S8_UINT_D24_UNORM + {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT}, // BC1_RGBA_SRGB + {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT}, // BC2_SRGB + {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT}, // BC3_SRGB + {GL_COMPRESSED_SRGB_ALPHA_BPTC_UNORM}, // BC7_SRGB + {GL_RGBA4, GL_RGBA, GL_UNSIGNED_SHORT_4_4_4_4_REV}, // A4B4G4R4_UNORM + {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_4x4_KHR}, // ASTC_2D_4X4_SRGB + {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x8_KHR}, // ASTC_2D_8X8_SRGB + {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x5_KHR}, // ASTC_2D_8X5_SRGB + {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_5x4_KHR}, // ASTC_2D_5X4_SRGB + {GL_COMPRESSED_RGBA_ASTC_5x5_KHR}, // ASTC_2D_5X5_UNORM + {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_5x5_KHR}, // ASTC_2D_5X5_SRGB + {GL_COMPRESSED_RGBA_ASTC_10x8_KHR}, // ASTC_2D_10X8_UNORM + {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x8_KHR}, // ASTC_2D_10X8_SRGB + {GL_COMPRESSED_RGBA_ASTC_6x6_KHR}, // ASTC_2D_6X6_UNORM + {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_6x6_KHR}, // ASTC_2D_6X6_SRGB + {GL_COMPRESSED_RGBA_ASTC_10x10_KHR}, // ASTC_2D_10X10_UNORM + {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x10_KHR}, // 
ASTC_2D_10X10_SRGB + {GL_COMPRESSED_RGBA_ASTC_12x12_KHR}, // ASTC_2D_12X12_UNORM + {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x12_KHR}, // ASTC_2D_12X12_SRGB + {GL_COMPRESSED_RGBA_ASTC_8x6_KHR}, // ASTC_2D_8X6_UNORM + {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x6_KHR}, // ASTC_2D_8X6_SRGB + {GL_COMPRESSED_RGBA_ASTC_6x5_KHR}, // ASTC_2D_6X5_UNORM + {GL_COMPRESSED_SRGB8_ALPHA8_ASTC_6x5_KHR}, // ASTC_2D_6X5_SRGB + {GL_RGB9_E5, GL_RGB, GL_UNSIGNED_INT_5_9_9_9_REV}, // E5B9G9R9_FLOAT + {GL_DEPTH_COMPONENT32F, GL_DEPTH_COMPONENT, GL_FLOAT}, // D32_FLOAT + {GL_DEPTH_COMPONENT16, GL_DEPTH_COMPONENT, GL_UNSIGNED_SHORT}, // D16_UNORM + {GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8}, // D24_UNORM_S8_UINT + {GL_DEPTH24_STENCIL8, GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8}, // S8_UINT_D24_UNORM {GL_DEPTH32F_STENCIL8, GL_DEPTH_STENCIL, GL_FLOAT_32_UNSIGNED_INT_24_8_REV}, // D32_FLOAT_S8_UINT }}; +constexpr std::array ACCELERATED_FORMATS{ + GL_RGBA32F, GL_RGBA16F, GL_RG32F, GL_RG16F, GL_R11F_G11F_B10F, GL_R32F, + GL_R16F, GL_RGBA32UI, GL_RGBA16UI, GL_RGB10_A2UI, GL_RGBA8UI, GL_RG32UI, + GL_RG16UI, GL_RG8UI, GL_R32UI, GL_R16UI, GL_R8UI, GL_RGBA32I, + GL_RGBA16I, GL_RGBA8I, GL_RG32I, GL_RG16I, GL_RG8I, GL_R32I, + GL_R16I, GL_R8I, GL_RGBA16, GL_RGB10_A2, GL_RGBA8, GL_RG16, + GL_RG8, GL_R16, GL_R8, GL_RGBA16_SNORM, GL_RGBA8_SNORM, GL_RG16_SNORM, + GL_RG8_SNORM, GL_R16_SNORM, GL_R8_SNORM, +}; + const FormatTuple& GetFormatTuple(PixelFormat pixel_format) { - ASSERT(static_cast<std::size_t>(pixel_format) < tex_format_tuples.size()); - return tex_format_tuples[static_cast<std::size_t>(pixel_format)]; + ASSERT(static_cast<size_t>(pixel_format) < FORMAT_TABLE.size()); + return FORMAT_TABLE[static_cast<size_t>(pixel_format)]; } -GLenum GetTextureTarget(const SurfaceTarget& target) { - switch (target) { - case SurfaceTarget::TextureBuffer: +GLenum ImageTarget(const VideoCommon::ImageInfo& info) { + switch (info.type) { + case ImageType::e1D: + return GL_TEXTURE_1D_ARRAY; + case ImageType::e2D: + if (info.num_samples > 1) { + return GL_TEXTURE_2D_MULTISAMPLE_ARRAY; + } + return GL_TEXTURE_2D_ARRAY; + case ImageType::e3D: + return GL_TEXTURE_3D; + case ImageType::Linear: + return GL_TEXTURE_2D_ARRAY; + case ImageType::Buffer: return GL_TEXTURE_BUFFER; - case SurfaceTarget::Texture1D: + } + UNREACHABLE_MSG("Invalid image type={}", info.type); + return GL_NONE; +} + +GLenum ImageTarget(ImageViewType type, int num_samples = 1) { + const bool is_multisampled = num_samples > 1; + switch (type) { + case ImageViewType::e1D: return GL_TEXTURE_1D; - case SurfaceTarget::Texture2D: - return GL_TEXTURE_2D; - case SurfaceTarget::Texture3D: + case ImageViewType::e2D: + return is_multisampled ? GL_TEXTURE_2D_MULTISAMPLE : GL_TEXTURE_2D; + case ImageViewType::Cube: + return GL_TEXTURE_CUBE_MAP; + case ImageViewType::e3D: return GL_TEXTURE_3D; - case SurfaceTarget::Texture1DArray: + case ImageViewType::e1DArray: return GL_TEXTURE_1D_ARRAY; - case SurfaceTarget::Texture2DArray: - return GL_TEXTURE_2D_ARRAY; - case SurfaceTarget::TextureCubemap: - return GL_TEXTURE_CUBE_MAP; - case SurfaceTarget::TextureCubeArray: + case ImageViewType::e2DArray: + return is_multisampled ? 
GL_TEXTURE_2D_MULTISAMPLE_ARRAY : GL_TEXTURE_2D_ARRAY; + case ImageViewType::CubeArray: return GL_TEXTURE_CUBE_MAP_ARRAY; + case ImageViewType::Rect: + return GL_TEXTURE_RECTANGLE; + case ImageViewType::Buffer: + return GL_TEXTURE_BUFFER; } - UNREACHABLE(); - return {}; + UNREACHABLE_MSG("Invalid image view type={}", type); + return GL_NONE; } -GLint GetSwizzleSource(SwizzleSource source) { +GLenum TextureMode(PixelFormat format, bool is_first) { + switch (format) { + case PixelFormat::D24_UNORM_S8_UINT: + case PixelFormat::D32_FLOAT_S8_UINT: + return is_first ? GL_DEPTH_COMPONENT : GL_STENCIL_INDEX; + case PixelFormat::S8_UINT_D24_UNORM: + return is_first ? GL_STENCIL_INDEX : GL_DEPTH_COMPONENT; + default: + UNREACHABLE(); + return GL_DEPTH_COMPONENT; + } +} + +GLint Swizzle(SwizzleSource source) { switch (source) { case SwizzleSource::Zero: return GL_ZERO; @@ -184,531 +248,813 @@ GLint GetSwizzleSource(SwizzleSource source) { case SwizzleSource::OneFloat: return GL_ONE; } - UNREACHABLE(); + UNREACHABLE_MSG("Invalid swizzle source={}", source); return GL_NONE; } -GLenum GetComponent(PixelFormat format, bool is_first) { - switch (format) { - case PixelFormat::D24_UNORM_S8_UINT: - case PixelFormat::D32_FLOAT_S8_UINT: - return is_first ? GL_DEPTH_COMPONENT : GL_STENCIL_INDEX; - case PixelFormat::S8_UINT_D24_UNORM: - return is_first ? GL_STENCIL_INDEX : GL_DEPTH_COMPONENT; +GLenum AttachmentType(PixelFormat format) { + switch (const SurfaceType type = VideoCore::Surface::GetFormatType(format); type) { + case SurfaceType::Depth: + return GL_DEPTH_ATTACHMENT; + case SurfaceType::DepthStencil: + return GL_DEPTH_STENCIL_ATTACHMENT; default: - UNREACHABLE(); - return GL_DEPTH_COMPONENT; + UNIMPLEMENTED_MSG("Unimplemented type={}", type); + return GL_NONE; } } -void ApplyTextureDefaults(const SurfaceParams& params, GLuint texture) { - if (params.IsBuffer()) { - return; +[[nodiscard]] bool IsConverted(const Device& device, PixelFormat format, ImageType type) { + if (!device.HasASTC() && IsPixelFormatASTC(format)) { + return true; } - glTextureParameteri(texture, GL_TEXTURE_MIN_FILTER, GL_LINEAR); - glTextureParameteri(texture, GL_TEXTURE_MAG_FILTER, GL_LINEAR); - glTextureParameteri(texture, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); - glTextureParameteri(texture, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); - glTextureParameteri(texture, GL_TEXTURE_MAX_LEVEL, static_cast<GLint>(params.num_levels - 1)); - if (params.num_levels == 1) { - glTextureParameterf(texture, GL_TEXTURE_LOD_BIAS, 1000.0f); + switch (format) { + case PixelFormat::BC4_UNORM: + case PixelFormat::BC5_UNORM: + return type == ImageType::e3D; + default: + break; } + return false; } -OGLTexture CreateTexture(const SurfaceParams& params, GLenum target, GLenum internal_format, - OGLBuffer& texture_buffer) { - OGLTexture texture; - texture.Create(target); +[[nodiscard]] constexpr SwizzleSource ConvertGreenRed(SwizzleSource value) { + switch (value) { + case SwizzleSource::G: + return SwizzleSource::R; + default: + return value; + } +} - switch (params.target) { - case SurfaceTarget::Texture1D: - glTextureStorage1D(texture.handle, params.emulated_levels, internal_format, params.width); - break; - case SurfaceTarget::TextureBuffer: - texture_buffer.Create(); - glNamedBufferStorage(texture_buffer.handle, params.width * params.GetBytesPerPixel(), - nullptr, GL_DYNAMIC_STORAGE_BIT); - glTextureBuffer(texture.handle, internal_format, texture_buffer.handle); +void ApplySwizzle(GLuint handle, PixelFormat format, std::array<SwizzleSource, 4> swizzle) { + 
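    // Depth/stencil formats expose a single component that is selected through
    // GL_DEPTH_STENCIL_TEXTURE_MODE rather than through the swizzle, so the
    // first swizzle source only chooses the sampled aspect; the remaining
    // sources are then folded from G to R below so that one component is read.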
switch (format) { + case PixelFormat::D24_UNORM_S8_UINT: + case PixelFormat::D32_FLOAT_S8_UINT: + case PixelFormat::S8_UINT_D24_UNORM: + UNIMPLEMENTED_IF(swizzle[0] != SwizzleSource::R && swizzle[0] != SwizzleSource::G); + glTextureParameteri(handle, GL_DEPTH_STENCIL_TEXTURE_MODE, + TextureMode(format, swizzle[0] == SwizzleSource::R)); + std::ranges::transform(swizzle, swizzle.begin(), ConvertGreenRed); break; - case SurfaceTarget::Texture2D: - case SurfaceTarget::TextureCubemap: - glTextureStorage2D(texture.handle, params.emulated_levels, internal_format, params.width, - params.height); + default: break; - case SurfaceTarget::Texture3D: - case SurfaceTarget::Texture2DArray: - case SurfaceTarget::TextureCubeArray: - glTextureStorage3D(texture.handle, params.emulated_levels, internal_format, params.width, - params.height, params.depth); + } + std::array<GLint, 4> gl_swizzle; + std::ranges::transform(swizzle, gl_swizzle.begin(), Swizzle); + glTextureParameteriv(handle, GL_TEXTURE_SWIZZLE_RGBA, gl_swizzle.data()); +} + +[[nodiscard]] bool CanBeAccelerated(const TextureCacheRuntime& runtime, + const VideoCommon::ImageInfo& info) { + // Disable accelerated uploads for now as they don't implement swizzled uploads + return false; + switch (info.type) { + case ImageType::e2D: + case ImageType::e3D: + case ImageType::Linear: break; default: - UNREACHABLE(); + return false; + } + const GLenum internal_format = GetFormatTuple(info.format).internal_format; + const auto& format_info = runtime.FormatInfo(info.type, internal_format); + if (format_info.is_compressed) { + return false; + } + if (std::ranges::find(ACCELERATED_FORMATS, internal_format) == ACCELERATED_FORMATS.end()) { + return false; } + if (format_info.compatibility_by_size) { + return true; + } + const GLenum store_format = StoreFormat(BytesPerBlock(info.format)); + const GLenum store_class = runtime.FormatInfo(info.type, store_format).compatibility_class; + return format_info.compatibility_class == store_class; +} - ApplyTextureDefaults(params, texture.handle); +[[nodiscard]] CopyOrigin MakeCopyOrigin(VideoCommon::Offset3D offset, + VideoCommon::SubresourceLayers subresource, GLenum target) { + switch (target) { + case GL_TEXTURE_2D_ARRAY: + case GL_TEXTURE_2D_MULTISAMPLE_ARRAY: + return CopyOrigin{ + .level = static_cast<GLint>(subresource.base_level), + .x = static_cast<GLint>(offset.x), + .y = static_cast<GLint>(offset.y), + .z = static_cast<GLint>(subresource.base_layer), + }; + case GL_TEXTURE_3D: + return CopyOrigin{ + .level = static_cast<GLint>(subresource.base_level), + .x = static_cast<GLint>(offset.x), + .y = static_cast<GLint>(offset.y), + .z = static_cast<GLint>(offset.z), + }; + default: + UNIMPLEMENTED_MSG("Unimplemented copy target={}", target); + return CopyOrigin{.level = 0, .x = 0, .y = 0, .z = 0}; + } +} - return texture; +[[nodiscard]] CopyRegion MakeCopyRegion(VideoCommon::Extent3D extent, + VideoCommon::SubresourceLayers dst_subresource, + GLenum target) { + switch (target) { + case GL_TEXTURE_2D_ARRAY: + case GL_TEXTURE_2D_MULTISAMPLE_ARRAY: + return CopyRegion{ + .width = static_cast<GLsizei>(extent.width), + .height = static_cast<GLsizei>(extent.height), + .depth = static_cast<GLsizei>(dst_subresource.num_layers), + }; + case GL_TEXTURE_3D: + return CopyRegion{ + .width = static_cast<GLsizei>(extent.width), + .height = static_cast<GLsizei>(extent.height), + .depth = static_cast<GLsizei>(extent.depth), + }; + default: + UNIMPLEMENTED_MSG("Unimplemented copy target={}", target); + return CopyRegion{.width = 0, 
.height = 0, .depth = 0}; + } } -constexpr u32 EncodeSwizzle(SwizzleSource x_source, SwizzleSource y_source, SwizzleSource z_source, - SwizzleSource w_source) { - return (static_cast<u32>(x_source) << 24) | (static_cast<u32>(y_source) << 16) | - (static_cast<u32>(z_source) << 8) | static_cast<u32>(w_source); +void AttachTexture(GLuint fbo, GLenum attachment, const ImageView* image_view) { + if (False(image_view->flags & VideoCommon::ImageViewFlagBits::Slice)) { + const GLuint texture = image_view->DefaultHandle(); + glNamedFramebufferTexture(fbo, attachment, texture, 0); + return; + } + const GLuint texture = image_view->Handle(ImageViewType::e3D); + if (image_view->range.extent.layers > 1) { + // TODO: OpenGL doesn't support rendering to a fixed number of slices + glNamedFramebufferTexture(fbo, attachment, texture, 0); + } else { + const u32 slice = image_view->range.base.layer; + glNamedFramebufferTextureLayer(fbo, attachment, texture, 0, slice); + } } } // Anonymous namespace -CachedSurface::CachedSurface(const GPUVAddr gpu_addr, const SurfaceParams& params, - bool is_astc_supported) - : VideoCommon::SurfaceBase<View>(gpu_addr, params, is_astc_supported) { - if (is_converted) { - internal_format = params.srgb_conversion ? GL_SRGB8_ALPHA8 : GL_RGBA8; - format = GL_RGBA; - type = GL_UNSIGNED_BYTE; - } else { - const auto& tuple{GetFormatTuple(params.pixel_format)}; - internal_format = tuple.internal_format; - format = tuple.format; - type = tuple.type; - is_compressed = params.IsCompressed(); - } - target = GetTextureTarget(params.target); - texture = CreateTexture(params, target, internal_format, texture_buffer); - DecorateSurfaceName(); +ImageBufferMap::ImageBufferMap(GLuint handle_, u8* map, size_t size, OGLSync* sync_) + : span(map, size), sync{sync_}, handle{handle_} {} - u32 num_layers = 1; - if (params.is_layered || params.target == SurfaceTarget::Texture3D) { - num_layers = params.depth; +ImageBufferMap::~ImageBufferMap() { + if (sync) { + sync->Create(); } - - main_view = - CreateViewInner(ViewParams(params.target, 0, num_layers, 0, params.num_levels), true); } -CachedSurface::~CachedSurface() = default; +TextureCacheRuntime::TextureCacheRuntime(const Device& device_, ProgramManager& program_manager, + StateTracker& state_tracker_) + : device{device_}, state_tracker{state_tracker_}, util_shaders(program_manager) { + static constexpr std::array TARGETS{GL_TEXTURE_1D_ARRAY, GL_TEXTURE_2D_ARRAY, GL_TEXTURE_3D}; + for (size_t i = 0; i < TARGETS.size(); ++i) { + const GLenum target = TARGETS[i]; + for (const FormatTuple& tuple : FORMAT_TABLE) { + const GLenum format = tuple.internal_format; + GLint compat_class; + GLint compat_type; + GLint is_compressed; + glGetInternalformativ(target, format, GL_IMAGE_COMPATIBILITY_CLASS, 1, &compat_class); + glGetInternalformativ(target, format, GL_IMAGE_FORMAT_COMPATIBILITY_TYPE, 1, + &compat_type); + glGetInternalformativ(target, format, GL_TEXTURE_COMPRESSED, 1, &is_compressed); + const FormatProperties properties{ + .compatibility_class = static_cast<GLenum>(compat_class), + .compatibility_by_size = compat_type == GL_IMAGE_FORMAT_COMPATIBILITY_BY_SIZE, + .is_compressed = is_compressed == GL_TRUE, + }; + format_properties[i].emplace(format, properties); + } + } + has_broken_texture_view_formats = device.HasBrokenTextureViewFormats(); + + null_image_1d_array.Create(GL_TEXTURE_1D_ARRAY); + null_image_cube_array.Create(GL_TEXTURE_CUBE_MAP_ARRAY); + null_image_3d.Create(GL_TEXTURE_3D); + null_image_rect.Create(GL_TEXTURE_RECTANGLE); + 
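    // The null images created below are 1x1 placeholders bound wherever a
    // shader expects a view but none is present; the all-zero swizzle applied
    // to them further down makes every sample return (0, 0, 0, 0).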
glTextureStorage2D(null_image_1d_array.handle, 1, GL_R8, 1, 1); + glTextureStorage3D(null_image_cube_array.handle, 1, GL_R8, 1, 1, 6); + glTextureStorage3D(null_image_3d.handle, 1, GL_R8, 1, 1, 1); + glTextureStorage2D(null_image_rect.handle, 1, GL_R8, 1, 1); + + std::array<GLuint, 4> new_handles; + glGenTextures(static_cast<GLsizei>(new_handles.size()), new_handles.data()); + null_image_view_1d.handle = new_handles[0]; + null_image_view_2d.handle = new_handles[1]; + null_image_view_2d_array.handle = new_handles[2]; + null_image_view_cube.handle = new_handles[3]; + glTextureView(null_image_view_1d.handle, GL_TEXTURE_1D, null_image_1d_array.handle, GL_R8, 0, 1, + 0, 1); + glTextureView(null_image_view_2d.handle, GL_TEXTURE_2D, null_image_cube_array.handle, GL_R8, 0, + 1, 0, 1); + glTextureView(null_image_view_2d_array.handle, GL_TEXTURE_2D_ARRAY, + null_image_cube_array.handle, GL_R8, 0, 1, 0, 1); + glTextureView(null_image_view_cube.handle, GL_TEXTURE_CUBE_MAP, null_image_cube_array.handle, + GL_R8, 0, 1, 0, 6); + const std::array texture_handles{ + null_image_1d_array.handle, null_image_cube_array.handle, null_image_3d.handle, + null_image_rect.handle, null_image_view_1d.handle, null_image_view_2d.handle, + null_image_view_2d_array.handle, null_image_view_cube.handle, + }; + for (const GLuint handle : texture_handles) { + static constexpr std::array NULL_SWIZZLE{GL_ZERO, GL_ZERO, GL_ZERO, GL_ZERO}; + glTextureParameteriv(handle, GL_TEXTURE_SWIZZLE_RGBA, NULL_SWIZZLE.data()); + } + const auto set_view = [this](ImageViewType type, GLuint handle) { + if (device.HasDebuggingToolAttached()) { + const std::string name = fmt::format("NullImage {}", type); + glObjectLabel(GL_TEXTURE, handle, static_cast<GLsizei>(name.size()), name.data()); + } + null_image_views[static_cast<size_t>(type)] = handle; + }; + set_view(ImageViewType::e1D, null_image_view_1d.handle); + set_view(ImageViewType::e2D, null_image_view_2d.handle); + set_view(ImageViewType::Cube, null_image_view_cube.handle); + set_view(ImageViewType::e3D, null_image_3d.handle); + set_view(ImageViewType::e1DArray, null_image_1d_array.handle); + set_view(ImageViewType::e2DArray, null_image_view_2d_array.handle); + set_view(ImageViewType::CubeArray, null_image_cube_array.handle); + set_view(ImageViewType::Rect, null_image_rect.handle); +} -void CachedSurface::DownloadTexture(std::vector<u8>& staging_buffer) { - MICROPROFILE_SCOPE(OpenGL_Texture_Download); +TextureCacheRuntime::~TextureCacheRuntime() = default; - if (params.IsBuffer()) { - glGetNamedBufferSubData(texture_buffer.handle, 0, - static_cast<GLsizeiptr>(params.GetHostSizeInBytes(false)), - staging_buffer.data()); - return; - } +void TextureCacheRuntime::Finish() { + glFinish(); +} - SCOPE_EXIT({ glPixelStorei(GL_PACK_ROW_LENGTH, 0); }); +ImageBufferMap TextureCacheRuntime::MapUploadBuffer(size_t size) { + return upload_buffers.RequestMap(size, true); +} - for (u32 level = 0; level < params.emulated_levels; ++level) { - glPixelStorei(GL_PACK_ALIGNMENT, std::min(8U, params.GetRowAlignment(level, is_converted))); - glPixelStorei(GL_PACK_ROW_LENGTH, static_cast<GLint>(params.GetMipWidth(level))); - const std::size_t mip_offset = params.GetHostMipmapLevelOffset(level, is_converted); +ImageBufferMap TextureCacheRuntime::MapDownloadBuffer(size_t size) { + return download_buffers.RequestMap(size, false); +} - u8* const mip_data = staging_buffer.data() + mip_offset; - const GLsizei size = static_cast<GLsizei>(params.GetHostMipmapSize(level)); - if (is_compressed) { - 
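            // Compressed formats carry no client-side pixel format/type, so the
            // compressed read-back entry point has to be used for them.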
glGetCompressedTextureImage(texture.handle, level, size, mip_data); - } else { - glGetTextureImage(texture.handle, level, format, type, size, mip_data); - } +void TextureCacheRuntime::CopyImage(Image& dst_image, Image& src_image, + std::span<const ImageCopy> copies) { + const GLuint dst_name = dst_image.Handle(); + const GLuint src_name = src_image.Handle(); + const GLenum dst_target = ImageTarget(dst_image.info); + const GLenum src_target = ImageTarget(src_image.info); + for (const ImageCopy& copy : copies) { + const auto src_origin = MakeCopyOrigin(copy.src_offset, copy.src_subresource, src_target); + const auto dst_origin = MakeCopyOrigin(copy.dst_offset, copy.dst_subresource, dst_target); + const auto region = MakeCopyRegion(copy.extent, copy.dst_subresource, dst_target); + glCopyImageSubData(src_name, src_target, src_origin.level, src_origin.x, src_origin.y, + src_origin.z, dst_name, dst_target, dst_origin.level, dst_origin.x, + dst_origin.y, dst_origin.z, region.width, region.height, region.depth); } } -void CachedSurface::UploadTexture(const std::vector<u8>& staging_buffer) { - MICROPROFILE_SCOPE(OpenGL_Texture_Upload); - SCOPE_EXIT({ glPixelStorei(GL_UNPACK_ROW_LENGTH, 0); }); - for (u32 level = 0; level < params.emulated_levels; ++level) { - UploadTextureMipmap(level, staging_buffer); +bool TextureCacheRuntime::CanImageBeCopied(const Image& dst, const Image& src) { + if (dst.info.type == ImageType::e3D && dst.info.format == PixelFormat::BC4_UNORM) { + return false; } + return true; } -void CachedSurface::UploadTextureMipmap(u32 level, const std::vector<u8>& staging_buffer) { - glPixelStorei(GL_UNPACK_ALIGNMENT, std::min(8U, params.GetRowAlignment(level, is_converted))); - glPixelStorei(GL_UNPACK_ROW_LENGTH, static_cast<GLint>(params.GetMipWidth(level))); - - const std::size_t mip_offset = params.GetHostMipmapLevelOffset(level, is_converted); - const u8* buffer{staging_buffer.data() + mip_offset}; - if (is_compressed) { - const auto image_size{static_cast<GLsizei>(params.GetHostMipmapSize(level))}; - switch (params.target) { - case SurfaceTarget::Texture2D: - glCompressedTextureSubImage2D(texture.handle, level, 0, 0, - static_cast<GLsizei>(params.GetMipWidth(level)), - static_cast<GLsizei>(params.GetMipHeight(level)), - internal_format, image_size, buffer); - break; - case SurfaceTarget::Texture3D: - case SurfaceTarget::Texture2DArray: - case SurfaceTarget::TextureCubeArray: - glCompressedTextureSubImage3D(texture.handle, level, 0, 0, 0, - static_cast<GLsizei>(params.GetMipWidth(level)), - static_cast<GLsizei>(params.GetMipHeight(level)), - static_cast<GLsizei>(params.GetMipDepth(level)), - internal_format, image_size, buffer); - break; - case SurfaceTarget::TextureCubemap: { - const std::size_t layer_size{params.GetHostLayerSize(level)}; - for (std::size_t face = 0; face < params.depth; ++face) { - glCompressedTextureSubImage3D(texture.handle, level, 0, 0, static_cast<GLint>(face), - static_cast<GLsizei>(params.GetMipWidth(level)), - static_cast<GLsizei>(params.GetMipHeight(level)), 1, - internal_format, static_cast<GLsizei>(layer_size), - buffer); - buffer += layer_size; - } - break; - } - default: - UNREACHABLE(); - } +void TextureCacheRuntime::EmulateCopyImage(Image& dst, Image& src, + std::span<const ImageCopy> copies) { + if (dst.info.type == ImageType::e3D && dst.info.format == PixelFormat::BC4_UNORM) { + ASSERT(src.info.type == ImageType::e3D); + util_shaders.CopyBC4(dst, src, copies); } else { - switch (params.target) { - case SurfaceTarget::Texture1D: - 
glTextureSubImage1D(texture.handle, level, 0, params.GetMipWidth(level), format, type, - buffer); - break; - case SurfaceTarget::TextureBuffer: - ASSERT(level == 0); - glNamedBufferSubData(texture_buffer.handle, 0, - params.GetMipWidth(level) * params.GetBytesPerPixel(), buffer); - break; - case SurfaceTarget::Texture1DArray: - case SurfaceTarget::Texture2D: - glTextureSubImage2D(texture.handle, level, 0, 0, params.GetMipWidth(level), - params.GetMipHeight(level), format, type, buffer); - break; - case SurfaceTarget::Texture3D: - case SurfaceTarget::Texture2DArray: - case SurfaceTarget::TextureCubeArray: - glTextureSubImage3D( - texture.handle, level, 0, 0, 0, static_cast<GLsizei>(params.GetMipWidth(level)), - static_cast<GLsizei>(params.GetMipHeight(level)), - static_cast<GLsizei>(params.GetMipDepth(level)), format, type, buffer); - break; - case SurfaceTarget::TextureCubemap: - for (std::size_t face = 0; face < params.depth; ++face) { - glTextureSubImage3D(texture.handle, level, 0, 0, static_cast<GLint>(face), - params.GetMipWidth(level), params.GetMipHeight(level), 1, - format, type, buffer); - buffer += params.GetHostLayerSize(level); - } - break; - default: - UNREACHABLE(); - } + UNREACHABLE(); } } -void CachedSurface::DecorateSurfaceName() { - LabelGLObject(GL_TEXTURE, texture.handle, GetGpuAddr(), params.TargetName()); -} +void TextureCacheRuntime::BlitFramebuffer(Framebuffer* dst, Framebuffer* src, + const std::array<Offset2D, 2>& dst_region, + const std::array<Offset2D, 2>& src_region, + Tegra::Engines::Fermi2D::Filter filter, + Tegra::Engines::Fermi2D::Operation operation) { + state_tracker.NotifyScissor0(); + state_tracker.NotifyRasterizeEnable(); + state_tracker.NotifyFramebufferSRGB(); -void CachedSurfaceView::DecorateViewName(GPUVAddr gpu_addr, const std::string& prefix) { - LabelGLObject(GL_TEXTURE, main_view.handle, gpu_addr, prefix); + ASSERT(dst->BufferBits() == src->BufferBits()); + + glEnable(GL_FRAMEBUFFER_SRGB); + glDisable(GL_RASTERIZER_DISCARD); + glDisablei(GL_SCISSOR_TEST, 0); + + const GLbitfield buffer_bits = dst->BufferBits(); + const bool has_depth = (buffer_bits & ~GL_COLOR_BUFFER_BIT) != 0; + const bool is_linear = !has_depth && filter == Tegra::Engines::Fermi2D::Filter::Bilinear; + glBlitNamedFramebuffer(src->Handle(), dst->Handle(), src_region[0].x, src_region[0].y, + src_region[1].x, src_region[1].y, dst_region[0].x, dst_region[0].y, + dst_region[1].x, dst_region[1].y, buffer_bits, + is_linear ? 
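/* GL mandates GL_NEAREST whenever depth or stencil bits are part of the blit */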
GL_LINEAR : GL_NEAREST); } -View CachedSurface::CreateView(const ViewParams& view_key) { - return CreateViewInner(view_key, false); +void TextureCacheRuntime::AccelerateImageUpload(Image& image, const ImageBufferMap& map, + size_t buffer_offset, + std::span<const SwizzleParameters> swizzles) { + switch (image.info.type) { + case ImageType::e2D: + return util_shaders.BlockLinearUpload2D(image, map, buffer_offset, swizzles); + case ImageType::e3D: + return util_shaders.BlockLinearUpload3D(image, map, buffer_offset, swizzles); + case ImageType::Linear: + return util_shaders.PitchUpload(image, map, buffer_offset, swizzles); + default: + UNREACHABLE(); + break; + } } -View CachedSurface::CreateViewInner(const ViewParams& view_key, const bool is_proxy) { - auto view = std::make_shared<CachedSurfaceView>(*this, view_key, is_proxy); - views[view_key] = view; - if (!is_proxy) - view->DecorateViewName(gpu_addr, params.TargetName() + "V:" + std::to_string(view_count++)); - return view; +void TextureCacheRuntime::InsertUploadMemoryBarrier() { + glMemoryBarrier(GL_TEXTURE_FETCH_BARRIER_BIT | GL_SHADER_IMAGE_ACCESS_BARRIER_BIT); } -CachedSurfaceView::CachedSurfaceView(CachedSurface& surface, const ViewParams& params, - bool is_proxy) - : VideoCommon::ViewBase(params), surface{surface}, format{surface.internal_format}, - target{GetTextureTarget(params.target)}, is_proxy{is_proxy} { - if (!is_proxy) { - main_view = CreateTextureView(); +FormatProperties TextureCacheRuntime::FormatInfo(ImageType type, GLenum internal_format) const { + switch (type) { + case ImageType::e1D: + return format_properties[0].at(internal_format); + case ImageType::e2D: + case ImageType::Linear: + return format_properties[1].at(internal_format); + case ImageType::e3D: + return format_properties[2].at(internal_format); + default: + UNREACHABLE(); + return FormatProperties{}; } } -CachedSurfaceView::~CachedSurfaceView() = default; +TextureCacheRuntime::StagingBuffers::StagingBuffers(GLenum storage_flags_, GLenum map_flags_) + : storage_flags{storage_flags_}, map_flags{map_flags_} {} -void CachedSurfaceView::Attach(GLenum attachment, GLenum fb_target) const { - ASSERT(params.num_levels == 1); +TextureCacheRuntime::StagingBuffers::~StagingBuffers() = default; - if (params.target == SurfaceTarget::Texture3D) { - if (params.num_layers > 1) { - ASSERT(params.base_layer == 0); - glFramebufferTexture(fb_target, attachment, surface.texture.handle, params.base_level); - } else { - glFramebufferTexture3D(fb_target, attachment, target, surface.texture.handle, - params.base_level, params.base_layer); - } - return; +ImageBufferMap TextureCacheRuntime::StagingBuffers::RequestMap(size_t requested_size, + bool insert_fence) { + const size_t index = RequestBuffer(requested_size); + OGLSync* const sync = insert_fence ? 
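/* upload maps carry a fence so the staging buffer is not handed out again while the GPU may still be reading from it */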
&syncs[index] : nullptr; + return ImageBufferMap(buffers[index].handle, maps[index], requested_size, sync); +} + +size_t TextureCacheRuntime::StagingBuffers::RequestBuffer(size_t requested_size) { + if (const std::optional<size_t> index = FindBuffer(requested_size); index) { + return *index; } - if (params.num_layers > 1) { - UNIMPLEMENTED_IF(params.base_layer != 0); - glFramebufferTexture(fb_target, attachment, GetTexture(), 0); - return; + OGLBuffer& buffer = buffers.emplace_back(); + buffer.Create(); + glNamedBufferStorage(buffer.handle, requested_size, nullptr, + storage_flags | GL_MAP_PERSISTENT_BIT); + maps.push_back(static_cast<u8*>(glMapNamedBufferRange(buffer.handle, 0, requested_size, + map_flags | GL_MAP_PERSISTENT_BIT))); + + syncs.emplace_back(); + sizes.push_back(requested_size); + + ASSERT(syncs.size() == buffers.size() && buffers.size() == maps.size() && + maps.size() == sizes.size()); + + return buffers.size() - 1; +} + +std::optional<size_t> TextureCacheRuntime::StagingBuffers::FindBuffer(size_t requested_size) { + size_t smallest_buffer = std::numeric_limits<size_t>::max(); + std::optional<size_t> found; + const size_t num_buffers = sizes.size(); + for (size_t index = 0; index < num_buffers; ++index) { + const size_t buffer_size = sizes[index]; + if (buffer_size < requested_size || buffer_size >= smallest_buffer) { + continue; + } + if (syncs[index].handle != 0) { + GLint status; + glGetSynciv(syncs[index].handle, GL_SYNC_STATUS, 1, nullptr, &status); + if (status != GL_SIGNALED) { + continue; + } + syncs[index].Release(); + } + smallest_buffer = buffer_size; + found = index; } + return found; +} - const GLenum view_target = surface.GetTarget(); - const GLuint texture = surface.GetTexture(); - switch (surface.GetSurfaceParams().target) { - case SurfaceTarget::Texture1D: - glFramebufferTexture1D(fb_target, attachment, view_target, texture, params.base_level); +Image::Image(TextureCacheRuntime& runtime, const VideoCommon::ImageInfo& info_, GPUVAddr gpu_addr_, + VAddr cpu_addr_) + : VideoCommon::ImageBase(info_, gpu_addr_, cpu_addr_) { + if (CanBeAccelerated(runtime, info)) { + flags |= ImageFlagBits::AcceleratedUpload; + } + if (IsConverted(runtime.device, info.format, info.type)) { + flags |= ImageFlagBits::Converted; + gl_internal_format = IsPixelFormatSRGB(info.format) ? 
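/* converted images (e.g. ASTC decoded on the CPU) are stored as plain RGBA8 */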
GL_SRGB8_ALPHA8 : GL_RGBA8; + gl_format = GL_RGBA; + gl_type = GL_UNSIGNED_INT_8_8_8_8_REV; + } else { + const auto& tuple = GetFormatTuple(info.format); + gl_internal_format = tuple.internal_format; + gl_format = tuple.format; + gl_type = tuple.type; + } + const GLenum target = ImageTarget(info); + const GLsizei width = info.size.width; + const GLsizei height = info.size.height; + const GLsizei depth = info.size.depth; + const int max_host_mip_levels = std::bit_width(info.size.width); + const GLsizei num_levels = std::min(info.resources.levels, max_host_mip_levels); + const GLsizei num_layers = info.resources.layers; + const GLsizei num_samples = info.num_samples; + + GLuint handle = 0; + if (target != GL_TEXTURE_BUFFER) { + texture.Create(target); + handle = texture.handle; + } + switch (target) { + case GL_TEXTURE_1D_ARRAY: + glTextureStorage2D(handle, num_levels, gl_internal_format, width, num_layers); break; - case SurfaceTarget::Texture2D: - glFramebufferTexture2D(fb_target, attachment, view_target, texture, params.base_level); + case GL_TEXTURE_2D_ARRAY: + glTextureStorage3D(handle, num_levels, gl_internal_format, width, height, num_layers); break; - case SurfaceTarget::Texture1DArray: - case SurfaceTarget::Texture2DArray: - case SurfaceTarget::TextureCubemap: - case SurfaceTarget::TextureCubeArray: - glFramebufferTextureLayer(fb_target, attachment, texture, params.base_level, - params.base_layer); + case GL_TEXTURE_2D_MULTISAMPLE_ARRAY: { + // TODO: Where should 'fixedsamplelocations' come from? + const auto [samples_x, samples_y] = SamplesLog2(info.num_samples); + glTextureStorage3DMultisample(handle, num_samples, gl_internal_format, width >> samples_x, + height >> samples_y, num_layers, GL_FALSE); + break; + } + case GL_TEXTURE_RECTANGLE: + glTextureStorage2D(handle, num_levels, gl_internal_format, width, height); + break; + case GL_TEXTURE_3D: + glTextureStorage3D(handle, num_levels, gl_internal_format, width, height, depth); + break; + case GL_TEXTURE_BUFFER: + buffer.Create(); + glNamedBufferStorage(buffer.handle, guest_size_bytes, nullptr, 0); break; default: - UNIMPLEMENTED(); + UNREACHABLE_MSG("Invalid target=0x{:x}", target); + break; + } + if (runtime.device.HasDebuggingToolAttached()) { + const std::string name = VideoCommon::Name(*this); + glObjectLabel(target == GL_TEXTURE_BUFFER ? 
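/* buffer-backed images label the backing buffer object rather than a texture */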
GL_BUFFER : GL_TEXTURE, handle, + static_cast<GLsizei>(name.size()), name.data()); } } -GLuint CachedSurfaceView::GetTexture(SwizzleSource x_source, SwizzleSource y_source, - SwizzleSource z_source, SwizzleSource w_source) { - if (GetSurfaceParams().IsBuffer()) { - return GetTexture(); - } - const u32 new_swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source); - if (current_swizzle == new_swizzle) { - return current_view; - } - current_swizzle = new_swizzle; +void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset, + std::span<const VideoCommon::BufferImageCopy> copies) { + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, map.Handle()); + glFlushMappedBufferRange(GL_PIXEL_UNPACK_BUFFER, buffer_offset, unswizzled_size_bytes); - const auto [entry, is_cache_miss] = view_cache.try_emplace(new_swizzle); - OGLTextureView& view = entry->second; - if (!is_cache_miss) { - current_view = view.handle; - return view.handle; - } - view = CreateTextureView(); - current_view = view.handle; + glPixelStorei(GL_UNPACK_ALIGNMENT, 1); - std::array swizzle{x_source, y_source, z_source, w_source}; + u32 current_row_length = std::numeric_limits<u32>::max(); + u32 current_image_height = std::numeric_limits<u32>::max(); - switch (const PixelFormat format = GetSurfaceParams().pixel_format) { - case PixelFormat::D24_UNORM_S8_UINT: - case PixelFormat::D32_FLOAT_S8_UINT: - case PixelFormat::S8_UINT_D24_UNORM: - UNIMPLEMENTED_IF(x_source != SwizzleSource::R && x_source != SwizzleSource::G); - glTextureParameteri(view.handle, GL_DEPTH_STENCIL_TEXTURE_MODE, - GetComponent(format, x_source == SwizzleSource::R)); - - // Make sure we sample the first component - std::transform(swizzle.begin(), swizzle.end(), swizzle.begin(), [](SwizzleSource value) { - return value == SwizzleSource::G ? 
SwizzleSource::R : value; - }); - [[fallthrough]]; - default: { - const std::array gl_swizzle = {GetSwizzleSource(swizzle[0]), GetSwizzleSource(swizzle[1]), - GetSwizzleSource(swizzle[2]), GetSwizzleSource(swizzle[3])}; - glTextureParameteriv(view.handle, GL_TEXTURE_SWIZZLE_RGBA, gl_swizzle.data()); - break; - } + for (const VideoCommon::BufferImageCopy& copy : copies) { + if (current_row_length != copy.buffer_row_length) { + current_row_length = copy.buffer_row_length; + glPixelStorei(GL_UNPACK_ROW_LENGTH, current_row_length); + } + if (current_image_height != copy.buffer_image_height) { + current_image_height = copy.buffer_image_height; + glPixelStorei(GL_UNPACK_IMAGE_HEIGHT, current_image_height); + } + CopyBufferToImage(copy, buffer_offset); } - return view.handle; } -OGLTextureView CachedSurfaceView::CreateTextureView() const { - OGLTextureView texture_view; - texture_view.Create(); - - if (target == GL_TEXTURE_3D) { - glTextureView(texture_view.handle, target, surface.texture.handle, format, - params.base_level, params.num_levels, 0, 1); - } else { - glTextureView(texture_view.handle, target, surface.texture.handle, format, - params.base_level, params.num_levels, params.base_layer, params.num_layers); +void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset, + std::span<const VideoCommon::BufferCopy> copies) { + for (const VideoCommon::BufferCopy& copy : copies) { + glCopyNamedBufferSubData(map.Handle(), buffer.handle, copy.src_offset + buffer_offset, + copy.dst_offset, copy.size); } - ApplyTextureDefaults(surface.GetSurfaceParams(), texture_view.handle); - - return texture_view; } -TextureCacheOpenGL::TextureCacheOpenGL(VideoCore::RasterizerInterface& rasterizer, - Tegra::Engines::Maxwell3D& maxwell3d, - Tegra::MemoryManager& gpu_memory, const Device& device, - StateTracker& state_tracker_) - : TextureCacheBase{rasterizer, maxwell3d, gpu_memory, device.HasASTC()}, state_tracker{ - state_tracker_} { - src_framebuffer.Create(); - dst_framebuffer.Create(); -} +void Image::DownloadMemory(ImageBufferMap& map, size_t buffer_offset, + std::span<const VideoCommon::BufferImageCopy> copies) { + glMemoryBarrier(GL_PIXEL_BUFFER_BARRIER_BIT); // TODO: Move this to its own API -TextureCacheOpenGL::~TextureCacheOpenGL() = default; + glBindBuffer(GL_PIXEL_PACK_BUFFER, map.Handle()); + glPixelStorei(GL_PACK_ALIGNMENT, 1); -Surface TextureCacheOpenGL::CreateSurface(GPUVAddr gpu_addr, const SurfaceParams& params) { - return std::make_shared<CachedSurface>(gpu_addr, params, is_astc_supported); -} + u32 current_row_length = std::numeric_limits<u32>::max(); + u32 current_image_height = std::numeric_limits<u32>::max(); -void TextureCacheOpenGL::ImageCopy(Surface& src_surface, Surface& dst_surface, - const VideoCommon::CopyParams& copy_params) { - const auto& src_params = src_surface->GetSurfaceParams(); - const auto& dst_params = dst_surface->GetSurfaceParams(); - if (src_params.type != dst_params.type) { - // A fallback is needed - return; + for (const VideoCommon::BufferImageCopy& copy : copies) { + if (current_row_length != copy.buffer_row_length) { + current_row_length = copy.buffer_row_length; + glPixelStorei(GL_PACK_ROW_LENGTH, current_row_length); + } + if (current_image_height != copy.buffer_image_height) { + current_image_height = copy.buffer_image_height; + glPixelStorei(GL_PACK_IMAGE_HEIGHT, current_image_height); + } + CopyImageToBuffer(copy, buffer_offset); } - const auto src_handle = src_surface->GetTexture(); - const auto src_target = src_surface->GetTarget(); - const auto 
dst_handle = dst_surface->GetTexture(); - const auto dst_target = dst_surface->GetTarget(); - glCopyImageSubData(src_handle, src_target, copy_params.source_level, copy_params.source_x, - copy_params.source_y, copy_params.source_z, dst_handle, dst_target, - copy_params.dest_level, copy_params.dest_x, copy_params.dest_y, - copy_params.dest_z, copy_params.width, copy_params.height, - copy_params.depth); } -void TextureCacheOpenGL::ImageBlit(View& src_view, View& dst_view, - const Tegra::Engines::Fermi2D::Config& copy_config) { - const auto& src_params{src_view->GetSurfaceParams()}; - const auto& dst_params{dst_view->GetSurfaceParams()}; - UNIMPLEMENTED_IF(src_params.depth != 1); - UNIMPLEMENTED_IF(dst_params.depth != 1); - - state_tracker.NotifyScissor0(); - state_tracker.NotifyFramebuffer(); - state_tracker.NotifyRasterizeEnable(); - state_tracker.NotifyFramebufferSRGB(); +void Image::CopyBufferToImage(const VideoCommon::BufferImageCopy& copy, size_t buffer_offset) { + // Compressed formats don't have a pixel format or type + const bool is_compressed = gl_format == GL_NONE; + const void* const offset = reinterpret_cast<const void*>(copy.buffer_offset + buffer_offset); - if (dst_params.srgb_conversion) { - glEnable(GL_FRAMEBUFFER_SRGB); - } else { - glDisable(GL_FRAMEBUFFER_SRGB); + switch (info.type) { + case ImageType::e1D: + if (is_compressed) { + glCompressedTextureSubImage2D(texture.handle, copy.image_subresource.base_level, + copy.image_offset.x, copy.image_subresource.base_layer, + copy.image_extent.width, + copy.image_subresource.num_layers, gl_internal_format, + static_cast<GLsizei>(copy.buffer_size), offset); + } else { + glTextureSubImage2D(texture.handle, copy.image_subresource.base_level, + copy.image_offset.x, copy.image_subresource.base_layer, + copy.image_extent.width, copy.image_subresource.num_layers, + gl_format, gl_type, offset); + } + break; + case ImageType::e2D: + case ImageType::Linear: + if (is_compressed) { + glCompressedTextureSubImage3D( + texture.handle, copy.image_subresource.base_level, copy.image_offset.x, + copy.image_offset.y, copy.image_subresource.base_layer, copy.image_extent.width, + copy.image_extent.height, copy.image_subresource.num_layers, gl_internal_format, + static_cast<GLsizei>(copy.buffer_size), offset); + } else { + glTextureSubImage3D(texture.handle, copy.image_subresource.base_level, + copy.image_offset.x, copy.image_offset.y, + copy.image_subresource.base_layer, copy.image_extent.width, + copy.image_extent.height, copy.image_subresource.num_layers, + gl_format, gl_type, offset); + } + break; + case ImageType::e3D: + if (is_compressed) { + glCompressedTextureSubImage3D( + texture.handle, copy.image_subresource.base_level, copy.image_offset.x, + copy.image_offset.y, copy.image_offset.z, copy.image_extent.width, + copy.image_extent.height, copy.image_extent.depth, gl_internal_format, + static_cast<GLsizei>(copy.buffer_size), offset); + } else { + glTextureSubImage3D(texture.handle, copy.image_subresource.base_level, + copy.image_offset.x, copy.image_offset.y, copy.image_offset.z, + copy.image_extent.width, copy.image_extent.height, + copy.image_extent.depth, gl_format, gl_type, offset); + } + break; + default: + UNREACHABLE(); } - glDisable(GL_RASTERIZER_DISCARD); - glDisablei(GL_SCISSOR_TEST, 0); - - glBindFramebuffer(GL_READ_FRAMEBUFFER, src_framebuffer.handle); - glBindFramebuffer(GL_DRAW_FRAMEBUFFER, dst_framebuffer.handle); - - GLenum buffers = 0; - if (src_params.type == SurfaceType::ColorTexture) { - src_view->Attach(GL_COLOR_ATTACHMENT0, 
GL_READ_FRAMEBUFFER); - glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, - 0); - - dst_view->Attach(GL_COLOR_ATTACHMENT0, GL_DRAW_FRAMEBUFFER); - glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_DEPTH_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, - 0); - - buffers = GL_COLOR_BUFFER_BIT; - } else if (src_params.type == SurfaceType::Depth) { - glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0); - src_view->Attach(GL_DEPTH_ATTACHMENT, GL_READ_FRAMEBUFFER); - glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0); +} - glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0); - dst_view->Attach(GL_DEPTH_ATTACHMENT, GL_DRAW_FRAMEBUFFER); - glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_TEXTURE_2D, 0, 0); +void Image::CopyImageToBuffer(const VideoCommon::BufferImageCopy& copy, size_t buffer_offset) { + const GLint x_offset = copy.image_offset.x; + const GLsizei width = copy.image_extent.width; - buffers = GL_DEPTH_BUFFER_BIT; - } else if (src_params.type == SurfaceType::DepthStencil) { - glFramebufferTexture2D(GL_READ_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0); - src_view->Attach(GL_DEPTH_STENCIL_ATTACHMENT, GL_READ_FRAMEBUFFER); + const GLint level = copy.image_subresource.base_level; + const GLsizei buffer_size = static_cast<GLsizei>(copy.buffer_size); + void* const offset = reinterpret_cast<void*>(copy.buffer_offset + buffer_offset); - glFramebufferTexture2D(GL_DRAW_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, 0, 0); - dst_view->Attach(GL_DEPTH_STENCIL_ATTACHMENT, GL_DRAW_FRAMEBUFFER); + GLint y_offset = 0; + GLint z_offset = 0; + GLsizei height = 1; + GLsizei depth = 1; - buffers = GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT; + switch (info.type) { + case ImageType::e1D: + y_offset = copy.image_subresource.base_layer; + height = copy.image_subresource.num_layers; + break; + case ImageType::e2D: + case ImageType::Linear: + y_offset = copy.image_offset.y; + z_offset = copy.image_subresource.base_layer; + height = copy.image_extent.height; + depth = copy.image_subresource.num_layers; + break; + case ImageType::e3D: + y_offset = copy.image_offset.y; + z_offset = copy.image_offset.z; + height = copy.image_extent.height; + depth = copy.image_extent.depth; + break; + default: + UNREACHABLE(); + } + // Compressed formats don't have a pixel format or type + const bool is_compressed = gl_format == GL_NONE; + if (is_compressed) { + glGetCompressedTextureSubImage(texture.handle, level, x_offset, y_offset, z_offset, width, + height, depth, buffer_size, offset); + } else { + glGetTextureSubImage(texture.handle, level, x_offset, y_offset, z_offset, width, height, + depth, gl_format, gl_type, buffer_size, offset); } - - const Common::Rectangle<u32>& src_rect = copy_config.src_rect; - const Common::Rectangle<u32>& dst_rect = copy_config.dst_rect; - const bool is_linear = copy_config.filter == Tegra::Engines::Fermi2D::Filter::Linear; - - glBlitFramebuffer(static_cast<GLint>(src_rect.left), static_cast<GLint>(src_rect.top), - static_cast<GLint>(src_rect.right), static_cast<GLint>(src_rect.bottom), - static_cast<GLint>(dst_rect.left), static_cast<GLint>(dst_rect.top), - static_cast<GLint>(dst_rect.right), static_cast<GLint>(dst_rect.bottom), - buffers, - is_linear && (buffers == GL_COLOR_BUFFER_BIT) ? 
                           GL_LINEAR : GL_NEAREST);
}

-void TextureCacheOpenGL::BufferCopy(Surface& src_surface, Surface& dst_surface) {
-    MICROPROFILE_SCOPE(OpenGL_Texture_Buffer_Copy);
-    const auto& src_params = src_surface->GetSurfaceParams();
-    const auto& dst_params = dst_surface->GetSurfaceParams();
-    UNIMPLEMENTED_IF(src_params.num_levels > 1 || dst_params.num_levels > 1);
+ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::ImageViewInfo& info,
+                     ImageId image_id_, Image& image)
+    : VideoCommon::ImageViewBase{info, image.info, image_id_}, views{runtime.null_image_views} {
+    const Device& device = runtime.device;
+    if (True(image.flags & ImageFlagBits::Converted)) {
+        internal_format = IsPixelFormatSRGB(info.format) ? GL_SRGB8_ALPHA8 : GL_RGBA8;
+    } else {
+        internal_format = GetFormatTuple(format).internal_format;
+    }
+    VideoCommon::SubresourceRange flatten_range = info.range;
+    std::array<GLuint, 2> handles;
+    stored_views.reserve(2);
-    const auto source_format = GetFormatTuple(src_params.pixel_format);
-    const auto dest_format = GetFormatTuple(dst_params.pixel_format);
+    switch (info.type) {
+    case ImageViewType::e1DArray:
+        flatten_range.extent.layers = 1;
+        [[fallthrough]];
+    case ImageViewType::e1D:
+        glGenTextures(2, handles.data());
+        SetupView(device, image, ImageViewType::e1D, handles[0], info, flatten_range);
+        SetupView(device, image, ImageViewType::e1DArray, handles[1], info, info.range);
+        break;
+    case ImageViewType::e2DArray:
+        flatten_range.extent.layers = 1;
+        [[fallthrough]];
+    case ImageViewType::e2D:
+        if (True(flags & VideoCommon::ImageViewFlagBits::Slice)) {
+            // 2D and 2D array views on a 3D texture are used exclusively for render targets
+            ASSERT(info.range.extent.levels == 1);
+            const VideoCommon::SubresourceRange slice_range{
+                .base = {.level = info.range.base.level, .layer = 0},
+                .extent = {.levels = 1, .layers = 1},
+            };
+            glGenTextures(1, handles.data());
+            SetupView(device, image, ImageViewType::e3D, handles[0], info, slice_range);
+            break;
+        }
+        glGenTextures(2, handles.data());
+        SetupView(device, image, ImageViewType::e2D, handles[0], info, flatten_range);
+        SetupView(device, image, ImageViewType::e2DArray, handles[1], info, info.range);
+        break;
+    case ImageViewType::e3D:
+        glGenTextures(1, handles.data());
+        SetupView(device, image, ImageViewType::e3D, handles[0], info, info.range);
+        break;
+    case ImageViewType::CubeArray:
+        flatten_range.extent.layers = 6;
+        [[fallthrough]];
+    case ImageViewType::Cube:
+        glGenTextures(2, handles.data());
+        SetupView(device, image, ImageViewType::Cube, handles[0], info, flatten_range);
+        SetupView(device, image, ImageViewType::CubeArray, handles[1], info, info.range);
+        break;
+    case ImageViewType::Rect:
+        glGenTextures(1, handles.data());
+        SetupView(device, image, ImageViewType::Rect, handles[0], info, info.range);
+        break;
+    case ImageViewType::Buffer:
+        glCreateTextures(GL_TEXTURE_BUFFER, 1, handles.data());
+        SetupView(device, image, ImageViewType::Buffer, handles[0], info, info.range);
+        break;
+    }
+    default_handle = Handle(info.type);
+}
-    const std::size_t source_size = src_surface->GetHostSizeInBytes();
-    const std::size_t dest_size = dst_surface->GetHostSizeInBytes();
+ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::NullImageParams& params)
+    : VideoCommon::ImageViewBase{params}, views{runtime.null_image_views} {}
-    const std::size_t buffer_size = std::max(source_size, dest_size);
+void ImageView::SetupView(const Device& device, Image& image, ImageViewType view_type,
+                          GLuint handle, const VideoCommon::ImageViewInfo& info,
+                          VideoCommon::SubresourceRange view_range) {
+    if (info.type == ImageViewType::Buffer) {
+        // TODO: Take offset from buffer cache
+        glTextureBufferRange(handle, internal_format, image.buffer.handle, 0,
+                             image.guest_size_bytes);
+    } else {
+        const GLuint parent = image.texture.handle;
+        const GLenum target = ImageTarget(view_type, image.info.num_samples);
+        glTextureView(handle, target, parent, internal_format, view_range.base.level,
+                      view_range.extent.levels, view_range.base.layer, view_range.extent.layers);
+        if (!info.IsRenderTarget()) {
+            ApplySwizzle(handle, format, info.Swizzle());
+        }
+    }
+    if (device.HasDebuggingToolAttached()) {
+        const std::string name = VideoCommon::Name(*this, view_type);
+        glObjectLabel(GL_TEXTURE, handle, static_cast<GLsizei>(name.size()), name.data());
+    }
+    stored_views.emplace_back().handle = handle;
+    views[static_cast<size_t>(view_type)] = handle;
+}
-    GLuint copy_pbo_handle = FetchPBO(buffer_size);
+Sampler::Sampler(TextureCacheRuntime& runtime, const TSCEntry& config) {
+    const GLenum compare_mode = config.depth_compare_enabled ? GL_COMPARE_REF_TO_TEXTURE : GL_NONE;
+    const GLenum compare_func = MaxwellToGL::DepthCompareFunc(config.depth_compare_func);
+    const GLenum mag = MaxwellToGL::TextureFilterMode(config.mag_filter, TextureMipmapFilter::None);
+    const GLenum min = MaxwellToGL::TextureFilterMode(config.min_filter, config.mipmap_filter);
+    const GLenum reduction_filter = MaxwellToGL::ReductionFilter(config.reduction_filter);
+    const GLint seamless = config.cubemap_interface_filtering ? GL_TRUE : GL_FALSE;
+
+    UNIMPLEMENTED_IF(config.cubemap_anisotropy != 1);
+    UNIMPLEMENTED_IF(config.float_coord_normalization != 0);
+
+    sampler.Create();
+    const GLuint handle = sampler.handle;
+    glSamplerParameteri(handle, GL_TEXTURE_WRAP_S, MaxwellToGL::WrapMode(config.wrap_u));
+    glSamplerParameteri(handle, GL_TEXTURE_WRAP_T, MaxwellToGL::WrapMode(config.wrap_v));
+    glSamplerParameteri(handle, GL_TEXTURE_WRAP_R, MaxwellToGL::WrapMode(config.wrap_p));
+    glSamplerParameteri(handle, GL_TEXTURE_COMPARE_MODE, compare_mode);
+    glSamplerParameteri(handle, GL_TEXTURE_COMPARE_FUNC, compare_func);
+    glSamplerParameteri(handle, GL_TEXTURE_MAG_FILTER, mag);
+    glSamplerParameteri(handle, GL_TEXTURE_MIN_FILTER, min);
+    glSamplerParameterf(handle, GL_TEXTURE_LOD_BIAS, config.LodBias());
+    glSamplerParameterf(handle, GL_TEXTURE_MIN_LOD, config.MinLod());
+    glSamplerParameterf(handle, GL_TEXTURE_MAX_LOD, config.MaxLod());
+    glSamplerParameterfv(handle, GL_TEXTURE_BORDER_COLOR, config.BorderColor().data());
+
+    if (GLAD_GL_ARB_texture_filter_anisotropic || GLAD_GL_EXT_texture_filter_anisotropic) {
+        glSamplerParameterf(handle, GL_TEXTURE_MAX_ANISOTROPY, config.MaxAnisotropy());
+    } else {
+        LOG_WARNING(Render_OpenGL, "GL_ARB_texture_filter_anisotropic is required");
+    }
+    if (GLAD_GL_ARB_texture_filter_minmax || GLAD_GL_EXT_texture_filter_minmax) {
+        glSamplerParameteri(handle, GL_TEXTURE_REDUCTION_MODE_ARB, reduction_filter);
+    } else if (reduction_filter != GL_WEIGHTED_AVERAGE_ARB) {
+        LOG_WARNING(Render_OpenGL, "GL_ARB_texture_filter_minmax is required");
+    }
+    if (GLAD_GL_ARB_seamless_cubemap_per_texture || GLAD_GL_AMD_seamless_cubemap_per_texture) {
+        glSamplerParameteri(handle, GL_TEXTURE_CUBE_MAP_SEAMLESS, seamless);
+    } else if (seamless == GL_FALSE) {
+        // We default to false because it's more common
+        LOG_WARNING(Render_OpenGL, "GL_ARB_seamless_cubemap_per_texture is required");
+    }
+}
-    glBindBuffer(GL_PIXEL_PACK_BUFFER, copy_pbo_handle);
+Framebuffer::Framebuffer(TextureCacheRuntime& runtime, std::span<ImageView*, NUM_RT> color_buffers,
+                         ImageView* depth_buffer, const VideoCommon::RenderTargets& key) {
+    // Bind to READ_FRAMEBUFFER to stop Nvidia's driver from creating an EXT_framebuffer instead
+    // of a core framebuffer. EXT framebuffer attachments have to match in size and can be shared
+    // across contexts. yuzu doesn't share framebuffers across contexts and needs attachments with
+    // mismatching sizes, which is why core framebuffers are preferred.
+    GLuint handle;
+    glGenFramebuffers(1, &handle);
+    glBindFramebuffer(GL_READ_FRAMEBUFFER, handle);
+
+    GLsizei num_buffers = 0;
+    std::array<GLenum, NUM_RT> gl_draw_buffers;
+    gl_draw_buffers.fill(GL_NONE);
+
+    for (size_t index = 0; index < color_buffers.size(); ++index) {
+        const ImageView* const image_view = color_buffers[index];
+        if (!image_view) {
+            continue;
+        }
+        buffer_bits |= GL_COLOR_BUFFER_BIT;
+        gl_draw_buffers[index] = GL_COLOR_ATTACHMENT0 + key.draw_buffers[index];
+        num_buffers = static_cast<GLsizei>(index + 1);
-    if (src_surface->IsCompressed()) {
-        glGetCompressedTextureImage(src_surface->GetTexture(), 0, static_cast<GLsizei>(source_size),
-                                    nullptr);
-    } else {
-        glGetTextureImage(src_surface->GetTexture(), 0, source_format.format, source_format.type,
-                          static_cast<GLsizei>(source_size), nullptr);
+        const GLenum attachment = static_cast<GLenum>(GL_COLOR_ATTACHMENT0 + index);
+        AttachTexture(handle, attachment, image_view);
    }
-    glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
-    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, copy_pbo_handle);
+    if (const ImageView* const image_view = depth_buffer; image_view) {
+        if (GetFormatType(image_view->format) == SurfaceType::DepthStencil) {
+            buffer_bits |= GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT;
+        } else {
+            buffer_bits |= GL_DEPTH_BUFFER_BIT;
+        }
+        const GLenum attachment = AttachmentType(image_view->format);
+        AttachTexture(handle, attachment, image_view);
+    }
-    const GLsizei width = static_cast<GLsizei>(dst_params.width);
-    const GLsizei height = static_cast<GLsizei>(dst_params.height);
-    const GLsizei depth = static_cast<GLsizei>(dst_params.depth);
-    if (dst_surface->IsCompressed()) {
-        LOG_CRITICAL(HW_GPU, "Compressed buffer copy is unimplemented!");
-        UNREACHABLE();
+    if (num_buffers > 1) {
+        glNamedFramebufferDrawBuffers(handle, num_buffers, gl_draw_buffers.data());
+    } else if (num_buffers > 0) {
+        glNamedFramebufferDrawBuffer(handle, gl_draw_buffers[0]);
    } else {
-        switch (dst_params.target) {
-        case SurfaceTarget::Texture1D:
-            glTextureSubImage1D(dst_surface->GetTexture(), 0, 0, width, dest_format.format,
-                                dest_format.type, nullptr);
-            break;
-        case SurfaceTarget::Texture2D:
-            glTextureSubImage2D(dst_surface->GetTexture(), 0, 0, 0, width, height,
-                                dest_format.format, dest_format.type, nullptr);
-            break;
-        case SurfaceTarget::Texture3D:
-        case SurfaceTarget::Texture2DArray:
-        case SurfaceTarget::TextureCubeArray:
-            glTextureSubImage3D(dst_surface->GetTexture(), 0, 0, 0, 0, width, height, depth,
-                                dest_format.format, dest_format.type, nullptr);
-            break;
-        case SurfaceTarget::TextureCubemap:
-            glTextureSubImage3D(dst_surface->GetTexture(), 0, 0, 0, 0, width, height, depth,
-                                dest_format.format, dest_format.type, nullptr);
-            break;
-        default:
-            LOG_CRITICAL(Render_OpenGL, "Unimplemented surface target={}",
-                         static_cast<u32>(dst_params.target));
-            UNREACHABLE();
-        }
+        glNamedFramebufferDrawBuffer(handle, GL_NONE);
    }
-    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
-    glTextureBarrier();
-}
+    glNamedFramebufferParameteri(handle, GL_FRAMEBUFFER_DEFAULT_WIDTH, key.size.width);
+    glNamedFramebufferParameteri(handle, GL_FRAMEBUFFER_DEFAULT_HEIGHT, key.size.height);
+    // TODO
+    // glNamedFramebufferParameteri(handle, GL_FRAMEBUFFER_DEFAULT_LAYERS, ...);
+    // glNamedFramebufferParameteri(handle, GL_FRAMEBUFFER_DEFAULT_SAMPLES, ...);
+    // glNamedFramebufferParameteri(handle, GL_FRAMEBUFFER_DEFAULT_FIXED_SAMPLE_LOCATIONS, ...);
-GLuint TextureCacheOpenGL::FetchPBO(std::size_t buffer_size) {
-    ASSERT_OR_EXECUTE(buffer_size > 0, { return 0; });
-    const u32 l2 = Common::Log2Ceil64(static_cast<u64>(buffer_size));
-    OGLBuffer& cp = copy_pbo_cache[l2];
-    if (cp.handle == 0) {
-        const std::size_t ceil_size = 1ULL << l2;
-        cp.Create();
-        cp.MakeStreamCopy(ceil_size);
+    if (runtime.device.HasDebuggingToolAttached()) {
+        const std::string name = VideoCommon::Name(key);
+        glObjectLabel(GL_FRAMEBUFFER, handle, static_cast<GLsizei>(name.size()), name.data());
    }
-    return cp.handle;
+    framebuffer.handle = handle;
}

} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h
index 7787134fc..15b7c3676 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.h
+++ b/src/video_core/renderer_opengl/gl_texture_cache.h
@@ -4,156 +4,251 @@

#pragma once

-#include <array>
-#include <functional>
#include <memory>
-#include <unordered_map>
-#include <utility>
-#include <vector>
+#include <span>

#include <glad/glad.h>

-#include "common/common_types.h"
-#include "video_core/engines/shader_bytecode.h"
-#include "video_core/renderer_opengl/gl_device.h"
#include "video_core/renderer_opengl/gl_resource_manager.h"
+#include "video_core/renderer_opengl/util_shaders.h"
#include "video_core/texture_cache/texture_cache.h"

namespace OpenGL {

-using VideoCommon::SurfaceParams;
-using VideoCommon::ViewParams;
-
-class CachedSurfaceView;
-class CachedSurface;
-class TextureCacheOpenGL;
+class Device;
+class ProgramManager;
class StateTracker;

-using Surface = std::shared_ptr<CachedSurface>;
-using View = std::shared_ptr<CachedSurfaceView>;
-using TextureCacheBase = VideoCommon::TextureCache<Surface, View>;
+class Framebuffer;
+class Image;
+class ImageView;
+class Sampler;

-class CachedSurface final : public VideoCommon::SurfaceBase<View> {
-    friend CachedSurfaceView;
+using VideoCommon::ImageId;
+using VideoCommon::ImageViewId;
+using VideoCommon::ImageViewType;
+using VideoCommon::NUM_RT;
+using VideoCommon::Offset2D;
+using VideoCommon::RenderTargets;

+class ImageBufferMap {
public:
-    explicit CachedSurface(GPUVAddr gpu_addr, const SurfaceParams& params, bool is_astc_supported);
-    ~CachedSurface();
-
-    void UploadTexture(const std::vector<u8>& staging_buffer) override;
-    void DownloadTexture(std::vector<u8>& staging_buffer) override;
+    explicit ImageBufferMap(GLuint handle, u8* map, size_t size, OGLSync* sync);
+    ~ImageBufferMap();

-    GLenum GetTarget() const {
-        return target;
+    GLuint Handle() const noexcept {
+        return handle;
    }

-    GLuint GetTexture() const {
-        return texture.handle;
+    std::span<u8> Span() const noexcept {
+        return span;
    }

-    bool IsCompressed() const {
-        return is_compressed;
+private:
+    std::span<u8> span;
+    OGLSync* sync;
+    GLuint handle;
+};
+
+struct FormatProperties {
+    GLenum compatibility_class;
+    bool compatibility_by_size;
+    bool is_compressed;
+};
+
+class TextureCacheRuntime {
+    friend Framebuffer;
+    friend Image;
+    friend ImageView;
+    friend Sampler;
+
+public:
+    explicit TextureCacheRuntime(const Device& device, ProgramManager& program_manager,
+                                 StateTracker& state_tracker);
+    ~TextureCacheRuntime();
+
+    void Finish();
+
+    ImageBufferMap MapUploadBuffer(size_t size);
+
+    ImageBufferMap MapDownloadBuffer(size_t size);
+
+    void CopyImage(Image& dst, Image& src, std::span<const VideoCommon::ImageCopy> copies);
+
+    void ConvertImage(Framebuffer* dst, ImageView& dst_view, ImageView& src_view) {
+        UNIMPLEMENTED();
    }

-protected:
-    void DecorateSurfaceName() override;
+    bool CanImageBeCopied(const Image& dst, const Image& src);
+
+    void EmulateCopyImage(Image& dst, Image& src, std::span<const VideoCommon::ImageCopy> copies);
+
+    void BlitFramebuffer(Framebuffer* dst, Framebuffer* src,
+                         const std::array<Offset2D, 2>& dst_region,
+                         const std::array<Offset2D, 2>& src_region,
+                         Tegra::Engines::Fermi2D::Filter filter,
+                         Tegra::Engines::Fermi2D::Operation operation);
+
+    void AccelerateImageUpload(Image& image, const ImageBufferMap& map, size_t buffer_offset,
+                               std::span<const VideoCommon::SwizzleParameters> swizzles);

-    View CreateView(const ViewParams& view_key) override;
-    View CreateViewInner(const ViewParams& view_key, bool is_proxy);
+    void InsertUploadMemoryBarrier();
+
+    FormatProperties FormatInfo(VideoCommon::ImageType type, GLenum internal_format) const;
+
+    bool HasBrokenTextureViewFormats() const noexcept {
+        return has_broken_texture_view_formats;
+    }

private:
-    void UploadTextureMipmap(u32 level, const std::vector<u8>& staging_buffer);
+    struct StagingBuffers {
+        explicit StagingBuffers(GLenum storage_flags_, GLenum map_flags_);
+        ~StagingBuffers();

-    GLenum internal_format{};
-    GLenum format{};
-    GLenum type{};
-    bool is_compressed{};
-    GLenum target{};
-    u32 view_count{};
+        ImageBufferMap RequestMap(size_t requested_size, bool insert_fence);

-    OGLTexture texture;
-    OGLBuffer texture_buffer;
+        size_t RequestBuffer(size_t requested_size);
+
+        std::optional<size_t> FindBuffer(size_t requested_size);
+
+        std::vector<OGLSync> syncs;
+        std::vector<OGLBuffer> buffers;
+        std::vector<u8*> maps;
+        std::vector<size_t> sizes;
+        GLenum storage_flags;
+        GLenum map_flags;
+    };
+
+    const Device& device;
+    StateTracker& state_tracker;
+    UtilShaders util_shaders;
+
+    std::array<std::unordered_map<GLenum, FormatProperties>, 3> format_properties;
+    bool has_broken_texture_view_formats = false;
+
+    StagingBuffers upload_buffers{GL_MAP_WRITE_BIT, GL_MAP_WRITE_BIT | GL_MAP_FLUSH_EXPLICIT_BIT};
+    StagingBuffers download_buffers{GL_MAP_READ_BIT, GL_MAP_READ_BIT};
+
+    OGLTexture null_image_1d_array;
+    OGLTexture null_image_cube_array;
+    OGLTexture null_image_3d;
+    OGLTexture null_image_rect;
+    OGLTextureView null_image_view_1d;
+    OGLTextureView null_image_view_2d;
+    OGLTextureView null_image_view_2d_array;
+    OGLTextureView null_image_view_cube;
+
+    std::array<GLuint, VideoCommon::NUM_IMAGE_VIEW_TYPES> null_image_views;
};

-class CachedSurfaceView final : public VideoCommon::ViewBase {
+class Image : public VideoCommon::ImageBase {
+    friend ImageView;
+
public:
-    explicit CachedSurfaceView(CachedSurface& surface, const ViewParams& params, bool is_proxy);
-    ~CachedSurfaceView();
+    explicit Image(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, GPUVAddr gpu_addr,
+                   VAddr cpu_addr);

-    /// @brief Attaches this texture view to the currently bound fb_target framebuffer
-    /// @param attachment Attachment to bind textures to
-    /// @param fb_target Framebuffer target to attach to (e.g. DRAW_FRAMEBUFFER)
-    void Attach(GLenum attachment, GLenum fb_target) const;
+    void UploadMemory(const ImageBufferMap& map, size_t buffer_offset,
+                      std::span<const VideoCommon::BufferImageCopy> copies);

-    GLuint GetTexture(Tegra::Texture::SwizzleSource x_source,
-                      Tegra::Texture::SwizzleSource y_source,
-                      Tegra::Texture::SwizzleSource z_source,
-                      Tegra::Texture::SwizzleSource w_source);
+    void UploadMemory(const ImageBufferMap& map, size_t buffer_offset,
+                      std::span<const VideoCommon::BufferCopy> copies);

-    void DecorateViewName(GPUVAddr gpu_addr, const std::string& prefix);
+    void DownloadMemory(ImageBufferMap& map, size_t buffer_offset,
+                        std::span<const VideoCommon::BufferImageCopy> copies);

-    void MarkAsModified(u64 tick) {
-        surface.MarkAsModified(true, tick);
+    GLuint Handle() const noexcept {
+        return texture.handle;
    }

-    GLuint GetTexture() const {
-        if (is_proxy) {
-            return surface.GetTexture();
-        }
-        return main_view.handle;
+private:
+    void CopyBufferToImage(const VideoCommon::BufferImageCopy& copy, size_t buffer_offset);
+
+    void CopyImageToBuffer(const VideoCommon::BufferImageCopy& copy, size_t buffer_offset);
+
+    OGLTexture texture;
+    OGLTextureView store_view;
+    OGLBuffer buffer;
+    GLenum gl_internal_format = GL_NONE;
+    GLenum gl_format = GL_NONE;
+    GLenum gl_type = GL_NONE;
+};
+
+class ImageView : public VideoCommon::ImageViewBase {
+    friend Image;
+
+public:
+    explicit ImageView(TextureCacheRuntime&, const VideoCommon::ImageViewInfo&, ImageId, Image&);
+    explicit ImageView(TextureCacheRuntime&, const VideoCommon::NullImageParams&);
+
+    [[nodiscard]] GLuint Handle(ImageViewType query_type) const noexcept {
+        return views[static_cast<size_t>(query_type)];
    }

-    GLenum GetFormat() const {
-        return format;
+    [[nodiscard]] GLuint DefaultHandle() const noexcept {
+        return default_handle;
    }

-    const SurfaceParams& GetSurfaceParams() const {
-        return surface.GetSurfaceParams();
+    [[nodiscard]] GLenum Format() const noexcept {
+        return internal_format;
    }

private:
-    OGLTextureView CreateTextureView() const;
+    void SetupView(const Device& device, Image& image, ImageViewType view_type, GLuint handle,
+                   const VideoCommon::ImageViewInfo& info,
+                   VideoCommon::SubresourceRange view_range);
+
+    std::array<GLuint, VideoCommon::NUM_IMAGE_VIEW_TYPES> views{};
+    std::vector<OGLTextureView> stored_views;
+    GLuint default_handle = 0;
+    GLenum internal_format = GL_NONE;
+};
+
+class ImageAlloc : public VideoCommon::ImageAllocBase {};

-    CachedSurface& surface;
-    const GLenum format;
-    const GLenum target;
-    const bool is_proxy;
+class Sampler {
+public:
+    explicit Sampler(TextureCacheRuntime&, const Tegra::Texture::TSCEntry&);

-    std::unordered_map<u32, OGLTextureView> view_cache;
-    OGLTextureView main_view;
+    GLuint Handle() const noexcept {
+        return sampler.handle;
+    }

-    // Use an invalid default so it always fails the comparison test
-    u32 current_swizzle = 0xffffffff;
-    GLuint current_view = 0;
+private:
+    OGLSampler sampler;
};

-class TextureCacheOpenGL final : public TextureCacheBase {
+class Framebuffer {
public:
-    explicit TextureCacheOpenGL(VideoCore::RasterizerInterface& rasterizer,
-                                Tegra::Engines::Maxwell3D& maxwell3d,
-                                Tegra::MemoryManager& gpu_memory, const Device& device,
-                                StateTracker& state_tracker);
-    ~TextureCacheOpenGL();
-
-protected:
-    Surface CreateSurface(GPUVAddr gpu_addr, const SurfaceParams& params) override;
-
-    void ImageCopy(Surface& src_surface, Surface& dst_surface,
-                   const VideoCommon::CopyParams& copy_params) override;
+    explicit Framebuffer(TextureCacheRuntime&, std::span<ImageView*, NUM_RT> color_buffers,
+                         ImageView* depth_buffer, const VideoCommon::RenderTargets& key);

-    void ImageBlit(View& src_view, View& dst_view,
-                   const Tegra::Engines::Fermi2D::Config& copy_config) override;
+    [[nodiscard]] GLuint Handle() const noexcept {
+        return framebuffer.handle;
+    }

-    void BufferCopy(Surface& src_surface, Surface& dst_surface) override;
+    [[nodiscard]] GLbitfield BufferBits() const noexcept {
+        return buffer_bits;
+    }

private:
-    GLuint FetchPBO(std::size_t buffer_size);
-
-    StateTracker& state_tracker;
+    OGLFramebuffer framebuffer;
+    GLbitfield buffer_bits = GL_NONE;
+};

-    OGLFramebuffer src_framebuffer;
-    OGLFramebuffer dst_framebuffer;
-    std::unordered_map<u32, OGLBuffer> copy_pbo_cache;
+struct TextureCacheParams {
+    static constexpr bool ENABLE_VALIDATION = true;
+    static constexpr bool FRAMEBUFFER_BLITS = true;
+    static constexpr bool HAS_EMULATED_COPIES = true;
+
+    using Runtime = OpenGL::TextureCacheRuntime;
+    using Image = OpenGL::Image;
+    using ImageAlloc = OpenGL::ImageAlloc;
+    using ImageView = OpenGL::ImageView;
+    using Sampler = OpenGL::Sampler;
+    using Framebuffer = OpenGL::Framebuffer;
};

+using TextureCache = VideoCommon::TextureCache<TextureCacheParams>;
+
} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h
index a8be2aa37..cbccfdeb4 100644
--- a/src/video_core/renderer_opengl/maxwell_to_gl.h
+++ b/src/video_core/renderer_opengl/maxwell_to_gl.h
@@ -107,7 +107,7 @@ inline GLenum IndexFormat(Maxwell::IndexFormat index_format) {
    case Maxwell::IndexFormat::UnsignedInt:
        return GL_UNSIGNED_INT;
    }
-    UNREACHABLE_MSG("Invalid index_format={}", static_cast<u32>(index_format));
+    UNREACHABLE_MSG("Invalid index_format={}", index_format);
    return {};
}

@@ -144,7 +144,7 @@ inline GLenum PrimitiveTopology(Maxwell::PrimitiveTopology topology) {
    case Maxwell::PrimitiveTopology::Patches:
        return GL_PATCHES;
    }
-    UNREACHABLE_MSG("Invalid topology={}", static_cast<int>(topology));
+    UNREACHABLE_MSG("Invalid topology={}", topology);
    return GL_POINTS;
}

@@ -172,8 +172,8 @@ inline GLenum TextureFilterMode(Tegra::Texture::TextureFilter filter_mode,
        }
        break;
    }
-    UNREACHABLE_MSG("Invalid texture filter mode={} and mipmap filter mode={}",
-                    static_cast<u32>(filter_mode), static_cast<u32>(mipmap_filter_mode));
+    UNREACHABLE_MSG("Invalid texture filter mode={} and mipmap filter mode={}", filter_mode,
+                    mipmap_filter_mode);
    return GL_NEAREST;
}

@@ -204,7 +204,7 @@ inline GLenum WrapMode(Tegra::Texture::WrapMode wrap_mode) {
            return GL_MIRROR_CLAMP_TO_EDGE;
        }
    }
-    UNIMPLEMENTED_MSG("Unimplemented texture wrap mode={}", static_cast<u32>(wrap_mode));
+    UNIMPLEMENTED_MSG("Unimplemented texture wrap mode={}", wrap_mode);
    return GL_REPEAT;
}

@@ -227,7 +227,7 @@ inline GLenum DepthCompareFunc(Tegra::Texture::DepthCompareFunc func) {
    case Tegra::Texture::DepthCompareFunc::Always:
        return GL_ALWAYS;
    }
-    UNIMPLEMENTED_MSG("Unimplemented texture depth compare function={}", static_cast<u32>(func));
+    UNIMPLEMENTED_MSG("Unimplemented texture depth compare function={}", func);
    return GL_GREATER;
}

@@ -249,7 +249,7 @@ inline GLenum BlendEquation(Maxwell::Blend::Equation equation) {
    case Maxwell::Blend::Equation::MaxGL:
        return GL_MAX;
    }
-    UNIMPLEMENTED_MSG("Unimplemented blend equation={}", static_cast<u32>(equation));
+    UNIMPLEMENTED_MSG("Unimplemented blend equation={}", equation);
    return GL_FUNC_ADD;
}

@@ -313,7 +313,7 @@ inline GLenum BlendFunc(Maxwell::Blend::Factor factor) {
    case Maxwell::Blend::Factor::OneMinusConstantAlphaGL:
        return GL_ONE_MINUS_CONSTANT_ALPHA;
    }
-    UNIMPLEMENTED_MSG("Unimplemented blend factor={}", static_cast<u32>(factor));
+    UNIMPLEMENTED_MSG("Unimplemented blend factor={}", factor);
    return GL_ZERO;
}

@@ -333,7 +333,7 @@ inline GLenum SwizzleSource(Tegra::Texture::SwizzleSource source) {
    case Tegra::Texture::SwizzleSource::OneFloat:
        return GL_ONE;
    }
-    UNIMPLEMENTED_MSG("Unimplemented swizzle source={}", static_cast<u32>(source));
+    UNIMPLEMENTED_MSG("Unimplemented swizzle source={}", source);
    return GL_ZERO;
}

@@ -364,7 +364,7 @@ inline GLenum ComparisonOp(Maxwell::ComparisonOp comparison) {
    case Maxwell::ComparisonOp::AlwaysOld:
        return GL_ALWAYS;
    }
-    UNIMPLEMENTED_MSG("Unimplemented comparison op={}", static_cast<u32>(comparison));
+    UNIMPLEMENTED_MSG("Unimplemented comparison op={}", comparison);
    return GL_ALWAYS;
}

@@ -395,7 +395,7 @@ inline GLenum StencilOp(Maxwell::StencilOp stencil) {
    case Maxwell::StencilOp::DecrWrapOGL:
        return GL_DECR_WRAP;
    }
-    UNIMPLEMENTED_MSG("Unimplemented stencil op={}", static_cast<u32>(stencil));
+    UNIMPLEMENTED_MSG("Unimplemented stencil op={}", stencil);
    return GL_KEEP;
}

@@ -406,7 +406,7 @@ inline GLenum FrontFace(Maxwell::FrontFace front_face) {
    case Maxwell::FrontFace::CounterClockWise:
        return GL_CCW;
    }
-    UNIMPLEMENTED_MSG("Unimplemented front face cull={}", static_cast<u32>(front_face));
+    UNIMPLEMENTED_MSG("Unimplemented front face cull={}", front_face);
    return GL_CCW;
}

@@ -419,7 +419,7 @@ inline GLenum CullFace(Maxwell::CullFace cull_face) {
    case Maxwell::CullFace::FrontAndBack:
        return GL_FRONT_AND_BACK;
    }
-    UNIMPLEMENTED_MSG("Unimplemented cull face={}", static_cast<u32>(cull_face));
+    UNIMPLEMENTED_MSG("Unimplemented cull face={}", cull_face);
    return GL_BACK;
}

@@ -458,7 +458,7 @@ inline GLenum LogicOp(Maxwell::LogicOperation operation) {
    case Maxwell::LogicOperation::Set:
        return GL_SET;
    }
-    UNIMPLEMENTED_MSG("Unimplemented logic operation={}", static_cast<u32>(operation));
+    UNIMPLEMENTED_MSG("Unimplemented logic operation={}", operation);
    return GL_COPY;
}

@@ -471,10 +471,23 @@ inline GLenum PolygonMode(Maxwell::PolygonMode polygon_mode) {
    case Maxwell::PolygonMode::Fill:
        return GL_FILL;
    }
-    UNREACHABLE_MSG("Invalid polygon mode={}", static_cast<int>(polygon_mode));
+    UNREACHABLE_MSG("Invalid polygon mode={}", polygon_mode);
    return GL_FILL;
}

+inline GLenum ReductionFilter(Tegra::Texture::SamplerReduction filter) {
+    switch (filter) {
+    case Tegra::Texture::SamplerReduction::WeightedAverage:
+        return GL_WEIGHTED_AVERAGE_ARB;
+    case Tegra::Texture::SamplerReduction::Min:
+        return GL_MIN;
+    case Tegra::Texture::SamplerReduction::Max:
+        return GL_MAX;
+    }
+    UNREACHABLE_MSG("Invalid reduction filter={}", static_cast<int>(filter));
+    return GL_WEIGHTED_AVERAGE_ARB;
+}
+
inline GLenum ViewportSwizzle(Maxwell::ViewportSwizzle swizzle) {
    // Enumeration order matches register order. We can convert it arithmetically.
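    // For example (an illustration, assuming PositiveX, NegativeX, PositiveY, ...
    // are declared in register order starting at 0, and that the
    // GL_VIEWPORT_SWIZZLE_*_NV tokens are contiguous as in NV_viewport_swizzle):
    //
    //     ViewportSwizzle(Maxwell::ViewportSwizzle::PositiveY)
    //         == GL_VIEWPORT_SWIZZLE_POSITIVE_X_NV + 2
    //         == GL_VIEWPORT_SWIZZLE_POSITIVE_Y_NV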
    return GL_VIEWPORT_SWIZZLE_POSITIVE_X_NV + static_cast<GLenum>(swizzle);
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index 2ccca1993..dd77a543c 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -23,10 +23,10 @@
#include "core/telemetry_session.h"
#include "video_core/host_shaders/opengl_present_frag.h"
#include "video_core/host_shaders/opengl_present_vert.h"
-#include "video_core/morton.h"
#include "video_core/renderer_opengl/gl_rasterizer.h"
#include "video_core/renderer_opengl/gl_shader_manager.h"
#include "video_core/renderer_opengl/renderer_opengl.h"
+#include "video_core/textures/decoders.h"

namespace OpenGL {

@@ -130,8 +130,8 @@ void APIENTRY DebugHandler(GLenum source, GLenum type, GLuint id, GLenum severit
RendererOpenGL::RendererOpenGL(Core::TelemetrySession& telemetry_session_,
                               Core::Frontend::EmuWindow& emu_window_,
                               Core::Memory::Memory& cpu_memory_, Tegra::GPU& gpu_,
-                               std::unique_ptr<Core::Frontend::GraphicsContext> context)
-    : RendererBase{emu_window_, std::move(context)}, telemetry_session{telemetry_session_},
+                               std::unique_ptr<Core::Frontend::GraphicsContext> context_)
+    : RendererBase{emu_window_, std::move(context_)}, telemetry_session{telemetry_session_},
      emu_window{emu_window_}, cpu_memory{cpu_memory_}, gpu{gpu_}, program_manager{device} {}

RendererOpenGL::~RendererOpenGL() = default;

@@ -140,19 +140,18 @@ void RendererOpenGL::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
    if (!framebuffer) {
        return;
    }
-
    PrepareRendertarget(framebuffer);
    RenderScreenshot();

-    glBindFramebuffer(GL_DRAW_FRAMEBUFFER, 0);
+    state_tracker.BindFramebuffer(0);
    DrawScreen(emu_window.GetFramebufferLayout());

    ++m_current_frame;

    rasterizer->TickFrame();

-    render_window.PollEvents();
    context->SwapBuffers();
+    render_window.OnFrameDisplayed();
}

void RendererOpenGL::PrepareRendertarget(const Tegra::FramebufferConfig* framebuffer) {
@@ -187,19 +186,20 @@ void RendererOpenGL::LoadFBToScreenInfo(const Tegra::FramebufferConfig& framebuf
    // Reset the screen info's display texture to its own permanent texture
    screen_info.display_texture = screen_info.texture.resource.handle;

-    const auto pixel_format{
-        VideoCore::Surface::PixelFormatFromGPUPixelFormat(framebuffer.pixel_format)};
-    const u32 bytes_per_pixel{VideoCore::Surface::GetBytesPerPixel(pixel_format)};
-    const u64 size_in_bytes{framebuffer.stride * framebuffer.height * bytes_per_pixel};
-    u8* const host_ptr{cpu_memory.GetPointer(framebuffer_addr)};
-    rasterizer->FlushRegion(ToCacheAddr(host_ptr), size_in_bytes);
-
    // TODO(Rodrigo): Read this from HLE
    constexpr u32 block_height_log2 = 4;
-    VideoCore::MortonSwizzle(VideoCore::MortonSwizzleMode::MortonToLinear, pixel_format,
-                             framebuffer.stride, block_height_log2, framebuffer.height, 0, 1, 1,
-                             gl_framebuffer_data.data(), host_ptr);
-
+    const auto pixel_format{
+        VideoCore::Surface::PixelFormatFromGPUPixelFormat(framebuffer.pixel_format)};
+    const u32 bytes_per_pixel{VideoCore::Surface::BytesPerBlock(pixel_format)};
+    const u64 size_in_bytes{Tegra::Texture::CalculateSize(
+        true, bytes_per_pixel, framebuffer.stride, framebuffer.height, 1, block_height_log2, 0)};
+    const u8* const host_ptr{cpu_memory.GetPointer(framebuffer_addr)};
+    const std::span<const u8> input_data(host_ptr, size_in_bytes);
+    Tegra::Texture::UnswizzleTexture(gl_framebuffer_data, input_data, bytes_per_pixel,
+                                     framebuffer.width, framebuffer.height, 1, block_height_log2,
+                                     0);
+
+    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
    glPixelStorei(GL_UNPACK_ROW_LENGTH, static_cast<GLint>(framebuffer.stride));

    // Update existing texture
@@ -238,6 +238,10 @@ void RendererOpenGL::InitOpenGLObjects() {
    glUseProgramStages(pipeline.handle, GL_VERTEX_SHADER_BIT, vertex_program.handle);
    glUseProgramStages(pipeline.handle, GL_FRAGMENT_SHADER_BIT, fragment_program.handle);

+    // Generate presentation sampler
+    present_sampler.Create();
+    glSamplerParameteri(present_sampler.handle, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+
    // Generate VBO handle for drawing
    vertex_buffer.Create();

@@ -255,6 +259,11 @@ void RendererOpenGL::InitOpenGLObjects() {
    // Clear screen to black
    LoadColorToActiveGLTexture(0, 0, 0, 0, screen_info.texture);

+    // Enable seamless cubemaps when per-texture parameters are not available
+    if (!GLAD_GL_ARB_seamless_cubemap_per_texture && !GLAD_GL_AMD_seamless_cubemap_per_texture) {
+        glEnable(GL_TEXTURE_CUBE_MAP_SEAMLESS);
+    }
+
    // Enable unified vertex attributes and query vertex buffer address when the driver supports it
    if (device.HasVertexBufferUnifiedMemory()) {
        glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV);
@@ -275,9 +284,9 @@ void RendererOpenGL::AddTelemetryFields() {
    LOG_INFO(Render_OpenGL, "GL_RENDERER: {}", gpu_model);

    constexpr auto user_system = Common::Telemetry::FieldType::UserSystem;
-    telemetry_session.AddField(user_system, "GPU_Vendor", gpu_vendor);
-    telemetry_session.AddField(user_system, "GPU_Model", gpu_model);
-    telemetry_session.AddField(user_system, "GPU_OpenGL_Version", gl_version);
+    telemetry_session.AddField(user_system, "GPU_Vendor", std::string(gpu_vendor));
+    telemetry_session.AddField(user_system, "GPU_Model", std::string(gpu_model));
+    telemetry_session.AddField(user_system, "GPU_OpenGL_Version", std::string(gl_version));
}

void RendererOpenGL::CreateRasterizer() {
@@ -296,7 +305,7 @@ void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture,
    const auto pixel_format{
        VideoCore::Surface::PixelFormatFromGPUPixelFormat(framebuffer.pixel_format)};
-    const u32 bytes_per_pixel{VideoCore::Surface::GetBytesPerPixel(pixel_format)};
+    const u32 bytes_per_pixel{VideoCore::Surface::BytesPerBlock(pixel_format)};
    gl_framebuffer_data.resize(texture.width * texture.height * bytes_per_pixel);

    GLint internal_format;
@@ -315,8 +324,8 @@ void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture,
        internal_format = GL_RGBA8;
        texture.gl_format = GL_RGBA;
        texture.gl_type = GL_UNSIGNED_INT_8_8_8_8_REV;
-        UNIMPLEMENTED_MSG("Unknown framebuffer pixel format: {}",
-                          static_cast<u32>(framebuffer.pixel_format));
+        // UNIMPLEMENTED_MSG("Unknown framebuffer pixel format: {}",
+        //                   static_cast<u32>(framebuffer.pixel_format));
    }

    texture.resource.Release();
@@ -348,7 +357,7 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
        } else {
            // Other transformations are unsupported
            LOG_CRITICAL(Render_OpenGL, "Unsupported framebuffer_transform_flags={}",
-                         static_cast<u32>(framebuffer_transform_flags));
+                         framebuffer_transform_flags);
            UNIMPLEMENTED();
        }
    }
@@ -382,7 +391,7 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
    state_tracker.NotifyPolygonModes();
    state_tracker.NotifyViewport0();
    state_tracker.NotifyScissor0();
-    state_tracker.NotifyColorMask0();
+    state_tracker.NotifyColorMask(0);
    state_tracker.NotifyBlend0();
    state_tracker.NotifyFramebuffer();
    state_tracker.NotifyFrontFace();
@@ -440,7 +449,7 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
    }
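    // In the hunk below, binding sampler object 0 means "no sampler object", so
    // presentation filtering previously followed whatever parameters the displayed
    // texture carried. The patch binds the dedicated present_sampler created in
    // InitOpenGLObjects() instead:
    //
    //     present_sampler.Create();
    //     glSamplerParameteri(present_sampler.handle, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
    //
    // which keeps presentation filtering independent of guest-controlled texture state.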
    glBindTextureUnit(0, screen_info.display_texture);
-    glBindSampler(0, 0);
+    glBindSampler(0, present_sampler.handle);

    glClear(GL_COLOR_BUFFER_BIT);
    glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);

@@ -473,6 +482,8 @@ void RendererOpenGL::RenderScreenshot() {

        DrawScreen(layout);

+        glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
+        glPixelStorei(GL_PACK_ROW_LENGTH, 0);
        glReadPixels(0, 0, layout.width, layout.height, GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV,
                     renderer_settings.screenshot_bits);

diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h
index 9ef181f95..44e109794 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.h
+++ b/src/video_core/renderer_opengl/renderer_opengl.h
@@ -57,10 +57,10 @@ struct ScreenInfo {

class RendererOpenGL final : public VideoCore::RendererBase {
public:
-    explicit RendererOpenGL(Core::TelemetrySession& telemetry_session,
-                            Core::Frontend::EmuWindow& emu_window, Core::Memory::Memory& cpu_memory,
-                            Tegra::GPU& gpu,
-                            std::unique_ptr<Core::Frontend::GraphicsContext> context);
+    explicit RendererOpenGL(Core::TelemetrySession& telemetry_session_,
+                            Core::Frontend::EmuWindow& emu_window_,
+                            Core::Memory::Memory& cpu_memory_, Tegra::GPU& gpu_,
+                            std::unique_ptr<Core::Frontend::GraphicsContext> context_);
    ~RendererOpenGL() override;

    bool Init() override;
@@ -102,6 +102,7 @@ private:
    StateTracker state_tracker{gpu};

    // OpenGL object IDs
+    OGLSampler present_sampler;
    OGLBuffer vertex_buffer;
    OGLProgram vertex_program;
    OGLProgram fragment_program;
diff --git a/src/video_core/renderer_opengl/util_shaders.cpp b/src/video_core/renderer_opengl/util_shaders.cpp
new file mode 100644
index 000000000..eb849cbf2
--- /dev/null
+++ b/src/video_core/renderer_opengl/util_shaders.cpp
@@ -0,0 +1,224 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
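// The compute programs in this new file back TextureCacheRuntime's accelerated
// upload path: the staging buffer is bound as a shader storage buffer and a
// compute dispatch writes deswizzled texels straight into the destination image,
// rather than unswizzling on the CPU. Every dispatch below is sized with
// Common::DivCeil; a self-contained sketch of that helper (the real one lives in
// common/div_ceil.h, included below, and may differ in signature):
//
//     template <typename N, typename D>
//     constexpr N DivCeil(N number, D divisor) {
//         return static_cast<N>((number + divisor - 1) / divisor);
//     }
//
//     // e.g. 100x37 tiles with a 32x32 workgroup -> a 4x2 dispatch grid
//     static_assert(DivCeil(100u, 32u) == 4 && DivCeil(37u, 32u) == 2);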
+
+#include <bit>
+#include <span>
+#include <string_view>
+
+#include <glad/glad.h>
+
+#include "common/assert.h"
+#include "common/common_types.h"
+#include "common/div_ceil.h"
+#include "video_core/host_shaders/block_linear_unswizzle_2d_comp.h"
+#include "video_core/host_shaders/block_linear_unswizzle_3d_comp.h"
+#include "video_core/host_shaders/opengl_copy_bc4_comp.h"
+#include "video_core/host_shaders/pitch_unswizzle_comp.h"
+#include "video_core/renderer_opengl/gl_resource_manager.h"
+#include "video_core/renderer_opengl/gl_shader_manager.h"
+#include "video_core/renderer_opengl/gl_texture_cache.h"
+#include "video_core/renderer_opengl/util_shaders.h"
+#include "video_core/surface.h"
+#include "video_core/texture_cache/accelerated_swizzle.h"
+#include "video_core/texture_cache/types.h"
+#include "video_core/texture_cache/util.h"
+#include "video_core/textures/decoders.h"
+
+namespace OpenGL {
+
+using namespace HostShaders;
+
+using VideoCommon::Extent3D;
+using VideoCommon::ImageCopy;
+using VideoCommon::ImageType;
+using VideoCommon::SwizzleParameters;
+using VideoCommon::Accelerated::MakeBlockLinearSwizzle2DParams;
+using VideoCommon::Accelerated::MakeBlockLinearSwizzle3DParams;
+using VideoCore::Surface::BytesPerBlock;
+
+namespace {
+
+OGLProgram MakeProgram(std::string_view source) {
+    OGLShader shader;
+    shader.Create(source, GL_COMPUTE_SHADER);
+
+    OGLProgram program;
+    program.Create(true, false, shader.handle);
+    return program;
+}
+
+} // Anonymous namespace
+
+UtilShaders::UtilShaders(ProgramManager& program_manager_)
+    : program_manager{program_manager_},
+      block_linear_unswizzle_2d_program(MakeProgram(BLOCK_LINEAR_UNSWIZZLE_2D_COMP)),
+      block_linear_unswizzle_3d_program(MakeProgram(BLOCK_LINEAR_UNSWIZZLE_3D_COMP)),
+      pitch_unswizzle_program(MakeProgram(PITCH_UNSWIZZLE_COMP)),
+      copy_bc4_program(MakeProgram(OPENGL_COPY_BC4_COMP)) {
+    const auto swizzle_table = Tegra::Texture::MakeSwizzleTable();
+    swizzle_table_buffer.Create();
+    glNamedBufferStorage(swizzle_table_buffer.handle, sizeof(swizzle_table), &swizzle_table, 0);
+}
+
+UtilShaders::~UtilShaders() = default;
+
+void UtilShaders::BlockLinearUpload2D(Image& image, const ImageBufferMap& map, size_t buffer_offset,
+                                      std::span<const SwizzleParameters> swizzles) {
+    static constexpr Extent3D WORKGROUP_SIZE{32, 32, 1};
+    static constexpr GLuint BINDING_SWIZZLE_BUFFER = 0;
+    static constexpr GLuint BINDING_INPUT_BUFFER = 1;
+    static constexpr GLuint BINDING_OUTPUT_IMAGE = 0;
+
+    program_manager.BindHostCompute(block_linear_unswizzle_2d_program.handle);
+    glFlushMappedNamedBufferRange(map.Handle(), buffer_offset, image.guest_size_bytes);
+    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle);
+
+    const GLenum store_format = StoreFormat(BytesPerBlock(image.info.format));
+    for (const SwizzleParameters& swizzle : swizzles) {
+        const Extent3D num_tiles = swizzle.num_tiles;
+        const size_t input_offset = swizzle.buffer_offset + buffer_offset;
+
+        const u32 num_dispatches_x = Common::DivCeil(num_tiles.width, WORKGROUP_SIZE.width);
+        const u32 num_dispatches_y = Common::DivCeil(num_tiles.height, WORKGROUP_SIZE.height);
+
+        const auto params = MakeBlockLinearSwizzle2DParams(swizzle, image.info);
+        glUniform3uiv(0, 1, params.origin.data());
+        glUniform3iv(1, 1, params.destination.data());
+        glUniform1ui(2, params.bytes_per_block_log2);
+        glUniform1ui(3, params.layer_stride);
+        glUniform1ui(4, params.block_size);
+        glUniform1ui(5, params.x_shift);
+        glUniform1ui(6, params.block_height);
+        glUniform1ui(7, params.block_height_mask);
+        glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.Handle(),
+                          input_offset, image.guest_size_bytes - swizzle.buffer_offset);
+        glBindImageTexture(BINDING_OUTPUT_IMAGE, image.Handle(), swizzle.level, GL_TRUE, 0,
+                           GL_WRITE_ONLY, store_format);
+        glDispatchCompute(num_dispatches_x, num_dispatches_y, image.info.resources.layers);
+    }
+    program_manager.RestoreGuestCompute();
+}
+
+void UtilShaders::BlockLinearUpload3D(Image& image, const ImageBufferMap& map, size_t buffer_offset,
+                                      std::span<const SwizzleParameters> swizzles) {
+    static constexpr Extent3D WORKGROUP_SIZE{16, 8, 8};
+
+    static constexpr GLuint BINDING_SWIZZLE_BUFFER = 0;
+    static constexpr GLuint BINDING_INPUT_BUFFER = 1;
+    static constexpr GLuint BINDING_OUTPUT_IMAGE = 0;
+
+    glFlushMappedNamedBufferRange(map.Handle(), buffer_offset, image.guest_size_bytes);
+    program_manager.BindHostCompute(block_linear_unswizzle_3d_program.handle);
+    glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle);
+
+    const GLenum store_format = StoreFormat(BytesPerBlock(image.info.format));
+    for (const SwizzleParameters& swizzle : swizzles) {
+        const Extent3D num_tiles = swizzle.num_tiles;
+        const size_t input_offset = swizzle.buffer_offset + buffer_offset;
+
+        const u32 num_dispatches_x = Common::DivCeil(num_tiles.width, WORKGROUP_SIZE.width);
+        const u32 num_dispatches_y = Common::DivCeil(num_tiles.height, WORKGROUP_SIZE.height);
+        const u32 num_dispatches_z = Common::DivCeil(num_tiles.depth, WORKGROUP_SIZE.depth);
+
+        const auto params = MakeBlockLinearSwizzle3DParams(swizzle, image.info);
+        glUniform3uiv(0, 1, params.origin.data());
+        glUniform3iv(1, 1, params.destination.data());
+        glUniform1ui(2, params.bytes_per_block_log2);
+        glUniform1ui(3, params.slice_size);
+        glUniform1ui(4, params.block_size);
+        glUniform1ui(5, params.x_shift);
+        glUniform1ui(6, params.block_height);
+        glUniform1ui(7, params.block_height_mask);
+        glUniform1ui(8, params.block_depth);
+        glUniform1ui(9, params.block_depth_mask);
+        glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.Handle(),
+                          input_offset, image.guest_size_bytes - swizzle.buffer_offset);
+        glBindImageTexture(BINDING_OUTPUT_IMAGE, image.Handle(), swizzle.level, GL_TRUE, 0,
+                           GL_WRITE_ONLY, store_format);
+        glDispatchCompute(num_dispatches_x, num_dispatches_y, num_dispatches_z);
+    }
+    program_manager.RestoreGuestCompute();
+}
+
+void UtilShaders::PitchUpload(Image& image, const ImageBufferMap& map, size_t buffer_offset,
+                              std::span<const SwizzleParameters> swizzles) {
+    static constexpr Extent3D WORKGROUP_SIZE{32, 32, 1};
+    static constexpr GLuint BINDING_INPUT_BUFFER = 0;
+    static constexpr GLuint BINDING_OUTPUT_IMAGE = 0;
+    static constexpr GLuint LOC_ORIGIN = 0;
+    static constexpr GLuint LOC_DESTINATION = 1;
+    static constexpr GLuint LOC_BYTES_PER_BLOCK = 2;
+    static constexpr GLuint LOC_PITCH = 3;
+
+    const u32 bytes_per_block = BytesPerBlock(image.info.format);
+    const GLenum format = StoreFormat(bytes_per_block);
+    const u32 pitch = image.info.pitch;
+
+    UNIMPLEMENTED_IF_MSG(!std::has_single_bit(bytes_per_block),
+                         "Non-power of two images are not implemented");
+
+    program_manager.BindHostCompute(pitch_unswizzle_program.handle);
+    glFlushMappedNamedBufferRange(map.Handle(), buffer_offset, image.guest_size_bytes);
+    glUniform2ui(LOC_ORIGIN, 0, 0);
+    glUniform2i(LOC_DESTINATION, 0, 0);
+    glUniform1ui(LOC_BYTES_PER_BLOCK, bytes_per_block);
+    glUniform1ui(LOC_PITCH, pitch);
+    glBindImageTexture(BINDING_OUTPUT_IMAGE, image.Handle(), 0, GL_FALSE, 0, GL_WRITE_ONLY, format);
+    for (const SwizzleParameters& swizzle : swizzles) {
+        const Extent3D num_tiles = swizzle.num_tiles;
+        const size_t input_offset = swizzle.buffer_offset + buffer_offset;
+
+        const u32 num_dispatches_x = Common::DivCeil(num_tiles.width, WORKGROUP_SIZE.width);
+        const u32 num_dispatches_y = Common::DivCeil(num_tiles.height, WORKGROUP_SIZE.height);
+
+        glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.Handle(),
+                          input_offset, image.guest_size_bytes - swizzle.buffer_offset);
+        glDispatchCompute(num_dispatches_x, num_dispatches_y, 1);
+    }
+    program_manager.RestoreGuestCompute();
+}
+
+void UtilShaders::CopyBC4(Image& dst_image, Image& src_image, std::span<const ImageCopy> copies) {
+    static constexpr GLuint BINDING_INPUT_IMAGE = 0;
+    static constexpr GLuint BINDING_OUTPUT_IMAGE = 1;
+    static constexpr GLuint LOC_SRC_OFFSET = 0;
+    static constexpr GLuint LOC_DST_OFFSET = 1;
+
+    program_manager.BindHostCompute(copy_bc4_program.handle);
+
+    for (const ImageCopy& copy : copies) {
+        ASSERT(copy.src_subresource.base_layer == 0);
+        ASSERT(copy.src_subresource.num_layers == 1);
+        ASSERT(copy.dst_subresource.base_layer == 0);
+        ASSERT(copy.dst_subresource.num_layers == 1);
+
+        glUniform3ui(LOC_SRC_OFFSET, copy.src_offset.x, copy.src_offset.y, copy.src_offset.z);
+        glUniform3ui(LOC_DST_OFFSET, copy.dst_offset.x, copy.dst_offset.y, copy.dst_offset.z);
+        glBindImageTexture(BINDING_INPUT_IMAGE, src_image.Handle(), copy.src_subresource.base_level,
+                           GL_FALSE, 0, GL_READ_ONLY, GL_RG32UI);
+        glBindImageTexture(BINDING_OUTPUT_IMAGE, dst_image.Handle(),
+                           copy.dst_subresource.base_level, GL_FALSE, 0, GL_WRITE_ONLY, GL_RGBA8UI);
+        glDispatchCompute(copy.extent.width, copy.extent.height, copy.extent.depth);
+    }
+    program_manager.RestoreGuestCompute();
+}
+
+GLenum StoreFormat(u32 bytes_per_block) {
+    switch (bytes_per_block) {
+    case 1:
+        return GL_R8UI;
+    case 2:
+        return GL_R16UI;
+    case 4:
+        return GL_R32UI;
+    case 8:
+        return GL_RG32UI;
+    case 16:
+        return GL_RGBA32UI;
+    }
+    UNREACHABLE();
+    return GL_R8UI;
+}
+
+} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/util_shaders.h b/src/video_core/renderer_opengl/util_shaders.h
new file mode 100644
index 000000000..359997255
--- /dev/null
+++ b/src/video_core/renderer_opengl/util_shaders.h
@@ -0,0 +1,51 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
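// A hypothetical sketch of how the texture cache routes uploads through this
// class (the real dispatch sits behind TextureCacheRuntime::AccelerateImageUpload
// in gl_texture_cache.h/.cpp; the names below mirror that interface and are not
// a verbatim quote):
//
//     void TextureCacheRuntime::AccelerateImageUpload(
//         Image& image, const ImageBufferMap& map, size_t buffer_offset,
//         std::span<const VideoCommon::SwizzleParameters> swizzles) {
//         switch (image.info.type) {
//         case VideoCommon::ImageType::e2D:
//             return util_shaders.BlockLinearUpload2D(image, map, buffer_offset, swizzles);
//         case VideoCommon::ImageType::e3D:
//             return util_shaders.BlockLinearUpload3D(image, map, buffer_offset, swizzles);
//         case VideoCommon::ImageType::Linear:
//             return util_shaders.PitchUpload(image, map, buffer_offset, swizzles);
//         default:
//             UNREACHABLE();
//         }
//     }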
+
+#pragma once
+
+#include <span>
+
+#include <glad/glad.h>
+
+#include "common/common_types.h"
+#include "video_core/renderer_opengl/gl_resource_manager.h"
+#include "video_core/texture_cache/types.h"
+
+namespace OpenGL {
+
+class Image;
+class ImageBufferMap;
+class ProgramManager;
+
+class UtilShaders {
+public:
+    explicit UtilShaders(ProgramManager& program_manager);
+    ~UtilShaders();
+
+    void BlockLinearUpload2D(Image& image, const ImageBufferMap& map, size_t buffer_offset,
+                             std::span<const VideoCommon::SwizzleParameters> swizzles);
+
+    void BlockLinearUpload3D(Image& image, const ImageBufferMap& map, size_t buffer_offset,
+                             std::span<const VideoCommon::SwizzleParameters> swizzles);
+
+    void PitchUpload(Image& image, const ImageBufferMap& map, size_t buffer_offset,
+                     std::span<const VideoCommon::SwizzleParameters> swizzles);
+
+    void CopyBC4(Image& dst_image, Image& src_image,
+                 std::span<const VideoCommon::ImageCopy> copies);
+
+private:
+    ProgramManager& program_manager;
+
+    OGLBuffer swizzle_table_buffer;
+
+    OGLProgram block_linear_unswizzle_2d_program;
+    OGLProgram block_linear_unswizzle_3d_program;
+    OGLProgram pitch_unswizzle_program;
+    OGLProgram copy_bc4_program;
+};
+
+GLenum StoreFormat(u32 bytes_per_block);
+
+} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/utils.cpp b/src/video_core/renderer_opengl/utils.cpp
deleted file mode 100644
index 6d7bb16b2..000000000
--- a/src/video_core/renderer_opengl/utils.cpp
+++ /dev/null
@@ -1,42 +0,0 @@
-// Copyright 2014 Citra Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#include <string>
-#include <vector>
-
-#include <fmt/format.h>
-#include <glad/glad.h>
-
-#include "common/common_types.h"
-#include "video_core/renderer_opengl/gl_state_tracker.h"
-#include "video_core/renderer_opengl/utils.h"
-
-namespace OpenGL {
-
-void LabelGLObject(GLenum identifier, GLuint handle, VAddr addr, std::string_view extra_info) {
-    if (!GLAD_GL_KHR_debug) {
-        // We don't need to throw an error as this is just for debugging
-        return;
-    }
-
-    std::string object_label;
-    if (extra_info.empty()) {
-        switch (identifier) {
-        case GL_TEXTURE:
-            object_label = fmt::format("Texture@0x{:016X}", addr);
-            break;
-        case GL_PROGRAM:
-            object_label = fmt::format("Shader@0x{:016X}", addr);
-            break;
-        default:
-            object_label = fmt::format("Object(0x{:X})@0x{:016X}", identifier, addr);
-            break;
-        }
-    } else {
-        object_label = fmt::format("{}@0x{:016X}", extra_info, addr);
-    }
-    glObjectLabel(identifier, handle, -1, static_cast<const GLchar*>(object_label.c_str()));
-}
-
-} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/utils.h b/src/video_core/renderer_opengl/utils.h
deleted file mode 100644
index 9c09ee12c..000000000
--- a/src/video_core/renderer_opengl/utils.h
+++ /dev/null
@@ -1,16 +0,0 @@
-// Copyright 2014 Citra Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
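// The LabelGLObject helper removed here is superseded by direct glObjectLabel
// calls guarded by Device::HasDebuggingToolAttached(); the replacement pattern,
// quoted from gl_texture_cache.cpp above:
//
//     if (device.HasDebuggingToolAttached()) {
//         const std::string name = VideoCommon::Name(*this, view_type);
//         glObjectLabel(GL_TEXTURE, handle, static_cast<GLsizei>(name.size()), name.data());
//     }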
-
-#pragma once
-
-#include <string_view>
-#include <vector>
-#include <glad/glad.h>
-#include "common/common_types.h"
-
-namespace OpenGL {
-
-void LabelGLObject(GLenum identifier, GLuint handle, VAddr addr, std::string_view extra_info = {});
-
-} // namespace OpenGL
diff --git a/src/video_core/renderer_vulkan/blit_image.cpp b/src/video_core/renderer_vulkan/blit_image.cpp
new file mode 100644
index 000000000..1f6a169ae
--- /dev/null
+++ b/src/video_core/renderer_vulkan/blit_image.cpp
@@ -0,0 +1,624 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+
+#include "video_core/host_shaders/convert_depth_to_float_frag_spv.h"
+#include "video_core/host_shaders/convert_float_to_depth_frag_spv.h"
+#include "video_core/host_shaders/full_screen_triangle_vert_spv.h"
+#include "video_core/host_shaders/vulkan_blit_color_float_frag_spv.h"
+#include "video_core/host_shaders/vulkan_blit_depth_stencil_frag_spv.h"
+#include "video_core/renderer_vulkan/blit_image.h"
+#include "video_core/renderer_vulkan/maxwell_to_vk.h"
+#include "video_core/renderer_vulkan/vk_scheduler.h"
+#include "video_core/renderer_vulkan/vk_shader_util.h"
+#include "video_core/renderer_vulkan/vk_state_tracker.h"
+#include "video_core/renderer_vulkan/vk_texture_cache.h"
+#include "video_core/renderer_vulkan/vk_update_descriptor.h"
+#include "video_core/surface.h"
+#include "video_core/vulkan_common/vulkan_device.h"
+#include "video_core/vulkan_common/vulkan_wrapper.h"
+
+namespace Vulkan {
+
+using VideoCommon::ImageViewType;
+
+namespace {
+struct PushConstants {
+    std::array<float, 2> tex_scale;
+    std::array<float, 2> tex_offset;
+};
+
+template <u32 binding>
+inline constexpr VkDescriptorSetLayoutBinding TEXTURE_DESCRIPTOR_SET_LAYOUT_BINDING{
+    .binding = binding,
+    .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
+    .descriptorCount = 1,
+    .stageFlags = VK_SHADER_STAGE_FRAGMENT_BIT,
+    .pImmutableSamplers = nullptr,
+};
+constexpr std::array TWO_TEXTURES_DESCRIPTOR_SET_LAYOUT_BINDINGS{
+    TEXTURE_DESCRIPTOR_SET_LAYOUT_BINDING<0>,
+    TEXTURE_DESCRIPTOR_SET_LAYOUT_BINDING<1>,
+};
+constexpr VkDescriptorSetLayoutCreateInfo ONE_TEXTURE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO{
+    .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
+    .pNext = nullptr,
+    .flags = 0,
+    .bindingCount = 1,
+    .pBindings = &TEXTURE_DESCRIPTOR_SET_LAYOUT_BINDING<0>,
+};
+constexpr VkDescriptorSetLayoutCreateInfo TWO_TEXTURES_DESCRIPTOR_SET_LAYOUT_CREATE_INFO{
+    .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
+    .pNext = nullptr,
+    .flags = 0,
+    .bindingCount = static_cast<u32>(TWO_TEXTURES_DESCRIPTOR_SET_LAYOUT_BINDINGS.size()),
+    .pBindings = TWO_TEXTURES_DESCRIPTOR_SET_LAYOUT_BINDINGS.data(),
+};
+constexpr VkPushConstantRange PUSH_CONSTANT_RANGE{
+    .stageFlags = VK_SHADER_STAGE_VERTEX_BIT,
+    .offset = 0,
+    .size = sizeof(PushConstants),
+};
+constexpr VkPipelineVertexInputStateCreateInfo PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO{
+    .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
+    .pNext = nullptr,
+    .flags = 0,
+    .vertexBindingDescriptionCount = 0,
+    .pVertexBindingDescriptions = nullptr,
+    .vertexAttributeDescriptionCount = 0,
+    .pVertexAttributeDescriptions = nullptr,
+};
+constexpr VkPipelineInputAssemblyStateCreateInfo PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO{
+    .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,
+    .pNext = nullptr,
+    .flags = 0,
+    .topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST,
+    .primitiveRestartEnable = VK_FALSE,
+};
+constexpr VkPipelineViewportStateCreateInfo PIPELINE_VIEWPORT_STATE_CREATE_INFO{
+    .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO,
+    .pNext = nullptr,
+    .flags = 0,
+    .viewportCount = 1,
+    .pViewports = nullptr,
+    .scissorCount = 1,
+    .pScissors = nullptr,
+};
+constexpr VkPipelineRasterizationStateCreateInfo PIPELINE_RASTERIZATION_STATE_CREATE_INFO{
+    .sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO,
+    .pNext = nullptr,
+    .flags = 0,
+    .depthClampEnable = VK_FALSE,
+    .rasterizerDiscardEnable = VK_FALSE,
+    .polygonMode = VK_POLYGON_MODE_FILL,
+    .cullMode = VK_CULL_MODE_BACK_BIT,
+    .frontFace = VK_FRONT_FACE_CLOCKWISE,
+    .depthBiasEnable = VK_FALSE,
+    .depthBiasConstantFactor = 0.0f,
+    .depthBiasClamp = 0.0f,
+    .depthBiasSlopeFactor = 0.0f,
+    .lineWidth = 1.0f,
+};
+constexpr VkPipelineMultisampleStateCreateInfo PIPELINE_MULTISAMPLE_STATE_CREATE_INFO{
+    .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,
+    .pNext = nullptr,
+    .flags = 0,
+    .rasterizationSamples = VK_SAMPLE_COUNT_1_BIT,
+    .sampleShadingEnable = VK_FALSE,
+    .minSampleShading = 0.0f,
+    .pSampleMask = nullptr,
+    .alphaToCoverageEnable = VK_FALSE,
+    .alphaToOneEnable = VK_FALSE,
+};
+constexpr std::array DYNAMIC_STATES{
+    VK_DYNAMIC_STATE_VIEWPORT,
+    VK_DYNAMIC_STATE_SCISSOR,
+};
+constexpr VkPipelineDynamicStateCreateInfo PIPELINE_DYNAMIC_STATE_CREATE_INFO{
+    .sType = VK_STRUCTURE_TYPE_PIPELINE_DYNAMIC_STATE_CREATE_INFO,
+    .pNext = nullptr,
+    .flags = 0,
+    .dynamicStateCount = static_cast<u32>(DYNAMIC_STATES.size()),
+    .pDynamicStates = DYNAMIC_STATES.data(),
+};
+constexpr VkPipelineColorBlendStateCreateInfo PIPELINE_COLOR_BLEND_STATE_EMPTY_CREATE_INFO{
+    .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO,
+    .pNext = nullptr,
+    .flags = 0,
+    .logicOpEnable = VK_FALSE,
+    .logicOp = VK_LOGIC_OP_CLEAR,
+    .attachmentCount = 0,
+    .pAttachments = nullptr,
+    .blendConstants = {0.0f, 0.0f, 0.0f, 0.0f},
+};
+constexpr VkPipelineColorBlendAttachmentState PIPELINE_COLOR_BLEND_ATTACHMENT_STATE{
+    .blendEnable = VK_FALSE,
+    .srcColorBlendFactor = VK_BLEND_FACTOR_ZERO,
+    .dstColorBlendFactor = VK_BLEND_FACTOR_ZERO,
+    .colorBlendOp = VK_BLEND_OP_ADD,
+    .srcAlphaBlendFactor = VK_BLEND_FACTOR_ZERO,
+    .dstAlphaBlendFactor = VK_BLEND_FACTOR_ZERO,
+    .alphaBlendOp = VK_BLEND_OP_ADD,
+    .colorWriteMask = VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT |
+                      VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT,
+};
+constexpr VkPipelineColorBlendStateCreateInfo PIPELINE_COLOR_BLEND_STATE_GENERIC_CREATE_INFO{
+    .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO,
+    .pNext = nullptr,
+    .flags = 0,
+    .logicOpEnable = VK_FALSE,
+    .logicOp = VK_LOGIC_OP_CLEAR,
+    .attachmentCount = 1,
+    .pAttachments = &PIPELINE_COLOR_BLEND_ATTACHMENT_STATE,
+    .blendConstants = {0.0f, 0.0f, 0.0f, 0.0f},
+};
+constexpr VkPipelineDepthStencilStateCreateInfo PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO{
+    .sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,
+    .pNext = nullptr,
+    .flags = 0,
+    .depthTestEnable = VK_TRUE,
+    .depthWriteEnable = VK_TRUE,
+    .depthCompareOp = VK_COMPARE_OP_ALWAYS,
+    .depthBoundsTestEnable = VK_FALSE,
+    .stencilTestEnable = VK_FALSE,
+    .front = VkStencilOpState{},
+    .back = VkStencilOpState{},
+    .minDepthBounds = 0.0f,
+    .maxDepthBounds = 0.0f,
+};
+
+template <VkFilter filter>
+inline constexpr VkSamplerCreateInfo SAMPLER_CREATE_INFO{
+    .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO,
+    .pNext = nullptr,
+    .flags = 0,
+    .magFilter = filter,
+    .minFilter = filter,
+    .mipmapMode = VK_SAMPLER_MIPMAP_MODE_NEAREST,
+    .addressModeU = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER,
+    .addressModeV = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER,
+    .addressModeW = VK_SAMPLER_ADDRESS_MODE_CLAMP_TO_BORDER,
+    .mipLodBias = 0.0f,
+    .anisotropyEnable = VK_FALSE,
+    .maxAnisotropy = 0.0f,
+    .compareEnable = VK_FALSE,
+    .compareOp = VK_COMPARE_OP_NEVER,
+    .minLod = 0.0f,
+    .maxLod = 0.0f,
+    .borderColor = VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE,
+    .unnormalizedCoordinates = VK_TRUE,
+};
+
+constexpr VkPipelineLayoutCreateInfo PipelineLayoutCreateInfo(
+    const VkDescriptorSetLayout* set_layout) {
+    return VkPipelineLayoutCreateInfo{
+        .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
+        .pNext = nullptr,
+        .flags = 0,
+        .setLayoutCount = 1,
+        .pSetLayouts = set_layout,
+        .pushConstantRangeCount = 1,
+        .pPushConstantRanges = &PUSH_CONSTANT_RANGE,
+    };
+}
+
+constexpr VkPipelineShaderStageCreateInfo PipelineShaderStageCreateInfo(VkShaderStageFlagBits stage,
+                                                                        VkShaderModule shader) {
+    return VkPipelineShaderStageCreateInfo{
+        .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
+        .pNext = nullptr,
+        .flags = 0,
+        .stage = stage,
+        .module = shader,
+        .pName = "main",
+        .pSpecializationInfo = nullptr,
+    };
+}
+
+constexpr std::array<VkPipelineShaderStageCreateInfo, 2> MakeStages(
+    VkShaderModule vertex_shader, VkShaderModule fragment_shader) {
+    return std::array{
+        PipelineShaderStageCreateInfo(VK_SHADER_STAGE_VERTEX_BIT, vertex_shader),
+        PipelineShaderStageCreateInfo(VK_SHADER_STAGE_FRAGMENT_BIT, fragment_shader),
+    };
+}
+
+void UpdateOneTextureDescriptorSet(const Device& device, VkDescriptorSet descriptor_set,
+                                   VkSampler sampler, VkImageView image_view) {
+    const VkDescriptorImageInfo image_info{
+        .sampler = sampler,
+        .imageView = image_view,
+        .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
+    };
+    const VkWriteDescriptorSet write_descriptor_set{
+        .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
+        .pNext = nullptr,
+        .dstSet = descriptor_set,
+        .dstBinding = 0,
+        .dstArrayElement = 0,
+        .descriptorCount = 1,
+        .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
+        .pImageInfo = &image_info,
+        .pBufferInfo = nullptr,
+        .pTexelBufferView = nullptr,
+    };
+    device.GetLogical().UpdateDescriptorSets(write_descriptor_set, nullptr);
+}
+
+void UpdateTwoTexturesDescriptorSet(const Device& device, VkDescriptorSet descriptor_set,
+                                    VkSampler sampler, VkImageView image_view_0,
+                                    VkImageView image_view_1) {
+    const VkDescriptorImageInfo image_info_0{
+        .sampler = sampler,
+        .imageView = image_view_0,
+        .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
+    };
+    const VkDescriptorImageInfo image_info_1{
+        .sampler = sampler,
+        .imageView = image_view_1,
+        .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
+    };
+    const std::array write_descriptor_sets{
+        VkWriteDescriptorSet{
+            .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
+            .pNext = nullptr,
+            .dstSet = descriptor_set,
+            .dstBinding = 0,
+            .dstArrayElement = 0,
+            .descriptorCount = 1,
+            .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
+            .pImageInfo = &image_info_0,
+            .pBufferInfo = nullptr,
+            .pTexelBufferView = nullptr,
+        },
+        VkWriteDescriptorSet{
+            .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
+            .pNext = nullptr,
+            .dstSet = descriptor_set,
+            .dstBinding = 1,
+            .dstArrayElement = 0,
+            .descriptorCount = 1,
+            .descriptorType = VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER,
+            .pImageInfo = &image_info_1,
+            .pBufferInfo = nullptr,
+            .pTexelBufferView = nullptr,
+        },
+    };
+    device.GetLogical().UpdateDescriptorSets(write_descriptor_sets, nullptr);
+}
+
+void BindBlitState(vk::CommandBuffer cmdbuf, VkPipelineLayout layout,
+                   const std::array<Offset2D, 2>& dst_region,
+                   const std::array<Offset2D, 2>& src_region) {
+    const VkOffset2D offset{
+        .x = std::min(dst_region[0].x, dst_region[1].x),
+        .y = std::min(dst_region[0].y, dst_region[1].y),
+    };
+    const VkExtent2D extent{
+        .width = static_cast<u32>(std::abs(dst_region[1].x - dst_region[0].x)),
+        .height = static_cast<u32>(std::abs(dst_region[1].y - dst_region[0].y)),
+    };
+    const VkViewport viewport{
+        .x = static_cast<float>(offset.x),
+        .y = static_cast<float>(offset.y),
+        .width = static_cast<float>(extent.width),
+        .height = static_cast<float>(extent.height),
+        .minDepth = 0.0f,
+        .maxDepth = 1.0f,
+    };
+    // TODO: Support scissored blits
+    const VkRect2D scissor{
+        .offset = offset,
+        .extent = extent,
+    };
+    const float scale_x = static_cast<float>(src_region[1].x - src_region[0].x);
+    const float scale_y = static_cast<float>(src_region[1].y - src_region[0].y);
+    const PushConstants push_constants{
+        .tex_scale = {scale_x, scale_y},
+        .tex_offset = {static_cast<float>(src_region[0].x), static_cast<float>(src_region[0].y)},
+    };
+    cmdbuf.SetViewport(0, viewport);
+    cmdbuf.SetScissor(0, scissor);
+    cmdbuf.PushConstants(layout, VK_SHADER_STAGE_VERTEX_BIT, push_constants);
+}
+
+} // Anonymous namespace
+
+BlitImageHelper::BlitImageHelper(const Device& device_, VKScheduler& scheduler_,
+                                 StateTracker& state_tracker_, VKDescriptorPool& descriptor_pool)
+    : device{device_}, scheduler{scheduler_}, state_tracker{state_tracker_},
+      one_texture_set_layout(device.GetLogical().CreateDescriptorSetLayout(
+          ONE_TEXTURE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO)),
+      two_textures_set_layout(device.GetLogical().CreateDescriptorSetLayout(
+          TWO_TEXTURES_DESCRIPTOR_SET_LAYOUT_CREATE_INFO)),
+      one_texture_descriptor_allocator(descriptor_pool, *one_texture_set_layout),
+      two_textures_descriptor_allocator(descriptor_pool, *two_textures_set_layout),
+      one_texture_pipeline_layout(device.GetLogical().CreatePipelineLayout(
+          PipelineLayoutCreateInfo(one_texture_set_layout.address()))),
+      two_textures_pipeline_layout(device.GetLogical().CreatePipelineLayout(
+          PipelineLayoutCreateInfo(two_textures_set_layout.address()))),
+      full_screen_vert(BuildShader(device, FULL_SCREEN_TRIANGLE_VERT_SPV)),
+      blit_color_to_color_frag(BuildShader(device, VULKAN_BLIT_COLOR_FLOAT_FRAG_SPV)),
+      convert_depth_to_float_frag(BuildShader(device, CONVERT_DEPTH_TO_FLOAT_FRAG_SPV)),
+      convert_float_to_depth_frag(BuildShader(device, CONVERT_FLOAT_TO_DEPTH_FRAG_SPV)),
+      linear_sampler(device.GetLogical().CreateSampler(SAMPLER_CREATE_INFO<VK_FILTER_LINEAR>)),
+      nearest_sampler(device.GetLogical().CreateSampler(SAMPLER_CREATE_INFO<VK_FILTER_NEAREST>)) {
+    if (device.IsExtShaderStencilExportSupported()) {
+        blit_depth_stencil_frag = BuildShader(device, VULKAN_BLIT_DEPTH_STENCIL_FRAG_SPV);
+    }
+}
+
+BlitImageHelper::~BlitImageHelper() = default;
+
+void BlitImageHelper::BlitColor(const Framebuffer* dst_framebuffer, const ImageView& src_image_view,
+                                const std::array<Offset2D, 2>& dst_region,
+                                const std::array<Offset2D, 2>& src_region,
+                                Tegra::Engines::Fermi2D::Filter filter,
+                                Tegra::Engines::Fermi2D::Operation operation) {
+    const bool is_linear = filter == Tegra::Engines::Fermi2D::Filter::Bilinear;
+    const BlitImagePipelineKey key{
+        .renderpass = dst_framebuffer->RenderPass(),
+        .operation = operation,
+    };
+    const VkPipelineLayout layout = *one_texture_pipeline_layout;
+    const VkImageView src_view = src_image_view.Handle(ImageViewType::e2D);
+    const VkSampler sampler = is_linear ? *linear_sampler : *nearest_sampler;
+    const VkPipeline pipeline = FindOrEmplacePipeline(key);
+    const VkDescriptorSet descriptor_set = one_texture_descriptor_allocator.Commit();
+    scheduler.RequestRenderpass(dst_framebuffer);
+    scheduler.Record([dst_region, src_region, pipeline, layout, sampler, src_view, descriptor_set,
+                      &device = device](vk::CommandBuffer cmdbuf) {
+        // TODO: Barriers
+        UpdateOneTextureDescriptorSet(device, descriptor_set, sampler, src_view);
+        cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline);
+        cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_GRAPHICS, layout, 0, descriptor_set,
+                                  nullptr);
+        BindBlitState(cmdbuf, layout, dst_region, src_region);
+        cmdbuf.Draw(3, 1, 0, 0);
+    });
+    scheduler.InvalidateState();
+}
+
+void BlitImageHelper::BlitDepthStencil(const Framebuffer* dst_framebuffer,
+                                       VkImageView src_depth_view, VkImageView src_stencil_view,
+                                       const std::array<Offset2D, 2>& dst_region,
+                                       const std::array<Offset2D, 2>& src_region,
+                                       Tegra::Engines::Fermi2D::Filter filter,
+                                       Tegra::Engines::Fermi2D::Operation operation) {
+    ASSERT(filter == Tegra::Engines::Fermi2D::Filter::Point);
+    ASSERT(operation == Tegra::Engines::Fermi2D::Operation::SrcCopy);
+
+    const VkPipelineLayout layout = *two_textures_pipeline_layout;
+    const VkSampler sampler = *nearest_sampler;
+    const VkPipeline pipeline = BlitDepthStencilPipeline(dst_framebuffer->RenderPass());
+    const VkDescriptorSet descriptor_set = two_textures_descriptor_allocator.Commit();
+    scheduler.RequestRenderpass(dst_framebuffer);
+    scheduler.Record([dst_region, src_region, pipeline, layout, sampler, src_depth_view,
+                      src_stencil_view, descriptor_set,
+                      &device = device](vk::CommandBuffer cmdbuf) {
+        // TODO: Barriers
+        UpdateTwoTexturesDescriptorSet(device, descriptor_set, sampler, src_depth_view,
+                                       src_stencil_view);
+        cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline);
+        cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_GRAPHICS, layout, 0, descriptor_set,
+                                  nullptr);
+        BindBlitState(cmdbuf, layout, dst_region, src_region);
+        cmdbuf.Draw(3, 1, 0, 0);
+    });
+    scheduler.InvalidateState();
+}
+
+void BlitImageHelper::ConvertD32ToR32(const Framebuffer* dst_framebuffer,
+                                      const ImageView& src_image_view) {
+    ConvertDepthToColorPipeline(convert_d32_to_r32_pipeline, dst_framebuffer->RenderPass());
+    Convert(*convert_d32_to_r32_pipeline, dst_framebuffer, src_image_view);
+}
+
+void BlitImageHelper::ConvertR32ToD32(const Framebuffer* dst_framebuffer,
+                                      const ImageView& src_image_view) {
+
+    ConvertColorToDepthPipeline(convert_r32_to_d32_pipeline, dst_framebuffer->RenderPass());
+    Convert(*convert_r32_to_d32_pipeline, dst_framebuffer, src_image_view);
+}
+
+void BlitImageHelper::ConvertD16ToR16(const Framebuffer* dst_framebuffer,
+                                      const ImageView& src_image_view) {
+    ConvertDepthToColorPipeline(convert_d16_to_r16_pipeline, dst_framebuffer->RenderPass());
+    Convert(*convert_d16_to_r16_pipeline, dst_framebuffer, src_image_view);
+}
+
+void BlitImageHelper::ConvertR16ToD16(const Framebuffer* dst_framebuffer,
+                                      const ImageView& src_image_view) {
+    ConvertColorToDepthPipeline(convert_r16_to_d16_pipeline, dst_framebuffer->RenderPass());
+    Convert(*convert_r16_to_d16_pipeline, dst_framebuffer, src_image_view);
+}
+
+void BlitImageHelper::Convert(VkPipeline pipeline, const Framebuffer* dst_framebuffer,
+                              const ImageView& src_image_view) {
+    const VkPipelineLayout layout = *one_texture_pipeline_layout;
+    const VkImageView src_view = src_image_view.Handle(ImageViewType::e2D);
+    const VkSampler sampler = *nearest_sampler;
+    const VkDescriptorSet descriptor_set = one_texture_descriptor_allocator.Commit();
+    const VkExtent2D extent{
+        .width = src_image_view.size.width,
+        .height = src_image_view.size.height,
+    };
+    scheduler.RequestRenderpass(dst_framebuffer);
+    scheduler.Record([pipeline, layout, sampler, src_view, descriptor_set, extent,
+                      &device = device](vk::CommandBuffer cmdbuf) {
+        const VkOffset2D offset{
+            .x = 0,
+            .y = 0,
+        };
+        const VkViewport viewport{
+            .x = 0.0f,
+            .y = 0.0f,
+            .width = static_cast<float>(extent.width),
+            .height = static_cast<float>(extent.height),
+            .minDepth = 0.0f,
+            .maxDepth = 0.0f,
+        };
+        const VkRect2D scissor{
+            .offset = offset,
+            .extent = extent,
+        };
+        const PushConstants push_constants{
+            .tex_scale = {viewport.width, viewport.height},
+            .tex_offset = {0.0f, 0.0f},
+        };
+        UpdateOneTextureDescriptorSet(device, descriptor_set, sampler, src_view);
+
+        // TODO: Barriers
+        cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline);
+        cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_GRAPHICS, layout, 0, descriptor_set,
+                                  nullptr);
+        cmdbuf.SetViewport(0, viewport);
+        cmdbuf.SetScissor(0, scissor);
+        cmdbuf.PushConstants(layout, VK_SHADER_STAGE_VERTEX_BIT, push_constants);
+        cmdbuf.Draw(3, 1, 0, 0);
+    });
+    scheduler.InvalidateState();
+}
+
+VkPipeline BlitImageHelper::FindOrEmplacePipeline(const BlitImagePipelineKey& key) {
+    const auto it = std::ranges::find(blit_color_keys, key);
+    if (it != blit_color_keys.end()) {
+        return *blit_color_pipelines[std::distance(blit_color_keys.begin(), it)];
+    }
+    blit_color_keys.push_back(key);
+
+    const std::array stages = MakeStages(*full_screen_vert, *blit_color_to_color_frag);
+    const VkPipelineColorBlendAttachmentState blend_attachment{
+        .blendEnable = VK_FALSE,
+        .srcColorBlendFactor = VK_BLEND_FACTOR_ZERO,
+        .dstColorBlendFactor = VK_BLEND_FACTOR_ZERO,
+        .colorBlendOp = VK_BLEND_OP_ADD,
+        .srcAlphaBlendFactor = VK_BLEND_FACTOR_ZERO,
+        .dstAlphaBlendFactor = VK_BLEND_FACTOR_ZERO,
+        .alphaBlendOp = VK_BLEND_OP_ADD,
+        .colorWriteMask = VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT |
+                          VK_COLOR_COMPONENT_B_BIT | VK_COLOR_COMPONENT_A_BIT,
+    };
+    // TODO: programmable blending
+    const VkPipelineColorBlendStateCreateInfo color_blend_create_info{
+        .sType = VK_STRUCTURE_TYPE_PIPELINE_COLOR_BLEND_STATE_CREATE_INFO,
+        .pNext = nullptr,
+        .flags = 0,
+        .logicOpEnable = VK_FALSE,
+        .logicOp = VK_LOGIC_OP_CLEAR,
+        .attachmentCount = 1,
+        .pAttachments = &blend_attachment,
+        .blendConstants = {0.0f, 0.0f, 0.0f, 0.0f},
+    };
+    blit_color_pipelines.push_back(device.GetLogical().CreateGraphicsPipeline({
+        .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
+        .pNext = nullptr,
+        .flags = 0,
+        .stageCount = static_cast<u32>(stages.size()),
+        .pStages = stages.data(),
+        .pVertexInputState = &PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
+        .pInputAssemblyState = &PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,
+        .pTessellationState = nullptr,
+        .pViewportState = &PIPELINE_VIEWPORT_STATE_CREATE_INFO,
+        .pRasterizationState = &PIPELINE_RASTERIZATION_STATE_CREATE_INFO,
+        .pMultisampleState = &PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,
+        .pDepthStencilState = nullptr,
+        .pColorBlendState = &color_blend_create_info,
+        .pDynamicState = &PIPELINE_DYNAMIC_STATE_CREATE_INFO,
+        .layout = *one_texture_pipeline_layout,
+        .renderPass = key.renderpass,
+        .subpass = 0,
+        .basePipelineHandle = VK_NULL_HANDLE,
+        .basePipelineIndex = 0,
+    }));
+    return *blit_color_pipelines.back();
+}
+
+VkPipeline BlitImageHelper::BlitDepthStencilPipeline(VkRenderPass renderpass) {
+    if (blit_depth_stencil_pipeline) {
+        return *blit_depth_stencil_pipeline;
+    }
+    const std::array stages = MakeStages(*full_screen_vert, *blit_depth_stencil_frag);
+    blit_depth_stencil_pipeline = device.GetLogical().CreateGraphicsPipeline({
+        .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
+        .pNext = nullptr,
+        .flags = 0,
+        .stageCount = static_cast<u32>(stages.size()),
+        .pStages = stages.data(),
+        .pVertexInputState = &PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
+        .pInputAssemblyState = &PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,
+        .pTessellationState = nullptr,
+        .pViewportState = &PIPELINE_VIEWPORT_STATE_CREATE_INFO,
+        .pRasterizationState = &PIPELINE_RASTERIZATION_STATE_CREATE_INFO,
+        .pMultisampleState = &PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,
+        .pDepthStencilState = &PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,
+        .pColorBlendState = &PIPELINE_COLOR_BLEND_STATE_EMPTY_CREATE_INFO,
+        .pDynamicState = &PIPELINE_DYNAMIC_STATE_CREATE_INFO,
+        .layout = *two_textures_pipeline_layout,
+        .renderPass = renderpass,
+        .subpass = 0,
+        .basePipelineHandle = VK_NULL_HANDLE,
+        .basePipelineIndex = 0,
+    });
+    return *blit_depth_stencil_pipeline;
+}
+
+void BlitImageHelper::ConvertDepthToColorPipeline(vk::Pipeline& pipeline, VkRenderPass renderpass) {
+    if (pipeline) {
+        return;
+    }
+    const std::array stages = MakeStages(*full_screen_vert, *convert_depth_to_float_frag);
+    pipeline = device.GetLogical().CreateGraphicsPipeline({
+        .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
+        .pNext = nullptr,
+        .flags = 0,
+        .stageCount = static_cast<u32>(stages.size()),
+        .pStages = stages.data(),
+        .pVertexInputState = &PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
+        .pInputAssemblyState = &PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,
+        .pTessellationState = nullptr,
+        .pViewportState = &PIPELINE_VIEWPORT_STATE_CREATE_INFO,
+        .pRasterizationState = &PIPELINE_RASTERIZATION_STATE_CREATE_INFO,
+        .pMultisampleState = &PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,
+        .pDepthStencilState = nullptr,
+        .pColorBlendState = &PIPELINE_COLOR_BLEND_STATE_GENERIC_CREATE_INFO,
+        .pDynamicState = &PIPELINE_DYNAMIC_STATE_CREATE_INFO,
+        .layout = *one_texture_pipeline_layout,
+        .renderPass = renderpass,
+        .subpass = 0,
+        .basePipelineHandle = VK_NULL_HANDLE,
+        .basePipelineIndex = 0,
+    });
+}
+
+void BlitImageHelper::ConvertColorToDepthPipeline(vk::Pipeline& pipeline, VkRenderPass renderpass) {
+    if (pipeline) {
+        return;
+    }
+    const std::array stages = MakeStages(*full_screen_vert, *convert_float_to_depth_frag);
+    pipeline = device.GetLogical().CreateGraphicsPipeline({
+        .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO,
+        .pNext = nullptr,
+        .flags = 0,
+        .stageCount = static_cast<u32>(stages.size()),
+        .pStages = stages.data(),
+        .pVertexInputState = &PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
+        .pInputAssemblyState = &PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO,
+        .pTessellationState = nullptr,
+        .pViewportState = &PIPELINE_VIEWPORT_STATE_CREATE_INFO,
+        .pRasterizationState = &PIPELINE_RASTERIZATION_STATE_CREATE_INFO,
+        .pMultisampleState = &PIPELINE_MULTISAMPLE_STATE_CREATE_INFO,
+        .pDepthStencilState = &PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO,
+        .pColorBlendState = &PIPELINE_COLOR_BLEND_STATE_EMPTY_CREATE_INFO,
+        .pDynamicState = &PIPELINE_DYNAMIC_STATE_CREATE_INFO,
+        .layout = *one_texture_pipeline_layout,
.renderPass = renderpass, + .subpass = 0, + .basePipelineHandle = VK_NULL_HANDLE, + .basePipelineIndex = 0, + }); +} + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/blit_image.h b/src/video_core/renderer_vulkan/blit_image.h new file mode 100644 index 000000000..43fd3d737 --- /dev/null +++ b/src/video_core/renderer_vulkan/blit_image.h @@ -0,0 +1,96 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <compare> + +#include "video_core/engines/fermi_2d.h" +#include "video_core/renderer_vulkan/vk_descriptor_pool.h" +#include "video_core/texture_cache/types.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" + +namespace Vulkan { + +using VideoCommon::Offset2D; + +class Device; +class Framebuffer; +class ImageView; +class StateTracker; +class VKScheduler; + +struct BlitImagePipelineKey { + constexpr auto operator<=>(const BlitImagePipelineKey&) const noexcept = default; + + VkRenderPass renderpass; + Tegra::Engines::Fermi2D::Operation operation; +}; + +class BlitImageHelper { +public: + explicit BlitImageHelper(const Device& device, VKScheduler& scheduler, + StateTracker& state_tracker, VKDescriptorPool& descriptor_pool); + ~BlitImageHelper(); + + void BlitColor(const Framebuffer* dst_framebuffer, const ImageView& src_image_view, + const std::array<Offset2D, 2>& dst_region, + const std::array<Offset2D, 2>& src_region, + Tegra::Engines::Fermi2D::Filter filter, + Tegra::Engines::Fermi2D::Operation operation); + + void BlitDepthStencil(const Framebuffer* dst_framebuffer, VkImageView src_depth_view, + VkImageView src_stencil_view, const std::array<Offset2D, 2>& dst_region, + const std::array<Offset2D, 2>& src_region, + Tegra::Engines::Fermi2D::Filter filter, + Tegra::Engines::Fermi2D::Operation operation); + + void ConvertD32ToR32(const Framebuffer* dst_framebuffer, const ImageView& src_image_view); + + void ConvertR32ToD32(const Framebuffer* dst_framebuffer, const ImageView& src_image_view); + + void ConvertD16ToR16(const Framebuffer* dst_framebuffer, const ImageView& src_image_view); + + void ConvertR16ToD16(const Framebuffer* dst_framebuffer, const ImageView& src_image_view); + +private: + void Convert(VkPipeline pipeline, const Framebuffer* dst_framebuffer, + const ImageView& src_image_view); + + [[nodiscard]] VkPipeline FindOrEmplacePipeline(const BlitImagePipelineKey& key); + + [[nodiscard]] VkPipeline BlitDepthStencilPipeline(VkRenderPass renderpass); + + void ConvertDepthToColorPipeline(vk::Pipeline& pipeline, VkRenderPass renderpass); + + void ConvertColorToDepthPipeline(vk::Pipeline& pipeline, VkRenderPass renderpass); + + const Device& device; + VKScheduler& scheduler; + StateTracker& state_tracker; + + vk::DescriptorSetLayout one_texture_set_layout; + vk::DescriptorSetLayout two_textures_set_layout; + DescriptorAllocator one_texture_descriptor_allocator; + DescriptorAllocator two_textures_descriptor_allocator; + vk::PipelineLayout one_texture_pipeline_layout; + vk::PipelineLayout two_textures_pipeline_layout; + vk::ShaderModule full_screen_vert; + vk::ShaderModule blit_color_to_color_frag; + vk::ShaderModule blit_depth_stencil_frag; + vk::ShaderModule convert_depth_to_float_frag; + vk::ShaderModule convert_float_to_depth_frag; + vk::Sampler linear_sampler; + vk::Sampler nearest_sampler; + + std::vector<BlitImagePipelineKey> blit_color_keys; + std::vector<vk::Pipeline> blit_color_pipelines; + vk::Pipeline blit_depth_stencil_pipeline; + vk::Pipeline 
convert_d32_to_r32_pipeline; + vk::Pipeline convert_r32_to_d32_pipeline; + vk::Pipeline convert_d16_to_r16_pipeline; + vk::Pipeline convert_r16_to_d16_pipeline; +}; + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp b/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp index 81a39a3b8..5be6dabd9 100644 --- a/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp +++ b/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp @@ -8,6 +8,7 @@ #include <boost/functional/hash.hpp> +#include "common/bit_cast.h" #include "common/cityhash.h" #include "common/common_types.h" #include "video_core/renderer_vulkan/fixed_pipeline_state.h" @@ -45,7 +46,7 @@ void FixedPipelineState::Fill(const Maxwell& regs, bool has_extended_dynamic_sta regs.polygon_offset_fill_enable}; const u32 topology_index = static_cast<u32>(regs.draw.topology.Value()); - raw = 0; + raw1 = 0; primitive_restart_enable.Assign(regs.primitive_restart.enabled != 0 ? 1 : 0); depth_bias_enable.Assign(enabled_lut[POLYGON_OFFSET_ENABLE_LUT[topology_index]] != 0 ? 1 : 0); depth_clamp_disabled.Assign(regs.view_volume_clip_control.depth_clamp_disabled.Value()); @@ -58,15 +59,24 @@ void FixedPipelineState::Fill(const Maxwell& regs, bool has_extended_dynamic_sta logic_op_enable.Assign(regs.logic_op.enable != 0 ? 1 : 0); logic_op.Assign(PackLogicOp(regs.logic_op.operation)); rasterize_enable.Assign(regs.rasterize_enable != 0 ? 1 : 0); + topology.Assign(regs.draw.topology); + msaa_mode.Assign(regs.multisample_mode); + + raw2 = 0; + const auto test_func = + regs.alpha_test_enabled == 1 ? regs.alpha_test_func : Maxwell::ComparisonOp::Always; + alpha_test_func.Assign(PackComparisonOp(test_func)); + early_z.Assign(regs.force_early_fragment_tests != 0 ? 1 : 0); - std::memcpy(&point_size, ®s.point_size, sizeof(point_size)); // TODO: C++20 std::bit_cast + alpha_test_ref = Common::BitCast<u32>(regs.alpha_test_ref); + point_size = Common::BitCast<u32>(regs.point_size); for (std::size_t index = 0; index < Maxwell::NumVertexArrays; ++index) { binding_divisors[index] = regs.instanced_arrays.IsInstancingEnabled(index) ? regs.vertex_array[index].divisor : 0; } - for (std::size_t index = 0; index < Maxwell::NumVertexAttributes; ++index) { + for (size_t index = 0; index < Maxwell::NumVertexAttributes; ++index) { const auto& input = regs.vertex_attrib_format[index]; auto& attribute = attributes[index]; attribute.raw = 0; @@ -75,6 +85,7 @@ void FixedPipelineState::Fill(const Maxwell& regs, bool has_extended_dynamic_sta attribute.offset.Assign(input.offset); attribute.type.Assign(static_cast<u32>(input.type.Value())); attribute.size.Assign(static_cast<u32>(input.size.Value())); + attribute.binding_index_enabled.Assign(regs.vertex_array[index].IsEnabled() ? 
1 : 0); } for (std::size_t index = 0; index < std::size(attachments); ++index) { @@ -131,7 +142,6 @@ void FixedPipelineState::BlendingAttachment::Fill(const Maxwell& regs, std::size } void FixedPipelineState::DynamicState::Fill(const Maxwell& regs) { - const u32 topology_index = static_cast<u32>(regs.draw.topology.Value()); u32 packed_front_face = PackFrontFace(regs.front_face); if (regs.screen_y_control.triangle_rast_flip != 0) { // Flip front face @@ -161,17 +171,11 @@ void FixedPipelineState::DynamicState::Fill(const Maxwell& regs) { depth_test_enable.Assign(regs.depth_test_enable); front_face.Assign(packed_front_face); depth_test_func.Assign(PackComparisonOp(regs.depth_test_func)); - topology.Assign(topology_index); cull_face.Assign(PackCullFace(regs.cull_face)); cull_enable.Assign(regs.cull_test_enabled != 0 ? 1 : 0); - - for (std::size_t index = 0; index < Maxwell::NumVertexArrays; ++index) { - const auto& input = regs.vertex_array[index]; - VertexBinding& binding = vertex_bindings[index]; - binding.raw = 0; - binding.enabled.Assign(input.IsEnabled() ? 1 : 0); - binding.stride.Assign(static_cast<u16>(input.stride.Value())); - } + std::ranges::transform(regs.vertex_array, vertex_strides.begin(), [](const auto& array) { + return static_cast<u16>(array.stride.Value()); + }); } std::size_t FixedPipelineState::Hash() const noexcept { diff --git a/src/video_core/renderer_vulkan/fixed_pipeline_state.h b/src/video_core/renderer_vulkan/fixed_pipeline_state.h index cdcbb65f5..465a55fdb 100644 --- a/src/video_core/renderer_vulkan/fixed_pipeline_state.h +++ b/src/video_core/renderer_vulkan/fixed_pipeline_state.h @@ -96,6 +96,8 @@ struct FixedPipelineState { BitField<6, 14, u32> offset; BitField<20, 3, u32> type; BitField<23, 6, u32> size; + // Not really an element of a vertex attribute, but it can be packed here + BitField<29, 1, u32> binding_index_enabled; constexpr Maxwell::VertexAttribute::Type Type() const noexcept { return static_cast<Maxwell::VertexAttribute::Type>(type.Value()); @@ -130,12 +132,6 @@ struct FixedPipelineState { } }; - union VertexBinding { - u16 raw; - BitField<0, 12, u16> stride; - BitField<12, 1, u16> enabled; - }; - struct DynamicState { union { u32 raw1; @@ -150,11 +146,11 @@ struct FixedPipelineState { }; union { u32 raw2; - BitField<0, 4, u32> topology; - BitField<4, 2, u32> cull_face; - BitField<6, 1, u32> cull_enable; + BitField<0, 2, u32> cull_face; + BitField<2, 1, u32> cull_enable; }; - std::array<VertexBinding, Maxwell::NumVertexArrays> vertex_bindings; + // Vertex stride is a 12 bits value, we have 4 bits to spare per element + std::array<u16, Maxwell::NumVertexArrays> vertex_strides; void Fill(const Maxwell& regs); @@ -169,14 +165,10 @@ struct FixedPipelineState { Maxwell::FrontFace FrontFace() const noexcept { return UnpackFrontFace(front_face.Value()); } - - constexpr Maxwell::PrimitiveTopology Topology() const noexcept { - return static_cast<Maxwell::PrimitiveTopology>(topology.Value()); - } }; union { - u32 raw; + u32 raw1; BitField<0, 1, u32> no_extended_dynamic_state; BitField<2, 1, u32> primitive_restart_enable; BitField<3, 1, u32> depth_bias_enable; @@ -190,7 +182,16 @@ struct FixedPipelineState { BitField<18, 1, u32> logic_op_enable; BitField<19, 4, u32> logic_op; BitField<23, 1, u32> rasterize_enable; + BitField<24, 4, Maxwell::PrimitiveTopology> topology; + BitField<28, 4, Tegra::Texture::MsaaMode> msaa_mode; + }; + union { + u32 raw2; + BitField<0, 3, u32> alpha_test_func; + BitField<3, 1, u32> early_z; }; + + u32 alpha_test_ref; u32 point_size; 
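
Note on the packing above: FixedPipelineState is used as a pipeline cache key and hashed as raw memory (see Hash() above, backed by CityHash), so this commit folds topology, msaa_mode, the alpha test function and the early-Z flag into spare bits of the raw1/raw2 words via Common::BitField rather than giving each its own integer. As a hedged sketch of what BitField<position, bits, T> boils down to, here is the same idea written with plain shifts and masks — Pack/Unpack are illustrative names, not yuzu APIs:

    #include <cstdint>

    // Write `value` into a `bits`-wide field of `word` starting at `position`.
    // Assumes bits < 32.
    constexpr std::uint32_t Pack(std::uint32_t word, std::uint32_t value,
                                 unsigned position, unsigned bits) {
        const std::uint32_t mask = ((1u << bits) - 1u) << position;
        return (word & ~mask) | ((value << position) & mask);
    }

    // Read the same field back out.
    constexpr std::uint32_t Unpack(std::uint32_t word, unsigned position, unsigned bits) {
        return (word >> position) & ((1u << bits) - 1u);
    }

    // Mirrors BitField<24, 4, Maxwell::PrimitiveTopology> topology in raw1:
    //   raw1 = Pack(raw1, static_cast<std::uint32_t>(topology), 24, 4);

Keeping every field at a fixed bit position in a fixed word is what lets the struct keep being hashed and compared as an opaque byte range after the new fields are added.
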
std::array<u32, Maxwell::NumVertexArrays> binding_divisors; std::array<VertexAttribute, Maxwell::NumVertexAttributes> attributes; diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp index d22de1d81..ca7c2c579 100644 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp @@ -9,9 +9,9 @@ #include "common/logging/log.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/renderer_vulkan/maxwell_to_vk.h" -#include "video_core/renderer_vulkan/vk_device.h" -#include "video_core/renderer_vulkan/wrapper.h" #include "video_core/surface.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan::MaxwellToVK { @@ -26,7 +26,7 @@ VkFilter Filter(Tegra::Texture::TextureFilter filter) { case Tegra::Texture::TextureFilter::Linear: return VK_FILTER_LINEAR; } - UNREACHABLE_MSG("Invalid sampler filter={}", static_cast<u32>(filter)); + UNREACHABLE_MSG("Invalid sampler filter={}", filter); return {}; } @@ -43,11 +43,11 @@ VkSamplerMipmapMode MipmapMode(Tegra::Texture::TextureMipmapFilter mipmap_filter case Tegra::Texture::TextureMipmapFilter::Linear: return VK_SAMPLER_MIPMAP_MODE_LINEAR; } - UNREACHABLE_MSG("Invalid sampler mipmap mode={}", static_cast<u32>(mipmap_filter)); + UNREACHABLE_MSG("Invalid sampler mipmap mode={}", mipmap_filter); return {}; } -VkSamplerAddressMode WrapMode(const VKDevice& device, Tegra::Texture::WrapMode wrap_mode, +VkSamplerAddressMode WrapMode(const Device& device, Tegra::Texture::WrapMode wrap_mode, Tegra::Texture::TextureFilter filter) { switch (wrap_mode) { case Tegra::Texture::WrapMode::Wrap: @@ -79,7 +79,7 @@ VkSamplerAddressMode WrapMode(const VKDevice& device, Tegra::Texture::WrapMode w UNIMPLEMENTED(); return VK_SAMPLER_ADDRESS_MODE_MIRROR_CLAMP_TO_EDGE; default: - UNIMPLEMENTED_MSG("Unimplemented wrap mode={}", static_cast<u32>(wrap_mode)); + UNIMPLEMENTED_MSG("Unimplemented wrap mode={}", wrap_mode); return {}; } } @@ -103,8 +103,7 @@ VkCompareOp DepthCompareFunction(Tegra::Texture::DepthCompareFunc depth_compare_ case Tegra::Texture::DepthCompareFunc::Always: return VK_COMPARE_OP_ALWAYS; } - UNIMPLEMENTED_MSG("Unimplemented sampler depth compare function={}", - static_cast<u32>(depth_compare_func)); + UNIMPLEMENTED_MSG("Unimplemented sampler depth compare function={}", depth_compare_func); return {}; } @@ -123,7 +122,7 @@ struct FormatTuple { {VK_FORMAT_A8B8G8R8_SINT_PACK32, Attachable | Storage}, // A8B8G8R8_SINT {VK_FORMAT_A8B8G8R8_UINT_PACK32, Attachable | Storage}, // A8B8G8R8_UINT {VK_FORMAT_R5G6B5_UNORM_PACK16, Attachable}, // R5G6B5_UNORM - {VK_FORMAT_B5G6R5_UNORM_PACK16, Attachable}, // B5G6R5_UNORM + {VK_FORMAT_B5G6R5_UNORM_PACK16}, // B5G6R5_UNORM {VK_FORMAT_A1R5G5B5_UNORM_PACK16, Attachable}, // A1R5G5B5_UNORM {VK_FORMAT_A2B10G10R10_UNORM_PACK32, Attachable | Storage}, // A2B10G10R10_UNORM {VK_FORMAT_A2B10G10R10_UINT_PACK32, Attachable | Storage}, // A2B10G10R10_UINT @@ -164,7 +163,7 @@ struct FormatTuple { {VK_FORMAT_R16G16_UNORM, Attachable | Storage}, // R16G16_UNORM {VK_FORMAT_R16G16_SFLOAT, Attachable | Storage}, // R16G16_FLOAT {VK_FORMAT_UNDEFINED}, // R16G16_UINT - {VK_FORMAT_UNDEFINED}, // R16G16_SINT + {VK_FORMAT_R16G16_SINT, Attachable | Storage}, // R16G16_SINT {VK_FORMAT_R16G16_SNORM, Attachable | Storage}, // R16G16_SNORM {VK_FORMAT_UNDEFINED}, // R32G32B32_FLOAT {VK_FORMAT_R8G8B8A8_SRGB, Attachable}, // A8B8G8R8_SRGB @@ -223,30 +222,31 @@ constexpr 
bool IsZetaFormat(PixelFormat pixel_format) { } // Anonymous namespace -FormatInfo SurfaceFormat(const VKDevice& device, FormatType format_type, PixelFormat pixel_format) { +FormatInfo SurfaceFormat(const Device& device, FormatType format_type, PixelFormat pixel_format) { ASSERT(static_cast<std::size_t>(pixel_format) < std::size(tex_format_tuples)); auto tuple = tex_format_tuples[static_cast<std::size_t>(pixel_format)]; if (tuple.format == VK_FORMAT_UNDEFINED) { - UNIMPLEMENTED_MSG("Unimplemented texture format with pixel format={}", - static_cast<u32>(pixel_format)); + UNIMPLEMENTED_MSG("Unimplemented texture format with pixel format={}", pixel_format); return {VK_FORMAT_A8B8G8R8_UNORM_PACK32, true, true}; } // Use A8B8G8R8_UNORM on hardware that doesn't support ASTC natively if (!device.IsOptimalAstcSupported() && VideoCore::Surface::IsPixelFormatASTC(pixel_format)) { - tuple.format = VideoCore::Surface::IsPixelFormatSRGB(pixel_format) - ? VK_FORMAT_A8B8G8R8_SRGB_PACK32 - : VK_FORMAT_A8B8G8R8_UNORM_PACK32; + const bool is_srgb = VideoCore::Surface::IsPixelFormatSRGB(pixel_format); + tuple.format = is_srgb ? VK_FORMAT_A8B8G8R8_SRGB_PACK32 : VK_FORMAT_A8B8G8R8_UNORM_PACK32; } const bool attachable = tuple.usage & Attachable; const bool storage = tuple.usage & Storage; - VkFormatFeatureFlags usage; - if (format_type == FormatType::Buffer) { + VkFormatFeatureFlags usage{}; + switch (format_type) { + case FormatType::Buffer: usage = VK_FORMAT_FEATURE_STORAGE_TEXEL_BUFFER_BIT | VK_FORMAT_FEATURE_UNIFORM_TEXEL_BUFFER_BIT; - } else { + break; + case FormatType::Linear: + case FormatType::Optimal: usage = VK_FORMAT_FEATURE_SAMPLED_IMAGE_BIT | VK_FORMAT_FEATURE_TRANSFER_DST_BIT | VK_FORMAT_FEATURE_TRANSFER_SRC_BIT; if (attachable) { @@ -256,6 +256,7 @@ FormatInfo SurfaceFormat(const VKDevice& device, FormatType format_type, PixelFo if (storage) { usage |= VK_FORMAT_FEATURE_STORAGE_IMAGE_BIT; } + break; } return {device.GetSupportedFormat(tuple.format, usage, format_type), attachable, storage}; } @@ -275,11 +276,11 @@ VkShaderStageFlagBits ShaderStage(Tegra::Engines::ShaderType stage) { case Tegra::Engines::ShaderType::Compute: return VK_SHADER_STAGE_COMPUTE_BIT; } - UNIMPLEMENTED_MSG("Unimplemented shader stage={}", static_cast<u32>(stage)); + UNIMPLEMENTED_MSG("Unimplemented shader stage={}", stage); return {}; } -VkPrimitiveTopology PrimitiveTopology([[maybe_unused]] const VKDevice& device, +VkPrimitiveTopology PrimitiveTopology([[maybe_unused]] const Device& device, Maxwell::PrimitiveTopology topology) { switch (topology) { case Maxwell::PrimitiveTopology::Points: @@ -300,7 +301,7 @@ VkPrimitiveTopology PrimitiveTopology([[maybe_unused]] const VKDevice& device, case Maxwell::PrimitiveTopology::Patches: return VK_PRIMITIVE_TOPOLOGY_PATCH_LIST; default: - UNIMPLEMENTED_MSG("Unimplemented topology={}", static_cast<u32>(topology)); + UNIMPLEMENTED_MSG("Unimplemented topology={}", topology); return {}; } } @@ -490,8 +491,7 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib } break; } - UNIMPLEMENTED_MSG("Unimplemented vertex format of type={} and size={}", static_cast<u32>(type), - static_cast<u32>(size)); + UNIMPLEMENTED_MSG("Unimplemented vertex format of type={} and size={}", type, size); return {}; } @@ -522,11 +522,11 @@ VkCompareOp ComparisonOp(Maxwell::ComparisonOp comparison) { case Maxwell::ComparisonOp::AlwaysOld: return VK_COMPARE_OP_ALWAYS; } - UNIMPLEMENTED_MSG("Unimplemented comparison op={}", static_cast<u32>(comparison)); + 
UNIMPLEMENTED_MSG("Unimplemented comparison op={}", comparison); return {}; } -VkIndexType IndexFormat(const VKDevice& device, Maxwell::IndexFormat index_format) { +VkIndexType IndexFormat(const Device& device, Maxwell::IndexFormat index_format) { switch (index_format) { case Maxwell::IndexFormat::UnsignedByte: if (!device.IsExtIndexTypeUint8Supported()) { @@ -539,7 +539,7 @@ VkIndexType IndexFormat(const VKDevice& device, Maxwell::IndexFormat index_forma case Maxwell::IndexFormat::UnsignedInt: return VK_INDEX_TYPE_UINT32; } - UNIMPLEMENTED_MSG("Unimplemented index_format={}", static_cast<u32>(index_format)); + UNIMPLEMENTED_MSG("Unimplemented index_format={}", index_format); return {}; } @@ -570,7 +570,7 @@ VkStencilOp StencilOp(Maxwell::StencilOp stencil_op) { case Maxwell::StencilOp::DecrWrapOGL: return VK_STENCIL_OP_DECREMENT_AND_WRAP; } - UNIMPLEMENTED_MSG("Unimplemented stencil op={}", static_cast<u32>(stencil_op)); + UNIMPLEMENTED_MSG("Unimplemented stencil op={}", stencil_op); return {}; } @@ -592,7 +592,7 @@ VkBlendOp BlendEquation(Maxwell::Blend::Equation equation) { case Maxwell::Blend::Equation::MaxGL: return VK_BLEND_OP_MAX; } - UNIMPLEMENTED_MSG("Unimplemented blend equation={}", static_cast<u32>(equation)); + UNIMPLEMENTED_MSG("Unimplemented blend equation={}", equation); return {}; } @@ -656,7 +656,7 @@ VkBlendFactor BlendFactor(Maxwell::Blend::Factor factor) { case Maxwell::Blend::Factor::OneMinusConstantAlphaGL: return VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA; } - UNIMPLEMENTED_MSG("Unimplemented blend factor={}", static_cast<u32>(factor)); + UNIMPLEMENTED_MSG("Unimplemented blend factor={}", factor); return {}; } @@ -667,7 +667,7 @@ VkFrontFace FrontFace(Maxwell::FrontFace front_face) { case Maxwell::FrontFace::CounterClockWise: return VK_FRONT_FACE_COUNTER_CLOCKWISE; } - UNIMPLEMENTED_MSG("Unimplemented front face={}", static_cast<u32>(front_face)); + UNIMPLEMENTED_MSG("Unimplemented front face={}", front_face); return {}; } @@ -680,7 +680,7 @@ VkCullModeFlags CullFace(Maxwell::CullFace cull_face) { case Maxwell::CullFace::FrontAndBack: return VK_CULL_MODE_FRONT_AND_BACK; } - UNIMPLEMENTED_MSG("Unimplemented cull face={}", static_cast<u32>(cull_face)); + UNIMPLEMENTED_MSG("Unimplemented cull face={}", cull_face); return {}; } @@ -700,7 +700,7 @@ VkComponentSwizzle SwizzleSource(Tegra::Texture::SwizzleSource swizzle) { case Tegra::Texture::SwizzleSource::OneFloat: return VK_COMPONENT_SWIZZLE_ONE; } - UNIMPLEMENTED_MSG("Unimplemented swizzle source={}", static_cast<u32>(swizzle)); + UNIMPLEMENTED_MSG("Unimplemented swizzle source={}", swizzle); return {}; } @@ -723,8 +723,21 @@ VkViewportCoordinateSwizzleNV ViewportSwizzle(Maxwell::ViewportSwizzle swizzle) case Maxwell::ViewportSwizzle::NegativeW: return VK_VIEWPORT_COORDINATE_SWIZZLE_NEGATIVE_W_NV; } - UNREACHABLE_MSG("Invalid swizzle={}", static_cast<int>(swizzle)); + UNREACHABLE_MSG("Invalid swizzle={}", swizzle); return {}; } +VkSamplerReductionMode SamplerReduction(Tegra::Texture::SamplerReduction reduction) { + switch (reduction) { + case Tegra::Texture::SamplerReduction::WeightedAverage: + return VK_SAMPLER_REDUCTION_MODE_WEIGHTED_AVERAGE_EXT; + case Tegra::Texture::SamplerReduction::Min: + return VK_SAMPLER_REDUCTION_MODE_MIN_EXT; + case Tegra::Texture::SamplerReduction::Max: + return VK_SAMPLER_REDUCTION_MODE_MAX_EXT; + } + UNREACHABLE_MSG("Invalid sampler mode={}", static_cast<int>(reduction)); + return VK_SAMPLER_REDUCTION_MODE_WEIGHTED_AVERAGE_EXT; +} + } // namespace Vulkan::MaxwellToVK diff --git 
a/src/video_core/renderer_vulkan/maxwell_to_vk.h b/src/video_core/renderer_vulkan/maxwell_to_vk.h index 7e213452f..537969840 100644 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.h +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.h @@ -6,10 +6,10 @@ #include "common/common_types.h" #include "video_core/engines/maxwell_3d.h" -#include "video_core/renderer_vulkan/vk_device.h" -#include "video_core/renderer_vulkan/wrapper.h" #include "video_core/surface.h" #include "video_core/textures/texture.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan::MaxwellToVK { @@ -22,7 +22,7 @@ VkFilter Filter(Tegra::Texture::TextureFilter filter); VkSamplerMipmapMode MipmapMode(Tegra::Texture::TextureMipmapFilter mipmap_filter); -VkSamplerAddressMode WrapMode(const VKDevice& device, Tegra::Texture::WrapMode wrap_mode, +VkSamplerAddressMode WrapMode(const Device& device, Tegra::Texture::WrapMode wrap_mode, Tegra::Texture::TextureFilter filter); VkCompareOp DepthCompareFunction(Tegra::Texture::DepthCompareFunc depth_compare_func); @@ -35,17 +35,17 @@ struct FormatInfo { bool storage; }; -FormatInfo SurfaceFormat(const VKDevice& device, FormatType format_type, PixelFormat pixel_format); +FormatInfo SurfaceFormat(const Device& device, FormatType format_type, PixelFormat pixel_format); VkShaderStageFlagBits ShaderStage(Tegra::Engines::ShaderType stage); -VkPrimitiveTopology PrimitiveTopology(const VKDevice& device, Maxwell::PrimitiveTopology topology); +VkPrimitiveTopology PrimitiveTopology(const Device& device, Maxwell::PrimitiveTopology topology); VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttribute::Size size); VkCompareOp ComparisonOp(Maxwell::ComparisonOp comparison); -VkIndexType IndexFormat(const VKDevice& device, Maxwell::IndexFormat index_format); +VkIndexType IndexFormat(const Device& device, Maxwell::IndexFormat index_format); VkStencilOp StencilOp(Maxwell::StencilOp stencil_op); @@ -61,4 +61,6 @@ VkComponentSwizzle SwizzleSource(Tegra::Texture::SwizzleSource swizzle); VkViewportCoordinateSwizzleNV ViewportSwizzle(Maxwell::ViewportSwizzle swizzle); +VkSamplerReductionMode SamplerReduction(Tegra::Texture::SamplerReduction reduction); + } // namespace Vulkan::MaxwellToVK diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp index 715182b3b..d7437e185 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp +++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp @@ -12,8 +12,6 @@ #include <fmt/format.h> -#include "common/dynamic_library.h" -#include "common/file_util.h" #include "common/logging/log.h" #include "common/telemetry.h" #include "core/core.h" @@ -24,179 +22,27 @@ #include "video_core/gpu.h" #include "video_core/renderer_vulkan/renderer_vulkan.h" #include "video_core/renderer_vulkan/vk_blit_screen.h" -#include "video_core/renderer_vulkan/vk_device.h" #include "video_core/renderer_vulkan/vk_master_semaphore.h" #include "video_core/renderer_vulkan/vk_memory_manager.h" #include "video_core/renderer_vulkan/vk_rasterizer.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_state_tracker.h" #include "video_core/renderer_vulkan/vk_swapchain.h" -#include "video_core/renderer_vulkan/wrapper.h" - -// Include these late to avoid polluting previous headers -#ifdef _WIN32 -#include <windows.h> -// ensure include order -#include <vulkan/vulkan_win32.h> -#endif - -#if !defined(_WIN32) && 
!defined(__APPLE__) -#include <X11/Xlib.h> -#include <vulkan/vulkan_wayland.h> -#include <vulkan/vulkan_xlib.h> -#endif +#include "video_core/vulkan_common/vulkan_debug_callback.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_instance.h" +#include "video_core/vulkan_common/vulkan_library.h" +#include "video_core/vulkan_common/vulkan_surface.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { - namespace { - -using Core::Frontend::WindowSystemType; - -VkBool32 DebugCallback(VkDebugUtilsMessageSeverityFlagBitsEXT severity, - VkDebugUtilsMessageTypeFlagsEXT type, - const VkDebugUtilsMessengerCallbackDataEXT* data, - [[maybe_unused]] void* user_data) { - const char* const message{data->pMessage}; - - if (severity & VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT) { - LOG_CRITICAL(Render_Vulkan, "{}", message); - } else if (severity & VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT) { - LOG_WARNING(Render_Vulkan, "{}", message); - } else if (severity & VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT) { - LOG_INFO(Render_Vulkan, "{}", message); - } else if (severity & VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT) { - LOG_DEBUG(Render_Vulkan, "{}", message); - } - return VK_FALSE; -} - -Common::DynamicLibrary OpenVulkanLibrary() { - Common::DynamicLibrary library; -#ifdef __APPLE__ - // Check if a path to a specific Vulkan library has been specified. - char* libvulkan_env = getenv("LIBVULKAN_PATH"); - if (!libvulkan_env || !library.Open(libvulkan_env)) { - // Use the libvulkan.dylib from the application bundle. - const std::string filename = - Common::FS::GetBundleDirectory() + "/Contents/Frameworks/libvulkan.dylib"; - library.Open(filename.c_str()); - } -#else - std::string filename = Common::DynamicLibrary::GetVersionedFilename("vulkan", 1); - if (!library.Open(filename.c_str())) { - // Android devices may not have libvulkan.so.1, only libvulkan.so. 
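
Note: the helper being deleted here is not lost — the new includes above pull in video_core/vulkan_common/vulkan_library.h, and Init() below now calls OpenLibrary() instead. The fallback the removed code implements is worth spelling out: try the versioned soname first, then the bare one. On POSIX this is roughly equivalent to the following hedged sketch, using raw dlopen in place of Common::DynamicLibrary:

    #include <dlfcn.h>

    // Returns a handle to the Vulkan loader, or nullptr if none is present.
    void* OpenVulkanLoader() {
        // Prefer the versioned library name...
        void* handle = dlopen("libvulkan.so.1", RTLD_NOW | RTLD_LOCAL);
        if (handle == nullptr) {
            // ...but some systems (e.g. Android) ship only libvulkan.so.
            handle = dlopen("libvulkan.so", RTLD_NOW | RTLD_LOCAL);
        }
        return handle;
    }
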
- filename = Common::DynamicLibrary::GetVersionedFilename("vulkan"); - (void)library.Open(filename.c_str()); - } -#endif - return library; -} - -vk::Instance CreateInstance(Common::DynamicLibrary& library, vk::InstanceDispatch& dld, - WindowSystemType window_type = WindowSystemType::Headless, - bool enable_layers = false) { - if (!library.IsOpen()) { - LOG_ERROR(Render_Vulkan, "Vulkan library not available"); - return {}; - } - if (!library.GetSymbol("vkGetInstanceProcAddr", &dld.vkGetInstanceProcAddr)) { - LOG_ERROR(Render_Vulkan, "vkGetInstanceProcAddr not present in Vulkan"); - return {}; - } - if (!vk::Load(dld)) { - LOG_ERROR(Render_Vulkan, "Failed to load Vulkan function pointers"); - return {}; - } - - std::vector<const char*> extensions; - extensions.reserve(6); - switch (window_type) { - case Core::Frontend::WindowSystemType::Headless: - break; -#ifdef _WIN32 - case Core::Frontend::WindowSystemType::Windows: - extensions.push_back(VK_KHR_WIN32_SURFACE_EXTENSION_NAME); - break; -#endif -#if !defined(_WIN32) && !defined(__APPLE__) - case Core::Frontend::WindowSystemType::X11: - extensions.push_back(VK_KHR_XLIB_SURFACE_EXTENSION_NAME); - break; - case Core::Frontend::WindowSystemType::Wayland: - extensions.push_back(VK_KHR_WAYLAND_SURFACE_EXTENSION_NAME); - break; -#endif - default: - LOG_ERROR(Render_Vulkan, "Presentation not supported on this platform"); - break; - } - if (window_type != Core::Frontend::WindowSystemType::Headless) { - extensions.push_back(VK_KHR_SURFACE_EXTENSION_NAME); - } - if (enable_layers) { - extensions.push_back(VK_EXT_DEBUG_UTILS_EXTENSION_NAME); - } - extensions.push_back(VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME); - - const std::optional properties = vk::EnumerateInstanceExtensionProperties(dld); - if (!properties) { - LOG_ERROR(Render_Vulkan, "Failed to query extension properties"); - return {}; - } - - for (const char* extension : extensions) { - const auto it = - std::find_if(properties->begin(), properties->end(), [extension](const auto& prop) { - return !std::strcmp(extension, prop.extensionName); - }); - if (it == properties->end()) { - LOG_ERROR(Render_Vulkan, "Required instance extension {} is not available", extension); - return {}; - } - } - - std::vector<const char*> layers; - layers.reserve(1); - if (enable_layers) { - layers.push_back("VK_LAYER_KHRONOS_validation"); - } - - const std::optional layer_properties = vk::EnumerateInstanceLayerProperties(dld); - if (!layer_properties) { - LOG_ERROR(Render_Vulkan, "Failed to query layer properties, disabling layers"); - layers.clear(); - } - - for (auto layer_it = layers.begin(); layer_it != layers.end();) { - const char* const layer = *layer_it; - const auto it = std::find_if( - layer_properties->begin(), layer_properties->end(), - [layer](const VkLayerProperties& prop) { return !std::strcmp(layer, prop.layerName); }); - if (it == layer_properties->end()) { - LOG_ERROR(Render_Vulkan, "Layer {} not available, removing it", layer); - layer_it = layers.erase(layer_it); - } else { - ++layer_it; - } - } - - vk::Instance instance = vk::Instance::Create(layers, extensions, dld); - if (!instance) { - LOG_ERROR(Render_Vulkan, "Failed to create Vulkan instance"); - return {}; - } - if (!vk::Load(*instance, dld)) { - LOG_ERROR(Render_Vulkan, "Failed to load Vulkan instance function pointers"); - } - return instance; -} - std::string GetReadableVersion(u32 version) { return fmt::format("{}.{}.{}", VK_VERSION_MAJOR(version), VK_VERSION_MINOR(version), VK_VERSION_PATCH(version)); } -std::string 
GetDriverVersion(const VKDevice& device) { +std::string GetDriverVersion(const Device& device) { // Extracted from // https://github.com/SaschaWillems/vulkan.gpuinfo.org/blob/5dddea46ea1120b0df14eef8f15ff8e318e35462/functions.php#L308-L314 const u32 version = device.GetDriverVersion(); @@ -213,7 +59,6 @@ std::string GetDriverVersion(const VKDevice& device) { const u32 minor = version & 0x3fff; return fmt::format("{}.{}", major, minor); } - return GetReadableVersion(version); } @@ -240,8 +85,8 @@ std::string BuildCommaSeparatedExtensions(std::vector<std::string> available_ext RendererVulkan::RendererVulkan(Core::TelemetrySession& telemetry_session_, Core::Frontend::EmuWindow& emu_window, Core::Memory::Memory& cpu_memory_, Tegra::GPU& gpu_, - std::unique_ptr<Core::Frontend::GraphicsContext> context) - : RendererBase{emu_window, std::move(context)}, telemetry_session{telemetry_session_}, + std::unique_ptr<Core::Frontend::GraphicsContext> context_) + : RendererBase{emu_window, std::move(context_)}, telemetry_session{telemetry_session_}, cpu_memory{cpu_memory_}, gpu{gpu_} {} RendererVulkan::~RendererVulkan() { @@ -249,12 +94,9 @@ RendererVulkan::~RendererVulkan() { } void RendererVulkan::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { - render_window.PollEvents(); - if (!framebuffer) { return; } - const auto& layout = render_window.GetFramebufferLayout(); if (layout.width > 0 && layout.height > 0 && render_window.IsShown()) { const VAddr framebuffer_addr = framebuffer->address + framebuffer->offset; @@ -280,17 +122,19 @@ void RendererVulkan::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { rasterizer->TickFrame(); } - render_window.PollEvents(); + render_window.OnFrameDisplayed(); } -bool RendererVulkan::Init() { - library = OpenVulkanLibrary(); - instance = CreateInstance(library, dld, render_window.GetWindowInfo().type, - Settings::values.renderer_debug); - if (!instance || !CreateDebugCallback() || !CreateSurface() || !PickDevices()) { - return false; +bool RendererVulkan::Init() try { + library = OpenLibrary(); + instance = CreateInstance(library, dld, VK_API_VERSION_1_1, render_window.GetWindowInfo().type, + true, Settings::values.renderer_debug); + if (Settings::values.renderer_debug) { + debug_callback = CreateDebugCallback(instance); } + surface = CreateSurface(instance, render_window); + InitializeDevice(); Report(); memory_manager = std::make_unique<VKMemoryManager>(*device); @@ -310,8 +154,11 @@ bool RendererVulkan::Init() { blit_screen = std::make_unique<VKBlitScreen>(cpu_memory, render_window, *rasterizer, *device, *memory_manager, *swapchain, *scheduler, screen_info); - return true; + +} catch (const vk::Exception& exception) { + LOG_ERROR(Render_Vulkan, "Vulkan initialization failed with error: {}", exception.what()); + return false; } void RendererVulkan::ShutDown() { @@ -321,7 +168,6 @@ void RendererVulkan::ShutDown() { if (const auto& dev = device->GetLogical()) { dev.WaitIdle(); } - rasterizer.reset(); blit_screen.reset(); scheduler.reset(); @@ -330,94 +176,15 @@ void RendererVulkan::ShutDown() { device.reset(); } -bool RendererVulkan::CreateDebugCallback() { - if (!Settings::values.renderer_debug) { - return true; - } - debug_callback = instance.TryCreateDebugCallback(DebugCallback); - if (!debug_callback) { - LOG_ERROR(Render_Vulkan, "Failed to create debug callback"); - return false; - } - return true; -} - -bool RendererVulkan::CreateSurface() { - [[maybe_unused]] const auto& window_info = render_window.GetWindowInfo(); - VkSurfaceKHR unsafe_surface = 
nullptr; - -#ifdef _WIN32 - if (window_info.type == Core::Frontend::WindowSystemType::Windows) { - const HWND hWnd = static_cast<HWND>(window_info.render_surface); - const VkWin32SurfaceCreateInfoKHR win32_ci{VK_STRUCTURE_TYPE_WIN32_SURFACE_CREATE_INFO_KHR, - nullptr, 0, nullptr, hWnd}; - const auto vkCreateWin32SurfaceKHR = reinterpret_cast<PFN_vkCreateWin32SurfaceKHR>( - dld.vkGetInstanceProcAddr(*instance, "vkCreateWin32SurfaceKHR")); - if (!vkCreateWin32SurfaceKHR || - vkCreateWin32SurfaceKHR(*instance, &win32_ci, nullptr, &unsafe_surface) != VK_SUCCESS) { - LOG_ERROR(Render_Vulkan, "Failed to initialize Win32 surface"); - return false; - } - } -#endif -#if !defined(_WIN32) && !defined(__APPLE__) - if (window_info.type == Core::Frontend::WindowSystemType::X11) { - const VkXlibSurfaceCreateInfoKHR xlib_ci{ - VK_STRUCTURE_TYPE_XLIB_SURFACE_CREATE_INFO_KHR, nullptr, 0, - static_cast<Display*>(window_info.display_connection), - reinterpret_cast<Window>(window_info.render_surface)}; - const auto vkCreateXlibSurfaceKHR = reinterpret_cast<PFN_vkCreateXlibSurfaceKHR>( - dld.vkGetInstanceProcAddr(*instance, "vkCreateXlibSurfaceKHR")); - if (!vkCreateXlibSurfaceKHR || - vkCreateXlibSurfaceKHR(*instance, &xlib_ci, nullptr, &unsafe_surface) != VK_SUCCESS) { - LOG_ERROR(Render_Vulkan, "Failed to initialize Xlib surface"); - return false; - } - } - if (window_info.type == Core::Frontend::WindowSystemType::Wayland) { - const VkWaylandSurfaceCreateInfoKHR wayland_ci{ - VK_STRUCTURE_TYPE_WAYLAND_SURFACE_CREATE_INFO_KHR, nullptr, 0, - static_cast<wl_display*>(window_info.display_connection), - static_cast<wl_surface*>(window_info.render_surface)}; - const auto vkCreateWaylandSurfaceKHR = reinterpret_cast<PFN_vkCreateWaylandSurfaceKHR>( - dld.vkGetInstanceProcAddr(*instance, "vkCreateWaylandSurfaceKHR")); - if (!vkCreateWaylandSurfaceKHR || - vkCreateWaylandSurfaceKHR(*instance, &wayland_ci, nullptr, &unsafe_surface) != - VK_SUCCESS) { - LOG_ERROR(Render_Vulkan, "Failed to initialize Wayland surface"); - return false; - } - } -#endif - if (!unsafe_surface) { - LOG_ERROR(Render_Vulkan, "Presentation not supported on this platform"); - return false; - } - - surface = vk::SurfaceKHR(unsafe_surface, *instance, dld); - return true; -} - -bool RendererVulkan::PickDevices() { - const auto devices = instance.EnumeratePhysicalDevices(); - if (!devices) { - LOG_ERROR(Render_Vulkan, "Failed to enumerate physical devices"); - return false; - } - +void RendererVulkan::InitializeDevice() { + const std::vector<VkPhysicalDevice> devices = instance.EnumeratePhysicalDevices(); const s32 device_index = Settings::values.vulkan_device.GetValue(); - if (device_index < 0 || device_index >= static_cast<s32>(devices->size())) { + if (device_index < 0 || device_index >= static_cast<s32>(devices.size())) { LOG_ERROR(Render_Vulkan, "Invalid device index {}!", device_index); - return false; - } - const vk::PhysicalDevice physical_device((*devices)[static_cast<std::size_t>(device_index)], - dld); - if (!VKDevice::IsSuitable(physical_device, *surface)) { - return false; + throw vk::Exception(VK_ERROR_INITIALIZATION_FAILED); } - - device = std::make_unique<VKDevice>(*instance, physical_device, *surface, dld); - return device->Create(); + const vk::PhysicalDevice physical_device(devices[static_cast<size_t>(device_index)], dld); + device = std::make_unique<Device>(*instance, physical_device, *surface, dld); } void RendererVulkan::Report() const { @@ -426,7 +193,7 @@ void RendererVulkan::Report() const { const std::string driver_version = 
GetDriverVersion(*device); const std::string driver_name = fmt::format("{} {}", vendor_name, driver_version); - const std::string api_version = GetReadableVersion(device->GetApiVersion()); + const std::string api_version = GetReadableVersion(device->ApiVersion()); const std::string extensions = BuildCommaSeparatedExtensions(device->GetAvailableExtensions()); @@ -442,25 +209,21 @@ void RendererVulkan::Report() const { telemetry_session.AddField(field, "GPU_Vulkan_Extensions", extensions); } -std::vector<std::string> RendererVulkan::EnumerateDevices() { +std::vector<std::string> RendererVulkan::EnumerateDevices() try { vk::InstanceDispatch dld; - Common::DynamicLibrary library = OpenVulkanLibrary(); - vk::Instance instance = CreateInstance(library, dld); - if (!instance) { - return {}; - } - - const std::optional physical_devices = instance.EnumeratePhysicalDevices(); - if (!physical_devices) { - return {}; - } - + const Common::DynamicLibrary library = OpenLibrary(); + const vk::Instance instance = CreateInstance(library, dld, VK_API_VERSION_1_0); + const std::vector<VkPhysicalDevice> physical_devices = instance.EnumeratePhysicalDevices(); std::vector<std::string> names; - names.reserve(physical_devices->size()); - for (const auto& device : *physical_devices) { + names.reserve(physical_devices.size()); + for (const VkPhysicalDevice device : physical_devices) { names.push_back(vk::PhysicalDevice(device, dld).GetProperties().deviceName); } return names; + +} catch (const vk::Exception& exception) { + LOG_ERROR(Render_Vulkan, "Failed to enumerate devices with error: {}", exception.what()); + return {}; } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.h b/src/video_core/renderer_vulkan/renderer_vulkan.h index 49a4141ec..5575ffc54 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.h +++ b/src/video_core/renderer_vulkan/renderer_vulkan.h @@ -11,7 +11,7 @@ #include "common/dynamic_library.h" #include "video_core/renderer_base.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Core { class TelemetrySession; @@ -27,16 +27,15 @@ class GPU; namespace Vulkan { +class Device; class StateTracker; class VKBlitScreen; -class VKDevice; class VKMemoryManager; class VKSwapchain; class VKScheduler; -class VKImage; struct VKScreenInfo { - VKImage* image{}; + VkImageView image_view{}; u32 width{}; u32 height{}; bool is_srgb{}; @@ -45,9 +44,9 @@ struct VKScreenInfo { class RendererVulkan final : public VideoCore::RendererBase { public: explicit RendererVulkan(Core::TelemetrySession& telemtry_session, - Core::Frontend::EmuWindow& emu_window, Core::Memory::Memory& cpu_memory, - Tegra::GPU& gpu, - std::unique_ptr<Core::Frontend::GraphicsContext> context); + Core::Frontend::EmuWindow& emu_window, + Core::Memory::Memory& cpu_memory_, Tegra::GPU& gpu_, + std::unique_ptr<Core::Frontend::GraphicsContext> context_); ~RendererVulkan() override; bool Init() override; @@ -57,11 +56,7 @@ public: static std::vector<std::string> EnumerateDevices(); private: - bool CreateDebugCallback(); - - bool CreateSurface(); - - bool PickDevices(); + void InitializeDevice(); void Report() const; @@ -73,12 +68,13 @@ private: vk::InstanceDispatch dld; vk::Instance instance; + vk::SurfaceKHR surface; VKScreenInfo screen_info; - vk::DebugCallback debug_callback; - std::unique_ptr<VKDevice> device; + vk::DebugUtilsMessenger debug_callback; + std::unique_ptr<Device> device; std::unique_ptr<VKMemoryManager> memory_manager; 
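
Note on the member order in this class: C++ destroys non-static data members in reverse declaration order, so keeping dld, instance and surface above device and the per-frame objects guarantees the VkInstance outlives everything created from it even if ShutDown() never runs. A minimal sketch of the rule, with illustrative names:

    #include <cstdio>

    struct Instance { ~Instance() { std::puts("instance destroyed last"); } };
    struct Device   { ~Device()   { std::puts("device destroyed first"); } };

    struct Renderer {
        Instance instance; // declared first -> destroyed last
        Device device;     // depends on instance, so it is declared after it
    };

    int main() {
        Renderer renderer;
        // On scope exit: "device destroyed first", then "instance destroyed last".
    }
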
std::unique_ptr<StateTracker> state_tracker; std::unique_ptr<VKScheduler> scheduler; diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.cpp b/src/video_core/renderer_vulkan/vk_blit_screen.cpp index b5b60309e..5e184eb42 100644 --- a/src/video_core/renderer_vulkan/vk_blit_screen.cpp +++ b/src/video_core/renderer_vulkan/vk_blit_screen.cpp @@ -16,121 +16,25 @@ #include "core/frontend/emu_window.h" #include "core/memory.h" #include "video_core/gpu.h" -#include "video_core/morton.h" +#include "video_core/host_shaders/vulkan_present_frag_spv.h" +#include "video_core/host_shaders/vulkan_present_vert_spv.h" #include "video_core/rasterizer_interface.h" #include "video_core/renderer_vulkan/renderer_vulkan.h" #include "video_core/renderer_vulkan/vk_blit_screen.h" -#include "video_core/renderer_vulkan/vk_device.h" -#include "video_core/renderer_vulkan/vk_image.h" #include "video_core/renderer_vulkan/vk_master_semaphore.h" #include "video_core/renderer_vulkan/vk_memory_manager.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_shader_util.h" #include "video_core/renderer_vulkan/vk_swapchain.h" -#include "video_core/renderer_vulkan/wrapper.h" #include "video_core/surface.h" +#include "video_core/textures/decoders.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { namespace { -// Generated from the "shaders/" directory, read the instructions there. -constexpr u8 blit_vertex_code[] = { /* embedded SPIR-V bytes for the present vertex shader, elided here; this commit removes the hand-pasted blob in favor of the generated vulkan_present_vert_spv.h */ }; -constexpr u8 blit_fragment_code[] = { /* embedded SPIR-V bytes for the present fragment shader, likewise elided; superseded by the generated vulkan_present_frag_spv.h */ }; struct ScreenRectVertex { ScreenRectVertex() = default; explicit ScreenRectVertex(f32 x, f32 y, f32 u, f32 v) : position{{x, y}}, tex_coord{{u, v}} {} @@ -173,9 +77,9
     // clang-format on
 }
 
-std::size_t GetBytesPerPixel(const Tegra::FramebufferConfig& framebuffer) {
+u32 GetBytesPerPixel(const Tegra::FramebufferConfig& framebuffer) {
     using namespace VideoCore::Surface;
-    return GetBytesPerPixel(PixelFormatFromGPUPixelFormat(framebuffer.pixel_format));
+    return BytesPerBlock(PixelFormatFromGPUPixelFormat(framebuffer.pixel_format));
 }
 
 std::size_t GetSizeInBytes(const Tegra::FramebufferConfig& framebuffer) {
@@ -210,7 +114,7 @@ struct VKBlitScreen::BufferData {
 
 VKBlitScreen::VKBlitScreen(Core::Memory::Memory& cpu_memory_,
                            Core::Frontend::EmuWindow& render_window_,
-                           VideoCore::RasterizerInterface& rasterizer_, const VKDevice& device_,
+                           VideoCore::RasterizerInterface& rasterizer_, const Device& device_,
                            VKMemoryManager& memory_manager_, VKSwapchain& swapchain_,
                            VKScheduler& scheduler_, const VKScreenInfo& screen_info_)
     : cpu_memory{cpu_memory_}, render_window{render_window_}, rasterizer{rasterizer_},
@@ -239,34 +143,30 @@ VkSemaphore VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, bool
     scheduler.Wait(resource_ticks[image_index]);
     resource_ticks[image_index] = scheduler.CurrentTick();
 
-    VKImage* blit_image = use_accelerated ? screen_info.image : raw_images[image_index].get();
-
-    UpdateDescriptorSet(image_index, blit_image->GetPresentView());
+    UpdateDescriptorSet(image_index,
+                        use_accelerated ? screen_info.image_view : *raw_image_views[image_index]);
 
     BufferData data;
     SetUniformData(data, framebuffer);
     SetVertexData(data, framebuffer);
 
     auto map = buffer_commit->Map();
-    std::memcpy(map.GetAddress(), &data, sizeof(data));
+    std::memcpy(map.Address(), &data, sizeof(data));
 
     if (!use_accelerated) {
         const u64 image_offset = GetRawImageOffset(framebuffer, image_index);
-        const auto pixel_format =
-            VideoCore::Surface::PixelFormatFromGPUPixelFormat(framebuffer.pixel_format);
 
         const VAddr framebuffer_addr = framebuffer.address + framebuffer.offset;
-        const auto host_ptr = cpu_memory.GetPointer(framebuffer_addr);
-        rasterizer.FlushRegion(ToCacheAddr(host_ptr), GetSizeInBytes(framebuffer));
+        const u8* const host_ptr = cpu_memory.GetPointer(framebuffer_addr);
+        const size_t size_bytes = GetSizeInBytes(framebuffer);
+        rasterizer.FlushRegion(ToCacheAddr(host_ptr), size_bytes);
 
         // TODO(Rodrigo): Read this from HLE
         constexpr u32 block_height_log2 = 4;
-        VideoCore::MortonSwizzle(VideoCore::MortonSwizzleMode::MortonToLinear, pixel_format,
-                                 framebuffer.stride, block_height_log2, framebuffer.height, 0, 1, 1,
-                                 map.GetAddress() + image_offset, host_ptr);
-
-        blit_image->Transition(0, 1, 0, 1, VK_PIPELINE_STAGE_TRANSFER_BIT,
-                               VK_ACCESS_TRANSFER_WRITE_BIT, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL);
+        const u32 bytes_per_pixel = GetBytesPerPixel(framebuffer);
+        Tegra::Texture::UnswizzleTexture(
+            std::span(map.Address() + image_offset, size_bytes), std::span(host_ptr, size_bytes),
+            bytes_per_pixel, framebuffer.width, framebuffer.height, 1, block_height_log2, 0);
 
         const VkBufferImageCopy copy{
             .bufferOffset = image_offset,
@@ -288,15 +188,44 @@ VkSemaphore VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, bool
             },
         };
         scheduler.Record(
-            [buffer = *buffer, image = *blit_image->GetHandle(), copy](vk::CommandBuffer cmdbuf) {
-                cmdbuf.CopyBufferToImage(buffer, image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, copy);
+            [buffer = *buffer, image = *raw_images[image_index], copy](vk::CommandBuffer cmdbuf) {
+                const VkImageMemoryBarrier base_barrier{
+                    .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
+                    .pNext = nullptr,
+                    .srcAccessMask = 0,
+                    .dstAccessMask = 0,
+                    .oldLayout = VK_IMAGE_LAYOUT_GENERAL,
+                    .newLayout = VK_IMAGE_LAYOUT_GENERAL,
+                    .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+                    .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+                    .image = image,
+                    .subresourceRange =
+                        {
+                            .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
+                            .baseMipLevel = 0,
+                            .levelCount = 1,
+                            .baseArrayLayer = 0,
+                            .layerCount = 1,
+                        },
+                };
+                VkImageMemoryBarrier read_barrier = base_barrier;
+                read_barrier.srcAccessMask = VK_ACCESS_HOST_WRITE_BIT;
+                read_barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
+                read_barrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED;
+
+                VkImageMemoryBarrier write_barrier = base_barrier;
+                write_barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
+                write_barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
+
+                cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
+                                       0, read_barrier);
+                cmdbuf.CopyBufferToImage(buffer, image, VK_IMAGE_LAYOUT_GENERAL, copy);
+                cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT,
+                                       VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, 0, write_barrier);
             });
     }
     map.Release();
 
-    blit_image->Transition(0, 1, 0, 1, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT,
-                           VK_ACCESS_SHADER_READ_BIT, VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
-
     scheduler.Record([renderpass = *renderpass, framebuffer = *framebuffers[image_index],
                       descriptor_set = descriptor_sets[image_index], buffer = *buffer,
                       size = swapchain.GetSize(), pipeline = *pipeline,
@@ -304,31 +233,31 @@ VkSemaphore VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, bool
         const VkClearValue clear_color{
             .color = {.float32 = {0.0f, 0.0f, 0.0f, 0.0f}},
         };
-
-        VkRenderPassBeginInfo renderpass_bi;
-        renderpass_bi.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO;
-        renderpass_bi.pNext = nullptr;
-        renderpass_bi.renderPass = renderpass;
-        renderpass_bi.framebuffer = framebuffer;
-        renderpass_bi.renderArea.offset.x = 0;
-        renderpass_bi.renderArea.offset.y = 0;
-        renderpass_bi.renderArea.extent = size;
-        renderpass_bi.clearValueCount = 1;
-        renderpass_bi.pClearValues = &clear_color;
-
-        VkViewport viewport;
-        viewport.x = 0.0f;
-        viewport.y = 0.0f;
-        viewport.width = static_cast<float>(size.width);
-        viewport.height = static_cast<float>(size.height);
-        viewport.minDepth = 0.0f;
-        viewport.maxDepth = 1.0f;
-
-        VkRect2D scissor;
-        scissor.offset.x = 0;
-        scissor.offset.y = 0;
-        scissor.extent = size;
-
+        const VkRenderPassBeginInfo renderpass_bi{
+            .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
+            .pNext = nullptr,
+            .renderPass = renderpass,
+            .framebuffer = framebuffer,
+            .renderArea =
+                {
+                    .offset = {0, 0},
+                    .extent = size,
+                },
+            .clearValueCount = 1,
+            .pClearValues = &clear_color,
+        };
+        const VkViewport viewport{
+            .x = 0.0f,
+            .y = 0.0f,
+            .width = static_cast<float>(size.width),
+            .height = static_cast<float>(size.height),
+            .minDepth = 0.0f,
+            .maxDepth = 1.0f,
+        };
+        const VkRect2D scissor{
+            .offset = {0, 0},
+            .extent = size,
+        };
         cmdbuf.BeginRenderPass(renderpass_bi, VK_SUBPASS_CONTENTS_INLINE);
         cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline);
         cmdbuf.SetViewport(0, viewport);
@@ -372,8 +301,8 @@ void VKBlitScreen::RefreshResources(const Tegra::FramebufferConfig& framebuffer)
 }
 
 void VKBlitScreen::CreateShaders() {
-    vertex_shader = BuildShader(device, sizeof(blit_vertex_code), blit_vertex_code);
-    fragment_shader = BuildShader(device, sizeof(blit_fragment_code), blit_fragment_code);
+    vertex_shader = BuildShader(device, VULKAN_PRESENT_VERT_SPV);
+    fragment_shader = BuildShader(device, VULKAN_PRESENT_FRAG_SPV);
 }
 
 void VKBlitScreen::CreateSemaphores() {
@@ -420,7 +349,7 @@ void VKBlitScreen::CreateRenderPass() {
     const VkAttachmentReference color_attachment_ref{
         .attachment = 0,
-        .layout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
+        .layout = VK_IMAGE_LAYOUT_GENERAL,
     };
 
     const VkSubpassDescription subpass_description{
@@ -735,34 +664,56 @@ void VKBlitScreen::CreateStagingBuffer(const Tegra::FramebufferConfig& framebuff
 
 void VKBlitScreen::CreateRawImages(const Tegra::FramebufferConfig& framebuffer) {
     raw_images.resize(image_count);
+    raw_image_views.resize(image_count);
     raw_buffer_commits.resize(image_count);
 
-    const VkImageCreateInfo ci{
-        .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
-        .pNext = nullptr,
-        .flags = 0,
-        .imageType = VK_IMAGE_TYPE_2D,
-        .format = GetFormat(framebuffer),
-        .extent =
-            {
-                .width = framebuffer.width,
-                .height = framebuffer.height,
-                .depth = 1,
-            },
-        .mipLevels = 1,
-        .arrayLayers = 1,
-        .samples = VK_SAMPLE_COUNT_1_BIT,
-        .tiling = VK_IMAGE_TILING_LINEAR,
-        .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT,
-        .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
-        .queueFamilyIndexCount = 0,
-        .pQueueFamilyIndices = nullptr,
-        .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED,
-    };
-
-    for (std::size_t i = 0; i < image_count; ++i) {
-        raw_images[i] = std::make_unique<VKImage>(device, scheduler, ci, VK_IMAGE_ASPECT_COLOR_BIT);
-        raw_buffer_commits[i] = memory_manager.Commit(raw_images[i]->GetHandle(), false);
+    for (size_t i = 0; i < image_count; ++i) {
+        raw_images[i] = device.GetLogical().CreateImage(VkImageCreateInfo{
+            .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
+            .pNext = nullptr,
+            .flags = 0,
+            .imageType = VK_IMAGE_TYPE_2D,
+            .format = GetFormat(framebuffer),
+            .extent =
+                {
+                    .width = framebuffer.width,
+                    .height = framebuffer.height,
+                    .depth = 1,
+                },
+            .mipLevels = 1,
+            .arrayLayers = 1,
+            .samples = VK_SAMPLE_COUNT_1_BIT,
+            .tiling = VK_IMAGE_TILING_LINEAR,
+            .usage = VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT,
+            .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
+            .queueFamilyIndexCount = 0,
+            .pQueueFamilyIndices = nullptr,
+            .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED,
+        });
+        raw_buffer_commits[i] = memory_manager.Commit(raw_images[i], false);
+        raw_image_views[i] = device.GetLogical().CreateImageView(VkImageViewCreateInfo{
+            .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
+            .pNext = nullptr,
+            .flags = 0,
+            .image = *raw_images[i],
+            .viewType = VK_IMAGE_VIEW_TYPE_2D,
+            .format = GetFormat(framebuffer),
+            .components =
+                {
+                    .r = VK_COMPONENT_SWIZZLE_IDENTITY,
+                    .g = VK_COMPONENT_SWIZZLE_IDENTITY,
+                    .b = VK_COMPONENT_SWIZZLE_IDENTITY,
+                    .a = VK_COMPONENT_SWIZZLE_IDENTITY,
+                },
+            .subresourceRange =
+                {
+                    .aspectMask = VK_IMAGE_ASPECT_COLOR_BIT,
+                    .baseMipLevel = 0,
+                    .levelCount = 1,
+                    .baseArrayLayer = 0,
+                    .layerCount = 1,
+                },
+        });
    }
 }
 
@@ -789,7 +740,7 @@ void VKBlitScreen::UpdateDescriptorSet(std::size_t image_index, VkImageView imag
     const VkDescriptorImageInfo image_info{
         .sampler = *sampler,
         .imageView = image_view,
-        .imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
+        .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
    };
 
     const VkWriteDescriptorSet sampler_write{
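Note: the Draw() change above drops VKImage's tracked layout transitions and keeps the linear staging image in VK_IMAGE_LAYOUT_GENERAL for its entire life, bracketing the buffer-to-image copy with two explicit barriers instead. Distilled out of yuzu's vk:: wrapper into the plain Vulkan C API, the pattern looks roughly like the sketch below (function and parameter names are illustrative, not yuzu's):

#include <vulkan/vulkan.h>

// A CPU-written, linearly tiled image that lives permanently in GENERAL layout:
// 1) host write -> transfer, 2) copy, 3) transfer -> fragment-shader read.
void UploadLinearImage(VkCommandBuffer cmdbuf, VkBuffer staging, VkImage image,
                       const VkBufferImageCopy& copy) {
    VkImageMemoryBarrier barrier{};
    barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
    barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
    barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
    barrier.image = image;
    barrier.subresourceRange = {VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1};

    // Make the host's memcpy visible to the transfer stage. oldLayout may be
    // UNDEFINED because the copy overwrites the whole subresource anyway.
    barrier.srcAccessMask = VK_ACCESS_HOST_WRITE_BIT;
    barrier.dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
    barrier.oldLayout = VK_IMAGE_LAYOUT_UNDEFINED;
    barrier.newLayout = VK_IMAGE_LAYOUT_GENERAL;
    vkCmdPipelineBarrier(cmdbuf, VK_PIPELINE_STAGE_HOST_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
                         0, 0, nullptr, 0, nullptr, 1, &barrier);

    vkCmdCopyBufferToImage(cmdbuf, staging, image, VK_IMAGE_LAYOUT_GENERAL, 1, &copy);

    // Make the copied texels visible to the fragment shader that samples them.
    barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
    barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT;
    barrier.oldLayout = VK_IMAGE_LAYOUT_GENERAL;
    vkCmdPipelineBarrier(cmdbuf, VK_PIPELINE_STAGE_TRANSFER_BIT,
                         VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, 0, 0, nullptr, 0, nullptr,
                         1, &barrier);
}

Staying in GENERAL trades a little theoretical sampling performance for never having to track per-image layout state, which is why the render-pass attachment and descriptor layouts in the hunks above switch to VK_IMAGE_LAYOUT_GENERAL as well.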
diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.h b/src/video_core/renderer_vulkan/vk_blit_screen.h
index 8f2839214..69ed61770 100644
--- a/src/video_core/renderer_vulkan/vk_blit_screen.h
+++ b/src/video_core/renderer_vulkan/vk_blit_screen.h
@@ -7,7 +7,7 @@
 #include <memory>
 
 #include "video_core/renderer_vulkan/vk_memory_manager.h"
-#include "video_core/renderer_vulkan/wrapper.h"
+#include "video_core/vulkan_common/vulkan_wrapper.h"
 
 namespace Core {
 class System;
@@ -33,9 +33,8 @@ namespace Vulkan {
 
 struct ScreenInfo;
 
+class Device;
 class RasterizerVulkan;
-class VKDevice;
-class VKImage;
 class VKScheduler;
 class VKSwapchain;
 
@@ -43,7 +42,7 @@ class VKBlitScreen final {
 public:
     explicit VKBlitScreen(Core::Memory::Memory& cpu_memory,
                           Core::Frontend::EmuWindow& render_window,
-                          VideoCore::RasterizerInterface& rasterizer, const VKDevice& device,
+                          VideoCore::RasterizerInterface& rasterizer, const Device& device,
                           VKMemoryManager& memory_manager, VKSwapchain& swapchain,
                           VKScheduler& scheduler, const VKScreenInfo& screen_info);
     ~VKBlitScreen();
@@ -86,7 +85,7 @@ private:
     Core::Memory::Memory& cpu_memory;
     Core::Frontend::EmuWindow& render_window;
     VideoCore::RasterizerInterface& rasterizer;
-    const VKDevice& device;
+    const Device& device;
     VKMemoryManager& memory_manager;
     VKSwapchain& swapchain;
     VKScheduler& scheduler;
@@ -110,7 +109,8 @@ private:
     std::vector<u64> resource_ticks;
 
     std::vector<vk::Semaphore> semaphores;
-    std::vector<std::unique_ptr<VKImage>> raw_images;
+    std::vector<vk::Image> raw_images;
+    std::vector<vk::ImageView> raw_image_views;
     std::vector<VKMemoryCommit> raw_buffer_commits;
     u32 raw_width = 0;
     u32 raw_height = 0;
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
index d9d3da9ea..4d517c547 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp
@@ -9,10 +9,10 @@
 #include "core/core.h"
 #include "video_core/buffer_cache/buffer_cache.h"
 #include "video_core/renderer_vulkan/vk_buffer_cache.h"
-#include "video_core/renderer_vulkan/vk_device.h"
 #include "video_core/renderer_vulkan/vk_scheduler.h"
 #include "video_core/renderer_vulkan/vk_stream_buffer.h"
-#include "video_core/renderer_vulkan/wrapper.h"
+#include "video_core/vulkan_common/vulkan_device.h"
+#include "video_core/vulkan_common/vulkan_wrapper.h"
 
 namespace Vulkan {
 
@@ -31,20 +31,24 @@
 constexpr VkAccessFlags UPLOAD_ACCESS_BARRIERS =
     VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_UNIFORM_READ_BIT |
     VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | VK_ACCESS_INDEX_READ_BIT;
 
-std::unique_ptr<VKStreamBuffer> CreateStreamBuffer(const VKDevice& device, VKScheduler& scheduler) {
-    return std::make_unique<VKStreamBuffer>(device, scheduler, BUFFER_USAGE);
+constexpr VkAccessFlags TRANSFORM_FEEDBACK_WRITE_ACCESS =
+    VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT | VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT;
+
+std::unique_ptr<VKStreamBuffer> CreateStreamBuffer(const Device& device, VKScheduler& scheduler) {
+    return std::make_unique<VKStreamBuffer>(device, scheduler);
 }
 
 } // Anonymous namespace
 
-Buffer::Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VKScheduler& scheduler_,
-               VKStagingBufferPool& staging_pool_, VAddr cpu_addr, std::size_t size)
-    : BufferBlock{cpu_addr, size}, scheduler{scheduler_}, staging_pool{staging_pool_} {
+Buffer::Buffer(const Device& device_, VKMemoryManager& memory_manager, VKScheduler& scheduler_,
+               VKStagingBufferPool& staging_pool_, VAddr cpu_addr_, std::size_t size_)
+    : BufferBlock{cpu_addr_, size_}, device{device_}, scheduler{scheduler_}, staging_pool{
+                                                                                 staging_pool_} {
     const VkBufferCreateInfo ci{
         .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
         .pNext = nullptr,
         .flags = 0,
-        .size = static_cast<VkDeviceSize>(size),
+        .size = static_cast<VkDeviceSize>(size_),
         .usage = BUFFER_USAGE | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT,
         .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
         .queueFamilyIndexCount = 0,
@@ -57,69 +61,86 @@ Buffer::Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VKSchedu
 
 Buffer::~Buffer() = default;
 
-void Buffer::Upload(std::size_t offset, std::size_t size, const u8* data) {
-    const auto& staging = staging_pool.GetUnusedBuffer(size, true);
-    std::memcpy(staging.commit->Map(size), data, size);
+void Buffer::Upload(std::size_t offset, std::size_t data_size, const u8* data) {
+    const auto& staging = staging_pool.GetUnusedBuffer(data_size, true);
+    std::memcpy(staging.commit->Map(data_size), data, data_size);
 
     scheduler.RequestOutsideRenderPassOperationContext();
 
     const VkBuffer handle = Handle();
-    scheduler.Record([staging = *staging.handle, handle, offset, size](vk::CommandBuffer cmdbuf) {
-        cmdbuf.CopyBuffer(staging, handle, VkBufferCopy{0, offset, size});
-
-        const VkBufferMemoryBarrier barrier{
+    scheduler.Record([staging = *staging.handle, handle, offset, data_size,
+                      &device = device](vk::CommandBuffer cmdbuf) {
+        const VkBufferMemoryBarrier read_barrier{
             .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
             .pNext = nullptr,
-            .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
-            .dstAccessMask = UPLOAD_ACCESS_BARRIERS,
+            .srcAccessMask =
+                VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_TRANSFER_WRITE_BIT |
+                VK_ACCESS_HOST_WRITE_BIT |
+                (device.IsExtTransformFeedbackSupported() ? TRANSFORM_FEEDBACK_WRITE_ACCESS : 0),
+            .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT,
             .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
             .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
             .buffer = handle,
             .offset = offset,
-            .size = size,
+            .size = data_size,
        };
-        cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, UPLOAD_PIPELINE_STAGE, 0, {},
-                               barrier, {});
-    });
-}
-
-void Buffer::Download(std::size_t offset, std::size_t size, u8* data) {
-    const auto& staging = staging_pool.GetUnusedBuffer(size, true);
-    scheduler.RequestOutsideRenderPassOperationContext();
-
-    const VkBuffer handle = Handle();
-    scheduler.Record([staging = *staging.handle, handle, offset, size](vk::CommandBuffer cmdbuf) {
-        const VkBufferMemoryBarrier barrier{
+        const VkBufferMemoryBarrier write_barrier{
             .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
             .pNext = nullptr,
-            .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
-            .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT,
+            .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
+            .dstAccessMask = UPLOAD_ACCESS_BARRIERS,
             .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
             .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
             .buffer = handle,
             .offset = offset,
-            .size = size,
+            .size = data_size,
         };
-
-        cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_VERTEX_SHADER_BIT |
-                                   VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT |
-                                   VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
-                               VK_PIPELINE_STAGE_TRANSFER_BIT, 0, {}, barrier, {});
-        cmdbuf.CopyBuffer(handle, staging, VkBufferCopy{offset, 0, size});
+        cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
+                               0, read_barrier);
+        cmdbuf.CopyBuffer(staging, handle, VkBufferCopy{0, offset, data_size});
+        cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, UPLOAD_PIPELINE_STAGE, 0,
+                               write_barrier);
     });
+}
+
+void Buffer::Download(std::size_t offset, std::size_t data_size, u8* data) {
+    const auto& staging = staging_pool.GetUnusedBuffer(data_size, true);
+    scheduler.RequestOutsideRenderPassOperationContext();
+
+    const VkBuffer handle = Handle();
+    scheduler.Record(
+        [staging = *staging.handle, handle, offset, data_size](vk::CommandBuffer cmdbuf) {
+            const VkBufferMemoryBarrier barrier{
+                .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
+                .pNext = nullptr,
+                .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
+                .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT,
+                .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+                .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+                .buffer = handle,
+                .offset = offset,
+                .size = data_size,
+            };
+
+            cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_VERTEX_SHADER_BIT |
+                                       VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT |
+                                       VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
+                                   VK_PIPELINE_STAGE_TRANSFER_BIT, 0, {}, barrier, {});
+            cmdbuf.CopyBuffer(handle, staging, VkBufferCopy{offset, 0, data_size});
+        });
     scheduler.Finish();
 
-    std::memcpy(data, staging.commit->Map(size), size);
+    std::memcpy(data, staging.commit->Map(data_size), data_size);
 }
 
 void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
-                      std::size_t size) {
+                      std::size_t copy_size) {
     scheduler.RequestOutsideRenderPassOperationContext();
 
     const VkBuffer dst_buffer = Handle();
     scheduler.Record([src_buffer = src.Handle(), dst_buffer, src_offset, dst_offset,
-                      size](vk::CommandBuffer cmdbuf) {
-        cmdbuf.CopyBuffer(src_buffer, dst_buffer, VkBufferCopy{src_offset, dst_offset, size});
+                      copy_size](vk::CommandBuffer cmdbuf) {
+        cmdbuf.CopyBuffer(src_buffer, dst_buffer, VkBufferCopy{src_offset, dst_offset, copy_size});
 
         std::array<VkBufferMemoryBarrier, 2> barriers;
         barriers[0].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
@@ -130,7 +151,7 @@ void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst
         barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
         barriers[0].buffer = src_buffer;
         barriers[0].offset = src_offset;
-        barriers[0].size = size;
+        barriers[0].size = copy_size;
         barriers[1].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
         barriers[1].pNext = nullptr;
         barriers[1].srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
@@ -139,19 +160,19 @@ void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst
         barriers[1].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
         barriers[1].buffer = dst_buffer;
         barriers[1].offset = dst_offset;
-        barriers[1].size = size;
+        barriers[1].size = copy_size;
         cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, UPLOAD_PIPELINE_STAGE, 0, {},
                                barriers, {});
     });
 }
 
-VKBufferCache::VKBufferCache(VideoCore::RasterizerInterface& rasterizer,
-                             Tegra::MemoryManager& gpu_memory, Core::Memory::Memory& cpu_memory,
-                             const VKDevice& device_, VKMemoryManager& memory_manager_,
-                             VKScheduler& scheduler_, VKStagingBufferPool& staging_pool_)
-    : VideoCommon::BufferCache<Buffer, VkBuffer, VKStreamBuffer>{rasterizer, gpu_memory, cpu_memory,
-                                                                 CreateStreamBuffer(device_,
-                                                                                    scheduler_)},
+VKBufferCache::VKBufferCache(VideoCore::RasterizerInterface& rasterizer_,
+                             Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_,
+                             const Device& device_, VKMemoryManager& memory_manager_,
+                             VKScheduler& scheduler_, VKStreamBuffer& stream_buffer_,
+                             VKStagingBufferPool& staging_pool_)
+    : VideoCommon::BufferCache<Buffer, VkBuffer, VKStreamBuffer>{rasterizer_, gpu_memory_,
+                                                                 cpu_memory_, stream_buffer_},
      device{device_}, memory_manager{memory_manager_}, scheduler{scheduler_}, staging_pool{
                                                                                    staging_pool_} {}
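Note: Buffer::Upload now sandwiches the copy between two barriers, where the old code only published the result afterwards; the first barrier additionally orders the copy after any prior GPU writes to the same range (including transform-feedback writes when the extension is available). A standalone sketch of the same sandwich with the C API — here VK_PIPELINE_STAGE_ALL_COMMANDS_BIT stands in for yuzu's UPLOAD_PIPELINE_STAGE constant, the transform-feedback bits are omitted, and all names are illustrative:

#include <vulkan/vulkan.h>

void SandwichedBufferUpload(VkCommandBuffer cmdbuf, VkBuffer staging, VkBuffer dst,
                            VkDeviceSize offset, VkDeviceSize size) {
    VkBufferMemoryBarrier barrier{};
    barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
    barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
    barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
    barrier.buffer = dst;
    barrier.offset = offset;
    barrier.size = size;

    // 1) Wait for earlier shader/transfer/host writes before overwriting the range.
    barrier.srcAccessMask =
        VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_HOST_WRITE_BIT;
    barrier.dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT;
    vkCmdPipelineBarrier(cmdbuf, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
                         VK_PIPELINE_STAGE_TRANSFER_BIT, 0, 0, nullptr, 1, &barrier, 0, nullptr);

    // 2) The actual staging-to-destination copy.
    const VkBufferCopy copy{0, offset, size};
    vkCmdCopyBuffer(cmdbuf, staging, dst, 1, &copy);

    // 3) Publish the fresh data to the stages that may read the buffer next.
    barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT;
    barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_UNIFORM_READ_BIT |
                            VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | VK_ACCESS_INDEX_READ_BIT |
                            VK_ACCESS_TRANSFER_READ_BIT;
    vkCmdPipelineBarrier(cmdbuf, VK_PIPELINE_STAGE_TRANSFER_BIT,
                         VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, 0, nullptr, 1, &barrier,
                         0, nullptr);
}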
diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h
index 7fb5ceedf..1c39aed34 100644
--- a/src/video_core/renderer_vulkan/vk_buffer_cache.h
+++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h
@@ -11,26 +11,26 @@
 #include "video_core/renderer_vulkan/vk_memory_manager.h"
 #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
 #include "video_core/renderer_vulkan/vk_stream_buffer.h"
-#include "video_core/renderer_vulkan/wrapper.h"
+#include "video_core/vulkan_common/vulkan_wrapper.h"
 
 namespace Vulkan {
 
-class VKDevice;
+class Device;
 class VKMemoryManager;
 class VKScheduler;
 
 class Buffer final : public VideoCommon::BufferBlock {
 public:
-    explicit Buffer(const VKDevice& device, VKMemoryManager& memory_manager, VKScheduler& scheduler,
-                    VKStagingBufferPool& staging_pool, VAddr cpu_addr, std::size_t size);
+    explicit Buffer(const Device& device, VKMemoryManager& memory_manager, VKScheduler& scheduler,
+                    VKStagingBufferPool& staging_pool, VAddr cpu_addr_, std::size_t size_);
     ~Buffer();
 
-    void Upload(std::size_t offset, std::size_t size, const u8* data);
+    void Upload(std::size_t offset, std::size_t data_size, const u8* data);
 
-    void Download(std::size_t offset, std::size_t size, u8* data);
+    void Download(std::size_t offset, std::size_t data_size, u8* data);
 
     void CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset,
-                  std::size_t size);
+                  std::size_t copy_size);
 
     VkBuffer Handle() const {
         return *buffer.handle;
@@ -41,6 +41,7 @@ public:
     }
 
 private:
+    const Device& device;
     VKScheduler& scheduler;
     VKStagingBufferPool& staging_pool;
 
@@ -51,8 +52,9 @@ class VKBufferCache final : public VideoCommon::BufferCache<Buffer, VkBuffer, VK
 public:
     explicit VKBufferCache(VideoCore::RasterizerInterface& rasterizer,
                            Tegra::MemoryManager& gpu_memory, Core::Memory::Memory& cpu_memory,
-                           const VKDevice& device, VKMemoryManager& memory_manager,
-                           VKScheduler& scheduler, VKStagingBufferPool& staging_pool);
+                           const Device& device, VKMemoryManager& memory_manager,
+                           VKScheduler& scheduler, VKStreamBuffer& stream_buffer,
+                           VKStagingBufferPool& staging_pool);
     ~VKBufferCache();
 
     BufferInfo GetEmptyBuffer(std::size_t size) override;
@@ -61,7 +63,7 @@ protected:
     std::shared_ptr<Buffer> CreateBlock(VAddr cpu_addr, std::size_t size) override;
 
 private:
-    const VKDevice& device;
+    const Device& device;
     VKMemoryManager& memory_manager;
     VKScheduler& scheduler;
     VKStagingBufferPool& staging_pool;
diff --git a/src/video_core/renderer_vulkan/vk_command_pool.cpp b/src/video_core/renderer_vulkan/vk_command_pool.cpp
index f1abd4b1a..a99df9323 100644
--- a/src/video_core/renderer_vulkan/vk_command_pool.cpp
+++ b/src/video_core/renderer_vulkan/vk_command_pool.cpp
@@ -5,15 +5,20 @@
 #include <cstddef>
 
 #include "video_core/renderer_vulkan/vk_command_pool.h"
-#include "video_core/renderer_vulkan/vk_device.h"
-#include "video_core/renderer_vulkan/wrapper.h"
+#include "video_core/vulkan_common/vulkan_device.h"
+#include "video_core/vulkan_common/vulkan_wrapper.h"
 
 namespace Vulkan {
 
 constexpr size_t COMMAND_BUFFER_POOL_SIZE = 0x1000;
 
-CommandPool::CommandPool(MasterSemaphore& master_semaphore, const VKDevice& device)
-    : ResourcePool(master_semaphore, COMMAND_BUFFER_POOL_SIZE), device{device} {}
+struct CommandPool::Pool {
+    vk::CommandPool handle;
+    vk::CommandBuffers cmdbufs;
+};
+
+CommandPool::CommandPool(MasterSemaphore& master_semaphore_, const Device& device_)
+    : ResourcePool(master_semaphore_, COMMAND_BUFFER_POOL_SIZE), device{device_} {}
 
 CommandPool::~CommandPool() = default;
diff --git a/src/video_core/renderer_vulkan/vk_command_pool.h b/src/video_core/renderer_vulkan/vk_command_pool.h
index 3aee239b9..61c26a22a 100644
--- a/src/video_core/renderer_vulkan/vk_command_pool.h
+++ b/src/video_core/renderer_vulkan/vk_command_pool.h
@@ -2,33 +2,32 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#pragma once
+
 #include <cstddef>
 #include <vector>
 
 #include "video_core/renderer_vulkan/vk_resource_pool.h"
-#include "video_core/renderer_vulkan/wrapper.h"
+#include "video_core/vulkan_common/vulkan_wrapper.h"
 
 namespace Vulkan {
 
+class Device;
 class MasterSemaphore;
-class VKDevice;
 
 class CommandPool final : public ResourcePool {
 public:
-    explicit CommandPool(MasterSemaphore& master_semaphore, const VKDevice& device);
-    virtual ~CommandPool();
+    explicit CommandPool(MasterSemaphore& master_semaphore_, const Device& device_);
+    ~CommandPool() override;
 
     void Allocate(size_t begin, size_t end) override;
 
     VkCommandBuffer Commit();
 
 private:
-    struct Pool {
-        vk::CommandPool handle;
-        vk::CommandBuffers cmdbufs;
-    };
+    struct Pool;
 
-    const VKDevice& device;
+    const Device& device;
     std::vector<Pool> pools;
 };
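Note: moving the nested Pool struct out of the header works because std::vector is required (since C++17) to accept an incomplete element type, as long as the type is complete wherever the vector is actually used or destroyed — hence the destructor is defaulted in the .cpp after the struct definition. A minimal sketch of the idiom, independent of yuzu's types (names are illustrative):

#include <vector>

// pool.h -- the element type is only forward-declared here.
class WidgetPool {
public:
    WidgetPool();
    ~WidgetPool(); // must be *defined* where Impl is complete

private:
    struct Impl;             // incomplete type
    std::vector<Impl> pools; // OK: std::vector tolerates incomplete T until used
};

// pool.cpp -- the definition lives next to the code that touches the vector.
struct WidgetPool::Impl {
    int handle = 0;
};

WidgetPool::WidgetPool() = default;
WidgetPool::~WidgetPool() = default;

The payoff is the usual pimpl one: headers that include vk_command_pool.h no longer see (or rebuild against) the vk:: wrapper types stored per pool.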
diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp
index 9637c6059..02a6d54b7 100644
--- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp
+++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp
@@ -10,111 +10,21 @@
 #include "common/alignment.h"
 #include "common/assert.h"
 #include "common/common_types.h"
+#include "video_core/host_shaders/vulkan_quad_array_comp_spv.h"
+#include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h"
+#include "video_core/host_shaders/vulkan_uint8_comp_spv.h"
 #include "video_core/renderer_vulkan/vk_compute_pass.h"
 #include "video_core/renderer_vulkan/vk_descriptor_pool.h"
-#include "video_core/renderer_vulkan/vk_device.h"
 #include "video_core/renderer_vulkan/vk_scheduler.h"
 #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
 #include "video_core/renderer_vulkan/vk_update_descriptor.h"
-#include "video_core/renderer_vulkan/wrapper.h"
+#include "video_core/vulkan_common/vulkan_device.h"
+#include "video_core/vulkan_common/vulkan_wrapper.h"
 
 namespace Vulkan {
 
 namespace {
 
-// Quad array SPIR-V module. Generated from the "shaders/" directory, read the instructions there.
-constexpr u8 quad_array[] = {
-    [... embedded SPIR-V byte array elided; deleted in favor of the generated VULKAN_QUAD_ARRAY_COMP_SPV header ...]
-};
-
 VkDescriptorSetLayoutBinding BuildQuadArrayPassDescriptorSetLayoutBinding() {
     return {
         .binding = 0,
@@ -144,208 +54,6 @@ VkPushConstantRange BuildComputePushConstantRange(std::size_t size) {
     };
 }
 
-// Uint8 SPIR-V module. Generated from the "shaders/" directory.
-constexpr u8 uint8_pass[] = {
-    [... embedded SPIR-V byte array elided; replaced by VULKAN_UINT8_COMP_SPV ...]
-};
-
-// Quad indexed SPIR-V module. Generated from the "shaders/" directory.
-constexpr u8 QUAD_INDEXED_SPV[] = {
-    [... embedded SPIR-V byte array elided; replaced by VULKAN_QUAD_INDEXED_COMP_SPV ...]
-};
-
 std::array<VkDescriptorSetLayoutBinding, 2> BuildInputOutputDescriptorSetBindings() {
     return {{
         {
@@ -378,11 +86,11 @@ VkDescriptorUpdateTemplateEntryKHR BuildInputOutputDescriptorUpdateTemplate() {
 
 } // Anonymous namespace
 
-VKComputePass::VKComputePass(const VKDevice& device, VKDescriptorPool& descriptor_pool,
+VKComputePass::VKComputePass(const Device& device, VKDescriptorPool& descriptor_pool,
                              vk::Span<VkDescriptorSetLayoutBinding> bindings,
                              vk::Span<VkDescriptorUpdateTemplateEntryKHR> templates,
-                             vk::Span<VkPushConstantRange> push_constants, std::size_t code_size,
-                             const u8* code) {
+                             vk::Span<VkPushConstantRange> push_constants,
+                             std::span<const u32> code) {
     descriptor_set_layout = device.GetLogical().CreateDescriptorSetLayout({
         .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
         .pNext = nullptr,
@@ -390,7 +98,6 @@ VKComputePass::VKComputePass(const VKDevice& device, VKDescriptorPool& descripto
         .bindingCount = bindings.size(),
         .pBindings = bindings.data(),
     });
-
     layout = device.GetLogical().CreatePipelineLayout({
         .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
         .pNext = nullptr,
@@ -400,7 +107,6 @@ VKComputePass::VKComputePass(const VKDevice& device, VKDescriptorPool& descripto
         .pushConstantRangeCount = push_constants.size(),
         .pPushConstantRanges = push_constants.data(),
     });
-
     if (!templates.empty()) {
         descriptor_template = device.GetLogical().CreateDescriptorUpdateTemplateKHR({
             .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_UPDATE_TEMPLATE_CREATE_INFO_KHR,
@@ -417,18 +123,13 @@ VKComputePass::VKComputePass(const VKDevice& device, VKDescriptorPool& descripto
 
         descriptor_allocator.emplace(descriptor_pool, *descriptor_set_layout);
     }
-
-    auto code_copy = std::make_unique<u32[]>(code_size / sizeof(u32) + 1);
-    std::memcpy(code_copy.get(), code, code_size);
     module = device.GetLogical().CreateShaderModule({
         .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
         .pNext = nullptr,
         .flags = 0,
-        .codeSize = code_size,
-        .pCode = code_copy.get(),
+        .codeSize = static_cast<u32>(code.size_bytes()),
+        .pCode = code.data(),
     });
-
     pipeline = device.GetLogical().CreateComputePipeline({
         .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
         .pNext = nullptr,
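Note: the constructor change in the hunk above is the payoff of moving SPIR-V into generated u32 headers. VkShaderModuleCreateInfo::pCode is a const uint32_t*, so the old u8 byte arrays forced an aligned copy (the removed code_copy buffer) before every module creation; data that is already typed as 32-bit words can be handed over directly. A minimal sketch against the C API (the helper name is illustrative, and error handling is elided):

#include <span>
#include <vulkan/vulkan.h>

VkShaderModule BuildShaderModule(VkDevice device, std::span<const uint32_t> code) {
    const VkShaderModuleCreateInfo ci{
        .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,
        .pNext = nullptr,
        .flags = 0,
        .codeSize = code.size_bytes(), // size in *bytes*, despite the u32-typed pCode
        .pCode = code.data(),          // no copy: the span is already word-aligned
    };
    VkShaderModule module = VK_NULL_HANDLE;
    vkCreateShaderModule(device, &ci, nullptr, &module);
    return module;
}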
VKComputePass::CommitDescriptorSet( return set; } -QuadArrayPass::QuadArrayPass(const VKDevice& device, VKScheduler& scheduler, - VKDescriptorPool& descriptor_pool, - VKStagingBufferPool& staging_buffer_pool, - VKUpdateDescriptorQueue& update_descriptor_queue) - : VKComputePass(device, descriptor_pool, BuildQuadArrayPassDescriptorSetLayoutBinding(), +QuadArrayPass::QuadArrayPass(const Device& device_, VKScheduler& scheduler_, + VKDescriptorPool& descriptor_pool_, + VKStagingBufferPool& staging_buffer_pool_, + VKUpdateDescriptorQueue& update_descriptor_queue_) + : VKComputePass(device_, descriptor_pool_, BuildQuadArrayPassDescriptorSetLayoutBinding(), BuildQuadArrayPassDescriptorUpdateTemplateEntry(), - BuildComputePushConstantRange(sizeof(u32)), std::size(quad_array), quad_array), - scheduler{scheduler}, staging_buffer_pool{staging_buffer_pool}, - update_descriptor_queue{update_descriptor_queue} {} + BuildComputePushConstantRange(sizeof(u32)), VULKAN_QUAD_ARRAY_COMP_SPV), + scheduler{scheduler_}, staging_buffer_pool{staging_buffer_pool_}, + update_descriptor_queue{update_descriptor_queue_} {} QuadArrayPass::~QuadArrayPass() = default; @@ -510,14 +211,13 @@ std::pair<VkBuffer, VkDeviceSize> QuadArrayPass::Assemble(u32 num_vertices, u32 return {*buffer.handle, 0}; } -Uint8Pass::Uint8Pass(const VKDevice& device, VKScheduler& scheduler, - VKDescriptorPool& descriptor_pool, VKStagingBufferPool& staging_buffer_pool, - VKUpdateDescriptorQueue& update_descriptor_queue) +Uint8Pass::Uint8Pass(const Device& device, VKScheduler& scheduler_, + VKDescriptorPool& descriptor_pool, VKStagingBufferPool& staging_buffer_pool_, + VKUpdateDescriptorQueue& update_descriptor_queue_) : VKComputePass(device, descriptor_pool, BuildInputOutputDescriptorSetBindings(), - BuildInputOutputDescriptorUpdateTemplate(), {}, std::size(uint8_pass), - uint8_pass), - scheduler{scheduler}, staging_buffer_pool{staging_buffer_pool}, - update_descriptor_queue{update_descriptor_queue} {} + BuildInputOutputDescriptorUpdateTemplate(), {}, VULKAN_UINT8_COMP_SPV), + scheduler{scheduler_}, staging_buffer_pool{staging_buffer_pool_}, + update_descriptor_queue{update_descriptor_queue_} {} Uint8Pass::~Uint8Pass() = default; @@ -555,16 +255,15 @@ std::pair<VkBuffer, u64> Uint8Pass::Assemble(u32 num_vertices, VkBuffer src_buff return {*buffer.handle, 0}; } -QuadIndexedPass::QuadIndexedPass(const VKDevice& device, VKScheduler& scheduler, - VKDescriptorPool& descriptor_pool, - VKStagingBufferPool& staging_buffer_pool, - VKUpdateDescriptorQueue& update_descriptor_queue) - : VKComputePass(device, descriptor_pool, BuildInputOutputDescriptorSetBindings(), +QuadIndexedPass::QuadIndexedPass(const Device& device_, VKScheduler& scheduler_, + VKDescriptorPool& descriptor_pool_, + VKStagingBufferPool& staging_buffer_pool_, + VKUpdateDescriptorQueue& update_descriptor_queue_) + : VKComputePass(device_, descriptor_pool_, BuildInputOutputDescriptorSetBindings(), BuildInputOutputDescriptorUpdateTemplate(), - BuildComputePushConstantRange(sizeof(u32) * 2), std::size(QUAD_INDEXED_SPV), - QUAD_INDEXED_SPV), - scheduler{scheduler}, staging_buffer_pool{staging_buffer_pool}, - update_descriptor_queue{update_descriptor_queue} {} + BuildComputePushConstantRange(sizeof(u32) * 2), VULKAN_QUAD_INDEXED_COMP_SPV), + scheduler{scheduler_}, staging_buffer_pool{staging_buffer_pool_}, + update_descriptor_queue{update_descriptor_queue_} {} QuadIndexedPass::~QuadIndexedPass() = default; diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.h 
b/src/video_core/renderer_vulkan/vk_compute_pass.h index acc94f27e..7ddb09afb 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pass.h +++ b/src/video_core/renderer_vulkan/vk_compute_pass.h @@ -5,27 +5,27 @@ #pragma once #include <optional> +#include <span> #include <utility> #include "common/common_types.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/renderer_vulkan/vk_descriptor_pool.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { -class VKDevice; +class Device; class VKScheduler; class VKStagingBufferPool; class VKUpdateDescriptorQueue; class VKComputePass { public: - explicit VKComputePass(const VKDevice& device, VKDescriptorPool& descriptor_pool, + explicit VKComputePass(const Device& device, VKDescriptorPool& descriptor_pool, vk::Span<VkDescriptorSetLayoutBinding> bindings, vk::Span<VkDescriptorUpdateTemplateEntryKHR> templates, - vk::Span<VkPushConstantRange> push_constants, std::size_t code_size, - const u8* code); + vk::Span<VkPushConstantRange> push_constants, std::span<const u32> code); ~VKComputePass(); protected: @@ -43,10 +43,10 @@ private: class QuadArrayPass final : public VKComputePass { public: - explicit QuadArrayPass(const VKDevice& device, VKScheduler& scheduler, - VKDescriptorPool& descriptor_pool, - VKStagingBufferPool& staging_buffer_pool, - VKUpdateDescriptorQueue& update_descriptor_queue); + explicit QuadArrayPass(const Device& device_, VKScheduler& scheduler_, + VKDescriptorPool& descriptor_pool_, + VKStagingBufferPool& staging_buffer_pool_, + VKUpdateDescriptorQueue& update_descriptor_queue_); ~QuadArrayPass(); std::pair<VkBuffer, VkDeviceSize> Assemble(u32 num_vertices, u32 first); @@ -59,9 +59,10 @@ private: class Uint8Pass final : public VKComputePass { public: - explicit Uint8Pass(const VKDevice& device, VKScheduler& scheduler, - VKDescriptorPool& descriptor_pool, VKStagingBufferPool& staging_buffer_pool, - VKUpdateDescriptorQueue& update_descriptor_queue); + explicit Uint8Pass(const Device& device_, VKScheduler& scheduler_, + VKDescriptorPool& descriptor_pool_, + VKStagingBufferPool& staging_buffer_pool_, + VKUpdateDescriptorQueue& update_descriptor_queue_); ~Uint8Pass(); std::pair<VkBuffer, u64> Assemble(u32 num_vertices, VkBuffer src_buffer, u64 src_offset); @@ -74,10 +75,10 @@ private: class QuadIndexedPass final : public VKComputePass { public: - explicit QuadIndexedPass(const VKDevice& device, VKScheduler& scheduler, - VKDescriptorPool& descriptor_pool, - VKStagingBufferPool& staging_buffer_pool, - VKUpdateDescriptorQueue& update_descriptor_queue); + explicit QuadIndexedPass(const Device& device_, VKScheduler& scheduler_, + VKDescriptorPool& descriptor_pool_, + VKStagingBufferPool& staging_buffer_pool_, + VKUpdateDescriptorQueue& update_descriptor_queue_); ~QuadIndexedPass(); std::pair<VkBuffer, u64> Assemble(Tegra::Engines::Maxwell3D::Regs::IndexFormat index_format, diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp index 9be72dc9b..3a48219b7 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.cpp @@ -6,25 +6,25 @@ #include "video_core/renderer_vulkan/vk_compute_pipeline.h" #include "video_core/renderer_vulkan/vk_descriptor_pool.h" -#include "video_core/renderer_vulkan/vk_device.h" #include "video_core/renderer_vulkan/vk_pipeline_cache.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include 
"video_core/renderer_vulkan/vk_shader_decompiler.h" #include "video_core/renderer_vulkan/vk_update_descriptor.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { -VKComputePipeline::VKComputePipeline(const VKDevice& device, VKScheduler& scheduler, - VKDescriptorPool& descriptor_pool, - VKUpdateDescriptorQueue& update_descriptor_queue, - const SPIRVShader& shader) - : device{device}, scheduler{scheduler}, entries{shader.entries}, +VKComputePipeline::VKComputePipeline(const Device& device_, VKScheduler& scheduler_, + VKDescriptorPool& descriptor_pool_, + VKUpdateDescriptorQueue& update_descriptor_queue_, + const SPIRVShader& shader_) + : device{device_}, scheduler{scheduler_}, entries{shader_.entries}, descriptor_set_layout{CreateDescriptorSetLayout()}, - descriptor_allocator{descriptor_pool, *descriptor_set_layout}, - update_descriptor_queue{update_descriptor_queue}, layout{CreatePipelineLayout()}, + descriptor_allocator{descriptor_pool_, *descriptor_set_layout}, + update_descriptor_queue{update_descriptor_queue_}, layout{CreatePipelineLayout()}, descriptor_template{CreateDescriptorUpdateTemplate()}, - shader_module{CreateShaderModule(shader.code)}, pipeline{CreatePipeline()} {} + shader_module{CreateShaderModule(shader_.code)}, pipeline{CreatePipeline()} {} VKComputePipeline::~VKComputePipeline() = default; diff --git a/src/video_core/renderer_vulkan/vk_compute_pipeline.h b/src/video_core/renderer_vulkan/vk_compute_pipeline.h index 6e2f22a4a..7e16575ac 100644 --- a/src/video_core/renderer_vulkan/vk_compute_pipeline.h +++ b/src/video_core/renderer_vulkan/vk_compute_pipeline.h @@ -7,20 +7,20 @@ #include "common/common_types.h" #include "video_core/renderer_vulkan/vk_descriptor_pool.h" #include "video_core/renderer_vulkan/vk_shader_decompiler.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { -class VKDevice; +class Device; class VKScheduler; class VKUpdateDescriptorQueue; class VKComputePipeline final { public: - explicit VKComputePipeline(const VKDevice& device, VKScheduler& scheduler, - VKDescriptorPool& descriptor_pool, - VKUpdateDescriptorQueue& update_descriptor_queue, - const SPIRVShader& shader); + explicit VKComputePipeline(const Device& device_, VKScheduler& scheduler_, + VKDescriptorPool& descriptor_pool_, + VKUpdateDescriptorQueue& update_descriptor_queue_, + const SPIRVShader& shader_); ~VKComputePipeline(); VkDescriptorSet CommitDescriptorSet(); @@ -48,7 +48,7 @@ private: vk::Pipeline CreatePipeline() const; - const VKDevice& device; + const Device& device; VKScheduler& scheduler; ShaderEntries entries; diff --git a/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp b/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp index f38e089d5..ef9fb5910 100644 --- a/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp +++ b/src/video_core/renderer_vulkan/vk_descriptor_pool.cpp @@ -6,10 +6,10 @@ #include "common/common_types.h" #include "video_core/renderer_vulkan/vk_descriptor_pool.h" -#include "video_core/renderer_vulkan/vk_device.h" #include "video_core/renderer_vulkan/vk_resource_pool.h" #include "video_core/renderer_vulkan/vk_scheduler.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { @@ -32,7 +32,7 @@ void DescriptorAllocator::Allocate(std::size_t begin, 
std::size_t end) { descriptors_allocations.push_back(descriptor_pool.AllocateDescriptors(layout, end - begin)); } -VKDescriptorPool::VKDescriptorPool(const VKDevice& device_, VKScheduler& scheduler) +VKDescriptorPool::VKDescriptorPool(const Device& device_, VKScheduler& scheduler) : device{device_}, master_semaphore{scheduler.GetMasterSemaphore()}, active_pool{ AllocateNewPool()} {} diff --git a/src/video_core/renderer_vulkan/vk_descriptor_pool.h b/src/video_core/renderer_vulkan/vk_descriptor_pool.h index 544f32a20..f892be7be 100644 --- a/src/video_core/renderer_vulkan/vk_descriptor_pool.h +++ b/src/video_core/renderer_vulkan/vk_descriptor_pool.h @@ -7,11 +7,11 @@ #include <vector> #include "video_core/renderer_vulkan/vk_resource_pool.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { -class VKDevice; +class Device; class VKDescriptorPool; class VKScheduler; @@ -39,7 +39,7 @@ class VKDescriptorPool final { friend DescriptorAllocator; public: - explicit VKDescriptorPool(const VKDevice& device, VKScheduler& scheduler); + explicit VKDescriptorPool(const Device& device, VKScheduler& scheduler); ~VKDescriptorPool(); VKDescriptorPool(const VKDescriptorPool&) = delete; @@ -50,7 +50,7 @@ private: vk::DescriptorSets AllocateDescriptors(VkDescriptorSetLayout layout, std::size_t count); - const VKDevice& device; + const Device& device; MasterSemaphore& master_semaphore; std::vector<vk::DescriptorPool> pools; diff --git a/src/video_core/renderer_vulkan/vk_fence_manager.cpp b/src/video_core/renderer_vulkan/vk_fence_manager.cpp index 5babbdd0b..4c5bc0aa1 100644 --- a/src/video_core/renderer_vulkan/vk_fence_manager.cpp +++ b/src/video_core/renderer_vulkan/vk_fence_manager.cpp @@ -6,20 +6,21 @@ #include <thread> #include "video_core/renderer_vulkan/vk_buffer_cache.h" -#include "video_core/renderer_vulkan/vk_device.h" #include "video_core/renderer_vulkan/vk_fence_manager.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_texture_cache.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { -InnerFence::InnerFence(const VKDevice& device, VKScheduler& scheduler, u32 payload, bool is_stubbed) - : VideoCommon::FenceBase(payload, is_stubbed), device{device}, scheduler{scheduler} {} +InnerFence::InnerFence(const Device& device_, VKScheduler& scheduler_, u32 payload_, + bool is_stubbed_) + : FenceBase{payload_, is_stubbed_}, device{device_}, scheduler{scheduler_} {} -InnerFence::InnerFence(const VKDevice& device, VKScheduler& scheduler, GPUVAddr address, - u32 payload, bool is_stubbed) - : VideoCommon::FenceBase(address, payload, is_stubbed), device{device}, scheduler{scheduler} {} +InnerFence::InnerFence(const Device& device_, VKScheduler& scheduler_, GPUVAddr address_, + u32 payload_, bool is_stubbed_) + : FenceBase{address_, payload_, is_stubbed_}, device{device_}, scheduler{scheduler_} {} InnerFence::~InnerFence() = default; @@ -71,11 +72,11 @@ bool InnerFence::IsEventSignalled() const { } } -VKFenceManager::VKFenceManager(VideoCore::RasterizerInterface& rasterizer, Tegra::GPU& gpu, - Tegra::MemoryManager& memory_manager, VKTextureCache& texture_cache, - VKBufferCache& buffer_cache, VKQueryCache& query_cache, - const VKDevice& device_, VKScheduler& scheduler_) - : GenericFenceManager(rasterizer, gpu, texture_cache, buffer_cache, query_cache), 
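Context for the `InnerFence` changes above: the fence is backed by a `vk::Event` that the GPU signals from the command stream while the CPU polls it, which is what `IsEventSignalled` checks. A sketch of the underlying primitive with the raw API (hypothetical helper names):

```cpp
#include <vulkan/vulkan.h>

// GPU side: signal the event once all prior work in the submission has
// drained past the bottom of the pipe.
void QueueSignal(VkCommandBuffer cmdbuf, VkEvent event) {
    vkCmdSetEvent(cmdbuf, event, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT);
}

// CPU side: non-blocking status check, mirroring an IsEventSignalled-style query.
bool IsSignalled(VkDevice device, VkEvent event) {
    return vkGetEventStatus(device, event) == VK_EVENT_SET;
}
```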
+VKFenceManager::VKFenceManager(VideoCore::RasterizerInterface& rasterizer_, Tegra::GPU& gpu_, + Tegra::MemoryManager& memory_manager_, TextureCache& texture_cache_, + VKBufferCache& buffer_cache_, VKQueryCache& query_cache_, + const Device& device_, VKScheduler& scheduler_) + : GenericFenceManager{rasterizer_, gpu_, texture_cache_, buffer_cache_, query_cache_}, device{device_}, scheduler{scheduler_} {} Fence VKFenceManager::CreateFence(u32 value, bool is_stubbed) { diff --git a/src/video_core/renderer_vulkan/vk_fence_manager.h b/src/video_core/renderer_vulkan/vk_fence_manager.h index 1547d6d30..6b51e4587 100644 --- a/src/video_core/renderer_vulkan/vk_fence_manager.h +++ b/src/video_core/renderer_vulkan/vk_fence_manager.h @@ -8,7 +8,8 @@ #include "video_core/fence_manager.h" #include "video_core/renderer_vulkan/vk_buffer_cache.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/renderer_vulkan/vk_texture_cache.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Core { class System; @@ -20,18 +21,17 @@ class RasterizerInterface; namespace Vulkan { +class Device; class VKBufferCache; -class VKDevice; class VKQueryCache; class VKScheduler; -class VKTextureCache; class InnerFence : public VideoCommon::FenceBase { public: - explicit InnerFence(const VKDevice& device, VKScheduler& scheduler, u32 payload, - bool is_stubbed); - explicit InnerFence(const VKDevice& device, VKScheduler& scheduler, GPUVAddr address, - u32 payload, bool is_stubbed); + explicit InnerFence(const Device& device_, VKScheduler& scheduler_, u32 payload_, + bool is_stubbed_); + explicit InnerFence(const Device& device_, VKScheduler& scheduler_, GPUVAddr address_, + u32 payload_, bool is_stubbed_); ~InnerFence(); void Queue(); @@ -43,7 +43,7 @@ public: private: bool IsEventSignalled() const; - const VKDevice& device; + const Device& device; VKScheduler& scheduler; vk::Event event; u64 ticks = 0; @@ -51,14 +51,14 @@ private: using Fence = std::shared_ptr<InnerFence>; using GenericFenceManager = - VideoCommon::FenceManager<Fence, VKTextureCache, VKBufferCache, VKQueryCache>; + VideoCommon::FenceManager<Fence, TextureCache, VKBufferCache, VKQueryCache>; class VKFenceManager final : public GenericFenceManager { public: - explicit VKFenceManager(VideoCore::RasterizerInterface& rasterizer, Tegra::GPU& gpu, - Tegra::MemoryManager& memory_manager, VKTextureCache& texture_cache, - VKBufferCache& buffer_cache, VKQueryCache& query_cache, - const VKDevice& device, VKScheduler& scheduler); + explicit VKFenceManager(VideoCore::RasterizerInterface& rasterizer_, Tegra::GPU& gpu_, + Tegra::MemoryManager& memory_manager_, TextureCache& texture_cache_, + VKBufferCache& buffer_cache_, VKQueryCache& query_cache_, + const Device& device_, VKScheduler& scheduler_); protected: Fence CreateFence(u32 value, bool is_stubbed) override; @@ -68,7 +68,7 @@ protected: void WaitFence(Fence& fence) override; private: - const VKDevice& device; + const Device& device; VKScheduler& scheduler; }; diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index a4b9e7ef5..a5214d0bc 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -12,13 +12,12 @@ #include "video_core/renderer_vulkan/fixed_pipeline_state.h" #include "video_core/renderer_vulkan/maxwell_to_vk.h" #include "video_core/renderer_vulkan/vk_descriptor_pool.h" -#include "video_core/renderer_vulkan/vk_device.h" #include 
"video_core/renderer_vulkan/vk_graphics_pipeline.h" #include "video_core/renderer_vulkan/vk_pipeline_cache.h" -#include "video_core/renderer_vulkan/vk_renderpass_cache.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_update_descriptor.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { @@ -69,23 +68,45 @@ VkViewportSwizzleNV UnpackViewportSwizzle(u16 swizzle) { }; } +VkSampleCountFlagBits ConvertMsaaMode(Tegra::Texture::MsaaMode msaa_mode) { + switch (msaa_mode) { + case Tegra::Texture::MsaaMode::Msaa1x1: + return VK_SAMPLE_COUNT_1_BIT; + case Tegra::Texture::MsaaMode::Msaa2x1: + case Tegra::Texture::MsaaMode::Msaa2x1_D3D: + return VK_SAMPLE_COUNT_2_BIT; + case Tegra::Texture::MsaaMode::Msaa2x2: + case Tegra::Texture::MsaaMode::Msaa2x2_VC4: + case Tegra::Texture::MsaaMode::Msaa2x2_VC12: + return VK_SAMPLE_COUNT_4_BIT; + case Tegra::Texture::MsaaMode::Msaa4x2: + case Tegra::Texture::MsaaMode::Msaa4x2_D3D: + case Tegra::Texture::MsaaMode::Msaa4x2_VC8: + case Tegra::Texture::MsaaMode::Msaa4x2_VC24: + return VK_SAMPLE_COUNT_8_BIT; + case Tegra::Texture::MsaaMode::Msaa4x4: + return VK_SAMPLE_COUNT_16_BIT; + default: + UNREACHABLE_MSG("Invalid msaa_mode={}", static_cast<int>(msaa_mode)); + return VK_SAMPLE_COUNT_1_BIT; + } +} + } // Anonymous namespace -VKGraphicsPipeline::VKGraphicsPipeline(const VKDevice& device, VKScheduler& scheduler, - VKDescriptorPool& descriptor_pool, - VKUpdateDescriptorQueue& update_descriptor_queue, - VKRenderPassCache& renderpass_cache, +VKGraphicsPipeline::VKGraphicsPipeline(const Device& device_, VKScheduler& scheduler_, + VKDescriptorPool& descriptor_pool_, + VKUpdateDescriptorQueue& update_descriptor_queue_, const GraphicsPipelineCacheKey& key, vk::Span<VkDescriptorSetLayoutBinding> bindings, - const SPIRVProgram& program) - : device{device}, scheduler{scheduler}, cache_key{key}, hash{cache_key.Hash()}, + const SPIRVProgram& program, u32 num_color_buffers) + : device{device_}, scheduler{scheduler_}, cache_key{key}, hash{cache_key.Hash()}, descriptor_set_layout{CreateDescriptorSetLayout(bindings)}, - descriptor_allocator{descriptor_pool, *descriptor_set_layout}, - update_descriptor_queue{update_descriptor_queue}, layout{CreatePipelineLayout()}, - descriptor_template{CreateDescriptorUpdateTemplate(program)}, modules{CreateShaderModules( - program)}, - renderpass{renderpass_cache.GetRenderPass(cache_key.renderpass_params)}, - pipeline{CreatePipeline(cache_key.renderpass_params, program)} {} + descriptor_allocator{descriptor_pool_, *descriptor_set_layout}, + update_descriptor_queue{update_descriptor_queue_}, layout{CreatePipelineLayout()}, + descriptor_template{CreateDescriptorUpdateTemplate(program)}, + modules(CreateShaderModules(program)), + pipeline(CreatePipeline(program, cache_key.renderpass, num_color_buffers)) {} VKGraphicsPipeline::~VKGraphicsPipeline() = default; @@ -159,10 +180,11 @@ std::vector<vk::ShaderModule> VKGraphicsPipeline::CreateShaderModules( .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, .pNext = nullptr, .flags = 0, + .codeSize = 0, }; - std::vector<vk::ShaderModule> modules; - modules.reserve(Maxwell::MaxShaderStage); + std::vector<vk::ShaderModule> shader_modules; + shader_modules.reserve(Maxwell::MaxShaderStage); for (std::size_t i = 0; i < Maxwell::MaxShaderStage; ++i) { const auto& stage = program[i]; if (!stage) { @@ -173,13 +195,14 @@ 
std::vector<vk::ShaderModule> VKGraphicsPipeline::CreateShaderModules( ci.codeSize = stage->code.size() * sizeof(u32); ci.pCode = stage->code.data(); - modules.push_back(device.GetLogical().CreateShaderModule(ci)); + shader_modules.push_back(device.GetLogical().CreateShaderModule(ci)); } - return modules; + return shader_modules; } -vk::Pipeline VKGraphicsPipeline::CreatePipeline(const RenderPassParams& renderpass_params, - const SPIRVProgram& program) const { +vk::Pipeline VKGraphicsPipeline::CreatePipeline(const SPIRVProgram& program, + VkRenderPass renderpass, + u32 num_color_buffers) const { const auto& state = cache_key.fixed_state; const auto& viewport_swizzles = state.viewport_swizzles; @@ -189,11 +212,7 @@ vk::Pipeline VKGraphicsPipeline::CreatePipeline(const RenderPassParams& renderpa // state is ignored dynamic.raw1 = 0; dynamic.raw2 = 0; - for (FixedPipelineState::VertexBinding& binding : dynamic.vertex_bindings) { - // Enable all vertex bindings - binding.raw = 0; - binding.enabled.Assign(1); - } + dynamic.vertex_strides.fill(0); } else { dynamic = state.dynamic_state; } @@ -201,19 +220,16 @@ vk::Pipeline VKGraphicsPipeline::CreatePipeline(const RenderPassParams& renderpa std::vector<VkVertexInputBindingDescription> vertex_bindings; std::vector<VkVertexInputBindingDivisorDescriptionEXT> vertex_binding_divisors; for (std::size_t index = 0; index < Maxwell::NumVertexArrays; ++index) { - const auto& binding = dynamic.vertex_bindings[index]; - if (!binding.enabled) { + if (state.attributes[index].binding_index_enabled == 0) { continue; } const bool instanced = state.binding_divisors[index] != 0; const auto rate = instanced ? VK_VERTEX_INPUT_RATE_INSTANCE : VK_VERTEX_INPUT_RATE_VERTEX; - vertex_bindings.push_back({ .binding = static_cast<u32>(index), - .stride = binding.stride, + .stride = dynamic.vertex_strides[index], .inputRate = rate, }); - if (instanced) { vertex_binding_divisors.push_back({ .binding = static_cast<u32>(index), @@ -229,7 +245,7 @@ vk::Pipeline VKGraphicsPipeline::CreatePipeline(const RenderPassParams& renderpa if (!attribute.enabled) { continue; } - if (input_attributes.find(static_cast<u32>(index)) == input_attributes.end()) { + if (!input_attributes.contains(static_cast<u32>(index))) { // Skip attributes not used by the vertex shaders. 
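For the vertex-binding loop above: when `binding_divisors[index]` is non-zero the binding is instanced, and the divisor is forwarded through `VK_EXT_vertex_attribute_divisor`. A self-contained sketch of the structures involved (binding index, stride, and divisor values are illustrative):

```cpp
#include <vulkan/vulkan.h>

// One per-instance binding with a divisor of 2: the buffer advances every
// second instance instead of every vertex.
const VkVertexInputBindingDescription binding{
    .binding = 1,
    .stride = 16,
    .inputRate = VK_VERTEX_INPUT_RATE_INSTANCE,
};
const VkVertexInputBindingDivisorDescriptionEXT divisor{
    .binding = 1,
    .divisor = 2,
};
const VkPipelineVertexInputDivisorStateCreateInfoEXT divisor_ci{
    .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_DIVISOR_STATE_CREATE_INFO_EXT,
    .pNext = nullptr,
    .vertexBindingDivisorCount = 1,
    .pVertexBindingDivisors = &divisor,
};
const VkPipelineVertexInputStateCreateInfo vertex_input_ci{
    .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
    .pNext = &divisor_ci, // chained only when a divisor is actually in use
    .flags = 0,
    .vertexBindingDescriptionCount = 1,
    .pVertexBindingDescriptions = &binding,
    .vertexAttributeDescriptionCount = 0,
    .pVertexAttributeDescriptions = nullptr,
};
```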
continue; } @@ -261,12 +277,12 @@ vk::Pipeline VKGraphicsPipeline::CreatePipeline(const RenderPassParams& renderpa vertex_input_ci.pNext = &input_divisor_ci; } - const auto input_assembly_topology = MaxwellToVK::PrimitiveTopology(device, dynamic.Topology()); + const auto input_assembly_topology = MaxwellToVK::PrimitiveTopology(device, state.topology); const VkPipelineInputAssemblyStateCreateInfo input_assembly_ci{ .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO, .pNext = nullptr, .flags = 0, - .topology = MaxwellToVK::PrimitiveTopology(device, dynamic.Topology()), + .topology = MaxwellToVK::PrimitiveTopology(device, state.topology), .primitiveRestartEnable = state.primitive_restart_enable != 0 && SupportsPrimitiveRestart(input_assembly_topology), }; @@ -289,8 +305,7 @@ vk::Pipeline VKGraphicsPipeline::CreatePipeline(const RenderPassParams& renderpa }; std::array<VkViewportSwizzleNV, Maxwell::NumViewports> swizzles; - std::transform(viewport_swizzles.begin(), viewport_swizzles.end(), swizzles.begin(), - UnpackViewportSwizzle); + std::ranges::transform(viewport_swizzles, swizzles.begin(), UnpackViewportSwizzle); VkPipelineViewportSwizzleStateCreateInfoNV swizzle_ci{ .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_SWIZZLE_STATE_CREATE_INFO_NV, .pNext = nullptr, @@ -325,7 +340,7 @@ vk::Pipeline VKGraphicsPipeline::CreatePipeline(const RenderPassParams& renderpa .sType = VK_STRUCTURE_TYPE_PIPELINE_MULTISAMPLE_STATE_CREATE_INFO, .pNext = nullptr, .flags = 0, - .rasterizationSamples = VK_SAMPLE_COUNT_1_BIT, + .rasterizationSamples = ConvertMsaaMode(state.msaa_mode), .sampleShadingEnable = VK_FALSE, .minSampleShading = 0.0f, .pSampleMask = nullptr, @@ -351,8 +366,7 @@ vk::Pipeline VKGraphicsPipeline::CreatePipeline(const RenderPassParams& renderpa }; std::array<VkPipelineColorBlendAttachmentState, Maxwell::NumRenderTargets> cb_attachments; - const auto num_attachments = static_cast<std::size_t>(renderpass_params.num_color_attachments); - for (std::size_t index = 0; index < num_attachments; ++index) { + for (std::size_t index = 0; index < num_color_buffers; ++index) { static constexpr std::array COMPONENT_TABLE{ VK_COLOR_COMPONENT_R_BIT, VK_COLOR_COMPONENT_G_BIT, @@ -386,8 +400,9 @@ vk::Pipeline VKGraphicsPipeline::CreatePipeline(const RenderPassParams& renderpa .flags = 0, .logicOpEnable = VK_FALSE, .logicOp = VK_LOGIC_OP_COPY, - .attachmentCount = static_cast<u32>(num_attachments), + .attachmentCount = num_color_buffers, .pAttachments = cb_attachments.data(), + .blendConstants = {}, }; std::vector dynamic_states{ @@ -400,7 +415,6 @@ vk::Pipeline VKGraphicsPipeline::CreatePipeline(const RenderPassParams& renderpa static constexpr std::array extended{ VK_DYNAMIC_STATE_CULL_MODE_EXT, VK_DYNAMIC_STATE_FRONT_FACE_EXT, - VK_DYNAMIC_STATE_PRIMITIVE_TOPOLOGY_EXT, VK_DYNAMIC_STATE_VERTEX_INPUT_BINDING_STRIDE_EXT, VK_DYNAMIC_STATE_DEPTH_TEST_ENABLE_EXT, VK_DYNAMIC_STATE_DEPTH_WRITE_ENABLE_EXT, @@ -446,8 +460,7 @@ vk::Pipeline VKGraphicsPipeline::CreatePipeline(const RenderPassParams& renderpa stage_ci.pNext = &subgroup_size_ci; } } - - const VkGraphicsPipelineCreateInfo ci{ + return device.GetLogical().CreateGraphicsPipeline(VkGraphicsPipelineCreateInfo{ .sType = VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO, .pNext = nullptr, .flags = 0, @@ -467,8 +480,7 @@ vk::Pipeline VKGraphicsPipeline::CreatePipeline(const RenderPassParams& renderpa .subpass = 0, .basePipelineHandle = nullptr, .basePipelineIndex = 0, - }; - return device.GetLogical().CreateGraphicsPipeline(ci); + }); } } // 
namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h index 58aa35efd..8b6a98fe0 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.h +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.h @@ -8,20 +8,19 @@ #include <optional> #include <vector> +#include "common/common_types.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/renderer_vulkan/fixed_pipeline_state.h" #include "video_core/renderer_vulkan/vk_descriptor_pool.h" -#include "video_core/renderer_vulkan/vk_renderpass_cache.h" #include "video_core/renderer_vulkan/vk_shader_decompiler.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { using Maxwell = Tegra::Engines::Maxwell3D::Regs; struct GraphicsPipelineCacheKey { - RenderPassParams renderpass_params; - u32 padding; + VkRenderPass renderpass; std::array<GPUVAddr, Maxwell::MaxShaderProgram> shaders; FixedPipelineState fixed_state; @@ -34,16 +33,15 @@ struct GraphicsPipelineCacheKey { } std::size_t Size() const noexcept { - return sizeof(renderpass_params) + sizeof(padding) + sizeof(shaders) + fixed_state.Size(); + return sizeof(renderpass) + sizeof(shaders) + fixed_state.Size(); } }; static_assert(std::has_unique_object_representations_v<GraphicsPipelineCacheKey>); static_assert(std::is_trivially_copyable_v<GraphicsPipelineCacheKey>); static_assert(std::is_trivially_constructible_v<GraphicsPipelineCacheKey>); +class Device; class VKDescriptorPool; -class VKDevice; -class VKRenderPassCache; class VKScheduler; class VKUpdateDescriptorQueue; @@ -51,13 +49,12 @@ using SPIRVProgram = std::array<std::optional<SPIRVShader>, Maxwell::MaxShaderSt class VKGraphicsPipeline final { public: - explicit VKGraphicsPipeline(const VKDevice& device, VKScheduler& scheduler, + explicit VKGraphicsPipeline(const Device& device_, VKScheduler& scheduler_, VKDescriptorPool& descriptor_pool, - VKUpdateDescriptorQueue& update_descriptor_queue, - VKRenderPassCache& renderpass_cache, + VKUpdateDescriptorQueue& update_descriptor_queue_, const GraphicsPipelineCacheKey& key, vk::Span<VkDescriptorSetLayoutBinding> bindings, - const SPIRVProgram& program); + const SPIRVProgram& program, u32 num_color_buffers); ~VKGraphicsPipeline(); VkDescriptorSet CommitDescriptorSet(); @@ -70,10 +67,6 @@ public: return *layout; } - VkRenderPass GetRenderPass() const { - return renderpass; - } - GraphicsPipelineCacheKey GetCacheKey() const { return cache_key; } @@ -89,10 +82,10 @@ private: std::vector<vk::ShaderModule> CreateShaderModules(const SPIRVProgram& program) const; - vk::Pipeline CreatePipeline(const RenderPassParams& renderpass_params, - const SPIRVProgram& program) const; + vk::Pipeline CreatePipeline(const SPIRVProgram& program, VkRenderPass renderpass, + u32 num_color_buffers) const; - const VKDevice& device; + const Device& device; VKScheduler& scheduler; const GraphicsPipelineCacheKey cache_key; const u64 hash; @@ -104,7 +97,6 @@ private: vk::DescriptorUpdateTemplateKHR descriptor_template; std::vector<vk::ShaderModule> modules; - VkRenderPass renderpass; vk::Pipeline pipeline; }; diff --git a/src/video_core/renderer_vulkan/vk_image.cpp b/src/video_core/renderer_vulkan/vk_image.cpp deleted file mode 100644 index 1c418ea17..000000000 --- a/src/video_core/renderer_vulkan/vk_image.cpp +++ /dev/null @@ -1,135 +0,0 @@ -// Copyright 2018 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt 
file included. - -#include <memory> -#include <vector> - -#include "common/assert.h" -#include "video_core/renderer_vulkan/vk_device.h" -#include "video_core/renderer_vulkan/vk_image.h" -#include "video_core/renderer_vulkan/vk_scheduler.h" -#include "video_core/renderer_vulkan/wrapper.h" - -namespace Vulkan { - -VKImage::VKImage(const VKDevice& device, VKScheduler& scheduler, const VkImageCreateInfo& image_ci, - VkImageAspectFlags aspect_mask) - : device{device}, scheduler{scheduler}, format{image_ci.format}, aspect_mask{aspect_mask}, - image_num_layers{image_ci.arrayLayers}, image_num_levels{image_ci.mipLevels} { - UNIMPLEMENTED_IF_MSG(image_ci.queueFamilyIndexCount != 0, - "Queue family tracking is not implemented"); - - image = device.GetLogical().CreateImage(image_ci); - - const u32 num_ranges = image_num_layers * image_num_levels; - barriers.resize(num_ranges); - subrange_states.resize(num_ranges, {{}, image_ci.initialLayout}); -} - -VKImage::~VKImage() = default; - -void VKImage::Transition(u32 base_layer, u32 num_layers, u32 base_level, u32 num_levels, - VkPipelineStageFlags new_stage_mask, VkAccessFlags new_access, - VkImageLayout new_layout) { - if (!HasChanged(base_layer, num_layers, base_level, num_levels, new_access, new_layout)) { - return; - } - - std::size_t cursor = 0; - for (u32 layer_it = 0; layer_it < num_layers; ++layer_it) { - for (u32 level_it = 0; level_it < num_levels; ++level_it, ++cursor) { - const u32 layer = base_layer + layer_it; - const u32 level = base_level + level_it; - auto& state = GetSubrangeState(layer, level); - auto& barrier = barriers[cursor]; - barrier.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER; - barrier.pNext = nullptr; - barrier.srcAccessMask = state.access; - barrier.dstAccessMask = new_access; - barrier.oldLayout = state.layout; - barrier.newLayout = new_layout; - barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.image = *image; - barrier.subresourceRange.aspectMask = aspect_mask; - barrier.subresourceRange.baseMipLevel = level; - barrier.subresourceRange.levelCount = 1; - barrier.subresourceRange.baseArrayLayer = layer; - barrier.subresourceRange.layerCount = 1; - state.access = new_access; - state.layout = new_layout; - } - } - - scheduler.RequestOutsideRenderPassOperationContext(); - - scheduler.Record([barriers = barriers, cursor](vk::CommandBuffer cmdbuf) { - // TODO(Rodrigo): Implement a way to use the latest stage across subresources. 
- cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, - VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, {}, {}, - vk::Span(barriers.data(), cursor)); - }); -} - -bool VKImage::HasChanged(u32 base_layer, u32 num_layers, u32 base_level, u32 num_levels, - VkAccessFlags new_access, VkImageLayout new_layout) noexcept { - const bool is_full_range = base_layer == 0 && num_layers == image_num_layers && - base_level == 0 && num_levels == image_num_levels; - if (!is_full_range) { - state_diverged = true; - } - - if (!state_diverged) { - auto& state = GetSubrangeState(0, 0); - if (state.access != new_access || state.layout != new_layout) { - return true; - } - } - - for (u32 layer_it = 0; layer_it < num_layers; ++layer_it) { - for (u32 level_it = 0; level_it < num_levels; ++level_it) { - const u32 layer = base_layer + layer_it; - const u32 level = base_level + level_it; - auto& state = GetSubrangeState(layer, level); - if (state.access != new_access || state.layout != new_layout) { - return true; - } - } - } - return false; -} - -void VKImage::CreatePresentView() { - // Image type has to be 2D to be presented. - present_view = device.GetLogical().CreateImageView({ - .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, - .pNext = nullptr, - .flags = 0, - .image = *image, - .viewType = VK_IMAGE_VIEW_TYPE_2D, - .format = format, - .components = - { - .r = VK_COMPONENT_SWIZZLE_IDENTITY, - .g = VK_COMPONENT_SWIZZLE_IDENTITY, - .b = VK_COMPONENT_SWIZZLE_IDENTITY, - .a = VK_COMPONENT_SWIZZLE_IDENTITY, - }, - .subresourceRange = - { - .aspectMask = aspect_mask, - .baseMipLevel = 0, - .levelCount = 1, - .baseArrayLayer = 0, - .layerCount = 1, - }, - }); -} - -VKImage::SubrangeState& VKImage::GetSubrangeState(u32 layer, u32 level) noexcept { - return subrange_states[static_cast<std::size_t>(layer * image_num_levels) + - static_cast<std::size_t>(level)]; -} - -} // namespace Vulkan
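The deleted `VKImage` tracked per-subresource access masks and layouts and emitted one `VkImageMemoryBarrier` per (layer, level) pair; that responsibility moves to the new texture cache. For reference, a minimal whole-image transition of the kind `Transition` recorded, using the same conservative all-commands stage masks as the removed code:

```cpp
#include <vulkan/vulkan.h>

// Records a layout transition covering every mip level and array layer.
void TransitionImage(VkCommandBuffer cmdbuf, VkImage image, VkImageAspectFlags aspect,
                     VkImageLayout old_layout, VkImageLayout new_layout) {
    const VkImageMemoryBarrier barrier{
        .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
        .pNext = nullptr,
        .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT,
        .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT,
        .oldLayout = old_layout,
        .newLayout = new_layout,
        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
        .image = image,
        .subresourceRange =
            {
                .aspectMask = aspect,
                .baseMipLevel = 0,
                .levelCount = VK_REMAINING_MIP_LEVELS,
                .baseArrayLayer = 0,
                .layerCount = VK_REMAINING_ARRAY_LAYERS,
            },
    };
    vkCmdPipelineBarrier(cmdbuf, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
                         VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, 0, nullptr, 0, nullptr, 1,
                         &barrier);
}
```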
\ No newline at end of file diff --git a/src/video_core/renderer_vulkan/vk_image.h b/src/video_core/renderer_vulkan/vk_image.h deleted file mode 100644 index b4d7229e5..000000000 --- a/src/video_core/renderer_vulkan/vk_image.h +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright 2018 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include <memory> -#include <vector> - -#include "common/common_types.h" -#include "video_core/renderer_vulkan/wrapper.h" - -namespace Vulkan { - -class VKDevice; -class VKScheduler; - -class VKImage { -public: - explicit VKImage(const VKDevice& device, VKScheduler& scheduler, - const VkImageCreateInfo& image_ci, VkImageAspectFlags aspect_mask); - ~VKImage(); - - /// Records in the passed command buffer an image transition and updates the state of the image. - void Transition(u32 base_layer, u32 num_layers, u32 base_level, u32 num_levels, - VkPipelineStageFlags new_stage_mask, VkAccessFlags new_access, - VkImageLayout new_layout); - - /// Returns a view compatible with presentation, the image has to be 2D. - VkImageView GetPresentView() { - if (!present_view) { - CreatePresentView(); - } - return *present_view; - } - - /// Returns the Vulkan image handler. - const vk::Image& GetHandle() const { - return image; - } - - /// Returns the Vulkan format for this image. - VkFormat GetFormat() const { - return format; - } - - /// Returns the Vulkan aspect mask. - VkImageAspectFlags GetAspectMask() const { - return aspect_mask; - } - -private: - struct SubrangeState final { - VkAccessFlags access = 0; ///< Current access bits. - VkImageLayout layout = VK_IMAGE_LAYOUT_UNDEFINED; ///< Current image layout. - }; - - bool HasChanged(u32 base_layer, u32 num_layers, u32 base_level, u32 num_levels, - VkAccessFlags new_access, VkImageLayout new_layout) noexcept; - - /// Creates a presentation view. - void CreatePresentView(); - - /// Returns the subrange state for a layer and layer. - SubrangeState& GetSubrangeState(u32 layer, u32 level) noexcept; - - const VKDevice& device; ///< Device handler. - VKScheduler& scheduler; ///< Device scheduler. - - const VkFormat format; ///< Vulkan format. - const VkImageAspectFlags aspect_mask; ///< Vulkan aspect mask. - const u32 image_num_layers; ///< Number of layers. - const u32 image_num_levels; ///< Number of mipmap levels. - - vk::Image image; ///< Image handle. - vk::ImageView present_view; ///< Image view compatible with presentation. - - std::vector<VkImageMemoryBarrier> barriers; ///< Pool of barriers. - std::vector<SubrangeState> subrange_states; ///< Current subrange state. - - bool state_diverged = false; ///< True when subresources mismatch in layout. 
-}; - -} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_master_semaphore.cpp b/src/video_core/renderer_vulkan/vk_master_semaphore.cpp index ae26e558d..56ec5e380 100644 --- a/src/video_core/renderer_vulkan/vk_master_semaphore.cpp +++ b/src/video_core/renderer_vulkan/vk_master_semaphore.cpp @@ -6,15 +6,15 @@ #include <chrono> #include "core/settings.h" -#include "video_core/renderer_vulkan/vk_device.h" #include "video_core/renderer_vulkan/vk_master_semaphore.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { using namespace std::chrono_literals; -MasterSemaphore::MasterSemaphore(const VKDevice& device) { +MasterSemaphore::MasterSemaphore(const Device& device) { static constexpr VkSemaphoreTypeCreateInfoKHR semaphore_type_ci{ .sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO_KHR, .pNext = nullptr, diff --git a/src/video_core/renderer_vulkan/vk_master_semaphore.h b/src/video_core/renderer_vulkan/vk_master_semaphore.h index 0e93706d7..f336f1862 100644 --- a/src/video_core/renderer_vulkan/vk_master_semaphore.h +++ b/src/video_core/renderer_vulkan/vk_master_semaphore.h @@ -8,15 +8,15 @@ #include <thread> #include "common/common_types.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { -class VKDevice; +class Device; class MasterSemaphore { public: - explicit MasterSemaphore(const VKDevice& device); + explicit MasterSemaphore(const Device& device); ~MasterSemaphore(); /// Returns the current logical tick. diff --git a/src/video_core/renderer_vulkan/vk_memory_manager.cpp b/src/video_core/renderer_vulkan/vk_memory_manager.cpp index 24c8960ac..a6abd0eee 100644 --- a/src/video_core/renderer_vulkan/vk_memory_manager.cpp +++ b/src/video_core/renderer_vulkan/vk_memory_manager.cpp @@ -11,9 +11,9 @@ #include "common/assert.h" #include "common/common_types.h" #include "common/logging/log.h" -#include "video_core/renderer_vulkan/vk_device.h" #include "video_core/renderer_vulkan/vk_memory_manager.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { @@ -29,10 +29,10 @@ u64 GetAllocationChunkSize(u64 required_size) { class VKMemoryAllocation final { public: - explicit VKMemoryAllocation(const VKDevice& device, vk::DeviceMemory memory, - VkMemoryPropertyFlags properties, u64 allocation_size, u32 type) - : device{device}, memory{std::move(memory)}, properties{properties}, - allocation_size{allocation_size}, shifted_type{ShiftType(type)} {} + explicit VKMemoryAllocation(const Device& device_, vk::DeviceMemory memory_, + VkMemoryPropertyFlags properties_, u64 allocation_size_, u32 type_) + : device{device_}, memory{std::move(memory_)}, properties{properties_}, + allocation_size{allocation_size_}, shifted_type{ShiftType(type_)} {} VKMemoryCommit Commit(VkDeviceSize commit_size, VkDeviceSize alignment) { auto found = TryFindFreeSection(free_iterator, allocation_size, @@ -104,7 +104,7 @@ private: return std::nullopt; } - const VKDevice& device; ///< Vulkan device. + const Device& device; ///< Vulkan device. const vk::DeviceMemory memory; ///< Vulkan memory allocation handler. const VkMemoryPropertyFlags properties; ///< Vulkan properties. const u64 allocation_size; ///< Size of this allocation. 
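`MasterSemaphore` above wraps a timeline semaphore (`VK_KHR_timeline_semaphore`, core in Vulkan 1.2): one monotonically increasing counter in place of per-submission binary fences. A sketch using the core 1.2 entry points (the diff itself uses the KHR aliases):

```cpp
#include <cstdint>
#include <vulkan/vulkan.h>

// Creates a timeline semaphore starting at tick 0.
VkSemaphore MakeTimeline(VkDevice device) {
    const VkSemaphoreTypeCreateInfo type_ci{
        .sType = VK_STRUCTURE_TYPE_SEMAPHORE_TYPE_CREATE_INFO,
        .pNext = nullptr,
        .semaphoreType = VK_SEMAPHORE_TYPE_TIMELINE,
        .initialValue = 0,
    };
    const VkSemaphoreCreateInfo ci{
        .sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO,
        .pNext = &type_ci,
        .flags = 0,
    };
    VkSemaphore semaphore = VK_NULL_HANDLE;
    vkCreateSemaphore(device, &ci, nullptr, &semaphore);
    return semaphore;
}

// Blocks the host until the GPU has signalled at least `value`.
void WaitFor(VkDevice device, VkSemaphore semaphore, std::uint64_t value) {
    const VkSemaphoreWaitInfo wait_info{
        .sType = VK_STRUCTURE_TYPE_SEMAPHORE_WAIT_INFO,
        .pNext = nullptr,
        .flags = 0,
        .semaphoreCount = 1,
        .pSemaphores = &semaphore,
        .pValues = &value,
    };
    vkWaitSemaphores(device, &wait_info, UINT64_MAX);
}
```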
@@ -117,8 +117,8 @@ private: std::vector<const VKMemoryCommitImpl*> commits; }; -VKMemoryManager::VKMemoryManager(const VKDevice& device) - : device{device}, properties{device.GetPhysical().GetMemoryProperties()} {} +VKMemoryManager::VKMemoryManager(const Device& device_) + : device{device_}, properties{device_.GetPhysical().GetMemoryProperties()} {} VKMemoryManager::~VKMemoryManager() = default; @@ -207,16 +207,16 @@ VKMemoryCommit VKMemoryManager::TryAllocCommit(const VkMemoryRequirements& requi return {}; } -VKMemoryCommitImpl::VKMemoryCommitImpl(const VKDevice& device, VKMemoryAllocation* allocation, - const vk::DeviceMemory& memory, u64 begin, u64 end) - : device{device}, memory{memory}, interval{begin, end}, allocation{allocation} {} +VKMemoryCommitImpl::VKMemoryCommitImpl(const Device& device_, VKMemoryAllocation* allocation_, + const vk::DeviceMemory& memory_, u64 begin_, u64 end_) + : device{device_}, memory{memory_}, interval{begin_, end_}, allocation{allocation_} {} VKMemoryCommitImpl::~VKMemoryCommitImpl() { allocation->Free(this); } MemoryMap VKMemoryCommitImpl::Map(u64 size, u64 offset_) const { - return MemoryMap{this, memory.Map(interval.first + offset_, size)}; + return MemoryMap(this, std::span<u8>(memory.Map(interval.first + offset_, size), size)); } void VKMemoryCommitImpl::Unmap() const { diff --git a/src/video_core/renderer_vulkan/vk_memory_manager.h b/src/video_core/renderer_vulkan/vk_memory_manager.h index 1af88e3d4..2452bca4e 100644 --- a/src/video_core/renderer_vulkan/vk_memory_manager.h +++ b/src/video_core/renderer_vulkan/vk_memory_manager.h @@ -5,15 +5,16 @@ #pragma once #include <memory> +#include <span> #include <utility> #include <vector> #include "common/common_types.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { +class Device; class MemoryMap; -class VKDevice; class VKMemoryAllocation; class VKMemoryCommitImpl; @@ -21,7 +22,7 @@ using VKMemoryCommit = std::unique_ptr<VKMemoryCommitImpl>; class VKMemoryManager final { public: - explicit VKMemoryManager(const VKDevice& device); + explicit VKMemoryManager(const Device& device_); VKMemoryManager(const VKMemoryManager&) = delete; ~VKMemoryManager(); @@ -48,7 +49,7 @@ private: VKMemoryCommit TryAllocCommit(const VkMemoryRequirements& requirements, VkMemoryPropertyFlags wanted_properties); - const VKDevice& device; ///< Device handler. + const Device& device; ///< Device handler. const VkPhysicalDeviceMemoryProperties properties; ///< Physical device properties. std::vector<std::unique_ptr<VKMemoryAllocation>> allocations; ///< Current allocations. }; @@ -58,8 +59,8 @@ class VKMemoryCommitImpl final { friend MemoryMap; public: - explicit VKMemoryCommitImpl(const VKDevice& device, VKMemoryAllocation* allocation, - const vk::DeviceMemory& memory, u64 begin, u64 end); + explicit VKMemoryCommitImpl(const Device& device_, VKMemoryAllocation* allocation_, + const vk::DeviceMemory& memory_, u64 begin_, u64 end_); ~VKMemoryCommitImpl(); /// Maps a memory region and returns a pointer to it. @@ -84,7 +85,7 @@ private: /// Unmaps memory. void Unmap() const; - const VKDevice& device; ///< Vulkan device. + const Device& device; ///< Vulkan device. const vk::DeviceMemory& memory; ///< Vulkan device memory handler. std::pair<u64, u64> interval{}; ///< Interval where the commit exists. VKMemoryAllocation* allocation{}; ///< Pointer to the large memory allocation. @@ -93,8 +94,8 @@ private: /// Holds ownership of a memory map. 
class MemoryMap final { public: - explicit MemoryMap(const VKMemoryCommitImpl* commit, u8* address) - : commit{commit}, address{address} {} + explicit MemoryMap(const VKMemoryCommitImpl* commit_, std::span<u8> span_) + : commit{commit_}, span{span_} {} ~MemoryMap() { if (commit) { @@ -108,19 +109,24 @@ public: commit = nullptr; } + /// Returns a span to the memory map. + [[nodiscard]] std::span<u8> Span() const noexcept { + return span; + } + /// Returns the address of the memory map. - u8* GetAddress() const { - return address; + [[nodiscard]] u8* Address() const noexcept { + return span.data(); } /// Returns the address of the memory map; - operator u8*() const { - return address; + [[nodiscard]] operator u8*() const noexcept { + return span.data(); } private: const VKMemoryCommitImpl* commit{}; ///< Mapped memory commit. - u8* address{}; ///< Address to the mapped memory. + std::span<u8> span; ///< Address to the mapped memory. }; } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 5c038f4bc..02282e36f 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -7,6 +7,8 @@ #include <memory> #include <vector> +#include "common/bit_cast.h" +#include "common/cityhash.h" #include "common/microprofile.h" #include "core/core.h" #include "core/memory.h" @@ -17,18 +19,17 @@ #include "video_core/renderer_vulkan/maxwell_to_vk.h" #include "video_core/renderer_vulkan/vk_compute_pipeline.h" #include "video_core/renderer_vulkan/vk_descriptor_pool.h" -#include "video_core/renderer_vulkan/vk_device.h" #include "video_core/renderer_vulkan/vk_graphics_pipeline.h" #include "video_core/renderer_vulkan/vk_pipeline_cache.h" #include "video_core/renderer_vulkan/vk_rasterizer.h" -#include "video_core/renderer_vulkan/vk_renderpass_cache.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_update_descriptor.h" -#include "video_core/renderer_vulkan/wrapper.h" #include "video_core/shader/compiler_settings.h" #include "video_core/shader/memory_util.h" #include "video_core/shader_cache.h" #include "video_core/shader_notify.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { @@ -51,7 +52,9 @@ constexpr VkDescriptorType STORAGE_TEXEL_BUFFER = VK_DESCRIPTOR_TYPE_STORAGE_TEX constexpr VkDescriptorType STORAGE_IMAGE = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; constexpr VideoCommon::Shader::CompilerSettings compiler_settings{ - VideoCommon::Shader::CompileDepth::FullDecompile}; + .depth = VideoCommon::Shader::CompileDepth::FullDecompile, + .disable_else_derivation = true, +}; constexpr std::size_t GetStageFromProgram(std::size_t program) { return program == 0 ? 
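On the `MemoryMap` rework above: storing `std::span<u8>` instead of a bare `u8*` keeps the mapped size attached to the pointer, so writes into the mapping can be clamped rather than trusted. A small sketch (hypothetical helper, not part of the commit):

```cpp
#include <algorithm>
#include <cstdint>
#include <span>

// Copies into a bounds-carrying mapping, clamping to the mapped size;
// a bare u8* cannot express this check.
void Upload(std::span<std::uint8_t> mapped, std::span<const std::uint8_t> src) {
    const std::size_t n = std::min(mapped.size(), src.size());
    std::copy_n(src.begin(), n, mapped.begin());
}
```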
0 : program - 1; @@ -74,7 +77,7 @@ ShaderType GetShaderType(Maxwell::ShaderProgram program) { case Maxwell::ShaderProgram::Fragment: return ShaderType::Fragment; default: - UNIMPLEMENTED_MSG("program={}", static_cast<u32>(program)); + UNIMPLEMENTED_MSG("program={}", program); return ShaderType::Vertex; } } @@ -135,26 +138,24 @@ bool ComputePipelineCacheKey::operator==(const ComputePipelineCacheKey& rhs) con return std::memcmp(&rhs, this, sizeof *this) == 0; } -Shader::Shader(Tegra::Engines::ConstBufferEngineInterface& engine, Tegra::Engines::ShaderType stage, - GPUVAddr gpu_addr_, VAddr cpu_addr, VideoCommon::Shader::ProgramCode program_code_, - u32 main_offset) - : gpu_addr(gpu_addr_), program_code(std::move(program_code_)), registry(stage, engine), - shader_ir(program_code, main_offset, compiler_settings, registry), +Shader::Shader(Tegra::Engines::ConstBufferEngineInterface& engine_, ShaderType stage_, + GPUVAddr gpu_addr_, VAddr cpu_addr_, ProgramCode program_code_, u32 main_offset_) + : gpu_addr(gpu_addr_), program_code(std::move(program_code_)), registry(stage_, engine_), + shader_ir(program_code, main_offset_, compiler_settings, registry), entries(GenerateShaderEntries(shader_ir)) {} Shader::~Shader() = default; -VKPipelineCache::VKPipelineCache(RasterizerVulkan& rasterizer, Tegra::GPU& gpu_, +VKPipelineCache::VKPipelineCache(RasterizerVulkan& rasterizer_, Tegra::GPU& gpu_, Tegra::Engines::Maxwell3D& maxwell3d_, Tegra::Engines::KeplerCompute& kepler_compute_, - Tegra::MemoryManager& gpu_memory_, const VKDevice& device_, + Tegra::MemoryManager& gpu_memory_, const Device& device_, VKScheduler& scheduler_, VKDescriptorPool& descriptor_pool_, - VKUpdateDescriptorQueue& update_descriptor_queue_, - VKRenderPassCache& renderpass_cache_) - : VideoCommon::ShaderCache<Shader>{rasterizer}, gpu{gpu_}, maxwell3d{maxwell3d_}, + VKUpdateDescriptorQueue& update_descriptor_queue_) + : VideoCommon::ShaderCache<Shader>{rasterizer_}, gpu{gpu_}, maxwell3d{maxwell3d_}, kepler_compute{kepler_compute_}, gpu_memory{gpu_memory_}, device{device_}, - scheduler{scheduler_}, descriptor_pool{descriptor_pool_}, - update_descriptor_queue{update_descriptor_queue_}, renderpass_cache{renderpass_cache_} {} + scheduler{scheduler_}, descriptor_pool{descriptor_pool_}, update_descriptor_queue{ + update_descriptor_queue_} {} VKPipelineCache::~VKPipelineCache() = default; @@ -199,7 +200,8 @@ std::array<Shader*, Maxwell::MaxShaderProgram> VKPipelineCache::GetShaders() { } VKGraphicsPipeline* VKPipelineCache::GetGraphicsPipeline( - const GraphicsPipelineCacheKey& key, VideoCommon::Shader::AsyncShaders& async_shaders) { + const GraphicsPipelineCacheKey& key, u32 num_color_buffers, + VideoCommon::Shader::AsyncShaders& async_shaders) { MICROPROFILE_SCOPE(Vulkan_PipelineCache); if (last_graphics_pipeline && last_graphics_key == key) { @@ -215,8 +217,8 @@ VKGraphicsPipeline* VKPipelineCache::GetGraphicsPipeline( LOG_INFO(Render_Vulkan, "Compile 0x{:016X}", key.Hash()); const auto [program, bindings] = DecompileShaders(key.fixed_state); async_shaders.QueueVulkanShader(this, device, scheduler, descriptor_pool, - update_descriptor_queue, renderpass_cache, bindings, - program, key); + update_descriptor_queue, bindings, program, key, + num_color_buffers); } last_graphics_pipeline = pair->second.get(); return last_graphics_pipeline; @@ -229,8 +231,8 @@ VKGraphicsPipeline* VKPipelineCache::GetGraphicsPipeline( LOG_INFO(Render_Vulkan, "Compile 0x{:016X}", key.Hash()); const auto [program, bindings] = DecompileShaders(key.fixed_state); entry 
= std::make_unique<VKGraphicsPipeline>(device, scheduler, descriptor_pool, - update_descriptor_queue, renderpass_cache, key, - bindings, program); + update_descriptor_queue, key, bindings, + program, num_color_buffers); gpu.ShaderNotify().MarkShaderComplete(); } last_graphics_pipeline = entry.get(); @@ -331,8 +333,7 @@ void VKPipelineCache::OnShaderRemoval(Shader* shader) { std::pair<SPIRVProgram, std::vector<VkDescriptorSetLayoutBinding>> VKPipelineCache::DecompileShaders(const FixedPipelineState& fixed_state) { Specialization specialization; - if (fixed_state.dynamic_state.Topology() == Maxwell::PrimitiveTopology::Points || - device.IsExtExtendedDynamicStateSupported()) { + if (fixed_state.topology == Maxwell::PrimitiveTopology::Points) { float point_size; std::memcpy(&point_size, &fixed_state.point_size, sizeof(float)); specialization.point_size = point_size; @@ -344,6 +345,12 @@ VKPipelineCache::DecompileShaders(const FixedPipelineState& fixed_state) { specialization.attribute_types[i] = attribute.Type(); } specialization.ndc_minus_one_to_one = fixed_state.ndc_minus_one_to_one; + specialization.early_fragment_tests = fixed_state.early_z; + + // Alpha test + specialization.alpha_test_func = + FixedPipelineState::UnpackComparisonOp(fixed_state.alpha_test_func.Value()); + specialization.alpha_test_ref = Common::BitCast<float>(fixed_state.alpha_test_ref); SPIRVProgram program; std::vector<VkDescriptorSetLayoutBinding> bindings; diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.h b/src/video_core/renderer_vulkan/vk_pipeline_cache.h index e558e6658..89d635a3d 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.h +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.h @@ -19,14 +19,13 @@ #include "video_core/engines/maxwell_3d.h" #include "video_core/renderer_vulkan/fixed_pipeline_state.h" #include "video_core/renderer_vulkan/vk_graphics_pipeline.h" -#include "video_core/renderer_vulkan/vk_renderpass_cache.h" #include "video_core/renderer_vulkan/vk_shader_decompiler.h" -#include "video_core/renderer_vulkan/wrapper.h" #include "video_core/shader/async_shaders.h" #include "video_core/shader/memory_util.h" #include "video_core/shader/registry.h" #include "video_core/shader/shader_ir.h" #include "video_core/shader_cache.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Core { class System; @@ -34,10 +33,10 @@ class System; namespace Vulkan { +class Device; class RasterizerVulkan; class VKComputePipeline; class VKDescriptorPool; -class VKDevice; class VKScheduler; class VKUpdateDescriptorQueue; @@ -84,9 +83,9 @@ namespace Vulkan { class Shader { public: - explicit Shader(Tegra::Engines::ConstBufferEngineInterface& engine, - Tegra::Engines::ShaderType stage, GPUVAddr gpu_addr, VAddr cpu_addr, - VideoCommon::Shader::ProgramCode program_code, u32 main_offset); + explicit Shader(Tegra::Engines::ConstBufferEngineInterface& engine_, + Tegra::Engines::ShaderType stage_, GPUVAddr gpu_addr, VAddr cpu_addr_, + VideoCommon::Shader::ProgramCode program_code, u32 main_offset_); ~Shader(); GPUVAddr GetGpuAddr() const { @@ -122,15 +121,15 @@ public: explicit VKPipelineCache(RasterizerVulkan& rasterizer, Tegra::GPU& gpu, Tegra::Engines::Maxwell3D& maxwell3d, Tegra::Engines::KeplerCompute& kepler_compute, - Tegra::MemoryManager& gpu_memory, const VKDevice& device, + Tegra::MemoryManager& gpu_memory, const Device& device, VKScheduler& scheduler, VKDescriptorPool& descriptor_pool, - VKUpdateDescriptorQueue& update_descriptor_queue, - VKRenderPassCache& renderpass_cache); + 
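`Common::BitCast<float>` in the specialization hunk above reinterprets the 32 raw register bits of the alpha-test reference as an IEEE float, the same operation the surrounding code performs for `point_size` via `std::memcpy`. C++20 expresses it directly as `std::bit_cast`:

```cpp
#include <bit>
#include <cstdint>

// Reinterprets the raw register bits as a float with no numeric conversion;
// well-defined, unlike a pointer cast.
float AlphaRefAsFloat(std::uint32_t raw_bits) {
    return std::bit_cast<float>(raw_bits);
}
```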
VKUpdateDescriptorQueue& update_descriptor_queue); ~VKPipelineCache() override; std::array<Shader*, Maxwell::MaxShaderProgram> GetShaders(); VKGraphicsPipeline* GetGraphicsPipeline(const GraphicsPipelineCacheKey& key, + u32 num_color_buffers, VideoCommon::Shader::AsyncShaders& async_shaders); VKComputePipeline& GetComputePipeline(const ComputePipelineCacheKey& key); @@ -149,11 +148,10 @@ private: Tegra::Engines::KeplerCompute& kepler_compute; Tegra::MemoryManager& gpu_memory; - const VKDevice& device; + const Device& device; VKScheduler& scheduler; VKDescriptorPool& descriptor_pool; VKUpdateDescriptorQueue& update_descriptor_queue; - VKRenderPassCache& renderpass_cache; std::unique_ptr<Shader> null_shader; std::unique_ptr<Shader> null_kernel; diff --git a/src/video_core/renderer_vulkan/vk_query_cache.cpp b/src/video_core/renderer_vulkan/vk_query_cache.cpp index ee2d871e3..7cadd5147 100644 --- a/src/video_core/renderer_vulkan/vk_query_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_query_cache.cpp @@ -7,11 +7,11 @@ #include <utility> #include <vector> -#include "video_core/renderer_vulkan/vk_device.h" #include "video_core/renderer_vulkan/vk_query_cache.h" #include "video_core/renderer_vulkan/vk_resource_pool.h" #include "video_core/renderer_vulkan/vk_scheduler.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { @@ -27,7 +27,7 @@ constexpr VkQueryType GetTarget(QueryType type) { } // Anonymous namespace -QueryPool::QueryPool(const VKDevice& device_, VKScheduler& scheduler, QueryType type_) +QueryPool::QueryPool(const Device& device_, VKScheduler& scheduler, QueryType type_) : ResourcePool{scheduler.GetMasterSemaphore(), GROW_STEP}, device{device_}, type{type_} {} QueryPool::~QueryPool() = default; @@ -66,15 +66,13 @@ void QueryPool::Reserve(std::pair<VkQueryPool, u32> query) { usage[pool_index * GROW_STEP + static_cast<std::ptrdiff_t>(query.second)] = false; } -VKQueryCache::VKQueryCache(VideoCore::RasterizerInterface& rasterizer, - Tegra::Engines::Maxwell3D& maxwell3d, Tegra::MemoryManager& gpu_memory, - const VKDevice& device, VKScheduler& scheduler) - : VideoCommon::QueryCacheBase<VKQueryCache, CachedQuery, CounterStream, - HostCounter>{rasterizer, maxwell3d, gpu_memory}, - device{device}, scheduler{scheduler}, query_pools{ - QueryPool{device, scheduler, - QueryType::SamplesPassed}, - } {} +VKQueryCache::VKQueryCache(VideoCore::RasterizerInterface& rasterizer_, + Tegra::Engines::Maxwell3D& maxwell3d_, Tegra::MemoryManager& gpu_memory_, + const Device& device_, VKScheduler& scheduler_) + : QueryCacheBase{rasterizer_, maxwell3d_, gpu_memory_}, device{device_}, scheduler{scheduler_}, + query_pools{ + QueryPool{device_, scheduler_, QueryType::SamplesPassed}, + } {} VKQueryCache::~VKQueryCache() { // TODO(Rodrigo): This is a hack to destroy all HostCounter instances before the base class @@ -95,12 +93,12 @@ void VKQueryCache::Reserve(QueryType type, std::pair<VkQueryPool, u32> query) { query_pools[static_cast<std::size_t>(type)].Reserve(query); } -HostCounter::HostCounter(VKQueryCache& cache, std::shared_ptr<HostCounter> dependency, - QueryType type) - : VideoCommon::HostCounterBase<VKQueryCache, HostCounter>{std::move(dependency)}, cache{cache}, - type{type}, query{cache.AllocateQuery(type)}, tick{cache.Scheduler().CurrentTick()} { - const vk::Device* logical = &cache.Device().GetLogical(); - cache.Scheduler().Record([logical, query = query](vk::CommandBuffer cmdbuf) 
diff --git a/src/video_core/renderer_vulkan/vk_query_cache.cpp b/src/video_core/renderer_vulkan/vk_query_cache.cpp
index ee2d871e3..7cadd5147 100644
--- a/src/video_core/renderer_vulkan/vk_query_cache.cpp
+++ b/src/video_core/renderer_vulkan/vk_query_cache.cpp
@@ -7,11 +7,11 @@
#include <utility>
#include <vector>

-#include "video_core/renderer_vulkan/vk_device.h"
#include "video_core/renderer_vulkan/vk_query_cache.h"
#include "video_core/renderer_vulkan/vk_resource_pool.h"
#include "video_core/renderer_vulkan/vk_scheduler.h"
-#include "video_core/renderer_vulkan/wrapper.h"
+#include "video_core/vulkan_common/vulkan_device.h"
+#include "video_core/vulkan_common/vulkan_wrapper.h"

namespace Vulkan {

@@ -27,7 +27,7 @@ constexpr VkQueryType GetTarget(QueryType type) {
}

} // Anonymous namespace

-QueryPool::QueryPool(const VKDevice& device_, VKScheduler& scheduler, QueryType type_)
+QueryPool::QueryPool(const Device& device_, VKScheduler& scheduler, QueryType type_)
: ResourcePool{scheduler.GetMasterSemaphore(), GROW_STEP}, device{device_}, type{type_} {}

QueryPool::~QueryPool() = default;

@@ -66,15 +66,13 @@ void QueryPool::Reserve(std::pair<VkQueryPool, u32> query) {
usage[pool_index * GROW_STEP + static_cast<std::ptrdiff_t>(query.second)] = false;
}

-VKQueryCache::VKQueryCache(VideoCore::RasterizerInterface& rasterizer,
- Tegra::Engines::Maxwell3D& maxwell3d, Tegra::MemoryManager& gpu_memory,
- const VKDevice& device, VKScheduler& scheduler)
- : VideoCommon::QueryCacheBase<VKQueryCache, CachedQuery, CounterStream,
- HostCounter>{rasterizer, maxwell3d, gpu_memory},
- device{device}, scheduler{scheduler}, query_pools{
- QueryPool{device, scheduler,
- QueryType::SamplesPassed},
- } {}
+VKQueryCache::VKQueryCache(VideoCore::RasterizerInterface& rasterizer_,
+ Tegra::Engines::Maxwell3D& maxwell3d_, Tegra::MemoryManager& gpu_memory_,
+ const Device& device_, VKScheduler& scheduler_)
+ : QueryCacheBase{rasterizer_, maxwell3d_, gpu_memory_}, device{device_}, scheduler{scheduler_},
+ query_pools{
+ QueryPool{device_, scheduler_, QueryType::SamplesPassed},
+ } {}

VKQueryCache::~VKQueryCache() {
// TODO(Rodrigo): This is a hack to destroy all HostCounter instances before the base class
@@ -95,12 +93,12 @@ void VKQueryCache::Reserve(QueryType type, std::pair<VkQueryPool, u32> query) {
query_pools[static_cast<std::size_t>(type)].Reserve(query);
}

-HostCounter::HostCounter(VKQueryCache& cache, std::shared_ptr<HostCounter> dependency,
- QueryType type)
- : VideoCommon::HostCounterBase<VKQueryCache, HostCounter>{std::move(dependency)}, cache{cache},
- type{type}, query{cache.AllocateQuery(type)}, tick{cache.Scheduler().CurrentTick()} {
- const vk::Device* logical = &cache.Device().GetLogical();
- cache.Scheduler().Record([logical, query = query](vk::CommandBuffer cmdbuf) {
+HostCounter::HostCounter(VKQueryCache& cache_, std::shared_ptr<HostCounter> dependency_,
+ QueryType type_)
+ : HostCounterBase{std::move(dependency_)}, cache{cache_}, type{type_},
+ query{cache_.AllocateQuery(type_)}, tick{cache_.GetScheduler().CurrentTick()} {
+ const vk::Device* logical = &cache.GetDevice().GetLogical();
+ cache.GetScheduler().Record([logical, query = query](vk::CommandBuffer cmdbuf) {
logical->ResetQueryPoolEXT(query.first, query.second, 1);
cmdbuf.BeginQuery(query.first, query.second, VK_QUERY_CONTROL_PRECISE_BIT);
});
@@ -111,26 +109,28 @@ HostCounter::~HostCounter() {
}

void HostCounter::EndQuery() {
- cache.Scheduler().Record(
+ cache.GetScheduler().Record(
[query = query](vk::CommandBuffer cmdbuf) { cmdbuf.EndQuery(query.first, query.second); });
}

u64 HostCounter::BlockingQuery() const {
- if (tick >= cache.Scheduler().CurrentTick()) {
- cache.Scheduler().Flush();
+ if (tick >= cache.GetScheduler().CurrentTick()) {
+ cache.GetScheduler().Flush();
}
+
u64 data;
- const VkResult result = cache.Device().GetLogical().GetQueryResults(
+ const VkResult query_result = cache.GetDevice().GetLogical().GetQueryResults(
query.first, query.second, 1, sizeof(data), &data, sizeof(data),
VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT);
- switch (result) {
+
+ switch (query_result) {
case VK_SUCCESS:
return data;
case VK_ERROR_DEVICE_LOST:
- cache.Device().ReportLoss();
+ cache.GetDevice().ReportLoss();
[[fallthrough]];
default:
- throw vk::Exception(result);
+ throw vk::Exception(query_result);
}
}
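Note: BlockingQuery keeps the wait-on-GPU semantics while renaming the result variable. For reference, this is roughly the raw Vulkan call the vk::Device wrapper forwards to; the helper name and the fallback error handling here are illustrative, not yuzu code:

#include <cstdint>
#include <vulkan/vulkan.h>

// Read one 64-bit occlusion result, blocking until the GPU has written it.
// 'device', 'pool' and 'query_index' are assumed to come from the caller.
uint64_t ReadSampleCount(VkDevice device, VkQueryPool pool, uint32_t query_index) {
    uint64_t data = 0;
    const VkResult result =
        vkGetQueryPoolResults(device, pool, query_index, 1, sizeof(data), &data, sizeof(data),
                              VK_QUERY_RESULT_64_BIT | VK_QUERY_RESULT_WAIT_BIT);
    // The cache above reports a device loss before throwing; a standalone
    // application would handle VK_ERROR_DEVICE_LOST here as well.
    return result == VK_SUCCESS ? data : 0;
}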
diff --git a/src/video_core/renderer_vulkan/vk_query_cache.h b/src/video_core/renderer_vulkan/vk_query_cache.h
index 2e57fb75d..7190946b9 100644
--- a/src/video_core/renderer_vulkan/vk_query_cache.h
+++ b/src/video_core/renderer_vulkan/vk_query_cache.h
@@ -12,7 +12,7 @@
#include "common/common_types.h"
#include "video_core/query_cache.h"
#include "video_core/renderer_vulkan/vk_resource_pool.h"
-#include "video_core/renderer_vulkan/wrapper.h"
+#include "video_core/vulkan_common/vulkan_wrapper.h"

namespace VideoCore {
class RasterizerInterface;
@@ -21,8 +21,8 @@ class RasterizerInterface;
namespace Vulkan {

class CachedQuery;
+class Device;
class HostCounter;
-class VKDevice;
class VKQueryCache;
class VKScheduler;

@@ -30,7 +30,7 @@ using CounterStream = VideoCommon::CounterStreamBase<VKQueryCache, HostCounter>;
class QueryPool final : public ResourcePool {
public:
- explicit QueryPool(const VKDevice& device, VKScheduler& scheduler, VideoCore::QueryType type);
+ explicit QueryPool(const Device& device, VKScheduler& scheduler, VideoCore::QueryType type);
~QueryPool() override;

std::pair<VkQueryPool, u32> Commit();
@@ -43,7 +43,7 @@ protected:
private:
static constexpr std::size_t GROW_STEP = 512;

- const VKDevice& device;
+ const Device& device;
const VideoCore::QueryType type;

std::vector<vk::QueryPool> pools;
@@ -53,33 +53,33 @@ private:
class VKQueryCache final
: public VideoCommon::QueryCacheBase<VKQueryCache, CachedQuery, CounterStream, HostCounter> {
public:
- explicit VKQueryCache(VideoCore::RasterizerInterface& rasterizer,
- Tegra::Engines::Maxwell3D& maxwell3d, Tegra::MemoryManager& gpu_memory,
- const VKDevice& device, VKScheduler& scheduler);
+ explicit VKQueryCache(VideoCore::RasterizerInterface& rasterizer_,
+ Tegra::Engines::Maxwell3D& maxwell3d_, Tegra::MemoryManager& gpu_memory_,
+ const Device& device_, VKScheduler& scheduler_);
~VKQueryCache();

std::pair<VkQueryPool, u32> AllocateQuery(VideoCore::QueryType type);

void Reserve(VideoCore::QueryType type, std::pair<VkQueryPool, u32> query);

- const VKDevice& Device() const noexcept {
+ const Device& GetDevice() const noexcept {
return device;
}

- VKScheduler& Scheduler() const noexcept {
+ VKScheduler& GetScheduler() const noexcept {
return scheduler;
}

private:
- const VKDevice& device;
+ const Device& device;
VKScheduler& scheduler;
std::array<QueryPool, VideoCore::NumQueryTypes> query_pools;
};

class HostCounter final : public VideoCommon::HostCounterBase<VKQueryCache, HostCounter> {
public:
- explicit HostCounter(VKQueryCache& cache, std::shared_ptr<HostCounter> dependency,
- VideoCore::QueryType type);
+ explicit HostCounter(VKQueryCache& cache_, std::shared_ptr<HostCounter> dependency_,
+ VideoCore::QueryType type_);
~HostCounter();

void EndQuery();
@@ -95,8 +95,8 @@ private:
class CachedQuery : public VideoCommon::CachedQueryBase<HostCounter> {
public:
- explicit CachedQuery(VKQueryCache&, VideoCore::QueryType, VAddr cpu_addr, u8* host_ptr)
- : VideoCommon::CachedQueryBase<HostCounter>{cpu_addr, host_ptr} {}
+ explicit CachedQuery(VKQueryCache&, VideoCore::QueryType, VAddr cpu_addr_, u8* host_ptr_)
+ : CachedQueryBase{cpu_addr_, host_ptr_} {}
};

} // namespace Vulkan
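Note: GROW_STEP together with the usage[] bookkeeping in the .cpp above implies a pool that grows in fixed-size chunks and recycles slots; perhaps counterintuitively, Commit() hands out a slot and Reserve() returns one. A cut-down sketch of that scheme with plain indices, no Vulkan objects:

#include <cstddef>
#include <vector>

// Slot bookkeeping in the spirit of GROW_STEP/usage[]; illustrative only.
class SlotPool {
public:
    std::size_t Commit() {
        for (std::size_t i = 0; i < usage.size(); ++i) {
            if (!usage[i]) {
                usage[i] = true;
                return i;
            }
        }
        const std::size_t first_new = usage.size();
        usage.resize(usage.size() + GROW_STEP, false); // allocate a new chunk
        usage[first_new] = true;
        return first_new;
    }

    void Reserve(std::size_t slot) {
        usage[slot] = false; // slot becomes reusable once the GPU is done with it
    }

private:
    static constexpr std::size_t GROW_STEP = 512;
    std::vector<bool> usage; // one flag per slot across all chunks
};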
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
index f3c2483c8..93fbea510 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@@ -19,6 +19,7 @@
#include "core/settings.h"
#include "video_core/engines/kepler_compute.h"
#include "video_core/engines/maxwell_3d.h"
+#include "video_core/renderer_vulkan/blit_image.h"
#include "video_core/renderer_vulkan/fixed_pipeline_state.h"
#include "video_core/renderer_vulkan/maxwell_to_vk.h"
#include "video_core/renderer_vulkan/renderer_vulkan.h"
@@ -26,23 +27,24 @@
#include "video_core/renderer_vulkan/vk_compute_pass.h"
#include "video_core/renderer_vulkan/vk_compute_pipeline.h"
#include "video_core/renderer_vulkan/vk_descriptor_pool.h"
-#include "video_core/renderer_vulkan/vk_device.h"
#include "video_core/renderer_vulkan/vk_graphics_pipeline.h"
#include "video_core/renderer_vulkan/vk_pipeline_cache.h"
#include "video_core/renderer_vulkan/vk_rasterizer.h"
-#include "video_core/renderer_vulkan/vk_renderpass_cache.h"
-#include "video_core/renderer_vulkan/vk_sampler_cache.h"
#include "video_core/renderer_vulkan/vk_scheduler.h"
#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
#include "video_core/renderer_vulkan/vk_state_tracker.h"
#include "video_core/renderer_vulkan/vk_texture_cache.h"
#include "video_core/renderer_vulkan/vk_update_descriptor.h"
-#include "video_core/renderer_vulkan/wrapper.h"
#include "video_core/shader_cache.h"
+#include "video_core/texture_cache/texture_cache.h"
+#include "video_core/vulkan_common/vulkan_device.h"
+#include "video_core/vulkan_common/vulkan_wrapper.h"

namespace Vulkan {

using Maxwell = Tegra::Engines::Maxwell3D::Regs;
+using VideoCommon::ImageViewId;
+using VideoCommon::ImageViewType;

MICROPROFILE_DEFINE(Vulkan_WaitForWorker, "Vulkan", "Wait for worker", MP_RGB(255, 192, 192));
MICROPROFILE_DEFINE(Vulkan_Drawing, "Vulkan", "Record drawing", MP_RGB(192, 128, 128));
@@ -58,9 +60,9 @@ MICROPROFILE_DEFINE(Vulkan_PipelineCache, "Vulkan", "Pipeline cache", MP_RGB(192
namespace {

-constexpr auto ComputeShaderIndex = static_cast<std::size_t>(Tegra::Engines::ShaderType::Compute);
+constexpr auto COMPUTE_SHADER_INDEX = static_cast<size_t>(Tegra::Engines::ShaderType::Compute);

-VkViewport GetViewportState(const VKDevice& device, const Maxwell& regs, std::size_t index) {
+VkViewport GetViewportState(const Device& device, const Maxwell& regs, size_t index) {
const auto& src = regs.viewport_transform[index];
const float width = src.scale_x * 2.0f;
const float height = src.scale_y * 2.0f;
@@ -83,7 +85,7 @@ VkViewport GetViewportState(const VKDevice& device, const Maxwell& regs, std::si
return viewport;
}

-VkRect2D GetScissorState(const Maxwell& regs, std::size_t index) {
+VkRect2D GetScissorState(const Maxwell& regs, size_t index) {
const auto& src = regs.scissor_test[index];
VkRect2D scissor;
if (src.enable) {
@@ -103,98 +105,122 @@ VkRect2D GetScissorState(const Maxwell& regs, std::size_t index) {
std::array<GPUVAddr, Maxwell::MaxShaderProgram> GetShaderAddresses(
const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders) {
std::array<GPUVAddr, Maxwell::MaxShaderProgram> addresses;
- for (std::size_t i = 0; i < std::size(addresses); ++i) {
+ for (size_t i = 0; i < std::size(addresses); ++i) {
addresses[i] = shaders[i] ? shaders[i]->GetGpuAddr() : 0;
}
return addresses;
}

-void TransitionImages(const std::vector<ImageView>& views, VkPipelineStageFlags pipeline_stage,
- VkAccessFlags access) {
- for (auto& [view, layout] : views) {
- view->Transition(*layout, pipeline_stage, access);
+struct TextureHandle {
+ constexpr TextureHandle(u32 data, bool via_header_index) {
+ const Tegra::Texture::TextureHandle handle{data};
+ image = handle.tic_id;
+ sampler = via_header_index ? image : handle.tsc_id.Value();
}
-}
+
+ u32 image;
+ u32 sampler;
+};

template <typename Engine, typename Entry>
-Tegra::Texture::FullTextureInfo GetTextureInfo(const Engine& engine, const Entry& entry,
- std::size_t stage, std::size_t index = 0) {
- const auto stage_type = static_cast<Tegra::Engines::ShaderType>(stage);
+TextureHandle GetTextureInfo(const Engine& engine, bool via_header_index, const Entry& entry,
+ size_t stage, size_t index = 0) {
+ const auto shader_type = static_cast<Tegra::Engines::ShaderType>(stage);
if constexpr (std::is_same_v<Entry, SamplerEntry>) {
if (entry.is_separated) {
const u32 buffer_1 = entry.buffer;
const u32 buffer_2 = entry.secondary_buffer;
const u32 offset_1 = entry.offset;
const u32 offset_2 = entry.secondary_offset;
- const u32 handle_1 = engine.AccessConstBuffer32(stage_type, buffer_1, offset_1);
- const u32 handle_2 = engine.AccessConstBuffer32(stage_type, buffer_2, offset_2);
- return engine.GetTextureInfo(handle_1 | handle_2);
+ const u32 handle_1 = engine.AccessConstBuffer32(shader_type, buffer_1, offset_1);
+ const u32 handle_2 = engine.AccessConstBuffer32(shader_type, buffer_2, offset_2);
+ return TextureHandle(handle_1 | handle_2, via_header_index);
}
}
if (entry.is_bindless) {
- const auto tex_handle = engine.AccessConstBuffer32(stage_type, entry.buffer, entry.offset);
- return engine.GetTextureInfo(tex_handle);
- }
- const auto& gpu_profile = engine.AccessGuestDriverProfile();
- const u32 entry_offset = static_cast<u32>(index * gpu_profile.GetTextureHandlerSize());
- const u32 offset = entry.offset + entry_offset;
- if constexpr (std::is_same_v<Engine, Tegra::Engines::Maxwell3D>) {
- return engine.GetStageTexture(stage_type, offset);
- } else {
- return engine.GetTexture(offset);
- }
-}
-
-/// @brief Determine if an attachment to be updated has to preserve contents
-/// @param is_clear True when a clear is being executed
-/// @param regs 3D registers
-/// @return True when the contents have to be preserved
-bool HasToPreserveColorContents(bool is_clear, const Maxwell& regs) {
- if (!is_clear) {
- return true;
- }
- // First we have to make sure all clear masks are enabled.
- if (!regs.clear_buffers.R || !regs.clear_buffers.G || !regs.clear_buffers.B ||
- !regs.clear_buffers.A) {
- return true;
- }
- // If scissors are disabled, the whole screen is cleared
- if (!regs.clear_flags.scissor) {
- return false;
+ const u32 raw = engine.AccessConstBuffer32(shader_type, entry.buffer, entry.offset);
+ return TextureHandle(raw, via_header_index);
}
- // Then we have to confirm scissor testing clears the whole image
- const std::size_t index = regs.clear_buffers.RT;
- const auto& scissor = regs.scissor_test[0];
- return scissor.min_x > 0 || scissor.min_y > 0 || scissor.max_x < regs.rt[index].width ||
- scissor.max_y < regs.rt[index].height;
+ const u32 buffer = engine.GetBoundBuffer();
+ const u64 offset = (entry.offset + index) * sizeof(u32);
+ return TextureHandle(engine.AccessConstBuffer32(shader_type, buffer, offset), via_header_index);
}

-/// @brief Determine if an attachment to be updated has to preserve contents
-/// @param is_clear True when a clear is being executed
-/// @param regs 3D registers
-/// @return True when the contents have to be preserved
-bool HasToPreserveDepthContents(bool is_clear, const Maxwell& regs) {
- // If we are not clearing, the contents have to be preserved
- if (!is_clear) {
- return true;
- }
- // For depth stencil clears we only have to confirm scissor test covers the whole image
- if (!regs.clear_flags.scissor) {
- return false;
- }
- // Make sure the clear cover the whole image
- const auto& scissor = regs.scissor_test[0];
- return scissor.min_x > 0 || scissor.min_y > 0 || scissor.max_x < regs.zeta_width ||
- scissor.max_y < regs.zeta_height;
-}
-
-template <std::size_t N>
+template <size_t N>
std::array<VkDeviceSize, N> ExpandStrides(const std::array<u16, N>& strides) {
std::array<VkDeviceSize, N> expanded;
std::copy(strides.begin(), strides.end(), expanded.begin());
return expanded;
}
+ImageViewType ImageViewTypeFromEntry(const SamplerEntry& entry) {
+ if (entry.is_buffer) {
+ return ImageViewType::e2D;
+ }
+ switch (entry.type) {
+ case Tegra::Shader::TextureType::Texture1D:
+ return entry.is_array ? ImageViewType::e1DArray : ImageViewType::e1D;
+ case Tegra::Shader::TextureType::Texture2D:
+ return entry.is_array ? ImageViewType::e2DArray : ImageViewType::e2D;
+ case Tegra::Shader::TextureType::Texture3D:
+ return ImageViewType::e3D;
+ case Tegra::Shader::TextureType::TextureCube:
+ return entry.is_array ? ImageViewType::CubeArray : ImageViewType::Cube;
+ }
+ UNREACHABLE();
+ return ImageViewType::e2D;
+}
+
+ImageViewType ImageViewTypeFromEntry(const ImageEntry& entry) {
+ switch (entry.type) {
+ case Tegra::Shader::ImageType::Texture1D:
+ return ImageViewType::e1D;
+ case Tegra::Shader::ImageType::Texture1DArray:
+ return ImageViewType::e1DArray;
+ case Tegra::Shader::ImageType::Texture2D:
+ return ImageViewType::e2D;
+ case Tegra::Shader::ImageType::Texture2DArray:
+ return ImageViewType::e2DArray;
+ case Tegra::Shader::ImageType::Texture3D:
+ return ImageViewType::e3D;
+ case Tegra::Shader::ImageType::TextureBuffer:
+ return ImageViewType::Buffer;
+ }
+ UNREACHABLE();
+ return ImageViewType::e2D;
+}
+
+void PushImageDescriptors(const ShaderEntries& entries, TextureCache& texture_cache,
+ VKUpdateDescriptorQueue& update_descriptor_queue,
+ ImageViewId*& image_view_id_ptr, VkSampler*& sampler_ptr) {
+ for ([[maybe_unused]] const auto& entry : entries.uniform_texels) {
+ const ImageViewId image_view_id = *image_view_id_ptr++;
+ const ImageView& image_view = texture_cache.GetImageView(image_view_id);
+ update_descriptor_queue.AddTexelBuffer(image_view.BufferView());
+ }
+ for (const auto& entry : entries.samplers) {
+ for (size_t i = 0; i < entry.size; ++i) {
+ const VkSampler sampler = *sampler_ptr++;
+ const ImageViewId image_view_id = *image_view_id_ptr++;
+ const ImageView& image_view = texture_cache.GetImageView(image_view_id);
+ const VkImageView handle = image_view.Handle(ImageViewTypeFromEntry(entry));
+ update_descriptor_queue.AddSampledImage(handle, sampler);
+ }
+ }
+ for ([[maybe_unused]] const auto& entry : entries.storage_texels) {
+ const ImageViewId image_view_id = *image_view_id_ptr++;
+ const ImageView& image_view = texture_cache.GetImageView(image_view_id);
+ update_descriptor_queue.AddTexelBuffer(image_view.BufferView());
+ }
+ for (const auto& entry : entries.images) {
+ // TODO: Mark as modified
+ const ImageViewId image_view_id = *image_view_id_ptr++;
+ const ImageView& image_view = texture_cache.GetImageView(image_view_id);
+ const VkImageView handle = image_view.Handle(ImageViewTypeFromEntry(entry));
+ update_descriptor_queue.AddImage(handle);
+ }
+}
+
} // Anonymous namespace
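Note: the new TextureHandle splits the packed 32-bit handle that Maxwell shaders read from const buffers into a texture-header (TIC) index and a sampler (TSC) index. A self-contained sketch of the unpacking; the 20/12 bit split mirrors Tegra::Texture::TextureHandle and should be treated as illustrative:

#include <cstdint>

// Low 20 bits index the texture header (TIC), high 12 bits the sampler (TSC).
struct UnpackedHandle {
    uint32_t tic;
    uint32_t tsc;
};

constexpr UnpackedHandle Unpack(uint32_t data, bool via_header_index) {
    const uint32_t tic = data & 0xfffff;       // bits 0-19
    const uint32_t tsc = (data >> 20) & 0xfff; // bits 20-31
    // When samplers are indexed "via header index", the TIC index doubles as
    // the TSC index, which is what the TextureHandle constructor above does.
    return {tic, via_header_index ? tic : tsc};
}

static_assert(Unpack(0x00300002, false).tsc == 3);
static_assert(Unpack(0x00300002, true).tsc == 2);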
class BufferBindings final {
@@ -213,7 +239,7 @@ public:
index.type = type;
}

- void Bind(const VKDevice& device, VKScheduler& scheduler) const {
+ void Bind(const Device& device, VKScheduler& scheduler) const {
// Use this large switch case to avoid dispatching more memory in the record lambda than
// what we need. It looks horrible, but it's the best we can do on standard C++.
switch (vertex.num_buffers) {
@@ -290,7 +316,7 @@ public:
private:
// Some of these fields are intentionally left uninitialized to avoid initializing them twice.
struct {
- std::size_t num_buffers = 0;
+ size_t num_buffers = 0;
std::array<VkBuffer, Maxwell::NumVertexArrays> buffers;
std::array<VkDeviceSize, Maxwell::NumVertexArrays> offsets;
std::array<VkDeviceSize, Maxwell::NumVertexArrays> sizes;
@@ -303,8 +329,8 @@ private:
VkIndexType type;
} index;

- template <std::size_t N>
- void BindStatic(const VKDevice& device, VKScheduler& scheduler) const {
+ template <size_t N>
+ void BindStatic(const Device& device, VKScheduler& scheduler) const {
if (device.IsExtExtendedDynamicStateSupported()) {
if (index.buffer) {
BindStatic<N, true, true>(scheduler);
@@ -320,7 +346,7 @@ private:
}
}

- template <std::size_t N, bool is_indexed, bool has_extended_dynamic_state>
+ template <size_t N, bool is_indexed, bool has_extended_dynamic_state>
void BindStatic(VKScheduler& scheduler) const {
static_assert(N <= Maxwell::NumVertexArrays);
if constexpr (N == 0) {
@@ -380,28 +406,31 @@ void RasterizerVulkan::DrawParameters::Draw(vk::CommandBuffer cmdbuf) const {
}
}

-RasterizerVulkan::RasterizerVulkan(Core::Frontend::EmuWindow& emu_window, Tegra::GPU& gpu_,
+RasterizerVulkan::RasterizerVulkan(Core::Frontend::EmuWindow& emu_window_, Tegra::GPU& gpu_,
Tegra::MemoryManager& gpu_memory_,
- Core::Memory::Memory& cpu_memory, VKScreenInfo& screen_info_,
- const VKDevice& device_, VKMemoryManager& memory_manager_,
+ Core::Memory::Memory& cpu_memory_, VKScreenInfo& screen_info_,
+ const Device& device_, VKMemoryManager& memory_manager_,
StateTracker& state_tracker_, VKScheduler& scheduler_)
- : RasterizerAccelerated(cpu_memory), gpu(gpu_), gpu_memory(gpu_memory_),
- maxwell3d(gpu.Maxwell3D()), kepler_compute(gpu.KeplerCompute()), screen_info(screen_info_),
- device(device_), memory_manager(memory_manager_), state_tracker(state_tracker_),
- scheduler(scheduler_), staging_pool(device, memory_manager, scheduler),
- descriptor_pool(device, scheduler_), update_descriptor_queue(device, scheduler),
- renderpass_cache(device),
+ : RasterizerAccelerated{cpu_memory_}, gpu{gpu_},
+ gpu_memory{gpu_memory_}, maxwell3d{gpu.Maxwell3D()}, kepler_compute{gpu.KeplerCompute()},
+ screen_info{screen_info_}, device{device_}, memory_manager{memory_manager_},
+ state_tracker{state_tracker_}, scheduler{scheduler_}, stream_buffer(device, scheduler),
+ staging_pool(device, memory_manager, scheduler), descriptor_pool(device, scheduler),
+ update_descriptor_queue(device, scheduler),
+ blit_image(device, scheduler, state_tracker, descriptor_pool),
quad_array_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue),
quad_indexed_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue),
uint8_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue),
- texture_cache(*this, maxwell3d, gpu_memory, device, memory_manager, scheduler, staging_pool),
+ texture_cache_runtime{device, scheduler, memory_manager, staging_pool, blit_image},
+ texture_cache(texture_cache_runtime, *this, maxwell3d, kepler_compute, gpu_memory),
pipeline_cache(*this, gpu, maxwell3d, kepler_compute, gpu_memory, device, scheduler,
- descriptor_pool, update_descriptor_queue, renderpass_cache),
- buffer_cache(*this, gpu_memory, cpu_memory, device, memory_manager, scheduler, staging_pool),
- sampler_cache(device), query_cache(*this, maxwell3d, gpu_memory, device, scheduler),
+ descriptor_pool, update_descriptor_queue),
+ buffer_cache(*this, gpu_memory, cpu_memory_, device, memory_manager, scheduler, stream_buffer,
+ staging_pool),
+ query_cache{*this, maxwell3d, gpu_memory, device, scheduler},
fence_manager(*this, gpu, gpu_memory, texture_cache, buffer_cache, query_cache, device,
scheduler),
- wfi_event(device.GetLogical().CreateEvent()), async_shaders(emu_window) {
+ wfi_event(device.GetLogical().CreateEvent()), async_shaders(emu_window_) {
scheduler.SetQueryCache(query_cache);
if (device.UseAsynchronousShaders()) {
async_shaders.AllocateWorkers();
@@ -427,9 +456,10 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) {
const DrawParameters draw_params =
SetupGeometry(key.fixed_state, buffer_bindings, is_indexed, is_instanced);

- update_descriptor_queue.Acquire();
- sampled_views.clear();
- image_views.clear();
+ auto lock = texture_cache.AcquireLock();
+ texture_cache.SynchronizeGraphicsDescriptors();
+
+ texture_cache.UpdateRenderTargets(false);

const auto shaders = pipeline_cache.GetShaders();
key.shaders = GetShaderAddresses(shaders);
@@ -437,30 +467,24 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) {
buffer_cache.Unmap();

- const Texceptions texceptions = UpdateAttachments(false);
- SetupImageTransitions(texceptions, color_attachments, zeta_attachment);
-
- key.renderpass_params = GetRenderPassParams(texceptions);
- key.padding = 0;
+ const Framebuffer* const framebuffer = texture_cache.GetFramebuffer();
+ key.renderpass = framebuffer->RenderPass();

- auto* pipeline = pipeline_cache.GetGraphicsPipeline(key, async_shaders);
+ auto* const pipeline =
+ pipeline_cache.GetGraphicsPipeline(key, framebuffer->NumColorBuffers(), async_shaders);
if (pipeline == nullptr || pipeline->GetHandle() == VK_NULL_HANDLE) {
// Async graphics pipeline was not ready.
return;
}

- scheduler.BindGraphicsPipeline(pipeline->GetHandle());
-
- const auto renderpass = pipeline->GetRenderPass();
- const auto [framebuffer, render_area] = ConfigureFramebuffers(renderpass);
- scheduler.RequestRenderpass(renderpass, framebuffer, render_area);
-
- UpdateDynamicStates();
-
buffer_bindings.Bind(device, scheduler);

BeginTransformFeedback();

+ scheduler.RequestRenderpass(framebuffer);
+ scheduler.BindGraphicsPipeline(pipeline->GetHandle());
+ UpdateDynamicStates();
+
const auto pipeline_layout = pipeline->GetLayout();
const auto descriptor_set = pipeline->CommitDescriptorSet();
scheduler.Record([pipeline_layout, descriptor_set, draw_params](vk::CommandBuffer cmdbuf) {
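Note: Draw() now bails out when an asynchronously compiled pipeline is not ready instead of stalling on it, trading one dropped draw for an unblocked frame. The pattern in miniature, with stub types rather than yuzu's classes:

#include <atomic>

// The async-shader path returns a pipeline whose handle stays null until a
// worker thread finishes building it; the draw is simply skipped until then.
struct AsyncPipeline {
    std::atomic<bool> ready{false};
};

bool TryDraw(const AsyncPipeline* pipeline) {
    if (pipeline == nullptr || !pipeline->ready.load(std::memory_order_acquire)) {
        return false; // skip the draw; a later frame retries the cache lookup
    }
    // ... record the draw with the now-valid pipeline handle ...
    return true;
}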
@@ -481,9 +505,6 @@ void RasterizerVulkan::Clear() {
return;
}

- sampled_views.clear();
- image_views.clear();
-
query_cache.UpdateCounters();

const auto& regs = maxwell3d.regs;
@@ -495,20 +516,24 @@ void RasterizerVulkan::Clear() {
return;
}

- [[maybe_unused]] const auto texceptions = UpdateAttachments(true);
- DEBUG_ASSERT(texceptions.none());
- SetupImageTransitions(0, color_attachments, zeta_attachment);
-
- const VkRenderPass renderpass = renderpass_cache.GetRenderPass(GetRenderPassParams(0));
- const auto [framebuffer, render_area] = ConfigureFramebuffers(renderpass);
- scheduler.RequestRenderpass(renderpass, framebuffer, render_area);
+ auto lock = texture_cache.AcquireLock();
+ texture_cache.UpdateRenderTargets(true);
+ const Framebuffer* const framebuffer = texture_cache.GetFramebuffer();
+ const VkExtent2D render_area = framebuffer->RenderArea();
+ scheduler.RequestRenderpass(framebuffer);

- VkClearRect clear_rect;
- clear_rect.baseArrayLayer = regs.clear_buffers.layer;
- clear_rect.layerCount = 1;
- clear_rect.rect = GetScissorState(regs, 0);
- clear_rect.rect.extent.width = std::min(clear_rect.rect.extent.width, render_area.width);
- clear_rect.rect.extent.height = std::min(clear_rect.rect.extent.height, render_area.height);
+ VkClearRect clear_rect{
+ .rect = GetScissorState(regs, 0),
+ .baseArrayLayer = regs.clear_buffers.layer,
+ .layerCount = 1,
+ };
+ if (clear_rect.rect.extent.width == 0 || clear_rect.rect.extent.height == 0) {
+ return;
+ }
+ clear_rect.rect.extent = VkExtent2D{
+ .width = std::min(clear_rect.rect.extent.width, render_area.width),
+ .height = std::min(clear_rect.rect.extent.height, render_area.height),
+ };

if (use_color) {
VkClearValue clear_value;
@@ -549,9 +574,6 @@ void RasterizerVulkan::Clear() {
void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) {
MICROPROFILE_SCOPE(Vulkan_Compute);

- update_descriptor_queue.Acquire();
- sampled_views.clear();
- image_views.clear();

query_cache.UpdateCounters();
@@ -570,30 +592,46 @@ void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) {
// Compute dispatches can't be executed inside a renderpass
scheduler.RequestOutsideRenderPassOperationContext();

- buffer_cache.Map(CalculateComputeStreamBufferSize());
+ image_view_indices.clear();
+ sampler_handles.clear();
+
+ auto lock = texture_cache.AcquireLock();
+ texture_cache.SynchronizeComputeDescriptors();

const auto& entries = pipeline.GetEntries();
- SetupComputeConstBuffers(entries);
- SetupComputeGlobalBuffers(entries);
SetupComputeUniformTexels(entries);
SetupComputeTextures(entries);
SetupComputeStorageTexels(entries);
SetupComputeImages(entries);

- buffer_cache.Unmap();
+ const std::span indices_span(image_view_indices.data(), image_view_indices.size());
+ texture_cache.FillComputeImageViews(indices_span, image_view_ids);
+
+ buffer_cache.Map(CalculateComputeStreamBufferSize());
+
+ update_descriptor_queue.Acquire();
+
+ SetupComputeConstBuffers(entries);
+ SetupComputeGlobalBuffers(entries);
+
+ ImageViewId* image_view_id_ptr = image_view_ids.data();
+ VkSampler* sampler_ptr = sampler_handles.data();
+ PushImageDescriptors(entries, texture_cache, update_descriptor_queue, image_view_id_ptr,
+ sampler_ptr);

- TransitionImages(sampled_views, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
- VK_ACCESS_SHADER_READ_BIT);
- TransitionImages(image_views, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
- VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT);
+ buffer_cache.Unmap();

+ const VkPipeline pipeline_handle = pipeline.GetHandle();
+ const VkPipelineLayout pipeline_layout = pipeline.GetLayout();
+ const VkDescriptorSet descriptor_set = pipeline.CommitDescriptorSet();
scheduler.Record([grid_x = launch_desc.grid_dim_x, grid_y = launch_desc.grid_dim_y,
- grid_z = launch_desc.grid_dim_z, pipeline_handle = pipeline.GetHandle(),
- layout = pipeline.GetLayout(),
- descriptor_set = pipeline.CommitDescriptorSet()](vk::CommandBuffer cmdbuf) {
+ grid_z = launch_desc.grid_dim_z, pipeline_handle, pipeline_layout,
+ descriptor_set](vk::CommandBuffer cmdbuf) {
cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, pipeline_handle);
- cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, layout, DESCRIPTOR_SET,
- descriptor_set, {});
+ if (descriptor_set) {
+ cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, pipeline_layout,
+ DESCRIPTOR_SET, descriptor_set, nullptr);
+ }
cmdbuf.Dispatch(grid_x, grid_y, grid_z);
});
}
@@ -613,7 +651,10 @@ void RasterizerVulkan::FlushRegion(VAddr addr, u64 size) {
if (addr == 0 || size == 0) {
return;
}
- texture_cache.FlushRegion(addr, size);
+ {
+ auto lock = texture_cache.AcquireLock();
+ texture_cache.DownloadMemory(addr, size);
+ }
buffer_cache.FlushRegion(addr, size);
query_cache.FlushRegion(addr, size);
}
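Note: the rewritten Clear() early-outs on an empty scissor and clamps the clear rectangle to the framebuffer's render area, since vkCmdClearAttachments requires rects that lie inside the current render pass. The same logic with plain Vulkan types; the helper and its inputs are assumptions for illustration:

#include <algorithm>
#include <cstdint>
#include <optional>
#include <vulkan/vulkan.h>

// Drop empty scissors, then clamp the scissor-derived rect to the render
// area before recording vkCmdClearAttachments.
std::optional<VkClearRect> MakeClearRect(VkRect2D scissor, VkExtent2D render_area,
                                         uint32_t layer) {
    if (scissor.extent.width == 0 || scissor.extent.height == 0) {
        return std::nullopt; // nothing to clear
    }
    scissor.extent.width = std::min(scissor.extent.width, render_area.width);
    scissor.extent.height = std::min(scissor.extent.height, render_area.height);
    return VkClearRect{
        .rect = scissor,
        .baseArrayLayer = layer,
        .layerCount = 1,
    };
}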
@@ -622,14 +663,18 @@ bool RasterizerVulkan::MustFlushRegion(VAddr addr, u64 size) {
if (!Settings::IsGPULevelHigh()) {
return buffer_cache.MustFlushRegion(addr, size);
}
- return texture_cache.MustFlushRegion(addr, size) || buffer_cache.MustFlushRegion(addr, size);
+ return texture_cache.IsRegionGpuModified(addr, size) ||
+ buffer_cache.MustFlushRegion(addr, size);
}

void RasterizerVulkan::InvalidateRegion(VAddr addr, u64 size) {
if (addr == 0 || size == 0) {
return;
}
- texture_cache.InvalidateRegion(addr, size);
+ {
+ auto lock = texture_cache.AcquireLock();
+ texture_cache.WriteMemory(addr, size);
+ }
pipeline_cache.InvalidateRegion(addr, size);
buffer_cache.InvalidateRegion(addr, size);
query_cache.InvalidateRegion(addr, size);
@@ -639,17 +684,28 @@ void RasterizerVulkan::OnCPUWrite(VAddr addr, u64 size) {
if (addr == 0 || size == 0) {
return;
}
- texture_cache.OnCPUWrite(addr, size);
+ {
+ auto lock = texture_cache.AcquireLock();
+ texture_cache.WriteMemory(addr, size);
+ }
pipeline_cache.OnCPUWrite(addr, size);
buffer_cache.OnCPUWrite(addr, size);
}

void RasterizerVulkan::SyncGuestHost() {
- texture_cache.SyncGuestHost();
buffer_cache.SyncGuestHost();
pipeline_cache.SyncGuestHost();
}

+void RasterizerVulkan::UnmapMemory(VAddr addr, u64 size) {
+ {
+ auto lock = texture_cache.AcquireLock();
+ texture_cache.UnmapMemory(addr, size);
+ }
+ buffer_cache.OnCPUWrite(addr, size);
+ pipeline_cache.OnCPUWrite(addr, size);
+}
+
void RasterizerVulkan::SignalSemaphore(GPUVAddr addr, u32 value) {
if (!gpu.IsAsync()) {
gpu_memory.Write<u32>(addr, value);
@@ -700,6 +756,14 @@ void RasterizerVulkan::WaitForIdle() {
});
}

+void RasterizerVulkan::FragmentBarrier() {
+ // We already put barriers when a render pass finishes
+}
+
+void RasterizerVulkan::TiledCacheBarrier() {
+ // TODO: Implementing tiled barriers requires rewriting a good chunk of the Vulkan backend
+}
+
void RasterizerVulkan::FlushCommands() {
if (draw_counter > 0) {
draw_counter = 0;
@@ -710,14 +774,20 @@ void RasterizerVulkan::TickFrame() {
draw_counter = 0;
update_descriptor_queue.TickFrame();
+ fence_manager.TickFrame();
buffer_cache.TickFrame();
staging_pool.TickFrame();
+ {
+ auto lock = texture_cache.AcquireLock();
+ texture_cache.TickFrame();
+ }
}

-bool RasterizerVulkan::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src,
- const Tegra::Engines::Fermi2D::Regs::Surface& dst,
+bool RasterizerVulkan::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src,
+ const Tegra::Engines::Fermi2D::Surface& dst,
const Tegra::Engines::Fermi2D::Config& copy_config) {
- texture_cache.DoFermiCopy(src, dst, copy_config);
+ auto lock = texture_cache.AcquireLock();
+ texture_cache.BlitImage(dst, src, copy_config);
return true;
}
@@ -727,20 +797,16 @@ bool RasterizerVulkan::AccelerateDisplay(const Tegra::FramebufferConfig& config,
return false;
}

- const auto surface{texture_cache.TryFindFramebufferSurface(framebuffer_addr)};
- if (!surface) {
+ auto lock = texture_cache.AcquireLock();
+ ImageView* const image_view = texture_cache.TryFindFramebufferImageView(framebuffer_addr);
+ if (!image_view) {
return false;
}

- // Verify that the cached surface is the same size and format as the requested framebuffer
- const auto& params{surface->GetSurfaceParams()};
- ASSERT_MSG(params.width == config.width, "Framebuffer width is different");
- ASSERT_MSG(params.height == config.height, "Framebuffer height is different");
-
- screen_info.image = &surface->GetImage();
- screen_info.width = params.width;
- screen_info.height = params.height;
- screen_info.is_srgb = surface->GetSurfaceParams().srgb_conversion;
+ screen_info.image_view = image_view->Handle(VideoCommon::ImageViewType::e2D);
+ screen_info.width = image_view->size.width;
+ screen_info.height = image_view->size.height;
+ screen_info.is_srgb = VideoCore::Surface::IsPixelFormatSRGB(image_view->format);

return true;
}
@@ -765,103 +831,6 @@ void RasterizerVulkan::FlushWork() {
draw_counter = 0;
}

-RasterizerVulkan::Texceptions RasterizerVulkan::UpdateAttachments(bool is_clear) {
- MICROPROFILE_SCOPE(Vulkan_RenderTargets);
-
- const auto& regs = maxwell3d.regs;
- auto& dirty = maxwell3d.dirty.flags;
- const bool update_rendertargets = dirty[VideoCommon::Dirty::RenderTargets];
- dirty[VideoCommon::Dirty::RenderTargets] = false;
-
- texture_cache.GuardRenderTargets(true);
-
- Texceptions texceptions;
- for (std::size_t rt = 0; rt < Maxwell::NumRenderTargets; ++rt) {
- if (update_rendertargets) {
- const bool preserve_contents = HasToPreserveColorContents(is_clear, regs);
- color_attachments[rt] = texture_cache.GetColorBufferSurface(rt, preserve_contents);
- }
- if (color_attachments[rt] && WalkAttachmentOverlaps(*color_attachments[rt])) {
- texceptions[rt] = true;
- }
- }
-
- if (update_rendertargets) {
- const bool preserve_contents = HasToPreserveDepthContents(is_clear, regs);
- zeta_attachment = texture_cache.GetDepthBufferSurface(preserve_contents);
- }
- if (zeta_attachment && WalkAttachmentOverlaps(*zeta_attachment)) {
- texceptions[ZETA_TEXCEPTION_INDEX] = true;
- }
-
- texture_cache.GuardRenderTargets(false);
-
- return texceptions;
-}
-
-bool RasterizerVulkan::WalkAttachmentOverlaps(const CachedSurfaceView& attachment) {
- bool overlap = false;
- for (auto& [view, layout] : sampled_views) {
- if (!attachment.IsSameSurface(*view)) {
- continue;
- }
- overlap = true;
- *layout = VK_IMAGE_LAYOUT_GENERAL;
- }
- return overlap;
-}
-
-std::tuple<VkFramebuffer, VkExtent2D> RasterizerVulkan::ConfigureFramebuffers(
- VkRenderPass renderpass) {
- FramebufferCacheKey key{
- .renderpass = renderpass,
- .width = std::numeric_limits<u32>::max(),
- .height = std::numeric_limits<u32>::max(),
- .layers = std::numeric_limits<u32>::max(),
- .views = {},
- };
-
- const auto try_push = [&key](const View& view) {
- if (!view) {
- return false;
- }
- key.views.push_back(view->GetAttachment());
- key.width = std::min(key.width, view->GetWidth());
- key.height = std::min(key.height, view->GetHeight());
- key.layers = std::min(key.layers, view->GetNumLayers());
- return true;
- };
-
- const auto& regs = maxwell3d.regs;
- const std::size_t num_attachments = static_cast<std::size_t>(regs.rt_control.count);
- for (std::size_t index = 0; index < num_attachments; ++index) {
- if (try_push(color_attachments[index])) {
- texture_cache.MarkColorBufferInUse(index);
- }
- }
- if (try_push(zeta_attachment)) {
- texture_cache.MarkDepthBufferInUse();
- }
-
- const auto [fbentry, is_cache_miss] = framebuffer_cache.try_emplace(key);
- auto& framebuffer = fbentry->second;
- if (is_cache_miss) {
- framebuffer = device.GetLogical().CreateFramebuffer({
- .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO,
- .pNext = nullptr,
- .flags = 0,
- .renderPass = key.renderpass,
- .attachmentCount = static_cast<u32>(key.views.size()),
- .pAttachments = key.views.data(),
- .width = key.width,
- .height = key.height,
- .layers = key.layers,
- });
- }
-
- return {*framebuffer, VkExtent2D{key.width, key.height}};
-}
-
RasterizerVulkan::DrawParameters RasterizerVulkan::SetupGeometry(FixedPipelineState& fixed_state,
BufferBindings& buffer_bindings,
bool is_indexed, bool is_instanced) {
@@ -885,51 +854,37 @@ RasterizerVulkan::DrawParameters RasterizerVulkan::SetupGeometry(FixedPipelineSt
void RasterizerVulkan::SetupShaderDescriptors(
const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders) {
- texture_cache.GuardSamplers(true);
-
- for (std::size_t stage = 0; stage < Maxwell::MaxShaderStage; ++stage) {
- // Skip VertexA stage
+ image_view_indices.clear();
+ sampler_handles.clear();
+ for (size_t stage = 0; stage < Maxwell::MaxShaderStage; ++stage) {
Shader* const shader = shaders[stage + 1];
if (!shader) {
continue;
}
const auto& entries = shader->GetEntries();
- SetupGraphicsConstBuffers(entries, stage);
- SetupGraphicsGlobalBuffers(entries, stage);
SetupGraphicsUniformTexels(entries, stage);
SetupGraphicsTextures(entries, stage);
SetupGraphicsStorageTexels(entries, stage);
SetupGraphicsImages(entries, stage);
}
- texture_cache.GuardSamplers(false);
-}
+ const std::span indices_span(image_view_indices.data(), image_view_indices.size());
+ texture_cache.FillGraphicsImageViews(indices_span, image_view_ids);

-void RasterizerVulkan::SetupImageTransitions(
- Texceptions texceptions, const std::array<View, Maxwell::NumRenderTargets>& color_attachments,
- const View& zeta_attachment) {
- TransitionImages(sampled_views, VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT, VK_ACCESS_SHADER_READ_BIT);
- TransitionImages(image_views, VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT,
- VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT);
+ update_descriptor_queue.Acquire();

- for (std::size_t rt = 0; rt < std::size(color_attachments); ++rt) {
- const auto color_attachment = color_attachments[rt];
- if (color_attachment == nullptr) {
+ ImageViewId* image_view_id_ptr = image_view_ids.data();
+ VkSampler* sampler_ptr = sampler_handles.data();
+ for (size_t stage = 0; stage < Maxwell::MaxShaderStage; ++stage) {
+ // Skip VertexA stage
+ Shader* const shader = shaders[stage + 1];
+ if (!shader) {
continue;
}
- const auto image_layout =
- texceptions[rt] ? VK_IMAGE_LAYOUT_GENERAL : VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL;
- color_attachment->Transition(image_layout, VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
- VK_ACCESS_COLOR_ATTACHMENT_READ_BIT |
- VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT);
- }
-
- if (zeta_attachment != nullptr) {
- const auto image_layout = texceptions[ZETA_TEXCEPTION_INDEX]
- ? VK_IMAGE_LAYOUT_GENERAL
- : VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL;
- zeta_attachment->Transition(image_layout, VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT,
- VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT |
- VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT);
+ const auto& entries = shader->GetEntries();
+ SetupGraphicsConstBuffers(entries, stage);
+ SetupGraphicsGlobalBuffers(entries, stage);
+ PushImageDescriptors(entries, texture_cache, update_descriptor_queue, image_view_id_ptr,
+ sampler_ptr);
}
}
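Note: SetupShaderDescriptors now works in two passes — the Setup* helpers only collect raw handle indices, FillGraphicsImageViews resolves them in one batch, and a second walk pushes descriptors by consuming the resolved array through a moving pointer. The ordering contract in miniature, with a plain map standing in for the texture cache:

#include <cstddef>
#include <cstdint>
#include <unordered_map>
#include <vector>

int main() {
    const std::vector<std::uint32_t> indices{7, 3, 7}; // pass 1: raw TIC indices
    const std::unordered_map<std::uint32_t, std::uint64_t> cache{{3, 30}, {7, 70}};

    std::vector<std::uint64_t> resolved; // batch resolve preserves the order
    for (const std::uint32_t index : indices) {
        resolved.push_back(cache.at(index));
    }

    const std::uint64_t* ptr = resolved.data(); // pass 2: consume in order
    for (std::size_t i = 0; i < indices.size(); ++i) {
        const std::uint64_t view_id = *ptr++;
        (void)view_id; // a descriptor would be pushed with view_id here
    }
    return 0;
}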
@@ -948,7 +903,6 @@ void RasterizerVulkan::UpdateDynamicStates() {
UpdateDepthWriteEnable(regs);
UpdateDepthCompareOp(regs);
UpdateFrontFace(regs);
- UpdatePrimitiveTopology(regs);
UpdateStencilOp(regs);
UpdateStencilTestEnable(regs);
}
@@ -1002,7 +956,7 @@ void RasterizerVulkan::EndTransformFeedback() {

void RasterizerVulkan::SetupVertexArrays(BufferBindings& buffer_bindings) {
const auto& regs = maxwell3d.regs;
- for (std::size_t index = 0; index < Maxwell::NumVertexArrays; ++index) {
+ for (size_t index = 0; index < Maxwell::NumVertexArrays; ++index) {
const auto& vertex_array = regs.vertex_array[index];
if (!vertex_array.IsEnabled()) {
continue;
@@ -1011,7 +965,7 @@ void RasterizerVulkan::SetupVertexArrays(BufferBindings& buffer_bindings) {
const GPUVAddr end{regs.vertex_array_limit[index].LimitAddress()};
ASSERT(end >= start);

- const std::size_t size = end - start;
+ const size_t size = end - start;
if (size == 0) {
buffer_bindings.AddVertexBinding(DefaultBuffer(), 0, DEFAULT_BUFFER_SIZE, 0);
continue;
@@ -1072,7 +1026,7 @@
}
}

-void RasterizerVulkan::SetupGraphicsConstBuffers(const ShaderEntries& entries, std::size_t stage) {
+void RasterizerVulkan::SetupGraphicsConstBuffers(const ShaderEntries& entries, size_t stage) {
MICROPROFILE_SCOPE(Vulkan_ConstBuffers);
const auto& shader_stage = maxwell3d.state.shader_stages[stage];
for (const auto& entry : entries.const_buffers) {
@@ -1080,7 +1034,7 @@ void RasterizerVulkan::SetupGraphicsConstBuffers(const ShaderEntries& entries, s
}
}

-void RasterizerVulkan::SetupGraphicsGlobalBuffers(const ShaderEntries& entries, std::size_t stage) {
+void RasterizerVulkan::SetupGraphicsGlobalBuffers(const ShaderEntries& entries, size_t stage) {
MICROPROFILE_SCOPE(Vulkan_GlobalBuffers);
const auto& cbufs{maxwell3d.state.shader_stages[stage]};

@@ -1090,37 +1044,49 @@ void RasterizerVulkan::SetupGraphicsGlobalBuffers(const ShaderEntries& entries,
}
}

-void RasterizerVulkan::SetupGraphicsUniformTexels(const ShaderEntries& entries, std::size_t stage) {
+void RasterizerVulkan::SetupGraphicsUniformTexels(const ShaderEntries& entries, size_t stage) {
MICROPROFILE_SCOPE(Vulkan_Textures);
+ const auto& regs = maxwell3d.regs;
+ const bool via_header_index = regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex;
for (const auto& entry : entries.uniform_texels) {
- const auto image = GetTextureInfo(maxwell3d, entry, stage).tic;
- SetupUniformTexels(image, entry);
+ const TextureHandle handle = GetTextureInfo(maxwell3d, via_header_index, entry, stage);
+ image_view_indices.push_back(handle.image);
}
}

-void RasterizerVulkan::SetupGraphicsTextures(const ShaderEntries& entries, std::size_t stage) {
+void RasterizerVulkan::SetupGraphicsTextures(const ShaderEntries& entries, size_t stage) {
MICROPROFILE_SCOPE(Vulkan_Textures);
+ const auto& regs = maxwell3d.regs;
+ const bool via_header_index = regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex;
for (const auto& entry : entries.samplers) {
- for (std::size_t i = 0; i < entry.size; ++i) {
- const auto texture = GetTextureInfo(maxwell3d, entry, stage, i);
- SetupTexture(texture, entry);
+ for (size_t index = 0; index < entry.size; ++index) {
+ const TextureHandle handle =
+ GetTextureInfo(maxwell3d, via_header_index, entry, stage, index);
+ image_view_indices.push_back(handle.image);
+
+ Sampler* const sampler = texture_cache.GetGraphicsSampler(handle.sampler);
+ sampler_handles.push_back(sampler->Handle());
}
}
}

-void RasterizerVulkan::SetupGraphicsStorageTexels(const ShaderEntries& entries, std::size_t stage) {
+void RasterizerVulkan::SetupGraphicsStorageTexels(const ShaderEntries& entries, size_t stage) {
MICROPROFILE_SCOPE(Vulkan_Textures);
+ const auto& regs = maxwell3d.regs;
+ const bool via_header_index = regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex;
for (const auto& entry : entries.storage_texels) {
- const auto image = GetTextureInfo(maxwell3d, entry, stage).tic;
- SetupStorageTexel(image, entry);
+ const TextureHandle handle = GetTextureInfo(maxwell3d, via_header_index, entry, stage);
+ image_view_indices.push_back(handle.image);
}
}

-void RasterizerVulkan::SetupGraphicsImages(const ShaderEntries& entries, std::size_t stage) {
+void RasterizerVulkan::SetupGraphicsImages(const ShaderEntries& entries, size_t stage) {
MICROPROFILE_SCOPE(Vulkan_Images);
+ const auto& regs = maxwell3d.regs;
+ const bool via_header_index = regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex;
for (const auto& entry : entries.images) {
- const auto tic = GetTextureInfo(maxwell3d, entry, stage).tic;
- SetupImage(tic, entry);
+ const TextureHandle handle = GetTextureInfo(maxwell3d, via_header_index, entry, stage);
+ image_view_indices.push_back(handle.image);
}
}
@@ -1130,11 +1096,12 @@ void RasterizerVulkan::SetupComputeConstBuffers(const ShaderEntries& entries) {
for (const auto& entry : entries.const_buffers) {
const auto& config = launch_desc.const_buffer_config[entry.GetIndex()];
const std::bitset<8> mask = launch_desc.const_buffer_enable_mask.Value();
- Tegra::Engines::ConstBufferInfo buffer;
- buffer.address = config.Address();
- buffer.size = config.size;
- buffer.enabled = mask[entry.GetIndex()];
- SetupConstBuffer(entry, buffer);
+ const Tegra::Engines::ConstBufferInfo info{
+ .address = config.Address(),
+ .size = config.size,
+ .enabled = mask[entry.GetIndex()],
+ };
+ SetupConstBuffer(entry, info);
}
}

@@ -1149,35 +1116,46 @@ void RasterizerVulkan::SetupComputeGlobalBuffers(const ShaderEntries& entries) {

void RasterizerVulkan::SetupComputeUniformTexels(const ShaderEntries& entries) {
MICROPROFILE_SCOPE(Vulkan_Textures);
+ const bool via_header_index = kepler_compute.launch_description.linked_tsc;
for (const auto& entry : entries.uniform_texels) {
- const auto image = GetTextureInfo(kepler_compute, entry, ComputeShaderIndex).tic;
- SetupUniformTexels(image, entry);
+ const TextureHandle handle =
+ GetTextureInfo(kepler_compute, via_header_index, entry, COMPUTE_SHADER_INDEX);
+ image_view_indices.push_back(handle.image);
}
}

void RasterizerVulkan::SetupComputeTextures(const ShaderEntries& entries) {
MICROPROFILE_SCOPE(Vulkan_Textures);
+ const bool via_header_index = kepler_compute.launch_description.linked_tsc;
for (const auto& entry : entries.samplers) {
- for (std::size_t i = 0; i < entry.size; ++i) {
- const auto texture = GetTextureInfo(kepler_compute, entry, ComputeShaderIndex, i);
- SetupTexture(texture, entry);
+ for (size_t index = 0; index < entry.size; ++index) {
+ const TextureHandle handle = GetTextureInfo(kepler_compute, via_header_index, entry,
+ COMPUTE_SHADER_INDEX, index);
+ image_view_indices.push_back(handle.image);
+
+ Sampler* const sampler = texture_cache.GetComputeSampler(handle.sampler);
+ sampler_handles.push_back(sampler->Handle());
}
}
}

void RasterizerVulkan::SetupComputeStorageTexels(const ShaderEntries& entries) {
MICROPROFILE_SCOPE(Vulkan_Textures);
+ const bool via_header_index = kepler_compute.launch_description.linked_tsc;
for (const auto& entry : entries.storage_texels) {
- const auto image = GetTextureInfo(kepler_compute, entry, ComputeShaderIndex).tic;
- SetupStorageTexel(image, entry);
+ const TextureHandle handle =
+ GetTextureInfo(kepler_compute, via_header_index, entry, COMPUTE_SHADER_INDEX);
+ image_view_indices.push_back(handle.image);
}
}

void RasterizerVulkan::SetupComputeImages(const ShaderEntries& entries) {
MICROPROFILE_SCOPE(Vulkan_Images);
+ const bool via_header_index = kepler_compute.launch_description.linked_tsc;
for (const auto& entry : entries.images) {
- const auto tic = GetTextureInfo(kepler_compute, entry, ComputeShaderIndex).tic;
- SetupImage(tic, entry);
+ const TextureHandle handle =
+ GetTextureInfo(kepler_compute, via_header_index, entry, COMPUTE_SHADER_INDEX);
+ image_view_indices.push_back(handle.image);
}
}

@@ -1188,14 +1166,12 @@ void RasterizerVulkan::SetupConstBuffer(const ConstBufferEntry& entry,
update_descriptor_queue.AddBuffer(DefaultBuffer(), 0, DEFAULT_BUFFER_SIZE);
return;
}
- // Align the size to avoid bad std140 interactions
- const std::size_t size =
- Common::AlignUp(CalculateConstBufferSize(entry, buffer), 4 * sizeof(float));
+ const size_t size = Common::AlignUp(CalculateConstBufferSize(entry, buffer), 4 * sizeof(float));
ASSERT(size <= MaxConstbufferSize);

- const auto info =
- buffer_cache.UploadMemory(buffer.address, size, device.GetUniformBufferAlignment());
+ const u64 alignment = device.GetUniformBufferAlignment();
+ const auto info = buffer_cache.UploadMemory(buffer.address, size, alignment);
update_descriptor_queue.AddBuffer(info.handle, info.offset, size);
}

@@ -1208,7 +1184,7 @@ void RasterizerVulkan::SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAdd
// because Vulkan doesn't like empty buffers.
// Note: Do *not* use DefaultBuffer() here, storage buffers can be written breaking the
// default buffer.
- static constexpr std::size_t dummy_size = 4;
+ static constexpr size_t dummy_size = 4;
const auto info = buffer_cache.GetEmptyBuffer(dummy_size);
update_descriptor_queue.AddBuffer(info.handle, info.offset, dummy_size);
return;
@@ -1219,55 +1195,6 @@ void RasterizerVulkan::SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAdd
update_descriptor_queue.AddBuffer(info.handle, info.offset, size);
}

-void RasterizerVulkan::SetupUniformTexels(const Tegra::Texture::TICEntry& tic,
- const UniformTexelEntry& entry) {
- const auto view = texture_cache.GetTextureSurface(tic, entry);
- ASSERT(view->IsBufferView());
-
- update_descriptor_queue.AddTexelBuffer(view->GetBufferView());
-}
-
-void RasterizerVulkan::SetupTexture(const Tegra::Texture::FullTextureInfo& texture,
- const SamplerEntry& entry) {
- auto view = texture_cache.GetTextureSurface(texture.tic, entry);
- ASSERT(!view->IsBufferView());
-
- const VkImageView image_view = view->GetImageView(texture.tic.x_source, texture.tic.y_source,
- texture.tic.z_source, texture.tic.w_source);
- const auto sampler = sampler_cache.GetSampler(texture.tsc);
- update_descriptor_queue.AddSampledImage(sampler, image_view);
-
- VkImageLayout* const image_layout = update_descriptor_queue.LastImageLayout();
- *image_layout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
- sampled_views.push_back(ImageView{std::move(view), image_layout});
-}
-
-void RasterizerVulkan::SetupStorageTexel(const Tegra::Texture::TICEntry& tic,
- const StorageTexelEntry& entry) {
- const auto view = texture_cache.GetImageSurface(tic, entry);
- ASSERT(view->IsBufferView());
-
- update_descriptor_queue.AddTexelBuffer(view->GetBufferView());
-}
-
-void RasterizerVulkan::SetupImage(const Tegra::Texture::TICEntry& tic, const ImageEntry& entry) {
- auto view = texture_cache.GetImageSurface(tic, entry);
-
- if (entry.is_written) {
- view->MarkAsModified(texture_cache.Tick());
- }
-
- UNIMPLEMENTED_IF(tic.IsBuffer());
-
- const VkImageView image_view =
- view->GetImageView(tic.x_source, tic.y_source, tic.z_source, tic.w_source);
- update_descriptor_queue.AddImage(image_view);
-
- VkImageLayout* const image_layout = update_descriptor_queue.LastImageLayout();
- *image_layout = VK_IMAGE_LAYOUT_GENERAL;
- image_views.push_back(ImageView{std::move(view), image_layout});
-}
-
void RasterizerVulkan::UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs) {
if (!state_tracker.TouchViewports()) {
return;
@@ -1418,16 +1345,6 @@ void RasterizerVulkan::UpdateFrontFace(Tegra::Engines::Maxwell3D::Regs& regs) {
[front_face](vk::CommandBuffer cmdbuf) { cmdbuf.SetFrontFaceEXT(front_face); });
}

-void RasterizerVulkan::UpdatePrimitiveTopology(Tegra::Engines::Maxwell3D::Regs& regs) {
- const Maxwell::PrimitiveTopology primitive_topology = regs.draw.topology.Value();
- if (!state_tracker.ChangePrimitiveTopology(primitive_topology)) {
- return;
- }
- scheduler.Record([this, primitive_topology](vk::CommandBuffer cmdbuf) {
- cmdbuf.SetPrimitiveTopologyEXT(MaxwellToVK::PrimitiveTopology(device, primitive_topology));
- });
-}
-
void RasterizerVulkan::UpdateStencilOp(Tegra::Engines::Maxwell3D::Regs& regs) {
if (!state_tracker.TouchStencilOp()) {
return;
@@ -1469,8 +1386,8 @@ void RasterizerVulkan::UpdateStencilTestEnable(Tegra::Engines::Maxwell3D::Regs&
});
}

-std::size_t RasterizerVulkan::CalculateGraphicsStreamBufferSize(bool is_indexed) const {
- std::size_t size = CalculateVertexArraysSize();
+size_t RasterizerVulkan::CalculateGraphicsStreamBufferSize(bool is_indexed) const {
+ size_t size = CalculateVertexArraysSize();
if (is_indexed) {
size = Common::AlignUp(size, 4) + CalculateIndexBufferSize();
}
@@ -1478,15 +1395,15 @@ std::size_t RasterizerVulkan::CalculateGraphicsStreamBufferSize(bool is_indexed)
return size;
}

-std::size_t RasterizerVulkan::CalculateComputeStreamBufferSize() const {
+size_t RasterizerVulkan::CalculateComputeStreamBufferSize() const {
return Tegra::Engines::KeplerCompute::NumConstBuffers *
(Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment());
}

-std::size_t RasterizerVulkan::CalculateVertexArraysSize() const {
+size_t RasterizerVulkan::CalculateVertexArraysSize() const {
const auto& regs = maxwell3d.regs;

- std::size_t size = 0;
+ size_t size = 0;
for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) {
// This implementation assumes that all attributes are used in the shader.
const GPUVAddr start{regs.vertex_array[index].StartAddress()};
@@ -1498,12 +1415,12 @@ std::size_t RasterizerVulkan::CalculateVertexArraysSize() const {
return size;
}

-std::size_t RasterizerVulkan::CalculateIndexBufferSize() const {
- return static_cast<std::size_t>(maxwell3d.regs.index_array.count) *
- static_cast<std::size_t>(maxwell3d.regs.index_array.FormatSizeInBytes());
+size_t RasterizerVulkan::CalculateIndexBufferSize() const {
+ return static_cast<size_t>(maxwell3d.regs.index_array.count) *
+ static_cast<size_t>(maxwell3d.regs.index_array.FormatSizeInBytes());
}

-std::size_t RasterizerVulkan::CalculateConstBufferSize(
+size_t RasterizerVulkan::CalculateConstBufferSize(
const ConstBufferEntry& entry, const Tegra::Engines::ConstBufferInfo& buffer) const {
if (entry.IsIndirect()) {
// Buffer is accessed indirectly, so upload the entire thing
@@ -1514,37 +1431,10 @@ std::size_t RasterizerVulkan::CalculateConstBufferSize(
}
}

-RenderPassParams RasterizerVulkan::GetRenderPassParams(Texceptions texceptions) const {
- const auto& regs = maxwell3d.regs;
- const std::size_t num_attachments = static_cast<std::size_t>(regs.rt_control.count);
-
- RenderPassParams params;
- params.color_formats = {};
- std::size_t color_texceptions = 0;
-
- std::size_t index = 0;
- for (std::size_t rt = 0; rt < num_attachments; ++rt) {
- const auto& rendertarget = regs.rt[rt];
- if (rendertarget.Address() == 0 || rendertarget.format == Tegra::RenderTargetFormat::NONE) {
- continue;
- }
- params.color_formats[index] = static_cast<u8>(rendertarget.format);
- color_texceptions |= (texceptions[rt] ? 1ULL : 0ULL) << index;
- ++index;
- }
- params.num_color_attachments = static_cast<u8>(index);
- params.texceptions = static_cast<u8>(color_texceptions);
-
- params.zeta_format = regs.zeta_enable ? static_cast<u8>(regs.zeta.format) : 0;
- params.zeta_texception = texceptions[ZETA_TEXCEPTION_INDEX];
- return params;
-}
-
VkBuffer RasterizerVulkan::DefaultBuffer() {
if (default_buffer) {
return *default_buffer;
}
-
default_buffer = device.GetLogical().CreateBuffer({
.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
.pNext = nullptr,
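Note: with GetRenderPassParams() removed, render-pass compatibility now travels with the framebuffer the texture cache returns rather than being derived from render-target formats on every draw. A stripped-down stand-in for that interface, illustrative only and not the real class:

#include <vulkan/vulkan.h>

// The framebuffer is the single source of truth for the render pass, the
// render area, and the number of color buffers the pipeline key needs.
class Framebuffer {
public:
    VkRenderPass RenderPass() const noexcept { return renderpass; }
    VkExtent2D RenderArea() const noexcept { return render_area; }
    unsigned NumColorBuffers() const noexcept { return num_color_buffers; }

private:
    VkRenderPass renderpass = VK_NULL_HANDLE; // created alongside the framebuffer
    VkExtent2D render_area{};
    unsigned num_color_buffers = 0;
};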
diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h
index b47c8fc13..4695718e9 100644
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@@ -11,11 +11,11 @@
#include <vector>

#include <boost/container/static_vector.hpp>
-#include <boost/functional/hash.hpp>

#include "common/common_types.h"
#include "video_core/rasterizer_accelerated.h"
#include "video_core/rasterizer_interface.h"
+#include "video_core/renderer_vulkan/blit_image.h"
#include "video_core/renderer_vulkan/fixed_pipeline_state.h"
#include "video_core/renderer_vulkan/vk_buffer_cache.h"
#include "video_core/renderer_vulkan/vk_compute_pass.h"
@@ -24,14 +24,13 @@
#include "video_core/renderer_vulkan/vk_memory_manager.h"
#include "video_core/renderer_vulkan/vk_pipeline_cache.h"
#include "video_core/renderer_vulkan/vk_query_cache.h"
-#include "video_core/renderer_vulkan/vk_renderpass_cache.h"
-#include "video_core/renderer_vulkan/vk_sampler_cache.h"
#include "video_core/renderer_vulkan/vk_scheduler.h"
#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
+#include "video_core/renderer_vulkan/vk_stream_buffer.h"
#include "video_core/renderer_vulkan/vk_texture_cache.h"
#include "video_core/renderer_vulkan/vk_update_descriptor.h"
-#include "video_core/renderer_vulkan/wrapper.h"
#include "video_core/shader/async_shaders.h"
+#include "video_core/vulkan_common/vulkan_wrapper.h"

namespace Core {
class System;
@@ -49,67 +48,16 @@ namespace Vulkan {

struct VKScreenInfo;

-using ImageViewsPack = boost::container::static_vector<VkImageView, Maxwell::NumRenderTargets + 1>;
-
-struct FramebufferCacheKey {
- VkRenderPass renderpass{};
- u32 width = 0;
- u32 height = 0;
- u32 layers = 0;
- ImageViewsPack views;
-
- std::size_t Hash() const noexcept {
- std::size_t hash = 0;
- boost::hash_combine(hash, static_cast<VkRenderPass>(renderpass));
- for (const auto& view : views) {
- boost::hash_combine(hash, static_cast<VkImageView>(view));
- }
- boost::hash_combine(hash, width);
- boost::hash_combine(hash, height);
- boost::hash_combine(hash, layers);
- return hash;
- }
-
- bool operator==(const FramebufferCacheKey& rhs) const noexcept {
- return std::tie(renderpass, views, width, height, layers) ==
- std::tie(rhs.renderpass, rhs.views, rhs.width, rhs.height, rhs.layers);
- }
-
- bool operator!=(const FramebufferCacheKey& rhs) const noexcept {
- return !operator==(rhs);
- }
-};
-
-} // namespace Vulkan
-
-namespace std {
-
-template <>
-struct hash<Vulkan::FramebufferCacheKey> {
- std::size_t operator()(const Vulkan::FramebufferCacheKey& k) const noexcept {
- return k.Hash();
- }
-};
-
-} // namespace std
-
-namespace Vulkan {
-
class StateTracker;
class BufferBindings;

-struct ImageView {
- View view;
- VkImageLayout* layout = nullptr;
-};
-
class RasterizerVulkan final : public VideoCore::RasterizerAccelerated {
public:
- explicit RasterizerVulkan(Core::Frontend::EmuWindow& emu_window, Tegra::GPU& gpu,
- Tegra::MemoryManager& gpu_memory, Core::Memory::Memory& cpu_memory,
- VKScreenInfo& screen_info, const VKDevice& device,
- VKMemoryManager& memory_manager, StateTracker& state_tracker,
- VKScheduler& scheduler);
+ explicit RasterizerVulkan(Core::Frontend::EmuWindow& emu_window_, Tegra::GPU& gpu_,
+ Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_,
+ VKScreenInfo& screen_info_, const Device& device_,
+ VKMemoryManager& memory_manager_, StateTracker& state_tracker_,
+ VKScheduler& scheduler_);
~RasterizerVulkan() override;

void Draw(bool is_indexed, bool is_instanced) override;
@@ -123,15 +71,18 @@ public:
void InvalidateRegion(VAddr addr, u64 size) override;
void OnCPUWrite(VAddr addr, u64 size) override;
void SyncGuestHost() override;
+ void UnmapMemory(VAddr addr, u64 size) override;
void SignalSemaphore(GPUVAddr addr, u32 value) override;
void SignalSyncPoint(u32 value) override;
void ReleaseFences() override;
void FlushAndInvalidateRegion(VAddr addr, u64 size) override;
void WaitForIdle() override;
+ void FragmentBarrier() override;
+ void TiledCacheBarrier() override;
void FlushCommands() override;
void TickFrame() override;
- bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src,
- const Tegra::Engines::Fermi2D::Regs::Surface& dst,
+ bool AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src,
+ const Tegra::Engines::Fermi2D::Surface& dst,
const Tegra::Engines::Fermi2D::Config& copy_config) override;
bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr,
u32 pixel_stride) override;
@@ -145,11 +96,17 @@ public:
}

/// Maximum supported size that a constbuffer can have in bytes.
- static constexpr std::size_t MaxConstbufferSize = 0x10000;
+ static constexpr size_t MaxConstbufferSize = 0x10000;
static_assert(MaxConstbufferSize % (4 * sizeof(float)) == 0,
"The maximum size of a constbuffer must be a multiple of the size of GLvec4");

private:
+ static constexpr size_t MAX_TEXTURES = 192;
+ static constexpr size_t MAX_IMAGES = 48;
+ static constexpr size_t MAX_IMAGE_VIEWS = MAX_TEXTURES + MAX_IMAGES;
+
+ static constexpr VkDeviceSize DEFAULT_BUFFER_SIZE = 4 * sizeof(float);
+
struct DrawParameters {
void Draw(vk::CommandBuffer cmdbuf) const;

@@ -160,20 +117,8 @@ private:
bool is_indexed = 0;
};

- using Texceptions = std::bitset<Maxwell::NumRenderTargets + 1>;
-
- static constexpr std::size_t ZETA_TEXCEPTION_INDEX = 8;
- static constexpr VkDeviceSize DEFAULT_BUFFER_SIZE = 4 * sizeof(float);
-
void FlushWork();

- /// @brief Updates the currently bound attachments
- /// @param is_clear True when the framebuffer is updated as a clear
- /// @return Bitfield of attachments being used as sampled textures
- Texceptions UpdateAttachments(bool is_clear);
-
- std::tuple<VkFramebuffer, VkExtent2D> ConfigureFramebuffers(VkRenderPass renderpass);
-
/// Setups geometry buffers and state.
DrawParameters SetupGeometry(FixedPipelineState& fixed_state, BufferBindings& buffer_bindings,
bool is_indexed, bool is_instanced);
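Note: DrawParameters captures a draw so it can be recorded later on the scheduler's worker thread. A plain-Vulkan rendition of what such a captured draw ends up recording; the field set beyond the is_indexed flag visible above is an assumption for illustration:

#include <cstdint>
#include <vulkan/vulkan.h>

struct CapturedDraw {
    uint32_t num_vertices = 0;
    uint32_t num_instances = 1;
    int32_t base_vertex = 0;
    bool is_indexed = false;

    void Record(VkCommandBuffer cmdbuf) const {
        if (is_indexed) {
            // vertexOffset (base_vertex) is added to every fetched index.
            vkCmdDrawIndexed(cmdbuf, num_vertices, num_instances, 0, base_vertex, 0);
        } else {
            vkCmdDraw(cmdbuf, num_vertices, num_instances,
                      static_cast<uint32_t>(base_vertex), 0);
        }
    }
};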
void SetupShaderDescriptors(const std::array<Shader*, Maxwell::MaxShaderProgram>& shaders); - void SetupImageTransitions(Texceptions texceptions, - const std::array<View, Maxwell::NumRenderTargets>& color_attachments, - const View& zeta_attachment); - void UpdateDynamicStates(); void BeginTransformFeedback(); void EndTransformFeedback(); - bool WalkAttachmentOverlaps(const CachedSurfaceView& attachment); - void SetupVertexArrays(BufferBindings& buffer_bindings); void SetupIndexBuffer(BufferBindings& buffer_bindings, DrawParameters& params, bool is_indexed); @@ -238,14 +177,6 @@ private: void SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAddr address); - void SetupUniformTexels(const Tegra::Texture::TICEntry& image, const UniformTexelEntry& entry); - - void SetupTexture(const Tegra::Texture::FullTextureInfo& texture, const SamplerEntry& entry); - - void SetupStorageTexel(const Tegra::Texture::TICEntry& tic, const StorageTexelEntry& entry); - - void SetupImage(const Tegra::Texture::TICEntry& tic, const ImageEntry& entry); - void UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs); void UpdateScissorsState(Tegra::Engines::Maxwell3D::Regs& regs); void UpdateDepthBias(Tegra::Engines::Maxwell3D::Regs& regs); @@ -259,22 +190,19 @@ private: void UpdateDepthWriteEnable(Tegra::Engines::Maxwell3D::Regs& regs); void UpdateDepthCompareOp(Tegra::Engines::Maxwell3D::Regs& regs); void UpdateFrontFace(Tegra::Engines::Maxwell3D::Regs& regs); - void UpdatePrimitiveTopology(Tegra::Engines::Maxwell3D::Regs& regs); void UpdateStencilOp(Tegra::Engines::Maxwell3D::Regs& regs); void UpdateStencilTestEnable(Tegra::Engines::Maxwell3D::Regs& regs); - std::size_t CalculateGraphicsStreamBufferSize(bool is_indexed) const; - - std::size_t CalculateComputeStreamBufferSize() const; + size_t CalculateGraphicsStreamBufferSize(bool is_indexed) const; - std::size_t CalculateVertexArraysSize() const; + size_t CalculateComputeStreamBufferSize() const; - std::size_t CalculateIndexBufferSize() const; + size_t CalculateVertexArraysSize() const; - std::size_t CalculateConstBufferSize(const ConstBufferEntry& entry, - const Tegra::Engines::ConstBufferInfo& buffer) const; + size_t CalculateIndexBufferSize() const; - RenderPassParams GetRenderPassParams(Texceptions texceptions) const; + size_t CalculateConstBufferSize(const ConstBufferEntry& entry, + const Tegra::Engines::ConstBufferInfo& buffer) const; VkBuffer DefaultBuffer(); @@ -284,23 +212,24 @@ private: Tegra::Engines::KeplerCompute& kepler_compute; VKScreenInfo& screen_info; - const VKDevice& device; + const Device& device; VKMemoryManager& memory_manager; StateTracker& state_tracker; VKScheduler& scheduler; + VKStreamBuffer stream_buffer; VKStagingBufferPool staging_pool; VKDescriptorPool descriptor_pool; VKUpdateDescriptorQueue update_descriptor_queue; - VKRenderPassCache renderpass_cache; + BlitImageHelper blit_image; QuadArrayPass quad_array_pass; QuadIndexedPass quad_indexed_pass; Uint8Pass uint8_pass; - VKTextureCache texture_cache; + TextureCacheRuntime texture_cache_runtime; + TextureCache texture_cache; VKPipelineCache pipeline_cache; VKBufferCache buffer_cache; - VKSamplerCache sampler_cache; VKQueryCache query_cache; VKFenceManager fence_manager; @@ -309,16 +238,11 @@ private: vk::Event wfi_event; VideoCommon::Shader::AsyncShaders async_shaders; - std::array<View, Maxwell::NumRenderTargets> color_attachments; - View zeta_attachment; - - std::vector<ImageView> sampled_views; - std::vector<ImageView> image_views; + boost::container::static_vector<u32, 
MAX_IMAGE_VIEWS> image_view_indices; + std::array<VideoCommon::ImageViewId, MAX_IMAGE_VIEWS> image_view_ids; + boost::container::static_vector<VkSampler, MAX_TEXTURES> sampler_handles; u32 draw_counter = 0; - - // TODO(Rodrigo): Invalidate on image destruction - std::unordered_map<FramebufferCacheKey, vk::Framebuffer> framebuffer_cache; }; } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_renderpass_cache.cpp b/src/video_core/renderer_vulkan/vk_renderpass_cache.cpp deleted file mode 100644 index 80284cf92..000000000 --- a/src/video_core/renderer_vulkan/vk_renderpass_cache.cpp +++ /dev/null @@ -1,158 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include <cstring> -#include <memory> -#include <vector> - -#include "common/cityhash.h" -#include "video_core/engines/maxwell_3d.h" -#include "video_core/renderer_vulkan/maxwell_to_vk.h" -#include "video_core/renderer_vulkan/vk_device.h" -#include "video_core/renderer_vulkan/vk_renderpass_cache.h" -#include "video_core/renderer_vulkan/wrapper.h" - -namespace Vulkan { - -std::size_t RenderPassParams::Hash() const noexcept { - const u64 hash = Common::CityHash64(reinterpret_cast<const char*>(this), sizeof *this); - return static_cast<std::size_t>(hash); -} - -bool RenderPassParams::operator==(const RenderPassParams& rhs) const noexcept { - return std::memcmp(&rhs, this, sizeof *this) == 0; -} - -VKRenderPassCache::VKRenderPassCache(const VKDevice& device) : device{device} {} - -VKRenderPassCache::~VKRenderPassCache() = default; - -VkRenderPass VKRenderPassCache::GetRenderPass(const RenderPassParams& params) { - const auto [pair, is_cache_miss] = cache.try_emplace(params); - auto& entry = pair->second; - if (is_cache_miss) { - entry = CreateRenderPass(params); - } - return *entry; -} - -vk::RenderPass VKRenderPassCache::CreateRenderPass(const RenderPassParams& params) const { - using namespace VideoCore::Surface; - const std::size_t num_attachments = static_cast<std::size_t>(params.num_color_attachments); - - std::vector<VkAttachmentDescription> descriptors; - descriptors.reserve(num_attachments); - - std::vector<VkAttachmentReference> color_references; - color_references.reserve(num_attachments); - - for (std::size_t rt = 0; rt < num_attachments; ++rt) { - const auto guest_format = static_cast<Tegra::RenderTargetFormat>(params.color_formats[rt]); - const PixelFormat pixel_format = PixelFormatFromRenderTargetFormat(guest_format); - const auto format = MaxwellToVK::SurfaceFormat(device, FormatType::Optimal, pixel_format); - ASSERT_MSG(format.attachable, "Trying to attach a non-attachable format with format={}", - static_cast<int>(pixel_format)); - - // TODO(Rodrigo): Add MAY_ALIAS_BIT when it's needed. - const VkImageLayout color_layout = ((params.texceptions >> rt) & 1) != 0 - ? 
VK_IMAGE_LAYOUT_GENERAL - : VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL; - descriptors.push_back({ - .flags = VK_ATTACHMENT_DESCRIPTION_MAY_ALIAS_BIT, - .format = format.format, - .samples = VK_SAMPLE_COUNT_1_BIT, - .loadOp = VK_ATTACHMENT_LOAD_OP_LOAD, - .storeOp = VK_ATTACHMENT_STORE_OP_STORE, - .stencilLoadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE, - .stencilStoreOp = VK_ATTACHMENT_STORE_OP_DONT_CARE, - .initialLayout = color_layout, - .finalLayout = color_layout, - }); - - color_references.push_back({ - .attachment = static_cast<u32>(rt), - .layout = color_layout, - }); - } - - VkAttachmentReference zeta_attachment_ref; - const bool has_zeta = params.zeta_format != 0; - if (has_zeta) { - const auto guest_format = static_cast<Tegra::DepthFormat>(params.zeta_format); - const PixelFormat pixel_format = PixelFormatFromDepthFormat(guest_format); - const auto format = MaxwellToVK::SurfaceFormat(device, FormatType::Optimal, pixel_format); - ASSERT_MSG(format.attachable, "Trying to attach a non-attachable format with format={}", - static_cast<int>(pixel_format)); - - const VkImageLayout zeta_layout = params.zeta_texception != 0 - ? VK_IMAGE_LAYOUT_GENERAL - : VK_IMAGE_LAYOUT_DEPTH_STENCIL_ATTACHMENT_OPTIMAL; - descriptors.push_back({ - .flags = 0, - .format = format.format, - .samples = VK_SAMPLE_COUNT_1_BIT, - .loadOp = VK_ATTACHMENT_LOAD_OP_LOAD, - .storeOp = VK_ATTACHMENT_STORE_OP_STORE, - .stencilLoadOp = VK_ATTACHMENT_LOAD_OP_LOAD, - .stencilStoreOp = VK_ATTACHMENT_STORE_OP_STORE, - .initialLayout = zeta_layout, - .finalLayout = zeta_layout, - }); - - zeta_attachment_ref = { - .attachment = static_cast<u32>(num_attachments), - .layout = zeta_layout, - }; - } - - const VkSubpassDescription subpass_description{ - .flags = 0, - .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS, - .inputAttachmentCount = 0, - .pInputAttachments = nullptr, - .colorAttachmentCount = static_cast<u32>(color_references.size()), - .pColorAttachments = color_references.data(), - .pResolveAttachments = nullptr, - .pDepthStencilAttachment = has_zeta ? 
&zeta_attachment_ref : nullptr, - .preserveAttachmentCount = 0, - .pPreserveAttachments = nullptr, - }; - - VkAccessFlags access = 0; - VkPipelineStageFlags stage = 0; - if (!color_references.empty()) { - access |= VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT; - stage |= VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT; - } - - if (has_zeta) { - access |= VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | - VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; - stage |= VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT; - } - - const VkSubpassDependency subpass_dependency{ - .srcSubpass = VK_SUBPASS_EXTERNAL, - .dstSubpass = 0, - .srcStageMask = stage, - .dstStageMask = stage, - .srcAccessMask = 0, - .dstAccessMask = access, - .dependencyFlags = 0, - }; - - return device.GetLogical().CreateRenderPass({ - .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO, - .pNext = nullptr, - .flags = 0, - .attachmentCount = static_cast<u32>(descriptors.size()), - .pAttachments = descriptors.data(), - .subpassCount = 1, - .pSubpasses = &subpass_description, - .dependencyCount = 1, - .pDependencies = &subpass_dependency, - }); -} - -} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_renderpass_cache.h b/src/video_core/renderer_vulkan/vk_renderpass_cache.h deleted file mode 100644 index 8b0fec720..000000000 --- a/src/video_core/renderer_vulkan/vk_renderpass_cache.h +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include <type_traits> -#include <unordered_map> - -#include <boost/container/static_vector.hpp> -#include <boost/functional/hash.hpp> - -#include "video_core/engines/maxwell_3d.h" -#include "video_core/renderer_vulkan/wrapper.h" -#include "video_core/surface.h" - -namespace Vulkan { - -class VKDevice; - -struct RenderPassParams { - std::array<u8, Tegra::Engines::Maxwell3D::Regs::NumRenderTargets> color_formats; - u8 num_color_attachments; - u8 texceptions; - - u8 zeta_format; - u8 zeta_texception; - - std::size_t Hash() const noexcept; - - bool operator==(const RenderPassParams& rhs) const noexcept; - - bool operator!=(const RenderPassParams& rhs) const noexcept { - return !operator==(rhs); - } -}; -static_assert(std::has_unique_object_representations_v<RenderPassParams>); -static_assert(std::is_trivially_copyable_v<RenderPassParams>); -static_assert(std::is_trivially_constructible_v<RenderPassParams>); - -} // namespace Vulkan - -namespace std { - -template <> -struct hash<Vulkan::RenderPassParams> { - std::size_t operator()(const Vulkan::RenderPassParams& k) const noexcept { - return k.Hash(); - } -}; - -} // namespace std - -namespace Vulkan { - -class VKRenderPassCache final { -public: - explicit VKRenderPassCache(const VKDevice& device); - ~VKRenderPassCache(); - - VkRenderPass GetRenderPass(const RenderPassParams& params); - -private: - vk::RenderPass CreateRenderPass(const RenderPassParams& params) const; - - const VKDevice& device; - std::unordered_map<RenderPassParams, vk::RenderPass> cache; -}; - -} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_sampler_cache.cpp b/src/video_core/renderer_vulkan/vk_sampler_cache.cpp deleted file mode 100644 index b068888f9..000000000 --- a/src/video_core/renderer_vulkan/vk_sampler_cache.cpp +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. 
- -#include <unordered_map> - -#include "video_core/renderer_vulkan/maxwell_to_vk.h" -#include "video_core/renderer_vulkan/vk_sampler_cache.h" -#include "video_core/renderer_vulkan/wrapper.h" -#include "video_core/textures/texture.h" - -using Tegra::Texture::TextureMipmapFilter; - -namespace Vulkan { - -namespace { - -VkBorderColor ConvertBorderColor(std::array<float, 4> color) { - // TODO(Rodrigo): Manage integer border colors - if (color == std::array<float, 4>{0, 0, 0, 0}) { - return VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK; - } else if (color == std::array<float, 4>{0, 0, 0, 1}) { - return VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK; - } else if (color == std::array<float, 4>{1, 1, 1, 1}) { - return VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE; - } - if (color[0] + color[1] + color[2] > 1.35f) { - // If color elements are brighter than roughly 0.5 average, use white border - return VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE; - } else if (color[3] > 0.5f) { - return VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK; - } else { - return VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK; - } -} - -} // Anonymous namespace - -VKSamplerCache::VKSamplerCache(const VKDevice& device) : device{device} {} - -VKSamplerCache::~VKSamplerCache() = default; - -vk::Sampler VKSamplerCache::CreateSampler(const Tegra::Texture::TSCEntry& tsc) const { - const bool arbitrary_borders = device.IsExtCustomBorderColorSupported(); - const std::array color = tsc.GetBorderColor(); - - VkSamplerCustomBorderColorCreateInfoEXT border{ - .sType = VK_STRUCTURE_TYPE_SAMPLER_CUSTOM_BORDER_COLOR_CREATE_INFO_EXT, - .pNext = nullptr, - .customBorderColor = {}, - .format = VK_FORMAT_UNDEFINED, - }; - std::memcpy(&border.customBorderColor, color.data(), sizeof(color)); - - return device.GetLogical().CreateSampler({ - .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO, - .pNext = arbitrary_borders ? &border : nullptr, - .flags = 0, - .magFilter = MaxwellToVK::Sampler::Filter(tsc.mag_filter), - .minFilter = MaxwellToVK::Sampler::Filter(tsc.min_filter), - .mipmapMode = MaxwellToVK::Sampler::MipmapMode(tsc.mipmap_filter), - .addressModeU = MaxwellToVK::Sampler::WrapMode(device, tsc.wrap_u, tsc.mag_filter), - .addressModeV = MaxwellToVK::Sampler::WrapMode(device, tsc.wrap_v, tsc.mag_filter), - .addressModeW = MaxwellToVK::Sampler::WrapMode(device, tsc.wrap_p, tsc.mag_filter), - .mipLodBias = tsc.GetLodBias(), - .anisotropyEnable = - static_cast<VkBool32>(tsc.GetMaxAnisotropy() > 1.0f ? VK_TRUE : VK_FALSE), - .maxAnisotropy = tsc.GetMaxAnisotropy(), - .compareEnable = tsc.depth_compare_enabled, - .compareOp = MaxwellToVK::Sampler::DepthCompareFunction(tsc.depth_compare_func), - .minLod = tsc.mipmap_filter == TextureMipmapFilter::None ? 0.0f : tsc.GetMinLod(), - .maxLod = tsc.mipmap_filter == TextureMipmapFilter::None ? 0.25f : tsc.GetMaxLod(), - .borderColor = - arbitrary_borders ? VK_BORDER_COLOR_INT_CUSTOM_EXT : ConvertBorderColor(color), - .unnormalizedCoordinates = VK_FALSE, - }); -} - -VkSampler VKSamplerCache::ToSamplerType(const vk::Sampler& sampler) const { - return *sampler; -} - -} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_sampler_cache.h b/src/video_core/renderer_vulkan/vk_sampler_cache.h deleted file mode 100644 index a33d1c0ee..000000000 --- a/src/video_core/renderer_vulkan/vk_sampler_cache.h +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. 
- -#pragma once - -#include "video_core/renderer_vulkan/wrapper.h" -#include "video_core/sampler_cache.h" -#include "video_core/textures/texture.h" - -namespace Vulkan { - -class VKDevice; - -class VKSamplerCache final : public VideoCommon::SamplerCache<VkSampler, vk::Sampler> { -public: - explicit VKSamplerCache(const VKDevice& device); - ~VKSamplerCache(); - -protected: - vk::Sampler CreateSampler(const Tegra::Texture::TSCEntry& tsc) const override; - - VkSampler ToSamplerType(const vk::Sampler& sampler) const override; - -private: - const VKDevice& device; -}; - -} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index 1a483dc71..66004f9c0 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp @@ -11,12 +11,13 @@ #include "common/microprofile.h" #include "common/thread.h" #include "video_core/renderer_vulkan/vk_command_pool.h" -#include "video_core/renderer_vulkan/vk_device.h" #include "video_core/renderer_vulkan/vk_master_semaphore.h" #include "video_core/renderer_vulkan/vk_query_cache.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_state_tracker.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/renderer_vulkan/vk_texture_cache.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { @@ -36,7 +37,7 @@ void VKScheduler::CommandChunk::ExecuteAll(vk::CommandBuffer cmdbuf) { last = nullptr; } -VKScheduler::VKScheduler(const VKDevice& device_, StateTracker& state_tracker_) +VKScheduler::VKScheduler(const Device& device_, StateTracker& state_tracker_) : device{device_}, state_tracker{state_tracker_}, master_semaphore{std::make_unique<MasterSemaphore>(device)}, command_pool{std::make_unique<CommandPool>(*master_semaphore, device)} { @@ -96,38 +97,39 @@ void VKScheduler::DispatchWork() { AcquireNewChunk(); } -void VKScheduler::RequestRenderpass(VkRenderPass renderpass, VkFramebuffer framebuffer, - VkExtent2D render_area) { - if (renderpass == state.renderpass && framebuffer == state.framebuffer && +void VKScheduler::RequestRenderpass(const Framebuffer* framebuffer) { + const VkRenderPass renderpass = framebuffer->RenderPass(); + const VkFramebuffer framebuffer_handle = framebuffer->Handle(); + const VkExtent2D render_area = framebuffer->RenderArea(); + if (renderpass == state.renderpass && framebuffer_handle == state.framebuffer && render_area.width == state.render_area.width && render_area.height == state.render_area.height) { return; } - const bool end_renderpass = state.renderpass != nullptr; + EndRenderPass(); state.renderpass = renderpass; - state.framebuffer = framebuffer; + state.framebuffer = framebuffer_handle; state.render_area = render_area; - const VkRenderPassBeginInfo renderpass_bi{ - .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO, - .pNext = nullptr, - .renderPass = renderpass, - .framebuffer = framebuffer, - .renderArea = - { - .offset = {.x = 0, .y = 0}, - .extent = render_area, - }, - .clearValueCount = 0, - .pClearValues = nullptr, - }; - - Record([renderpass_bi, end_renderpass](vk::CommandBuffer cmdbuf) { - if (end_renderpass) { - cmdbuf.EndRenderPass(); - } + Record([renderpass, framebuffer_handle, render_area](vk::CommandBuffer cmdbuf) { + const VkRenderPassBeginInfo renderpass_bi{ + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO, + .pNext = nullptr, + .renderPass = renderpass, + 
.framebuffer = framebuffer_handle, + .renderArea = + { + .offset = {.x = 0, .y = 0}, + .extent = render_area, + }, + .clearValueCount = 0, + .pClearValues = nullptr, + }; cmdbuf.BeginRenderPass(renderpass_bi, VK_SUBPASS_CONTENTS_INLINE); }); + num_renderpass_images = framebuffer->NumImages(); + renderpass_images = framebuffer->Images(); + renderpass_image_ranges = framebuffer->ImageRanges(); } void VKScheduler::RequestOutsideRenderPassOperationContext() { @@ -241,8 +243,37 @@ void VKScheduler::EndRenderPass() { if (!state.renderpass) { return; } + Record([num_images = num_renderpass_images, images = renderpass_images, + ranges = renderpass_image_ranges](vk::CommandBuffer cmdbuf) { + std::array<VkImageMemoryBarrier, 9> barriers; + for (size_t i = 0; i < num_images; ++i) { + barriers[i] = VkImageMemoryBarrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, + .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT | + VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | + VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, + .oldLayout = VK_IMAGE_LAYOUT_GENERAL, + .newLayout = VK_IMAGE_LAYOUT_GENERAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = images[i], + .subresourceRange = ranges[i], + }; + } + cmdbuf.EndRenderPass(); + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | + VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT | + VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, + VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT, 0, nullptr, nullptr, + vk::Span(barriers.data(), num_images)); + }); state.renderpass = nullptr; - Record([](vk::CommandBuffer cmdbuf) { cmdbuf.EndRenderPass(); }); + num_renderpass_images = 0; } void VKScheduler::AcquireNewChunk() { diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h index 7be8a19f0..4cd43e425 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_scheduler.h @@ -12,21 +12,22 @@ #include <utility> #include "common/common_types.h" #include "common/threadsafe_queue.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { class CommandPool; +class Device; +class Framebuffer; class MasterSemaphore; class StateTracker; -class VKDevice; class VKQueryCache; /// The scheduler abstracts command buffer and fence management with an interface that's able to do /// OpenGL-like operations on Vulkan command buffers. class VKScheduler { public: - explicit VKScheduler(const VKDevice& device, StateTracker& state_tracker); + explicit VKScheduler(const Device& device, StateTracker& state_tracker); ~VKScheduler(); /// Returns the current command buffer tick. @@ -52,8 +53,7 @@ public: void DispatchWork(); /// Requests to begin a renderpass. - void RequestRenderpass(VkRenderPass renderpass, VkFramebuffer framebuffer, - VkExtent2D render_area); + void RequestRenderpass(const Framebuffer* framebuffer); /// Requests the current execution context to be able to execute operations only allowed outside /// of a renderpass. void RequestOutsideRenderPassOperationContext(); @@ -62,6 +62,9 @@ /// Binds a pipeline to the current execution context. 
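The EndRenderPass rework above also illustrates the scheduler's core pattern: nothing touches a command buffer immediately; work is captured in a lambda through Record and replayed later on the worker thread. A minimal caller-side sketch of that pattern, assuming a Draw wrapper that mirrors vkCmdDraw (hypothetical here, not part of this diff):

    // Deferred recording: the lambda is queued now and runs later on the
    // scheduler's worker thread against the real command buffer.
    scheduler.RequestRenderpass(framebuffer);
    scheduler.Record([num_vertices](vk::CommandBuffer cmdbuf) {
        cmdbuf.Draw(num_vertices, 1, 0, 0); // assumed wrapper over vkCmdDraw
    });
    scheduler.DispatchWork(); // hands the recorded chunk to the worker thread
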
void BindGraphicsPipeline(VkPipeline pipeline); + /// Invalidates current command buffer state except for render passes + void InvalidateState(); + /// Assigns the query cache. void SetQueryCache(VKQueryCache& query_cache_) { query_cache = &query_cache_; @@ -104,7 +107,7 @@ private: template <typename T> class TypedCommand final : public Command { public: - explicit TypedCommand(T&& command) : command{std::move(command)} {} + explicit TypedCommand(T&& command_) : command{std::move(command_)} {} ~TypedCommand() override = default; TypedCommand(TypedCommand&&) = delete; @@ -170,15 +173,13 @@ private: void AllocateNewContext(); - void InvalidateState(); - void EndPendingOperations(); void EndRenderPass(); void AcquireNewChunk(); - const VKDevice& device; + const Device& device; StateTracker& state_tracker; std::unique_ptr<MasterSemaphore> master_semaphore; @@ -192,6 +193,11 @@ private: std::thread worker_thread; State state; + + u32 num_renderpass_images = 0; + std::array<VkImage, 9> renderpass_images{}; + std::array<VkImageSubresourceRange, 9> renderpass_image_ranges{}; + Common::SPSCQueue<std::unique_ptr<CommandChunk>> chunk_queue; Common::SPSCQueue<std::unique_ptr<CommandChunk>> chunk_reserve; std::mutex mutex; diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp index cd7d7a4e4..89cbe01ad 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp @@ -22,11 +22,11 @@ #include "video_core/engines/shader_bytecode.h" #include "video_core/engines/shader_header.h" #include "video_core/engines/shader_type.h" -#include "video_core/renderer_vulkan/vk_device.h" #include "video_core/renderer_vulkan/vk_shader_decompiler.h" #include "video_core/shader/node.h" #include "video_core/shader/shader_ir.h" #include "video_core/shader/transform_feedback.h" +#include "video_core/vulkan_common/vulkan_device.h" namespace Vulkan { @@ -55,8 +55,8 @@ enum class Type { Void, Bool, Bool2, Float, Int, Uint, HalfFloat }; class Expression final { public: - Expression(Id id, Type type) : id{id}, type{type} { - ASSERT(type != Type::Void); + Expression(Id id_, Type type_) : id{id_}, type{type_} { + ASSERT(type_ != Type::Void); } Expression() : type{Type::Void} {} @@ -102,7 +102,7 @@ struct GenericVaryingDescription { bool is_scalar = false; }; -spv::Dim GetSamplerDim(const Sampler& sampler) { +spv::Dim GetSamplerDim(const SamplerEntry& sampler) { ASSERT(!sampler.is_buffer); switch (sampler.type) { case Tegra::Shader::TextureType::Texture1D: @@ -114,12 +114,12 @@ spv::Dim GetSamplerDim(const Sampler& sampler) { case Tegra::Shader::TextureType::TextureCube: return spv::Dim::Cube; default: - UNIMPLEMENTED_MSG("Unimplemented sampler type={}", static_cast<int>(sampler.type)); + UNIMPLEMENTED_MSG("Unimplemented sampler type={}", sampler.type); return spv::Dim::Dim2D; } } -std::pair<spv::Dim, bool> GetImageDim(const Image& image) { +std::pair<spv::Dim, bool> GetImageDim(const ImageEntry& image) { switch (image.type) { case Tegra::Shader::ImageType::Texture1D: return {spv::Dim::Dim1D, false}; @@ -134,7 +134,7 @@ std::pair<spv::Dim, bool> GetImageDim(const Image& image) { case Tegra::Shader::ImageType::Texture3D: return {spv::Dim::Dim3D, false}; default: - UNIMPLEMENTED_MSG("Unimplemented image type={}", static_cast<int>(image.type)); + UNIMPLEMENTED_MSG("Unimplemented image type={}", image.type); return {spv::Dim::Dim2D, false}; } } @@ -274,12 +274,12 @@ bool IsPrecise(Operation operand) 
{ class SPIRVDecompiler final : public Sirit::Module { public: - explicit SPIRVDecompiler(const VKDevice& device, const ShaderIR& ir, ShaderType stage, - const Registry& registry, const Specialization& specialization) - : Module(0x00010300), device{device}, ir{ir}, stage{stage}, header{ir.GetHeader()}, - registry{registry}, specialization{specialization} { - if (stage != ShaderType::Compute) { - transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo()); + explicit SPIRVDecompiler(const Device& device_, const ShaderIR& ir_, ShaderType stage_, + const Registry& registry_, const Specialization& specialization_) + : Module(0x00010300), device{device_}, ir{ir_}, stage{stage_}, header{ir_.GetHeader()}, + registry{registry_}, specialization{specialization_} { + if (stage_ != ShaderType::Compute) { + transform_feedback = BuildTransformFeedback(registry_.GetGraphicsInfo()); } AddCapability(spv::Capability::Shader); @@ -293,6 +293,7 @@ public: AddCapability(spv::Capability::DrawParameters); AddCapability(spv::Capability::SubgroupBallotKHR); AddCapability(spv::Capability::SubgroupVoteKHR); + AddExtension("SPV_KHR_16bit_storage"); AddExtension("SPV_KHR_shader_ballot"); AddExtension("SPV_KHR_subgroup_vote"); AddExtension("SPV_KHR_storage_buffer_storage_class"); @@ -307,7 +308,6 @@ public: "supported on this device"); } } - if (ir.UsesLayer() || ir.UsesViewportIndex()) { if (ir.UsesViewportIndex()) { AddCapability(spv::Capability::MultiViewport); @@ -317,15 +317,13 @@ public: AddCapability(spv::Capability::ShaderViewportIndexLayerEXT); } } - if (device.IsFormatlessImageLoadSupported()) { AddCapability(spv::Capability::StorageImageReadWithoutFormat); } - if (device.IsFloat16Supported()) { AddCapability(spv::Capability::Float16); } - t_scalar_half = Name(TypeFloat(device.IsFloat16Supported() ? 16 : 32), "scalar_half"); + t_scalar_half = Name(TypeFloat(device_.IsFloat16Supported() ? 
16 : 32), "scalar_half"); t_half = Name(TypeVector(t_scalar_half, 2), "half"); const Id main = Decompile(); @@ -369,6 +367,9 @@ public: if (header.ps.omap.depth) { AddExecutionMode(main, spv::ExecutionMode::DepthReplacing); } + if (specialization.early_fragment_tests) { + AddExecutionMode(main, spv::ExecutionMode::EarlyFragmentTests); + } break; case ShaderType::Compute: const auto workgroup_size = specialization.workgroup_size; @@ -972,7 +973,7 @@ private: return binding; } - void DeclareImage(const Image& image, u32& binding) { + void DeclareImage(const ImageEntry& image, u32& binding) { const auto [dim, arrayed] = GetImageDim(image); constexpr int depth = 0; constexpr bool ms = false; @@ -1080,9 +1081,9 @@ private: indices.point_size = AddBuiltIn(t_float, spv::BuiltIn::PointSize, "point_size"); } - const auto& output_attributes = ir.GetOutputAttributes(); - const bool declare_clip_distances = - std::any_of(output_attributes.begin(), output_attributes.end(), [](const auto& index) { + const auto& ir_output_attributes = ir.GetOutputAttributes(); + const bool declare_clip_distances = std::any_of( + ir_output_attributes.begin(), ir_output_attributes.end(), [](const auto& index) { return index == Attribute::Index::ClipDistances0123 || index == Attribute::Index::ClipDistances4567; }); @@ -1246,7 +1247,7 @@ private: const Id pointer = ArrayPass(type_descriptor.scalar, attribute_id, elements); return {OpLoad(GetTypeDefinition(type), pointer), type}; } - UNIMPLEMENTED_MSG("Unhandled input attribute: {}", static_cast<u32>(attribute)); + UNIMPLEMENTED_MSG("Unhandled input attribute: {}", attribute); return {v_float_zero, Type::Float}; } @@ -1882,7 +1883,7 @@ private: case Tegra::Shader::TextureType::Texture3D: return 3; default: - UNREACHABLE_MSG("Invalid texture type={}", static_cast<int>(type)); + UNREACHABLE_MSG("Invalid texture type={}", type); return 2; } }(); @@ -2067,6 +2068,46 @@ private: return {}; } + Id MaxwellToSpirvComparison(Maxwell::ComparisonOp compare_op, Id operand_1, Id operand_2) { + using Compare = Maxwell::ComparisonOp; + switch (compare_op) { + case Compare::NeverOld: + return v_false; // Never let the test pass + case Compare::LessOld: + return OpFOrdLessThan(t_bool, operand_1, operand_2); + case Compare::EqualOld: + return OpFOrdEqual(t_bool, operand_1, operand_2); + case Compare::LessEqualOld: + return OpFOrdLessThanEqual(t_bool, operand_1, operand_2); + case Compare::GreaterOld: + return OpFOrdGreaterThan(t_bool, operand_1, operand_2); + case Compare::NotEqualOld: + return OpFOrdNotEqual(t_bool, operand_1, operand_2); + case Compare::GreaterEqualOld: + return OpFOrdGreaterThanEqual(t_bool, operand_1, operand_2); + default: + UNREACHABLE(); + return v_true; + } + } + + void AlphaTest(Id pointer) { + if (specialization.alpha_test_func == Maxwell::ComparisonOp::AlwaysOld) { + return; + } + const Id true_label = OpLabel(); + const Id discard_label = OpLabel(); + const Id alpha_reference = Constant(t_float, specialization.alpha_test_ref); + const Id alpha_value = OpLoad(t_float, pointer); + const Id condition = + MaxwellToSpirvComparison(specialization.alpha_test_func, alpha_value, alpha_reference); + + OpBranchConditional(condition, true_label, discard_label); + AddLabel(discard_label); + OpKill(); + AddLabel(true_label); + } + void PreExit() { if (stage == ShaderType::Vertex && specialization.ndc_minus_one_to_one) { const u32 position_index = out_indices.position.value(); @@ -2078,8 +2119,7 @@ private: OpStore(z_pointer, depth); } if (stage == ShaderType::Fragment) { - const 
auto SafeGetRegister = [&](u32 reg) { - // TODO(Rodrigo): Replace with contains once C++20 releases + const auto SafeGetRegister = [this](u32 reg) { if (const auto it = registers.find(reg); it != registers.end()) { return OpLoad(t_float, it->second); } @@ -2089,8 +2129,6 @@ private: UNIMPLEMENTED_IF_MSG(header.ps.omap.sample_mask != 0, "Sample mask write is unimplemented"); - // TODO(Rodrigo): Alpha testing - // Write the color outputs using the data in the shader registers, disabled // rendertargets/components are skipped in the register assignment. u32 current_reg = 0; @@ -2102,6 +2140,9 @@ private: } const Id pointer = AccessElement(t_out_float, frag_colors[rt], component); OpStore(pointer, SafeGetRegister(current_reg)); + if (rt == 0 && component == 3) { + AlphaTest(pointer); + } ++current_reg; } } @@ -2701,7 +2742,7 @@ private: }; static_assert(operation_decompilers.size() == static_cast<std::size_t>(OperationCode::Amount)); - const VKDevice& device; + const Device& device; const ShaderIR& ir; const ShaderType stage; const Tegra::Shader::Header header; @@ -2843,7 +2884,7 @@ private: class ExprDecompiler { public: - explicit ExprDecompiler(SPIRVDecompiler& decomp) : decomp{decomp} {} + explicit ExprDecompiler(SPIRVDecompiler& decomp_) : decomp{decomp_} {} Id operator()(const ExprAnd& expr) { const Id type_def = decomp.GetTypeDefinition(Type::Bool); @@ -2899,7 +2940,7 @@ private: class ASTDecompiler { public: - explicit ASTDecompiler(SPIRVDecompiler& decomp) : decomp{decomp} {} + explicit ASTDecompiler(SPIRVDecompiler& decomp_) : decomp{decomp_} {} void operator()(const ASTProgram& ast) { ASTNode current = ast.nodes.GetFirst(); @@ -3089,7 +3130,7 @@ ShaderEntries GenerateShaderEntries(const VideoCommon::Shader::ShaderIR& ir) { return entries; } -std::vector<u32> Decompile(const VKDevice& device, const VideoCommon::Shader::ShaderIR& ir, +std::vector<u32> Decompile(const Device& device, const VideoCommon::Shader::ShaderIR& ir, ShaderType stage, const VideoCommon::Shader::Registry& registry, const Specialization& specialization) { return SPIRVDecompiler(device, ir, stage, registry, specialization).Assemble(); diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.h b/src/video_core/renderer_vulkan/vk_shader_decompiler.h index 2b0e90396..26381e444 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.h +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.h @@ -15,23 +15,21 @@ #include "video_core/shader/shader_ir.h" namespace Vulkan { -class VKDevice; -} -namespace Vulkan { +class Device; using Maxwell = Tegra::Engines::Maxwell3D::Regs; -using UniformTexelEntry = VideoCommon::Shader::Sampler; -using SamplerEntry = VideoCommon::Shader::Sampler; -using StorageTexelEntry = VideoCommon::Shader::Image; -using ImageEntry = VideoCommon::Shader::Image; +using UniformTexelEntry = VideoCommon::Shader::SamplerEntry; +using SamplerEntry = VideoCommon::Shader::SamplerEntry; +using StorageTexelEntry = VideoCommon::Shader::ImageEntry; +using ImageEntry = VideoCommon::Shader::ImageEntry; constexpr u32 DESCRIPTOR_SET = 0; class ConstBufferEntry : public VideoCommon::Shader::ConstBuffer { public: - explicit constexpr ConstBufferEntry(const VideoCommon::Shader::ConstBuffer& entry, u32 index) - : VideoCommon::Shader::ConstBuffer{entry}, index{index} {} + explicit constexpr ConstBufferEntry(const ConstBuffer& entry_, u32 index_) + : ConstBuffer{entry_}, index{index_} {} constexpr u32 GetIndex() const { return index; @@ -43,8 +41,8 @@ private: class GlobalBufferEntry { public: - 
constexpr explicit GlobalBufferEntry(u32 cbuf_index, u32 cbuf_offset, bool is_written) - : cbuf_index{cbuf_index}, cbuf_offset{cbuf_offset}, is_written{is_written} {} + constexpr explicit GlobalBufferEntry(u32 cbuf_index_, u32 cbuf_offset_, bool is_written_) + : cbuf_index{cbuf_index_}, cbuf_offset{cbuf_offset_}, is_written{is_written_} {} constexpr u32 GetCbufIndex() const { return cbuf_index; @@ -95,6 +93,9 @@ struct Specialization final { std::bitset<Maxwell::NumVertexAttributes> enabled_attributes; std::array<Maxwell::VertexAttribute::Type, Maxwell::NumVertexAttributes> attribute_types{}; bool ndc_minus_one_to_one{}; + bool early_fragment_tests{}; + float alpha_test_ref{}; + Maxwell::ComparisonOp alpha_test_func{}; }; // Old gcc versions don't consider this trivially copyable. // static_assert(std::is_trivially_copyable_v<Specialization>); @@ -106,7 +107,7 @@ struct SPIRVShader { ShaderEntries GenerateShaderEntries(const VideoCommon::Shader::ShaderIR& ir); -std::vector<u32> Decompile(const VKDevice& device, const VideoCommon::Shader::ShaderIR& ir, +std::vector<u32> Decompile(const Device& device, const VideoCommon::Shader::ShaderIR& ir, Tegra::Engines::ShaderType stage, const VideoCommon::Shader::Registry& registry, const Specialization& specialization); diff --git a/src/video_core/renderer_vulkan/vk_shader_util.cpp b/src/video_core/renderer_vulkan/vk_shader_util.cpp index c1a218d76..aaad4f292 100644 --- a/src/video_core/renderer_vulkan/vk_shader_util.cpp +++ b/src/video_core/renderer_vulkan/vk_shader_util.cpp @@ -7,24 +7,19 @@ #include "common/assert.h" #include "common/common_types.h" -#include "video_core/renderer_vulkan/vk_device.h" #include "video_core/renderer_vulkan/vk_shader_util.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { -vk::ShaderModule BuildShader(const VKDevice& device, std::size_t code_size, const u8* code_data) { - // Avoid undefined behavior by copying to a staging allocation - ASSERT(code_size % sizeof(u32) == 0); - const auto data = std::make_unique<u32[]>(code_size / sizeof(u32)); - std::memcpy(data.get(), code_data, code_size); - +vk::ShaderModule BuildShader(const Device& device, std::span<const u32> code) { return device.GetLogical().CreateShaderModule({ .sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO, .pNext = nullptr, .flags = 0, - .codeSize = code_size, - .pCode = data.get(), + .codeSize = static_cast<u32>(code.size_bytes()), + .pCode = code.data(), }); } diff --git a/src/video_core/renderer_vulkan/vk_shader_util.h b/src/video_core/renderer_vulkan/vk_shader_util.h index d1d3f3cae..9517cbe84 100644 --- a/src/video_core/renderer_vulkan/vk_shader_util.h +++ b/src/video_core/renderer_vulkan/vk_shader_util.h @@ -4,13 +4,15 @@ #pragma once +#include <span> + #include "common/common_types.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { -class VKDevice; +class Device; -vk::ShaderModule BuildShader(const VKDevice& device, std::size_t code_size, const u8* code_data); +vk::ShaderModule BuildShader(const Device& device, std::span<const u32> code); } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp index 2fd3b7f39..1e0b8b922 100644 --- a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp +++ b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp @@ 
-9,17 +9,17 @@ #include "common/bit_util.h" #include "common/common_types.h" -#include "video_core/renderer_vulkan/vk_device.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { VKStagingBufferPool::StagingBuffer::StagingBuffer(std::unique_ptr<VKBuffer> buffer_) : buffer{std::move(buffer_)} {} -VKStagingBufferPool::VKStagingBufferPool(const VKDevice& device_, VKMemoryManager& memory_manager_, +VKStagingBufferPool::VKStagingBufferPool(const Device& device_, VKMemoryManager& memory_manager_, VKScheduler& scheduler_) : device{device_}, memory_manager{memory_manager_}, scheduler{scheduler_} {} diff --git a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h index 2dd5049ac..90dadcbbe 100644 --- a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h +++ b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.h @@ -10,11 +10,11 @@ #include "common/common_types.h" #include "video_core/renderer_vulkan/vk_memory_manager.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { -class VKDevice; +class Device; class VKScheduler; struct VKBuffer final { @@ -24,7 +24,7 @@ struct VKBuffer final { class VKStagingBufferPool final { public: - explicit VKStagingBufferPool(const VKDevice& device, VKMemoryManager& memory_manager, + explicit VKStagingBufferPool(const Device& device, VKMemoryManager& memory_manager, VKScheduler& scheduler); ~VKStagingBufferPool(); @@ -58,7 +58,7 @@ private: u64 ReleaseLevel(StagingBuffersCache& cache, std::size_t log2); - const VKDevice& device; + const Device& device; VKMemoryManager& memory_manager; VKScheduler& scheduler; diff --git a/src/video_core/renderer_vulkan/vk_state_tracker.cpp b/src/video_core/renderer_vulkan/vk_state_tracker.cpp index 5d2c4a796..1779a2e30 100644 --- a/src/video_core/renderer_vulkan/vk_state_tracker.cpp +++ b/src/video_core/renderer_vulkan/vk_state_tracker.cpp @@ -3,6 +3,7 @@ // Refer to the license.txt file included. 
#include <algorithm> +#include <array> #include <cstddef> #include <iterator> @@ -14,7 +15,7 @@ #include "video_core/renderer_vulkan/vk_state_tracker.h" #define OFF(field_name) MAXWELL3D_REG_INDEX(field_name) -#define NUM(field_name) (sizeof(Maxwell3D::Regs::field_name) / sizeof(u32)) +#define NUM(field_name) (sizeof(Maxwell3D::Regs::field_name) / (sizeof(u32))) namespace Vulkan { @@ -29,21 +30,15 @@ using Table = Maxwell3D::DirtyState::Table; using Flags = Maxwell3D::DirtyState::Flags; Flags MakeInvalidationFlags() { + static constexpr std::array INVALIDATION_FLAGS{ + Viewports, Scissors, DepthBias, BlendConstants, DepthBounds, + StencilProperties, CullMode, DepthBoundsEnable, DepthTestEnable, DepthWriteEnable, + DepthCompareOp, FrontFace, StencilOp, StencilTestEnable, + }; Flags flags{}; - flags[Viewports] = true; - flags[Scissors] = true; - flags[DepthBias] = true; - flags[BlendConstants] = true; - flags[DepthBounds] = true; - flags[StencilProperties] = true; - flags[CullMode] = true; - flags[DepthBoundsEnable] = true; - flags[DepthTestEnable] = true; - flags[DepthWriteEnable] = true; - flags[DepthCompareOp] = true; - flags[FrontFace] = true; - flags[StencilOp] = true; - flags[StencilTestEnable] = true; + for (const int flag : INVALIDATION_FLAGS) { + flags[flag] = true; + } return flags; } diff --git a/src/video_core/renderer_vulkan/vk_state_tracker.h b/src/video_core/renderer_vulkan/vk_state_tracker.h index 1de789e57..c335d2bdf 100644 --- a/src/video_core/renderer_vulkan/vk_state_tracker.h +++ b/src/video_core/renderer_vulkan/vk_state_tracker.h @@ -52,6 +52,14 @@ public: current_topology = INVALID_TOPOLOGY; } + void InvalidateViewports() { + flags[Dirty::Viewports] = true; + } + + void InvalidateScissors() { + flags[Dirty::Scissors] = true; + } + bool TouchViewports() { return Exchange(Dirty::Viewports, false); } diff --git a/src/video_core/renderer_vulkan/vk_stream_buffer.cpp b/src/video_core/renderer_vulkan/vk_stream_buffer.cpp index 5218c875b..a09fe084e 100644 --- a/src/video_core/renderer_vulkan/vk_stream_buffer.cpp +++ b/src/video_core/renderer_vulkan/vk_stream_buffer.cpp @@ -10,15 +10,19 @@ #include "common/alignment.h" #include "common/assert.h" -#include "video_core/renderer_vulkan/vk_device.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_stream_buffer.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { namespace { +constexpr VkBufferUsageFlags BUFFER_USAGE = + VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT | + VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; + constexpr u64 WATCHES_INITIAL_RESERVE = 0x4000; constexpr u64 WATCHES_RESERVE_CHUNK = 0x1000; @@ -56,17 +60,16 @@ u32 GetMemoryType(const VkPhysicalDeviceMemoryProperties& properties, } // Anonymous namespace -VKStreamBuffer::VKStreamBuffer(const VKDevice& device_, VKScheduler& scheduler_, - VkBufferUsageFlags usage) +VKStreamBuffer::VKStreamBuffer(const Device& device_, VKScheduler& scheduler_) : device{device_}, scheduler{scheduler_} { - CreateBuffers(usage); + CreateBuffers(); ReserveWatches(current_watches, WATCHES_INITIAL_RESERVE); ReserveWatches(previous_watches, WATCHES_INITIAL_RESERVE); } VKStreamBuffer::~VKStreamBuffer() = default; -std::tuple<u8*, u64, bool> VKStreamBuffer::Map(u64 size, u64 alignment) { +std::pair<u8*, u64> VKStreamBuffer::Map(u64 size, u64 alignment) { ASSERT(size <= 
stream_buffer_size); mapped_size = size; @@ -76,7 +79,6 @@ std::tuple<u8*, u64, bool> VKStreamBuffer::Map(u64 size, u64 alignment) { WaitPendingOperations(offset); - bool invalidated = false; if (offset + size > stream_buffer_size) { // The buffer would overflow, save the amount of used watches and reset the state. invalidation_mark = current_watch_cursor; @@ -90,11 +92,9 @@ std::tuple<u8*, u64, bool> VKStreamBuffer::Map(u64 size, u64 alignment) { // Ensure that we don't wait for uncommitted fences. scheduler.Flush(); - - invalidated = true; } - return {memory.Map(offset, size), offset, invalidated}; + return std::make_pair(memory.Map(offset, size), offset); } void VKStreamBuffer::Unmap(u64 size) { @@ -113,20 +113,21 @@ void VKStreamBuffer::Unmap(u64 size) { watch.tick = scheduler.CurrentTick(); } -void VKStreamBuffer::CreateBuffers(VkBufferUsageFlags usage) { +void VKStreamBuffer::CreateBuffers() { const auto memory_properties = device.GetPhysical().GetMemoryProperties(); const u32 preferred_type = GetMemoryType(memory_properties); const u32 preferred_heap = memory_properties.memoryTypes[preferred_type].heapIndex; // Subtract from the preferred heap size some bytes to avoid getting out of memory. const VkDeviceSize heap_size = memory_properties.memoryHeaps[preferred_heap].size; - const VkDeviceSize allocable_size = heap_size - 9 * 1024 * 1024; + // As per DXVK's example, using `heap_size / 2` + const VkDeviceSize allocable_size = heap_size / 2; buffer = device.GetLogical().CreateBuffer({ .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .pNext = nullptr, .flags = 0, .size = std::min(PREFERRED_STREAM_BUFFER_SIZE, allocable_size), - .usage = usage, + .usage = BUFFER_USAGE, .sharingMode = VK_SHARING_MODE_EXCLUSIVE, .queueFamilyIndexCount = 0, .pQueueFamilyIndices = nullptr, diff --git a/src/video_core/renderer_vulkan/vk_stream_buffer.h b/src/video_core/renderer_vulkan/vk_stream_buffer.h index 5e15ad78f..2e9c8cb46 100644 --- a/src/video_core/renderer_vulkan/vk_stream_buffer.h +++ b/src/video_core/renderer_vulkan/vk_stream_buffer.h @@ -5,31 +5,29 @@ #pragma once #include <optional> -#include <tuple> +#include <utility> #include <vector> #include "common/common_types.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { -class VKDevice; +class Device; class VKFenceWatch; class VKScheduler; class VKStreamBuffer final { public: - explicit VKStreamBuffer(const VKDevice& device, VKScheduler& scheduler, - VkBufferUsageFlags usage); + explicit VKStreamBuffer(const Device& device, VKScheduler& scheduler); ~VKStreamBuffer(); /** * Reserves a region of memory from the stream buffer. * @param size Size to reserve. - * @returns A tuple in the following order: Raw memory pointer (with offset added), buffer - * offset and a boolean that's true when buffer has been invalidated. + * @returns A pair of a raw memory pointer (with offset added), and the buffer offset */ - std::tuple<u8*, u64, bool> Map(u64 size, u64 alignment); + std::pair<u8*, u64> Map(u64 size, u64 alignment); /// Ensures that "size" bytes of memory are available to the GPU, potentially recording a copy. void Unmap(u64 size); @@ -49,14 +47,14 @@ private: }; /// Creates Vulkan buffer handles committing the required memory. - void CreateBuffers(VkBufferUsageFlags usage); + void CreateBuffers(); /// Increases the amount of watches available. 
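With the invalidated flag gone, the Map/Unmap contract above reduces to map, copy, publish. A minimal usage sketch under the new signature (upload_data and upload_size are hypothetical placeholders, not names from this diff):

    // Map returns the host pointer plus the region's offset inside the stream buffer.
    const auto [pointer, offset] = stream_buffer.Map(upload_size, /*alignment=*/4);
    std::memcpy(pointer, upload_data, upload_size);
    stream_buffer.Unmap(upload_size); // publishes exactly the bytes that were written
    // 'offset' is what later gets bound, e.g. as a vertex buffer offset.
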
void ReserveWatches(std::vector<Watch>& watches, std::size_t grow_size); void WaitPendingOperations(u64 requested_upper_bound); - const VKDevice& device; ///< Vulkan device manager. + const Device& device; ///< Vulkan device manager. VKScheduler& scheduler; ///< Command scheduler. vk::Buffer buffer; ///< Mapped buffer. diff --git a/src/video_core/renderer_vulkan/vk_swapchain.cpp b/src/video_core/renderer_vulkan/vk_swapchain.cpp index 9636a7c65..725a2a05d 100644 --- a/src/video_core/renderer_vulkan/vk_swapchain.cpp +++ b/src/video_core/renderer_vulkan/vk_swapchain.cpp @@ -11,10 +11,10 @@ #include "common/logging/log.h" #include "core/core.h" #include "core/frontend/framebuffer_layout.h" -#include "video_core/renderer_vulkan/vk_device.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_swapchain.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { @@ -56,7 +56,7 @@ VkExtent2D ChooseSwapExtent(const VkSurfaceCapabilitiesKHR& capabilities, u32 wi } // Anonymous namespace -VKSwapchain::VKSwapchain(VkSurfaceKHR surface_, const VKDevice& device_, VKScheduler& scheduler_) +VKSwapchain::VKSwapchain(VkSurfaceKHR surface_, const Device& device_, VKScheduler& scheduler_) : surface{surface_}, device{device_}, scheduler{scheduler_} {} VKSwapchain::~VKSwapchain() = default; diff --git a/src/video_core/renderer_vulkan/vk_swapchain.h b/src/video_core/renderer_vulkan/vk_swapchain.h index 6b39befdf..2eadd62b3 100644 --- a/src/video_core/renderer_vulkan/vk_swapchain.h +++ b/src/video_core/renderer_vulkan/vk_swapchain.h @@ -7,7 +7,7 @@ #include <vector> #include "common/common_types.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Layout { struct FramebufferLayout; @@ -15,12 +15,12 @@ struct FramebufferLayout; namespace Vulkan { -class VKDevice; +class Device; class VKScheduler; class VKSwapchain { public: - explicit VKSwapchain(VkSurfaceKHR surface, const VKDevice& device, VKScheduler& scheduler); + explicit VKSwapchain(VkSurfaceKHR surface, const Device& device, VKScheduler& scheduler); ~VKSwapchain(); /// Creates (or recreates) the swapchain with a given size. 
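At a call site, the VKDevice-to-Device rename above is the only visible change to the swapchain; construction and creation are otherwise untouched. A brief sketch, assuming Create takes the framebuffer dimensions plus an sRGB flag (inferred from the comment above; the signature is not shown in this hunk):

    VKSwapchain swapchain(surface, device, scheduler); // Device replaces VKDevice
    swapchain.Create(layout.width, layout.height, /*srgb=*/false); // parameters assumed
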
@@ -73,7 +73,7 @@ private: void Destroy(); const VkSurfaceKHR surface; - const VKDevice& device; + const Device& device; VKScheduler& scheduler; vk::SwapchainKHR swapchain; diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index f2c8f2ae1..bd11de012 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -4,613 +4,1105 @@ #include <algorithm> #include <array> -#include <cstddef> -#include <cstring> -#include <memory> -#include <variant> +#include <span> #include <vector> -#include "common/assert.h" -#include "common/common_types.h" -#include "core/core.h" -#include "video_core/engines/maxwell_3d.h" -#include "video_core/morton.h" +#include "video_core/engines/fermi_2d.h" +#include "video_core/renderer_vulkan/blit_image.h" #include "video_core/renderer_vulkan/maxwell_to_vk.h" -#include "video_core/renderer_vulkan/vk_device.h" #include "video_core/renderer_vulkan/vk_memory_manager.h" #include "video_core/renderer_vulkan/vk_rasterizer.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" #include "video_core/renderer_vulkan/vk_texture_cache.h" -#include "video_core/renderer_vulkan/wrapper.h" -#include "video_core/surface.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { -using VideoCore::MortonSwizzle; -using VideoCore::MortonSwizzleMode; - +using Tegra::Engines::Fermi2D; using Tegra::Texture::SwizzleSource; -using VideoCore::Surface::PixelFormat; -using VideoCore::Surface::SurfaceTarget; +using Tegra::Texture::TextureMipmapFilter; +using VideoCommon::BufferImageCopy; +using VideoCommon::ImageInfo; +using VideoCommon::ImageType; +using VideoCommon::SubresourceRange; +using VideoCore::Surface::IsPixelFormatASTC; namespace { -VkImageType SurfaceTargetToImage(SurfaceTarget target) { - switch (target) { - case SurfaceTarget::Texture1D: - case SurfaceTarget::Texture1DArray: +constexpr std::array ATTACHMENT_REFERENCES{ + VkAttachmentReference{0, VK_IMAGE_LAYOUT_GENERAL}, + VkAttachmentReference{1, VK_IMAGE_LAYOUT_GENERAL}, + VkAttachmentReference{2, VK_IMAGE_LAYOUT_GENERAL}, + VkAttachmentReference{3, VK_IMAGE_LAYOUT_GENERAL}, + VkAttachmentReference{4, VK_IMAGE_LAYOUT_GENERAL}, + VkAttachmentReference{5, VK_IMAGE_LAYOUT_GENERAL}, + VkAttachmentReference{6, VK_IMAGE_LAYOUT_GENERAL}, + VkAttachmentReference{7, VK_IMAGE_LAYOUT_GENERAL}, + VkAttachmentReference{8, VK_IMAGE_LAYOUT_GENERAL}, +}; + +constexpr VkBorderColor ConvertBorderColor(const std::array<float, 4>& color) { + if (color == std::array<float, 4>{0, 0, 0, 0}) { + return VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK; + } else if (color == std::array<float, 4>{0, 0, 0, 1}) { + return VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK; + } else if (color == std::array<float, 4>{1, 1, 1, 1}) { + return VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE; + } + if (color[0] + color[1] + color[2] > 1.35f) { + // If color elements are brighter than roughly 0.5 average, use white border + return VK_BORDER_COLOR_FLOAT_OPAQUE_WHITE; + } else if (color[3] > 0.5f) { + return VK_BORDER_COLOR_FLOAT_OPAQUE_BLACK; + } else { + return VK_BORDER_COLOR_FLOAT_TRANSPARENT_BLACK; + } +} + +[[nodiscard]] VkImageType ConvertImageType(const ImageType type) { + switch (type) { + case ImageType::e1D: return VK_IMAGE_TYPE_1D; - case SurfaceTarget::Texture2D: - case SurfaceTarget::Texture2DArray: - case 
SurfaceTarget::TextureCubemap: - case SurfaceTarget::TextureCubeArray: + case ImageType::e2D: + case ImageType::Linear: return VK_IMAGE_TYPE_2D; - case SurfaceTarget::Texture3D: + case ImageType::e3D: return VK_IMAGE_TYPE_3D; - case SurfaceTarget::TextureBuffer: - UNREACHABLE(); - return {}; + case ImageType::Buffer: + break; } - UNREACHABLE_MSG("Unknown texture target={}", static_cast<u32>(target)); + UNREACHABLE_MSG("Invalid image type={}", type); return {}; } -VkImageAspectFlags PixelFormatToImageAspect(PixelFormat pixel_format) { - if (pixel_format < PixelFormat::MaxColorFormat) { - return VK_IMAGE_ASPECT_COLOR_BIT; - } else if (pixel_format < PixelFormat::MaxDepthFormat) { - return VK_IMAGE_ASPECT_DEPTH_BIT; - } else if (pixel_format < PixelFormat::MaxDepthStencilFormat) { - return VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; - } else { - UNREACHABLE_MSG("Invalid pixel format={}", static_cast<int>(pixel_format)); - return VK_IMAGE_ASPECT_COLOR_BIT; +[[nodiscard]] VkSampleCountFlagBits ConvertSampleCount(u32 num_samples) { + switch (num_samples) { + case 1: + return VK_SAMPLE_COUNT_1_BIT; + case 2: + return VK_SAMPLE_COUNT_2_BIT; + case 4: + return VK_SAMPLE_COUNT_4_BIT; + case 8: + return VK_SAMPLE_COUNT_8_BIT; + case 16: + return VK_SAMPLE_COUNT_16_BIT; + default: + UNREACHABLE_MSG("Invalid number of samples={}", num_samples); + return VK_SAMPLE_COUNT_1_BIT; } } -VkImageViewType GetImageViewType(SurfaceTarget target) { - switch (target) { - case SurfaceTarget::Texture1D: - return VK_IMAGE_VIEW_TYPE_1D; - case SurfaceTarget::Texture2D: - return VK_IMAGE_VIEW_TYPE_2D; - case SurfaceTarget::Texture3D: - return VK_IMAGE_VIEW_TYPE_3D; - case SurfaceTarget::Texture1DArray: - return VK_IMAGE_VIEW_TYPE_1D_ARRAY; - case SurfaceTarget::Texture2DArray: - return VK_IMAGE_VIEW_TYPE_2D_ARRAY; - case SurfaceTarget::TextureCubemap: - return VK_IMAGE_VIEW_TYPE_CUBE; - case SurfaceTarget::TextureCubeArray: - return VK_IMAGE_VIEW_TYPE_CUBE_ARRAY; - case SurfaceTarget::TextureBuffer: - break; +[[nodiscard]] VkImageCreateInfo MakeImageCreateInfo(const Device& device, const ImageInfo& info) { + const auto format_info = MaxwellToVK::SurfaceFormat(device, FormatType::Optimal, info.format); + VkImageCreateFlags flags = VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT; + if (info.type == ImageType::e2D && info.resources.layers >= 6 && + info.size.width == info.size.height) { + flags |= VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT; } - UNREACHABLE(); - return {}; -} - -vk::Buffer CreateBuffer(const VKDevice& device, const SurfaceParams& params, - std::size_t host_memory_size) { - // TODO(Rodrigo): Move texture buffer creation to the buffer cache - return device.GetLogical().CreateBuffer({ - .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + if (info.type == ImageType::e3D) { + flags |= VK_IMAGE_CREATE_2D_ARRAY_COMPATIBLE_BIT; + } + VkImageUsageFlags usage = VK_IMAGE_USAGE_TRANSFER_SRC_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT | + VK_IMAGE_USAGE_SAMPLED_BIT; + if (format_info.attachable) { + switch (VideoCore::Surface::GetFormatType(info.format)) { + case VideoCore::Surface::SurfaceType::ColorTexture: + usage |= VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; + break; + case VideoCore::Surface::SurfaceType::Depth: + case VideoCore::Surface::SurfaceType::DepthStencil: + usage |= VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT; + break; + default: + UNREACHABLE_MSG("Invalid surface type"); + } + } + if (format_info.storage) { + usage |= VK_IMAGE_USAGE_STORAGE_BIT; + } + const auto [samples_x, samples_y] = 
VideoCommon::SamplesLog2(info.num_samples); + return VkImageCreateInfo{ + .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, .pNext = nullptr, - .flags = 0, - .size = static_cast<VkDeviceSize>(host_memory_size), - .usage = VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT | - VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | - VK_BUFFER_USAGE_TRANSFER_DST_BIT, + .flags = flags, + .imageType = ConvertImageType(info.type), + .format = format_info.format, + .extent = + { + .width = info.size.width >> samples_x, + .height = info.size.height >> samples_y, + .depth = info.size.depth, + }, + .mipLevels = static_cast<u32>(info.resources.levels), + .arrayLayers = static_cast<u32>(info.resources.layers), + .samples = ConvertSampleCount(info.num_samples), + .tiling = VK_IMAGE_TILING_OPTIMAL, + .usage = usage, .sharingMode = VK_SHARING_MODE_EXCLUSIVE, .queueFamilyIndexCount = 0, .pQueueFamilyIndices = nullptr, - }); -} - -VkBufferViewCreateInfo GenerateBufferViewCreateInfo(const VKDevice& device, - const SurfaceParams& params, VkBuffer buffer, - std::size_t host_memory_size) { - ASSERT(params.IsBuffer()); - - return { - .sType = VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO, - .pNext = nullptr, - .flags = 0, - .buffer = buffer, - .format = - MaxwellToVK::SurfaceFormat(device, FormatType::Buffer, params.pixel_format).format, - .offset = 0, - .range = static_cast<VkDeviceSize>(host_memory_size), + .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, }; } -VkImageCreateInfo GenerateImageCreateInfo(const VKDevice& device, const SurfaceParams& params) { - ASSERT(!params.IsBuffer()); - - const auto [format, attachable, storage] = - MaxwellToVK::SurfaceFormat(device, FormatType::Optimal, params.pixel_format); +[[nodiscard]] vk::Image MakeImage(const Device& device, const ImageInfo& info) { + if (info.type == ImageType::Buffer) { + return vk::Image{}; + } + return device.GetLogical().CreateImage(MakeImageCreateInfo(device, info)); +} - VkImageCreateInfo ci{ - .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO, +[[nodiscard]] vk::Buffer MakeBuffer(const Device& device, const ImageInfo& info) { + if (info.type != ImageType::Buffer) { + return vk::Buffer{}; + } + const size_t bytes_per_block = VideoCore::Surface::BytesPerBlock(info.format); + return device.GetLogical().CreateBuffer(VkBufferCreateInfo{ + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .pNext = nullptr, .flags = 0, - .imageType = SurfaceTargetToImage(params.target), - .format = format, - .extent = {}, - .mipLevels = params.num_levels, - .arrayLayers = static_cast<u32>(params.GetNumLayers()), - .samples = VK_SAMPLE_COUNT_1_BIT, - .tiling = VK_IMAGE_TILING_OPTIMAL, - .usage = VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_TRANSFER_DST_BIT | - VK_IMAGE_USAGE_TRANSFER_SRC_BIT, + .size = info.size.width * bytes_per_block, + .usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | + VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT | + VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT, .sharingMode = VK_SHARING_MODE_EXCLUSIVE, .queueFamilyIndexCount = 0, .pQueueFamilyIndices = nullptr, - .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED, - }; - if (attachable) { - ci.usage |= params.IsPixelFormatZeta() ? 
VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT - : VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; - } - if (storage) { - ci.usage |= VK_IMAGE_USAGE_STORAGE_BIT; - } - - switch (params.target) { - case SurfaceTarget::TextureCubemap: - case SurfaceTarget::TextureCubeArray: - ci.flags |= VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT; - [[fallthrough]]; - case SurfaceTarget::Texture1D: - case SurfaceTarget::Texture1DArray: - case SurfaceTarget::Texture2D: - case SurfaceTarget::Texture2DArray: - ci.extent = {params.width, params.height, 1}; - break; - case SurfaceTarget::Texture3D: - ci.flags |= VK_IMAGE_CREATE_2D_ARRAY_COMPATIBLE_BIT; - ci.extent = {params.width, params.height, params.depth}; - break; - case SurfaceTarget::TextureBuffer: - UNREACHABLE(); - } - - return ci; + }); } -u32 EncodeSwizzle(Tegra::Texture::SwizzleSource x_source, Tegra::Texture::SwizzleSource y_source, - Tegra::Texture::SwizzleSource z_source, Tegra::Texture::SwizzleSource w_source) { - return (static_cast<u32>(x_source) << 24) | (static_cast<u32>(y_source) << 16) | - (static_cast<u32>(z_source) << 8) | static_cast<u32>(w_source); +[[nodiscard]] VkImageAspectFlags ImageAspectMask(PixelFormat format) { + switch (VideoCore::Surface::GetFormatType(format)) { + case VideoCore::Surface::SurfaceType::ColorTexture: + return VK_IMAGE_ASPECT_COLOR_BIT; + case VideoCore::Surface::SurfaceType::Depth: + return VK_IMAGE_ASPECT_DEPTH_BIT; + case VideoCore::Surface::SurfaceType::DepthStencil: + return VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT; + default: + UNREACHABLE_MSG("Invalid surface type"); + return VkImageAspectFlags{}; + } } -} // Anonymous namespace - -CachedSurface::CachedSurface(const VKDevice& device, VKMemoryManager& memory_manager, - VKScheduler& scheduler, VKStagingBufferPool& staging_pool, - GPUVAddr gpu_addr, const SurfaceParams& params) - : SurfaceBase<View>{gpu_addr, params, device.IsOptimalAstcSupported()}, device{device}, - memory_manager{memory_manager}, scheduler{scheduler}, staging_pool{staging_pool} { - if (params.IsBuffer()) { - buffer = CreateBuffer(device, params, host_memory_size); - commit = memory_manager.Commit(buffer, false); - - const auto buffer_view_ci = - GenerateBufferViewCreateInfo(device, params, *buffer, host_memory_size); - format = buffer_view_ci.format; - - buffer_view = device.GetLogical().CreateBufferView(buffer_view_ci); - } else { - const auto image_ci = GenerateImageCreateInfo(device, params); - format = image_ci.format; - - image.emplace(device, scheduler, image_ci, PixelFormatToImageAspect(params.pixel_format)); - commit = memory_manager.Commit(image->GetHandle(), false); +[[nodiscard]] VkImageAspectFlags ImageViewAspectMask(const VideoCommon::ImageViewInfo& info) { + if (info.IsRenderTarget()) { + return ImageAspectMask(info.format); } - - // TODO(Rodrigo): Move this to a virtual function. - u32 num_layers = 1; - if (params.is_layered || params.target == SurfaceTarget::Texture3D) { - num_layers = params.depth; + const bool is_first = info.Swizzle()[0] == SwizzleSource::R; + switch (info.format) { + case PixelFormat::D24_UNORM_S8_UINT: + case PixelFormat::D32_FLOAT_S8_UINT: + return is_first ? VK_IMAGE_ASPECT_DEPTH_BIT : VK_IMAGE_ASPECT_STENCIL_BIT; + case PixelFormat::S8_UINT_D24_UNORM: + return is_first ? 
VK_IMAGE_ASPECT_STENCIL_BIT : VK_IMAGE_ASPECT_DEPTH_BIT; + case PixelFormat::D16_UNORM: + case PixelFormat::D32_FLOAT: + return VK_IMAGE_ASPECT_DEPTH_BIT; + default: + return VK_IMAGE_ASPECT_COLOR_BIT; } - main_view = CreateView(ViewParams(params.target, 0, num_layers, 0, params.num_levels)); } -CachedSurface::~CachedSurface() = default; - -void CachedSurface::UploadTexture(const std::vector<u8>& staging_buffer) { - // To upload data we have to be outside of a renderpass - scheduler.RequestOutsideRenderPassOperationContext(); +[[nodiscard]] VkAttachmentDescription AttachmentDescription(const Device& device, + const ImageView* image_view) { + const auto pixel_format = image_view->format; + return VkAttachmentDescription{ + .flags = VK_ATTACHMENT_DESCRIPTION_MAY_ALIAS_BIT, + .format = MaxwellToVK::SurfaceFormat(device, FormatType::Optimal, pixel_format).format, + .samples = image_view->Samples(), + .loadOp = VK_ATTACHMENT_LOAD_OP_LOAD, + .storeOp = VK_ATTACHMENT_STORE_OP_STORE, + .stencilLoadOp = VK_ATTACHMENT_LOAD_OP_LOAD, + .stencilStoreOp = VK_ATTACHMENT_STORE_OP_STORE, + .initialLayout = VK_IMAGE_LAYOUT_GENERAL, + .finalLayout = VK_IMAGE_LAYOUT_GENERAL, + }; +} - if (params.IsBuffer()) { - UploadBuffer(staging_buffer); - } else { - UploadImage(staging_buffer); +[[nodiscard]] VkComponentSwizzle ComponentSwizzle(SwizzleSource swizzle) { + switch (swizzle) { + case SwizzleSource::Zero: + return VK_COMPONENT_SWIZZLE_ZERO; + case SwizzleSource::R: + return VK_COMPONENT_SWIZZLE_R; + case SwizzleSource::G: + return VK_COMPONENT_SWIZZLE_G; + case SwizzleSource::B: + return VK_COMPONENT_SWIZZLE_B; + case SwizzleSource::A: + return VK_COMPONENT_SWIZZLE_A; + case SwizzleSource::OneFloat: + case SwizzleSource::OneInt: + return VK_COMPONENT_SWIZZLE_ONE; } + UNREACHABLE_MSG("Invalid swizzle={}", swizzle); + return VK_COMPONENT_SWIZZLE_ZERO; } -void CachedSurface::DownloadTexture(std::vector<u8>& staging_buffer) { - UNIMPLEMENTED_IF(params.IsBuffer()); - - if (params.pixel_format == VideoCore::Surface::PixelFormat::A1B5G5R5_UNORM) { - LOG_WARNING(Render_Vulkan, "A1B5G5R5 flushing is stubbed"); +[[nodiscard]] VkImageViewType ImageViewType(VideoCommon::ImageViewType type) { + switch (type) { + case VideoCommon::ImageViewType::e1D: + return VK_IMAGE_VIEW_TYPE_1D; + case VideoCommon::ImageViewType::e2D: + return VK_IMAGE_VIEW_TYPE_2D; + case VideoCommon::ImageViewType::Cube: + return VK_IMAGE_VIEW_TYPE_CUBE; + case VideoCommon::ImageViewType::e3D: + return VK_IMAGE_VIEW_TYPE_3D; + case VideoCommon::ImageViewType::e1DArray: + return VK_IMAGE_VIEW_TYPE_1D_ARRAY; + case VideoCommon::ImageViewType::e2DArray: + return VK_IMAGE_VIEW_TYPE_2D_ARRAY; + case VideoCommon::ImageViewType::CubeArray: + return VK_IMAGE_VIEW_TYPE_CUBE_ARRAY; + case VideoCommon::ImageViewType::Rect: + LOG_WARNING(Render_Vulkan, "Unnormalized image view type not supported"); + return VK_IMAGE_VIEW_TYPE_2D; + case VideoCommon::ImageViewType::Buffer: + UNREACHABLE_MSG("Texture buffers can't be image views"); + return VK_IMAGE_VIEW_TYPE_1D; } + UNREACHABLE_MSG("Invalid image view type={}", type); + return VK_IMAGE_VIEW_TYPE_2D; +} - // We can't copy images to buffers inside a renderpass - scheduler.RequestOutsideRenderPassOperationContext(); +[[nodiscard]] VkImageSubresourceLayers MakeImageSubresourceLayers( + VideoCommon::SubresourceLayers subresource, VkImageAspectFlags aspect_mask) { + return VkImageSubresourceLayers{ + .aspectMask = aspect_mask, + .mipLevel = static_cast<u32>(subresource.base_level), + .baseArrayLayer = 
static_cast<u32>(subresource.base_layer), + .layerCount = static_cast<u32>(subresource.num_layers), + }; +} - FullTransition(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_READ_BIT, - VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL); +[[nodiscard]] VkOffset3D MakeOffset3D(VideoCommon::Offset3D offset3d) { + return VkOffset3D{ + .x = offset3d.x, + .y = offset3d.y, + .z = offset3d.z, + }; +} - const auto& buffer = staging_pool.GetUnusedBuffer(host_memory_size, true); - // TODO(Rodrigo): Do this in a single copy - for (u32 level = 0; level < params.num_levels; ++level) { - scheduler.Record([image = *image->GetHandle(), buffer = *buffer.handle, - copy = GetBufferImageCopy(level)](vk::CommandBuffer cmdbuf) { - cmdbuf.CopyImageToBuffer(image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, buffer, copy); - }); - } - scheduler.Finish(); +[[nodiscard]] VkExtent3D MakeExtent3D(VideoCommon::Extent3D extent3d) { + return VkExtent3D{ + .width = static_cast<u32>(extent3d.width), + .height = static_cast<u32>(extent3d.height), + .depth = static_cast<u32>(extent3d.depth), + }; +} - // TODO(Rodrigo): Use an intern buffer for staging buffers and avoid this unnecessary memcpy. - std::memcpy(staging_buffer.data(), buffer.commit->Map(host_memory_size), host_memory_size); +[[nodiscard]] VkImageCopy MakeImageCopy(const VideoCommon::ImageCopy& copy, + VkImageAspectFlags aspect_mask) noexcept { + return VkImageCopy{ + .srcSubresource = MakeImageSubresourceLayers(copy.src_subresource, aspect_mask), + .srcOffset = MakeOffset3D(copy.src_offset), + .dstSubresource = MakeImageSubresourceLayers(copy.dst_subresource, aspect_mask), + .dstOffset = MakeOffset3D(copy.dst_offset), + .extent = MakeExtent3D(copy.extent), + }; } -void CachedSurface::DecorateSurfaceName() { - // TODO(Rodrigo): Add name decorations +[[nodiscard]] std::vector<VkBufferCopy> TransformBufferCopies( + std::span<const VideoCommon::BufferCopy> copies, size_t buffer_offset) { + std::vector<VkBufferCopy> result(copies.size()); + std::ranges::transform( + copies, result.begin(), [buffer_offset](const VideoCommon::BufferCopy& copy) { + return VkBufferCopy{ + .srcOffset = static_cast<VkDeviceSize>(copy.src_offset + buffer_offset), + .dstOffset = static_cast<VkDeviceSize>(copy.dst_offset), + .size = static_cast<VkDeviceSize>(copy.size), + }; + }); + return result; } -View CachedSurface::CreateView(const ViewParams& params) { - // TODO(Rodrigo): Add name decorations - return views[params] = std::make_shared<CachedSurfaceView>(device, *this, params); +[[nodiscard]] std::vector<VkBufferImageCopy> TransformBufferImageCopies( + std::span<const BufferImageCopy> copies, size_t buffer_offset, VkImageAspectFlags aspect_mask) { + struct Maker { + VkBufferImageCopy operator()(const BufferImageCopy& copy) const { + return VkBufferImageCopy{ + .bufferOffset = copy.buffer_offset + buffer_offset, + .bufferRowLength = copy.buffer_row_length, + .bufferImageHeight = copy.buffer_image_height, + .imageSubresource = + { + .aspectMask = aspect_mask, + .mipLevel = static_cast<u32>(copy.image_subresource.base_level), + .baseArrayLayer = static_cast<u32>(copy.image_subresource.base_layer), + .layerCount = static_cast<u32>(copy.image_subresource.num_layers), + }, + .imageOffset = + { + .x = copy.image_offset.x, + .y = copy.image_offset.y, + .z = copy.image_offset.z, + }, + .imageExtent = + { + .width = copy.image_extent.width, + .height = copy.image_extent.height, + .depth = copy.image_extent.depth, + }, + }; + } + size_t buffer_offset; + VkImageAspectFlags aspect_mask; + }; + if (aspect_mask == 
(VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { + std::vector<VkBufferImageCopy> result(copies.size() * 2); + std::ranges::transform(copies, result.begin(), + Maker{buffer_offset, VK_IMAGE_ASPECT_DEPTH_BIT}); + std::ranges::transform(copies, result.begin() + copies.size(), + Maker{buffer_offset, VK_IMAGE_ASPECT_STENCIL_BIT}); + return result; + } else { + std::vector<VkBufferImageCopy> result(copies.size()); + std::ranges::transform(copies, result.begin(), Maker{buffer_offset, aspect_mask}); + return result; + } } -void CachedSurface::UploadBuffer(const std::vector<u8>& staging_buffer) { - const auto& src_buffer = staging_pool.GetUnusedBuffer(host_memory_size, true); - std::memcpy(src_buffer.commit->Map(host_memory_size), staging_buffer.data(), host_memory_size); +[[nodiscard]] VkImageSubresourceRange MakeSubresourceRange(VkImageAspectFlags aspect_mask, + const SubresourceRange& range) { + return VkImageSubresourceRange{ + .aspectMask = aspect_mask, + .baseMipLevel = static_cast<u32>(range.base.level), + .levelCount = static_cast<u32>(range.extent.levels), + .baseArrayLayer = static_cast<u32>(range.base.layer), + .layerCount = static_cast<u32>(range.extent.layers), + }; +} - scheduler.Record([src_buffer = *src_buffer.handle, dst_buffer = *buffer, - size = host_memory_size](vk::CommandBuffer cmdbuf) { - VkBufferCopy copy; - copy.srcOffset = 0; - copy.dstOffset = 0; - copy.size = size; - cmdbuf.CopyBuffer(src_buffer, dst_buffer, copy); +[[nodiscard]] VkImageSubresourceRange MakeSubresourceRange(const ImageView* image_view) { + SubresourceRange range = image_view->range; + if (True(image_view->flags & VideoCommon::ImageViewFlagBits::Slice)) { + // Slice image views always affect a single layer, but their subresource range corresponds + // to the slice. Override the value to affect a single layer. 
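+        // (A slice view comes from a 3D image, and 3D images only ever have one
+        // array layer, so the stored slice index would be an out-of-range base layer.)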
+ range.base.layer = 0; + range.extent.layers = 1; + } + return MakeSubresourceRange(ImageAspectMask(image_view->format), range); +} - VkBufferMemoryBarrier barrier; - barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; - barrier.pNext = nullptr; - barrier.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; - barrier.dstAccessMask = VK_ACCESS_SHADER_READ_BIT; - barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; // They'll be ignored anyway - barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.buffer = dst_buffer; - barrier.offset = 0; - barrier.size = size; - cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_VERTEX_SHADER_BIT, - 0, {}, barrier, {}); - }); +[[nodiscard]] VkImageSubresourceLayers MakeSubresourceLayers(const ImageView* image_view) { + return VkImageSubresourceLayers{ + .aspectMask = ImageAspectMask(image_view->format), + .mipLevel = static_cast<u32>(image_view->range.base.level), + .baseArrayLayer = static_cast<u32>(image_view->range.base.layer), + .layerCount = static_cast<u32>(image_view->range.extent.layers), + }; } -void CachedSurface::UploadImage(const std::vector<u8>& staging_buffer) { - const auto& src_buffer = staging_pool.GetUnusedBuffer(host_memory_size, true); - std::memcpy(src_buffer.commit->Map(host_memory_size), staging_buffer.data(), host_memory_size); - - FullTransition(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL); - - for (u32 level = 0; level < params.num_levels; ++level) { - const VkBufferImageCopy copy = GetBufferImageCopy(level); - if (image->GetAspectMask() == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { - scheduler.Record([buffer = *src_buffer.handle, image = *image->GetHandle(), - copy](vk::CommandBuffer cmdbuf) { - std::array<VkBufferImageCopy, 2> copies = {copy, copy}; - copies[0].imageSubresource.aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT; - copies[1].imageSubresource.aspectMask = VK_IMAGE_ASPECT_STENCIL_BIT; - cmdbuf.CopyBufferToImage(buffer, image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, - copies); - }); - } else { - scheduler.Record([buffer = *src_buffer.handle, image = *image->GetHandle(), - copy](vk::CommandBuffer cmdbuf) { - cmdbuf.CopyBufferToImage(buffer, image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, copy); - }); - } +[[nodiscard]] constexpr SwizzleSource ConvertGreenRed(SwizzleSource value) { + switch (value) { + case SwizzleSource::G: + return SwizzleSource::R; + default: + return value; } } -VkBufferImageCopy CachedSurface::GetBufferImageCopy(u32 level) const { - return { - .bufferOffset = params.GetHostMipmapLevelOffset(level, is_converted), - .bufferRowLength = 0, - .bufferImageHeight = 0, - .imageSubresource = +void CopyBufferToImage(vk::CommandBuffer cmdbuf, VkBuffer src_buffer, VkImage image, + VkImageAspectFlags aspect_mask, bool is_initialized, + std::span<const VkBufferImageCopy> copies) { + static constexpr VkAccessFlags ACCESS_FLAGS = VK_ACCESS_SHADER_WRITE_BIT | + VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; + const VkImageMemoryBarrier read_barrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = ACCESS_FLAGS, + .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .oldLayout = is_initialized ? 
VK_IMAGE_LAYOUT_GENERAL : VK_IMAGE_LAYOUT_UNDEFINED, + .newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = image, + .subresourceRange = { - .aspectMask = image->GetAspectMask(), - .mipLevel = level, + .aspectMask = aspect_mask, + .baseMipLevel = 0, + .levelCount = VK_REMAINING_MIP_LEVELS, .baseArrayLayer = 0, - .layerCount = static_cast<u32>(params.GetNumLayers()), + .layerCount = VK_REMAINING_ARRAY_LAYERS, }, - .imageOffset = {.x = 0, .y = 0, .z = 0}, - .imageExtent = + }; + const VkImageMemoryBarrier write_barrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = ACCESS_FLAGS, + .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + .newLayout = VK_IMAGE_LAYOUT_GENERAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = image, + .subresourceRange = { - .width = params.GetMipWidth(level), - .height = params.GetMipHeight(level), - .depth = params.target == SurfaceTarget::Texture3D ? params.GetMipDepth(level) : 1U, + .aspectMask = aspect_mask, + .baseMipLevel = 0, + .levelCount = VK_REMAINING_MIP_LEVELS, + .baseArrayLayer = 0, + .layerCount = VK_REMAINING_ARRAY_LAYERS, }, }; + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0, + read_barrier); + cmdbuf.CopyBufferToImage(src_buffer, image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, copies); + // TODO: Move this to another API + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, + write_barrier); } -VkImageSubresourceRange CachedSurface::GetImageSubresourceRange() const { - return {image->GetAspectMask(), 0, params.num_levels, 0, - static_cast<u32>(params.GetNumLayers())}; +[[nodiscard]] VkImageBlit MakeImageBlit(const std::array<Offset2D, 2>& dst_region, + const std::array<Offset2D, 2>& src_region, + const VkImageSubresourceLayers& dst_layers, + const VkImageSubresourceLayers& src_layers) { + return VkImageBlit{ + .srcSubresource = src_layers, + .srcOffsets = + { + { + .x = src_region[0].x, + .y = src_region[0].y, + .z = 0, + }, + { + .x = src_region[1].x, + .y = src_region[1].y, + .z = 1, + }, + }, + .dstSubresource = dst_layers, + .dstOffsets = + { + { + .x = dst_region[0].x, + .y = dst_region[0].y, + .z = 0, + }, + { + .x = dst_region[1].x, + .y = dst_region[1].y, + .z = 1, + }, + }, + }; } -CachedSurfaceView::CachedSurfaceView(const VKDevice& device, CachedSurface& surface, - const ViewParams& params) - : VideoCommon::ViewBase{params}, params{surface.GetSurfaceParams()}, - image{surface.GetImageHandle()}, buffer_view{surface.GetBufferViewHandle()}, - aspect_mask{surface.GetAspectMask()}, device{device}, surface{surface}, - base_level{params.base_level}, num_levels{params.num_levels}, - image_view_type{image ? 
GetImageViewType(params.target) : VK_IMAGE_VIEW_TYPE_1D} { - if (image_view_type == VK_IMAGE_VIEW_TYPE_3D) { - base_layer = 0; - num_layers = 1; - base_slice = params.base_layer; - num_slices = params.num_layers; - } else { - base_layer = params.base_layer; - num_layers = params.num_layers; - } +[[nodiscard]] VkImageResolve MakeImageResolve(const std::array<Offset2D, 2>& dst_region, + const std::array<Offset2D, 2>& src_region, + const VkImageSubresourceLayers& dst_layers, + const VkImageSubresourceLayers& src_layers) { + return VkImageResolve{ + .srcSubresource = src_layers, + .srcOffset = + { + .x = src_region[0].x, + .y = src_region[0].y, + .z = 0, + }, + .dstSubresource = dst_layers, + .dstOffset = + { + .x = dst_region[0].x, + .y = dst_region[0].y, + .z = 0, + }, + .extent = + { + .width = static_cast<u32>(dst_region[1].x - dst_region[0].x), + .height = static_cast<u32>(dst_region[1].y - dst_region[0].y), + .depth = 1, + }, + }; } -CachedSurfaceView::~CachedSurfaceView() = default; - -VkImageView CachedSurfaceView::GetImageView(SwizzleSource x_source, SwizzleSource y_source, - SwizzleSource z_source, SwizzleSource w_source) { - const u32 new_swizzle = EncodeSwizzle(x_source, y_source, z_source, w_source); - if (last_image_view && last_swizzle == new_swizzle) { - return last_image_view; +struct RangedBarrierRange { + u32 min_mip = std::numeric_limits<u32>::max(); + u32 max_mip = std::numeric_limits<u32>::min(); + u32 min_layer = std::numeric_limits<u32>::max(); + u32 max_layer = std::numeric_limits<u32>::min(); + + void AddLayers(const VkImageSubresourceLayers& layers) { + min_mip = std::min(min_mip, layers.mipLevel); + max_mip = std::max(max_mip, layers.mipLevel + 1); + min_layer = std::min(min_layer, layers.baseArrayLayer); + max_layer = std::max(max_layer, layers.baseArrayLayer + layers.layerCount); } - last_swizzle = new_swizzle; - const auto [entry, is_cache_miss] = view_cache.try_emplace(new_swizzle); - auto& image_view = entry->second; - if (!is_cache_miss) { - return last_image_view = *image_view; + VkImageSubresourceRange SubresourceRange(VkImageAspectFlags aspect_mask) const noexcept { + return VkImageSubresourceRange{ + .aspectMask = aspect_mask, + .baseMipLevel = min_mip, + .levelCount = max_mip - min_mip, + .baseArrayLayer = min_layer, + .layerCount = max_layer - min_layer, + }; } +}; - std::array swizzle{MaxwellToVK::SwizzleSource(x_source), MaxwellToVK::SwizzleSource(y_source), - MaxwellToVK::SwizzleSource(z_source), MaxwellToVK::SwizzleSource(w_source)}; - if (params.pixel_format == VideoCore::Surface::PixelFormat::A1B5G5R5_UNORM) { - // A1B5G5R5 is implemented as A1R5G5B5, we have to change the swizzle here. - std::swap(swizzle[0], swizzle[2]); - } +} // Anonymous namespace - // Games can sample depth or stencil values on textures. This is decided by the swizzle value on - // hardware. To emulate this on Vulkan we specify it in the aspect. - VkImageAspectFlags aspect = aspect_mask; - if (aspect == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { - UNIMPLEMENTED_IF(x_source != SwizzleSource::R && x_source != SwizzleSource::G); - const bool is_first = x_source == SwizzleSource::R; - switch (params.pixel_format) { - case VideoCore::Surface::PixelFormat::D24_UNORM_S8_UINT: - case VideoCore::Surface::PixelFormat::D32_FLOAT_S8_UINT: - aspect = is_first ? VK_IMAGE_ASPECT_DEPTH_BIT : VK_IMAGE_ASPECT_STENCIL_BIT; - break; - case VideoCore::Surface::PixelFormat::S8_UINT_D24_UNORM: - aspect = is_first ? 
VK_IMAGE_ASPECT_STENCIL_BIT : VK_IMAGE_ASPECT_DEPTH_BIT; - break; - default: - aspect = VK_IMAGE_ASPECT_DEPTH_BIT; - UNIMPLEMENTED(); - } +void TextureCacheRuntime::Finish() { + scheduler.Finish(); +} - // Make sure we sample the first component - std::transform( - swizzle.begin(), swizzle.end(), swizzle.begin(), [](VkComponentSwizzle component) { - return component == VK_COMPONENT_SWIZZLE_G ? VK_COMPONENT_SWIZZLE_R : component; - }); - } +ImageBufferMap TextureCacheRuntime::MapUploadBuffer(size_t size) { + const auto& buffer = staging_buffer_pool.GetUnusedBuffer(size, true); + return ImageBufferMap{ + .handle = *buffer.handle, + .map = buffer.commit->Map(size), + }; +} - if (image_view_type == VK_IMAGE_VIEW_TYPE_3D) { - ASSERT(base_slice == 0); - ASSERT(num_slices == params.depth); +void TextureCacheRuntime::BlitImage(Framebuffer* dst_framebuffer, ImageView& dst, ImageView& src, + const std::array<Offset2D, 2>& dst_region, + const std::array<Offset2D, 2>& src_region, + Tegra::Engines::Fermi2D::Filter filter, + Tegra::Engines::Fermi2D::Operation operation) { + const VkImageAspectFlags aspect_mask = ImageAspectMask(src.format); + const bool is_dst_msaa = dst.Samples() != VK_SAMPLE_COUNT_1_BIT; + const bool is_src_msaa = src.Samples() != VK_SAMPLE_COUNT_1_BIT; + ASSERT(aspect_mask == ImageAspectMask(dst.format)); + if (aspect_mask == VK_IMAGE_ASPECT_COLOR_BIT && !is_src_msaa && !is_dst_msaa) { + blit_image_helper.BlitColor(dst_framebuffer, src, dst_region, src_region, filter, + operation); + return; } - - image_view = device.GetLogical().CreateImageView({ - .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, - .pNext = nullptr, - .flags = 0, - .image = surface.GetImageHandle(), - .viewType = image_view_type, - .format = surface.GetImage().GetFormat(), - .components = - { - .r = swizzle[0], - .g = swizzle[1], - .b = swizzle[2], - .a = swizzle[3], + if (aspect_mask == (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) { + if (!device.IsBlitDepthStencilSupported()) { + UNIMPLEMENTED_IF(is_src_msaa || is_dst_msaa); + blit_image_helper.BlitDepthStencil(dst_framebuffer, src.DepthView(), src.StencilView(), + dst_region, src_region, filter, operation); + return; + } + } + ASSERT(src.ImageFormat() == dst.ImageFormat()); + ASSERT(!(is_dst_msaa && !is_src_msaa)); + ASSERT(operation == Fermi2D::Operation::SrcCopy); + + const VkImage dst_image = dst.ImageHandle(); + const VkImage src_image = src.ImageHandle(); + const VkImageSubresourceLayers dst_layers = MakeSubresourceLayers(&dst); + const VkImageSubresourceLayers src_layers = MakeSubresourceLayers(&src); + const bool is_resolve = is_src_msaa && !is_dst_msaa; + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([filter, dst_region, src_region, dst_image, src_image, dst_layers, src_layers, + aspect_mask, is_resolve](vk::CommandBuffer cmdbuf) { + const std::array read_barriers{ + VkImageMemoryBarrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | + VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT, + .oldLayout = VK_IMAGE_LAYOUT_GENERAL, + .newLayout = VK_IMAGE_LAYOUT_GENERAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = src_image, + .subresourceRange{ + .aspectMask = aspect_mask, + .baseMipLevel = 0, + .levelCount = VK_REMAINING_MIP_LEVELS, + .baseArrayLayer = 0, + .layerCount = VK_REMAINING_ARRAY_LAYERS, + }, }, 
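+            // This second barrier moves the destination image into TRANSFER_DST_OPTIMAL
+            // and orders any prior writes to it before the upcoming transfer write.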
- .subresourceRange = - { - .aspectMask = aspect, - .baseMipLevel = base_level, - .levelCount = num_levels, - .baseArrayLayer = base_layer, - .layerCount = num_layers, + VkImageMemoryBarrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | + VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .oldLayout = VK_IMAGE_LAYOUT_GENERAL, + .newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = dst_image, + .subresourceRange{ + .aspectMask = aspect_mask, + .baseMipLevel = 0, + .levelCount = VK_REMAINING_MIP_LEVELS, + .baseArrayLayer = 0, + .layerCount = VK_REMAINING_ARRAY_LAYERS, + }, + }, + }; + VkImageMemoryBarrier write_barrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | + VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, + .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + .newLayout = VK_IMAGE_LAYOUT_GENERAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = dst_image, + .subresourceRange{ + .aspectMask = aspect_mask, + .baseMipLevel = 0, + .levelCount = VK_REMAINING_MIP_LEVELS, + .baseArrayLayer = 0, + .layerCount = VK_REMAINING_ARRAY_LAYERS, }, + }; + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, + 0, nullptr, nullptr, read_barriers); + if (is_resolve) { + cmdbuf.ResolveImage(src_image, VK_IMAGE_LAYOUT_GENERAL, dst_image, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + MakeImageResolve(dst_region, src_region, dst_layers, src_layers)); + } else { + const bool is_linear = filter == Fermi2D::Filter::Bilinear; + const VkFilter vk_filter = is_linear ? 
VK_FILTER_LINEAR : VK_FILTER_NEAREST; + cmdbuf.BlitImage( + src_image, VK_IMAGE_LAYOUT_GENERAL, dst_image, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + MakeImageBlit(dst_region, src_region, dst_layers, src_layers), vk_filter); + } + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + 0, write_barrier); }); - - return last_image_view = *image_view; } -VkImageView CachedSurfaceView::GetAttachment() { - if (render_target) { - return *render_target; +void TextureCacheRuntime::ConvertImage(Framebuffer* dst, ImageView& dst_view, ImageView& src_view) { + switch (dst_view.format) { + case PixelFormat::R16_UNORM: + if (src_view.format == PixelFormat::D16_UNORM) { + return blit_image_helper.ConvertD16ToR16(dst, src_view); + } + break; + case PixelFormat::R32_FLOAT: + if (src_view.format == PixelFormat::D32_FLOAT) { + return blit_image_helper.ConvertD32ToR32(dst, src_view); + } + break; + case PixelFormat::D16_UNORM: + if (src_view.format == PixelFormat::R16_UNORM) { + return blit_image_helper.ConvertR16ToD16(dst, src_view); + } + break; + case PixelFormat::D32_FLOAT: + if (src_view.format == PixelFormat::R32_FLOAT) { + return blit_image_helper.ConvertR32ToD32(dst, src_view); + } + break; + default: + break; } + UNIMPLEMENTED_MSG("Unimplemented format copy from {} to {}", src_view.format, dst_view.format); +} - VkImageViewCreateInfo ci{ - .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, - .pNext = nullptr, - .flags = 0, - .image = surface.GetImageHandle(), - .viewType = VK_IMAGE_VIEW_TYPE_1D, - .format = surface.GetImage().GetFormat(), - .components = - { - .r = VK_COMPONENT_SWIZZLE_IDENTITY, - .g = VK_COMPONENT_SWIZZLE_IDENTITY, - .b = VK_COMPONENT_SWIZZLE_IDENTITY, - .a = VK_COMPONENT_SWIZZLE_IDENTITY, +void TextureCacheRuntime::CopyImage(Image& dst, Image& src, + std::span<const VideoCommon::ImageCopy> copies) { + std::vector<VkImageCopy> vk_copies(copies.size()); + const VkImageAspectFlags aspect_mask = dst.AspectMask(); + ASSERT(aspect_mask == src.AspectMask()); + + std::ranges::transform(copies, vk_copies.begin(), [aspect_mask](const auto& copy) { + return MakeImageCopy(copy, aspect_mask); + }); + const VkImage dst_image = dst.Handle(); + const VkImage src_image = src.Handle(); + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([dst_image, src_image, aspect_mask, vk_copies](vk::CommandBuffer cmdbuf) { + RangedBarrierRange dst_range; + RangedBarrierRange src_range; + for (const VkImageCopy& copy : vk_copies) { + dst_range.AddLayers(copy.dstSubresource); + src_range.AddLayers(copy.srcSubresource); + } + const std::array read_barriers{ + VkImageMemoryBarrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | + VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .oldLayout = VK_IMAGE_LAYOUT_GENERAL, + .newLayout = VK_IMAGE_LAYOUT_GENERAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = src_image, + .subresourceRange = src_range.SubresourceRange(aspect_mask), }, - .subresourceRange = - { - .aspectMask = aspect_mask, - .baseMipLevel = base_level, - .levelCount = num_levels, - .baseArrayLayer = 0, - .layerCount = 0, + VkImageMemoryBarrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT | + 
VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | + VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | + VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .oldLayout = VK_IMAGE_LAYOUT_GENERAL, + .newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = dst_image, + .subresourceRange = dst_range.SubresourceRange(aspect_mask), }, - }; - if (image_view_type == VK_IMAGE_VIEW_TYPE_3D) { - ci.viewType = num_slices > 1 ? VK_IMAGE_VIEW_TYPE_2D_ARRAY : VK_IMAGE_VIEW_TYPE_2D; - ci.subresourceRange.baseArrayLayer = base_slice; - ci.subresourceRange.layerCount = num_slices; + }; + const VkImageMemoryBarrier write_barrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT | + VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | + VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | + VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, + .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, + .newLayout = VK_IMAGE_LAYOUT_GENERAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = dst_image, + .subresourceRange = dst_range.SubresourceRange(aspect_mask), + }; + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, + 0, {}, {}, read_barriers); + cmdbuf.CopyImage(src_image, VK_IMAGE_LAYOUT_GENERAL, dst_image, + VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, vk_copies); + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + 0, write_barrier); + }); +} + +Image::Image(TextureCacheRuntime& runtime, const ImageInfo& info_, GPUVAddr gpu_addr_, + VAddr cpu_addr_) + : VideoCommon::ImageBase(info_, gpu_addr_, cpu_addr_), scheduler{&runtime.scheduler}, + image(MakeImage(runtime.device, info)), buffer(MakeBuffer(runtime.device, info)), + aspect_mask(ImageAspectMask(info.format)) { + if (image) { + commit = runtime.memory_manager.Commit(image, false); } else { - ci.viewType = image_view_type; - ci.subresourceRange.baseArrayLayer = base_layer; - ci.subresourceRange.layerCount = num_layers; + commit = runtime.memory_manager.Commit(buffer, false); + } + if (IsPixelFormatASTC(info.format) && !runtime.device.IsOptimalAstcSupported()) { + flags |= VideoCommon::ImageFlagBits::Converted; + } + if (runtime.device.HasDebuggingToolAttached()) { + if (image) { + image.SetObjectNameEXT(VideoCommon::Name(*this).c_str()); + } else { + buffer.SetObjectNameEXT(VideoCommon::Name(*this).c_str()); + } } - render_target = device.GetLogical().CreateImageView(ci); - return *render_target; } -VKTextureCache::VKTextureCache(VideoCore::RasterizerInterface& rasterizer, - Tegra::Engines::Maxwell3D& maxwell3d, - Tegra::MemoryManager& gpu_memory, const VKDevice& device_, - VKMemoryManager& memory_manager_, VKScheduler& scheduler_, - VKStagingBufferPool& staging_pool_) - : TextureCache(rasterizer, maxwell3d, gpu_memory, device_.IsOptimalAstcSupported()), - device{device_}, memory_manager{memory_manager_}, scheduler{scheduler_}, staging_pool{ - staging_pool_} {} - -VKTextureCache::~VKTextureCache() = default; - -Surface VKTextureCache::CreateSurface(GPUVAddr gpu_addr, const 
SurfaceParams& params) { - return std::make_shared<CachedSurface>(device, memory_manager, scheduler, staging_pool, - gpu_addr, params); +void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset, + std::span<const BufferImageCopy> copies) { + // TODO: Move this to another API + scheduler->RequestOutsideRenderPassOperationContext(); + std::vector vk_copies = TransformBufferImageCopies(copies, buffer_offset, aspect_mask); + const VkBuffer src_buffer = map.handle; + const VkImage vk_image = *image; + const VkImageAspectFlags vk_aspect_mask = aspect_mask; + const bool is_initialized = std::exchange(initialized, true); + scheduler->Record([src_buffer, vk_image, vk_aspect_mask, is_initialized, + vk_copies](vk::CommandBuffer cmdbuf) { + CopyBufferToImage(cmdbuf, src_buffer, vk_image, vk_aspect_mask, is_initialized, vk_copies); + }); } -void VKTextureCache::ImageCopy(Surface& src_surface, Surface& dst_surface, - const VideoCommon::CopyParams& copy_params) { - const bool src_3d = src_surface->GetSurfaceParams().target == SurfaceTarget::Texture3D; - const bool dst_3d = dst_surface->GetSurfaceParams().target == SurfaceTarget::Texture3D; - UNIMPLEMENTED_IF(src_3d); +void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset, + std::span<const VideoCommon::BufferCopy> copies) { + // TODO: Move this to another API + scheduler->RequestOutsideRenderPassOperationContext(); + std::vector vk_copies = TransformBufferCopies(copies, buffer_offset); + const VkBuffer src_buffer = map.handle; + const VkBuffer dst_buffer = *buffer; + scheduler->Record([src_buffer, dst_buffer, vk_copies](vk::CommandBuffer cmdbuf) { + // TODO: Barriers + cmdbuf.CopyBuffer(src_buffer, dst_buffer, vk_copies); + }); +} - // The texture cache handles depth in OpenGL terms, we have to handle it as subresource and - // dimension respectively. - const u32 dst_base_layer = dst_3d ? 0 : copy_params.dest_z; - const u32 dst_offset_z = dst_3d ? copy_params.dest_z : 0; +void Image::DownloadMemory(const ImageBufferMap& map, size_t buffer_offset, + std::span<const BufferImageCopy> copies) { + std::vector vk_copies = TransformBufferImageCopies(copies, buffer_offset, aspect_mask); + scheduler->Record([buffer = map.handle, image = *image, aspect_mask = aspect_mask, + vk_copies](vk::CommandBuffer cmdbuf) { + // TODO: Barriers + cmdbuf.CopyImageToBuffer(image, VK_IMAGE_LAYOUT_GENERAL, buffer, vk_copies); + }); +} - const u32 extent_z = dst_3d ? copy_params.depth : 1; - const u32 num_layers = dst_3d ? 
1 : copy_params.depth; +ImageView::ImageView(TextureCacheRuntime& runtime, const VideoCommon::ImageViewInfo& info, + ImageId image_id_, Image& image) + : VideoCommon::ImageViewBase{info, image.info, image_id_}, device{&runtime.device}, + image_handle{image.Handle()}, image_format{image.info.format}, samples{ConvertSampleCount( + image.info.num_samples)} { + const VkImageAspectFlags aspect_mask = ImageViewAspectMask(info); + std::array<SwizzleSource, 4> swizzle{ + SwizzleSource::R, + SwizzleSource::G, + SwizzleSource::B, + SwizzleSource::A, + }; + if (!info.IsRenderTarget()) { + swizzle = info.Swizzle(); + if ((aspect_mask & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) != 0) { + std::ranges::transform(swizzle, swizzle.begin(), ConvertGreenRed); + } + } + const VkFormat vk_format = + MaxwellToVK::SurfaceFormat(*device, FormatType::Optimal, format).format; + const VkImageViewCreateInfo create_info{ + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .image = image.Handle(), + .viewType = VkImageViewType{}, + .format = vk_format, + .components{ + .r = ComponentSwizzle(swizzle[0]), + .g = ComponentSwizzle(swizzle[1]), + .b = ComponentSwizzle(swizzle[2]), + .a = ComponentSwizzle(swizzle[3]), + }, + .subresourceRange = MakeSubresourceRange(aspect_mask, info.range), + }; + const auto create = [&](VideoCommon::ImageViewType view_type, std::optional<u32> num_layers) { + VkImageViewCreateInfo ci{create_info}; + ci.viewType = ImageViewType(view_type); + if (num_layers) { + ci.subresourceRange.layerCount = *num_layers; + } + vk::ImageView handle = device->GetLogical().CreateImageView(ci); + if (device->HasDebuggingToolAttached()) { + handle.SetObjectNameEXT(VideoCommon::Name(*this, view_type).c_str()); + } + image_views[static_cast<size_t>(view_type)] = std::move(handle); + }; + switch (info.type) { + case VideoCommon::ImageViewType::e1D: + case VideoCommon::ImageViewType::e1DArray: + create(VideoCommon::ImageViewType::e1D, 1); + create(VideoCommon::ImageViewType::e1DArray, std::nullopt); + render_target = Handle(VideoCommon::ImageViewType::e1DArray); + break; + case VideoCommon::ImageViewType::e2D: + case VideoCommon::ImageViewType::e2DArray: + create(VideoCommon::ImageViewType::e2D, 1); + create(VideoCommon::ImageViewType::e2DArray, std::nullopt); + render_target = Handle(VideoCommon::ImageViewType::e2DArray); + break; + case VideoCommon::ImageViewType::e3D: + create(VideoCommon::ImageViewType::e3D, std::nullopt); + render_target = Handle(VideoCommon::ImageViewType::e3D); + break; + case VideoCommon::ImageViewType::Cube: + case VideoCommon::ImageViewType::CubeArray: + create(VideoCommon::ImageViewType::Cube, 6); + create(VideoCommon::ImageViewType::CubeArray, std::nullopt); + break; + case VideoCommon::ImageViewType::Rect: + UNIMPLEMENTED(); + break; + case VideoCommon::ImageViewType::Buffer: + buffer_view = device->GetLogical().CreateBufferView(VkBufferViewCreateInfo{ + .sType = VK_STRUCTURE_TYPE_BUFFER_VIEW_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .buffer = image.Buffer(), + .format = vk_format, + .offset = 0, // TODO: Redesign buffer cache to support this + .range = image.guest_size_bytes, + }); + break; + } +} - // We can't copy inside a renderpass - scheduler.RequestOutsideRenderPassOperationContext(); +ImageView::ImageView(TextureCacheRuntime&, const VideoCommon::NullImageParams& params) + : VideoCommon::ImageViewBase{params} {} - src_surface->Transition(copy_params.source_z, copy_params.depth, copy_params.source_level, 1, - 
VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_READ_BIT, - VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL); - dst_surface->Transition(dst_base_layer, num_layers, copy_params.dest_level, 1, - VK_PIPELINE_STAGE_TRANSFER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL); +VkImageView ImageView::DepthView() { + if (depth_view) { + return *depth_view; + } + depth_view = MakeDepthStencilView(VK_IMAGE_ASPECT_DEPTH_BIT); + return *depth_view; +} - const VkImageCopy copy{ - .srcSubresource = - { - .aspectMask = src_surface->GetAspectMask(), - .mipLevel = copy_params.source_level, - .baseArrayLayer = copy_params.source_z, - .layerCount = num_layers, - }, - .srcOffset = - { - .x = static_cast<s32>(copy_params.source_x), - .y = static_cast<s32>(copy_params.source_y), - .z = 0, - }, - .dstSubresource = - { - .aspectMask = dst_surface->GetAspectMask(), - .mipLevel = copy_params.dest_level, - .baseArrayLayer = dst_base_layer, - .layerCount = num_layers, - }, - .dstOffset = - { - .x = static_cast<s32>(copy_params.dest_x), - .y = static_cast<s32>(copy_params.dest_y), - .z = static_cast<s32>(dst_offset_z), - }, - .extent = - { - .width = copy_params.width, - .height = copy_params.height, - .depth = extent_z, - }, - }; +VkImageView ImageView::StencilView() { + if (stencil_view) { + return *stencil_view; + } + stencil_view = MakeDepthStencilView(VK_IMAGE_ASPECT_STENCIL_BIT); + return *stencil_view; +} - const VkImage src_image = src_surface->GetImageHandle(); - const VkImage dst_image = dst_surface->GetImageHandle(); - scheduler.Record([src_image, dst_image, copy](vk::CommandBuffer cmdbuf) { - cmdbuf.CopyImage(src_image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, dst_image, - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, copy); +vk::ImageView ImageView::MakeDepthStencilView(VkImageAspectFlags aspect_mask) { + return device->GetLogical().CreateImageView({ + .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .image = image_handle, + .viewType = ImageViewType(type), + .format = MaxwellToVK::SurfaceFormat(*device, FormatType::Optimal, format).format, + .components{ + .r = VK_COMPONENT_SWIZZLE_IDENTITY, + .g = VK_COMPONENT_SWIZZLE_IDENTITY, + .b = VK_COMPONENT_SWIZZLE_IDENTITY, + .a = VK_COMPONENT_SWIZZLE_IDENTITY, + }, + .subresourceRange = MakeSubresourceRange(aspect_mask, range), }); } -void VKTextureCache::ImageBlit(View& src_view, View& dst_view, - const Tegra::Engines::Fermi2D::Config& copy_config) { - // We can't blit inside a renderpass - scheduler.RequestOutsideRenderPassOperationContext(); - - src_view->Transition(VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, VK_PIPELINE_STAGE_TRANSFER_BIT, - VK_ACCESS_TRANSFER_READ_BIT); - dst_view->Transition(VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, VK_PIPELINE_STAGE_TRANSFER_BIT, - VK_ACCESS_TRANSFER_WRITE_BIT); - - VkImageBlit blit; - blit.srcSubresource = src_view->GetImageSubresourceLayers(); - blit.srcOffsets[0].x = copy_config.src_rect.left; - blit.srcOffsets[0].y = copy_config.src_rect.top; - blit.srcOffsets[0].z = 0; - blit.srcOffsets[1].x = copy_config.src_rect.right; - blit.srcOffsets[1].y = copy_config.src_rect.bottom; - blit.srcOffsets[1].z = 1; - blit.dstSubresource = dst_view->GetImageSubresourceLayers(); - blit.dstOffsets[0].x = copy_config.dst_rect.left; - blit.dstOffsets[0].y = copy_config.dst_rect.top; - blit.dstOffsets[0].z = 0; - blit.dstOffsets[1].x = copy_config.dst_rect.right; - blit.dstOffsets[1].y = copy_config.dst_rect.bottom; - blit.dstOffsets[1].z = 1; - - const bool is_linear = copy_config.filter == 
Tegra::Engines::Fermi2D::Filter::Linear; - - scheduler.Record([src_image = src_view->GetImage(), dst_image = dst_view->GetImage(), blit, - is_linear](vk::CommandBuffer cmdbuf) { - cmdbuf.BlitImage(src_image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, dst_image, - VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, blit, - is_linear ? VK_FILTER_LINEAR : VK_FILTER_NEAREST); +Sampler::Sampler(TextureCacheRuntime& runtime, const Tegra::Texture::TSCEntry& tsc) { + const auto& device = runtime.device; + const bool arbitrary_borders = runtime.device.IsExtCustomBorderColorSupported(); + const std::array<float, 4> color = tsc.BorderColor(); + // C++20 bit_cast + VkClearColorValue border_color; + std::memcpy(&border_color, &color, sizeof(color)); + const VkSamplerCustomBorderColorCreateInfoEXT border_ci{ + .sType = VK_STRUCTURE_TYPE_SAMPLER_CUSTOM_BORDER_COLOR_CREATE_INFO_EXT, + .pNext = nullptr, + .customBorderColor = border_color, + .format = VK_FORMAT_UNDEFINED, + }; + const void* pnext = nullptr; + if (arbitrary_borders) { + pnext = &border_ci; + } + const VkSamplerReductionModeCreateInfoEXT reduction_ci{ + .sType = VK_STRUCTURE_TYPE_SAMPLER_REDUCTION_MODE_CREATE_INFO_EXT, + .pNext = pnext, + .reductionMode = MaxwellToVK::SamplerReduction(tsc.reduction_filter), + }; + if (runtime.device.IsExtSamplerFilterMinmaxSupported()) { + pnext = &reduction_ci; + } else if (reduction_ci.reductionMode != VK_SAMPLER_REDUCTION_MODE_WEIGHTED_AVERAGE_EXT) { + LOG_WARNING(Render_Vulkan, "VK_EXT_sampler_filter_minmax is required"); + } + // Some games have samplers with garbage. Sanitize them here. + const float max_anisotropy = std::clamp(tsc.MaxAnisotropy(), 1.0f, 16.0f); + sampler = device.GetLogical().CreateSampler(VkSamplerCreateInfo{ + .sType = VK_STRUCTURE_TYPE_SAMPLER_CREATE_INFO, + .pNext = pnext, + .flags = 0, + .magFilter = MaxwellToVK::Sampler::Filter(tsc.mag_filter), + .minFilter = MaxwellToVK::Sampler::Filter(tsc.min_filter), + .mipmapMode = MaxwellToVK::Sampler::MipmapMode(tsc.mipmap_filter), + .addressModeU = MaxwellToVK::Sampler::WrapMode(device, tsc.wrap_u, tsc.mag_filter), + .addressModeV = MaxwellToVK::Sampler::WrapMode(device, tsc.wrap_v, tsc.mag_filter), + .addressModeW = MaxwellToVK::Sampler::WrapMode(device, tsc.wrap_p, tsc.mag_filter), + .mipLodBias = tsc.LodBias(), + .anisotropyEnable = static_cast<VkBool32>(max_anisotropy > 1.0f ? VK_TRUE : VK_FALSE), + .maxAnisotropy = max_anisotropy, + .compareEnable = tsc.depth_compare_enabled, + .compareOp = MaxwellToVK::Sampler::DepthCompareFunction(tsc.depth_compare_func), + .minLod = tsc.mipmap_filter == TextureMipmapFilter::None ? 0.0f : tsc.MinLod(), + .maxLod = tsc.mipmap_filter == TextureMipmapFilter::None ? 0.25f : tsc.MaxLod(), + .borderColor = + arbitrary_borders ? VK_BORDER_COLOR_INT_CUSTOM_EXT : ConvertBorderColor(color), + .unnormalizedCoordinates = VK_FALSE, }); } -void VKTextureCache::BufferCopy(Surface& src_surface, Surface& dst_surface) { - // Currently unimplemented. PBO copies should be dropped and we should use a render pass to - // convert from color to depth and viceversa. 
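+// Color to depth conversions (and vice versa) now go through BlitImageHelper via
+// TextureCacheRuntime::ConvertImage instead of the old buffer-copy path.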
- LOG_WARNING(Render_Vulkan, "Unimplemented"); +Framebuffer::Framebuffer(TextureCacheRuntime& runtime, std::span<ImageView*, NUM_RT> color_buffers, + ImageView* depth_buffer, const VideoCommon::RenderTargets& key) { + std::vector<VkAttachmentDescription> descriptions; + std::vector<VkImageView> attachments; + RenderPassKey renderpass_key{}; + s32 num_layers = 1; + + for (size_t index = 0; index < NUM_RT; ++index) { + const ImageView* const color_buffer = color_buffers[index]; + if (!color_buffer) { + renderpass_key.color_formats[index] = PixelFormat::Invalid; + continue; + } + descriptions.push_back(AttachmentDescription(runtime.device, color_buffer)); + attachments.push_back(color_buffer->RenderTarget()); + renderpass_key.color_formats[index] = color_buffer->format; + num_layers = std::max(num_layers, color_buffer->range.extent.layers); + images[num_images] = color_buffer->ImageHandle(); + image_ranges[num_images] = MakeSubresourceRange(color_buffer); + samples = color_buffer->Samples(); + ++num_images; + } + const size_t num_colors = attachments.size(); + const VkAttachmentReference* depth_attachment = + depth_buffer ? &ATTACHMENT_REFERENCES[num_colors] : nullptr; + if (depth_buffer) { + descriptions.push_back(AttachmentDescription(runtime.device, depth_buffer)); + attachments.push_back(depth_buffer->RenderTarget()); + renderpass_key.depth_format = depth_buffer->format; + num_layers = std::max(num_layers, depth_buffer->range.extent.layers); + images[num_images] = depth_buffer->ImageHandle(); + image_ranges[num_images] = MakeSubresourceRange(depth_buffer); + samples = depth_buffer->Samples(); + ++num_images; + } else { + renderpass_key.depth_format = PixelFormat::Invalid; + } + renderpass_key.samples = samples; + + const auto& device = runtime.device.GetLogical(); + const auto [cache_pair, is_new] = runtime.renderpass_cache.try_emplace(renderpass_key); + if (is_new) { + const VkSubpassDescription subpass{ + .flags = 0, + .pipelineBindPoint = VK_PIPELINE_BIND_POINT_GRAPHICS, + .inputAttachmentCount = 0, + .pInputAttachments = nullptr, + .colorAttachmentCount = static_cast<u32>(num_colors), + .pColorAttachments = num_colors != 0 ? 
ATTACHMENT_REFERENCES.data() : nullptr, + .pResolveAttachments = nullptr, + .pDepthStencilAttachment = depth_attachment, + .preserveAttachmentCount = 0, + .pPreserveAttachments = nullptr, + }; + cache_pair->second = device.CreateRenderPass(VkRenderPassCreateInfo{ + .sType = VK_STRUCTURE_TYPE_RENDER_PASS_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .attachmentCount = static_cast<u32>(descriptions.size()), + .pAttachments = descriptions.data(), + .subpassCount = 1, + .pSubpasses = &subpass, + .dependencyCount = 0, + .pDependencies = nullptr, + }); + } + renderpass = *cache_pair->second; + render_area = VkExtent2D{ + .width = key.size.width, + .height = key.size.height, + }; + num_color_buffers = static_cast<u32>(num_colors); + framebuffer = device.CreateFramebuffer(VkFramebufferCreateInfo{ + .sType = VK_STRUCTURE_TYPE_FRAMEBUFFER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .renderPass = renderpass, + .attachmentCount = static_cast<u32>(attachments.size()), + .pAttachments = attachments.data(), + .width = key.size.width, + .height = key.size.height, + .layers = static_cast<u32>(num_layers), + }); + if (runtime.device.HasDebuggingToolAttached()) { + framebuffer.SetObjectNameEXT(VideoCommon::Name(key).c_str()); + } } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h index 39202feba..92a7aad8b 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.h +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -4,216 +4,270 @@ #pragma once -#include <memory> -#include <unordered_map> +#include <compare> +#include <span> -#include "common/common_types.h" -#include "video_core/renderer_vulkan/vk_image.h" #include "video_core/renderer_vulkan/vk_memory_manager.h" -#include "video_core/renderer_vulkan/vk_scheduler.h" -#include "video_core/renderer_vulkan/wrapper.h" -#include "video_core/texture_cache/surface_base.h" #include "video_core/texture_cache/texture_cache.h" - -namespace VideoCore { -class RasterizerInterface; -} +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { -class RasterizerVulkan; -class VKDevice; +using VideoCommon::ImageId; +using VideoCommon::NUM_RT; +using VideoCommon::Offset2D; +using VideoCommon::RenderTargets; +using VideoCore::Surface::PixelFormat; + class VKScheduler; class VKStagingBufferPool; -class CachedSurfaceView; -class CachedSurface; +class BlitImageHelper; +class Device; +class Image; +class ImageView; +class Framebuffer; -using Surface = std::shared_ptr<CachedSurface>; -using View = std::shared_ptr<CachedSurfaceView>; -using TextureCacheBase = VideoCommon::TextureCache<Surface, View>; +struct RenderPassKey { + constexpr auto operator<=>(const RenderPassKey&) const noexcept = default; -using VideoCommon::SurfaceParams; -using VideoCommon::ViewParams; + std::array<PixelFormat, NUM_RT> color_formats; + PixelFormat depth_format; + VkSampleCountFlagBits samples; +}; -class CachedSurface final : public VideoCommon::SurfaceBase<View> { - friend CachedSurfaceView; +} // namespace Vulkan -public: - explicit CachedSurface(const VKDevice& device, VKMemoryManager& memory_manager, - VKScheduler& scheduler, VKStagingBufferPool& staging_pool, - GPUVAddr gpu_addr, const SurfaceParams& params); - ~CachedSurface(); +namespace std { +template <> +struct hash<Vulkan::RenderPassKey> { + [[nodiscard]] constexpr size_t operator()(const Vulkan::RenderPassKey& key) const noexcept { + size_t value = static_cast<size_t>(key.depth_format) << 48; + value ^= 
static_cast<size_t>(key.samples) << 52; + for (size_t i = 0; i < key.color_formats.size(); ++i) { + value ^= static_cast<size_t>(key.color_formats[i]) << (i * 6); + } + return value; + } +}; +} // namespace std - void UploadTexture(const std::vector<u8>& staging_buffer) override; - void DownloadTexture(std::vector<u8>& staging_buffer) override; +namespace Vulkan { - void FullTransition(VkPipelineStageFlags new_stage_mask, VkAccessFlags new_access, - VkImageLayout new_layout) { - image->Transition(0, static_cast<u32>(params.GetNumLayers()), 0, params.num_levels, - new_stage_mask, new_access, new_layout); +struct ImageBufferMap { + [[nodiscard]] VkBuffer Handle() const noexcept { + return handle; } - void Transition(u32 base_layer, u32 num_layers, u32 base_level, u32 num_levels, - VkPipelineStageFlags new_stage_mask, VkAccessFlags new_access, - VkImageLayout new_layout) { - image->Transition(base_layer, num_layers, base_level, num_levels, new_stage_mask, - new_access, new_layout); + [[nodiscard]] std::span<u8> Span() const noexcept { + return map.Span(); } - VKImage& GetImage() { - return *image; - } + VkBuffer handle; + MemoryMap map; +}; - const VKImage& GetImage() const { - return *image; +struct TextureCacheRuntime { + const Device& device; + VKScheduler& scheduler; + VKMemoryManager& memory_manager; + VKStagingBufferPool& staging_buffer_pool; + BlitImageHelper& blit_image_helper; + std::unordered_map<RenderPassKey, vk::RenderPass> renderpass_cache; + + void Finish(); + + [[nodiscard]] ImageBufferMap MapUploadBuffer(size_t size); + + [[nodiscard]] ImageBufferMap MapDownloadBuffer(size_t size) { + // TODO: Have a special function for this + return MapUploadBuffer(size); } - VkImage GetImageHandle() const { - return *image->GetHandle(); + void BlitImage(Framebuffer* dst_framebuffer, ImageView& dst, ImageView& src, + const std::array<Offset2D, 2>& dst_region, + const std::array<Offset2D, 2>& src_region, + Tegra::Engines::Fermi2D::Filter filter, + Tegra::Engines::Fermi2D::Operation operation); + + void CopyImage(Image& dst, Image& src, std::span<const VideoCommon::ImageCopy> copies); + + void ConvertImage(Framebuffer* dst, ImageView& dst_view, ImageView& src_view); + + [[nodiscard]] bool CanAccelerateImageUpload(Image&) const noexcept { + return false; } - VkImageAspectFlags GetAspectMask() const { - return image->GetAspectMask(); + void AccelerateImageUpload(Image&, const ImageBufferMap&, size_t, + std::span<const VideoCommon::SwizzleParameters>) { + UNREACHABLE(); } - VkBufferView GetBufferViewHandle() const { - return *buffer_view; + void InsertUploadMemoryBarrier() {} + + bool HasBrokenTextureViewFormats() const noexcept { + // No known Vulkan driver has broken image views + return false; } +}; -protected: - void DecorateSurfaceName(); +class Image : public VideoCommon::ImageBase { +public: + explicit Image(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, GPUVAddr gpu_addr, + VAddr cpu_addr); - View CreateView(const ViewParams& params) override; + void UploadMemory(const ImageBufferMap& map, size_t buffer_offset, + std::span<const VideoCommon::BufferImageCopy> copies); -private: - void UploadBuffer(const std::vector<u8>& staging_buffer); + void UploadMemory(const ImageBufferMap& map, size_t buffer_offset, + std::span<const VideoCommon::BufferCopy> copies); - void UploadImage(const std::vector<u8>& staging_buffer); + void DownloadMemory(const ImageBufferMap& map, size_t buffer_offset, + std::span<const VideoCommon::BufferImageCopy> copies); - VkBufferImageCopy 
GetBufferImageCopy(u32 level) const; + [[nodiscard]] VkImage Handle() const noexcept { + return *image; + } - VkImageSubresourceRange GetImageSubresourceRange() const; + [[nodiscard]] VkBuffer Buffer() const noexcept { + return *buffer; + } - const VKDevice& device; - VKMemoryManager& memory_manager; - VKScheduler& scheduler; - VKStagingBufferPool& staging_pool; + [[nodiscard]] VkImageCreateFlags AspectMask() const noexcept { + return aspect_mask; + } - std::optional<VKImage> image; +private: + VKScheduler* scheduler; + vk::Image image; vk::Buffer buffer; - vk::BufferView buffer_view; VKMemoryCommit commit; - - VkFormat format = VK_FORMAT_UNDEFINED; + VkImageAspectFlags aspect_mask = 0; + bool initialized = false; }; -class CachedSurfaceView final : public VideoCommon::ViewBase { +class ImageView : public VideoCommon::ImageViewBase { public: - explicit CachedSurfaceView(const VKDevice& device, CachedSurface& surface, - const ViewParams& params); - ~CachedSurfaceView(); + explicit ImageView(TextureCacheRuntime&, const VideoCommon::ImageViewInfo&, ImageId, Image&); + explicit ImageView(TextureCacheRuntime&, const VideoCommon::NullImageParams&); - VkImageView GetImageView(Tegra::Texture::SwizzleSource x_source, - Tegra::Texture::SwizzleSource y_source, - Tegra::Texture::SwizzleSource z_source, - Tegra::Texture::SwizzleSource w_source); + [[nodiscard]] VkImageView DepthView(); - VkImageView GetAttachment(); + [[nodiscard]] VkImageView StencilView(); - bool IsSameSurface(const CachedSurfaceView& rhs) const { - return &surface == &rhs.surface; + [[nodiscard]] VkImageView Handle(VideoCommon::ImageViewType query_type) const noexcept { + return *image_views[static_cast<size_t>(query_type)]; } - u32 GetWidth() const { - return params.GetMipWidth(base_level); + [[nodiscard]] VkBufferView BufferView() const noexcept { + return *buffer_view; } - u32 GetHeight() const { - return params.GetMipHeight(base_level); + [[nodiscard]] VkImage ImageHandle() const noexcept { + return image_handle; } - u32 GetNumLayers() const { - return num_layers; + [[nodiscard]] VkImageView RenderTarget() const noexcept { + return render_target; } - bool IsBufferView() const { - return buffer_view; + [[nodiscard]] PixelFormat ImageFormat() const noexcept { + return image_format; } - VkImage GetImage() const { - return image; + [[nodiscard]] VkSampleCountFlagBits Samples() const noexcept { + return samples; } - VkBufferView GetBufferView() const { - return buffer_view; - } +private: + [[nodiscard]] vk::ImageView MakeDepthStencilView(VkImageAspectFlags aspect_mask); - VkImageSubresourceRange GetImageSubresourceRange() const { - return {aspect_mask, base_level, num_levels, base_layer, num_layers}; - } + const Device* device = nullptr; + std::array<vk::ImageView, VideoCommon::NUM_IMAGE_VIEW_TYPES> image_views; + vk::ImageView depth_view; + vk::ImageView stencil_view; + vk::BufferView buffer_view; + VkImage image_handle = VK_NULL_HANDLE; + VkImageView render_target = VK_NULL_HANDLE; + PixelFormat image_format = PixelFormat::Invalid; + VkSampleCountFlagBits samples = VK_SAMPLE_COUNT_1_BIT; +}; - VkImageSubresourceLayers GetImageSubresourceLayers() const { - return {surface.GetAspectMask(), base_level, base_layer, num_layers}; - } +class ImageAlloc : public VideoCommon::ImageAllocBase {}; - void Transition(VkImageLayout new_layout, VkPipelineStageFlags new_stage_mask, - VkAccessFlags new_access) const { - surface.Transition(base_layer, num_layers, base_level, num_levels, new_stage_mask, - new_access, new_layout); - } +class Sampler { 
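
Image::UploadMemory above takes an already-filled staging map plus a span of copy regions, which fixes the caller-side flow: map, fill, copy. A sketch of that flow under the interface shown, where data and copies are assumed inputs:

    // Map staging memory, fill it with the decoded texel data, then
    // hand the buffer-to-image regions to the image.
    ImageBufferMap map = runtime.MapUploadBuffer(data.size());
    std::memcpy(map.Span().data(), data.data(), data.size());
    image.UploadMemory(map, /*buffer_offset=*/0, copies);
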
+public: + explicit Sampler(TextureCacheRuntime&, const Tegra::Texture::TSCEntry&); - void MarkAsModified(u64 tick) { - surface.MarkAsModified(true, tick); + [[nodiscard]] VkSampler Handle() const noexcept { + return *sampler; } private: - // Store a copy of these values to avoid double dereference when reading them - const SurfaceParams params; - const VkImage image; - const VkBufferView buffer_view; - const VkImageAspectFlags aspect_mask; - - const VKDevice& device; - CachedSurface& surface; - const u32 base_level; - const u32 num_levels; - const VkImageViewType image_view_type; - u32 base_layer = 0; - u32 num_layers = 0; - u32 base_slice = 0; - u32 num_slices = 0; - - VkImageView last_image_view = nullptr; - u32 last_swizzle = 0; - - vk::ImageView render_target; - std::unordered_map<u32, vk::ImageView> view_cache; + vk::Sampler sampler; }; -class VKTextureCache final : public TextureCacheBase { +class Framebuffer { public: - explicit VKTextureCache(VideoCore::RasterizerInterface& rasterizer, - Tegra::Engines::Maxwell3D& maxwell3d, Tegra::MemoryManager& gpu_memory, - const VKDevice& device, VKMemoryManager& memory_manager, - VKScheduler& scheduler, VKStagingBufferPool& staging_pool); - ~VKTextureCache(); + explicit Framebuffer(TextureCacheRuntime&, std::span<ImageView*, NUM_RT> color_buffers, + ImageView* depth_buffer, const VideoCommon::RenderTargets& key); -private: - Surface CreateSurface(GPUVAddr gpu_addr, const SurfaceParams& params) override; + [[nodiscard]] VkFramebuffer Handle() const noexcept { + return *framebuffer; + } - void ImageCopy(Surface& src_surface, Surface& dst_surface, - const VideoCommon::CopyParams& copy_params) override; + [[nodiscard]] VkRenderPass RenderPass() const noexcept { + return renderpass; + } - void ImageBlit(View& src_view, View& dst_view, - const Tegra::Engines::Fermi2D::Config& copy_config) override; + [[nodiscard]] VkExtent2D RenderArea() const noexcept { + return render_area; + } - void BufferCopy(Surface& src_surface, Surface& dst_surface) override; + [[nodiscard]] VkSampleCountFlagBits Samples() const noexcept { + return samples; + } - const VKDevice& device; - VKMemoryManager& memory_manager; - VKScheduler& scheduler; - VKStagingBufferPool& staging_pool; + [[nodiscard]] u32 NumColorBuffers() const noexcept { + return num_color_buffers; + } + + [[nodiscard]] u32 NumImages() const noexcept { + return num_images; + } + + [[nodiscard]] const std::array<VkImage, 9>& Images() const noexcept { + return images; + } + + [[nodiscard]] const std::array<VkImageSubresourceRange, 9>& ImageRanges() const noexcept { + return image_ranges; + } + +private: + vk::Framebuffer framebuffer; + VkRenderPass renderpass{}; + VkExtent2D render_area{}; + VkSampleCountFlagBits samples = VK_SAMPLE_COUNT_1_BIT; + u32 num_color_buffers = 0; + u32 num_images = 0; + std::array<VkImage, 9> images{}; + std::array<VkImageSubresourceRange, 9> image_ranges{}; }; +struct TextureCacheParams { + static constexpr bool ENABLE_VALIDATION = true; + static constexpr bool FRAMEBUFFER_BLITS = false; + static constexpr bool HAS_EMULATED_COPIES = false; + + using Runtime = Vulkan::TextureCacheRuntime; + using Image = Vulkan::Image; + using ImageAlloc = Vulkan::ImageAlloc; + using ImageView = Vulkan::ImageView; + using Sampler = Vulkan::Sampler; + using Framebuffer = Vulkan::Framebuffer; +}; + +using TextureCache = VideoCommon::TextureCache<TextureCacheParams>; + } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_update_descriptor.cpp 
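
TextureCacheParams at the end of this header is a policy struct: VideoCommon::TextureCache<P> pulls every backend-specific type and feature flag from its template parameter, so the Vulkan backend plugs into the shared cache with no virtual dispatch. A hypothetical second instantiation, purely to show the shape (the type names and flag values here are assumptions, not the real OpenGL backend's):

    struct ExampleGLParams {
        static constexpr bool ENABLE_VALIDATION = true;   // assumed values
        static constexpr bool FRAMEBUFFER_BLITS = true;
        static constexpr bool HAS_EMULATED_COPIES = true;

        using Runtime = OpenGL::TextureCacheRuntime;      // hypothetical types
        using Image = OpenGL::Image;
        using ImageAlloc = OpenGL::ImageAlloc;
        using ImageView = OpenGL::ImageView;
        using Sampler = OpenGL::Sampler;
        using Framebuffer = OpenGL::Framebuffer;
    };
    using ExampleGLTextureCache = VideoCommon::TextureCache<ExampleGLParams>;
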
b/src/video_core/renderer_vulkan/vk_update_descriptor.cpp index 351c048d2..f99273c6a 100644 --- a/src/video_core/renderer_vulkan/vk_update_descriptor.cpp +++ b/src/video_core/renderer_vulkan/vk_update_descriptor.cpp @@ -7,15 +7,15 @@ #include "common/assert.h" #include "common/logging/log.h" -#include "video_core/renderer_vulkan/vk_device.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_update_descriptor.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { -VKUpdateDescriptorQueue::VKUpdateDescriptorQueue(const VKDevice& device, VKScheduler& scheduler) - : device{device}, scheduler{scheduler} {} +VKUpdateDescriptorQueue::VKUpdateDescriptorQueue(const Device& device_, VKScheduler& scheduler_) + : device{device_}, scheduler{scheduler_} {} VKUpdateDescriptorQueue::~VKUpdateDescriptorQueue() = default; diff --git a/src/video_core/renderer_vulkan/vk_update_descriptor.h b/src/video_core/renderer_vulkan/vk_update_descriptor.h index 945320c72..e214f7195 100644 --- a/src/video_core/renderer_vulkan/vk_update_descriptor.h +++ b/src/video_core/renderer_vulkan/vk_update_descriptor.h @@ -8,11 +8,11 @@ #include <boost/container/static_vector.hpp> #include "common/common_types.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { -class VKDevice; +class Device; class VKScheduler; struct DescriptorUpdateEntry { @@ -31,7 +31,7 @@ struct DescriptorUpdateEntry { class VKUpdateDescriptorQueue final { public: - explicit VKUpdateDescriptorQueue(const VKDevice& device, VKScheduler& scheduler); + explicit VKUpdateDescriptorQueue(const Device& device_, VKScheduler& scheduler_); ~VKUpdateDescriptorQueue(); void TickFrame(); @@ -40,32 +40,36 @@ public: void Send(VkDescriptorUpdateTemplateKHR update_template, VkDescriptorSet set); - void AddSampledImage(VkSampler sampler, VkImageView image_view) { - payload.emplace_back(VkDescriptorImageInfo{sampler, image_view, {}}); + void AddSampledImage(VkImageView image_view, VkSampler sampler) { + payload.emplace_back(VkDescriptorImageInfo{ + .sampler = sampler, + .imageView = image_view, + .imageLayout = VK_IMAGE_LAYOUT_GENERAL, + }); } void AddImage(VkImageView image_view) { - payload.emplace_back(VkDescriptorImageInfo{{}, image_view, {}}); + payload.emplace_back(VkDescriptorImageInfo{ + .sampler = VK_NULL_HANDLE, + .imageView = image_view, + .imageLayout = VK_IMAGE_LAYOUT_GENERAL, + }); } - void AddBuffer(VkBuffer buffer, u64 offset, std::size_t size) { - payload.emplace_back(VkDescriptorBufferInfo{buffer, offset, size}); + void AddBuffer(VkBuffer buffer, u64 offset, size_t size) { + payload.emplace_back(VkDescriptorBufferInfo{ + .buffer = buffer, + .offset = offset, + .range = size, + }); } void AddTexelBuffer(VkBufferView texel_buffer) { payload.emplace_back(texel_buffer); } - VkImageLayout* LastImageLayout() { - return &payload.back().image.imageLayout; - } - - const VkImageLayout* LastImageLayout() const { - return &payload.back().image.imageLayout; - } - private: - const VKDevice& device; + const Device& device; VKScheduler& scheduler; const DescriptorUpdateEntry* upload_start = nullptr; diff --git a/src/video_core/sampler_cache.cpp b/src/video_core/sampler_cache.cpp deleted file mode 100644 index 53c7ef12d..000000000 --- a/src/video_core/sampler_cache.cpp +++ /dev/null @@ -1,21 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// 
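
Fixing imageLayout to VK_IMAGE_LAYOUT_GENERAL at enqueue time is what allows the LastImageLayout() accessors to be deleted in the same hunk: callers no longer patch payload.back().image.imageLayout after pushing an entry. Note that AddSampledImage also flips its argument order to image-view-first. Usage under the new interface (handles assumed to exist):

    update_descriptor_queue.AddSampledImage(image_view, sampler);
    update_descriptor_queue.AddBuffer(buffer, offset, size);
    update_descriptor_queue.AddTexelBuffer(texel_buffer);
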
Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include "common/cityhash.h" -#include "common/common_types.h" -#include "video_core/sampler_cache.h" - -namespace VideoCommon { - -std::size_t SamplerCacheKey::Hash() const { - static_assert(sizeof(raw) % sizeof(u64) == 0); - return static_cast<std::size_t>( - Common::CityHash64(reinterpret_cast<const char*>(raw.data()), sizeof(raw) / sizeof(u64))); -} - -bool SamplerCacheKey::operator==(const SamplerCacheKey& rhs) const { - return raw == rhs.raw; -} - -} // namespace VideoCommon diff --git a/src/video_core/sampler_cache.h b/src/video_core/sampler_cache.h deleted file mode 100644 index cbe3ad071..000000000 --- a/src/video_core/sampler_cache.h +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include <cstddef> -#include <unordered_map> - -#include "video_core/textures/texture.h" - -namespace VideoCommon { - -struct SamplerCacheKey final : public Tegra::Texture::TSCEntry { - std::size_t Hash() const; - - bool operator==(const SamplerCacheKey& rhs) const; - - bool operator!=(const SamplerCacheKey& rhs) const { - return !operator==(rhs); - } -}; - -} // namespace VideoCommon - -namespace std { - -template <> -struct hash<VideoCommon::SamplerCacheKey> { - std::size_t operator()(const VideoCommon::SamplerCacheKey& k) const noexcept { - return k.Hash(); - } -}; - -} // namespace std - -namespace VideoCommon { - -template <typename SamplerType, typename SamplerStorageType> -class SamplerCache { -public: - SamplerType GetSampler(const Tegra::Texture::TSCEntry& tsc) { - const auto [entry, is_cache_miss] = cache.try_emplace(SamplerCacheKey{tsc}); - auto& sampler = entry->second; - if (is_cache_miss) { - sampler = CreateSampler(tsc); - } - return ToSamplerType(sampler); - } - -protected: - virtual SamplerStorageType CreateSampler(const Tegra::Texture::TSCEntry& tsc) const = 0; - - virtual SamplerType ToSamplerType(const SamplerStorageType& sampler) const = 0; - -private: - std::unordered_map<SamplerCacheKey, SamplerStorageType> cache; -}; - -} // namespace VideoCommon
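
One detail worth flagging in the deleted hasher: CityHash64 takes a length in bytes, but the call passes sizeof(raw) / sizeof(u64), the element count, so it appears only a prefix of the key was hashed. That would be a hash-quality bug rather than a correctness bug, since operator== still resolved collisions. A corrected sketch, assuming the Common::CityHash64(const char*, std::size_t) signature used above:

    std::size_t SamplerCacheKey::Hash() const {
        static_assert(sizeof(raw) % sizeof(u64) == 0);
        // Pass the size in bytes, not the number of u64 words.
        return static_cast<std::size_t>(
            Common::CityHash64(reinterpret_cast<const char*>(raw.data()), sizeof(raw)));
    }
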
\ No newline at end of file diff --git a/src/video_core/shader/ast.cpp b/src/video_core/shader/ast.cpp index 3f96d9076..db11144c7 100644 --- a/src/video_core/shader/ast.cpp +++ b/src/video_core/shader/ast.cpp @@ -212,16 +212,15 @@ public: } void operator()(const ExprPredicate& expr) { - inner += "P" + std::to_string(expr.predicate); + inner += fmt::format("P{}", expr.predicate); } void operator()(const ExprCondCode& expr) { - u32 cc = static_cast<u32>(expr.cc); - inner += "CC" + std::to_string(cc); + inner += fmt::format("CC{}", expr.cc); } void operator()(const ExprVar& expr) { - inner += "V" + std::to_string(expr.var_index); + inner += fmt::format("V{}", expr.var_index); } void operator()(const ExprBoolean& expr) { @@ -229,7 +228,7 @@ public: } void operator()(const ExprGprEqual& expr) { - inner += "( gpr_" + std::to_string(expr.gpr) + " == " + std::to_string(expr.value) + ')'; + inner += fmt::format("(gpr_{} == {})", expr.gpr, expr.value); } const std::string& GetResult() const { @@ -374,8 +373,8 @@ std::string ASTManager::Print() const { return printer.GetResult(); } -ASTManager::ASTManager(bool full_decompile, bool disable_else_derivation) - : full_decompile{full_decompile}, disable_else_derivation{disable_else_derivation} {}; +ASTManager::ASTManager(bool do_full_decompile, bool disable_else_derivation_) + : full_decompile{do_full_decompile}, disable_else_derivation{disable_else_derivation_} {} ASTManager::~ASTManager() { Clear(); diff --git a/src/video_core/shader/ast.h b/src/video_core/shader/ast.h index 8e5a22ab3..dc49b369e 100644 --- a/src/video_core/shader/ast.h +++ b/src/video_core/shader/ast.h @@ -76,7 +76,7 @@ public: class ASTIfThen { public: - explicit ASTIfThen(Expr condition) : condition{std::move(condition)} {} + explicit ASTIfThen(Expr condition_) : condition{std::move(condition_)} {} Expr condition; ASTZipper nodes{}; }; @@ -88,63 +88,68 @@ public: class ASTBlockEncoded { public: - explicit ASTBlockEncoded(u32 start, u32 end) : start{start}, end{end} {} + explicit ASTBlockEncoded(u32 start_, u32 _) : start{start_}, end{_} {} u32 start; u32 end; }; class ASTBlockDecoded { public: - explicit ASTBlockDecoded(NodeBlock&& new_nodes) : nodes(std::move(new_nodes)) {} + explicit ASTBlockDecoded(NodeBlock&& new_nodes_) : nodes(std::move(new_nodes_)) {} NodeBlock nodes; }; class ASTVarSet { public: - explicit ASTVarSet(u32 index, Expr condition) : index{index}, condition{std::move(condition)} {} + explicit ASTVarSet(u32 index_, Expr condition_) + : index{index_}, condition{std::move(condition_)} {} + u32 index; Expr condition; }; class ASTLabel { public: - explicit ASTLabel(u32 index) : index{index} {} + explicit ASTLabel(u32 index_) : index{index_} {} u32 index; bool unused{}; }; class ASTGoto { public: - explicit ASTGoto(Expr condition, u32 label) : condition{std::move(condition)}, label{label} {} + explicit ASTGoto(Expr condition_, u32 label_) + : condition{std::move(condition_)}, label{label_} {} + Expr condition; u32 label; }; class ASTDoWhile { public: - explicit ASTDoWhile(Expr condition) : condition{std::move(condition)} {} + explicit ASTDoWhile(Expr condition_) : condition{std::move(condition_)} {} Expr condition; ASTZipper nodes{}; }; class ASTReturn { public: - explicit ASTReturn(Expr condition, bool kills) - : condition{std::move(condition)}, kills{kills} {} + explicit ASTReturn(Expr condition_, bool kills_) + : condition{std::move(condition_)}, kills{kills_} {} + Expr condition; bool kills; }; class ASTBreak { public: - explicit ASTBreak(Expr condition) : 
condition{std::move(condition)} {} + explicit ASTBreak(Expr condition_) : condition{std::move(condition_)} {} Expr condition; }; class ASTBase { public: - explicit ASTBase(ASTNode parent, ASTData data) - : data{std::move(data)}, parent{std::move(parent)} {} + explicit ASTBase(ASTNode parent_, ASTData data_) + : data{std::move(data_)}, parent{std::move(parent_)} {} template <class U, class... Args> static ASTNode Make(ASTNode parent, Args&&... args) { @@ -300,7 +305,7 @@ private: class ASTManager final { public: - ASTManager(bool full_decompile, bool disable_else_derivation); + explicit ASTManager(bool do_full_decompile, bool disable_else_derivation_); ~ASTManager(); ASTManager(const ASTManager& o) = delete; diff --git a/src/video_core/shader/async_shaders.cpp b/src/video_core/shader/async_shaders.cpp index aabd62c5c..9707136e9 100644 --- a/src/video_core/shader/async_shaders.cpp +++ b/src/video_core/shader/async_shaders.cpp @@ -13,21 +13,22 @@ namespace VideoCommon::Shader { -AsyncShaders::AsyncShaders(Core::Frontend::EmuWindow& emu_window) : emu_window(emu_window) {} +AsyncShaders::AsyncShaders(Core::Frontend::EmuWindow& emu_window_) : emu_window(emu_window_) {} AsyncShaders::~AsyncShaders() { KillWorkers(); } void AsyncShaders::AllocateWorkers() { - // Max worker threads we should allow - constexpr u32 MAX_THREADS = 4; - // Deduce how many threads we can use - const u32 threads_used = std::thread::hardware_concurrency() / 4; - // Always allow at least 1 thread regardless of our settings - const auto max_worker_count = std::max(1U, threads_used); - // Don't use more than MAX_THREADS - const auto num_workers = std::min(max_worker_count, MAX_THREADS); + // Use at least one thread + u32 num_workers = 1; + + // Deduce how many more threads we can use + const u32 thread_count = std::thread::hardware_concurrency(); + if (thread_count >= 8) { + // Increase async workers by 1 for every 2 threads >= 8 + num_workers += 1 + (thread_count - 8) / 2; + } // If we already have workers queued, ignore if (num_workers == worker_threads.size()) { @@ -42,8 +43,8 @@ void AsyncShaders::AllocateWorkers() { // Create workers for (std::size_t i = 0; i < num_workers; i++) { context_list.push_back(emu_window.CreateSharedContext()); - worker_threads.push_back( - std::thread(&AsyncShaders::ShaderCompilerThread, this, context_list[i].get())); + worker_threads.emplace_back(&AsyncShaders::ShaderCompilerThread, this, + context_list[i].get()); } } @@ -105,8 +106,7 @@ std::vector<AsyncShaders::Result> AsyncShaders::GetCompletedWork() { std::vector<Result> results; { std::unique_lock lock{completed_mutex}; - results.assign(std::make_move_iterator(finished_work.begin()), - std::make_move_iterator(finished_work.end())); + results = std::move(finished_work); finished_work.clear(); } return results; @@ -115,11 +115,10 @@ std::vector<AsyncShaders::Result> AsyncShaders::GetCompletedWork() { void AsyncShaders::QueueOpenGLShader(const OpenGL::Device& device, Tegra::Engines::ShaderType shader_type, u64 uid, std::vector<u64> code, std::vector<u64> code_b, - u32 main_offset, - VideoCommon::Shader::CompilerSettings compiler_settings, - const VideoCommon::Shader::Registry& registry, - VAddr cpu_addr) { - WorkerParams params{ + u32 main_offset, CompilerSettings compiler_settings, + const Registry& registry, VAddr cpu_addr) { + std::unique_lock lock(queue_mutex); + pending_queue.push({ .backend = device.UseAssemblyShaders() ? 
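
The new AllocateWorkers heuristic replaces the old hardware_concurrency() / 4 rule (capped at 4) with an uncapped ramp that only engages at 8 or more hardware threads. Isolated into a free function for illustration:

    #include <thread>

    // One worker always; a second at 8 hardware threads, then one more
    // for every two threads beyond that.
    unsigned NumShaderWorkers() {
        unsigned num_workers = 1;
        const unsigned threads = std::thread::hardware_concurrency();
        if (threads >= 8) {
            num_workers += 1 + (threads - 8) / 2;
        }
        return num_workers; // 4 threads -> 1, 8 -> 2, 12 -> 4, 16 -> 6
    }
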
Backend::GLASM : Backend::OpenGL, .device = &device, .shader_type = shader_type, @@ -130,35 +129,30 @@ void AsyncShaders::QueueOpenGLShader(const OpenGL::Device& device, .compiler_settings = compiler_settings, .registry = registry, .cpu_address = cpu_addr, - }; - std::unique_lock lock(queue_mutex); - pending_queue.push(std::move(params)); + }); cv.notify_one(); } void AsyncShaders::QueueVulkanShader(Vulkan::VKPipelineCache* pp_cache, - const Vulkan::VKDevice& device, Vulkan::VKScheduler& scheduler, + const Vulkan::Device& device, Vulkan::VKScheduler& scheduler, Vulkan::VKDescriptorPool& descriptor_pool, Vulkan::VKUpdateDescriptorQueue& update_descriptor_queue, - Vulkan::VKRenderPassCache& renderpass_cache, std::vector<VkDescriptorSetLayoutBinding> bindings, Vulkan::SPIRVProgram program, - Vulkan::GraphicsPipelineCacheKey key) { - WorkerParams params{ + Vulkan::GraphicsPipelineCacheKey key, u32 num_color_buffers) { + std::unique_lock lock(queue_mutex); + pending_queue.push({ .backend = Backend::Vulkan, .pp_cache = pp_cache, .vk_device = &device, .scheduler = &scheduler, .descriptor_pool = &descriptor_pool, .update_descriptor_queue = &update_descriptor_queue, - .renderpass_cache = &renderpass_cache, - .bindings = bindings, - .program = program, + .bindings = std::move(bindings), + .program = std::move(program), .key = key, - }; - - std::unique_lock lock(queue_mutex); - pending_queue.push(std::move(params)); + .num_color_buffers = num_color_buffers, + }); cv.notify_one(); } @@ -210,8 +204,8 @@ void AsyncShaders::ShaderCompilerThread(Core::Frontend::GraphicsContext* context } else if (work.backend == Backend::Vulkan) { auto pipeline = std::make_unique<Vulkan::VKGraphicsPipeline>( *work.vk_device, *work.scheduler, *work.descriptor_pool, - *work.update_descriptor_queue, *work.renderpass_cache, work.key, work.bindings, - work.program); + *work.update_descriptor_queue, work.key, work.bindings, work.program, + work.num_color_buffers); work.pp_cache->EmplacePipeline(std::move(pipeline)); } diff --git a/src/video_core/shader/async_shaders.h b/src/video_core/shader/async_shaders.h index 7a99e1dc5..0dbb1a31f 100644 --- a/src/video_core/shader/async_shaders.h +++ b/src/video_core/shader/async_shaders.h @@ -24,9 +24,9 @@ #include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_shader_decompiler.h" -#include "video_core/renderer_vulkan/vk_device.h" #include "video_core/renderer_vulkan/vk_pipeline_cache.h" #include "video_core/renderer_vulkan/vk_scheduler.h" +#include "video_core/vulkan_common/vulkan_device.h" namespace Core::Frontend { class EmuWindow; @@ -66,7 +66,7 @@ public: Tegra::Engines::ShaderType shader_type; }; - explicit AsyncShaders(Core::Frontend::EmuWindow& emu_window); + explicit AsyncShaders(Core::Frontend::EmuWindow& emu_window_); ~AsyncShaders(); /// Start up shader worker threads @@ -94,13 +94,13 @@ public: CompilerSettings compiler_settings, const Registry& registry, VAddr cpu_addr); - void QueueVulkanShader(Vulkan::VKPipelineCache* pp_cache, const Vulkan::VKDevice& device, + void QueueVulkanShader(Vulkan::VKPipelineCache* pp_cache, const Vulkan::Device& device, Vulkan::VKScheduler& scheduler, Vulkan::VKDescriptorPool& descriptor_pool, Vulkan::VKUpdateDescriptorQueue& update_descriptor_queue, - Vulkan::VKRenderPassCache& renderpass_cache, std::vector<VkDescriptorSetLayoutBinding> bindings, - Vulkan::SPIRVProgram program, Vulkan::GraphicsPipelineCacheKey key); + Vulkan::SPIRVProgram program, 
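
Both Queue*Shader paths now construct the work item in place inside push() under the queue lock, and the Vulkan path moves its heavy members (bindings, program) instead of copying them. The skeleton of the pattern, with Work as a stand-in element type:

    #include <condition_variable>
    #include <mutex>
    #include <queue>
    #include <utility>

    std::mutex queue_mutex;
    std::condition_variable cv;
    std::queue<Work> pending_queue; // Work stands in for WorkerParams

    void Enqueue(Work work) {
        {
            std::unique_lock lock{queue_mutex};
            pending_queue.push(std::move(work));
        }
        cv.notify_one(); // wake one compiler thread
    }

The diff notifies while still holding the lock, which is equally correct; releasing first merely spares the woken worker an immediate block on the mutex.
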
Vulkan::GraphicsPipelineCacheKey key, + u32 num_color_buffers); private: void ShaderCompilerThread(Core::Frontend::GraphicsContext* context); @@ -123,14 +123,14 @@ private: // For Vulkan Vulkan::VKPipelineCache* pp_cache; - const Vulkan::VKDevice* vk_device; + const Vulkan::Device* vk_device; Vulkan::VKScheduler* scheduler; Vulkan::VKDescriptorPool* descriptor_pool; Vulkan::VKUpdateDescriptorQueue* update_descriptor_queue; - Vulkan::VKRenderPassCache* renderpass_cache; std::vector<VkDescriptorSetLayoutBinding> bindings; Vulkan::SPIRVProgram program; Vulkan::GraphicsPipelineCacheKey key; + u32 num_color_buffers; }; std::condition_variable cv; diff --git a/src/video_core/shader/control_flow.cpp b/src/video_core/shader/control_flow.cpp index 4c8971615..43d965f2f 100644 --- a/src/video_core/shader/control_flow.cpp +++ b/src/video_core/shader/control_flow.cpp @@ -66,8 +66,8 @@ struct BlockInfo { }; struct CFGRebuildState { - explicit CFGRebuildState(const ProgramCode& program_code, u32 start, Registry& registry) - : program_code{program_code}, registry{registry}, start{start} {} + explicit CFGRebuildState(const ProgramCode& program_code_, u32 start_, Registry& registry_) + : program_code{program_code_}, registry{registry_}, start{start_} {} const ProgramCode& program_code; Registry& registry; @@ -241,10 +241,10 @@ std::pair<ParseResult, ParseInfo> ParseCode(CFGRebuildState& state, u32 address) ParseInfo parse_info{}; SingleBranch single_branch{}; - const auto insert_label = [](CFGRebuildState& state, u32 address) { - const auto pair = state.labels.emplace(address); + const auto insert_label = [](CFGRebuildState& rebuild_state, u32 label_address) { + const auto pair = rebuild_state.labels.emplace(label_address); if (pair.second) { - state.inspect_queries.push_back(address); + rebuild_state.inspect_queries.push_back(label_address); } }; @@ -257,7 +257,7 @@ std::pair<ParseResult, ParseInfo> ParseCode(CFGRebuildState& state, u32 address) single_branch.ignore = false; break; } - if (state.registered.count(offset) != 0) { + if (state.registered.contains(offset)) { single_branch.address = offset; single_branch.ignore = true; break; @@ -632,12 +632,12 @@ void DecompileShader(CFGRebuildState& state) { for (auto label : state.labels) { state.manager->DeclareLabel(label); } - for (auto& block : state.block_info) { - if (state.labels.count(block.start) != 0) { + for (const auto& block : state.block_info) { + if (state.labels.contains(block.start)) { state.manager->InsertLabel(block.start); } const bool ignore = BlockBranchIsIgnored(block.branch); - u32 end = ignore ? block.end + 1 : block.end; + const u32 end = ignore ? 
block.end + 1 : block.end; state.manager->InsertBlock(block.start, end); if (!ignore) { InsertBranch(*state.manager, block.branch); @@ -737,7 +737,7 @@ std::unique_ptr<ShaderCharacteristics> ScanFlow(const ProgramCode& program_code, auto back = result_out->blocks.begin(); auto next = std::next(back); while (next != result_out->blocks.end()) { - if (state.labels.count(next->start) == 0 && next->start == back->end + 1) { + if (!state.labels.contains(next->start) && next->start == back->end + 1) { back->end = next->end; next = result_out->blocks.erase(next); continue; diff --git a/src/video_core/shader/control_flow.h b/src/video_core/shader/control_flow.h index 62a3510d8..37bf96492 100644 --- a/src/video_core/shader/control_flow.h +++ b/src/video_core/shader/control_flow.h @@ -42,10 +42,10 @@ struct Condition { class SingleBranch { public: SingleBranch() = default; - SingleBranch(Condition condition, s32 address, bool kill, bool is_sync, bool is_brk, - bool ignore) - : condition{condition}, address{address}, kill{kill}, is_sync{is_sync}, is_brk{is_brk}, - ignore{ignore} {} + explicit SingleBranch(Condition condition_, s32 address_, bool kill_, bool is_sync_, + bool is_brk_, bool ignore_) + : condition{condition_}, address{address_}, kill{kill_}, is_sync{is_sync_}, is_brk{is_brk_}, + ignore{ignore_} {} bool operator==(const SingleBranch& b) const { return std::tie(condition, address, kill, is_sync, is_brk, ignore) == @@ -65,15 +65,15 @@ public: }; struct CaseBranch { - CaseBranch(u32 cmp_value, u32 address) : cmp_value{cmp_value}, address{address} {} + explicit CaseBranch(u32 cmp_value_, u32 address_) : cmp_value{cmp_value_}, address{address_} {} u32 cmp_value; u32 address; }; class MultiBranch { public: - MultiBranch(u32 gpr, std::vector<CaseBranch>&& branches) - : gpr{gpr}, branches{std::move(branches)} {} + explicit MultiBranch(u32 gpr_, std::vector<CaseBranch>&& branches_) + : gpr{gpr_}, branches{std::move(branches_)} {} u32 gpr{}; std::vector<CaseBranch> branches{}; diff --git a/src/video_core/shader/decode.cpp b/src/video_core/shader/decode.cpp index eeac328a6..6576d1208 100644 --- a/src/video_core/shader/decode.cpp +++ b/src/video_core/shader/decode.cpp @@ -25,7 +25,7 @@ using Tegra::Shader::OpCode; namespace { void DeduceTextureHandlerSize(VideoCore::GuestDriverProfile& gpu_driver, - const std::list<Sampler>& used_samplers) { + const std::list<SamplerEntry>& used_samplers) { if (gpu_driver.IsTextureHandlerSizeKnown() || used_samplers.size() <= 1) { return; } @@ -43,9 +43,9 @@ void DeduceTextureHandlerSize(VideoCore::GuestDriverProfile& gpu_driver, } } -std::optional<u32> TryDeduceSamplerSize(const Sampler& sampler_to_deduce, +std::optional<u32> TryDeduceSamplerSize(const SamplerEntry& sampler_to_deduce, VideoCore::GuestDriverProfile& gpu_driver, - const std::list<Sampler>& used_samplers) { + const std::list<SamplerEntry>& used_samplers) { const u32 base_offset = sampler_to_deduce.offset; u32 max_offset{std::numeric_limits<u32>::max()}; for (const auto& sampler : used_samplers) { @@ -66,7 +66,7 @@ std::optional<u32> TryDeduceSamplerSize(const Sampler& sampler_to_deduce, class ASTDecoder { public: - ASTDecoder(ShaderIR& ir) : ir(ir) {} + explicit ASTDecoder(ShaderIR& ir_) : ir(ir_) {} void operator()(ASTProgram& ast) { ASTNode current = ast.nodes.GetFirst(); @@ -153,8 +153,8 @@ void ShaderIR::Decode() { const auto& blocks = shader_info.blocks; NodeBlock current_block; u32 current_label = static_cast<u32>(exit_branch); - for (auto& block : blocks) { - if 
(shader_info.labels.count(block.start) != 0) { + for (const auto& block : blocks) { + if (shader_info.labels.contains(block.start)) { insert_block(current_block, current_label); current_block.clear(); current_label = block.start; diff --git a/src/video_core/shader/decode/arithmetic.cpp b/src/video_core/shader/decode/arithmetic.cpp index 4db329fa5..15eb700e7 100644 --- a/src/video_core/shader/decode/arithmetic.cpp +++ b/src/video_core/shader/decode/arithmetic.cpp @@ -110,8 +110,7 @@ u32 ShaderIR::DecodeArithmetic(NodeBlock& bb, u32 pc) { case SubOp::Sqrt: return Operation(OperationCode::FSqrt, PRECISE, op_a); default: - UNIMPLEMENTED_MSG("Unhandled MUFU sub op={0:x}", - static_cast<unsigned>(instr.sub_op.Value())); + UNIMPLEMENTED_MSG("Unhandled MUFU sub op={0:x}", instr.sub_op.Value()); return Immediate(0); } }(); @@ -137,7 +136,8 @@ u32 ShaderIR::DecodeArithmetic(NodeBlock& bb, u32 pc) { break; } case OpCode::Id::FCMP_RR: - case OpCode::Id::FCMP_RC: { + case OpCode::Id::FCMP_RC: + case OpCode::Id::FCMP_IMMR: { UNIMPLEMENTED_IF(instr.fcmp.ftz == 0); Node op_c = GetRegister(instr.gpr39); Node comp = GetPredicateComparisonFloat(instr.fcmp.cond, std::move(op_c), Immediate(0.0f)); diff --git a/src/video_core/shader/decode/arithmetic_integer.cpp b/src/video_core/shader/decode/arithmetic_integer.cpp index 73155966f..7b5bb7003 100644 --- a/src/video_core/shader/decode/arithmetic_integer.cpp +++ b/src/video_core/shader/decode/arithmetic_integer.cpp @@ -83,7 +83,7 @@ u32 ShaderIR::DecodeArithmeticInteger(NodeBlock& bb, u32 pc) { case IAdd3Height::UpperHalfWord: return BitfieldExtract(value, 16, 16); default: - UNIMPLEMENTED_MSG("Unhandled IADD3 height: {}", static_cast<u32>(height)); + UNIMPLEMENTED_MSG("Unhandled IADD3 height: {}", height); return Immediate(0); } }; @@ -258,7 +258,7 @@ u32 ShaderIR::DecodeArithmeticInteger(NodeBlock& bb, u32 pc) { case OpCode::Id::LEA_IMM: case OpCode::Id::LEA_RZ: case OpCode::Id::LEA_HI: { - auto [op_a, op_b, op_c] = [&]() -> std::tuple<Node, Node, Node> { + auto [op_a_, op_b_, op_c_] = [&]() -> std::tuple<Node, Node, Node> { switch (opcode->get().GetId()) { case OpCode::Id::LEA_R2: { return {GetRegister(instr.gpr20), GetRegister(instr.gpr39), @@ -294,8 +294,9 @@ u32 ShaderIR::DecodeArithmeticInteger(NodeBlock& bb, u32 pc) { UNIMPLEMENTED_IF_MSG(instr.lea.pred48 != static_cast<u64>(Pred::UnusedIndex), "Unhandled LEA Predicate"); - Node value = Operation(OperationCode::ILogicalShiftLeft, std::move(op_a), std::move(op_c)); - value = Operation(OperationCode::IAdd, std::move(op_b), std::move(value)); + Node value = + Operation(OperationCode::ILogicalShiftLeft, std::move(op_a_), std::move(op_c_)); + value = Operation(OperationCode::IAdd, std::move(op_b_), std::move(value)); SetRegister(bb, instr.gpr0, std::move(value)); break; diff --git a/src/video_core/shader/decode/arithmetic_integer_immediate.cpp b/src/video_core/shader/decode/arithmetic_integer_immediate.cpp index 73880db0e..73580277a 100644 --- a/src/video_core/shader/decode/arithmetic_integer_immediate.cpp +++ b/src/video_core/shader/decode/arithmetic_integer_immediate.cpp @@ -28,23 +28,26 @@ u32 ShaderIR::DecodeArithmeticIntegerImmediate(NodeBlock& bb, u32 pc) { case OpCode::Id::IADD32I: { UNIMPLEMENTED_IF_MSG(instr.iadd32i.saturate, "IADD32I saturation is not implemented"); - op_a = GetOperandAbsNegInteger(op_a, false, instr.iadd32i.negate_a, true); + op_a = GetOperandAbsNegInteger(std::move(op_a), false, instr.iadd32i.negate_a != 0, true); - const Node value = Operation(OperationCode::IAdd, PRECISE, op_a, 
op_b); + Node value = Operation(OperationCode::IAdd, PRECISE, std::move(op_a), std::move(op_b)); - SetInternalFlagsFromInteger(bb, value, instr.op_32.generates_cc); - SetRegister(bb, instr.gpr0, value); + SetInternalFlagsFromInteger(bb, value, instr.op_32.generates_cc != 0); + SetRegister(bb, instr.gpr0, std::move(value)); break; } case OpCode::Id::LOP32I: { - if (instr.alu.lop32i.invert_a) - op_a = Operation(OperationCode::IBitwiseNot, NO_PRECISE, op_a); + if (instr.alu.lop32i.invert_a) { + op_a = Operation(OperationCode::IBitwiseNot, NO_PRECISE, std::move(op_a)); + } - if (instr.alu.lop32i.invert_b) - op_b = Operation(OperationCode::IBitwiseNot, NO_PRECISE, op_b); + if (instr.alu.lop32i.invert_b) { + op_b = Operation(OperationCode::IBitwiseNot, NO_PRECISE, std::move(op_b)); + } - WriteLogicOperation(bb, instr.gpr0, instr.alu.lop32i.operation, op_a, op_b, - PredicateResultMode::None, Pred::UnusedIndex, instr.op_32.generates_cc); + WriteLogicOperation(bb, instr.gpr0, instr.alu.lop32i.operation, std::move(op_a), + std::move(op_b), PredicateResultMode::None, Pred::UnusedIndex, + instr.op_32.generates_cc != 0); break; } default: @@ -58,18 +61,18 @@ u32 ShaderIR::DecodeArithmeticIntegerImmediate(NodeBlock& bb, u32 pc) { void ShaderIR::WriteLogicOperation(NodeBlock& bb, Register dest, LogicOperation logic_op, Node op_a, Node op_b, PredicateResultMode predicate_mode, Pred predicate, bool sets_cc) { - const Node result = [&]() { + Node result = [&] { switch (logic_op) { case LogicOperation::And: - return Operation(OperationCode::IBitwiseAnd, PRECISE, op_a, op_b); + return Operation(OperationCode::IBitwiseAnd, PRECISE, std::move(op_a), std::move(op_b)); case LogicOperation::Or: - return Operation(OperationCode::IBitwiseOr, PRECISE, op_a, op_b); + return Operation(OperationCode::IBitwiseOr, PRECISE, std::move(op_a), std::move(op_b)); case LogicOperation::Xor: - return Operation(OperationCode::IBitwiseXor, PRECISE, op_a, op_b); + return Operation(OperationCode::IBitwiseXor, PRECISE, std::move(op_a), std::move(op_b)); case LogicOperation::PassB: return op_b; default: - UNIMPLEMENTED_MSG("Unimplemented logic operation={}", static_cast<u32>(logic_op)); + UNIMPLEMENTED_MSG("Unimplemented logic operation={}", logic_op); return Immediate(0); } }(); @@ -84,13 +87,12 @@ void ShaderIR::WriteLogicOperation(NodeBlock& bb, Register dest, LogicOperation return; case PredicateResultMode::NotZero: { // Set the predicate to true if the result is not zero. 
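
WriteLogicOperation now initializes result through an immediately-invoked lambda and moves each Node (a shared_ptr) exactly once, trading reference-count churn for moves while keeping the variable initialized on every path. The idiom in isolation, with hypothetical names:

    enum class Op { Add, Mul, Other };

    int OpcodeCost(Op op) {
        // The lambda runs immediately; `cost` is const and always set.
        const int cost = [op] {
            switch (op) {
            case Op::Add:
                return 1;
            case Op::Mul:
                return 3;
            default:
                return 2;
            }
        }();
        return cost;
    }
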
- const Node compare = Operation(OperationCode::LogicalINotEqual, result, Immediate(0)); - SetPredicate(bb, static_cast<u64>(predicate), compare); + Node compare = Operation(OperationCode::LogicalINotEqual, std::move(result), Immediate(0)); + SetPredicate(bb, static_cast<u64>(predicate), std::move(compare)); break; } default: - UNIMPLEMENTED_MSG("Unimplemented predicate result mode: {}", - static_cast<u32>(predicate_mode)); + UNIMPLEMENTED_MSG("Unimplemented predicate result mode: {}", predicate_mode); } } diff --git a/src/video_core/shader/decode/conversion.cpp b/src/video_core/shader/decode/conversion.cpp index b9989c88c..fea7a54df 100644 --- a/src/video_core/shader/decode/conversion.cpp +++ b/src/video_core/shader/decode/conversion.cpp @@ -244,7 +244,7 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) { return Operation(OperationCode::FTrunc, value); default: UNIMPLEMENTED_MSG("Unimplemented F2F rounding mode {}", - static_cast<u32>(instr.conversion.f2f.rounding.Value())); + instr.conversion.f2f.rounding.Value()); return value; } }(); @@ -300,7 +300,7 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) { return Operation(OperationCode::FTrunc, PRECISE, value); default: UNIMPLEMENTED_MSG("Unimplemented F2I rounding mode {}", - static_cast<u32>(instr.conversion.f2i.rounding.Value())); + instr.conversion.f2i.rounding.Value()); return Immediate(0); } }(); diff --git a/src/video_core/shader/decode/half_set.cpp b/src/video_core/shader/decode/half_set.cpp index b2e88fa20..fa83108cd 100644 --- a/src/video_core/shader/decode/half_set.cpp +++ b/src/video_core/shader/decode/half_set.cpp @@ -22,13 +22,13 @@ u32 ShaderIR::DecodeHalfSet(NodeBlock& bb, u32 pc) { const Instruction instr = {program_code[pc]}; const auto opcode = OpCode::Decode(instr); - PredCondition cond; - bool bf; - bool ftz; - bool neg_a; - bool abs_a; - bool neg_b; - bool abs_b; + PredCondition cond{}; + bool bf = false; + bool ftz = false; + bool neg_a = false; + bool abs_a = false; + bool neg_b = false; + bool abs_b = false; switch (opcode->get().GetId()) { case OpCode::Id::HSET2_C: case OpCode::Id::HSET2_IMM: diff --git a/src/video_core/shader/decode/image.cpp b/src/video_core/shader/decode/image.cpp index 618d309d2..5470e8cf4 100644 --- a/src/video_core/shader/decode/image.cpp +++ b/src/video_core/shader/decode/image.cpp @@ -212,10 +212,10 @@ u32 GetComponentSize(TextureFormat format, std::size_t component) { return 0; case TextureFormat::R8G24: if (component == 0) { - return 8; + return 24; } if (component == 1) { - return 24; + return 8; } return 0; case TextureFormat::R8G8: @@ -358,9 +358,9 @@ u32 ShaderIR::DecodeImage(NodeBlock& bb, u32 pc) { instr.suldst.GetStoreDataLayout() != StoreType::Bits64); auto descriptor = [this, instr] { - std::optional<Tegra::Engines::SamplerDescriptor> descriptor; + std::optional<Tegra::Engines::SamplerDescriptor> sampler_descriptor; if (instr.suldst.is_immediate) { - descriptor = + sampler_descriptor = registry.ObtainBoundSampler(static_cast<u32>(instr.image.index.Value())); } else { const Node image_register = GetRegister(instr.gpr39); @@ -368,12 +368,12 @@ u32 ShaderIR::DecodeImage(NodeBlock& bb, u32 pc) { static_cast<s64>(global_code.size())); const auto buffer = std::get<1>(result); const auto offset = std::get<2>(result); - descriptor = registry.ObtainBindlessSampler(buffer, offset); + sampler_descriptor = registry.ObtainBindlessSampler(buffer, offset); } - if (!descriptor) { + if (!sampler_descriptor) { UNREACHABLE_MSG("Failed to obtain image descriptor"); } - return 
*descriptor; + return *sampler_descriptor; }(); const auto comp_mask = GetImageComponentMask(descriptor.format); @@ -497,11 +497,12 @@ u32 ShaderIR::DecodeImage(NodeBlock& bb, u32 pc) { return pc; } -Image& ShaderIR::GetImage(Tegra::Shader::Image image, Tegra::Shader::ImageType type) { +ImageEntry& ShaderIR::GetImage(Tegra::Shader::Image image, Tegra::Shader::ImageType type) { const auto offset = static_cast<u32>(image.index.Value()); - const auto it = std::find_if(std::begin(used_images), std::end(used_images), - [offset](const Image& entry) { return entry.offset == offset; }); + const auto it = + std::find_if(std::begin(used_images), std::end(used_images), + [offset](const ImageEntry& entry) { return entry.offset == offset; }); if (it != std::end(used_images)) { ASSERT(!it->is_bindless && it->type == type); return *it; @@ -511,7 +512,7 @@ Image& ShaderIR::GetImage(Tegra::Shader::Image image, Tegra::Shader::ImageType t return used_images.emplace_back(next_index, offset, type); } -Image& ShaderIR::GetBindlessImage(Tegra::Shader::Register reg, Tegra::Shader::ImageType type) { +ImageEntry& ShaderIR::GetBindlessImage(Tegra::Shader::Register reg, Tegra::Shader::ImageType type) { const Node image_register = GetRegister(reg); const auto result = TrackCbuf(image_register, global_code, static_cast<s64>(global_code.size())); @@ -520,7 +521,7 @@ Image& ShaderIR::GetBindlessImage(Tegra::Shader::Register reg, Tegra::Shader::Im const auto offset = std::get<2>(result); const auto it = std::find_if(std::begin(used_images), std::end(used_images), - [buffer, offset](const Image& entry) { + [buffer, offset](const ImageEntry& entry) { return entry.buffer == buffer && entry.offset == offset; }); if (it != std::end(used_images)) { diff --git a/src/video_core/shader/decode/memory.cpp b/src/video_core/shader/decode/memory.cpp index e2bba88dd..50f4e7d35 100644 --- a/src/video_core/shader/decode/memory.cpp +++ b/src/video_core/shader/decode/memory.cpp @@ -47,7 +47,7 @@ OperationCode GetAtomOperation(AtomicOp op) { case AtomicOp::Exch: return OperationCode::AtomicIExchange; default: - UNIMPLEMENTED_MSG("op={}", static_cast<int>(op)); + UNIMPLEMENTED_MSG("op={}", op); return OperationCode::AtomicIAdd; } } @@ -83,7 +83,7 @@ u32 GetMemorySize(Tegra::Shader::UniformType uniform_type) { case Tegra::Shader::UniformType::UnsignedQuad: return 128; default: - UNIMPLEMENTED_MSG("Unimplemented size={}!", static_cast<u32>(uniform_type)); + UNIMPLEMENTED_MSG("Unimplemented size={}!", uniform_type); return 32; } } @@ -175,12 +175,12 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { break; } default: - UNIMPLEMENTED_MSG("Unhandled type: {}", static_cast<unsigned>(instr.ld_c.type.Value())); + UNIMPLEMENTED_MSG("Unhandled type: {}", instr.ld_c.type.Value()); } break; } case OpCode::Id::LD_L: - LOG_DEBUG(HW_GPU, "LD_L cache management mode: {}", static_cast<u64>(instr.ld_l.unknown)); + LOG_DEBUG(HW_GPU, "LD_L cache management mode: {}", instr.ld_l.unknown); [[fallthrough]]; case OpCode::Id::LD_S: { const auto GetAddress = [&](s32 offset) { @@ -224,7 +224,7 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { } default: UNIMPLEMENTED_MSG("{} Unhandled type: {}", opcode->get().GetName(), - static_cast<u32>(instr.ldst_sl.type.Value())); + instr.ldst_sl.type.Value()); } break; } @@ -306,8 +306,7 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { break; } case OpCode::Id::ST_L: - LOG_DEBUG(HW_GPU, "ST_L cache management mode: {}", - static_cast<u64>(instr.st_l.cache_management.Value())); + LOG_DEBUG(HW_GPU, "ST_L cache 
management mode: {}", instr.st_l.cache_management.Value()); [[fallthrough]]; case OpCode::Id::ST_S: { const auto GetAddress = [&](s32 offset) { @@ -340,7 +339,7 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { } default: UNIMPLEMENTED_MSG("{} unhandled type: {}", opcode->get().GetName(), - static_cast<u32>(instr.ldst_sl.type.Value())); + instr.ldst_sl.type.Value()); } break; } @@ -387,7 +386,7 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { } case OpCode::Id::RED: { UNIMPLEMENTED_IF_MSG(instr.red.type != GlobalAtomicType::U32, "type={}", - static_cast<int>(instr.red.type.Value())); + instr.red.type.Value()); const auto [real_address, base_address, descriptor] = TrackGlobalMemory(bb, instr, true, true); if (!real_address || !base_address) { @@ -403,12 +402,12 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { UNIMPLEMENTED_IF_MSG(instr.atom.operation == AtomicOp::Inc || instr.atom.operation == AtomicOp::Dec || instr.atom.operation == AtomicOp::SafeAdd, - "operation={}", static_cast<int>(instr.atom.operation.Value())); + "operation={}", instr.atom.operation.Value()); UNIMPLEMENTED_IF_MSG(instr.atom.type == GlobalAtomicType::S64 || instr.atom.type == GlobalAtomicType::U64 || instr.atom.type == GlobalAtomicType::F16x2_FTZ_RN || instr.atom.type == GlobalAtomicType::F32_FTZ_RN, - "type={}", static_cast<int>(instr.atom.type.Value())); + "type={}", instr.atom.type.Value()); const auto [real_address, base_address, descriptor] = TrackGlobalMemory(bb, instr, true, true); @@ -428,10 +427,10 @@ u32 ShaderIR::DecodeMemory(NodeBlock& bb, u32 pc) { case OpCode::Id::ATOMS: { UNIMPLEMENTED_IF_MSG(instr.atoms.operation == AtomicOp::Inc || instr.atoms.operation == AtomicOp::Dec, - "operation={}", static_cast<int>(instr.atoms.operation.Value())); + "operation={}", instr.atoms.operation.Value()); UNIMPLEMENTED_IF_MSG(instr.atoms.type == AtomicType::S64 || instr.atoms.type == AtomicType::U64, - "type={}", static_cast<int>(instr.atoms.type.Value())); + "type={}", instr.atoms.type.Value()); const bool is_signed = instr.atoms.type == AtomicType::S32 || instr.atoms.type == AtomicType::S64; const s32 offset = instr.atoms.GetImmediateOffset(); diff --git a/src/video_core/shader/decode/other.cpp b/src/video_core/shader/decode/other.cpp index 29a7cfbfe..d3ea07aac 100644 --- a/src/video_core/shader/decode/other.cpp +++ b/src/video_core/shader/decode/other.cpp @@ -34,14 +34,13 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) { break; } case OpCode::Id::EXIT: { - const Tegra::Shader::ConditionCode cc = instr.flow_condition_code; - UNIMPLEMENTED_IF_MSG(cc != Tegra::Shader::ConditionCode::T, "EXIT condition code used: {}", - static_cast<u32>(cc)); + const ConditionCode cc = instr.flow_condition_code; + UNIMPLEMENTED_IF_MSG(cc != ConditionCode::T, "EXIT condition code used: {}", cc); switch (instr.flow.cond) { case Tegra::Shader::FlowCondition::Always: bb.push_back(Operation(OperationCode::Exit)); - if (instr.pred.pred_index == static_cast<u64>(Tegra::Shader::Pred::UnusedIndex)) { + if (instr.pred.pred_index == static_cast<u64>(Pred::UnusedIndex)) { // If this is an unconditional exit then just end processing here, // otherwise we have to account for the possibility of the condition // not being met, so continue processing the next instruction. 
@@ -56,17 +55,15 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) { break; default: - UNIMPLEMENTED_MSG("Unhandled flow condition: {}", - static_cast<u32>(instr.flow.cond.Value())); + UNIMPLEMENTED_MSG("Unhandled flow condition: {}", instr.flow.cond.Value()); } break; } case OpCode::Id::KIL: { UNIMPLEMENTED_IF(instr.flow.cond != Tegra::Shader::FlowCondition::Always); - const Tegra::Shader::ConditionCode cc = instr.flow_condition_code; - UNIMPLEMENTED_IF_MSG(cc != Tegra::Shader::ConditionCode::T, "KIL condition code used: {}", - static_cast<u32>(cc)); + const ConditionCode cc = instr.flow_condition_code; + UNIMPLEMENTED_IF_MSG(cc != ConditionCode::T, "KIL condition code used: {}", cc); bb.push_back(Operation(OperationCode::Discard)); break; @@ -90,11 +87,11 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) { UNIMPLEMENTED_MSG("S2R WscaleFactorZ is not implemented"); return Immediate(0U); case SystemVariable::Tid: { - Node value = Immediate(0); - value = BitfieldInsert(value, Operation(OperationCode::LocalInvocationIdX), 0, 9); - value = BitfieldInsert(value, Operation(OperationCode::LocalInvocationIdY), 16, 9); - value = BitfieldInsert(value, Operation(OperationCode::LocalInvocationIdZ), 26, 5); - return value; + Node val = Immediate(0); + val = BitfieldInsert(val, Operation(OperationCode::LocalInvocationIdX), 0, 9); + val = BitfieldInsert(val, Operation(OperationCode::LocalInvocationIdY), 16, 9); + val = BitfieldInsert(val, Operation(OperationCode::LocalInvocationIdZ), 26, 5); + return val; } case SystemVariable::TidX: return Operation(OperationCode::LocalInvocationIdX); @@ -130,8 +127,7 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) { return Immediate(0u); } default: - UNIMPLEMENTED_MSG("Unhandled system move: {}", - static_cast<u32>(instr.sys20.Value())); + UNIMPLEMENTED_MSG("Unhandled system move: {}", instr.sys20.Value()); return Immediate(0u); } }(); @@ -181,8 +177,8 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) { } const Node branch = Operation(OperationCode::BranchIndirect, operand); - const Tegra::Shader::ConditionCode cc = instr.flow_condition_code; - if (cc != Tegra::Shader::ConditionCode::T) { + const ConditionCode cc = instr.flow_condition_code; + if (cc != ConditionCode::T) { bb.push_back(Conditional(GetConditionCode(cc), {branch})); } else { bb.push_back(branch); @@ -218,9 +214,8 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) { break; } case OpCode::Id::SYNC: { - const Tegra::Shader::ConditionCode cc = instr.flow_condition_code; - UNIMPLEMENTED_IF_MSG(cc != Tegra::Shader::ConditionCode::T, "SYNC condition code used: {}", - static_cast<u32>(cc)); + const ConditionCode cc = instr.flow_condition_code; + UNIMPLEMENTED_IF_MSG(cc != ConditionCode::T, "SYNC condition code used: {}", cc); if (decompiled) { break; @@ -231,9 +226,8 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) { break; } case OpCode::Id::BRK: { - const Tegra::Shader::ConditionCode cc = instr.flow_condition_code; - UNIMPLEMENTED_IF_MSG(cc != Tegra::Shader::ConditionCode::T, "BRK condition code used: {}", - static_cast<u32>(cc)); + const ConditionCode cc = instr.flow_condition_code; + UNIMPLEMENTED_IF_MSG(cc != ConditionCode::T, "BRK condition code used: {}", cc); if (decompiled) { break; } @@ -306,7 +300,7 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) { case Tegra::Shader::MembarType::GL: return OperationCode::MemoryBarrierGlobal; default: - UNIMPLEMENTED_MSG("MEMBAR type={}", static_cast<int>(instr.membar.type.Value())); + UNIMPLEMENTED_MSG("MEMBAR type={}", instr.membar.type.Value()); 
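
The S2R Tid case above packs all three local-invocation coordinates into a single 32-bit register with BitfieldInsert: x in bits 0-8, y in bits 16-24, z in bits 26-30. The same layout as plain bit arithmetic (a sketch of what the emitted IR computes, not emulator code):

    u32 PackLocalInvocationId(u32 x, u32 y, u32 z) {
        u32 value = 0;
        value |= x & 0x1FF;          // 9 bits at offset 0
        value |= (y & 0x1FF) << 16;  // 9 bits at offset 16
        value |= (z & 0x1F) << 26;   // 5 bits at offset 26
        return value;
    }
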
return OperationCode::MemoryBarrierGlobal; } }(); diff --git a/src/video_core/shader/decode/shift.cpp b/src/video_core/shader/decode/shift.cpp index d4ffa8014..a53819c15 100644 --- a/src/video_core/shader/decode/shift.cpp +++ b/src/video_core/shader/decode/shift.cpp @@ -125,7 +125,7 @@ u32 ShaderIR::DecodeShift(NodeBlock& bb, u32 pc) { case OpCode::Id::SHF_LEFT_IMM: { UNIMPLEMENTED_IF(instr.generates_cc); UNIMPLEMENTED_IF_MSG(instr.shf.xmode != ShfXmode::None, "xmode={}", - static_cast<int>(instr.shf.xmode.Value())); + instr.shf.xmode.Value()); if (instr.is_b_imm) { op_b = Immediate(static_cast<u32>(instr.shf.immediate)); diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp index a03b50e39..833fa2a39 100644 --- a/src/video_core/shader/decode/texture.cpp +++ b/src/video_core/shader/decode/texture.cpp @@ -34,7 +34,7 @@ static std::size_t GetCoordCount(TextureType texture_type) { case TextureType::TextureCube: return 3; default: - UNIMPLEMENTED_MSG("Unhandled texture type: {}", static_cast<u32>(texture_type)); + UNIMPLEMENTED_MSG("Unhandled texture type: {}", texture_type); return 0; } } @@ -141,7 +141,7 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { SamplerInfo info; info.is_shadow = is_depth_compare; - const std::optional<Sampler> sampler = GetSampler(instr.sampler, info); + const std::optional<SamplerEntry> sampler = GetSampler(instr.sampler, info); Node4 values; for (u32 element = 0; element < values.size(); ++element) { @@ -173,9 +173,9 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { SamplerInfo info; info.type = texture_type; info.is_array = is_array; - const std::optional<Sampler> sampler = is_bindless - ? GetBindlessSampler(base_reg, info, index_var) - : GetSampler(instr.sampler, info); + const std::optional<SamplerEntry> sampler = + is_bindless ? GetBindlessSampler(base_reg, info, index_var) + : GetSampler(instr.sampler, info); Node4 values; if (!sampler) { std::generate(values.begin(), values.end(), [this] { return Immediate(0); }); @@ -217,9 +217,9 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { [[fallthrough]]; case OpCode::Id::TXQ: { Node index_var; - const std::optional<Sampler> sampler = is_bindless - ? GetBindlessSampler(instr.gpr8, {}, index_var) - : GetSampler(instr.sampler, {}); + const std::optional<SamplerEntry> sampler = + is_bindless ? GetBindlessSampler(instr.gpr8, {}, index_var) + : GetSampler(instr.sampler, {}); if (!sampler) { u32 indexer = 0; @@ -255,8 +255,7 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { break; } default: - UNIMPLEMENTED_MSG("Unhandled texture query type: {}", - static_cast<u32>(instr.txq.query_type.Value())); + UNIMPLEMENTED_MSG("Unhandled texture query type: {}", instr.txq.query_type.Value()); } break; } @@ -273,7 +272,7 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { info.type = texture_type; info.is_array = is_array; Node index_var; - const std::optional<Sampler> sampler = + const std::optional<SamplerEntry> sampler = is_bindless ? GetBindlessSampler(instr.gpr20, info, index_var) : GetSampler(instr.sampler, info); @@ -292,33 +291,36 @@ u32 ShaderIR::DecodeTexture(NodeBlock& bb, u32 pc) { break; } - std::vector<Node> coords; - - // TODO: Add coordinates for different samplers once other texture types are implemented. 
- switch (texture_type) { - case TextureType::Texture1D: - coords.push_back(GetRegister(instr.gpr8)); - break; - case TextureType::Texture2D: - coords.push_back(GetRegister(instr.gpr8.Value() + 0)); - coords.push_back(GetRegister(instr.gpr8.Value() + 1)); - break; - default: - UNIMPLEMENTED_MSG("Unhandled texture type {}", static_cast<int>(texture_type)); + const u64 base_index = is_array ? 1 : 0; + const u64 num_components = [texture_type] { + switch (texture_type) { + case TextureType::Texture1D: + return 1; + case TextureType::Texture2D: + return 2; + case TextureType::TextureCube: + return 3; + default: + UNIMPLEMENTED_MSG("Unhandled texture type {}", texture_type); + return 2; + } + }(); + // TODO: What's the array component used for? - // Fallback to interpreting as a 2D texture for now - coords.push_back(GetRegister(instr.gpr8.Value() + 0)); - coords.push_back(GetRegister(instr.gpr8.Value() + 1)); + std::vector<Node> coords; + coords.reserve(num_components); + for (u64 component = 0; component < num_components; ++component) { + coords.push_back(GetRegister(instr.gpr8.Value() + base_index + component)); } + u32 indexer = 0; for (u32 element = 0; element < 2; ++element) { if (!instr.tmml.IsComponentEnabled(element)) { continue; } - auto params = coords; MetaTexture meta{*sampler, {}, {}, {}, {}, {}, {}, {}, {}, element, index_var}; - const Node value = Operation(OperationCode::TextureQueryLod, meta, std::move(params)); - SetTemporary(bb, indexer++, value); + Node value = Operation(OperationCode::TextureQueryLod, meta, coords); + SetTemporary(bb, indexer++, std::move(value)); } for (u32 i = 0; i < indexer; ++i) { SetRegister(bb, instr.gpr0.Value() + i, GetTemporary(i)); @@ -377,14 +379,15 @@ ShaderIR::SamplerInfo ShaderIR::GetSamplerInfo( return info; } -std::optional<Sampler> ShaderIR::GetSampler(Tegra::Shader::Sampler sampler, - SamplerInfo sampler_info) { +std::optional<SamplerEntry> ShaderIR::GetSampler(Tegra::Shader::Sampler sampler, + SamplerInfo sampler_info) { const u32 offset = static_cast<u32>(sampler.index.Value()); const auto info = GetSamplerInfo(sampler_info, registry.ObtainBoundSampler(offset)); // If this sampler has already been used, return the existing mapping. - const auto it = std::find_if(used_samplers.begin(), used_samplers.end(), - [offset](const Sampler& entry) { return entry.offset == offset; }); + const auto it = + std::find_if(used_samplers.begin(), used_samplers.end(), + [offset](const SamplerEntry& entry) { return entry.offset == offset; }); if (it != used_samplers.end()) { ASSERT(!it->is_bindless && it->type == info.type && it->is_array == info.is_array && it->is_shadow == info.is_shadow && it->is_buffer == info.is_buffer); @@ -397,8 +400,8 @@ std::optional<Sampler> ShaderIR::GetSampler(Tegra::Shader::Sampler sampler, *info.is_shadow, *info.is_buffer, false); } -std::optional<Sampler> ShaderIR::GetBindlessSampler(Tegra::Shader::Register reg, SamplerInfo info, - Node& index_var) { +std::optional<SamplerEntry> ShaderIR::GetBindlessSampler(Tegra::Shader::Register reg, + SamplerInfo info, Node& index_var) { const Node sampler_register = GetRegister(reg); const auto [base_node, tracked_sampler_info] = TrackBindlessSampler(sampler_register, global_code, static_cast<s64>(global_code.size())); @@ -414,7 +417,7 @@ std::optional<Sampler> ShaderIR::GetBindlessSampler(Tegra::Shader::Register reg, // If this sampler has already been used, return the existing mapping. 
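
The rewritten TMML path derives the coordinate count from the texture type and reads consecutive registers starting at gpr8, skipping one slot for the layer index when the texture is arrayed. As a worked example of the register layout it decodes (CoordRegister is a hypothetical helper, not in the diff):

    // For a 2D array lookup: layer in r8, s in r9, t in r10.
    u32 CoordRegister(u32 gpr8, bool is_array, u32 component) {
        const u32 base_index = is_array ? 1 : 0; // skip the array layer
        return gpr8 + base_index + component;
    }
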
const auto it = std::find_if(used_samplers.begin(), used_samplers.end(), - [buffer, offset](const Sampler& entry) { + [buffer, offset](const SamplerEntry& entry) { return entry.buffer == buffer && entry.offset == offset; }); if (it != used_samplers.end()) { @@ -434,11 +437,12 @@ std::optional<Sampler> ShaderIR::GetBindlessSampler(Tegra::Shader::Register reg, info = GetSamplerInfo(info, registry.ObtainSeparateSampler(indices, offsets)); // Try to use an already created sampler if it exists - const auto it = std::find_if( - used_samplers.begin(), used_samplers.end(), [indices, offsets](const Sampler& entry) { - return offsets == std::pair{entry.offset, entry.secondary_offset} && - indices == std::pair{entry.buffer, entry.secondary_buffer}; - }); + const auto it = + std::find_if(used_samplers.begin(), used_samplers.end(), + [indices, offsets](const SamplerEntry& entry) { + return offsets == std::pair{entry.offset, entry.secondary_offset} && + indices == std::pair{entry.buffer, entry.secondary_buffer}; + }); if (it != used_samplers.end()) { ASSERT(it->is_separated && it->type == info.type && it->is_array == info.is_array && it->is_shadow == info.is_shadow && it->is_buffer == info.is_buffer); @@ -458,7 +462,7 @@ std::optional<Sampler> ShaderIR::GetBindlessSampler(Tegra::Shader::Register reg, // If this sampler has already been used, return the existing mapping. const auto it = std::find_if( used_samplers.begin(), used_samplers.end(), - [base_offset](const Sampler& entry) { return entry.offset == base_offset; }); + [base_offset](const SamplerEntry& entry) { return entry.offset == base_offset; }); if (it != used_samplers.end()) { ASSERT(!it->is_bindless && it->type == info.type && it->is_array == info.is_array && it->is_shadow == info.is_shadow && it->is_buffer == info.is_buffer && @@ -553,7 +557,6 @@ Node4 ShaderIR::GetTextureCode(Instruction instr, TextureType texture_type, const bool is_shadow = depth_compare != nullptr; const bool is_bindless = bindless_reg.has_value(); - UNIMPLEMENTED_IF(texture_type == TextureType::TextureCube && is_array && is_shadow); ASSERT_MSG(texture_type != TextureType::Texture3D || !is_array || !is_shadow, "Illegal texture type"); @@ -564,9 +567,9 @@ Node4 ShaderIR::GetTextureCode(Instruction instr, TextureType texture_type, info.is_buffer = false; Node index_var; - const std::optional<Sampler> sampler = is_bindless - ? GetBindlessSampler(*bindless_reg, info, index_var) - : GetSampler(instr.sampler, info); + const std::optional<SamplerEntry> sampler = + is_bindless ? GetBindlessSampler(*bindless_reg, info, index_var) + : GetSampler(instr.sampler, info); if (!sampler) { return {Immediate(0), Immediate(0), Immediate(0), Immediate(0)}; } @@ -593,7 +596,7 @@ Node4 ShaderIR::GetTextureCode(Instruction instr, TextureType texture_type, lod = GetRegister(instr.gpr20.Value() + bias_offset); break; default: - UNIMPLEMENTED_MSG("Unimplemented process mode={}", static_cast<u32>(process_mode)); + UNIMPLEMENTED_MSG("Unimplemented process mode={}", process_mode); break; } @@ -723,7 +726,7 @@ Node4 ShaderIR::GetTld4Code(Instruction instr, TextureType texture_type, bool de info.is_shadow = depth_compare; Node index_var; - const std::optional<Sampler> sampler = + const std::optional<SamplerEntry> sampler = is_bindless ? GetBindlessSampler(parameter_register++, info, index_var) : GetSampler(instr.sampler, info); Node4 values; @@ -782,7 +785,7 @@ Node4 ShaderIR::GetTldCode(Tegra::Shader::Instruction instr) { // const Node aoffi_register{is_aoffi ? 
GetRegister(gpr20_cursor++) : nullptr}; // const Node multisample{is_multisample ? GetRegister(gpr20_cursor++) : nullptr}; - const std::optional<Sampler> sampler = GetSampler(instr.sampler, {}); + const std::optional<SamplerEntry> sampler = GetSampler(instr.sampler, {}); Node4 values; for (u32 element = 0; element < values.size(); ++element) { @@ -799,7 +802,7 @@ Node4 ShaderIR::GetTldsCode(Instruction instr, TextureType texture_type, bool is info.type = texture_type; info.is_array = is_array; info.is_shadow = false; - const std::optional<Sampler> sampler = GetSampler(instr.sampler, info); + const std::optional<SamplerEntry> sampler = GetSampler(instr.sampler, info); const std::size_t type_coord_count = GetCoordCount(texture_type); const bool lod_enabled = instr.tlds.GetTextureProcessMode() == TextureProcessMode::LL; diff --git a/src/video_core/shader/decode/warp.cpp b/src/video_core/shader/decode/warp.cpp index 11b77f795..37433d783 100644 --- a/src/video_core/shader/decode/warp.cpp +++ b/src/video_core/shader/decode/warp.cpp @@ -27,7 +27,7 @@ OperationCode GetOperationCode(VoteOperation vote_op) { case VoteOperation::Eq: return OperationCode::VoteEqual; default: - UNREACHABLE_MSG("Invalid vote operation={}", static_cast<u64>(vote_op)); + UNREACHABLE_MSG("Invalid vote operation={}", vote_op); return OperationCode::VoteAll; } } diff --git a/src/video_core/shader/expr.h b/src/video_core/shader/expr.h index 4e8264367..cda284c72 100644 --- a/src/video_core/shader/expr.h +++ b/src/video_core/shader/expr.h @@ -76,7 +76,7 @@ public: class ExprPredicate final { public: - explicit ExprPredicate(u32 predicate) : predicate{predicate} {} + explicit ExprPredicate(u32 predicate_) : predicate{predicate_} {} bool operator==(const ExprPredicate& b) const { return predicate == b.predicate; @@ -91,7 +91,7 @@ public: class ExprCondCode final { public: - explicit ExprCondCode(ConditionCode cc) : cc{cc} {} + explicit ExprCondCode(ConditionCode condition_code) : cc{condition_code} {} bool operator==(const ExprCondCode& b) const { return cc == b.cc; @@ -121,7 +121,7 @@ public: class ExprGprEqual final { public: - ExprGprEqual(u32 gpr, u32 value) : gpr{gpr}, value{value} {} + explicit ExprGprEqual(u32 gpr_, u32 value_) : gpr{gpr_}, value{value_} {} bool operator==(const ExprGprEqual& b) const { return gpr == b.gpr && value == b.value; diff --git a/src/video_core/shader/node.h b/src/video_core/shader/node.h index 1b19a0673..c9840b75e 100644 --- a/src/video_core/shader/node.h +++ b/src/video_core/shader/node.h @@ -282,26 +282,27 @@ struct SeparateSamplerNode; using TrackSamplerData = std::variant<BindlessSamplerNode, SeparateSamplerNode, ArraySamplerNode>; using TrackSampler = std::shared_ptr<TrackSamplerData>; -struct Sampler { +struct SamplerEntry { /// Bound samplers constructor - constexpr explicit Sampler(u32 index, u32 offset, Tegra::Shader::TextureType type, - bool is_array, bool is_shadow, bool is_buffer, bool is_indexed) - : index{index}, offset{offset}, type{type}, is_array{is_array}, is_shadow{is_shadow}, - is_buffer{is_buffer}, is_indexed{is_indexed} {} + explicit SamplerEntry(u32 index_, u32 offset_, Tegra::Shader::TextureType type_, bool is_array_, + bool is_shadow_, bool is_buffer_, bool is_indexed_) + : index{index_}, offset{offset_}, type{type_}, is_array{is_array_}, is_shadow{is_shadow_}, + is_buffer{is_buffer_}, is_indexed{is_indexed_} {} /// Separate sampler constructor - constexpr explicit Sampler(u32 index, std::pair<u32, u32> offsets, std::pair<u32, u32> buffers, - Tegra::Shader::TextureType 
type, bool is_array, bool is_shadow, - bool is_buffer) - : index{index}, offset{offsets.first}, secondary_offset{offsets.second}, - buffer{buffers.first}, secondary_buffer{buffers.second}, type{type}, is_array{is_array}, - is_shadow{is_shadow}, is_buffer{is_buffer}, is_separated{true} {} + explicit SamplerEntry(u32 index_, std::pair<u32, u32> offsets, std::pair<u32, u32> buffers, + Tegra::Shader::TextureType type_, bool is_array_, bool is_shadow_, + bool is_buffer_) + : index{index_}, offset{offsets.first}, secondary_offset{offsets.second}, + buffer{buffers.first}, secondary_buffer{buffers.second}, type{type_}, is_array{is_array_}, + is_shadow{is_shadow_}, is_buffer{is_buffer_}, is_separated{true} {} /// Bindless samplers constructor - constexpr explicit Sampler(u32 index, u32 offset, u32 buffer, Tegra::Shader::TextureType type, - bool is_array, bool is_shadow, bool is_buffer, bool is_indexed) - : index{index}, offset{offset}, buffer{buffer}, type{type}, is_array{is_array}, - is_shadow{is_shadow}, is_buffer{is_buffer}, is_bindless{true}, is_indexed{is_indexed} {} + explicit SamplerEntry(u32 index_, u32 offset_, u32 buffer_, Tegra::Shader::TextureType type_, + bool is_array_, bool is_shadow_, bool is_buffer_, bool is_indexed_) + : index{index_}, offset{offset_}, buffer{buffer_}, type{type_}, is_array{is_array_}, + is_shadow{is_shadow_}, is_buffer{is_buffer_}, is_bindless{true}, is_indexed{is_indexed_} { + } u32 index = 0; ///< Emulated index given for the this sampler. u32 offset = 0; ///< Offset in the const buffer from where the sampler is being read. @@ -338,15 +339,15 @@ struct BindlessSamplerNode { u32 offset; }; -struct Image { +struct ImageEntry { public: /// Bound images constructor - constexpr explicit Image(u32 index, u32 offset, Tegra::Shader::ImageType type) - : index{index}, offset{offset}, type{type} {} + explicit ImageEntry(u32 index_, u32 offset_, Tegra::Shader::ImageType type_) + : index{index_}, offset{offset_}, type{type_} {} /// Bindless samplers constructor - constexpr explicit Image(u32 index, u32 offset, u32 buffer, Tegra::Shader::ImageType type) - : index{index}, offset{offset}, buffer{buffer}, type{type}, is_bindless{true} {} + explicit ImageEntry(u32 index_, u32 offset_, u32 buffer_, Tegra::Shader::ImageType type_) + : index{index_}, offset{offset_}, buffer{buffer_}, type{type_}, is_bindless{true} {} void MarkWrite() { is_written = true; @@ -377,7 +378,7 @@ struct GlobalMemoryBase { u32 cbuf_index = 0; u32 cbuf_offset = 0; - bool operator<(const GlobalMemoryBase& rhs) const { + [[nodiscard]] bool operator<(const GlobalMemoryBase& rhs) const { return std::tie(cbuf_index, cbuf_offset) < std::tie(rhs.cbuf_index, rhs.cbuf_offset); } }; @@ -389,7 +390,7 @@ struct MetaArithmetic { /// Parameters describing a texture sampler struct MetaTexture { - Sampler sampler; + SamplerEntry sampler; Node array; Node depth_compare; std::vector<Node> aoffi; @@ -403,7 +404,7 @@ struct MetaTexture { }; struct MetaImage { - const Image& image; + const ImageEntry& image; std::vector<Node> values; u32 element{}; }; @@ -414,7 +415,7 @@ using Meta = class AmendNode { public: - std::optional<std::size_t> GetAmendIndex() const { + [[nodiscard]] std::optional<std::size_t> GetAmendIndex() const { if (amend_index == amend_null_index) { return std::nullopt; } @@ -437,30 +438,30 @@ private: /// Holds any kind of operation that can be done in the IR class OperationNode final : public AmendNode { public: - explicit OperationNode(OperationCode code) : OperationNode(code, Meta{}) {} + explicit 
OperationNode(OperationCode code_) : OperationNode(code_, Meta{}) {} - explicit OperationNode(OperationCode code, Meta meta) - : OperationNode(code, std::move(meta), std::vector<Node>{}) {} + explicit OperationNode(OperationCode code_, Meta meta_) + : OperationNode(code_, std::move(meta_), std::vector<Node>{}) {} - explicit OperationNode(OperationCode code, std::vector<Node> operands) - : OperationNode(code, Meta{}, std::move(operands)) {} + explicit OperationNode(OperationCode code_, std::vector<Node> operands_) + : OperationNode(code_, Meta{}, std::move(operands_)) {} - explicit OperationNode(OperationCode code, Meta meta, std::vector<Node> operands) - : code{code}, meta{std::move(meta)}, operands{std::move(operands)} {} + explicit OperationNode(OperationCode code_, Meta meta_, std::vector<Node> operands_) + : code{code_}, meta{std::move(meta_)}, operands{std::move(operands_)} {} template <typename... Args> - explicit OperationNode(OperationCode code, Meta meta, Args&&... operands) - : code{code}, meta{std::move(meta)}, operands{operands...} {} + explicit OperationNode(OperationCode code_, Meta meta_, Args&&... operands_) + : code{code_}, meta{std::move(meta_)}, operands{operands_...} {} - OperationCode GetCode() const { + [[nodiscard]] OperationCode GetCode() const { return code; } - const Meta& GetMeta() const { + [[nodiscard]] const Meta& GetMeta() const { return meta; } - std::size_t GetOperandsCount() const { + [[nodiscard]] std::size_t GetOperandsCount() const { return operands.size(); } @@ -472,7 +473,7 @@ public: return operands; } - const Node& operator[](std::size_t operand_index) const { + [[nodiscard]] const Node& operator[](std::size_t operand_index) const { return operands.at(operand_index); } @@ -485,14 +486,14 @@ private: /// Encloses inside any kind of node that returns a boolean conditionally-executed code class ConditionalNode final : public AmendNode { public: - explicit ConditionalNode(Node condition, std::vector<Node>&& code) - : condition{std::move(condition)}, code{std::move(code)} {} + explicit ConditionalNode(Node condition_, std::vector<Node>&& code_) + : condition{std::move(condition_)}, code{std::move(code_)} {} - const Node& GetCondition() const { + [[nodiscard]] const Node& GetCondition() const { return condition; } - const std::vector<Node>& GetCode() const { + [[nodiscard]] const std::vector<Node>& GetCode() const { return code; } @@ -504,9 +505,9 @@ private: /// A general purpose register class GprNode final { public: - explicit constexpr GprNode(Tegra::Shader::Register index) : index{index} {} + explicit constexpr GprNode(Tegra::Shader::Register index_) : index{index_} {} - u32 GetIndex() const { + [[nodiscard]] constexpr u32 GetIndex() const { return static_cast<u32>(index); } @@ -517,9 +518,9 @@ private: /// A custom variable class CustomVarNode final { public: - explicit constexpr CustomVarNode(u32 index) : index{index} {} + explicit constexpr CustomVarNode(u32 index_) : index{index_} {} - constexpr u32 GetIndex() const { + [[nodiscard]] constexpr u32 GetIndex() const { return index; } @@ -530,9 +531,9 @@ private: /// A 32-bits value that represents an immediate value class ImmediateNode final { public: - explicit constexpr ImmediateNode(u32 value) : value{value} {} + explicit constexpr ImmediateNode(u32 value_) : value{value_} {} - u32 GetValue() const { + [[nodiscard]] constexpr u32 GetValue() const { return value; } @@ -543,9 +544,9 @@ private: /// One of Maxwell's internal flags class InternalFlagNode final { public: - explicit constexpr 
InternalFlagNode(InternalFlag flag) : flag{flag} {} + explicit constexpr InternalFlagNode(InternalFlag flag_) : flag{flag_} {} - InternalFlag GetFlag() const { + [[nodiscard]] constexpr InternalFlag GetFlag() const { return flag; } @@ -556,14 +557,14 @@ private: /// A predicate register, it can be negated without additional nodes class PredicateNode final { public: - explicit constexpr PredicateNode(Tegra::Shader::Pred index, bool negated) - : index{index}, negated{negated} {} + explicit constexpr PredicateNode(Tegra::Shader::Pred index_, bool negated_) + : index{index_}, negated{negated_} {} - Tegra::Shader::Pred GetIndex() const { + [[nodiscard]] constexpr Tegra::Shader::Pred GetIndex() const { return index; } - bool IsNegated() const { + [[nodiscard]] constexpr bool IsNegated() const { return negated; } @@ -576,30 +577,30 @@ private: class AbufNode final { public: // Initialize for standard attributes (index is explicit). - explicit AbufNode(Tegra::Shader::Attribute::Index index, u32 element, Node buffer = {}) - : buffer{std::move(buffer)}, index{index}, element{element} {} + explicit AbufNode(Tegra::Shader::Attribute::Index index_, u32 element_, Node buffer_ = {}) + : buffer{std::move(buffer_)}, index{index_}, element{element_} {} // Initialize for physical attributes (index is a variable value). - explicit AbufNode(Node physical_address, Node buffer = {}) - : physical_address{std::move(physical_address)}, buffer{std::move(buffer)} {} + explicit AbufNode(Node physical_address_, Node buffer_ = {}) + : physical_address{std::move(physical_address_)}, buffer{std::move(buffer_)} {} - Tegra::Shader::Attribute::Index GetIndex() const { + [[nodiscard]] Tegra::Shader::Attribute::Index GetIndex() const { return index; } - u32 GetElement() const { + [[nodiscard]] u32 GetElement() const { return element; } - const Node& GetBuffer() const { + [[nodiscard]] const Node& GetBuffer() const { return buffer; } - bool IsPhysicalBuffer() const { + [[nodiscard]] bool IsPhysicalBuffer() const { return static_cast<bool>(physical_address); } - const Node& GetPhysicalAddress() const { + [[nodiscard]] const Node& GetPhysicalAddress() const { return physical_address; } @@ -613,9 +614,9 @@ private: /// Patch memory (used to communicate tessellation stages). 
class PatchNode final { public: - explicit PatchNode(u32 offset) : offset{offset} {} + explicit constexpr PatchNode(u32 offset_) : offset{offset_} {} - u32 GetOffset() const { + [[nodiscard]] constexpr u32 GetOffset() const { return offset; } @@ -626,13 +627,13 @@ private: /// Constant buffer node, usually mapped to uniform buffers in GLSL class CbufNode final { public: - explicit CbufNode(u32 index, Node offset) : index{index}, offset{std::move(offset)} {} + explicit CbufNode(u32 index_, Node offset_) : index{index_}, offset{std::move(offset_)} {} - u32 GetIndex() const { + [[nodiscard]] u32 GetIndex() const { return index; } - const Node& GetOffset() const { + [[nodiscard]] const Node& GetOffset() const { return offset; } @@ -644,9 +645,9 @@ private: /// Local memory node class LmemNode final { public: - explicit LmemNode(Node address) : address{std::move(address)} {} + explicit LmemNode(Node address_) : address{std::move(address_)} {} - const Node& GetAddress() const { + [[nodiscard]] const Node& GetAddress() const { return address; } @@ -657,9 +658,9 @@ private: /// Shared memory node class SmemNode final { public: - explicit SmemNode(Node address) : address{std::move(address)} {} + explicit SmemNode(Node address_) : address{std::move(address_)} {} - const Node& GetAddress() const { + [[nodiscard]] const Node& GetAddress() const { return address; } @@ -670,19 +671,19 @@ private: /// Global memory node class GmemNode final { public: - explicit GmemNode(Node real_address, Node base_address, const GlobalMemoryBase& descriptor) - : real_address{std::move(real_address)}, base_address{std::move(base_address)}, - descriptor{descriptor} {} + explicit GmemNode(Node real_address_, Node base_address_, const GlobalMemoryBase& descriptor_) + : real_address{std::move(real_address_)}, base_address{std::move(base_address_)}, + descriptor{descriptor_} {} - const Node& GetRealAddress() const { + [[nodiscard]] const Node& GetRealAddress() const { return real_address; } - const Node& GetBaseAddress() const { + [[nodiscard]] const Node& GetBaseAddress() const { return base_address; } - const GlobalMemoryBase& GetDescriptor() const { + [[nodiscard]] const GlobalMemoryBase& GetDescriptor() const { return descriptor; } @@ -695,9 +696,9 @@ private: /// Commentary, can be dropped class CommentNode final { public: - explicit CommentNode(std::string text) : text{std::move(text)} {} + explicit CommentNode(std::string text_) : text{std::move(text_)} {} - const std::string& GetText() const { + [[nodiscard]] const std::string& GetText() const { return text; } diff --git a/src/video_core/shader/node_helper.cpp b/src/video_core/shader/node_helper.cpp index 7bf4ff387..6a5b6940d 100644 --- a/src/video_core/shader/node_helper.cpp +++ b/src/video_core/shader/node_helper.cpp @@ -107,7 +107,7 @@ OperationCode SignedToUnsignedCode(OperationCode operation_code, bool is_signed) UNREACHABLE_MSG("Can't apply absolute to an unsigned integer"); return {}; default: - UNREACHABLE_MSG("Unknown signed operation with code={}", static_cast<u32>(operation_code)); + UNREACHABLE_MSG("Unknown signed operation with code={}", operation_code); return {}; } } diff --git a/src/video_core/shader/registry.cpp b/src/video_core/shader/registry.cpp index cdf274e54..148d91fcb 100644 --- a/src/video_core/shader/registry.cpp +++ b/src/video_core/shader/registry.cpp @@ -24,44 +24,45 @@ GraphicsInfo MakeGraphicsInfo(ShaderType shader_stage, ConstBufferEngineInterfac if (shader_stage == ShaderType::Compute) { return {}; } - auto& graphics = 
static_cast<Tegra::Engines::Maxwell3D&>(engine); - - GraphicsInfo info; - info.tfb_layouts = graphics.regs.tfb_layouts; - info.tfb_varying_locs = graphics.regs.tfb_varying_locs; - info.primitive_topology = graphics.regs.draw.topology; - info.tessellation_primitive = graphics.regs.tess_mode.prim; - info.tessellation_spacing = graphics.regs.tess_mode.spacing; - info.tfb_enabled = graphics.regs.tfb_enabled; - info.tessellation_clockwise = graphics.regs.tess_mode.cw; - return info; + + auto& graphics = dynamic_cast<Tegra::Engines::Maxwell3D&>(engine); + + return { + .tfb_layouts = graphics.regs.tfb_layouts, + .tfb_varying_locs = graphics.regs.tfb_varying_locs, + .primitive_topology = graphics.regs.draw.topology, + .tessellation_primitive = graphics.regs.tess_mode.prim, + .tessellation_spacing = graphics.regs.tess_mode.spacing, + .tfb_enabled = graphics.regs.tfb_enabled != 0, + .tessellation_clockwise = graphics.regs.tess_mode.cw.Value() != 0, + }; } ComputeInfo MakeComputeInfo(ShaderType shader_stage, ConstBufferEngineInterface& engine) { if (shader_stage != ShaderType::Compute) { return {}; } - auto& compute = static_cast<Tegra::Engines::KeplerCompute&>(engine); + + auto& compute = dynamic_cast<Tegra::Engines::KeplerCompute&>(engine); const auto& launch = compute.launch_description; - ComputeInfo info; - info.workgroup_size = {launch.block_dim_x, launch.block_dim_y, launch.block_dim_z}; - info.local_memory_size_in_words = launch.local_pos_alloc; - info.shared_memory_size_in_words = launch.shared_alloc; - return info; + return { + .workgroup_size = {launch.block_dim_x, launch.block_dim_y, launch.block_dim_z}, + .shared_memory_size_in_words = launch.shared_alloc, + .local_memory_size_in_words = launch.local_pos_alloc, + }; } } // Anonymous namespace -Registry::Registry(Tegra::Engines::ShaderType shader_stage, const SerializedRegistryInfo& info) +Registry::Registry(ShaderType shader_stage, const SerializedRegistryInfo& info) : stage{shader_stage}, stored_guest_driver_profile{info.guest_driver_profile}, bound_buffer{info.bound_buffer}, graphics_info{info.graphics}, compute_info{info.compute} {} -Registry::Registry(Tegra::Engines::ShaderType shader_stage, - Tegra::Engines::ConstBufferEngineInterface& engine) - : stage{shader_stage}, engine{&engine}, bound_buffer{engine.GetBoundBuffer()}, - graphics_info{MakeGraphicsInfo(shader_stage, engine)}, compute_info{MakeComputeInfo( - shader_stage, engine)} {} +Registry::Registry(ShaderType shader_stage, ConstBufferEngineInterface& engine_) + : stage{shader_stage}, engine{&engine_}, bound_buffer{engine_.GetBoundBuffer()}, + graphics_info{MakeGraphicsInfo(shader_stage, engine_)}, compute_info{MakeComputeInfo( + shader_stage, engine_)} {} Registry::~Registry() = default; @@ -113,8 +114,7 @@ std::optional<Tegra::Engines::SamplerDescriptor> Registry::ObtainSeparateSampler return value; } -std::optional<Tegra::Engines::SamplerDescriptor> Registry::ObtainBindlessSampler(u32 buffer, - u32 offset) { +std::optional<SamplerDescriptor> Registry::ObtainBindlessSampler(u32 buffer, u32 offset) { const std::pair key = {buffer, offset}; const auto iter = bindless_samplers.find(key); if (iter != bindless_samplers.end()) { diff --git a/src/video_core/shader/registry.h b/src/video_core/shader/registry.h index 231206765..4bebefdde 100644 --- a/src/video_core/shader/registry.h +++ b/src/video_core/shader/registry.h @@ -94,7 +94,7 @@ public: explicit Registry(Tegra::Engines::ShaderType shader_stage, const SerializedRegistryInfo& info); explicit 
Registry(Tegra::Engines::ShaderType shader_stage, - Tegra::Engines::ConstBufferEngineInterface& engine); + Tegra::Engines::ConstBufferEngineInterface& engine_); ~Registry(); diff --git a/src/video_core/shader/shader_ir.cpp b/src/video_core/shader/shader_ir.cpp index f207bbfbf..caf5ff362 100644 --- a/src/video_core/shader/shader_ir.cpp +++ b/src/video_core/shader/shader_ir.cpp @@ -25,9 +25,10 @@ using Tegra::Shader::PredCondition; using Tegra::Shader::PredOperation; using Tegra::Shader::Register; -ShaderIR::ShaderIR(const ProgramCode& program_code, u32 main_offset, CompilerSettings settings, - Registry& registry) - : program_code{program_code}, main_offset{main_offset}, settings{settings}, registry{registry} { +ShaderIR::ShaderIR(const ProgramCode& program_code_, u32 main_offset_, CompilerSettings settings_, + Registry& registry_) + : program_code{program_code_}, main_offset{main_offset_}, settings{settings_}, registry{ + registry_} { Decode(); PostDecode(); } @@ -170,7 +171,7 @@ Node ShaderIR::ConvertIntegerSize(Node value, Register::Size size, bool is_signe // Default - do nothing return value; default: - UNREACHABLE_MSG("Unimplemented conversion size: {}", static_cast<u32>(size)); + UNREACHABLE_MSG("Unimplemented conversion size: {}", size); return value; } } @@ -335,15 +336,15 @@ OperationCode ShaderIR::GetPredicateCombiner(PredOperation operation) { return operation_table[index]; } -Node ShaderIR::GetConditionCode(Tegra::Shader::ConditionCode cc) const { +Node ShaderIR::GetConditionCode(ConditionCode cc) const { switch (cc) { - case Tegra::Shader::ConditionCode::NEU: + case ConditionCode::NEU: return GetInternalFlag(InternalFlag::Zero, true); - case Tegra::Shader::ConditionCode::FCSM_TR: + case ConditionCode::FCSM_TR: UNIMPLEMENTED_MSG("EXIT.FCSM_TR is not implemented"); return MakeNode<PredicateNode>(Pred::NeverExecute, false); default: - UNIMPLEMENTED_MSG("Unimplemented condition code: {}", static_cast<u32>(cc)); + UNIMPLEMENTED_MSG("Unimplemented condition code: {}", cc); return MakeNode<PredicateNode>(Pred::NeverExecute, false); } } @@ -496,8 +497,8 @@ void ShaderIR::MarkAttributeUsage(Attribute::Index index, u64 element) { } std::size_t ShaderIR::DeclareAmend(Node new_amend) { - const std::size_t id = amend_code.size(); - amend_code.push_back(new_amend); + const auto id = amend_code.size(); + amend_code.push_back(std::move(new_amend)); return id; } diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h index b450f3b8a..0afa39531 100644 --- a/src/video_core/shader/shader_ir.h +++ b/src/video_core/shader/shader_ir.h @@ -29,8 +29,8 @@ struct ShaderBlock; constexpr u32 MAX_PROGRAM_LENGTH = 0x1000; struct ConstBuffer { - constexpr explicit ConstBuffer(u32 max_offset, bool is_indirect) - : max_offset{max_offset}, is_indirect{is_indirect} {} + constexpr explicit ConstBuffer(u32 max_offset_, bool is_indirect_) + : max_offset{max_offset_}, is_indirect{is_indirect_} {} constexpr ConstBuffer() = default; @@ -66,8 +66,8 @@ struct GlobalMemoryUsage { class ShaderIR final { public: - explicit ShaderIR(const ProgramCode& program_code, u32 main_offset, CompilerSettings settings, - Registry& registry); + explicit ShaderIR(const ProgramCode& program_code_, u32 main_offset_, + CompilerSettings settings_, Registry& registry_); ~ShaderIR(); const std::map<u32, NodeBlock>& GetBasicBlocks() const { @@ -94,11 +94,11 @@ public: return used_cbufs; } - const std::list<Sampler>& GetSamplers() const { + const std::list<SamplerEntry>& GetSamplers() const { return used_samplers; } - 
const std::list<Image>& GetImages() const { + const std::list<ImageEntry>& GetImages() const { return used_images; } @@ -334,17 +334,17 @@ private: std::optional<Tegra::Engines::SamplerDescriptor> sampler); /// Accesses a texture sampler. - std::optional<Sampler> GetSampler(Tegra::Shader::Sampler sampler, SamplerInfo info); + std::optional<SamplerEntry> GetSampler(Tegra::Shader::Sampler sampler, SamplerInfo info); /// Accesses a texture sampler for a bindless texture. - std::optional<Sampler> GetBindlessSampler(Tegra::Shader::Register reg, SamplerInfo info, - Node& index_var); + std::optional<SamplerEntry> GetBindlessSampler(Tegra::Shader::Register reg, SamplerInfo info, + Node& index_var); /// Accesses an image. - Image& GetImage(Tegra::Shader::Image image, Tegra::Shader::ImageType type); + ImageEntry& GetImage(Tegra::Shader::Image image, Tegra::Shader::ImageType type); /// Access a bindless image sampler. - Image& GetBindlessImage(Tegra::Shader::Register reg, Tegra::Shader::ImageType type); + ImageEntry& GetBindlessImage(Tegra::Shader::Register reg, Tegra::Shader::ImageType type); /// Recursive Iteration over the OperationNode operands, searching for GprNodes. void SearchOperands(NodeBlock& nb, Node var); @@ -457,8 +457,8 @@ private: std::set<Tegra::Shader::Attribute::Index> used_input_attributes; std::set<Tegra::Shader::Attribute::Index> used_output_attributes; std::map<u32, ConstBuffer> used_cbufs; - std::list<Sampler> used_samplers; - std::list<Image> used_images; + std::list<SamplerEntry> used_samplers; + std::list<ImageEntry> used_images; std::array<bool, Tegra::Engines::Maxwell3D::Regs::NumClipDistances> used_clip_distances{}; std::map<GlobalMemoryBase, GlobalMemoryUsage> used_global_memory; bool uses_layer{}; diff --git a/src/video_core/surface.cpp b/src/video_core/surface.cpp index 1688267bb..6308aef94 100644 --- a/src/video_core/surface.cpp +++ b/src/video_core/surface.cpp @@ -28,7 +28,7 @@ SurfaceTarget SurfaceTargetFromTextureType(Tegra::Texture::TextureType texture_t case Tegra::Texture::TextureType::Texture2DArray: return SurfaceTarget::Texture2DArray; default: - LOG_CRITICAL(HW_GPU, "Unimplemented texture_type={}", static_cast<u32>(texture_type)); + LOG_CRITICAL(HW_GPU, "Unimplemented texture_type={}", texture_type); UNREACHABLE(); return SurfaceTarget::Texture2D; } @@ -47,7 +47,7 @@ bool SurfaceTargetIsLayered(SurfaceTarget target) { case SurfaceTarget::TextureCubeArray: return true; default: - LOG_CRITICAL(HW_GPU, "Unimplemented surface_target={}", static_cast<u32>(target)); + LOG_CRITICAL(HW_GPU, "Unimplemented surface_target={}", target); UNREACHABLE(); return false; } @@ -66,7 +66,7 @@ bool SurfaceTargetIsArray(SurfaceTarget target) { case SurfaceTarget::TextureCubeArray: return true; default: - LOG_CRITICAL(HW_GPU, "Unimplemented surface_target={}", static_cast<u32>(target)); + LOG_CRITICAL(HW_GPU, "Unimplemented surface_target={}", target); UNREACHABLE(); return false; } @@ -85,7 +85,7 @@ PixelFormat PixelFormatFromDepthFormat(Tegra::DepthFormat format) { case Tegra::DepthFormat::D32_FLOAT_S8X24_UINT: return PixelFormat::D32_FLOAT_S8_UINT; default: - UNIMPLEMENTED_MSG("Unimplemented format={}", static_cast<u32>(format)); + UNIMPLEMENTED_MSG("Unimplemented format={}", format); return PixelFormat::S8_UINT_D24_UNORM; } } @@ -183,7 +183,7 @@ PixelFormat PixelFormatFromRenderTargetFormat(Tegra::RenderTargetFormat format) case Tegra::RenderTargetFormat::R8_UINT: return PixelFormat::R8_UINT; default: - UNIMPLEMENTED_MSG("Unimplemented format={}", static_cast<int>(format)); 
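Throughout these hunks the static_cast<int>/static_cast<u32> wrappers disappear from UNIMPLEMENTED_MSG and LOG_CRITICAL arguments. fmt-based macros can only print an enum class directly when a fmt::formatter specialization for it is visible, so the casts become redundant once such specializations exist. A sketch of one way to provide that, using a hypothetical enum rather than yuzu's actual formatter code:

#include <fmt/format.h>

enum class RenderTargetFormat : unsigned { R8_UINT = 0x12 };

// Format the enum as its underlying integer so call sites need no cast.
// Hypothetical sketch of the mechanism, not yuzu's actual formatter code.
template <>
struct fmt::formatter<RenderTargetFormat> : fmt::formatter<unsigned> {
    template <typename FormatContext>
    auto format(RenderTargetFormat format, FormatContext& ctx) {
        return fmt::formatter<unsigned>::format(static_cast<unsigned>(format), ctx);
    }
};

int main() {
    fmt::print("Unimplemented format={}\n", RenderTargetFormat::R8_UINT); // prints 18
}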
+ UNIMPLEMENTED_MSG("Unimplemented format={}", format); return PixelFormat::A8B8G8R8_UNORM; } } @@ -197,7 +197,7 @@ PixelFormat PixelFormatFromGPUPixelFormat(Tegra::FramebufferConfig::PixelFormat case Tegra::FramebufferConfig::PixelFormat::B8G8R8A8_UNORM: return PixelFormat::B8G8R8A8_UNORM; default: - UNIMPLEMENTED_MSG("Unimplemented format={}", static_cast<u32>(format)); + UNIMPLEMENTED_MSG("Unimplemented format={}", format); return PixelFormat::A8B8G8R8_UNORM; } } @@ -280,7 +280,7 @@ bool IsPixelFormatSRGB(PixelFormat format) { } std::pair<u32, u32> GetASTCBlockSize(PixelFormat format) { - return {GetDefaultBlockWidth(format), GetDefaultBlockHeight(format)}; + return {DefaultBlockWidth(format), DefaultBlockHeight(format)}; } } // namespace VideoCore::Surface diff --git a/src/video_core/surface.h b/src/video_core/surface.h index cfd12fa61..c40ab89d0 100644 --- a/src/video_core/surface.h +++ b/src/video_core/surface.h @@ -120,7 +120,7 @@ enum class PixelFormat { Max = MaxDepthStencilFormat, Invalid = 255, }; -static constexpr std::size_t MaxPixelFormat = static_cast<std::size_t>(PixelFormat::Max); +constexpr std::size_t MaxPixelFormat = static_cast<std::size_t>(PixelFormat::Max); enum class SurfaceType { ColorTexture = 0, @@ -140,117 +140,7 @@ enum class SurfaceTarget { TextureCubeArray, }; -constexpr std::array<u32, MaxPixelFormat> compression_factor_shift_table = {{ - 0, // A8B8G8R8_UNORM - 0, // A8B8G8R8_SNORM - 0, // A8B8G8R8_SINT - 0, // A8B8G8R8_UINT - 0, // R5G6B5_UNORM - 0, // B5G6R5_UNORM - 0, // A1R5G5B5_UNORM - 0, // A2B10G10R10_UNORM - 0, // A2B10G10R10_UINT - 0, // A1B5G5R5_UNORM - 0, // R8_UNORM - 0, // R8_SNORM - 0, // R8_SINT - 0, // R8_UINT - 0, // R16G16B16A16_FLOAT - 0, // R16G16B16A16_UNORM - 0, // R16G16B16A16_SNORM - 0, // R16G16B16A16_SINT - 0, // R16G16B16A16_UINT - 0, // B10G11R11_FLOAT - 0, // R32G32B32A32_UINT - 2, // BC1_RGBA_UNORM - 2, // BC2_UNORM - 2, // BC3_UNORM - 2, // BC4_UNORM - 2, // BC4_SNORM - 2, // BC5_UNORM - 2, // BC5_SNORM - 2, // BC7_UNORM - 2, // BC6H_UFLOAT - 2, // BC6H_SFLOAT - 2, // ASTC_2D_4X4_UNORM - 0, // B8G8R8A8_UNORM - 0, // R32G32B32A32_FLOAT - 0, // R32G32B32A32_SINT - 0, // R32G32_FLOAT - 0, // R32G32_SINT - 0, // R32_FLOAT - 0, // R16_FLOAT - 0, // R16_UNORM - 0, // R16_SNORM - 0, // R16_UINT - 0, // R16_SINT - 0, // R16G16_UNORM - 0, // R16G16_FLOAT - 0, // R16G16_UINT - 0, // R16G16_SINT - 0, // R16G16_SNORM - 0, // R32G32B32_FLOAT - 0, // A8B8G8R8_SRGB - 0, // R8G8_UNORM - 0, // R8G8_SNORM - 0, // R8G8_SINT - 0, // R8G8_UINT - 0, // R32G32_UINT - 0, // R16G16B16X16_FLOAT - 0, // R32_UINT - 0, // R32_SINT - 2, // ASTC_2D_8X8_UNORM - 2, // ASTC_2D_8X5_UNORM - 2, // ASTC_2D_5X4_UNORM - 0, // B8G8R8A8_SRGB - 2, // BC1_RGBA_SRGB - 2, // BC2_SRGB - 2, // BC3_SRGB - 2, // BC7_SRGB - 0, // A4B4G4R4_UNORM - 2, // ASTC_2D_4X4_SRGB - 2, // ASTC_2D_8X8_SRGB - 2, // ASTC_2D_8X5_SRGB - 2, // ASTC_2D_5X4_SRGB - 2, // ASTC_2D_5X5_UNORM - 2, // ASTC_2D_5X5_SRGB - 2, // ASTC_2D_10X8_UNORM - 2, // ASTC_2D_10X8_SRGB - 2, // ASTC_2D_6X6_UNORM - 2, // ASTC_2D_6X6_SRGB - 2, // ASTC_2D_10X10_UNORM - 2, // ASTC_2D_10X10_SRGB - 2, // ASTC_2D_12X12_UNORM - 2, // ASTC_2D_12X12_SRGB - 2, // ASTC_2D_8X6_UNORM - 2, // ASTC_2D_8X6_SRGB - 2, // ASTC_2D_6X5_UNORM - 2, // ASTC_2D_6X5_SRGB - 0, // E5B9G9R9_FLOAT - 0, // D32_FLOAT - 0, // D16_UNORM - 0, // D24_UNORM_S8_UINT - 0, // S8_UINT_D24_UNORM - 0, // D32_FLOAT_S8_UINT -}}; - -/** - * Gets the compression factor for the specified PixelFormat. 
This applies to just the - * "compressed width" and "compressed height", not the overall compression factor of a - * compressed image. This is used for maintaining proper surface sizes for compressed - * texture formats. - */ -inline constexpr u32 GetCompressionFactorShift(PixelFormat format) { - DEBUG_ASSERT(format != PixelFormat::Invalid); - DEBUG_ASSERT(static_cast<std::size_t>(format) < compression_factor_shift_table.size()); - return compression_factor_shift_table[static_cast<std::size_t>(format)]; -} - -inline constexpr u32 GetCompressionFactor(PixelFormat format) { - return 1U << GetCompressionFactorShift(format); -} - -constexpr std::array<u32, MaxPixelFormat> block_width_table = {{ +constexpr std::array<u32, MaxPixelFormat> BLOCK_WIDTH_TABLE = {{ 1, // A8B8G8R8_UNORM 1, // A8B8G8R8_SNORM 1, // A8B8G8R8_SINT @@ -344,15 +234,12 @@ constexpr std::array<u32, MaxPixelFormat> block_width_table = {{ 1, // D32_FLOAT_S8_UINT }}; -static constexpr u32 GetDefaultBlockWidth(PixelFormat format) { - if (format == PixelFormat::Invalid) - return 0; - - ASSERT(static_cast<std::size_t>(format) < block_width_table.size()); - return block_width_table[static_cast<std::size_t>(format)]; +constexpr u32 DefaultBlockWidth(PixelFormat format) { + ASSERT(static_cast<std::size_t>(format) < BLOCK_WIDTH_TABLE.size()); + return BLOCK_WIDTH_TABLE[static_cast<std::size_t>(format)]; } -constexpr std::array<u32, MaxPixelFormat> block_height_table = {{ +constexpr std::array<u32, MaxPixelFormat> BLOCK_HEIGHT_TABLE = {{ 1, // A8B8G8R8_UNORM 1, // A8B8G8R8_SNORM 1, // A8B8G8R8_SINT @@ -446,15 +333,12 @@ constexpr std::array<u32, MaxPixelFormat> block_height_table = {{ 1, // D32_FLOAT_S8_UINT }}; -static constexpr u32 GetDefaultBlockHeight(PixelFormat format) { - if (format == PixelFormat::Invalid) - return 0; - - ASSERT(static_cast<std::size_t>(format) < block_height_table.size()); - return block_height_table[static_cast<std::size_t>(format)]; +constexpr u32 DefaultBlockHeight(PixelFormat format) { + ASSERT(static_cast<std::size_t>(format) < BLOCK_HEIGHT_TABLE.size()); + return BLOCK_HEIGHT_TABLE[static_cast<std::size_t>(format)]; } -constexpr std::array<u32, MaxPixelFormat> bpp_table = {{ +constexpr std::array<u32, MaxPixelFormat> BITS_PER_BLOCK_TABLE = {{ 32, // A8B8G8R8_UNORM 32, // A8B8G8R8_SNORM 32, // A8B8G8R8_SINT @@ -548,20 +432,14 @@ constexpr std::array<u32, MaxPixelFormat> bpp_table = {{ 64, // D32_FLOAT_S8_UINT }}; -static constexpr u32 GetFormatBpp(PixelFormat format) { - if (format == PixelFormat::Invalid) - return 0; - - ASSERT(static_cast<std::size_t>(format) < bpp_table.size()); - return bpp_table[static_cast<std::size_t>(format)]; +constexpr u32 BitsPerBlock(PixelFormat format) { + ASSERT(static_cast<std::size_t>(format) < BITS_PER_BLOCK_TABLE.size()); + return BITS_PER_BLOCK_TABLE[static_cast<std::size_t>(format)]; } /// Returns the sizer in bytes of the specified pixel format -static constexpr u32 GetBytesPerPixel(PixelFormat pixel_format) { - if (pixel_format == PixelFormat::Invalid) { - return 0; - } - return GetFormatBpp(pixel_format) / CHAR_BIT; +constexpr u32 BytesPerBlock(PixelFormat pixel_format) { + return BitsPerBlock(pixel_format) / CHAR_BIT; } SurfaceTarget SurfaceTargetFromTextureType(Tegra::Texture::TextureType texture_type); diff --git a/src/video_core/texture_cache/accelerated_swizzle.cpp b/src/video_core/texture_cache/accelerated_swizzle.cpp new file mode 100644 index 000000000..a4fc1184b --- /dev/null +++ b/src/video_core/texture_cache/accelerated_swizzle.cpp @@ -0,0 +1,70 @@ +// 
Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <array>
+#include <bit>
+
+#include "common/alignment.h"
+#include "common/common_types.h"
+#include "common/div_ceil.h"
+#include "video_core/surface.h"
+#include "video_core/texture_cache/accelerated_swizzle.h"
+#include "video_core/texture_cache/util.h"
+#include "video_core/textures/decoders.h"
+
+namespace VideoCommon::Accelerated {
+
+using Tegra::Texture::GOB_SIZE_SHIFT;
+using Tegra::Texture::GOB_SIZE_X;
+using Tegra::Texture::GOB_SIZE_X_SHIFT;
+using Tegra::Texture::GOB_SIZE_Y_SHIFT;
+using VideoCore::Surface::BytesPerBlock;
+
+BlockLinearSwizzle2DParams MakeBlockLinearSwizzle2DParams(const SwizzleParameters& swizzle,
+                                                          const ImageInfo& info) {
+    const Extent3D block = swizzle.block;
+    const Extent3D num_tiles = swizzle.num_tiles;
+    const u32 bytes_per_block = BytesPerBlock(info.format);
+    const u32 stride_alignment = CalculateLevelStrideAlignment(info, swizzle.level);
+    const u32 stride = Common::AlignBits(num_tiles.width, stride_alignment) * bytes_per_block;
+    const u32 gobs_in_x = Common::DivCeilLog2(stride, GOB_SIZE_X_SHIFT);
+    return BlockLinearSwizzle2DParams{
+        .origin{0, 0, 0},
+        .destination{0, 0, 0},
+        .bytes_per_block_log2 = static_cast<u32>(std::countr_zero(bytes_per_block)),
+        .layer_stride = info.layer_stride,
+        .block_size = gobs_in_x << (GOB_SIZE_SHIFT + block.height + block.depth),
+        .x_shift = GOB_SIZE_SHIFT + block.height + block.depth,
+        .block_height = block.height,
+        .block_height_mask = (1U << block.height) - 1,
+    };
+}
+
+BlockLinearSwizzle3DParams MakeBlockLinearSwizzle3DParams(const SwizzleParameters& swizzle,
+                                                          const ImageInfo& info) {
+    const Extent3D block = swizzle.block;
+    const Extent3D num_tiles = swizzle.num_tiles;
+    const u32 bytes_per_block = BytesPerBlock(info.format);
+    const u32 stride_alignment = CalculateLevelStrideAlignment(info, swizzle.level);
+    const u32 stride = Common::AlignBits(num_tiles.width, stride_alignment) * bytes_per_block;
+
+    const u32 gobs_in_x = (stride + GOB_SIZE_X - 1) >> GOB_SIZE_X_SHIFT;
+    const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block.height + block.depth);
+    const u32 slice_size =
+        Common::DivCeilLog2(num_tiles.height, block.height + GOB_SIZE_Y_SHIFT) * block_size;
+    return BlockLinearSwizzle3DParams{
+        .origin{0, 0, 0},
+        .destination{0, 0, 0},
+        .bytes_per_block_log2 = static_cast<u32>(std::countr_zero(bytes_per_block)),
+        .slice_size = slice_size,
+        .block_size = block_size,
+        .x_shift = GOB_SIZE_SHIFT + block.height + block.depth,
+        .block_height = block.height,
+        .block_height_mask = (1U << block.height) - 1,
+        .block_depth = block.depth,
+        .block_depth_mask = (1U << block.depth) - 1,
+    };
+}
+
+} // namespace VideoCommon::Accelerated
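A self-contained check of the stride and block-size arithmetic above, with Common::AlignBits and Common::DivCeilLog2 inlined as plain functions and hypothetical image parameters (a 128-texel-wide level, 4-byte blocks); the 512-byte, 64-byte-wide GOB constants match the Tegra block-linear layout:

#include <cstdint>
#include <cstdio>

// Align value up to a multiple of 2^align_log2 (the behavior of Common::AlignBits).
constexpr std::uint32_t AlignBits(std::uint32_t value, std::uint32_t align_log2) {
    return ((value + (1u << align_log2) - 1) >> align_log2) << align_log2;
}
// Divide by 2^align_log2, rounding up (the behavior of Common::DivCeilLog2).
constexpr std::uint32_t DivCeilLog2(std::uint32_t value, std::uint32_t align_log2) {
    return (value + (1u << align_log2) - 1) >> align_log2;
}

int main() {
    constexpr std::uint32_t GOB_SIZE_SHIFT = 9;   // one GOB is 512 bytes
    constexpr std::uint32_t GOB_SIZE_X_SHIFT = 6; // one GOB row is 64 bytes
    const std::uint32_t bytes_per_block = 4;      // e.g. A8B8G8R8
    const std::uint32_t block_height = 4;         // log2 of GOBs per block in Y
    const std::uint32_t block_depth = 0;
    const std::uint32_t stride_alignment = 5;     // hypothetical stand-in for CalculateLevelStrideAlignment
    const std::uint32_t stride = AlignBits(128, stride_alignment) * bytes_per_block; // 512
    const std::uint32_t gobs_in_x = DivCeilLog2(stride, GOB_SIZE_X_SHIFT);           // 8
    const std::uint32_t block_size = gobs_in_x << (GOB_SIZE_SHIFT + block_height + block_depth);
    std::printf("stride=%u gobs_in_x=%u block_size=%u\n", stride, gobs_in_x, block_size); // 512 8 65536
}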
\ No newline at end of file
diff --git a/src/video_core/texture_cache/accelerated_swizzle.h b/src/video_core/texture_cache/accelerated_swizzle.h
new file mode 100644
index 000000000..6ec5c78c4
--- /dev/null
+++ b/src/video_core/texture_cache/accelerated_swizzle.h
@@ -0,0 +1,45 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+
+#include "common/common_types.h"
+#include "video_core/texture_cache/image_info.h"
+#include "video_core/texture_cache/types.h"
+
+namespace VideoCommon::Accelerated {
+
+struct BlockLinearSwizzle2DParams {
+    std::array<u32, 3> origin;
+    std::array<s32, 3> destination;
+    u32 bytes_per_block_log2;
+    u32 layer_stride;
+    u32 block_size;
+    u32 x_shift;
+    u32 block_height;
+    u32 block_height_mask;
+};
+
+struct BlockLinearSwizzle3DParams {
+    std::array<u32, 3> origin;
+    std::array<s32, 3> destination;
+    u32 bytes_per_block_log2;
+    u32 slice_size;
+    u32 block_size;
+    u32 x_shift;
+    u32 block_height;
+    u32 block_height_mask;
+    u32 block_depth;
+    u32 block_depth_mask;
+};
+
+[[nodiscard]] BlockLinearSwizzle2DParams MakeBlockLinearSwizzle2DParams(
+    const SwizzleParameters& swizzle, const ImageInfo& info);
+
+[[nodiscard]] BlockLinearSwizzle3DParams MakeBlockLinearSwizzle3DParams(
+    const SwizzleParameters& swizzle, const ImageInfo& info);
+
+} // namespace VideoCommon::Accelerated
diff --git a/src/video_core/texture_cache/copy_params.h b/src/video_core/texture_cache/copy_params.h
deleted file mode 100644
index 9c21a0649..000000000
--- a/src/video_core/texture_cache/copy_params.h
+++ /dev/null
@@ -1,36 +0,0 @@
-// Copyright 2019 yuzu Emulator Project
-// Licensed under GPLv2 or any later version
-// Refer to the license.txt file included.
-
-#pragma once
-
-#include "common/common_types.h"
-
-namespace VideoCommon {
-
-struct CopyParams {
-    constexpr CopyParams(u32 source_x, u32 source_y, u32 source_z, u32 dest_x, u32 dest_y,
-                         u32 dest_z, u32 source_level, u32 dest_level, u32 width, u32 height,
-                         u32 depth)
-        : source_x{source_x}, source_y{source_y}, source_z{source_z}, dest_x{dest_x},
-          dest_y{dest_y}, dest_z{dest_z}, source_level{source_level},
-          dest_level{dest_level}, width{width}, height{height}, depth{depth} {}
-
-    constexpr CopyParams(u32 width, u32 height, u32 depth, u32 level)
-        : source_x{}, source_y{}, source_z{}, dest_x{}, dest_y{}, dest_z{}, source_level{level},
-          dest_level{level}, width{width}, height{height}, depth{depth} {}
-
-    u32 source_x;
-    u32 source_y;
-    u32 source_z;
-    u32 dest_x;
-    u32 dest_y;
-    u32 dest_z;
-    u32 source_level;
-    u32 dest_level;
-    u32 width;
-    u32 height;
-    u32 depth;
-};
-
-} // namespace VideoCommon
diff --git a/src/video_core/texture_cache/decode_bc4.cpp b/src/video_core/texture_cache/decode_bc4.cpp
new file mode 100644
index 000000000..017327975
--- /dev/null
+++ b/src/video_core/texture_cache/decode_bc4.cpp
@@ -0,0 +1,97 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <array>
+#include <span>
+
+#include "common/assert.h"
+#include "common/common_types.h"
+#include "video_core/texture_cache/decode_bc4.h"
+#include "video_core/texture_cache/types.h"
+
+namespace VideoCommon {
+
+// https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_texture_compression_rgtc.txt
+[[nodiscard]] constexpr u32 DecompressBlock(u64 bits, u32 x, u32 y) {
+    const u32 code_offset = 16 + 3 * (4 * y + x);
+    const u32 code = (bits >> code_offset) & 7;
+    const u32 red0 = (bits >> 0) & 0xff;
+    const u32 red1 = (bits >> 8) & 0xff;
+    if (red0 > red1) {
+        switch (code) {
+        case 0:
+            return red0;
+        case 1:
+            return red1;
+        case 2:
+            return (6 * red0 + 1 * red1) / 7;
+        case 3:
+            return (5 * red0 + 2 * red1) / 7;
+        case 4:
+            return (4 * red0 + 3 * red1) / 7;
+        case 5:
+            return (3 * red0 + 4 * red1) / 7;
+        case 6:
+            return (2 * red0 + 5 * red1) / 7;
+        case 7:
+            return (1 * red0 + 6 * red1) / 7;
+        }
+    } else {
+        switch (code) {
+        case 0:
+            return red0;
+        case 1:
+            return red1;
+        case 2:
+            return (4 * red0 + 1 * red1) / 5;
+        case 3:
+            return (3 * red0 + 2 * red1) / 5;
+        case 4:
+            return (2 * red0 + 3 * red1) / 5;
+        case 5:
+            return (1 * red0 + 4 * red1) / 5;
+        case 6:
+            return 0;
+        case 7:
+            return 0xff;
+        }
+    }
+    return 0;
+}
+
+void DecompressBC4(std::span<const u8> input, Extent3D extent, std::span<u8> output) {
+    UNIMPLEMENTED_IF_MSG(extent.width % 4 != 0, "Unaligned width={}", extent.width);
+    UNIMPLEMENTED_IF_MSG(extent.height % 4 != 0, "Unaligned height={}", extent.height);
+    static constexpr u32 BLOCK_SIZE = 4;
+    size_t input_offset = 0;
+    for (u32 slice = 0; slice < extent.depth; ++slice) {
+        for (u32 block_y = 0; block_y < extent.height / 4; ++block_y) {
+            for (u32 block_x = 0; block_x < extent.width / 4; ++block_x) {
+                u64 bits;
+                std::memcpy(&bits, &input[input_offset], sizeof(bits));
+                input_offset += sizeof(bits);
+
+                for (u32 y = 0; y < BLOCK_SIZE; ++y) {
+                    for (u32 x = 0; x < BLOCK_SIZE; ++x) {
+                        const u32 linear_z = slice;
+                        const u32 linear_y = block_y * BLOCK_SIZE + y;
+                        const u32 linear_x = block_x * BLOCK_SIZE + x;
+                        const u32 offset_z = linear_z * extent.width * extent.height;
+                        const u32 offset_y = linear_y * extent.width;
+                        const u32 offset_x = linear_x;
+                        const u32 output_offset = (offset_z + offset_y + offset_x) * 4ULL;
+                        const u32 color = DecompressBlock(bits, x, y);
+                        output[output_offset + 0] = static_cast<u8>(color);
+                        output[output_offset + 1] = 0;
+                        output[output_offset + 2] = 0;
+                        output[output_offset + 3] = 0xff;
+                    }
+                }
+            }
+        }
+    }
+}
+
+} // namespace VideoCommon
diff --git a/src/video_core/texture_cache/decode_bc4.h b/src/video_core/texture_cache/decode_bc4.h
new file mode 100644
index 000000000..63fb23508
--- /dev/null
+++ b/src/video_core/texture_cache/decode_bc4.h
@@ -0,0 +1,16 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
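For reference, each BC4 block above is 8 bytes: two 8-bit endpoints red0 and red1, then sixteen 3-bit selector codes, with the code for texel (x, y) stored at bit 16 + 3 * (4 * y + x), which is where code_offset comes from. A worked check of the red0 > red1 interpolation branch with hypothetical endpoints:

#include <cstdint>
#include <cstdio>

// Mirrors the red0 > red1 branch of DecompressBlock: codes 0/1 return the
// endpoints, codes 2..7 blend them in sevenths.
constexpr std::uint32_t Interpolate7(std::uint32_t red0, std::uint32_t red1, std::uint32_t code) {
    if (code == 0) {
        return red0;
    }
    if (code == 1) {
        return red1;
    }
    return ((8 - code) * red0 + (code - 1) * red1) / 7;
}

// Hypothetical endpoints red0 = 224, red1 = 32 (red0 > red1, so the
// eight-value mode is selected).
static_assert(Interpolate7(224, 32, 2) == (6 * 224 + 1 * 32) / 7); // 196
static_assert(Interpolate7(224, 32, 7) == (1 * 224 + 6 * 32) / 7); // 59

int main() {
    std::printf("code 2 -> %u, code 7 -> %u\n", Interpolate7(224, 32, 2),
                Interpolate7(224, 32, 7));
}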
+
+#pragma once
+
+#include <span>
+
+#include "common/common_types.h"
+#include "video_core/texture_cache/types.h"
+
+namespace VideoCommon {
+
+void DecompressBC4(std::span<const u8> data, Extent3D extent, std::span<u8> output);
+
+} // namespace VideoCommon
diff --git a/src/video_core/texture_cache/descriptor_table.h b/src/video_core/texture_cache/descriptor_table.h
new file mode 100644
index 000000000..3a03b786f
--- /dev/null
+++ b/src/video_core/texture_cache/descriptor_table.h
@@ -0,0 +1,82 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <algorithm>
+#include <vector>
+
+#include "common/common_types.h"
+#include "common/div_ceil.h"
+#include "common/logging/log.h"
+#include "video_core/memory_manager.h"
+#include "video_core/rasterizer_interface.h"
+
+namespace VideoCommon {
+
+template <typename Descriptor>
+class DescriptorTable {
+public:
+    explicit DescriptorTable(Tegra::MemoryManager& gpu_memory_) : gpu_memory{gpu_memory_} {}
+
+    [[nodiscard]] bool Synchronize(GPUVAddr gpu_addr, u32 limit) {
+        [[likely]] if (current_gpu_addr == gpu_addr && current_limit == limit) {
+            return false;
+        }
+        Refresh(gpu_addr, limit);
+        return true;
+    }
+
+    void Invalidate() noexcept {
+        std::ranges::fill(read_descriptors, 0);
+    }
+
+    [[nodiscard]] std::pair<Descriptor, bool> Read(u32 index) {
+        DEBUG_ASSERT(index <= current_limit);
+        const GPUVAddr gpu_addr = current_gpu_addr + index * sizeof(Descriptor);
+        std::pair<Descriptor, bool> result;
+        gpu_memory.ReadBlockUnsafe(gpu_addr, &result.first, sizeof(Descriptor));
+        if (IsDescriptorRead(index)) {
+            result.second = result.first != descriptors[index];
+        } else {
+            MarkDescriptorAsRead(index);
+            result.second = true;
+        }
+        if (result.second) {
+            descriptors[index] = result.first;
+        }
+        return result;
+    }
+
+    [[nodiscard]] u32 Limit() const noexcept {
+        return current_limit;
+    }
+
+private:
+    void Refresh(GPUVAddr gpu_addr, u32 limit) {
+        current_gpu_addr = gpu_addr;
+        current_limit = limit;
+
+        const size_t num_descriptors = static_cast<size_t>(limit) + 1;
+        read_descriptors.clear();
+        read_descriptors.resize(Common::DivCeil(num_descriptors, 64U), 0);
+        descriptors.resize(num_descriptors);
+    }
+
+    void MarkDescriptorAsRead(u32 index) noexcept {
+        read_descriptors[index / 64] |= 1ULL << (index % 64);
+    }
+
+    [[nodiscard]] bool IsDescriptorRead(u32 index) const noexcept {
+        return (read_descriptors[index / 64] & (1ULL << (index % 64))) != 0;
+    }
+
+    Tegra::MemoryManager& gpu_memory;
+    GPUVAddr current_gpu_addr{};
+    u32 current_limit{};
+    std::vector<u64> read_descriptors;
+    std::vector<Descriptor> descriptors;
+};
+
+} // namespace VideoCommon
diff --git a/src/video_core/texture_cache/format_lookup_table.cpp b/src/video_core/texture_cache/format_lookup_table.cpp
index 7d5a75648..ddfb726fe 100644
--- a/src/video_core/texture_cache/format_lookup_table.cpp
+++ b/src/video_core/texture_cache/format_lookup_table.cpp
@@ -2,7 +2,6 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
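DescriptorTable keeps one cached Descriptor per slot plus a bitmap with one bit per slot (64 slots per u64 word); Read() reports a descriptor as changed on its first read, and afterwards only when the bytes fetched from GPU memory differ from the cached copy. The bitmap bookkeeping in isolation, as a self-contained sketch:

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

// The read-tracking bitmap from DescriptorTable, extracted on its own:
// one bit per descriptor slot, packed 64 slots per 64-bit word.
class ReadBitmap {
public:
    explicit ReadBitmap(std::size_t num_slots) : words((num_slots + 63) / 64, 0) {}

    void MarkRead(std::uint32_t index) noexcept {
        words[index / 64] |= std::uint64_t{1} << (index % 64);
    }

    [[nodiscard]] bool IsRead(std::uint32_t index) const noexcept {
        return (words[index / 64] & (std::uint64_t{1} << (index % 64))) != 0;
    }

private:
    std::vector<std::uint64_t> words;
};

int main() {
    ReadBitmap bitmap(130); // hypothetical table limit; needs three words
    bitmap.MarkRead(0);
    bitmap.MarkRead(129);
    std::printf("%d %d %d\n", bitmap.IsRead(0), bitmap.IsRead(1), bitmap.IsRead(129)); // 1 0 1
}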
-#include <array> #include "common/common_types.h" #include "common/logging/log.h" #include "video_core/texture_cache/format_lookup_table.h" @@ -20,198 +19,207 @@ constexpr auto UNORM = ComponentType::UNORM; constexpr auto SINT = ComponentType::SINT; constexpr auto UINT = ComponentType::UINT; constexpr auto FLOAT = ComponentType::FLOAT; -constexpr bool C = false; // Normal color -constexpr bool S = true; // Srgb - -struct Table { - constexpr Table(TextureFormat texture_format, bool is_srgb, ComponentType red_component, - ComponentType green_component, ComponentType blue_component, - ComponentType alpha_component, PixelFormat pixel_format) - : texture_format{texture_format}, pixel_format{pixel_format}, red_component{red_component}, - green_component{green_component}, blue_component{blue_component}, - alpha_component{alpha_component}, is_srgb{is_srgb} {} - - TextureFormat texture_format; - PixelFormat pixel_format; - ComponentType red_component; - ComponentType green_component; - ComponentType blue_component; - ComponentType alpha_component; - bool is_srgb; -}; -constexpr std::array<Table, 86> DefinitionTable = {{ - {TextureFormat::A8R8G8B8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::A8B8G8R8_UNORM}, - {TextureFormat::A8R8G8B8, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::A8B8G8R8_SNORM}, - {TextureFormat::A8R8G8B8, C, UINT, UINT, UINT, UINT, PixelFormat::A8B8G8R8_UINT}, - {TextureFormat::A8R8G8B8, C, SINT, SINT, SINT, SINT, PixelFormat::A8B8G8R8_SINT}, - {TextureFormat::A8R8G8B8, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::A8B8G8R8_SRGB}, - - {TextureFormat::B5G6R5, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::B5G6R5_UNORM}, - - {TextureFormat::A2B10G10R10, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::A2B10G10R10_UNORM}, - {TextureFormat::A2B10G10R10, C, UINT, UINT, UINT, UINT, PixelFormat::A2B10G10R10_UINT}, - - {TextureFormat::A1B5G5R5, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::A1B5G5R5_UNORM}, - - {TextureFormat::A4B4G4R4, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::A4B4G4R4_UNORM}, - - {TextureFormat::R8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::R8_UNORM}, - {TextureFormat::R8, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::R8_SNORM}, - {TextureFormat::R8, C, UINT, UINT, UINT, UINT, PixelFormat::R8_UINT}, - {TextureFormat::R8, C, SINT, SINT, SINT, SINT, PixelFormat::R8_SINT}, - - {TextureFormat::R8G8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::R8G8_UNORM}, - {TextureFormat::R8G8, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::R8G8_SNORM}, - {TextureFormat::R8G8, C, UINT, UINT, UINT, UINT, PixelFormat::R8G8_UINT}, - {TextureFormat::R8G8, C, SINT, SINT, SINT, SINT, PixelFormat::R8G8_SINT}, - - {TextureFormat::R16G16B16A16, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::R16G16B16A16_SNORM}, - {TextureFormat::R16G16B16A16, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::R16G16B16A16_UNORM}, - {TextureFormat::R16G16B16A16, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::R16G16B16A16_FLOAT}, - {TextureFormat::R16G16B16A16, C, UINT, UINT, UINT, UINT, PixelFormat::R16G16B16A16_UINT}, - {TextureFormat::R16G16B16A16, C, SINT, SINT, SINT, SINT, PixelFormat::R16G16B16A16_SINT}, - - {TextureFormat::R16G16, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::R16G16_FLOAT}, - {TextureFormat::R16G16, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::R16G16_UNORM}, - {TextureFormat::R16G16, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::R16G16_SNORM}, - {TextureFormat::R16G16, C, UINT, UINT, UINT, UINT, PixelFormat::R16G16_UINT}, - {TextureFormat::R16G16, C, SINT, SINT, SINT, SINT, PixelFormat::R16G16_SINT}, - - {TextureFormat::R16, 
C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::R16_FLOAT}, - {TextureFormat::R16, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::R16_UNORM}, - {TextureFormat::R16, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::R16_SNORM}, - {TextureFormat::R16, C, UINT, UINT, UINT, UINT, PixelFormat::R16_UINT}, - {TextureFormat::R16, C, SINT, SINT, SINT, SINT, PixelFormat::R16_SINT}, - - {TextureFormat::B10G11R11, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::B10G11R11_FLOAT}, - - {TextureFormat::R32G32B32A32, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::R32G32B32A32_FLOAT}, - {TextureFormat::R32G32B32A32, C, UINT, UINT, UINT, UINT, PixelFormat::R32G32B32A32_UINT}, - {TextureFormat::R32G32B32A32, C, SINT, SINT, SINT, SINT, PixelFormat::R32G32B32A32_SINT}, - - {TextureFormat::R32G32B32, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::R32G32B32_FLOAT}, - - {TextureFormat::R32G32, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::R32G32_FLOAT}, - {TextureFormat::R32G32, C, UINT, UINT, UINT, UINT, PixelFormat::R32G32_UINT}, - {TextureFormat::R32G32, C, SINT, SINT, SINT, SINT, PixelFormat::R32G32_SINT}, - - {TextureFormat::R32, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::R32_FLOAT}, - {TextureFormat::R32, C, UINT, UINT, UINT, UINT, PixelFormat::R32_UINT}, - {TextureFormat::R32, C, SINT, SINT, SINT, SINT, PixelFormat::R32_SINT}, - - {TextureFormat::E5B9G9R9, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::E5B9G9R9_FLOAT}, - - {TextureFormat::D32, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::D32_FLOAT}, - {TextureFormat::D16, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::D16_UNORM}, - {TextureFormat::S8D24, C, UINT, UNORM, UNORM, UNORM, PixelFormat::S8_UINT_D24_UNORM}, - {TextureFormat::R8G24, C, UINT, UNORM, UNORM, UNORM, PixelFormat::S8_UINT_D24_UNORM}, - {TextureFormat::D32S8, C, FLOAT, UINT, UNORM, UNORM, PixelFormat::D32_FLOAT_S8_UINT}, - - {TextureFormat::BC1_RGBA, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::BC1_RGBA_UNORM}, - {TextureFormat::BC1_RGBA, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::BC1_RGBA_SRGB}, - - {TextureFormat::BC2, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::BC2_UNORM}, - {TextureFormat::BC2, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::BC2_SRGB}, - - {TextureFormat::BC3, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::BC3_UNORM}, - {TextureFormat::BC3, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::BC3_SRGB}, - - {TextureFormat::BC4, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::BC4_UNORM}, - {TextureFormat::BC4, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::BC4_SNORM}, - - {TextureFormat::BC5, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::BC5_UNORM}, - {TextureFormat::BC5, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::BC5_SNORM}, - - {TextureFormat::BC7, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::BC7_UNORM}, - {TextureFormat::BC7, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::BC7_SRGB}, - - {TextureFormat::BC6H_SFLOAT, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::BC6H_SFLOAT}, - {TextureFormat::BC6H_UFLOAT, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::BC6H_UFLOAT}, - - {TextureFormat::ASTC_2D_4X4, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_4X4_UNORM}, - {TextureFormat::ASTC_2D_4X4, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_4X4_SRGB}, - - {TextureFormat::ASTC_2D_5X4, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_5X4_UNORM}, - {TextureFormat::ASTC_2D_5X4, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_5X4_SRGB}, - - {TextureFormat::ASTC_2D_5X5, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_5X5_UNORM}, - {TextureFormat::ASTC_2D_5X5, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_5X5_SRGB}, - - 
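The packing above gives each component type three bits (shifts 1, 4, 7, 10) and starts the texture format at bit 13, so no two fields overlap. A compile-time check of that bit budget, with plain integers standing in for the enums:

#include <cstdint>

// Stand-in for the Hash() packing above: 1 sRGB bit, four 3-bit component
// fields, format in bits 13 and up. Values are hypothetical, chosen only
// to exercise the bit layout.
constexpr std::uint32_t Pack(std::uint32_t format, std::uint32_t r, std::uint32_t g,
                             std::uint32_t b, std::uint32_t a, bool srgb) {
    return (srgb ? 1u : 0u) | r << 1 | g << 4 | b << 7 | a << 10 | format << 13;
}

// The fields must not overlap: packing all-maximum values is the same as
// OR-ing each field packed on its own. ComponentType values fit in 3 bits.
static_assert(Pack(3, 7, 7, 7, 7, true) ==
              (Pack(3, 0, 0, 0, 0, false) | Pack(0, 7, 0, 0, 0, false) |
               Pack(0, 0, 7, 0, 0, false) | Pack(0, 0, 0, 7, 0, false) |
               Pack(0, 0, 0, 0, 7, false) | Pack(0, 0, 0, 0, 0, true)));

int main() {}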
{TextureFormat::ASTC_2D_8X8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_8X8_UNORM}, - {TextureFormat::ASTC_2D_8X8, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_8X8_SRGB}, - - {TextureFormat::ASTC_2D_8X5, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_8X5_UNORM}, - {TextureFormat::ASTC_2D_8X5, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_8X5_SRGB}, - - {TextureFormat::ASTC_2D_10X8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_10X8_UNORM}, - {TextureFormat::ASTC_2D_10X8, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_10X8_SRGB}, - - {TextureFormat::ASTC_2D_6X6, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_6X6_UNORM}, - {TextureFormat::ASTC_2D_6X6, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_6X6_SRGB}, - - {TextureFormat::ASTC_2D_10X10, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_10X10_UNORM}, - {TextureFormat::ASTC_2D_10X10, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_10X10_SRGB}, - - {TextureFormat::ASTC_2D_12X12, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_12X12_UNORM}, - {TextureFormat::ASTC_2D_12X12, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_12X12_SRGB}, - - {TextureFormat::ASTC_2D_8X6, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_8X6_UNORM}, - {TextureFormat::ASTC_2D_8X6, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_8X6_SRGB}, +constexpr bool LINEAR = false; +constexpr bool SRGB = true; + +constexpr u32 Hash(TextureFormat format, ComponentType red_component, ComponentType green_component, + ComponentType blue_component, ComponentType alpha_component, bool is_srgb) { + u32 hash = is_srgb ? 1 : 0; + hash |= static_cast<u32>(red_component) << 1; + hash |= static_cast<u32>(green_component) << 4; + hash |= static_cast<u32>(blue_component) << 7; + hash |= static_cast<u32>(alpha_component) << 10; + hash |= static_cast<u32>(format) << 13; + return hash; +} - {TextureFormat::ASTC_2D_6X5, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_6X5_UNORM}, - {TextureFormat::ASTC_2D_6X5, S, UNORM, UNORM, UNORM, UNORM, PixelFormat::ASTC_2D_6X5_SRGB}, -}}; +constexpr u32 Hash(TextureFormat format, ComponentType component, bool is_srgb = LINEAR) { + return Hash(format, component, component, component, component, is_srgb); +} } // Anonymous namespace -FormatLookupTable::FormatLookupTable() { - table.fill(static_cast<u8>(PixelFormat::Invalid)); - - for (const auto& entry : DefinitionTable) { - table[CalculateIndex(entry.texture_format, entry.is_srgb != 0, entry.red_component, - entry.green_component, entry.blue_component, entry.alpha_component)] = - static_cast<u8>(entry.pixel_format); - } -} - -PixelFormat FormatLookupTable::GetPixelFormat(TextureFormat format, bool is_srgb, - ComponentType red_component, - ComponentType green_component, - ComponentType blue_component, - ComponentType alpha_component) const noexcept { - const auto pixel_format = static_cast<PixelFormat>(table[CalculateIndex( - format, is_srgb, red_component, green_component, blue_component, alpha_component)]); - // [[likely]] - if (pixel_format != PixelFormat::Invalid) { - return pixel_format; +PixelFormat PixelFormatFromTextureInfo(TextureFormat format, ComponentType red, ComponentType green, + ComponentType blue, ComponentType alpha, + bool is_srgb) noexcept { + switch (Hash(format, red, green, blue, alpha, is_srgb)) { + case Hash(TextureFormat::A8R8G8B8, UNORM): + return PixelFormat::A8B8G8R8_UNORM; + case Hash(TextureFormat::A8R8G8B8, SNORM): + return PixelFormat::A8B8G8R8_SNORM; + case Hash(TextureFormat::A8R8G8B8, UINT): + return 
PixelFormat::A8B8G8R8_UINT; + case Hash(TextureFormat::A8R8G8B8, SINT): + return PixelFormat::A8B8G8R8_SINT; + case Hash(TextureFormat::A8R8G8B8, UNORM, SRGB): + return PixelFormat::A8B8G8R8_SRGB; + case Hash(TextureFormat::B5G6R5, UNORM): + return PixelFormat::B5G6R5_UNORM; + case Hash(TextureFormat::A2B10G10R10, UNORM): + return PixelFormat::A2B10G10R10_UNORM; + case Hash(TextureFormat::A2B10G10R10, UINT): + return PixelFormat::A2B10G10R10_UINT; + case Hash(TextureFormat::A1B5G5R5, UNORM): + return PixelFormat::A1B5G5R5_UNORM; + case Hash(TextureFormat::A4B4G4R4, UNORM): + return PixelFormat::A4B4G4R4_UNORM; + case Hash(TextureFormat::R8, UNORM): + return PixelFormat::R8_UNORM; + case Hash(TextureFormat::R8, SNORM): + return PixelFormat::R8_SNORM; + case Hash(TextureFormat::R8, UINT): + return PixelFormat::R8_UINT; + case Hash(TextureFormat::R8, SINT): + return PixelFormat::R8_SINT; + case Hash(TextureFormat::R8G8, UNORM): + return PixelFormat::R8G8_UNORM; + case Hash(TextureFormat::R8G8, SNORM): + return PixelFormat::R8G8_SNORM; + case Hash(TextureFormat::R8G8, UINT): + return PixelFormat::R8G8_UINT; + case Hash(TextureFormat::R8G8, SINT): + return PixelFormat::R8G8_SINT; + case Hash(TextureFormat::R16G16B16A16, FLOAT): + return PixelFormat::R16G16B16A16_FLOAT; + case Hash(TextureFormat::R16G16B16A16, UNORM): + return PixelFormat::R16G16B16A16_UNORM; + case Hash(TextureFormat::R16G16B16A16, SNORM): + return PixelFormat::R16G16B16A16_SNORM; + case Hash(TextureFormat::R16G16B16A16, UINT): + return PixelFormat::R16G16B16A16_UINT; + case Hash(TextureFormat::R16G16B16A16, SINT): + return PixelFormat::R16G16B16A16_SINT; + case Hash(TextureFormat::R16G16, FLOAT): + return PixelFormat::R16G16_FLOAT; + case Hash(TextureFormat::R16G16, UNORM): + return PixelFormat::R16G16_UNORM; + case Hash(TextureFormat::R16G16, SNORM): + return PixelFormat::R16G16_SNORM; + case Hash(TextureFormat::R16G16, UINT): + return PixelFormat::R16G16_UINT; + case Hash(TextureFormat::R16G16, SINT): + return PixelFormat::R16G16_SINT; + case Hash(TextureFormat::R16, FLOAT): + return PixelFormat::R16_FLOAT; + case Hash(TextureFormat::R16, UNORM): + return PixelFormat::R16_UNORM; + case Hash(TextureFormat::R16, SNORM): + return PixelFormat::R16_SNORM; + case Hash(TextureFormat::R16, UINT): + return PixelFormat::R16_UINT; + case Hash(TextureFormat::R16, SINT): + return PixelFormat::R16_SINT; + case Hash(TextureFormat::B10G11R11, FLOAT): + return PixelFormat::B10G11R11_FLOAT; + case Hash(TextureFormat::R32G32B32A32, FLOAT): + return PixelFormat::R32G32B32A32_FLOAT; + case Hash(TextureFormat::R32G32B32A32, UINT): + return PixelFormat::R32G32B32A32_UINT; + case Hash(TextureFormat::R32G32B32A32, SINT): + return PixelFormat::R32G32B32A32_SINT; + case Hash(TextureFormat::R32G32B32, FLOAT): + return PixelFormat::R32G32B32_FLOAT; + case Hash(TextureFormat::R32G32, FLOAT): + return PixelFormat::R32G32_FLOAT; + case Hash(TextureFormat::R32G32, UINT): + return PixelFormat::R32G32_UINT; + case Hash(TextureFormat::R32G32, SINT): + return PixelFormat::R32G32_SINT; + case Hash(TextureFormat::R32, FLOAT): + return PixelFormat::R32_FLOAT; + case Hash(TextureFormat::R32, UINT): + return PixelFormat::R32_UINT; + case Hash(TextureFormat::R32, SINT): + return PixelFormat::R32_SINT; + case Hash(TextureFormat::E5B9G9R9, FLOAT): + return PixelFormat::E5B9G9R9_FLOAT; + case Hash(TextureFormat::D32, FLOAT): + return PixelFormat::D32_FLOAT; + case Hash(TextureFormat::D16, UNORM): + return PixelFormat::D16_UNORM; + case Hash(TextureFormat::S8D24, UINT, 
UNORM, UNORM, UNORM, LINEAR): + return PixelFormat::S8_UINT_D24_UNORM; + case Hash(TextureFormat::R8G24, UINT, UNORM, UNORM, UNORM, LINEAR): + return PixelFormat::S8_UINT_D24_UNORM; + case Hash(TextureFormat::D32S8, FLOAT, UINT, UNORM, UNORM, LINEAR): + return PixelFormat::D32_FLOAT_S8_UINT; + case Hash(TextureFormat::BC1_RGBA, UNORM, LINEAR): + return PixelFormat::BC1_RGBA_UNORM; + case Hash(TextureFormat::BC1_RGBA, UNORM, SRGB): + return PixelFormat::BC1_RGBA_SRGB; + case Hash(TextureFormat::BC2, UNORM, LINEAR): + return PixelFormat::BC2_UNORM; + case Hash(TextureFormat::BC2, UNORM, SRGB): + return PixelFormat::BC2_SRGB; + case Hash(TextureFormat::BC3, UNORM, LINEAR): + return PixelFormat::BC3_UNORM; + case Hash(TextureFormat::BC3, UNORM, SRGB): + return PixelFormat::BC3_SRGB; + case Hash(TextureFormat::BC4, UNORM): + return PixelFormat::BC4_UNORM; + case Hash(TextureFormat::BC4, SNORM): + return PixelFormat::BC4_SNORM; + case Hash(TextureFormat::BC5, UNORM): + return PixelFormat::BC5_UNORM; + case Hash(TextureFormat::BC5, SNORM): + return PixelFormat::BC5_SNORM; + case Hash(TextureFormat::BC7, UNORM, LINEAR): + return PixelFormat::BC7_UNORM; + case Hash(TextureFormat::BC7, UNORM, SRGB): + return PixelFormat::BC7_SRGB; + case Hash(TextureFormat::BC6H_SFLOAT, FLOAT): + return PixelFormat::BC6H_SFLOAT; + case Hash(TextureFormat::BC6H_UFLOAT, FLOAT): + return PixelFormat::BC6H_UFLOAT; + case Hash(TextureFormat::ASTC_2D_4X4, UNORM, LINEAR): + return PixelFormat::ASTC_2D_4X4_UNORM; + case Hash(TextureFormat::ASTC_2D_4X4, UNORM, SRGB): + return PixelFormat::ASTC_2D_4X4_SRGB; + case Hash(TextureFormat::ASTC_2D_5X4, UNORM, LINEAR): + return PixelFormat::ASTC_2D_5X4_UNORM; + case Hash(TextureFormat::ASTC_2D_5X4, UNORM, SRGB): + return PixelFormat::ASTC_2D_5X4_SRGB; + case Hash(TextureFormat::ASTC_2D_5X5, UNORM, LINEAR): + return PixelFormat::ASTC_2D_5X5_UNORM; + case Hash(TextureFormat::ASTC_2D_5X5, UNORM, SRGB): + return PixelFormat::ASTC_2D_5X5_SRGB; + case Hash(TextureFormat::ASTC_2D_8X8, UNORM, LINEAR): + return PixelFormat::ASTC_2D_8X8_UNORM; + case Hash(TextureFormat::ASTC_2D_8X8, UNORM, SRGB): + return PixelFormat::ASTC_2D_8X8_SRGB; + case Hash(TextureFormat::ASTC_2D_8X5, UNORM, LINEAR): + return PixelFormat::ASTC_2D_8X5_UNORM; + case Hash(TextureFormat::ASTC_2D_8X5, UNORM, SRGB): + return PixelFormat::ASTC_2D_8X5_SRGB; + case Hash(TextureFormat::ASTC_2D_10X8, UNORM, LINEAR): + return PixelFormat::ASTC_2D_10X8_UNORM; + case Hash(TextureFormat::ASTC_2D_10X8, UNORM, SRGB): + return PixelFormat::ASTC_2D_10X8_SRGB; + case Hash(TextureFormat::ASTC_2D_6X6, UNORM, LINEAR): + return PixelFormat::ASTC_2D_6X6_UNORM; + case Hash(TextureFormat::ASTC_2D_6X6, UNORM, SRGB): + return PixelFormat::ASTC_2D_6X6_SRGB; + case Hash(TextureFormat::ASTC_2D_10X10, UNORM, LINEAR): + return PixelFormat::ASTC_2D_10X10_UNORM; + case Hash(TextureFormat::ASTC_2D_10X10, UNORM, SRGB): + return PixelFormat::ASTC_2D_10X10_SRGB; + case Hash(TextureFormat::ASTC_2D_12X12, UNORM, LINEAR): + return PixelFormat::ASTC_2D_12X12_UNORM; + case Hash(TextureFormat::ASTC_2D_12X12, UNORM, SRGB): + return PixelFormat::ASTC_2D_12X12_SRGB; + case Hash(TextureFormat::ASTC_2D_8X6, UNORM, LINEAR): + return PixelFormat::ASTC_2D_8X6_UNORM; + case Hash(TextureFormat::ASTC_2D_8X6, UNORM, SRGB): + return PixelFormat::ASTC_2D_8X6_SRGB; + case Hash(TextureFormat::ASTC_2D_6X5, UNORM, LINEAR): + return PixelFormat::ASTC_2D_6X5_UNORM; + case Hash(TextureFormat::ASTC_2D_6X5, UNORM, SRGB): + return PixelFormat::ASTC_2D_6X5_SRGB; } 
UNIMPLEMENTED_MSG("texture format={} srgb={} components={{{} {} {} {}}}", - static_cast<int>(format), is_srgb, static_cast<int>(red_component), - static_cast<int>(green_component), static_cast<int>(blue_component), - static_cast<int>(alpha_component)); + static_cast<int>(format), is_srgb, static_cast<int>(red), + static_cast<int>(green), static_cast<int>(blue), static_cast<int>(alpha)); return PixelFormat::A8B8G8R8_UNORM; } -void FormatLookupTable::Set(TextureFormat format, bool is_srgb, ComponentType red_component, - ComponentType green_component, ComponentType blue_component, - ComponentType alpha_component, PixelFormat pixel_format) {} - -std::size_t FormatLookupTable::CalculateIndex(TextureFormat format, bool is_srgb, - ComponentType red_component, - ComponentType green_component, - ComponentType blue_component, - ComponentType alpha_component) noexcept { - const auto format_index = static_cast<std::size_t>(format); - const auto red_index = static_cast<std::size_t>(red_component); - const auto green_index = static_cast<std::size_t>(green_component); - const auto blue_index = static_cast<std::size_t>(blue_component); - const auto alpha_index = static_cast<std::size_t>(alpha_component); - const std::size_t srgb_index = is_srgb ? 1 : 0; - - return format_index * PerFormat + - srgb_index * PerComponent * PerComponent * PerComponent * PerComponent + - alpha_index * PerComponent * PerComponent * PerComponent + - blue_index * PerComponent * PerComponent + green_index * PerComponent + red_index; -} - } // namespace VideoCommon diff --git a/src/video_core/texture_cache/format_lookup_table.h b/src/video_core/texture_cache/format_lookup_table.h index aa77e0a5a..729533999 100644 --- a/src/video_core/texture_cache/format_lookup_table.h +++ b/src/video_core/texture_cache/format_lookup_table.h @@ -4,48 +4,14 @@ #pragma once -#include <array> -#include <limits> #include "video_core/surface.h" #include "video_core/textures/texture.h" namespace VideoCommon { -class FormatLookupTable { -public: - explicit FormatLookupTable(); - - VideoCore::Surface::PixelFormat GetPixelFormat( - Tegra::Texture::TextureFormat format, bool is_srgb, - Tegra::Texture::ComponentType red_component, Tegra::Texture::ComponentType green_component, - Tegra::Texture::ComponentType blue_component, - Tegra::Texture::ComponentType alpha_component) const noexcept; - -private: - static_assert(VideoCore::Surface::MaxPixelFormat <= std::numeric_limits<u8>::max()); - - static constexpr std::size_t NumTextureFormats = 128; - - static constexpr std::size_t PerComponent = 8; - static constexpr std::size_t PerComponents2 = PerComponent * PerComponent; - static constexpr std::size_t PerComponents3 = PerComponents2 * PerComponent; - static constexpr std::size_t PerComponents4 = PerComponents3 * PerComponent; - static constexpr std::size_t PerFormat = PerComponents4 * 2; - - static std::size_t CalculateIndex(Tegra::Texture::TextureFormat format, bool is_srgb, - Tegra::Texture::ComponentType red_component, - Tegra::Texture::ComponentType green_component, - Tegra::Texture::ComponentType blue_component, - Tegra::Texture::ComponentType alpha_component) noexcept; - - void Set(Tegra::Texture::TextureFormat format, bool is_srgb, - Tegra::Texture::ComponentType red_component, - Tegra::Texture::ComponentType green_component, - Tegra::Texture::ComponentType blue_component, - Tegra::Texture::ComponentType alpha_component, - VideoCore::Surface::PixelFormat pixel_format); - - std::array<u8, NumTextureFormats * PerFormat> table; -}; 
+VideoCore::Surface::PixelFormat PixelFormatFromTextureInfo( + Tegra::Texture::TextureFormat format, Tegra::Texture::ComponentType red_component, + Tegra::Texture::ComponentType green_component, Tegra::Texture::ComponentType blue_component, + Tegra::Texture::ComponentType alpha_component, bool is_srgb) noexcept; } // namespace VideoCommon diff --git a/src/video_core/texture_cache/formatter.cpp b/src/video_core/texture_cache/formatter.cpp new file mode 100644 index 000000000..d10ba4ccd --- /dev/null +++ b/src/video_core/texture_cache/formatter.cpp @@ -0,0 +1,95 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <algorithm> +#include <string> + +#include "video_core/texture_cache/formatter.h" +#include "video_core/texture_cache/image_base.h" +#include "video_core/texture_cache/image_info.h" +#include "video_core/texture_cache/image_view_base.h" +#include "video_core/texture_cache/render_targets.h" + +namespace VideoCommon { + +std::string Name(const ImageBase& image) { + const GPUVAddr gpu_addr = image.gpu_addr; + const ImageInfo& info = image.info; + const u32 width = info.size.width; + const u32 height = info.size.height; + const u32 depth = info.size.depth; + const u32 num_layers = image.info.resources.layers; + const u32 num_levels = image.info.resources.levels; + std::string resource; + if (num_layers > 1) { + resource += fmt::format(":L{}", num_layers); + } + if (num_levels > 1) { + resource += fmt::format(":M{}", num_levels); + } + switch (image.info.type) { + case ImageType::e1D: + return fmt::format("Image 1D 0x{:x} {}{}", gpu_addr, width, resource); + case ImageType::e2D: + return fmt::format("Image 2D 0x{:x} {}x{}{}", gpu_addr, width, height, resource); + case ImageType::e3D: + return fmt::format("Image 3D 0x{:x} {}x{}x{}{}", gpu_addr, width, height, depth, resource); + case ImageType::Linear: + return fmt::format("Image Linear 0x{:x} {}x{}", gpu_addr, width, height); + case ImageType::Buffer: + return fmt::format("Buffer 0x{:x} {}", image.gpu_addr, image.info.size.width); + } + return "Invalid"; +} + +std::string Name(const ImageViewBase& image_view, std::optional<ImageViewType> type) { + const u32 width = image_view.size.width; + const u32 height = image_view.size.height; + const u32 depth = image_view.size.depth; + const u32 num_levels = image_view.range.extent.levels; + const u32 num_layers = image_view.range.extent.layers; + + const std::string level = num_levels > 1 ?
fmt::format(":{}", num_levels) : ""; + switch (type.value_or(image_view.type)) { + case ImageViewType::e1D: + return fmt::format("ImageView 1D {}{}", width, level); + case ImageViewType::e2D: + return fmt::format("ImageView 2D {}x{}{}", width, height, level); + case ImageViewType::Cube: + return fmt::format("ImageView Cube {}x{}{}", width, height, level); + case ImageViewType::e3D: + return fmt::format("ImageView 3D {}x{}x{}{}", width, height, depth, level); + case ImageViewType::e1DArray: + return fmt::format("ImageView 1DArray {}{}|{}", width, level, num_layers); + case ImageViewType::e2DArray: + return fmt::format("ImageView 2DArray {}x{}{}|{}", width, height, level, num_layers); + case ImageViewType::CubeArray: + return fmt::format("ImageView CubeArray {}x{}{}|{}", width, height, level, num_layers); + case ImageViewType::Rect: + return fmt::format("ImageView Rect {}x{}{}", width, height, level); + case ImageViewType::Buffer: + return fmt::format("BufferView {}", width); + } + return "Invalid"; +} + +std::string Name(const RenderTargets& render_targets) { + std::string_view debug_prefix; + const auto num_color = std::ranges::count_if( + render_targets.color_buffer_ids, [](ImageViewId id) { return static_cast<bool>(id); }); + if (render_targets.depth_buffer_id) { + debug_prefix = num_color > 0 ? "R" : "Z"; + } else { + debug_prefix = num_color > 0 ? "C" : "X"; + } + const Extent2D size = render_targets.size; + if (num_color > 0) { + return fmt::format("Framebuffer {}{} {}x{}", debug_prefix, num_color, size.width, + size.height); + } else { + return fmt::format("Framebuffer {} {}x{}", debug_prefix, size.width, size.height); + } +} + +} // namespace VideoCommon diff --git a/src/video_core/texture_cache/formatter.h b/src/video_core/texture_cache/formatter.h new file mode 100644 index 000000000..a48413983 --- /dev/null +++ b/src/video_core/texture_cache/formatter.h @@ -0,0 +1,263 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#pragma once + +#include <string> + +#include <fmt/format.h> + +#include "video_core/surface.h" +#include "video_core/texture_cache/types.h" + +template <> +struct fmt::formatter<VideoCore::Surface::PixelFormat> : fmt::formatter<fmt::string_view> { + template <typename FormatContext> + auto format(VideoCore::Surface::PixelFormat format, FormatContext& ctx) { + using VideoCore::Surface::PixelFormat; + const string_view name = [format] { + switch (format) { + case PixelFormat::A8B8G8R8_UNORM: + return "A8B8G8R8_UNORM"; + case PixelFormat::A8B8G8R8_SNORM: + return "A8B8G8R8_SNORM"; + case PixelFormat::A8B8G8R8_SINT: + return "A8B8G8R8_SINT"; + case PixelFormat::A8B8G8R8_UINT: + return "A8B8G8R8_UINT"; + case PixelFormat::R5G6B5_UNORM: + return "R5G6B5_UNORM"; + case PixelFormat::B5G6R5_UNORM: + return "B5G6R5_UNORM"; + case PixelFormat::A1R5G5B5_UNORM: + return "A1R5G5B5_UNORM"; + case PixelFormat::A2B10G10R10_UNORM: + return "A2B10G10R10_UNORM"; + case PixelFormat::A2B10G10R10_UINT: + return "A2B10G10R10_UINT"; + case PixelFormat::A1B5G5R5_UNORM: + return "A1B5G5R5_UNORM"; + case PixelFormat::R8_UNORM: + return "R8_UNORM"; + case PixelFormat::R8_SNORM: + return "R8_SNORM"; + case PixelFormat::R8_SINT: + return "R8_SINT"; + case PixelFormat::R8_UINT: + return "R8_UINT"; + case PixelFormat::R16G16B16A16_FLOAT: + return "R16G16B16A16_FLOAT"; + case PixelFormat::R16G16B16A16_UNORM: + return "R16G16B16A16_UNORM"; + case PixelFormat::R16G16B16A16_SNORM: + return "R16G16B16A16_SNORM"; + case PixelFormat::R16G16B16A16_SINT: + return "R16G16B16A16_SINT"; + case PixelFormat::R16G16B16A16_UINT: + return "R16G16B16A16_UINT"; + case PixelFormat::B10G11R11_FLOAT: + return "B10G11R11_FLOAT"; + case PixelFormat::R32G32B32A32_UINT: + return "R32G32B32A32_UINT"; + case PixelFormat::BC1_RGBA_UNORM: + return "BC1_RGBA_UNORM"; + case PixelFormat::BC2_UNORM: + return "BC2_UNORM"; + case PixelFormat::BC3_UNORM: + return "BC3_UNORM"; + case PixelFormat::BC4_UNORM: + return "BC4_UNORM"; + case PixelFormat::BC4_SNORM: + return "BC4_SNORM"; + case PixelFormat::BC5_UNORM: + return "BC5_UNORM"; + case PixelFormat::BC5_SNORM: + return "BC5_SNORM"; + case PixelFormat::BC7_UNORM: + return "BC7_UNORM"; + case PixelFormat::BC6H_UFLOAT: + return "BC6H_UFLOAT"; + case PixelFormat::BC6H_SFLOAT: + return "BC6H_SFLOAT"; + case PixelFormat::ASTC_2D_4X4_UNORM: + return "ASTC_2D_4X4_UNORM"; + case PixelFormat::B8G8R8A8_UNORM: + return "B8G8R8A8_UNORM"; + case PixelFormat::R32G32B32A32_FLOAT: + return "R32G32B32A32_FLOAT"; + case PixelFormat::R32G32B32A32_SINT: + return "R32G32B32A32_SINT"; + case PixelFormat::R32G32_FLOAT: + return "R32G32_FLOAT"; + case PixelFormat::R32G32_SINT: + return "R32G32_SINT"; + case PixelFormat::R32_FLOAT: + return "R32_FLOAT"; + case PixelFormat::R16_FLOAT: + return "R16_FLOAT"; + case PixelFormat::R16_UNORM: + return "R16_UNORM"; + case PixelFormat::R16_SNORM: + return "R16_SNORM"; + case PixelFormat::R16_UINT: + return "R16_UINT"; + case PixelFormat::R16_SINT: + return "R16_SINT"; + case PixelFormat::R16G16_UNORM: + return "R16G16_UNORM"; + case PixelFormat::R16G16_FLOAT: + return "R16G16_FLOAT"; + case PixelFormat::R16G16_UINT: + return "R16G16_UINT"; + case PixelFormat::R16G16_SINT: + return "R16G16_SINT"; + case PixelFormat::R16G16_SNORM: + return "R16G16_SNORM"; + case PixelFormat::R32G32B32_FLOAT: + return "R32G32B32_FLOAT"; + case PixelFormat::A8B8G8R8_SRGB: + return "A8B8G8R8_SRGB"; + case PixelFormat::R8G8_UNORM: + return "R8G8_UNORM"; + case PixelFormat::R8G8_SNORM: + return "R8G8_SNORM"; + 
case PixelFormat::R8G8_SINT: + return "R8G8_SINT"; + case PixelFormat::R8G8_UINT: + return "R8G8_UINT"; + case PixelFormat::R32G32_UINT: + return "R32G32_UINT"; + case PixelFormat::R16G16B16X16_FLOAT: + return "R16G16B16X16_FLOAT"; + case PixelFormat::R32_UINT: + return "R32_UINT"; + case PixelFormat::R32_SINT: + return "R32_SINT"; + case PixelFormat::ASTC_2D_8X8_UNORM: + return "ASTC_2D_8X8_UNORM"; + case PixelFormat::ASTC_2D_8X5_UNORM: + return "ASTC_2D_8X5_UNORM"; + case PixelFormat::ASTC_2D_5X4_UNORM: + return "ASTC_2D_5X4_UNORM"; + case PixelFormat::B8G8R8A8_SRGB: + return "B8G8R8A8_SRGB"; + case PixelFormat::BC1_RGBA_SRGB: + return "BC1_RGBA_SRGB"; + case PixelFormat::BC2_SRGB: + return "BC2_SRGB"; + case PixelFormat::BC3_SRGB: + return "BC3_SRGB"; + case PixelFormat::BC7_SRGB: + return "BC7_SRGB"; + case PixelFormat::A4B4G4R4_UNORM: + return "A4B4G4R4_UNORM"; + case PixelFormat::ASTC_2D_4X4_SRGB: + return "ASTC_2D_4X4_SRGB"; + case PixelFormat::ASTC_2D_8X8_SRGB: + return "ASTC_2D_8X8_SRGB"; + case PixelFormat::ASTC_2D_8X5_SRGB: + return "ASTC_2D_8X5_SRGB"; + case PixelFormat::ASTC_2D_5X4_SRGB: + return "ASTC_2D_5X4_SRGB"; + case PixelFormat::ASTC_2D_5X5_UNORM: + return "ASTC_2D_5X5_UNORM"; + case PixelFormat::ASTC_2D_5X5_SRGB: + return "ASTC_2D_5X5_SRGB"; + case PixelFormat::ASTC_2D_10X8_UNORM: + return "ASTC_2D_10X8_UNORM"; + case PixelFormat::ASTC_2D_10X8_SRGB: + return "ASTC_2D_10X8_SRGB"; + case PixelFormat::ASTC_2D_6X6_UNORM: + return "ASTC_2D_6X6_UNORM"; + case PixelFormat::ASTC_2D_6X6_SRGB: + return "ASTC_2D_6X6_SRGB"; + case PixelFormat::ASTC_2D_10X10_UNORM: + return "ASTC_2D_10X10_UNORM"; + case PixelFormat::ASTC_2D_10X10_SRGB: + return "ASTC_2D_10X10_SRGB"; + case PixelFormat::ASTC_2D_12X12_UNORM: + return "ASTC_2D_12X12_UNORM"; + case PixelFormat::ASTC_2D_12X12_SRGB: + return "ASTC_2D_12X12_SRGB"; + case PixelFormat::ASTC_2D_8X6_UNORM: + return "ASTC_2D_8X6_UNORM"; + case PixelFormat::ASTC_2D_8X6_SRGB: + return "ASTC_2D_8X6_SRGB"; + case PixelFormat::ASTC_2D_6X5_UNORM: + return "ASTC_2D_6X5_UNORM"; + case PixelFormat::ASTC_2D_6X5_SRGB: + return "ASTC_2D_6X5_SRGB"; + case PixelFormat::E5B9G9R9_FLOAT: + return "E5B9G9R9_FLOAT"; + case PixelFormat::D32_FLOAT: + return "D32_FLOAT"; + case PixelFormat::D16_UNORM: + return "D16_UNORM"; + case PixelFormat::D24_UNORM_S8_UINT: + return "D24_UNORM_S8_UINT"; + case PixelFormat::S8_UINT_D24_UNORM: + return "S8_UINT_D24_UNORM"; + case PixelFormat::D32_FLOAT_S8_UINT: + return "D32_FLOAT_S8_UINT"; + case PixelFormat::MaxDepthStencilFormat: + case PixelFormat::Invalid: + return "Invalid"; + } + return "Invalid"; + }(); + return formatter<string_view>::format(name, ctx); + } +}; + +template <> +struct fmt::formatter<VideoCommon::ImageType> : fmt::formatter<fmt::string_view> { + template <typename FormatContext> + auto format(VideoCommon::ImageType type, FormatContext& ctx) { + const string_view name = [type] { + using VideoCommon::ImageType; + switch (type) { + case ImageType::e1D: + return "1D"; + case ImageType::e2D: + return "2D"; + case ImageType::e3D: + return "3D"; + case ImageType::Linear: + return "Linear"; + case ImageType::Buffer: + return "Buffer"; + } + return "Invalid"; + }(); + return formatter<string_view>::format(name, ctx); + } +}; + +template <> +struct fmt::formatter<VideoCommon::Extent3D> { + constexpr auto parse(fmt::format_parse_context& ctx) { + return ctx.begin(); + } + + template <typename FormatContext> + auto format(const VideoCommon::Extent3D& extent, FormatContext& ctx) { + return fmt::format_to(ctx.out(), 
"{{{}, {}, {}}}", extent.width, extent.height, + extent.depth); + } +}; + +namespace VideoCommon { + +struct ImageBase; +struct ImageViewBase; +struct RenderTargets; + +[[nodiscard]] std::string Name(const ImageBase& image); + +[[nodiscard]] std::string Name(const ImageViewBase& image_view, + std::optional<ImageViewType> type = std::nullopt); + +[[nodiscard]] std::string Name(const RenderTargets& render_targets); + +} // namespace VideoCommon diff --git a/src/video_core/texture_cache/image_base.cpp b/src/video_core/texture_cache/image_base.cpp new file mode 100644 index 000000000..959b3f115 --- /dev/null +++ b/src/video_core/texture_cache/image_base.cpp @@ -0,0 +1,218 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <algorithm> +#include <optional> +#include <utility> +#include <vector> + +#include "common/common_types.h" +#include "video_core/surface.h" +#include "video_core/texture_cache/formatter.h" +#include "video_core/texture_cache/image_base.h" +#include "video_core/texture_cache/image_view_info.h" +#include "video_core/texture_cache/util.h" + +namespace VideoCommon { + +using VideoCore::Surface::DefaultBlockHeight; +using VideoCore::Surface::DefaultBlockWidth; + +namespace { +/// Returns the base layer and mip level offset +[[nodiscard]] std::pair<s32, s32> LayerMipOffset(s32 diff, u32 layer_stride) { + if (layer_stride == 0) { + return {0, diff}; + } else { + return {diff / layer_stride, diff % layer_stride}; + } +} + +[[nodiscard]] bool ValidateLayers(const SubresourceLayers& layers, const ImageInfo& info) { + return layers.base_level < info.resources.levels && + layers.base_layer + layers.num_layers <= info.resources.layers; +} + +[[nodiscard]] bool ValidateCopy(const ImageCopy& copy, const ImageInfo& dst, const ImageInfo& src) { + const Extent3D src_size = MipSize(src.size, copy.src_subresource.base_level); + const Extent3D dst_size = MipSize(dst.size, copy.dst_subresource.base_level); + if (!ValidateLayers(copy.src_subresource, src)) { + return false; + } + if (!ValidateLayers(copy.dst_subresource, dst)) { + return false; + } + if (copy.src_offset.x + copy.extent.width > src_size.width || + copy.src_offset.y + copy.extent.height > src_size.height || + copy.src_offset.z + copy.extent.depth > src_size.depth) { + return false; + } + if (copy.dst_offset.x + copy.extent.width > dst_size.width || + copy.dst_offset.y + copy.extent.height > dst_size.height || + copy.dst_offset.z + copy.extent.depth > dst_size.depth) { + return false; + } + return true; +} +} // Anonymous namespace + +ImageBase::ImageBase(const ImageInfo& info_, GPUVAddr gpu_addr_, VAddr cpu_addr_) + : info{info_}, guest_size_bytes{CalculateGuestSizeInBytes(info)}, + unswizzled_size_bytes{CalculateUnswizzledSizeBytes(info)}, + converted_size_bytes{CalculateConvertedSizeBytes(info)}, gpu_addr{gpu_addr_}, + cpu_addr{cpu_addr_}, cpu_addr_end{cpu_addr + guest_size_bytes}, + mip_level_offsets{CalculateMipLevelOffsets(info)} { + if (info.type == ImageType::e3D) { + slice_offsets = CalculateSliceOffsets(info); + slice_subresources = CalculateSliceSubresources(info); + } +} + +std::optional<SubresourceBase> ImageBase::TryFindBase(GPUVAddr other_addr) const noexcept { + if (other_addr < gpu_addr) { + // Subresource address can't be lower than the base + return std::nullopt; + } + const u32 diff = static_cast<u32>(other_addr - gpu_addr); + if (diff > guest_size_bytes) { + // This can happen when two CPU addresses are used for 
different GPU addresses + return std::nullopt; + } + if (info.type != ImageType::e3D) { + const auto [layer, mip_offset] = LayerMipOffset(diff, info.layer_stride); + const auto end = mip_level_offsets.begin() + info.resources.levels; + const auto it = std::find(mip_level_offsets.begin(), end, mip_offset); + if (layer > info.resources.layers || it == end) { + return std::nullopt; + } + return SubresourceBase{ + .level = static_cast<s32>(std::distance(mip_level_offsets.begin(), it)), + .layer = layer, + }; + } else { + // TODO: Consider using binary_search after a threshold + const auto it = std::ranges::find(slice_offsets, diff); + if (it == slice_offsets.cend()) { + return std::nullopt; + } + return slice_subresources[std::distance(slice_offsets.begin(), it)]; + } +} + +ImageViewId ImageBase::FindView(const ImageViewInfo& view_info) const noexcept { + const auto it = std::ranges::find(image_view_infos, view_info); + if (it == image_view_infos.end()) { + return ImageViewId{}; + } + return image_view_ids[std::distance(image_view_infos.begin(), it)]; +} + +void ImageBase::InsertView(const ImageViewInfo& view_info, ImageViewId image_view_id) { + image_view_infos.push_back(view_info); + image_view_ids.push_back(image_view_id); +} + +void AddImageAlias(ImageBase& lhs, ImageBase& rhs, ImageId lhs_id, ImageId rhs_id) { + static constexpr auto OPTIONS = RelaxedOptions::Size | RelaxedOptions::Format; + ASSERT(lhs.info.type == rhs.info.type); + std::optional<SubresourceBase> base; + if (lhs.info.type == ImageType::Linear) { + base = SubresourceBase{.level = 0, .layer = 0}; + } else { + // We are passing relaxed formats as an option, having broken views or not won't matter + static constexpr bool broken_views = false; + base = FindSubresource(rhs.info, lhs, rhs.gpu_addr, OPTIONS, broken_views); + } + if (!base) { + LOG_ERROR(HW_GPU, "Image alias should have been flipped"); + return; + } + const PixelFormat lhs_format = lhs.info.format; + const PixelFormat rhs_format = rhs.info.format; + const Extent2D lhs_block{ + .width = DefaultBlockWidth(lhs_format), + .height = DefaultBlockHeight(lhs_format), + }; + const Extent2D rhs_block{ + .width = DefaultBlockWidth(rhs_format), + .height = DefaultBlockHeight(rhs_format), + }; + const bool is_lhs_compressed = lhs_block.width > 1 || lhs_block.height > 1; + const bool is_rhs_compressed = rhs_block.width > 1 || rhs_block.height > 1; + if (is_lhs_compressed && is_rhs_compressed) { + LOG_ERROR(HW_GPU, "Compressed to compressed image aliasing is not implemented"); + return; + } + const s32 lhs_mips = lhs.info.resources.levels; + const s32 rhs_mips = rhs.info.resources.levels; + const s32 num_mips = std::min(lhs_mips - base->level, rhs_mips); + AliasedImage lhs_alias; + AliasedImage rhs_alias; + lhs_alias.id = rhs_id; + rhs_alias.id = lhs_id; + lhs_alias.copies.reserve(num_mips); + rhs_alias.copies.reserve(num_mips); + for (s32 mip_level = 0; mip_level < num_mips; ++mip_level) { + Extent3D lhs_size = MipSize(lhs.info.size, base->level + mip_level); + Extent3D rhs_size = MipSize(rhs.info.size, mip_level); + if (is_lhs_compressed) { + lhs_size.width /= lhs_block.width; + lhs_size.height /= lhs_block.height; + } + if (is_rhs_compressed) { + rhs_size.width /= rhs_block.width; + rhs_size.height /= rhs_block.height; + } + const Extent3D copy_size{ + .width = std::min(lhs_size.width, rhs_size.width), + .height = std::min(lhs_size.height, rhs_size.height), + .depth = std::min(lhs_size.depth, rhs_size.depth), + }; + if (copy_size.width == 0 || copy_size.height == 0) { + 
LOG_WARNING(HW_GPU, "Copy size is smaller than block size. Mip cannot be aliased."); + continue; + } + const bool is_lhs_3d = lhs.info.type == ImageType::e3D; + const bool is_rhs_3d = rhs.info.type == ImageType::e3D; + const Offset3D lhs_offset{0, 0, 0}; + const Offset3D rhs_offset{0, 0, is_rhs_3d ? base->layer : 0}; + const s32 lhs_layers = is_lhs_3d ? 1 : lhs.info.resources.layers - base->layer; + const s32 rhs_layers = is_rhs_3d ? 1 : rhs.info.resources.layers; + const s32 num_layers = std::min(lhs_layers, rhs_layers); + const SubresourceLayers lhs_subresource{ + .base_level = mip_level, + .base_layer = 0, + .num_layers = num_layers, + }; + const SubresourceLayers rhs_subresource{ + .base_level = base->level + mip_level, + .base_layer = is_rhs_3d ? 0 : base->layer, + .num_layers = num_layers, + }; + [[maybe_unused]] const ImageCopy& to_lhs_copy = lhs_alias.copies.emplace_back(ImageCopy{ + .src_subresource = lhs_subresource, + .dst_subresource = rhs_subresource, + .src_offset = lhs_offset, + .dst_offset = rhs_offset, + .extent = copy_size, + }); + [[maybe_unused]] const ImageCopy& to_rhs_copy = rhs_alias.copies.emplace_back(ImageCopy{ + .src_subresource = rhs_subresource, + .dst_subresource = lhs_subresource, + .src_offset = rhs_offset, + .dst_offset = lhs_offset, + .extent = copy_size, + }); + ASSERT_MSG(ValidateCopy(to_lhs_copy, lhs.info, rhs.info), "Invalid RHS to LHS copy"); + ASSERT_MSG(ValidateCopy(to_rhs_copy, rhs.info, lhs.info), "Invalid LHS to RHS copy"); + } + ASSERT(lhs_alias.copies.empty() == rhs_alias.copies.empty()); + if (lhs_alias.copies.empty()) { + return; + } + lhs.aliased_images.push_back(std::move(lhs_alias)); + rhs.aliased_images.push_back(std::move(rhs_alias)); +} + +} // namespace VideoCommon diff --git a/src/video_core/texture_cache/image_base.h b/src/video_core/texture_cache/image_base.h new file mode 100644 index 000000000..b7f3b7e43 --- /dev/null +++ b/src/video_core/texture_cache/image_base.h @@ -0,0 +1,83 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#pragma once + +#include <array> +#include <optional> +#include <vector> + +#include "common/common_funcs.h" +#include "common/common_types.h" +#include "video_core/texture_cache/image_info.h" +#include "video_core/texture_cache/image_view_info.h" +#include "video_core/texture_cache/types.h" + +namespace VideoCommon { + +enum class ImageFlagBits : u32 { + AcceleratedUpload = 1 << 0, ///< Upload can be accelerated in the GPU + Converted = 1 << 1, ///< Guest format is not supported natively and it has to be converted + CpuModified = 1 << 2, ///< Contents have been modified from the CPU + GpuModified = 1 << 3, ///< Contents have been modified from the GPU + Tracked = 1 << 4, ///< Writes and reads are being hooked from the CPU JIT + Strong = 1 << 5, ///< Exists in the image table, the dimensions can be trusted + Registered = 1 << 6, ///< True when the image is registered + Picked = 1 << 7, ///< Temporary flag to mark the image as picked +}; +DECLARE_ENUM_FLAG_OPERATORS(ImageFlagBits) + +struct ImageViewInfo; + +struct AliasedImage { + std::vector<ImageCopy> copies; + ImageId id; +}; + +struct ImageBase { + explicit ImageBase(const ImageInfo& info, GPUVAddr gpu_addr, VAddr cpu_addr); + + [[nodiscard]] std::optional<SubresourceBase> TryFindBase(GPUVAddr other_addr) const noexcept; + + [[nodiscard]] ImageViewId FindView(const ImageViewInfo& view_info) const noexcept; + + void InsertView(const ImageViewInfo& view_info, ImageViewId image_view_id); + + [[nodiscard]] bool Overlaps(VAddr overlap_cpu_addr, size_t overlap_size) const noexcept { + const VAddr overlap_end = overlap_cpu_addr + overlap_size; + return cpu_addr < overlap_end && overlap_cpu_addr < cpu_addr_end; + } + + ImageInfo info; + + u32 guest_size_bytes = 0; + u32 unswizzled_size_bytes = 0; + u32 converted_size_bytes = 0; + ImageFlagBits flags = ImageFlagBits::CpuModified; + + GPUVAddr gpu_addr = 0; + VAddr cpu_addr = 0; + VAddr cpu_addr_end = 0; + + u64 modification_tick = 0; + u64 frame_tick = 0; + + std::array<u32, MAX_MIP_LEVELS> mip_level_offsets{}; + + std::vector<ImageViewInfo> image_view_infos; + std::vector<ImageViewId> image_view_ids; + + std::vector<u32> slice_offsets; + std::vector<SubresourceBase> slice_subresources; + + std::vector<AliasedImage> aliased_images; +}; + +struct ImageAllocBase { + std::vector<ImageId> images; +}; + +void AddImageAlias(ImageBase& lhs, ImageBase& rhs, ImageId lhs_id, ImageId rhs_id); + +} // namespace VideoCommon diff --git a/src/video_core/texture_cache/image_info.cpp b/src/video_core/texture_cache/image_info.cpp new file mode 100644 index 000000000..64fd7010a --- /dev/null +++ b/src/video_core/texture_cache/image_info.cpp @@ -0,0 +1,189 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included.
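The ImageFlagBits enum above relies on DECLARE_ENUM_FLAG_OPERATORS from common/common_funcs.h (not shown in this diff) to make the scoped enum usable as a bit set. Roughly what such a macro provides, sketched by hand; the real macro also covers the compound and remaining bitwise operators:

enum class Flags : unsigned { A = 1 << 0, B = 1 << 1 };
constexpr Flags operator|(Flags lhs, Flags rhs) {
    return static_cast<Flags>(static_cast<unsigned>(lhs) | static_cast<unsigned>(rhs));
}
constexpr Flags operator&(Flags lhs, Flags rhs) {
    return static_cast<Flags>(static_cast<unsigned>(lhs) & static_cast<unsigned>(rhs));
}
constexpr bool Any(Flags f) {
    return static_cast<unsigned>(f) != 0;
}
static_assert(Any((Flags::A | Flags::B) & Flags::A));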
+ +#include "common/assert.h" +#include "video_core/surface.h" +#include "video_core/texture_cache/format_lookup_table.h" +#include "video_core/texture_cache/image_info.h" +#include "video_core/texture_cache/samples_helper.h" +#include "video_core/texture_cache/types.h" +#include "video_core/texture_cache/util.h" +#include "video_core/textures/texture.h" + +namespace VideoCommon { + +using Tegra::Texture::TextureType; +using Tegra::Texture::TICEntry; +using VideoCore::Surface::PixelFormat; + +ImageInfo::ImageInfo(const TICEntry& config) noexcept { + format = PixelFormatFromTextureInfo(config.format, config.r_type, config.g_type, config.b_type, + config.a_type, config.srgb_conversion); + num_samples = NumSamples(config.msaa_mode); + resources.levels = config.max_mip_level + 1; + if (config.IsPitchLinear()) { + pitch = config.Pitch(); + } else if (config.IsBlockLinear()) { + block = Extent3D{ + .width = config.block_width, + .height = config.block_height, + .depth = config.block_depth, + }; + } + tile_width_spacing = config.tile_width_spacing; + if (config.texture_type != TextureType::Texture2D && + config.texture_type != TextureType::Texture2DNoMipmap) { + ASSERT(!config.IsPitchLinear()); + } + switch (config.texture_type) { + case TextureType::Texture1D: + ASSERT(config.BaseLayer() == 0); + type = ImageType::e1D; + size.width = config.Width(); + break; + case TextureType::Texture1DArray: + UNIMPLEMENTED_IF(config.BaseLayer() != 0); + type = ImageType::e1D; + size.width = config.Width(); + resources.layers = config.Depth(); + break; + case TextureType::Texture2D: + case TextureType::Texture2DNoMipmap: + ASSERT(config.Depth() == 1); + type = config.IsPitchLinear() ? ImageType::Linear : ImageType::e2D; + size.width = config.Width(); + size.height = config.Height(); + resources.layers = config.BaseLayer() + 1; + break; + case TextureType::Texture2DArray: + type = ImageType::e2D; + size.width = config.Width(); + size.height = config.Height(); + resources.layers = config.BaseLayer() + config.Depth(); + break; + case TextureType::TextureCubemap: + ASSERT(config.Depth() == 1); + type = ImageType::e2D; + size.width = config.Width(); + size.height = config.Height(); + resources.layers = config.BaseLayer() + 6; + break; + case TextureType::TextureCubeArray: + UNIMPLEMENTED_IF(config.load_store_hint != 0); + type = ImageType::e2D; + size.width = config.Width(); + size.height = config.Height(); + resources.layers = config.BaseLayer() + config.Depth() * 6; + break; + case TextureType::Texture3D: + ASSERT(config.BaseLayer() == 0); + type = ImageType::e3D; + size.width = config.Width(); + size.height = config.Height(); + size.depth = config.Depth(); + break; + case TextureType::Texture1DBuffer: + type = ImageType::Buffer; + size.width = config.Width(); + break; + default: + UNREACHABLE_MSG("Invalid texture_type={}", static_cast<int>(config.texture_type.Value())); + break; + } + if (type != ImageType::Linear) { + // FIXME: Call this without passing *this + layer_stride = CalculateLayerStride(*this); + maybe_unaligned_layer_stride = CalculateLayerSize(*this); + } +} + +ImageInfo::ImageInfo(const Tegra::Engines::Maxwell3D::Regs& regs, size_t index) noexcept { + const auto& rt = regs.rt[index]; + format = VideoCore::Surface::PixelFormatFromRenderTargetFormat(rt.format); + if (rt.tile_mode.is_pitch_linear) { + ASSERT(rt.tile_mode.is_3d == 0); + type = ImageType::Linear; + pitch = rt.width; + size = Extent3D{ + .width = pitch / BytesPerBlock(format), + .height = rt.height, + .depth = 1, + }; + return; + } + 
size.width = rt.width; + size.height = rt.height; + layer_stride = rt.layer_stride * 4; + maybe_unaligned_layer_stride = layer_stride; + num_samples = NumSamples(regs.multisample_mode); + block = Extent3D{ + .width = rt.tile_mode.block_width, + .height = rt.tile_mode.block_height, + .depth = rt.tile_mode.block_depth, + }; + if (rt.tile_mode.is_3d) { + type = ImageType::e3D; + size.depth = rt.depth; + } else { + type = ImageType::e2D; + resources.layers = rt.depth; + } +} + +ImageInfo::ImageInfo(const Tegra::Engines::Maxwell3D::Regs& regs) noexcept { + format = VideoCore::Surface::PixelFormatFromDepthFormat(regs.zeta.format); + size.width = regs.zeta_width; + size.height = regs.zeta_height; + resources.levels = 1; + layer_stride = regs.zeta.layer_stride * 4; + maybe_unaligned_layer_stride = layer_stride; + num_samples = NumSamples(regs.multisample_mode); + block = Extent3D{ + .width = regs.zeta.tile_mode.block_width, + .height = regs.zeta.tile_mode.block_height, + .depth = regs.zeta.tile_mode.block_depth, + }; + if (regs.zeta.tile_mode.is_pitch_linear) { + ASSERT(regs.zeta.tile_mode.is_3d == 0); + type = ImageType::Linear; + pitch = size.width * BytesPerBlock(format); + } else if (regs.zeta.tile_mode.is_3d) { + ASSERT(regs.zeta.tile_mode.is_pitch_linear == 0); + type = ImageType::e3D; + size.depth = regs.zeta_depth; + } else { + type = ImageType::e2D; + resources.layers = regs.zeta_depth; + } +} + +ImageInfo::ImageInfo(const Tegra::Engines::Fermi2D::Surface& config) noexcept { + UNIMPLEMENTED_IF_MSG(config.layer != 0, "Surface layer is not zero"); + format = VideoCore::Surface::PixelFormatFromRenderTargetFormat(config.format); + if (config.linear == Tegra::Engines::Fermi2D::MemoryLayout::Pitch) { + type = ImageType::Linear; + size = Extent3D{ + .width = config.pitch / VideoCore::Surface::BytesPerBlock(format), + .height = config.height, + .depth = 1, + }; + pitch = config.pitch; + } else { + type = config.block_depth > 0 ? ImageType::e3D : ImageType::e2D; + block = Extent3D{ + .width = config.block_width, + .height = config.block_height, + .depth = config.block_depth, + }; + // 3D blits with more than one slice are not implemented for now + // Render to individual slices + size = Extent3D{ + .width = config.width, + .height = config.height, + .depth = 1, + }; + } +} + +} // namespace VideoCommon diff --git a/src/video_core/texture_cache/image_info.h b/src/video_core/texture_cache/image_info.h new file mode 100644 index 000000000..5049fc36e --- /dev/null +++ b/src/video_core/texture_cache/image_info.h @@ -0,0 +1,38 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included.
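In the ImageInfo declaration below, block and pitch share a union; the active member is implied by the image type rather than tracked explicitly. Pitch-linear images carry a row pitch in bytes, block-linear ones carry block dimensions. A reading-side fragment, assuming the surrounding headers:

void InspectLayout(const VideoCommon::ImageInfo& info) {
    if (info.type == VideoCommon::ImageType::Linear) {
        const u32 row_bytes = info.pitch; // 'pitch' is the active member
        (void)row_bytes;
    } else {
        const VideoCommon::Extent3D block_dims = info.block; // 'block' is active
        (void)block_dims;
    }
}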
+ +#pragma once + +#include "video_core/engines/fermi_2d.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/surface.h" +#include "video_core/texture_cache/types.h" + +namespace VideoCommon { + +using Tegra::Texture::TICEntry; +using VideoCore::Surface::PixelFormat; + +struct ImageInfo { + explicit ImageInfo() = default; + explicit ImageInfo(const TICEntry& config) noexcept; + explicit ImageInfo(const Tegra::Engines::Maxwell3D::Regs& regs, size_t index) noexcept; + explicit ImageInfo(const Tegra::Engines::Maxwell3D::Regs& regs) noexcept; + explicit ImageInfo(const Tegra::Engines::Fermi2D::Surface& config) noexcept; + + PixelFormat format = PixelFormat::Invalid; + ImageType type = ImageType::e1D; + SubresourceExtent resources; + Extent3D size{1, 1, 1}; + union { + Extent3D block{0, 0, 0}; + u32 pitch; + }; + u32 layer_stride = 0; + u32 maybe_unaligned_layer_stride = 0; + u32 num_samples = 1; + u32 tile_width_spacing = 0; +}; + +} // namespace VideoCommon diff --git a/src/video_core/texture_cache/image_view_base.cpp b/src/video_core/texture_cache/image_view_base.cpp new file mode 100644 index 000000000..18f72e508 --- /dev/null +++ b/src/video_core/texture_cache/image_view_base.cpp @@ -0,0 +1,41 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <algorithm> + +#include "common/assert.h" +#include "core/settings.h" +#include "video_core/compatible_formats.h" +#include "video_core/surface.h" +#include "video_core/texture_cache/formatter.h" +#include "video_core/texture_cache/image_info.h" +#include "video_core/texture_cache/image_view_base.h" +#include "video_core/texture_cache/image_view_info.h" +#include "video_core/texture_cache/types.h" + +namespace VideoCommon { + +ImageViewBase::ImageViewBase(const ImageViewInfo& info, const ImageInfo& image_info, + ImageId image_id_) + : image_id{image_id_}, format{info.format}, type{info.type}, range{info.range}, + size{ + .width = std::max(image_info.size.width >> range.base.level, 1u), + .height = std::max(image_info.size.height >> range.base.level, 1u), + .depth = std::max(image_info.size.depth >> range.base.level, 1u), + } { + ASSERT_MSG(VideoCore::Surface::IsViewCompatible(image_info.format, info.format, false), + "Image view format {} is incompatible with image format {}", info.format, + image_info.format); + const bool is_async = Settings::values.use_asynchronous_gpu_emulation.GetValue(); + if (image_info.type == ImageType::Linear && is_async) { + flags |= ImageViewFlagBits::PreemtiveDownload; + } + if (image_info.type == ImageType::e3D && info.type != ImageViewType::e3D) { + flags |= ImageViewFlagBits::Slice; + } +} + +ImageViewBase::ImageViewBase(const NullImageParams&) {} + +} // namespace VideoCommon diff --git a/src/video_core/texture_cache/image_view_base.h b/src/video_core/texture_cache/image_view_base.h new file mode 100644 index 000000000..73954167e --- /dev/null +++ b/src/video_core/texture_cache/image_view_base.h @@ -0,0 +1,47 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
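The size computation in the ImageViewBase constructor above halves each dimension per mip level but clamps at one texel, the usual mip chain rule. Sketched in isolation:

constexpr unsigned MipDimension(unsigned base, unsigned level) {
    const unsigned shifted = base >> level;
    return shifted > 0 ? shifted : 1u;
}
static_assert(MipDimension(100, 0) == 100);
static_assert(MipDimension(100, 3) == 12); // 100 >> 3
static_assert(MipDimension(100, 9) == 1);  // clamped to one texel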
+ +#pragma once + +#include "common/common_funcs.h" +#include "video_core/surface.h" +#include "video_core/texture_cache/types.h" + +namespace VideoCommon { + +using VideoCore::Surface::PixelFormat; + +struct ImageViewInfo; +struct ImageInfo; + +struct NullImageParams {}; + +enum class ImageViewFlagBits : u16 { + PreemtiveDownload = 1 << 0, + Strong = 1 << 1, + Slice = 1 << 2, +}; +DECLARE_ENUM_FLAG_OPERATORS(ImageViewFlagBits) + +struct ImageViewBase { + explicit ImageViewBase(const ImageViewInfo& info, const ImageInfo& image_info, + ImageId image_id); + explicit ImageViewBase(const NullImageParams&); + + [[nodiscard]] bool IsBuffer() const noexcept { + return type == ImageViewType::Buffer; + } + + ImageId image_id{}; + PixelFormat format{}; + ImageViewType type{}; + SubresourceRange range; + Extent3D size{0, 0, 0}; + ImageViewFlagBits flags{}; + + u64 invalidation_tick = 0; + u64 modification_tick = 0; +}; + +} // namespace VideoCommon diff --git a/src/video_core/texture_cache/image_view_info.cpp b/src/video_core/texture_cache/image_view_info.cpp new file mode 100644 index 000000000..faf5b151f --- /dev/null +++ b/src/video_core/texture_cache/image_view_info.cpp @@ -0,0 +1,88 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <limits> + +#include "common/assert.h" +#include "video_core/texture_cache/image_view_info.h" +#include "video_core/texture_cache/texture_cache.h" +#include "video_core/texture_cache/types.h" +#include "video_core/textures/texture.h" + +namespace VideoCommon { + +namespace { + +constexpr u8 RENDER_TARGET_SWIZZLE = std::numeric_limits<u8>::max(); + +[[nodiscard]] u8 CastSwizzle(SwizzleSource source) { + const u8 casted = static_cast<u8>(source); + ASSERT(static_cast<SwizzleSource>(casted) == source); + return casted; +} + +} // Anonymous namespace + +ImageViewInfo::ImageViewInfo(const TICEntry& config, s32 base_layer) noexcept + : format{PixelFormatFromTIC(config)}, x_source{CastSwizzle(config.x_source)}, + y_source{CastSwizzle(config.y_source)}, z_source{CastSwizzle(config.z_source)}, + w_source{CastSwizzle(config.w_source)} { + range.base = SubresourceBase{ + .level = static_cast<s32>(config.res_min_mip_level), + .layer = base_layer, + }; + range.extent.levels = config.res_max_mip_level - config.res_min_mip_level + 1; + + switch (config.texture_type) { + case TextureType::Texture1D: + ASSERT(config.Height() == 1); + ASSERT(config.Depth() == 1); + type = ImageViewType::e1D; + break; + case TextureType::Texture2D: + case TextureType::Texture2DNoMipmap: + ASSERT(config.Depth() == 1); + type = config.normalized_coords ? 
ImageViewType::e2D : ImageViewType::Rect; + break; + case TextureType::Texture3D: + type = ImageViewType::e3D; + break; + case TextureType::TextureCubemap: + ASSERT(config.Depth() == 1); + type = ImageViewType::Cube; + range.extent.layers = 6; + break; + case TextureType::Texture1DArray: + type = ImageViewType::e1DArray; + range.extent.layers = config.Depth(); + break; + case TextureType::Texture2DArray: + type = ImageViewType::e2DArray; + range.extent.layers = config.Depth(); + break; + case TextureType::Texture1DBuffer: + type = ImageViewType::Buffer; + break; + case TextureType::TextureCubeArray: + type = ImageViewType::CubeArray; + range.extent.layers = config.Depth() * 6; + break; + default: + UNREACHABLE_MSG("Invalid texture_type={}", static_cast<int>(config.texture_type.Value())); + break; + } +} + +ImageViewInfo::ImageViewInfo(ImageViewType type_, PixelFormat format_, + SubresourceRange range_) noexcept + : type{type_}, format{format_}, range{range_}, x_source{RENDER_TARGET_SWIZZLE}, + y_source{RENDER_TARGET_SWIZZLE}, z_source{RENDER_TARGET_SWIZZLE}, + w_source{RENDER_TARGET_SWIZZLE} {} + +bool ImageViewInfo::IsRenderTarget() const noexcept { + return x_source == RENDER_TARGET_SWIZZLE && y_source == RENDER_TARGET_SWIZZLE && + z_source == RENDER_TARGET_SWIZZLE && w_source == RENDER_TARGET_SWIZZLE; +} + +} // namespace VideoCommon diff --git a/src/video_core/texture_cache/image_view_info.h b/src/video_core/texture_cache/image_view_info.h new file mode 100644 index 000000000..0c1f99117 --- /dev/null +++ b/src/video_core/texture_cache/image_view_info.h @@ -0,0 +1,50 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <array> +#include <type_traits> + +#include "video_core/surface.h" +#include "video_core/texture_cache/types.h" +#include "video_core/textures/texture.h" + +namespace VideoCommon { + +using Tegra::Texture::SwizzleSource; +using Tegra::Texture::TICEntry; +using VideoCore::Surface::PixelFormat; + +/// Properties used to determine an image view +struct ImageViewInfo { + explicit ImageViewInfo() noexcept = default; + explicit ImageViewInfo(const TICEntry& config, s32 base_layer) noexcept; + explicit ImageViewInfo(ImageViewType type, PixelFormat format, + SubresourceRange range = {}) noexcept; + + auto operator<=>(const ImageViewInfo&) const noexcept = default; + + [[nodiscard]] bool IsRenderTarget() const noexcept; + + [[nodiscard]] std::array<SwizzleSource, 4> Swizzle() const noexcept { + return std::array{ + static_cast<SwizzleSource>(x_source), + static_cast<SwizzleSource>(y_source), + static_cast<SwizzleSource>(z_source), + static_cast<SwizzleSource>(w_source), + }; + } + + ImageViewType type{}; + PixelFormat format{}; + SubresourceRange range; + u8 x_source = static_cast<u8>(SwizzleSource::R); + u8 y_source = static_cast<u8>(SwizzleSource::G); + u8 z_source = static_cast<u8>(SwizzleSource::B); + u8 w_source = static_cast<u8>(SwizzleSource::A); +}; +static_assert(std::has_unique_object_representations_v<ImageViewInfo>); + +} // namespace VideoCommon diff --git a/src/video_core/texture_cache/render_targets.h b/src/video_core/texture_cache/render_targets.h new file mode 100644 index 000000000..9b9544b07 --- /dev/null +++ b/src/video_core/texture_cache/render_targets.h @@ -0,0 +1,51 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included.
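render_targets.h below pairs a defaulted comparison with a std::hash specialization, which is exactly what the standard unordered containers need, so framebuffer lookup can key a map by render target state directly. A usage sketch with a placeholder value type:

#include <unordered_map>
#include "video_core/texture_cache/render_targets.h"

struct Framebuffer {}; // stand-in for a backend framebuffer object

std::unordered_map<VideoCommon::RenderTargets, Framebuffer> framebuffer_cache;

Framebuffer& FindOrCreate(const VideoCommon::RenderTargets& key) {
    // try_emplace uses std::hash<RenderTargets> and the defaulted operator==
    return framebuffer_cache.try_emplace(key).first->second;
}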
+ +#pragma once + +#include <algorithm> +#include <span> +#include <utility> + +#include "common/bit_cast.h" +#include "video_core/texture_cache/types.h" + +namespace VideoCommon { + +/// Framebuffer properties used to lookup a framebuffer +struct RenderTargets { + constexpr auto operator<=>(const RenderTargets&) const noexcept = default; + + constexpr bool Contains(std::span<const ImageViewId> elements) const noexcept { + const auto contains = [elements](ImageViewId item) { + return std::ranges::find(elements, item) != elements.end(); + }; + return std::ranges::any_of(color_buffer_ids, contains) || contains(depth_buffer_id); + } + + std::array<ImageViewId, NUM_RT> color_buffer_ids; + ImageViewId depth_buffer_id; + std::array<u8, NUM_RT> draw_buffers{}; + Extent2D size; +}; + +} // namespace VideoCommon + +namespace std { + +template <> +struct hash<VideoCommon::RenderTargets> { + size_t operator()(const VideoCommon::RenderTargets& rt) const noexcept { + using VideoCommon::ImageViewId; + size_t value = std::hash<ImageViewId>{}(rt.depth_buffer_id); + for (const ImageViewId color_buffer_id : rt.color_buffer_ids) { + value ^= std::hash<ImageViewId>{}(color_buffer_id); + } + value ^= Common::BitCast<u64>(rt.draw_buffers); + value ^= Common::BitCast<u64>(rt.size); + return value; + } +}; + +} // namespace std diff --git a/src/video_core/texture_cache/samples_helper.h b/src/video_core/texture_cache/samples_helper.h new file mode 100644 index 000000000..04539a43c --- /dev/null +++ b/src/video_core/texture_cache/samples_helper.h @@ -0,0 +1,55 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <utility> + +#include "common/assert.h" +#include "video_core/textures/texture.h" + +namespace VideoCommon { + +[[nodiscard]] inline std::pair<int, int> SamplesLog2(int num_samples) { + switch (num_samples) { + case 1: + return {0, 0}; + case 2: + return {1, 0}; + case 4: + return {1, 1}; + case 8: + return {2, 1}; + case 16: + return {2, 2}; + } + UNREACHABLE_MSG("Invalid number of samples={}", num_samples); + return {1, 1}; +} + +[[nodiscard]] inline int NumSamples(Tegra::Texture::MsaaMode msaa_mode) { + using Tegra::Texture::MsaaMode; + switch (msaa_mode) { + case MsaaMode::Msaa1x1: + return 1; + case MsaaMode::Msaa2x1: + case MsaaMode::Msaa2x1_D3D: + return 2; + case MsaaMode::Msaa2x2: + case MsaaMode::Msaa2x2_VC4: + case MsaaMode::Msaa2x2_VC12: + return 4; + case MsaaMode::Msaa4x2: + case MsaaMode::Msaa4x2_D3D: + case MsaaMode::Msaa4x2_VC8: + case MsaaMode::Msaa4x2_VC24: + return 8; + case MsaaMode::Msaa4x4: + return 16; + } + UNREACHABLE_MSG("Invalid MSAA mode={}", static_cast<int>(msaa_mode)); + return 1; +} + +} // namespace VideoCommon diff --git a/src/video_core/texture_cache/slot_vector.h b/src/video_core/texture_cache/slot_vector.h new file mode 100644 index 000000000..eae3be6ea --- /dev/null +++ b/src/video_core/texture_cache/slot_vector.h @@ -0,0 +1,156 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
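SlotVector below hands out stable u32 handles instead of pointers or iterators: insert() constructs the object in a free slot and returns its SlotId, operator[] resolves the handle, and erase() recycles the index for a later insert(). A usage sketch (T must be nothrow-movable, which int trivially is):

VideoCommon::SlotVector<int> slots;
const VideoCommon::SlotId a = slots.insert(42);
const VideoCommon::SlotId b = slots.insert(7);
slots[a] += 1;  // handles stay valid across later insertions and erasures
slots.erase(b); // slot b's index will be reused by a future insert()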
+ +#pragma once + +#include <array> +#include <concepts> +#include <numeric> +#include <type_traits> +#include <utility> +#include <vector> + +#include "common/assert.h" +#include "common/common_types.h" + +namespace VideoCommon { + +struct SlotId { + static constexpr u32 INVALID_INDEX = std::numeric_limits<u32>::max(); + + constexpr auto operator<=>(const SlotId&) const noexcept = default; + + constexpr explicit operator bool() const noexcept { + return index != INVALID_INDEX; + } + + u32 index = INVALID_INDEX; +}; + +template <class T> +requires std::is_nothrow_move_assignable_v<T>&& + std::is_nothrow_move_constructible_v<T> class SlotVector { +public: + ~SlotVector() noexcept { + size_t index = 0; + for (u64 bits : stored_bitset) { + for (size_t bit = 0; bits; ++bit, bits >>= 1) { + if ((bits & 1) != 0) { + values[index + bit].object.~T(); + } + } + index += 64; + } + delete[] values; + } + + [[nodiscard]] T& operator[](SlotId id) noexcept { + ValidateIndex(id); + return values[id.index].object; + } + + [[nodiscard]] const T& operator[](SlotId id) const noexcept { + ValidateIndex(id); + return values[id.index].object; + } + + template <typename... Args> + [[nodiscard]] SlotId insert(Args&&... args) noexcept { + const u32 index = FreeValueIndex(); + new (&values[index].object) T(std::forward<Args>(args)...); + SetStorageBit(index); + + return SlotId{index}; + } + + void erase(SlotId id) noexcept { + values[id.index].object.~T(); + free_list.push_back(id.index); + ResetStorageBit(id.index); + } + +private: + struct NonTrivialDummy { + NonTrivialDummy() noexcept {} + }; + + union Entry { + Entry() noexcept : dummy{} {} + ~Entry() noexcept {} + + NonTrivialDummy dummy; + T object; + }; + + void SetStorageBit(u32 index) noexcept { + stored_bitset[index / 64] |= u64(1) << (index % 64); + } + + void ResetStorageBit(u32 index) noexcept { + stored_bitset[index / 64] &= ~(u64(1) << (index % 64)); + } + + bool ReadStorageBit(u32 index) noexcept { + return ((stored_bitset[index / 64] >> (index % 64)) & 1) != 0; + } + + void ValidateIndex(SlotId id) const noexcept { + DEBUG_ASSERT(id); + DEBUG_ASSERT(id.index / 64 < stored_bitset.size()); + DEBUG_ASSERT(((stored_bitset[id.index / 64] >> (id.index % 64)) & 1) != 0); + } + + [[nodiscard]] u32 FreeValueIndex() noexcept { + if (free_list.empty()) { + Reserve(values_capacity ? 
(values_capacity << 1) : 1); + } + const u32 free_index = free_list.back(); + free_list.pop_back(); + return free_index; + } + + void Reserve(size_t new_capacity) noexcept { + Entry* const new_values = new Entry[new_capacity]; + size_t index = 0; + for (u64 bits : stored_bitset) { + for (size_t bit = 0; bits; ++bit, bits >>= 1) { + const size_t i = index + bit; + if ((bits & 1) == 0) { + continue; + } + T& old_value = values[i].object; + new (&new_values[i].object) T(std::move(old_value)); + old_value.~T(); + } + index += 64; + } + + stored_bitset.resize((new_capacity + 63) / 64); + + const size_t old_free_size = free_list.size(); + free_list.resize(old_free_size + (new_capacity - values_capacity)); + std::iota(free_list.begin() + old_free_size, free_list.end(), + static_cast<u32>(values_capacity)); + + delete[] values; + values = new_values; + values_capacity = new_capacity; + } + + Entry* values = nullptr; + size_t values_capacity = 0; + size_t values_size = 0; + + std::vector<u64> stored_bitset; + std::vector<u32> free_list; +}; + +} // namespace VideoCommon + +template <> +struct std::hash<VideoCommon::SlotId> { + size_t operator()(const VideoCommon::SlotId& id) const noexcept { + return std::hash<u32>{}(id.index); + } +}; diff --git a/src/video_core/texture_cache/surface_base.cpp b/src/video_core/texture_cache/surface_base.cpp deleted file mode 100644 index b44c09d71..000000000 --- a/src/video_core/texture_cache/surface_base.cpp +++ /dev/null @@ -1,298 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include "common/algorithm.h" -#include "common/assert.h" -#include "common/common_types.h" -#include "common/microprofile.h" -#include "video_core/memory_manager.h" -#include "video_core/texture_cache/surface_base.h" -#include "video_core/texture_cache/surface_params.h" -#include "video_core/textures/convert.h" - -namespace VideoCommon { - -MICROPROFILE_DEFINE(GPU_Load_Texture, "GPU", "Texture Load", MP_RGB(128, 192, 128)); -MICROPROFILE_DEFINE(GPU_Flush_Texture, "GPU", "Texture Flush", MP_RGB(128, 192, 128)); - -using Tegra::Texture::ConvertFromGuestToHost; -using VideoCore::MortonSwizzleMode; -using VideoCore::Surface::IsPixelFormatASTC; -using VideoCore::Surface::PixelFormat; - -StagingCache::StagingCache() = default; - -StagingCache::~StagingCache() = default; - -SurfaceBaseImpl::SurfaceBaseImpl(GPUVAddr gpu_addr, const SurfaceParams& params, - bool is_astc_supported) - : params{params}, gpu_addr{gpu_addr}, mipmap_sizes(params.num_levels), - mipmap_offsets(params.num_levels) { - is_converted = IsPixelFormatASTC(params.pixel_format) && !is_astc_supported; - host_memory_size = params.GetHostSizeInBytes(is_converted); - - std::size_t offset = 0; - for (u32 level = 0; level < params.num_levels; ++level) { - const std::size_t mipmap_size{params.GetGuestMipmapSize(level)}; - mipmap_sizes[level] = mipmap_size; - mipmap_offsets[level] = offset; - offset += mipmap_size; - } - layer_size = offset; - if (params.is_layered) { - if (params.is_tiled) { - layer_size = - SurfaceParams::AlignLayered(layer_size, params.block_height, params.block_depth); - } - guest_memory_size = layer_size * params.depth; - } else { - guest_memory_size = layer_size; - } -} - -MatchTopologyResult SurfaceBaseImpl::MatchesTopology(const SurfaceParams& rhs) const { - const u32 src_bpp{params.GetBytesPerPixel()}; - const u32 dst_bpp{rhs.GetBytesPerPixel()}; - const bool ib1 = params.IsBuffer(); - const bool ib2 = rhs.IsBuffer(); - 
if (std::tie(src_bpp, params.is_tiled, ib1) == std::tie(dst_bpp, rhs.is_tiled, ib2)) { - const bool cb1 = params.IsCompressed(); - const bool cb2 = rhs.IsCompressed(); - if (cb1 == cb2) { - return MatchTopologyResult::FullMatch; - } - return MatchTopologyResult::CompressUnmatch; - } - return MatchTopologyResult::None; -} - -MatchStructureResult SurfaceBaseImpl::MatchesStructure(const SurfaceParams& rhs) const { - // Buffer surface Check - if (params.IsBuffer()) { - const std::size_t wd1 = params.width * params.GetBytesPerPixel(); - const std::size_t wd2 = rhs.width * rhs.GetBytesPerPixel(); - if (wd1 == wd2) { - return MatchStructureResult::FullMatch; - } - return MatchStructureResult::None; - } - - // Linear Surface check - if (!params.is_tiled) { - if (std::tie(params.height, params.pitch) == std::tie(rhs.height, rhs.pitch)) { - if (params.width == rhs.width) { - return MatchStructureResult::FullMatch; - } else { - return MatchStructureResult::SemiMatch; - } - } - return MatchStructureResult::None; - } - - // Tiled Surface check - if (std::tie(params.depth, params.block_width, params.block_height, params.block_depth, - params.tile_width_spacing, params.num_levels) == - std::tie(rhs.depth, rhs.block_width, rhs.block_height, rhs.block_depth, - rhs.tile_width_spacing, rhs.num_levels)) { - if (std::tie(params.width, params.height) == std::tie(rhs.width, rhs.height)) { - return MatchStructureResult::FullMatch; - } - const u32 ws = SurfaceParams::ConvertWidth(rhs.GetBlockAlignedWidth(), params.pixel_format, - rhs.pixel_format); - const u32 hs = - SurfaceParams::ConvertHeight(rhs.height, params.pixel_format, rhs.pixel_format); - const u32 w1 = params.GetBlockAlignedWidth(); - if (std::tie(w1, params.height) == std::tie(ws, hs)) { - return MatchStructureResult::SemiMatch; - } - } - return MatchStructureResult::None; -} - -std::optional<std::pair<u32, u32>> SurfaceBaseImpl::GetLayerMipmap( - const GPUVAddr candidate_gpu_addr) const { - if (gpu_addr == candidate_gpu_addr) { - return {{0, 0}}; - } - - if (candidate_gpu_addr < gpu_addr) { - return std::nullopt; - } - - const auto relative_address{static_cast<GPUVAddr>(candidate_gpu_addr - gpu_addr)}; - const auto layer{static_cast<u32>(relative_address / layer_size)}; - if (layer >= params.depth) { - return std::nullopt; - } - - const GPUVAddr mipmap_address = relative_address - layer_size * layer; - const auto mipmap_it = - Common::BinaryFind(mipmap_offsets.begin(), mipmap_offsets.end(), mipmap_address); - if (mipmap_it == mipmap_offsets.end()) { - return std::nullopt; - } - - const auto level{static_cast<u32>(std::distance(mipmap_offsets.begin(), mipmap_it))}; - return std::make_pair(layer, level); -} - -std::vector<CopyParams> SurfaceBaseImpl::BreakDownLayered(const SurfaceParams& in_params) const { - const u32 layers{params.depth}; - const u32 mipmaps{params.num_levels}; - std::vector<CopyParams> result; - result.reserve(static_cast<std::size_t>(layers) * static_cast<std::size_t>(mipmaps)); - - for (u32 layer = 0; layer < layers; layer++) { - for (u32 level = 0; level < mipmaps; level++) { - const u32 width = SurfaceParams::IntersectWidth(params, in_params, level, level); - const u32 height = SurfaceParams::IntersectHeight(params, in_params, level, level); - result.emplace_back(0, 0, layer, 0, 0, layer, level, level, width, height, 1); - } - } - return result; -} - -std::vector<CopyParams> SurfaceBaseImpl::BreakDownNonLayered(const SurfaceParams& in_params) const { - const u32 mipmaps{params.num_levels}; - std::vector<CopyParams> result; - 
result.reserve(mipmaps); - - for (u32 level = 0; level < mipmaps; level++) { - const u32 width = SurfaceParams::IntersectWidth(params, in_params, level, level); - const u32 height = SurfaceParams::IntersectHeight(params, in_params, level, level); - const u32 depth{std::min(params.GetMipDepth(level), in_params.GetMipDepth(level))}; - result.emplace_back(width, height, depth, level); - } - return result; -} - -void SurfaceBaseImpl::SwizzleFunc(MortonSwizzleMode mode, u8* memory, const SurfaceParams& params, - u8* buffer, u32 level) { - const u32 width{params.GetMipWidth(level)}; - const u32 height{params.GetMipHeight(level)}; - const u32 block_height{params.GetMipBlockHeight(level)}; - const u32 block_depth{params.GetMipBlockDepth(level)}; - - std::size_t guest_offset{mipmap_offsets[level]}; - if (params.is_layered) { - std::size_t host_offset = 0; - const std::size_t guest_stride = layer_size; - const std::size_t host_stride = params.GetHostLayerSize(level); - for (u32 layer = 0; layer < params.depth; ++layer) { - MortonSwizzle(mode, params.pixel_format, width, block_height, height, block_depth, 1, - params.tile_width_spacing, buffer + host_offset, memory + guest_offset); - guest_offset += guest_stride; - host_offset += host_stride; - } - } else { - MortonSwizzle(mode, params.pixel_format, width, block_height, height, block_depth, - params.GetMipDepth(level), params.tile_width_spacing, buffer, - memory + guest_offset); - } -} - -void SurfaceBaseImpl::LoadBuffer(Tegra::MemoryManager& memory_manager, - StagingCache& staging_cache) { - MICROPROFILE_SCOPE(GPU_Load_Texture); - auto& staging_buffer = staging_cache.GetBuffer(0); - u8* host_ptr; - // Use an extra temporary buffer - auto& tmp_buffer = staging_cache.GetBuffer(1); - tmp_buffer.resize(guest_memory_size); - host_ptr = tmp_buffer.data(); - memory_manager.ReadBlockUnsafe(gpu_addr, host_ptr, guest_memory_size); - - if (params.is_tiled) { - ASSERT_MSG(params.block_width == 0, "Block width is defined as {} on texture target {}", - params.block_width, static_cast<u32>(params.target)); - for (u32 level = 0; level < params.num_levels; ++level) { - const std::size_t host_offset{params.GetHostMipmapLevelOffset(level, false)}; - SwizzleFunc(MortonSwizzleMode::MortonToLinear, host_ptr, params, - staging_buffer.data() + host_offset, level); - } - } else { - ASSERT_MSG(params.num_levels == 1, "Linear mipmap loading is not implemented"); - const u32 bpp{params.GetBytesPerPixel()}; - const u32 block_width{params.GetDefaultBlockWidth()}; - const u32 block_height{params.GetDefaultBlockHeight()}; - const u32 width{(params.width + block_width - 1) / block_width}; - const u32 height{(params.height + block_height - 1) / block_height}; - const u32 copy_size{width * bpp}; - if (params.pitch == copy_size) { - std::memcpy(staging_buffer.data(), host_ptr, params.GetHostSizeInBytes(false)); - } else { - const u8* start{host_ptr}; - u8* write_to{staging_buffer.data()}; - for (u32 h = height; h > 0; --h) { - std::memcpy(write_to, start, copy_size); - start += params.pitch; - write_to += copy_size; - } - } - } - - if (!is_converted && params.pixel_format != PixelFormat::S8_UINT_D24_UNORM) { - return; - } - - for (u32 level = params.num_levels; level--;) { - const std::size_t in_host_offset{params.GetHostMipmapLevelOffset(level, false)}; - const std::size_t out_host_offset{params.GetHostMipmapLevelOffset(level, is_converted)}; - u8* const in_buffer = staging_buffer.data() + in_host_offset; - u8* const out_buffer = staging_buffer.data() + out_host_offset; -
ConvertFromGuestToHost(in_buffer, out_buffer, params.pixel_format, - params.GetMipWidth(level), params.GetMipHeight(level), - params.GetMipDepth(level), true, true); - } -} - -void SurfaceBaseImpl::FlushBuffer(Tegra::MemoryManager& memory_manager, - StagingCache& staging_cache) { - MICROPROFILE_SCOPE(GPU_Flush_Texture); - auto& staging_buffer = staging_cache.GetBuffer(0); - u8* host_ptr; - - // Use an extra temporary buffer - auto& tmp_buffer = staging_cache.GetBuffer(1); - tmp_buffer.resize(guest_memory_size); - host_ptr = tmp_buffer.data(); - - if (params.target == SurfaceTarget::Texture3D) { - // Special case for 3D texture segments - memory_manager.ReadBlockUnsafe(gpu_addr, host_ptr, guest_memory_size); - } - - if (params.is_tiled) { - ASSERT_MSG(params.block_width == 0, "Block width is defined as {}", params.block_width); - for (u32 level = 0; level < params.num_levels; ++level) { - const std::size_t host_offset{params.GetHostMipmapLevelOffset(level, false)}; - SwizzleFunc(MortonSwizzleMode::LinearToMorton, host_ptr, params, - staging_buffer.data() + host_offset, level); - } - } else if (params.IsBuffer()) { - // Buffers don't have pitch or any fancy layout property. We can just memcpy them to guest - // memory. - std::memcpy(host_ptr, staging_buffer.data(), guest_memory_size); - } else { - ASSERT(params.target == SurfaceTarget::Texture2D); - ASSERT(params.num_levels == 1); - - const u32 bpp{params.GetBytesPerPixel()}; - const u32 copy_size{params.width * bpp}; - if (params.pitch == copy_size) { - std::memcpy(host_ptr, staging_buffer.data(), guest_memory_size); - } else { - u8* start{host_ptr}; - const u8* read_to{staging_buffer.data()}; - for (u32 h = params.height; h > 0; --h) { - std::memcpy(start, read_to, copy_size); - start += params.pitch; - read_to += copy_size; - } - } - } - memory_manager.WriteBlockUnsafe(gpu_addr, host_ptr, guest_memory_size); -} - -} // namespace VideoCommon diff --git a/src/video_core/texture_cache/surface_base.h b/src/video_core/texture_cache/surface_base.h deleted file mode 100644 index 173f2edba..000000000 --- a/src/video_core/texture_cache/surface_base.h +++ /dev/null @@ -1,333 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included.
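The GetLayerMipmap lookup deleted above is worth keeping in mind while reading the rest of this commit: it resolves a guest GPU address to a (layer, level) pair by dividing the address delta by the layer stride and then binary-searching the per-level offset table. A minimal standalone sketch of that lookup follows; the FindLayerMipmap name and the plain integer types are illustrative stand-ins for the cache's GPUVAddr and Common::BinaryFind helpers, not code from the tree.

#include <algorithm>
#include <cstdint>
#include <optional>
#include <utility>
#include <vector>

// Resolve an address inside a layered, mipmapped surface to (layer, level).
// mipmap_offsets must be sorted, which the deleted constructor guarantees by
// accumulating level sizes in order.
std::optional<std::pair<uint32_t, uint32_t>> FindLayerMipmap(
    uint64_t surface_addr, uint64_t candidate_addr, uint64_t layer_size,
    uint32_t num_layers, const std::vector<uint64_t>& mipmap_offsets) {
    if (candidate_addr < surface_addr) {
        return std::nullopt;
    }
    const uint64_t relative = candidate_addr - surface_addr;
    const auto layer = static_cast<uint32_t>(relative / layer_size);
    if (layer >= num_layers) {
        return std::nullopt;
    }
    // Levels are laid out back to back inside a layer, so an exact-match
    // binary search over the offset table recovers the level.
    const uint64_t level_offset = relative - layer_size * layer;
    const auto it =
        std::lower_bound(mipmap_offsets.begin(), mipmap_offsets.end(), level_offset);
    if (it == mipmap_offsets.end() || *it != level_offset) {
        return std::nullopt;
    }
    return std::make_pair(layer, static_cast<uint32_t>(it - mipmap_offsets.begin()));
}

For an unlayered surface the layer stride spans the whole mip chain, so the division always yields layer 0 and the search alone selects the level.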
- -#pragma once - -#include <optional> -#include <tuple> -#include <unordered_map> -#include <vector> - -#include "common/common_types.h" -#include "video_core/gpu.h" -#include "video_core/morton.h" -#include "video_core/texture_cache/copy_params.h" -#include "video_core/texture_cache/surface_params.h" -#include "video_core/texture_cache/surface_view.h" - -namespace Tegra { -class MemoryManager; -} - -namespace VideoCommon { - -using VideoCore::MortonSwizzleMode; -using VideoCore::Surface::SurfaceTarget; - -enum class MatchStructureResult : u32 { - FullMatch = 0, - SemiMatch = 1, - None = 2, -}; - -enum class MatchTopologyResult : u32 { - FullMatch = 0, - CompressUnmatch = 1, - None = 2, -}; - -class StagingCache { -public: - explicit StagingCache(); - ~StagingCache(); - - std::vector<u8>& GetBuffer(std::size_t index) { - return staging_buffer[index]; - } - - const std::vector<u8>& GetBuffer(std::size_t index) const { - return staging_buffer[index]; - } - - void SetSize(std::size_t size) { - staging_buffer.resize(size); - } - -private: - std::vector<std::vector<u8>> staging_buffer; -}; - -class SurfaceBaseImpl { -public: - void LoadBuffer(Tegra::MemoryManager& memory_manager, StagingCache& staging_cache); - - void FlushBuffer(Tegra::MemoryManager& memory_manager, StagingCache& staging_cache); - - GPUVAddr GetGpuAddr() const { - return gpu_addr; - } - - bool Overlaps(const VAddr start, const VAddr end) const { - return (cpu_addr < end) && (cpu_addr_end > start); - } - - bool IsInside(const GPUVAddr other_start, const GPUVAddr other_end) const { - const GPUVAddr gpu_addr_end = gpu_addr + guest_memory_size; - return gpu_addr <= other_start && other_end <= gpu_addr_end; - } - - // Use only when recycling a surface - void SetGpuAddr(const GPUVAddr new_addr) { - gpu_addr = new_addr; - } - - VAddr GetCpuAddr() const { - return cpu_addr; - } - - VAddr GetCpuAddrEnd() const { - return cpu_addr_end; - } - - void SetCpuAddr(const VAddr new_addr) { - cpu_addr = new_addr; - cpu_addr_end = new_addr + guest_memory_size; - } - - const SurfaceParams& GetSurfaceParams() const { - return params; - } - - std::size_t GetSizeInBytes() const { - return guest_memory_size; - } - - std::size_t GetHostSizeInBytes() const { - return host_memory_size; - } - - std::size_t GetMipmapSize(const u32 level) const { - return mipmap_sizes[level]; - } - - bool IsLinear() const { - return !params.is_tiled; - } - - bool IsConverted() const { - return is_converted; - } - - bool MatchFormat(VideoCore::Surface::PixelFormat pixel_format) const { - return params.pixel_format == pixel_format; - } - - VideoCore::Surface::PixelFormat GetFormat() const { - return params.pixel_format; - } - - bool MatchTarget(VideoCore::Surface::SurfaceTarget target) const { - return params.target == target; - } - - MatchTopologyResult MatchesTopology(const SurfaceParams& rhs) const; - - MatchStructureResult MatchesStructure(const SurfaceParams& rhs) const; - - bool MatchesSubTexture(const SurfaceParams& rhs, const GPUVAddr other_gpu_addr) const { - return std::tie(gpu_addr, params.target, params.num_levels) == - std::tie(other_gpu_addr, rhs.target, rhs.num_levels) && - params.target == SurfaceTarget::Texture2D && params.num_levels == 1; - } - - std::optional<std::pair<u32, u32>> GetLayerMipmap(const GPUVAddr candidate_gpu_addr) const; - - std::vector<CopyParams> BreakDown(const SurfaceParams& in_params) const { - return params.is_layered ? 
BreakDownLayered(in_params) : BreakDownNonLayered(in_params); - } - -protected: - explicit SurfaceBaseImpl(GPUVAddr gpu_addr, const SurfaceParams& params, - bool is_astc_supported); - ~SurfaceBaseImpl() = default; - - virtual void DecorateSurfaceName() = 0; - - const SurfaceParams params; - std::size_t layer_size; - std::size_t guest_memory_size; - std::size_t host_memory_size; - GPUVAddr gpu_addr{}; - VAddr cpu_addr{}; - VAddr cpu_addr_end{}; - bool is_converted{}; - - std::vector<std::size_t> mipmap_sizes; - std::vector<std::size_t> mipmap_offsets; - -private: - void SwizzleFunc(MortonSwizzleMode mode, u8* memory, const SurfaceParams& params, u8* buffer, - u32 level); - - std::vector<CopyParams> BreakDownLayered(const SurfaceParams& in_params) const; - - std::vector<CopyParams> BreakDownNonLayered(const SurfaceParams& in_params) const; -}; - -template <typename TView> -class SurfaceBase : public SurfaceBaseImpl { -public: - virtual void UploadTexture(const std::vector<u8>& staging_buffer) = 0; - - virtual void DownloadTexture(std::vector<u8>& staging_buffer) = 0; - - void MarkAsModified(bool is_modified_, u64 tick) { - is_modified = is_modified_ || is_target; - modification_tick = tick; - } - - void MarkAsRenderTarget(bool is_target_, u32 index_) { - is_target = is_target_; - index = index_; - } - - void SetMemoryMarked(bool is_memory_marked_) { - is_memory_marked = is_memory_marked_; - } - - bool IsMemoryMarked() const { - return is_memory_marked; - } - - void SetSyncPending(bool is_sync_pending_) { - is_sync_pending = is_sync_pending_; - } - - bool IsSyncPending() const { - return is_sync_pending; - } - - void MarkAsPicked(bool is_picked_) { - is_picked = is_picked_; - } - - bool IsModified() const { - return is_modified; - } - - bool IsProtected() const { - // Only 3D slices are to be protected - return is_target && params.target == SurfaceTarget::Texture3D; - } - - bool IsRenderTarget() const { - return is_target; - } - - u32 GetRenderTarget() const { - return index; - } - - bool IsRegistered() const { - return is_registered; - } - - bool IsPicked() const { - return is_picked; - } - - void MarkAsRegistered(bool is_reg) { - is_registered = is_reg; - } - - u64 GetModificationTick() const { - return modification_tick; - } - - TView EmplaceOverview(const SurfaceParams& overview_params) { - const u32 num_layers{(params.is_layered && !overview_params.is_layered) ? 
1 : params.depth}; - return GetView(ViewParams(overview_params.target, 0, num_layers, 0, params.num_levels)); - } - - TView Emplace3DView(u32 slice, u32 depth, u32 base_level, u32 num_levels) { - return GetView(ViewParams(VideoCore::Surface::SurfaceTarget::Texture3D, slice, depth, - base_level, num_levels)); - } - - std::optional<TView> EmplaceIrregularView(const SurfaceParams& view_params, - const GPUVAddr view_addr, - const std::size_t candidate_size, const u32 mipmap, - const u32 layer) { - const auto layer_mipmap{GetLayerMipmap(view_addr + candidate_size)}; - if (!layer_mipmap) { - return {}; - } - const auto [end_layer, end_mipmap] = *layer_mipmap; - if (layer != end_layer) { - if (mipmap == 0 && end_mipmap == 0) { - return GetView(ViewParams(view_params.target, layer, end_layer - layer, 0, 1)); - } - return {}; - } else { - return GetView(ViewParams(view_params.target, layer, 1, mipmap, end_mipmap - mipmap)); - } - } - - std::optional<TView> EmplaceView(const SurfaceParams& view_params, const GPUVAddr view_addr, - const std::size_t candidate_size) { - if (params.target == SurfaceTarget::Texture3D || - view_params.target == SurfaceTarget::Texture3D || - (params.num_levels == 1 && !params.is_layered)) { - return {}; - } - const auto layer_mipmap{GetLayerMipmap(view_addr)}; - if (!layer_mipmap) { - return {}; - } - const auto [layer, mipmap] = *layer_mipmap; - if (GetMipmapSize(mipmap) != candidate_size) { - return EmplaceIrregularView(view_params, view_addr, candidate_size, mipmap, layer); - } - return GetView(ViewParams(view_params.target, layer, 1, mipmap, 1)); - } - - TView GetMainView() const { - return main_view; - } - -protected: - explicit SurfaceBase(const GPUVAddr gpu_addr, const SurfaceParams& params, - bool is_astc_supported) - : SurfaceBaseImpl(gpu_addr, params, is_astc_supported) {} - - ~SurfaceBase() = default; - - virtual TView CreateView(const ViewParams& view_key) = 0; - - TView main_view; - std::unordered_map<ViewParams, TView> views; - -private: - TView GetView(const ViewParams& key) { - const auto [entry, is_cache_miss] = views.try_emplace(key); - auto& view{entry->second}; - if (is_cache_miss) { - view = CreateView(key); - } - return view; - } - - static constexpr u32 NO_RT = 0xFFFFFFFF; - - bool is_modified{}; - bool is_target{}; - bool is_registered{}; - bool is_picked{}; - bool is_memory_marked{}; - bool is_sync_pending{}; - u32 index{NO_RT}; - u64 modification_tick{}; -}; - -} // namespace VideoCommon diff --git a/src/video_core/texture_cache/surface_params.cpp b/src/video_core/texture_cache/surface_params.cpp deleted file mode 100644 index e8515321b..000000000 --- a/src/video_core/texture_cache/surface_params.cpp +++ /dev/null @@ -1,444 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. 
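Before moving on to surface_params.cpp, one pattern from the surface_base.h removal above deserves a note: SurfaceBase::GetView is a memoized factory over ViewParams keys, using try_emplace so that the virtual CreateView only runs on a cache miss. A generic sketch of the idiom; the GetOrCreate name and template shape are illustrative, not part of the tree:

#include <unordered_map>

// Look up a view by key, creating and caching it only on a miss.
// Key must be hashable and equality comparable, as ViewParams is through its
// std::hash specialization and operator==.
template <typename Key, typename View, typename Factory>
View GetOrCreate(std::unordered_map<Key, View>& views, const Key& key, Factory&& create) {
    const auto [entry, is_cache_miss] = views.try_emplace(key);
    if (is_cache_miss) {
        entry->second = create(key); // pay for view creation once per key
    }
    return entry->second;
}

The try_emplace shape costs a single hash lookup on both the hit and miss paths, which is why the deleted code preferred it over a separate find-then-insert pair.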
- -#include <algorithm> -#include <string> -#include <tuple> - -#include "common/alignment.h" -#include "common/bit_util.h" -#include "core/core.h" -#include "video_core/engines/shader_bytecode.h" -#include "video_core/surface.h" -#include "video_core/texture_cache/format_lookup_table.h" -#include "video_core/texture_cache/surface_params.h" - -namespace VideoCommon { - -using VideoCore::Surface::PixelFormat; -using VideoCore::Surface::PixelFormatFromDepthFormat; -using VideoCore::Surface::PixelFormatFromRenderTargetFormat; -using VideoCore::Surface::SurfaceTarget; -using VideoCore::Surface::SurfaceTargetFromTextureType; -using VideoCore::Surface::SurfaceType; - -namespace { - -SurfaceTarget TextureTypeToSurfaceTarget(Tegra::Shader::TextureType type, bool is_array) { - switch (type) { - case Tegra::Shader::TextureType::Texture1D: - return is_array ? SurfaceTarget::Texture1DArray : SurfaceTarget::Texture1D; - case Tegra::Shader::TextureType::Texture2D: - return is_array ? SurfaceTarget::Texture2DArray : SurfaceTarget::Texture2D; - case Tegra::Shader::TextureType::Texture3D: - ASSERT(!is_array); - return SurfaceTarget::Texture3D; - case Tegra::Shader::TextureType::TextureCube: - return is_array ? SurfaceTarget::TextureCubeArray : SurfaceTarget::TextureCubemap; - default: - UNREACHABLE(); - return SurfaceTarget::Texture2D; - } -} - -SurfaceTarget ImageTypeToSurfaceTarget(Tegra::Shader::ImageType type) { - switch (type) { - case Tegra::Shader::ImageType::Texture1D: - return SurfaceTarget::Texture1D; - case Tegra::Shader::ImageType::TextureBuffer: - return SurfaceTarget::TextureBuffer; - case Tegra::Shader::ImageType::Texture1DArray: - return SurfaceTarget::Texture1DArray; - case Tegra::Shader::ImageType::Texture2D: - return SurfaceTarget::Texture2D; - case Tegra::Shader::ImageType::Texture2DArray: - return SurfaceTarget::Texture2DArray; - case Tegra::Shader::ImageType::Texture3D: - return SurfaceTarget::Texture3D; - default: - UNREACHABLE(); - return SurfaceTarget::Texture2D; - } -} - -constexpr u32 GetMipmapSize(bool uncompressed, u32 mip_size, u32 tile) { - return uncompressed ? mip_size : std::max(1U, (mip_size + tile - 1) / tile); -} - -} // Anonymous namespace - -SurfaceParams SurfaceParams::CreateForTexture(const FormatLookupTable& lookup_table, - const Tegra::Texture::TICEntry& tic, - const VideoCommon::Shader::Sampler& entry) { - SurfaceParams params; - params.is_tiled = tic.IsTiled(); - params.srgb_conversion = tic.IsSrgbConversionEnabled(); - params.block_width = params.is_tiled ? tic.BlockWidth() : 0; - params.block_height = params.is_tiled ? tic.BlockHeight() : 0; - params.block_depth = params.is_tiled ? tic.BlockDepth() : 0; - params.tile_width_spacing = params.is_tiled ? (1 << tic.tile_width_spacing.Value()) : 1; - params.pixel_format = lookup_table.GetPixelFormat( - tic.format, params.srgb_conversion, tic.r_type, tic.g_type, tic.b_type, tic.a_type); - params.type = GetFormatType(params.pixel_format); - if (entry.is_shadow && params.type == SurfaceType::ColorTexture) { - switch (params.pixel_format) { - case PixelFormat::R16_UNORM: - case PixelFormat::R16_FLOAT: - params.pixel_format = PixelFormat::D16_UNORM; - break; - case PixelFormat::R32_FLOAT: - params.pixel_format = PixelFormat::D32_FLOAT; - break; - default: - UNIMPLEMENTED_MSG("Unimplemented shadow convert format: {}", - static_cast<u32>(params.pixel_format)); - } - params.type = GetFormatType(params.pixel_format); - } - // TODO: on 1DBuffer we should use the tic info. 
- if (tic.IsBuffer()) { - params.target = SurfaceTarget::TextureBuffer; - params.width = tic.Width(); - params.pitch = params.width * params.GetBytesPerPixel(); - params.height = 1; - params.depth = 1; - params.num_levels = 1; - params.emulated_levels = 1; - params.is_layered = false; - } else { - params.target = TextureTypeToSurfaceTarget(entry.type, entry.is_array); - params.width = tic.Width(); - params.height = tic.Height(); - params.depth = tic.Depth(); - params.pitch = params.is_tiled ? 0 : tic.Pitch(); - if (params.target == SurfaceTarget::TextureCubemap || - params.target == SurfaceTarget::TextureCubeArray) { - params.depth *= 6; - } - params.num_levels = tic.max_mip_level + 1; - params.emulated_levels = std::min(params.num_levels, params.MaxPossibleMipmap()); - params.is_layered = params.IsLayered(); - } - return params; -} - -SurfaceParams SurfaceParams::CreateForImage(const FormatLookupTable& lookup_table, - const Tegra::Texture::TICEntry& tic, - const VideoCommon::Shader::Image& entry) { - SurfaceParams params; - params.is_tiled = tic.IsTiled(); - params.srgb_conversion = tic.IsSrgbConversionEnabled(); - params.block_width = params.is_tiled ? tic.BlockWidth() : 0; - params.block_height = params.is_tiled ? tic.BlockHeight() : 0; - params.block_depth = params.is_tiled ? tic.BlockDepth() : 0; - params.tile_width_spacing = params.is_tiled ? (1 << tic.tile_width_spacing.Value()) : 1; - params.pixel_format = lookup_table.GetPixelFormat( - tic.format, params.srgb_conversion, tic.r_type, tic.g_type, tic.b_type, tic.a_type); - params.type = GetFormatType(params.pixel_format); - params.target = ImageTypeToSurfaceTarget(entry.type); - // TODO: on 1DBuffer we should use the tic info. - if (tic.IsBuffer()) { - params.target = SurfaceTarget::TextureBuffer; - params.width = tic.Width(); - params.pitch = params.width * params.GetBytesPerPixel(); - params.height = 1; - params.depth = 1; - params.num_levels = 1; - params.emulated_levels = 1; - params.is_layered = false; - } else { - params.width = tic.Width(); - params.height = tic.Height(); - params.depth = tic.Depth(); - params.pitch = params.is_tiled ? 0 : tic.Pitch(); - if (params.target == SurfaceTarget::TextureCubemap || - params.target == SurfaceTarget::TextureCubeArray) { - params.depth *= 6; - } - params.num_levels = tic.max_mip_level + 1; - params.emulated_levels = std::min(params.num_levels, params.MaxPossibleMipmap()); - params.is_layered = params.IsLayered(); - } - return params; -} - -SurfaceParams SurfaceParams::CreateForDepthBuffer(Tegra::Engines::Maxwell3D& maxwell3d) { - const auto& regs = maxwell3d.regs; - const auto block_depth = std::min(regs.zeta.memory_layout.block_depth.Value(), 5U); - const bool is_layered = regs.zeta_layers > 1 && block_depth == 0; - const auto pixel_format = PixelFormatFromDepthFormat(regs.zeta.format); - return { - .is_tiled = regs.zeta.memory_layout.type == - Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout::BlockLinear, - .srgb_conversion = false, - .is_layered = is_layered, - .block_width = std::min(regs.zeta.memory_layout.block_width.Value(), 5U), - .block_height = std::min(regs.zeta.memory_layout.block_height.Value(), 5U), - .block_depth = block_depth, - .tile_width_spacing = 1, - .width = regs.zeta_width, - .height = regs.zeta_height, - .depth = is_layered ? regs.zeta_layers.Value() : 1U, - .pitch = 0, - .num_levels = 1, - .emulated_levels = 1, - .pixel_format = pixel_format, - .type = GetFormatType(pixel_format), - .target = is_layered ? 
SurfaceTarget::Texture2DArray : SurfaceTarget::Texture2D, - }; -} - -SurfaceParams SurfaceParams::CreateForFramebuffer(Tegra::Engines::Maxwell3D& maxwell3d, - std::size_t index) { - const auto& config{maxwell3d.regs.rt[index]}; - SurfaceParams params; - params.is_tiled = - config.memory_layout.type == Tegra::Engines::Maxwell3D::Regs::InvMemoryLayout::BlockLinear; - params.srgb_conversion = config.format == Tegra::RenderTargetFormat::B8G8R8A8_SRGB || - config.format == Tegra::RenderTargetFormat::A8B8G8R8_SRGB; - params.block_width = config.memory_layout.block_width; - params.block_height = config.memory_layout.block_height; - params.block_depth = config.memory_layout.block_depth; - params.tile_width_spacing = 1; - params.pixel_format = PixelFormatFromRenderTargetFormat(config.format); - params.type = GetFormatType(params.pixel_format); - if (params.is_tiled) { - params.pitch = 0; - params.width = config.width; - } else { - const u32 bpp = GetFormatBpp(params.pixel_format) / CHAR_BIT; - params.pitch = config.width; - params.width = params.pitch / bpp; - } - params.height = config.height; - params.num_levels = 1; - params.emulated_levels = 1; - - if (config.memory_layout.is_3d != 0) { - params.depth = config.layers.Value(); - params.is_layered = false; - params.target = SurfaceTarget::Texture3D; - } else if (config.layers > 1) { - params.depth = config.layers.Value(); - params.is_layered = true; - params.target = SurfaceTarget::Texture2DArray; - } else { - params.depth = 1; - params.is_layered = false; - params.target = SurfaceTarget::Texture2D; - } - return params; -} - -SurfaceParams SurfaceParams::CreateForFermiCopySurface( - const Tegra::Engines::Fermi2D::Regs::Surface& config) { - const bool is_tiled = !config.linear; - const auto pixel_format = PixelFormatFromRenderTargetFormat(config.format); - - SurfaceParams params{ - .is_tiled = is_tiled, - .srgb_conversion = config.format == Tegra::RenderTargetFormat::B8G8R8A8_SRGB || - config.format == Tegra::RenderTargetFormat::A8B8G8R8_SRGB, - .block_width = is_tiled ? std::min(config.BlockWidth(), 5U) : 0U, - .block_height = is_tiled ? std::min(config.BlockHeight(), 5U) : 0U, - .block_depth = is_tiled ? 
std::min(config.BlockDepth(), 5U) : 0U, - .tile_width_spacing = 1, - .width = config.width, - .height = config.height, - .depth = 1, - .pitch = config.pitch, - .num_levels = 1, - .emulated_levels = 1, - .pixel_format = pixel_format, - .type = GetFormatType(pixel_format), - // TODO(Rodrigo): Try to guess texture arrays from parameters - .target = SurfaceTarget::Texture2D, - }; - - params.is_layered = params.IsLayered(); - return params; -} - -VideoCore::Surface::SurfaceTarget SurfaceParams::ExpectedTarget( - const VideoCommon::Shader::Sampler& entry) { - return TextureTypeToSurfaceTarget(entry.type, entry.is_array); -} - -VideoCore::Surface::SurfaceTarget SurfaceParams::ExpectedTarget( - const VideoCommon::Shader::Image& entry) { - return ImageTypeToSurfaceTarget(entry.type); -} - -bool SurfaceParams::IsLayered() const { - switch (target) { - case SurfaceTarget::Texture1DArray: - case SurfaceTarget::Texture2DArray: - case SurfaceTarget::TextureCubemap: - case SurfaceTarget::TextureCubeArray: - return true; - default: - return false; - } -} - -// Auto block resizing algorithm from: -// https://cgit.freedesktop.org/mesa/mesa/tree/src/gallium/drivers/nouveau/nv50/nv50_miptree.c -u32 SurfaceParams::GetMipBlockHeight(u32 level) const { - if (level == 0) { - return this->block_height; - } - - const u32 height_new{GetMipHeight(level)}; - const u32 default_block_height{GetDefaultBlockHeight()}; - const u32 blocks_in_y{(height_new + default_block_height - 1) / default_block_height}; - const u32 block_height_new = Common::Log2Ceil32(blocks_in_y); - return std::clamp(block_height_new, 3U, 7U) - 3U; -} - -u32 SurfaceParams::GetMipBlockDepth(u32 level) const { - if (level == 0) { - return this->block_depth; - } - if (is_layered) { - return 0; - } - - const u32 depth_new{GetMipDepth(level)}; - const u32 block_depth_new = Common::Log2Ceil32(depth_new); - if (block_depth_new > 4) { - return 5 - (GetMipBlockHeight(level) >= 2); - } - return block_depth_new; -} - -std::size_t SurfaceParams::GetGuestMipmapLevelOffset(u32 level) const { - std::size_t offset = 0; - for (u32 i = 0; i < level; i++) { - offset += GetInnerMipmapMemorySize(i, false, false); - } - return offset; -} - -std::size_t SurfaceParams::GetHostMipmapLevelOffset(u32 level, bool is_converted) const { - std::size_t offset = 0; - if (is_converted) { - for (u32 i = 0; i < level; ++i) { - offset += GetConvertedMipmapSize(i) * GetNumLayers(); - } - } else { - for (u32 i = 0; i < level; ++i) { - offset += GetInnerMipmapMemorySize(i, true, false) * GetNumLayers(); - } - } - return offset; -} - -std::size_t SurfaceParams::GetConvertedMipmapSize(u32 level) const { - constexpr std::size_t rgba8_bpp = 4ULL; - const std::size_t mip_width = GetMipWidth(level); - const std::size_t mip_height = GetMipHeight(level); - const std::size_t mip_depth = is_layered ? 
1 : GetMipDepth(level); - return mip_width * mip_height * mip_depth * rgba8_bpp; -} - -std::size_t SurfaceParams::GetLayerSize(bool as_host_size, bool uncompressed) const { - std::size_t size = 0; - for (u32 level = 0; level < num_levels; ++level) { - size += GetInnerMipmapMemorySize(level, as_host_size, uncompressed); - } - if (is_tiled && is_layered) { - return Common::AlignBits(size, Tegra::Texture::GOB_SIZE_SHIFT + block_height + block_depth); - } - return size; -} - -std::size_t SurfaceParams::GetInnerMipmapMemorySize(u32 level, bool as_host_size, - bool uncompressed) const { - const u32 width{GetMipmapSize(uncompressed, GetMipWidth(level), GetDefaultBlockWidth())}; - const u32 height{GetMipmapSize(uncompressed, GetMipHeight(level), GetDefaultBlockHeight())}; - const u32 depth{is_layered ? 1U : GetMipDepth(level)}; - if (is_tiled) { - return Tegra::Texture::CalculateSize(!as_host_size, GetBytesPerPixel(), width, height, - depth, GetMipBlockHeight(level), - GetMipBlockDepth(level)); - } else if (as_host_size || IsBuffer()) { - return GetBytesPerPixel() * width * height * depth; - } else { - // Linear Texture Case - return pitch * height * depth; - } -} - -bool SurfaceParams::operator==(const SurfaceParams& rhs) const { - return std::tie(is_tiled, block_width, block_height, block_depth, tile_width_spacing, width, - height, depth, pitch, num_levels, pixel_format, type, target) == - std::tie(rhs.is_tiled, rhs.block_width, rhs.block_height, rhs.block_depth, - rhs.tile_width_spacing, rhs.width, rhs.height, rhs.depth, rhs.pitch, - rhs.num_levels, rhs.pixel_format, rhs.type, rhs.target); -} - -std::string SurfaceParams::TargetName() const { - switch (target) { - case SurfaceTarget::Texture1D: - return "1D"; - case SurfaceTarget::TextureBuffer: - return "TexBuffer"; - case SurfaceTarget::Texture2D: - return "2D"; - case SurfaceTarget::Texture3D: - return "3D"; - case SurfaceTarget::Texture1DArray: - return "1DArray"; - case SurfaceTarget::Texture2DArray: - return "2DArray"; - case SurfaceTarget::TextureCubemap: - return "Cube"; - case SurfaceTarget::TextureCubeArray: - return "CubeArray"; - default: - LOG_CRITICAL(HW_GPU, "Unimplemented surface_target={}", static_cast<u32>(target)); - UNREACHABLE(); - return fmt::format("TUK({})", static_cast<u32>(target)); - } -} - -u32 SurfaceParams::GetBlockSize() const { - const u32 x = 64U << block_width; - const u32 y = 8U << block_height; - const u32 z = 1U << block_depth; - return x * y * z; -} - -std::pair<u32, u32> SurfaceParams::GetBlockXY() const { - const u32 x_pixels = 64U / GetBytesPerPixel(); - const u32 x = x_pixels << block_width; - const u32 y = 8U << block_height; - return {x, y}; -} - -std::tuple<u32, u32, u32> SurfaceParams::GetBlockOffsetXYZ(u32 offset) const { - const auto div_ceil = [](const u32 x, const u32 y) { return ((x + y - 1) / y); }; - const u32 block_size = GetBlockSize(); - const u32 block_index = offset / block_size; - const u32 gob_offset = offset % block_size; - const u32 gob_index = gob_offset / static_cast<u32>(Tegra::Texture::GOB_SIZE); - const u32 x_gob_pixels = 64U / GetBytesPerPixel(); - const u32 x_block_pixels = x_gob_pixels << block_width; - const u32 y_block_pixels = 8U << block_height; - const u32 z_block_pixels = 1U << block_depth; - const u32 x_blocks = div_ceil(width, x_block_pixels); - const u32 y_blocks = div_ceil(height, y_block_pixels); - const u32 z_blocks = div_ceil(depth, z_block_pixels); - const u32 base_x = block_index % x_blocks; - const u32 base_y = (block_index / x_blocks) % y_blocks; - const u32 
base_z = (block_index / (x_blocks * y_blocks)) % z_blocks; - u32 x = base_x * x_block_pixels; - u32 y = base_y * y_block_pixels; - u32 z = base_z * z_block_pixels; - z += gob_index >> block_height; - y += (gob_index * 8U) % y_block_pixels; - return {x, y, z}; -} - -} // namespace VideoCommon diff --git a/src/video_core/texture_cache/surface_params.h b/src/video_core/texture_cache/surface_params.h deleted file mode 100644 index 4466c3c34..000000000 --- a/src/video_core/texture_cache/surface_params.h +++ /dev/null @@ -1,294 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include <utility> - -#include "common/alignment.h" -#include "common/bit_util.h" -#include "common/cityhash.h" -#include "common/common_types.h" -#include "video_core/engines/fermi_2d.h" -#include "video_core/engines/maxwell_3d.h" -#include "video_core/shader/shader_ir.h" -#include "video_core/surface.h" -#include "video_core/textures/decoders.h" - -namespace VideoCommon { - -class FormatLookupTable; - -class SurfaceParams { -public: - /// Creates SurfaceParams from a texture configuration. - static SurfaceParams CreateForTexture(const FormatLookupTable& lookup_table, - const Tegra::Texture::TICEntry& tic, - const VideoCommon::Shader::Sampler& entry); - - /// Creates SurfaceParams from an image configuration. - static SurfaceParams CreateForImage(const FormatLookupTable& lookup_table, - const Tegra::Texture::TICEntry& tic, - const VideoCommon::Shader::Image& entry); - - /// Creates SurfaceParams for a depth buffer configuration. - static SurfaceParams CreateForDepthBuffer(Tegra::Engines::Maxwell3D& maxwell3d); - - /// Creates SurfaceParams from a framebuffer configuration. - static SurfaceParams CreateForFramebuffer(Tegra::Engines::Maxwell3D& maxwell3d, - std::size_t index); - - /// Creates SurfaceParams from a Fermi2D surface configuration. - static SurfaceParams CreateForFermiCopySurface( - const Tegra::Engines::Fermi2D::Regs::Surface& config); - - /// Obtains the texture target from a shader's sampler entry. - static VideoCore::Surface::SurfaceTarget ExpectedTarget( - const VideoCommon::Shader::Sampler& entry); - - /// Obtains the texture target from a shader's image entry. - static VideoCore::Surface::SurfaceTarget ExpectedTarget( - const VideoCommon::Shader::Image& entry); - - std::size_t Hash() const { - return static_cast<std::size_t>( - Common::CityHash64(reinterpret_cast<const char*>(this), sizeof(*this))); - } - - bool operator==(const SurfaceParams& rhs) const; - - bool operator!=(const SurfaceParams& rhs) const { - return !operator==(rhs); - } - - std::size_t GetGuestSizeInBytes() const { - return GetInnerMemorySize(false, false, false); - } - - std::size_t GetHostSizeInBytes(bool is_converted) const { - if (!is_converted) { - return GetInnerMemorySize(true, false, false); - } - // ASTC is decompressed in software and emulated as RGBA8 - std::size_t host_size_in_bytes = 0; - for (u32 level = 0; level < num_levels; ++level) { - host_size_in_bytes += GetConvertedMipmapSize(level) * GetNumLayers(); - } - return host_size_in_bytes; - } - - u32 GetBlockAlignedWidth() const { - return Common::AlignUp(width, 64 / GetBytesPerPixel()); - } - - /// Returns the width of a given mipmap level. - u32 GetMipWidth(u32 level) const { - return std::max(1U, width >> level); - } - - /// Returns the height of a given mipmap level.
- u32 GetMipHeight(u32 level) const { - return std::max(1U, height >> level); - } - - /// Returns the depth of a given mipmap level. - u32 GetMipDepth(u32 level) const { - return is_layered ? depth : std::max(1U, depth >> level); - } - - /// Returns the block height of a given mipmap level. - u32 GetMipBlockHeight(u32 level) const; - - /// Returns the block depth of a given mipmap level. - u32 GetMipBlockDepth(u32 level) const; - - /// Returns the best possible row/pitch alignment for the surface. - u32 GetRowAlignment(u32 level, bool is_converted) const { - const u32 bpp = is_converted ? 4 : GetBytesPerPixel(); - return 1U << Common::CountTrailingZeroes32(GetMipWidth(level) * bpp); - } - - /// Returns the offset in bytes in guest memory of a given mipmap level. - std::size_t GetGuestMipmapLevelOffset(u32 level) const; - - /// Returns the offset in bytes in host memory (linear) of a given mipmap level. - std::size_t GetHostMipmapLevelOffset(u32 level, bool is_converted) const; - - /// Returns the size in bytes in guest memory of a given mipmap level. - std::size_t GetGuestMipmapSize(u32 level) const { - return GetInnerMipmapMemorySize(level, false, false); - } - - /// Returns the size in bytes in host memory (linear) of a given mipmap level. - std::size_t GetHostMipmapSize(u32 level) const { - return GetInnerMipmapMemorySize(level, true, false) * GetNumLayers(); - } - - std::size_t GetConvertedMipmapSize(u32 level) const; - - /// Get this texture's Tegra block size in the guest memory layout - u32 GetBlockSize() const; - - /// Get the maximum X, Y sizes of a single block. - std::pair<u32, u32> GetBlockXY() const; - - /// Get the offset in x, y, z coordinates from a memory offset - std::tuple<u32, u32, u32> GetBlockOffsetXYZ(u32 offset) const; - - /// Returns the size of a layer in bytes in guest memory. - std::size_t GetGuestLayerSize() const { - return GetLayerSize(false, false); - } - - /// Returns the size of a layer in bytes in host memory for a given mipmap level. - std::size_t GetHostLayerSize(u32 level) const { - ASSERT(target != VideoCore::Surface::SurfaceTarget::Texture3D); - return GetInnerMipmapMemorySize(level, true, false); - } - - /// Returns the maximum possible mipmap count that the texture can have on the host GPU - u32 MaxPossibleMipmap() const { - const u32 max_mipmap_w = Common::Log2Ceil32(width) + 1U; - const u32 max_mipmap_h = Common::Log2Ceil32(height) + 1U; - const u32 max_mipmap = std::max(max_mipmap_w, max_mipmap_h); - if (target != VideoCore::Surface::SurfaceTarget::Texture3D) - return max_mipmap; - return std::max(max_mipmap, Common::Log2Ceil32(depth) + 1U); - } - - /// Returns whether the guest surface is a compressed surface. - bool IsCompressed() const { - return GetDefaultBlockHeight() > 1 || GetDefaultBlockWidth() > 1; - } - - /// Returns the default block width. - u32 GetDefaultBlockWidth() const { - return VideoCore::Surface::GetDefaultBlockWidth(pixel_format); - } - - /// Returns the default block height. - u32 GetDefaultBlockHeight() const { - return VideoCore::Surface::GetDefaultBlockHeight(pixel_format); - } - - /// Returns the bits per pixel. - u32 GetBitsPerPixel() const { - return VideoCore::Surface::GetFormatBpp(pixel_format); - } - - /// Returns the bytes per pixel. - u32 GetBytesPerPixel() const { - return VideoCore::Surface::GetBytesPerPixel(pixel_format); - } - - /// Returns true if the pixel format is a depth and/or stencil format.
- bool IsPixelFormatZeta() const { - return pixel_format >= VideoCore::Surface::PixelFormat::MaxColorFormat && - pixel_format < VideoCore::Surface::PixelFormat::MaxDepthStencilFormat; - } - - /// Returns whether the surface is a TextureBuffer type of surface. - bool IsBuffer() const { - return target == VideoCore::Surface::SurfaceTarget::TextureBuffer; - } - - /// Returns the number of layers in the surface. - std::size_t GetNumLayers() const { - return is_layered ? depth : 1; - } - - /// Returns the debug name of the texture for use in graphic debuggers. - std::string TargetName() const; - - // Helper used for out-of-class size calculations - static std::size_t AlignLayered(const std::size_t out_size, const u32 block_height, - const u32 block_depth) { - return Common::AlignBits(out_size, - Tegra::Texture::GOB_SIZE_SHIFT + block_height + block_depth); - } - - /// Converts a width from a type of surface into another. This helps represent the - /// equivalent value between compressed/non-compressed textures. - static u32 ConvertWidth(u32 width, VideoCore::Surface::PixelFormat pixel_format_from, - VideoCore::Surface::PixelFormat pixel_format_to) { - const u32 bw1 = VideoCore::Surface::GetDefaultBlockWidth(pixel_format_from); - const u32 bw2 = VideoCore::Surface::GetDefaultBlockWidth(pixel_format_to); - return (width * bw2 + bw1 - 1) / bw1; - } - - /// Converts a height from a type of surface into another. This helps represent the - /// equivalent value between compressed/non-compressed textures. - static u32 ConvertHeight(u32 height, VideoCore::Surface::PixelFormat pixel_format_from, - VideoCore::Surface::PixelFormat pixel_format_to) { - const u32 bh1 = VideoCore::Surface::GetDefaultBlockHeight(pixel_format_from); - const u32 bh2 = VideoCore::Surface::GetDefaultBlockHeight(pixel_format_to); - return (height * bh2 + bh1 - 1) / bh1; - } - - // Finds the maximum possible width between two 2D layers of different formats - static u32 IntersectWidth(const SurfaceParams& src_params, const SurfaceParams& dst_params, - const u32 src_level, const u32 dst_level) { - const u32 bw1 = src_params.GetDefaultBlockWidth(); - const u32 bw2 = dst_params.GetDefaultBlockWidth(); - const u32 t_src_width = (src_params.GetMipWidth(src_level) * bw2 + bw1 - 1) / bw1; - const u32 t_dst_width = (dst_params.GetMipWidth(dst_level) * bw1 + bw2 - 1) / bw2; - return std::min(t_src_width, t_dst_width); - } - - // Finds the maximum possible height between two 2D layers of different formats - static u32 IntersectHeight(const SurfaceParams& src_params, const SurfaceParams& dst_params, - const u32 src_level, const u32 dst_level) { - const u32 bh1 = src_params.GetDefaultBlockHeight(); - const u32 bh2 = dst_params.GetDefaultBlockHeight(); - const u32 t_src_height = (src_params.GetMipHeight(src_level) * bh2 + bh1 - 1) / bh1; - const u32 t_dst_height = (dst_params.GetMipHeight(dst_level) * bh1 + bh2 - 1) / bh2; - return std::min(t_src_height, t_dst_height); - } - - bool is_tiled; - bool srgb_conversion; - bool is_layered; - u32 block_width; - u32 block_height; - u32 block_depth; - u32 tile_width_spacing; - u32 width; - u32 height; - u32 depth; - u32 pitch; - u32 num_levels; - u32 emulated_levels; - VideoCore::Surface::PixelFormat pixel_format; - VideoCore::Surface::SurfaceType type; - VideoCore::Surface::SurfaceTarget target; - -private: - /// Returns the size of a given mipmap level inside a layer.
- std::size_t GetInnerMipmapMemorySize(u32 level, bool as_host_size, bool uncompressed) const; - - /// Returns the size of all mipmap levels and aligns as needed. - std::size_t GetInnerMemorySize(bool as_host_size, bool layer_only, bool uncompressed) const { - return GetLayerSize(as_host_size, uncompressed) * - (layer_only ? 1U : (is_layered ? depth : 1U)); - } - - /// Returns the size of a layer - std::size_t GetLayerSize(bool as_host_size, bool uncompressed) const; - - /// Returns true if these parameters are from a layered surface. - bool IsLayered() const; -}; - -} // namespace VideoCommon - -namespace std { - -template <> -struct hash<VideoCommon::SurfaceParams> { - std::size_t operator()(const VideoCommon::SurfaceParams& k) const noexcept { - return k.Hash(); - } -}; - -} // namespace std diff --git a/src/video_core/texture_cache/surface_view.cpp b/src/video_core/texture_cache/surface_view.cpp deleted file mode 100644 index 6b5f5984b..000000000 --- a/src/video_core/texture_cache/surface_view.cpp +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include <tuple> - -#include "common/common_types.h" -#include "video_core/texture_cache/surface_view.h" - -namespace VideoCommon { - -std::size_t ViewParams::Hash() const { - return static_cast<std::size_t>(base_layer) ^ (static_cast<std::size_t>(num_layers) << 16) ^ - (static_cast<std::size_t>(base_level) << 24) ^ - (static_cast<std::size_t>(num_levels) << 32) ^ (static_cast<std::size_t>(target) << 36); -} - -bool ViewParams::operator==(const ViewParams& rhs) const { - return std::tie(base_layer, num_layers, base_level, num_levels, target) == - std::tie(rhs.base_layer, rhs.num_layers, rhs.base_level, rhs.num_levels, rhs.target); -} - -bool ViewParams::operator!=(const ViewParams& rhs) const { - return !operator==(rhs); -} - -} // namespace VideoCommon diff --git a/src/video_core/texture_cache/surface_view.h b/src/video_core/texture_cache/surface_view.h deleted file mode 100644 index 90a8bb0ae..000000000 --- a/src/video_core/texture_cache/surface_view.h +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. 
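One piece of arithmetic from the surface_params.h removal above is easy to misread: ConvertWidth and ConvertHeight rescale a dimension between formats with different compression block sizes, rounding up so partial blocks still get storage. A small self-checking sketch with a hypothetical free-function signature (the tree's version takes PixelFormats and looks the block widths up):

#include <cstdint>

// (width * bw_to + bw_from - 1) / bw_from: convert a width measured in
// from-format texels into the equivalent to-format width, rounding up.
constexpr uint32_t ConvertWidth(uint32_t width, uint32_t bw_from, uint32_t bw_to) {
    return (width * bw_to + bw_from - 1) / bw_from;
}

// A 256-texel row of a BC-compressed format (4-texel block width) matches a
// 64-texel row of an uncompressed format, and vice versa; partial blocks
// round up rather than truncate.
static_assert(ConvertWidth(256, 4, 1) == 64);
static_assert(ConvertWidth(64, 1, 4) == 256);
static_assert(ConvertWidth(3, 4, 1) == 1);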
- -#pragma once - -#include <functional> - -#include "common/common_types.h" -#include "video_core/surface.h" -#include "video_core/texture_cache/surface_params.h" - -namespace VideoCommon { - -struct ViewParams { - constexpr explicit ViewParams(VideoCore::Surface::SurfaceTarget target, u32 base_layer, - u32 num_layers, u32 base_level, u32 num_levels) - : target{target}, base_layer{base_layer}, num_layers{num_layers}, base_level{base_level}, - num_levels{num_levels} {} - - std::size_t Hash() const; - - bool operator==(const ViewParams& rhs) const; - bool operator!=(const ViewParams& rhs) const; - - bool IsLayered() const { - switch (target) { - case VideoCore::Surface::SurfaceTarget::Texture1DArray: - case VideoCore::Surface::SurfaceTarget::Texture2DArray: - case VideoCore::Surface::SurfaceTarget::TextureCubemap: - case VideoCore::Surface::SurfaceTarget::TextureCubeArray: - return true; - default: - return false; - } - } - - VideoCore::Surface::SurfaceTarget target{}; - u32 base_layer{}; - u32 num_layers{}; - u32 base_level{}; - u32 num_levels{}; -}; - -class ViewBase { -public: - constexpr explicit ViewBase(const ViewParams& params) : params{params} {} - - constexpr const ViewParams& GetViewParams() const { - return params; - } - -protected: - ViewParams params; -}; - -} // namespace VideoCommon - -namespace std { - -template <> -struct hash<VideoCommon::ViewParams> { - std::size_t operator()(const VideoCommon::ViewParams& k) const noexcept { - return k.Hash(); - } -}; - -} // namespace std diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index ea835c59f..d1080300f 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -6,1299 +6,1454 @@ #include <algorithm> #include <array> -#include <list> +#include <bit> #include <memory> #include <mutex> -#include <set> -#include <tuple> +#include <optional> +#include <span> +#include <type_traits> #include <unordered_map> +#include <utility> #include <vector> #include <boost/container/small_vector.hpp> -#include <boost/icl/interval_map.hpp> -#include <boost/range/iterator_range.hpp> -#include "common/assert.h" +#include "common/alignment.h" +#include "common/common_funcs.h" #include "common/common_types.h" -#include "common/math_util.h" -#include "core/core.h" -#include "core/memory.h" -#include "core/settings.h" +#include "common/logging/log.h" #include "video_core/compatible_formats.h" +#include "video_core/delayed_destruction_ring.h" #include "video_core/dirty_flags.h" #include "video_core/engines/fermi_2d.h" +#include "video_core/engines/kepler_compute.h" #include "video_core/engines/maxwell_3d.h" -#include "video_core/gpu.h" #include "video_core/memory_manager.h" #include "video_core/rasterizer_interface.h" #include "video_core/surface.h" -#include "video_core/texture_cache/copy_params.h" +#include "video_core/texture_cache/descriptor_table.h" #include "video_core/texture_cache/format_lookup_table.h" -#include "video_core/texture_cache/surface_base.h" -#include "video_core/texture_cache/surface_params.h" -#include "video_core/texture_cache/surface_view.h" - -namespace Tegra::Texture { -struct FullTextureInfo; -} - -namespace VideoCore { -class RasterizerInterface; -} +#include "video_core/texture_cache/formatter.h" +#include "video_core/texture_cache/image_base.h" +#include "video_core/texture_cache/image_info.h" +#include "video_core/texture_cache/image_view_base.h" +#include "video_core/texture_cache/image_view_info.h" +#include 
"video_core/texture_cache/render_targets.h" +#include "video_core/texture_cache/samples_helper.h" +#include "video_core/texture_cache/slot_vector.h" +#include "video_core/texture_cache/types.h" +#include "video_core/texture_cache/util.h" +#include "video_core/textures/texture.h" namespace VideoCommon { -using VideoCore::Surface::FormatCompatibility; +using Tegra::Texture::SwizzleSource; +using Tegra::Texture::TextureType; +using Tegra::Texture::TICEntry; +using Tegra::Texture::TSCEntry; +using VideoCore::Surface::GetFormatType; +using VideoCore::Surface::IsCopyCompatible; using VideoCore::Surface::PixelFormat; -using VideoCore::Surface::SurfaceTarget; -using RenderTargetConfig = Tegra::Engines::Maxwell3D::Regs::RenderTargetConfig; +using VideoCore::Surface::PixelFormatFromDepthFormat; +using VideoCore::Surface::PixelFormatFromRenderTargetFormat; +using VideoCore::Surface::SurfaceType; -template <typename TSurface, typename TView> +template <class P> class TextureCache { - using VectorSurface = boost::container::small_vector<TSurface, 1>; + /// Address shift for caching images into a hash table + static constexpr u64 PAGE_BITS = 20; + + /// Enables debugging features to the texture cache + static constexpr bool ENABLE_VALIDATION = P::ENABLE_VALIDATION; + /// Implement blits as copies between framebuffers + static constexpr bool FRAMEBUFFER_BLITS = P::FRAMEBUFFER_BLITS; + /// True when some copies have to be emulated + static constexpr bool HAS_EMULATED_COPIES = P::HAS_EMULATED_COPIES; + + /// Image view ID for null descriptors + static constexpr ImageViewId NULL_IMAGE_VIEW_ID{0}; + /// Sampler ID for bugged sampler ids + static constexpr SamplerId NULL_SAMPLER_ID{0}; + + using Runtime = typename P::Runtime; + using Image = typename P::Image; + using ImageAlloc = typename P::ImageAlloc; + using ImageView = typename P::ImageView; + using Sampler = typename P::Sampler; + using Framebuffer = typename P::Framebuffer; + + struct BlitImages { + ImageId dst_id; + ImageId src_id; + PixelFormat dst_format; + PixelFormat src_format; + }; + + template <typename T> + struct IdentityHash { + [[nodiscard]] size_t operator()(T value) const noexcept { + return static_cast<size_t>(value); + } + }; public: - void InvalidateRegion(VAddr addr, std::size_t size) { - std::lock_guard lock{mutex}; + explicit TextureCache(Runtime&, VideoCore::RasterizerInterface&, Tegra::Engines::Maxwell3D&, + Tegra::Engines::KeplerCompute&, Tegra::MemoryManager&); - for (const auto& surface : GetSurfacesInRegion(addr, size)) { - Unregister(surface); - } - } + /// Notify the cache that a new frame has been queued + void TickFrame(); - void OnCPUWrite(VAddr addr, std::size_t size) { - std::lock_guard lock{mutex}; + /// Return an unique mutually exclusive lock for the cache + [[nodiscard]] std::unique_lock<std::mutex> AcquireLock(); - for (const auto& surface : GetSurfacesInRegion(addr, size)) { - if (surface->IsMemoryMarked()) { - UnmarkMemory(surface); - surface->SetSyncPending(true); - marked_for_unregister.emplace_back(surface); - } - } - } + /// Return a constant reference to the given image view id + [[nodiscard]] const ImageView& GetImageView(ImageViewId id) const noexcept; - void SyncGuestHost() { - std::lock_guard lock{mutex}; + /// Return a reference to the given image view id + [[nodiscard]] ImageView& GetImageView(ImageViewId id) noexcept; - for (const auto& surface : marked_for_unregister) { - if (surface->IsRegistered()) { - surface->SetSyncPending(false); - Unregister(surface); - } - } - marked_for_unregister.clear(); - 
} + /// Fill image_view_ids with the graphics images in indices + void FillGraphicsImageViews(std::span<const u32> indices, + std::span<ImageViewId> image_view_ids); - /** - * Guarantees that render targets don't unregister themselves if they - * collide. Protection is currently only done on 3D slices. - */ - void GuardRenderTargets(bool new_guard) { - guard_render_targets = new_guard; - } + /// Fill image_view_ids with the compute images in indices + void FillComputeImageViews(std::span<const u32> indices, std::span<ImageViewId> image_view_ids); - void GuardSamplers(bool new_guard) { - guard_samplers = new_guard; - } + /// Get the sampler from the graphics descriptor table in the specified index + Sampler* GetGraphicsSampler(u32 index); - void FlushRegion(VAddr addr, std::size_t size) { - std::lock_guard lock{mutex}; + /// Get the sampler from the compute descriptor table in the specified index + Sampler* GetComputeSampler(u32 index); - auto surfaces = GetSurfacesInRegion(addr, size); - if (surfaces.empty()) { - return; - } - std::sort(surfaces.begin(), surfaces.end(), [](const TSurface& a, const TSurface& b) { - return a->GetModificationTick() < b->GetModificationTick(); - }); - for (const auto& surface : surfaces) { - mutex.unlock(); - FlushSurface(surface); - mutex.lock(); - } - } + /// Refresh the state for graphics image view and sampler descriptors + void SynchronizeGraphicsDescriptors(); - bool MustFlushRegion(VAddr addr, std::size_t size) { - std::lock_guard lock{mutex}; + /// Refresh the state for compute image view and sampler descriptors + void SynchronizeComputeDescriptors(); - const auto surfaces = GetSurfacesInRegion(addr, size); - return std::any_of(surfaces.cbegin(), surfaces.cend(), - [](const TSurface& surface) { return surface->IsModified(); }); - } + /// Update bound render targets and upload memory if necessary + /// @param is_clear True when the render targets are being used for clears + void UpdateRenderTargets(bool is_clear); - TView GetTextureSurface(const Tegra::Texture::TICEntry& tic, - const VideoCommon::Shader::Sampler& entry) { - std::lock_guard lock{mutex}; - const auto gpu_addr{tic.Address()}; - if (!gpu_addr) { - return GetNullSurface(SurfaceParams::ExpectedTarget(entry)); - } + /// Find a framebuffer with the currently bound render targets + /// UpdateRenderTargets should be called before this + Framebuffer* GetFramebuffer(); - const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); - if (!cpu_addr) { - return GetNullSurface(SurfaceParams::ExpectedTarget(entry)); - } + /// Mark images in a range as modified from the CPU + void WriteMemory(VAddr cpu_addr, size_t size); - if (!IsTypeCompatible(tic.texture_type, entry)) { - return GetNullSurface(SurfaceParams::ExpectedTarget(entry)); - } + /// Download contents of host images to guest memory in a region + void DownloadMemory(VAddr cpu_addr, size_t size); - const auto params{SurfaceParams::CreateForTexture(format_lookup_table, tic, entry)}; - const auto [surface, view] = GetSurface(gpu_addr, *cpu_addr, params, true, false); - if (guard_samplers) { - sampled_textures.push_back(surface); - } - return view; - } + /// Remove images in a region + void UnmapMemory(VAddr cpu_addr, size_t size); - TView GetImageSurface(const Tegra::Texture::TICEntry& tic, - const VideoCommon::Shader::Image& entry) { - std::lock_guard lock{mutex}; - const auto gpu_addr{tic.Address()}; - if (!gpu_addr) { - return GetNullSurface(SurfaceParams::ExpectedTarget(entry)); - } - const std::optional<VAddr> cpu_addr =
gpu_memory.GpuToCpuAddress(gpu_addr); - if (!cpu_addr) { - return GetNullSurface(SurfaceParams::ExpectedTarget(entry)); - } - const auto params{SurfaceParams::CreateForImage(format_lookup_table, tic, entry)}; - const auto [surface, view] = GetSurface(gpu_addr, *cpu_addr, params, true, false); - if (guard_samplers) { - sampled_textures.push_back(surface); - } - return view; - } + /// Blit an image with the given parameters + void BlitImage(const Tegra::Engines::Fermi2D::Surface& dst, + const Tegra::Engines::Fermi2D::Surface& src, + const Tegra::Engines::Fermi2D::Config& copy); - bool TextureBarrier() { - const bool any_rt = - std::any_of(sampled_textures.begin(), sampled_textures.end(), - [](const auto& surface) { return surface->IsRenderTarget(); }); - sampled_textures.clear(); - return any_rt; - } + /// Invalidate the contents of the color buffer index + /// These contents become unspecified; the cache can assume aggressive optimizations. + void InvalidateColorBuffer(size_t index); - TView GetDepthBufferSurface(bool preserve_contents) { - std::lock_guard lock{mutex}; - auto& dirty = maxwell3d.dirty; - if (!dirty.flags[VideoCommon::Dirty::ZetaBuffer]) { - return depth_buffer.view; - } - dirty.flags[VideoCommon::Dirty::ZetaBuffer] = false; + /// Invalidate the contents of the depth buffer + /// These contents become unspecified; the cache can assume aggressive optimizations. + void InvalidateDepthBuffer(); - const auto& regs{maxwell3d.regs}; - const auto gpu_addr{regs.zeta.Address()}; - if (!gpu_addr || !regs.zeta_enable) { - SetEmptyDepthBuffer(); - return {}; - } - const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); - if (!cpu_addr) { - SetEmptyDepthBuffer(); - return {}; - } - const auto depth_params{SurfaceParams::CreateForDepthBuffer(maxwell3d)}; - auto surface_view = GetSurface(gpu_addr, *cpu_addr, depth_params, preserve_contents, true); - if (depth_buffer.target) - depth_buffer.target->MarkAsRenderTarget(false, NO_RT); - depth_buffer.target = surface_view.first; - depth_buffer.view = surface_view.second; - if (depth_buffer.target) - depth_buffer.target->MarkAsRenderTarget(true, DEPTH_RT); - return surface_view.second; - } - - TView GetColorBufferSurface(std::size_t index, bool preserve_contents) { - std::lock_guard lock{mutex}; - ASSERT(index < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets); - if (!maxwell3d.dirty.flags[VideoCommon::Dirty::ColorBuffer0 + index]) { - return render_targets[index].view; - } - maxwell3d.dirty.flags[VideoCommon::Dirty::ColorBuffer0 + index] = false; + /// Try to find a cached image view at the given CPU address + [[nodiscard]] ImageView* TryFindFramebufferImageView(VAddr cpu_addr); - const auto& regs{maxwell3d.regs}; - if (index >= regs.rt_control.count || regs.rt[index].Address() == 0 || - regs.rt[index].format == Tegra::RenderTargetFormat::NONE) { - SetEmptyColorBuffer(index); - return {}; - } + /// Return true when there are uncommitted images to be downloaded + [[nodiscard]] bool HasUncommittedFlushes() const noexcept; - const auto& config{regs.rt[index]}; - const auto gpu_addr{config.Address()}; - if (!gpu_addr) { - SetEmptyColorBuffer(index); - return {}; - } + /// Return true when the caller should wait for async downloads + [[nodiscard]] bool ShouldWaitAsyncFlushes() const noexcept; - const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); - if (!cpu_addr) { - SetEmptyColorBuffer(index); - return {}; - } + /// Commit asynchronous downloads + void CommitAsyncFlushes(); + + /// Pop asynchronous
downloads + void PopAsyncFlushes(); + + /// Return true when a CPU region is modified from the GPU + [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size); - auto surface_view = - GetSurface(gpu_addr, *cpu_addr, SurfaceParams::CreateForFramebuffer(maxwell3d, index), - preserve_contents, true); - if (render_targets[index].target) { - auto& surface = render_targets[index].target; - surface->MarkAsRenderTarget(false, NO_RT); - const auto& cr_params = surface->GetSurfaceParams(); - if (!cr_params.is_tiled && Settings::values.use_asynchronous_gpu_emulation.GetValue()) { - AsyncFlushSurface(surface); +private: + /// Iterate over all page indices in a range + template <typename Func> + static void ForEachPage(VAddr addr, size_t size, Func&& func) { + static constexpr bool RETURNS_BOOL = std::is_same_v<std::invoke_result_t<Func, u64>, bool>; + const u64 page_end = (addr + size - 1) >> PAGE_BITS; + for (u64 page = addr >> PAGE_BITS; page <= page_end; ++page) { + if constexpr (RETURNS_BOOL) { + if (func(page)) { + break; + } + } else { + func(page); } } - render_targets[index].target = surface_view.first; - render_targets[index].view = surface_view.second; - if (render_targets[index].target) - render_targets[index].target->MarkAsRenderTarget(true, static_cast<u32>(index)); - return surface_view.second; } - void MarkColorBufferInUse(std::size_t index) { - if (auto& render_target = render_targets[index].target) { - render_target->MarkAsModified(true, Tick()); - } - } + /// Fill image_view_ids with the image views in indices + void FillImageViews(DescriptorTable<TICEntry>& table, + std::span<ImageViewId> cached_image_view_ids, std::span<const u32> indices, + std::span<ImageViewId> image_view_ids); - void MarkDepthBufferInUse() { - if (depth_buffer.target) { - depth_buffer.target->MarkAsModified(true, Tick()); - } - } + /// Find or create an image view in the guest descriptor table + ImageViewId VisitImageView(DescriptorTable<TICEntry>& table, + std::span<ImageViewId> cached_image_view_ids, u32 index); - void SetEmptyDepthBuffer() { - if (depth_buffer.target == nullptr) { - return; - } - depth_buffer.target->MarkAsRenderTarget(false, NO_RT); - depth_buffer.target = nullptr; - depth_buffer.view = nullptr; - } + /// Find or create a framebuffer with the given render target parameters + FramebufferId GetFramebufferId(const RenderTargets& key); - void SetEmptyColorBuffer(std::size_t index) { - if (render_targets[index].target == nullptr) { - return; - } - render_targets[index].target->MarkAsRenderTarget(false, NO_RT); - render_targets[index].target = nullptr; - render_targets[index].view = nullptr; - } - - void DoFermiCopy(const Tegra::Engines::Fermi2D::Regs::Surface& src_config, - const Tegra::Engines::Fermi2D::Regs::Surface& dst_config, - const Tegra::Engines::Fermi2D::Config& copy_config) { - std::lock_guard lock{mutex}; - SurfaceParams src_params = SurfaceParams::CreateForFermiCopySurface(src_config); - SurfaceParams dst_params = SurfaceParams::CreateForFermiCopySurface(dst_config); - const GPUVAddr src_gpu_addr = src_config.Address(); - const GPUVAddr dst_gpu_addr = dst_config.Address(); - DeduceBestBlit(src_params, dst_params, src_gpu_addr, dst_gpu_addr); - - const std::optional<VAddr> dst_cpu_addr = gpu_memory.GpuToCpuAddress(dst_gpu_addr); - const std::optional<VAddr> src_cpu_addr = gpu_memory.GpuToCpuAddress(src_gpu_addr); - std::pair dst_surface = GetSurface(dst_gpu_addr, *dst_cpu_addr, dst_params, true, false); - TView src_surface = GetSurface(src_gpu_addr, *src_cpu_addr, src_params, true,
false).second; - ImageBlit(src_surface, dst_surface.second, copy_config); - dst_surface.first->MarkAsModified(true, Tick()); - } - - TSurface TryFindFramebufferSurface(VAddr addr) const { - if (!addr) { - return nullptr; - } - const VAddr page = addr >> registry_page_bits; - const auto it = registry.find(page); - if (it == registry.end()) { - return nullptr; - } - const auto& list = it->second; - const auto found = std::find_if(list.begin(), list.end(), [addr](const auto& surface) { - return surface->GetCpuAddr() == addr; - }); - return found != list.end() ? *found : nullptr; - } + /// Refresh the contents (pixel data) of an image + void RefreshContents(Image& image); - u64 Tick() { - return ++ticks; - } + /// Upload data from guest to an image + template <typename MapBuffer> + void UploadImageContents(Image& image, MapBuffer& map, size_t buffer_offset); - void CommitAsyncFlushes() { - committed_flushes.push_back(uncommitted_flushes); - uncommitted_flushes.reset(); - } + /// Find or create an image view from a guest descriptor + [[nodiscard]] ImageViewId FindImageView(const TICEntry& config); - bool HasUncommittedFlushes() const { - return uncommitted_flushes != nullptr; - } + /// Create a new image view from a guest descriptor + [[nodiscard]] ImageViewId CreateImageView(const TICEntry& config); - bool ShouldWaitAsyncFlushes() const { - return !committed_flushes.empty() && committed_flushes.front() != nullptr; - } + /// Find or create an image from the given parameters + [[nodiscard]] ImageId FindOrInsertImage(const ImageInfo& info, GPUVAddr gpu_addr, + RelaxedOptions options = RelaxedOptions{}); - void PopAsyncFlushes() { - if (committed_flushes.empty()) { - return; - } - auto& flush_list = committed_flushes.front(); - if (!flush_list) { - committed_flushes.pop_front(); - return; - } - for (TSurface& surface : *flush_list) { - FlushSurface(surface); - } - committed_flushes.pop_front(); - } + /// Find an image from the given parameters + [[nodiscard]] ImageId FindImage(const ImageInfo& info, GPUVAddr gpu_addr, + RelaxedOptions options); -protected: - explicit TextureCache(VideoCore::RasterizerInterface& rasterizer_, - Tegra::Engines::Maxwell3D& maxwell3d_, Tegra::MemoryManager& gpu_memory_, - bool is_astc_supported_) - : is_astc_supported{is_astc_supported_}, rasterizer{rasterizer_}, maxwell3d{maxwell3d_}, - gpu_memory{gpu_memory_} { - for (std::size_t i = 0; i < Tegra::Engines::Maxwell3D::Regs::NumRenderTargets; i++) { - SetEmptyColorBuffer(i); - } + /// Create an image from the given parameters + [[nodiscard]] ImageId InsertImage(const ImageInfo& info, GPUVAddr gpu_addr, + RelaxedOptions options); - SetEmptyDepthBuffer(); - staging_cache.SetSize(2); + /// Create a new image and join perfectly matching existing images + /// Remove joined images from the cache + [[nodiscard]] ImageId JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VAddr cpu_addr); - const auto make_siblings = [this](PixelFormat a, PixelFormat b) { - siblings_table[static_cast<std::size_t>(a)] = b; - siblings_table[static_cast<std::size_t>(b)] = a; - }; - std::fill(siblings_table.begin(), siblings_table.end(), PixelFormat::Invalid); - make_siblings(PixelFormat::D16_UNORM, PixelFormat::R16_UNORM); - make_siblings(PixelFormat::D32_FLOAT, PixelFormat::R32_FLOAT); - make_siblings(PixelFormat::D32_FLOAT_S8_UINT, PixelFormat::R32G32_FLOAT); + /// Return a blit image pair from the given guest blit parameters + [[nodiscard]] BlitImages GetBlitImages(const Tegra::Engines::Fermi2D::Surface& dst, + const 
Tegra::Engines::Fermi2D::Surface& src); - sampled_textures.reserve(64); - } + /// Find or create a sampler from a guest sampler descriptor + [[nodiscard]] SamplerId FindSampler(const TSCEntry& config); - ~TextureCache() = default; + /// Find or create an image view for the given color buffer index + [[nodiscard]] ImageViewId FindColorBuffer(size_t index, bool is_clear); - virtual TSurface CreateSurface(GPUVAddr gpu_addr, const SurfaceParams& params) = 0; + /// Find or create an image view for the depth buffer + [[nodiscard]] ImageViewId FindDepthBuffer(bool is_clear); - virtual void ImageCopy(TSurface& src_surface, TSurface& dst_surface, - const CopyParams& copy_params) = 0; + /// Find or create a view for a render target with the given image parameters + [[nodiscard]] ImageViewId FindRenderTargetView(const ImageInfo& info, GPUVAddr gpu_addr, + bool is_clear); - virtual void ImageBlit(TView& src_view, TView& dst_view, - const Tegra::Engines::Fermi2D::Config& copy_config) = 0; + /// Iterate over all the images in a region, calling func on each + template <typename Func> + void ForEachImageInRegion(VAddr cpu_addr, size_t size, Func&& func); - // Depending on the backend, a buffer copy can be slow as it means deoptimizing the texture - // and reading it from a separate buffer. - virtual void BufferCopy(TSurface& src_surface, TSurface& dst_surface) = 0; + /// Find or create an image view in the given image with the passed parameters + [[nodiscard]] ImageViewId FindOrEmplaceImageView(ImageId image_id, const ImageViewInfo& info); - void ManageRenderTargetUnregister(TSurface& surface) { - auto& dirty = maxwell3d.dirty; - const u32 index = surface->GetRenderTarget(); - if (index == DEPTH_RT) { - dirty.flags[VideoCommon::Dirty::ZetaBuffer] = true; - } else { - dirty.flags[VideoCommon::Dirty::ColorBuffer0 + index] = true; - } - dirty.flags[VideoCommon::Dirty::RenderTargets] = true; + /// Register image in the page table + void RegisterImage(ImageId image); + + /// Unregister image from the page table + void UnregisterImage(ImageId image); + + /// Track CPU reads and writes for image + void TrackImage(ImageBase& image); + + /// Stop tracking CPU reads and writes for image + void UntrackImage(ImageBase& image); + + /// Delete image from the cache + void DeleteImage(ImageId image); + + /// Remove image view references from the cache + void RemoveImageViewReferences(std::span<const ImageViewId> removed_views); + + /// Remove framebuffers using the given image views from the cache + void RemoveFramebuffers(std::span<const ImageViewId> removed_views); + + /// Mark an image as modified from the GPU + void MarkModification(ImageBase& image) noexcept; + + /// Synchronize image aliases, copying data if needed + void SynchronizeAliases(ImageId image_id); + + /// Prepare an image to be used + void PrepareImage(ImageId image_id, bool is_modification, bool invalidate); + + /// Prepare an image view to be used + void PrepareImageView(ImageViewId image_view_id, bool is_modification, bool invalidate); + + /// Execute copies from one image to the other, even if they are incompatible + void CopyImage(ImageId dst_id, ImageId src_id, std::span<const ImageCopy> copies); + + /// Bind an image view as render target, downloading resources preemptively if needed + void BindRenderTarget(ImageViewId* old_id, ImageViewId new_id); + + /// Create a render target from a given image and image view parameters + [[nodiscard]] std::pair<FramebufferId, ImageViewId> RenderTargetFromImage( + ImageId image_id, const ImageViewInfo& view_info); + + ///
Returns true if the current clear parameters clear the whole image of a given image view + [[nodiscard]] bool IsFullClear(ImageViewId id); + + Runtime& runtime; + VideoCore::RasterizerInterface& rasterizer; + Tegra::Engines::Maxwell3D& maxwell3d; + Tegra::Engines::KeplerCompute& kepler_compute; + Tegra::MemoryManager& gpu_memory; + + DescriptorTable<TICEntry> graphics_image_table{gpu_memory}; + DescriptorTable<TSCEntry> graphics_sampler_table{gpu_memory}; + std::vector<SamplerId> graphics_sampler_ids; + std::vector<ImageViewId> graphics_image_view_ids; + + DescriptorTable<TICEntry> compute_image_table{gpu_memory}; + DescriptorTable<TSCEntry> compute_sampler_table{gpu_memory}; + std::vector<SamplerId> compute_sampler_ids; + std::vector<ImageViewId> compute_image_view_ids; + + RenderTargets render_targets; + + std::mutex mutex; + + std::unordered_map<TICEntry, ImageViewId> image_views; + std::unordered_map<TSCEntry, SamplerId> samplers; + std::unordered_map<RenderTargets, FramebufferId> framebuffers; + + std::unordered_map<u64, std::vector<ImageId>, IdentityHash<u64>> page_table; + + bool has_deleted_images = false; + + SlotVector<Image> slot_images; + SlotVector<ImageView> slot_image_views; + SlotVector<ImageAlloc> slot_image_allocs; + SlotVector<Sampler> slot_samplers; + SlotVector<Framebuffer> slot_framebuffers; + + // TODO: This data structure is not optimal and it should be reworked + std::vector<ImageId> uncommitted_downloads; + std::queue<std::vector<ImageId>> committed_downloads; + + static constexpr size_t TICKS_TO_DESTROY = 6; + DelayedDestructionRing<Image, TICKS_TO_DESTROY> sentenced_images; + DelayedDestructionRing<ImageView, TICKS_TO_DESTROY> sentenced_image_view; + DelayedDestructionRing<Framebuffer, TICKS_TO_DESTROY> sentenced_framebuffers; + + std::unordered_map<GPUVAddr, ImageAllocId> image_allocs_table; + + u64 modification_tick = 0; + u64 frame_tick = 0; +}; + +template <class P> +TextureCache<P>::TextureCache(Runtime& runtime_, VideoCore::RasterizerInterface& rasterizer_, + Tegra::Engines::Maxwell3D& maxwell3d_, + Tegra::Engines::KeplerCompute& kepler_compute_, + Tegra::MemoryManager& gpu_memory_) + : runtime{runtime_}, rasterizer{rasterizer_}, maxwell3d{maxwell3d_}, + kepler_compute{kepler_compute_}, gpu_memory{gpu_memory_} { + // Configure null sampler + TSCEntry sampler_descriptor{}; + sampler_descriptor.min_filter.Assign(Tegra::Texture::TextureFilter::Linear); + sampler_descriptor.mag_filter.Assign(Tegra::Texture::TextureFilter::Linear); + sampler_descriptor.mipmap_filter.Assign(Tegra::Texture::TextureMipmapFilter::Linear); + sampler_descriptor.cubemap_anisotropy.Assign(1); + + // Make sure the first index is reserved for the null resources + // This way the null resource becomes a compile time constant + void(slot_image_views.insert(runtime, NullImageParams{})); + void(slot_samplers.insert(runtime, sampler_descriptor)); +} + +template <class P> +void TextureCache<P>::TickFrame() { + // Tick sentenced resources in this order to ensure they are destroyed in the right order + sentenced_images.Tick(); + sentenced_framebuffers.Tick(); + sentenced_image_view.Tick(); + ++frame_tick; +} + +template <class P> +std::unique_lock<std::mutex> TextureCache<P>::AcquireLock() { + return std::unique_lock{mutex}; +} + +template <class P> +const typename P::ImageView& TextureCache<P>::GetImageView(ImageViewId id) const noexcept { + return slot_image_views[id]; +} + +template <class P> +typename P::ImageView& TextureCache<P>::GetImageView(ImageViewId id) noexcept { + return 
slot_image_views[id]; +} + +template <class P> +void TextureCache<P>::FillGraphicsImageViews(std::span<const u32> indices, + std::span<ImageViewId> image_view_ids) { + FillImageViews(graphics_image_table, graphics_image_view_ids, indices, image_view_ids); +} + +template <class P> +void TextureCache<P>::FillComputeImageViews(std::span<const u32> indices, + std::span<ImageViewId> image_view_ids) { + FillImageViews(compute_image_table, compute_image_view_ids, indices, image_view_ids); +} + +template <class P> +typename P::Sampler* TextureCache<P>::GetGraphicsSampler(u32 index) { + if (index > graphics_sampler_table.Limit()) [[unlikely]] { + LOG_ERROR(HW_GPU, "Invalid sampler index={}", index); + return &slot_samplers[NULL_SAMPLER_ID]; + } + const auto [descriptor, is_new] = graphics_sampler_table.Read(index); + SamplerId& id = graphics_sampler_ids[index]; + if (is_new) [[unlikely]] { + id = FindSampler(descriptor); } + return &slot_samplers[id]; +} - void Register(TSurface surface) { - const GPUVAddr gpu_addr = surface->GetGpuAddr(); - const std::size_t size = surface->GetSizeInBytes(); - const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); - if (!cpu_addr) { - LOG_CRITICAL(HW_GPU, "Failed to register surface with unmapped gpu_address 0x{:016x}", - gpu_addr); - return; - } - surface->SetCpuAddr(*cpu_addr); - RegisterInnerCache(surface); - surface->MarkAsRegistered(true); - surface->SetMemoryMarked(true); - rasterizer.UpdatePagesCachedCount(*cpu_addr, size, 1); +template <class P> +typename P::Sampler* TextureCache<P>::GetComputeSampler(u32 index) { + if (index > compute_sampler_table.Limit()) [[unlikely]] { + LOG_ERROR(HW_GPU, "Invalid sampler index={}", index); + return &slot_samplers[NULL_SAMPLER_ID]; + } + const auto [descriptor, is_new] = compute_sampler_table.Read(index); + SamplerId& id = compute_sampler_ids[index]; + if (is_new) [[unlikely]] { + id = FindSampler(descriptor); } + return &slot_samplers[id]; +} - void UnmarkMemory(TSurface surface) { - if (!surface->IsMemoryMarked()) { - return; - } - const std::size_t size = surface->GetSizeInBytes(); - const VAddr cpu_addr = surface->GetCpuAddr(); - rasterizer.UpdatePagesCachedCount(cpu_addr, size, -1); - surface->SetMemoryMarked(false); +template <class P> +void TextureCache<P>::SynchronizeGraphicsDescriptors() { + using SamplerIndex = Tegra::Engines::Maxwell3D::Regs::SamplerIndex; + const bool linked_tsc = maxwell3d.regs.sampler_index == SamplerIndex::ViaHeaderIndex; + const u32 tic_limit = maxwell3d.regs.tic.limit; + const u32 tsc_limit = linked_tsc ?
tic_limit : maxwell3d.regs.tsc.limit; + if (graphics_sampler_table.Synchornize(maxwell3d.regs.tsc.Address(), tsc_limit)) { + graphics_sampler_ids.resize(tsc_limit + 1, CORRUPT_ID); } + if (graphics_image_table.Synchornize(maxwell3d.regs.tic.Address(), tic_limit)) { + graphics_image_view_ids.resize(tic_limit + 1, CORRUPT_ID); + } +} - void Unregister(TSurface surface) { - if (guard_render_targets && surface->IsProtected()) { - return; - } - if (!guard_render_targets && surface->IsRenderTarget()) { - ManageRenderTargetUnregister(surface); - } - UnmarkMemory(surface); - if (surface->IsSyncPending()) { - marked_for_unregister.remove(surface); - surface->SetSyncPending(false); - } - UnregisterInnerCache(surface); - surface->MarkAsRegistered(false); - ReserveSurface(surface->GetSurfaceParams(), surface); +template <class P> +void TextureCache<P>::SynchronizeComputeDescriptors() { + const bool linked_tsc = kepler_compute.launch_description.linked_tsc; + const u32 tic_limit = kepler_compute.regs.tic.limit; + const u32 tsc_limit = linked_tsc ? tic_limit : kepler_compute.regs.tsc.limit; + const GPUVAddr tsc_gpu_addr = kepler_compute.regs.tsc.Address(); + if (compute_sampler_table.Synchornize(tsc_gpu_addr, tsc_limit)) { + compute_sampler_ids.resize(tsc_limit + 1, CORRUPT_ID); } + if (compute_image_table.Synchornize(kepler_compute.regs.tic.Address(), tic_limit)) { + compute_image_view_ids.resize(tic_limit + 1, CORRUPT_ID); + } +} - TSurface GetUncachedSurface(const GPUVAddr gpu_addr, const SurfaceParams& params) { - if (const auto surface = TryGetReservedSurface(params); surface) { - surface->SetGpuAddr(gpu_addr); - return surface; - } - // No reserved surface available, create a new one and reserve it - auto new_surface{CreateSurface(gpu_addr, params)}; - return new_surface; +template <class P> +void TextureCache<P>::UpdateRenderTargets(bool is_clear) { + using namespace VideoCommon::Dirty; + auto& flags = maxwell3d.dirty.flags; + if (!flags[Dirty::RenderTargets]) { + return; } + flags[Dirty::RenderTargets] = false; - const bool is_astc_supported; + // Render target control is used on all render targets, so force lookups when it is dirty + const bool force = flags[Dirty::RenderTargetControl]; + flags[Dirty::RenderTargetControl] = false; -private: - enum class RecycleStrategy : u32 { - Ignore = 0, - Flush = 1, - BufferCopy = 3, - }; + for (size_t index = 0; index < NUM_RT; ++index) { + ImageViewId& color_buffer_id = render_targets.color_buffer_ids[index]; + if (flags[Dirty::ColorBuffer0 + index] || force) { + flags[Dirty::ColorBuffer0 + index] = false; + BindRenderTarget(&color_buffer_id, FindColorBuffer(index, is_clear)); + } + PrepareImageView(color_buffer_id, true, is_clear && IsFullClear(color_buffer_id)); + } + if (flags[Dirty::ZetaBuffer] || force) { + flags[Dirty::ZetaBuffer] = false; + BindRenderTarget(&render_targets.depth_buffer_id, FindDepthBuffer(is_clear)); + } + const ImageViewId depth_buffer_id = render_targets.depth_buffer_id; + PrepareImageView(depth_buffer_id, true, is_clear && IsFullClear(depth_buffer_id)); - enum class DeductionType : u32 { - DeductionComplete, - DeductionIncomplete, - DeductionFailed, + for (size_t index = 0; index < NUM_RT; ++index) { + render_targets.draw_buffers[index] = static_cast<u8>(maxwell3d.regs.rt_control.Map(index)); + } + render_targets.size = Extent2D{ + maxwell3d.regs.render_area.width, + maxwell3d.regs.render_area.height, }; +} - struct Deduction { - DeductionType type{DeductionType::DeductionFailed}; - TSurface surface{}; +template <class P>
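// Editorial sketch: the per-draw call order the new interface above implies, as seen
// from a hypothetical backend rasterizer. Only the TextureCache member calls are taken
// from this diff; the function name, the index spans and the VideoCommon type paths are
// assumptions for illustration.
#include <span>
#include <vector>
#include "video_core/texture_cache/texture_cache.h"

template <class P>
void BindGraphicsResources(VideoCommon::TextureCache<P>& texture_cache,
                           std::span<const u32> image_indices,
                           std::span<const u32> sampler_indices) {
    // The cache is externally synchronized: hold its lock for the whole setup
    const auto lock = texture_cache.AcquireLock();
    // Re-read the guest TIC/TSC tables; cached ids are dropped when the tables change
    texture_cache.SynchronizeGraphicsDescriptors();
    // Resolve every image descriptor index used by the draw into an ImageViewId
    std::vector<VideoCommon::ImageViewId> image_view_ids(image_indices.size());
    texture_cache.FillGraphicsImageViews(image_indices, image_view_ids);
    // Samplers are fetched one by one from the sampler descriptor table
    for (const u32 index : sampler_indices) {
        [[maybe_unused]] auto* const sampler = texture_cache.GetGraphicsSampler(index);
    }
    // Bind render targets first, then ask for a framebuffer: per the comments above,
    // UpdateRenderTargets must run before GetFramebuffer (defined just below)
    texture_cache.UpdateRenderTargets(false);
    [[maybe_unused]] auto* const framebuffer = texture_cache.GetFramebuffer();
}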
+typename P::Framebuffer* TextureCache<P>::GetFramebuffer() { + return &slot_framebuffers[GetFramebufferId(render_targets)]; +} - bool Failed() const { - return type == DeductionType::DeductionFailed; - } +template <class P> +void TextureCache<P>::FillImageViews(DescriptorTable<TICEntry>& table, + std::span<ImageViewId> cached_image_view_ids, + std::span<const u32> indices, + std::span<ImageViewId> image_view_ids) { + ASSERT(indices.size() <= image_view_ids.size()); + do { + has_deleted_images = false; + std::ranges::transform(indices, image_view_ids.begin(), [&](u32 index) { + return VisitImageView(table, cached_image_view_ids, index); + }); + } while (has_deleted_images); +} - bool Incomplete() const { - return type == DeductionType::DeductionIncomplete; - } +template <class P> +ImageViewId TextureCache<P>::VisitImageView(DescriptorTable<TICEntry>& table, + std::span<ImageViewId> cached_image_view_ids, + u32 index) { + if (index > table.Limit()) { + LOG_ERROR(HW_GPU, "Invalid image view index={}", index); + return NULL_IMAGE_VIEW_ID; + } + const auto [descriptor, is_new] = table.Read(index); + ImageViewId& image_view_id = cached_image_view_ids[index]; + if (is_new) { + image_view_id = FindImageView(descriptor); + } + if (image_view_id != NULL_IMAGE_VIEW_ID) { + PrepareImageView(image_view_id, false, false); + } + return image_view_id; +} - bool IsDepth() const { - return surface->GetSurfaceParams().IsPixelFormatZeta(); - } - }; +template <class P> +FramebufferId TextureCache<P>::GetFramebufferId(const RenderTargets& key) { + const auto [pair, is_new] = framebuffers.try_emplace(key); + FramebufferId& framebuffer_id = pair->second; + if (!is_new) { + return framebuffer_id; + } + std::array<ImageView*, NUM_RT> color_buffers; + std::ranges::transform(key.color_buffer_ids, color_buffers.begin(), + [this](ImageViewId id) { return id ? &slot_image_views[id] : nullptr; }); + ImageView* const depth_buffer = + key.depth_buffer_id ? &slot_image_views[key.depth_buffer_id] : nullptr; + framebuffer_id = slot_framebuffers.insert(runtime, color_buffers, depth_buffer, key); + return framebuffer_id; +} - /** - * Takes care of selecting a proper strategy to deal with a texture recycle. - * - * @param overlaps The overlapping surfaces registered in the cache. - * @param params The parameters on the new surface. - * @param gpu_addr The starting address of the new surface. - * @param untopological Indicates to the recycler that the texture has no way - * to match the overlaps due to topological reasons. - **/ - RecycleStrategy PickStrategy(VectorSurface& overlaps, const SurfaceParams& params, - const GPUVAddr gpu_addr, const MatchTopologyResult untopological) { - if (Settings::IsGPULevelExtreme()) { - return RecycleStrategy::Flush; - } - // 3D Textures decision - if (params.target == SurfaceTarget::Texture3D) { - return RecycleStrategy::Flush; - } - for (const auto& s : overlaps) { - const auto& s_params = s->GetSurfaceParams(); - if (s_params.target == SurfaceTarget::Texture3D) { - return RecycleStrategy::Flush; - } - } - // Untopological decision - if (untopological == MatchTopologyResult::CompressUnmatch) { - return RecycleStrategy::Flush; - } - if (untopological == MatchTopologyResult::FullMatch && !params.is_tiled) { - return RecycleStrategy::Flush; - } - return RecycleStrategy::Ignore; - } - - /** - * Used to decide what to do with textures we can't resolve in the cache It has 2 implemented - * strategies: Ignore and Flush. - * - * - Ignore: Just unregisters all the overlaps and loads the new texture. 
- * Flush: Flushes all the overlaps into memory and loads the new surface from that data. - * - * @param overlaps The overlapping surfaces registered in the cache. - * @param params The parameters for the new surface. - * @param gpu_addr The starting address of the new surface. - * @param preserve_contents Indicates that the new surface should be loaded from memory or left - * blank. - * @param untopological Indicates to the recycler that the texture has no way to match the - * overlaps due to topological reasons. - **/ - std::pair<TSurface, TView> RecycleSurface(VectorSurface& overlaps, const SurfaceParams& params, - const GPUVAddr gpu_addr, const bool preserve_contents, - const MatchTopologyResult untopological) { - const bool do_load = preserve_contents && Settings::IsGPULevelExtreme(); - for (auto& surface : overlaps) { - Unregister(surface); - } - switch (PickStrategy(overlaps, params, gpu_addr, untopological)) { - case RecycleStrategy::Ignore: { - return InitializeSurface(gpu_addr, params, do_load); - } - case RecycleStrategy::Flush: { - std::sort(overlaps.begin(), overlaps.end(), - [](const TSurface& a, const TSurface& b) -> bool { - return a->GetModificationTick() < b->GetModificationTick(); - }); - for (auto& surface : overlaps) { - FlushSurface(surface); - } - return InitializeSurface(gpu_addr, params, preserve_contents); +template <class P> +void TextureCache<P>::WriteMemory(VAddr cpu_addr, size_t size) { + ForEachImageInRegion(cpu_addr, size, [this](ImageId image_id, Image& image) { + if (True(image.flags & ImageFlagBits::CpuModified)) { + return; } - case RecycleStrategy::BufferCopy: { - auto new_surface = GetUncachedSurface(gpu_addr, params); - BufferCopy(overlaps[0], new_surface); - return {new_surface, new_surface->GetMainView()}; + image.flags |= ImageFlagBits::CpuModified; + UntrackImage(image); + }); +} + +template <class P> +void TextureCache<P>::DownloadMemory(VAddr cpu_addr, size_t size) { + std::vector<ImageId> images; + ForEachImageInRegion(cpu_addr, size, [this, &images](ImageId image_id, ImageBase& image) { + // Skip images that were not modified from the GPU + if (False(image.flags & ImageFlagBits::GpuModified)) { + return; } - default: { - UNIMPLEMENTED_MSG("Unimplemented Texture Cache Recycling Strategy!"); - return InitializeSurface(gpu_addr, params, do_load); + // Skip images that are modified from the CPU + // We don't want to overwrite data the guest has written from the CPU + if (True(image.flags & ImageFlagBits::CpuModified)) { + return; } + if (image.info.num_samples > 1) { + LOG_WARNING(HW_GPU, "MSAA image downloads are not implemented"); + return; } + image.flags &= ~ImageFlagBits::GpuModified; + images.push_back(image_id); + }); + if (images.empty()) { + return; + } + std::ranges::sort(images, [this](ImageId lhs, ImageId rhs) { + return slot_images[lhs].modification_tick < slot_images[rhs].modification_tick; + }); + for (const ImageId image_id : images) { + Image& image = slot_images[image_id]; + auto map = runtime.MapDownloadBuffer(image.unswizzled_size_bytes); + const auto copies = FullDownloadCopies(image.info); + image.DownloadMemory(map, 0, copies); + runtime.Finish(); + SwizzleImage(gpu_memory, image.gpu_addr, image.info, copies, map.Span()); } +} - /** - * Takes a single surface and recreates into another that may differ in - * format, target or width alignment. - * - * @param current_surface The registered surface in the cache which we want to convert. - * @param params The new surface params which we'll use to recreate the surface.
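// Editorial sketch of the dirty-tracking protocol WriteMemory/DownloadMemory above rely
// on, reduced to a standalone model: a guest write sets CpuModified (host copy is stale),
// and a download only happens when GpuModified is set and CpuModified is not, so newer
// guest data is never clobbered. The PAGE_BITS value is an assumption; the real constant
// is defined elsewhere in video_core.
#include <cstdint>

constexpr uint64_t PAGE_BITS = 20; // assumed page granularity

// Mirrors ForEachPage earlier in the diff: visit every page index a byte range touches.
template <typename Func>
void ForEachPage(uint64_t addr, uint64_t size, Func&& func) {
    const uint64_t page_end = (addr + size - 1) >> PAGE_BITS;
    for (uint64_t page = addr >> PAGE_BITS; page <= page_end; ++page) {
        func(page);
    }
}

struct ImageState {
    bool cpu_modified = false; // guest RAM is newer than the host texture
    bool gpu_modified = false; // host texture is newer than guest RAM
};

// WriteMemory: a guest write makes the host copy stale.
void OnGuestWrite(ImageState& image) {
    image.cpu_modified = true;
}

// DownloadMemory: only download when it cannot clobber newer guest data.
bool ShouldDownload(const ImageState& image) {
    return image.gpu_modified && !image.cpu_modified;
}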
- * @param is_render Whether or not the surface is a render target. - **/ - std::pair<TSurface, TView> RebuildSurface(TSurface current_surface, const SurfaceParams& params, - bool is_render) { - const auto gpu_addr = current_surface->GetGpuAddr(); - const auto& cr_params = current_surface->GetSurfaceParams(); - TSurface new_surface; - if (cr_params.pixel_format != params.pixel_format && !is_render && - GetSiblingFormat(cr_params.pixel_format) == params.pixel_format) { - SurfaceParams new_params = params; - new_params.pixel_format = cr_params.pixel_format; - new_params.type = cr_params.type; - new_surface = GetUncachedSurface(gpu_addr, new_params); - } else { - new_surface = GetUncachedSurface(gpu_addr, params); - } - const SurfaceParams& final_params = new_surface->GetSurfaceParams(); - if (cr_params.type != final_params.type) { - if (Settings::IsGPULevelExtreme()) { - BufferCopy(current_surface, new_surface); - } - } else { - std::vector<CopyParams> bricks = current_surface->BreakDown(final_params); - for (auto& brick : bricks) { - TryCopyImage(current_surface, new_surface, brick); - } - } - Unregister(current_surface); - Register(new_surface); - new_surface->MarkAsModified(current_surface->IsModified(), Tick()); - return {new_surface, new_surface->GetMainView()}; - } - - /** - * Takes a single surface and checks with the new surface's params if it's an exact - * match, we return the main view of the registered surface. If its formats don't - * match, we rebuild the surface. We call this last method a `Mirage`. If formats - * match but the targets don't, we create an overview View of the registered surface. - * - * @param current_surface The registered surface in the cache which we want to convert. - * @param params The new surface params which we want to check. - * @param is_render Whether or not the surface is a render target. - **/ - std::pair<TSurface, TView> ManageStructuralMatch(TSurface current_surface, - const SurfaceParams& params, bool is_render) { - const bool is_mirage = !current_surface->MatchFormat(params.pixel_format); - const bool matches_target = current_surface->MatchTarget(params.target); - const auto match_check = [&]() -> std::pair<TSurface, TView> { - if (matches_target) { - return {current_surface, current_surface->GetMainView()}; - } - return {current_surface, current_surface->EmplaceOverview(params)}; - }; - if (!is_mirage) { - return match_check(); - } - if (!is_render && GetSiblingFormat(current_surface->GetFormat()) == params.pixel_format) { - return match_check(); - } - return RebuildSurface(current_surface, params, is_render); - } - - /** - * Unlike RebuildSurface where we know whether or not registered surfaces match the candidate - * in some way, we have no guarantees here. We try to see if the overlaps are sublayers/mipmaps - * of the new surface, if they all match we end up recreating a surface for them, - * else we return nothing. - * - * @param overlaps The overlapping surfaces registered in the cache. - * @param params The parameters on the new surface. - * @param gpu_addr The starting address of the new surface. 
- **/ - std::optional<std::pair<TSurface, TView>> TryReconstructSurface(VectorSurface& overlaps, - const SurfaceParams& params, - GPUVAddr gpu_addr) { - if (params.target == SurfaceTarget::Texture3D) { - return std::nullopt; - } - const auto test_modified = [](TSurface& surface) { return surface->IsModified(); }; - TSurface new_surface = GetUncachedSurface(gpu_addr, params); +template <class P> +void TextureCache<P>::UnmapMemory(VAddr cpu_addr, size_t size) { + std::vector<ImageId> deleted_images; + ForEachImageInRegion(cpu_addr, size, [&](ImageId id, Image&) { deleted_images.push_back(id); }); + for (const ImageId id : deleted_images) { + Image& image = slot_images[id]; + if (True(image.flags & ImageFlagBits::Tracked)) { + UntrackImage(image); + } + UnregisterImage(id); + DeleteImage(id); + } +} - if (std::none_of(overlaps.begin(), overlaps.end(), test_modified)) { - LoadSurface(new_surface); - for (const auto& surface : overlaps) { - Unregister(surface); - } - Register(new_surface); - return {{new_surface, new_surface->GetMainView()}}; - } +template <class P> +void TextureCache<P>::BlitImage(const Tegra::Engines::Fermi2D::Surface& dst, + const Tegra::Engines::Fermi2D::Surface& src, + const Tegra::Engines::Fermi2D::Config& copy) { + const BlitImages images = GetBlitImages(dst, src); + const ImageId dst_id = images.dst_id; + const ImageId src_id = images.src_id; + PrepareImage(src_id, false, false); + PrepareImage(dst_id, true, false); + + ImageBase& dst_image = slot_images[dst_id]; + const ImageBase& src_image = slot_images[src_id]; + + // TODO: Deduplicate + const std::optional dst_base = dst_image.TryFindBase(dst.Address()); + const SubresourceRange dst_range{.base = dst_base.value(), .extent = {1, 1}}; + const ImageViewInfo dst_view_info(ImageViewType::e2D, images.dst_format, dst_range); + const auto [dst_framebuffer_id, dst_view_id] = RenderTargetFromImage(dst_id, dst_view_info); + const auto [src_samples_x, src_samples_y] = SamplesLog2(src_image.info.num_samples); + const std::array src_region{ + Offset2D{.x = copy.src_x0 >> src_samples_x, .y = copy.src_y0 >> src_samples_y}, + Offset2D{.x = copy.src_x1 >> src_samples_x, .y = copy.src_y1 >> src_samples_y}, + }; - std::size_t passed_tests = 0; - for (auto& surface : overlaps) { - const SurfaceParams& src_params = surface->GetSurfaceParams(); - const auto mipmap_layer{new_surface->GetLayerMipmap(surface->GetGpuAddr())}; - if (!mipmap_layer) { - continue; - } - const auto [base_layer, base_mipmap] = *mipmap_layer; - if (new_surface->GetMipmapSize(base_mipmap) != surface->GetMipmapSize(0)) { - continue; - } - ++passed_tests; - - // Copy all mipmaps and layers - const u32 block_width = params.GetDefaultBlockWidth(); - const u32 block_height = params.GetDefaultBlockHeight(); - for (u32 mipmap = base_mipmap; mipmap < base_mipmap + src_params.num_levels; ++mipmap) { - const u32 width = SurfaceParams::IntersectWidth(src_params, params, 0, mipmap); - const u32 height = SurfaceParams::IntersectHeight(src_params, params, 0, mipmap); - if (width < block_width || height < block_height) { - // Current APIs forbid copying small compressed textures, avoid errors - break; - } - const CopyParams copy_params(0, 0, 0, 0, 0, base_layer, 0, mipmap, width, height, - src_params.depth); - TryCopyImage(surface, new_surface, copy_params); - } - } - if (passed_tests == 0) { - return std::nullopt; - } - if (Settings::IsGPULevelExtreme() && passed_tests != overlaps.size()) { - // In Accurate GPU all tests should pass, else we recycle - return std::nullopt; - } + 
const std::optional src_base = src_image.TryFindBase(src.Address()); + const SubresourceRange src_range{.base = src_base.value(), .extent = {1, 1}}; + const ImageViewInfo src_view_info(ImageViewType::e2D, images.src_format, src_range); + const auto [src_framebuffer_id, src_view_id] = RenderTargetFromImage(src_id, src_view_info); + const auto [dst_samples_x, dst_samples_y] = SamplesLog2(dst_image.info.num_samples); + const std::array dst_region{ + Offset2D{.x = copy.dst_x0 >> dst_samples_x, .y = copy.dst_y0 >> dst_samples_y}, + Offset2D{.x = copy.dst_x1 >> dst_samples_x, .y = copy.dst_y1 >> dst_samples_y}, + }; - const bool modified = std::any_of(overlaps.begin(), overlaps.end(), test_modified); - for (const auto& surface : overlaps) { - Unregister(surface); - } + // Always call this after src_framebuffer_id was queried, as the address might be invalidated. + Framebuffer* const dst_framebuffer = &slot_framebuffers[dst_framebuffer_id]; + if constexpr (FRAMEBUFFER_BLITS) { + // OpenGL blits from framebuffers, not images + Framebuffer* const src_framebuffer = &slot_framebuffers[src_framebuffer_id]; + runtime.BlitFramebuffer(dst_framebuffer, src_framebuffer, dst_region, src_region, + copy.filter, copy.operation); + } else { + // Vulkan can blit images, but it lacks format reinterpretations + // Provide a framebuffer in case it's necessary + ImageView& dst_view = slot_image_views[dst_view_id]; + ImageView& src_view = slot_image_views[src_view_id]; + runtime.BlitImage(dst_framebuffer, dst_view, src_view, dst_region, src_region, copy.filter, + copy.operation); + } +} - new_surface->MarkAsModified(modified, Tick()); - Register(new_surface); - return {{new_surface, new_surface->GetMainView()}}; - } - - /** - * Takes care of managing 3D textures and its slices. Does HLE methods for reconstructing the 3D - * textures within the GPU if possible. Falls back to LLE when it isn't possible to use any of - * the HLE methods. - * - * @param overlaps The overlapping surfaces registered in the cache. - * @param params The parameters on the new surface. - * @param gpu_addr The starting address of the new surface. - * @param cpu_addr The starting address of the new surface on physical memory. - * @param preserve_contents Indicates that the new surface should be loaded from memory or - * left blank. 
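// Editorial sketch: why BlitImage above shifts the Fermi2D rectangle by SamplesLog2
// before blitting. The coordinates arrive in sample units, and shifting by the per-axis
// log2 of the sample grid converts them to pixel units. The grid shapes below are an
// assumption about SamplesLog2, not taken from the diff.
#include <cstdint>
#include <utility>

struct Offset2D {
    int32_t x;
    int32_t y;
};

// Assumed sample layouts: 2x -> 2x1, 4x -> 2x2, 8x -> 4x2.
constexpr std::pair<int, int> SamplesLog2Sketch(int num_samples) {
    switch (num_samples) {
    case 2:
        return {1, 0};
    case 4:
        return {1, 1};
    case 8:
        return {2, 1};
    default:
        return {0, 0};
    }
}

// Convert one blit corner from sample coordinates to pixel coordinates.
Offset2D SamplesToPixels(int32_t x, int32_t y, int num_samples) {
    const std::pair<int, int> log2 = SamplesLog2Sketch(num_samples);
    return Offset2D{.x = x >> log2.first, .y = y >> log2.second};
}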
- */ - std::optional<std::pair<TSurface, TView>> Manage3DSurfaces(VectorSurface& overlaps, - const SurfaceParams& params, - GPUVAddr gpu_addr, VAddr cpu_addr, - bool preserve_contents) { - if (params.target != SurfaceTarget::Texture3D) { - for (const auto& surface : overlaps) { - if (!surface->MatchTarget(params.target)) { - if (overlaps.size() == 1 && surface->GetCpuAddr() == cpu_addr) { - if (Settings::IsGPULevelExtreme()) { - return std::nullopt; - } - Unregister(surface); - return InitializeSurface(gpu_addr, params, preserve_contents); - } - return std::nullopt; - } - if (surface->GetCpuAddr() != cpu_addr) { - continue; - } - if (surface->MatchesStructure(params) == MatchStructureResult::FullMatch) { - return std::make_pair(surface, surface->GetMainView()); - } - } - return InitializeSurface(gpu_addr, params, preserve_contents); - } +template <class P> +void TextureCache<P>::InvalidateColorBuffer(size_t index) { + ImageViewId& color_buffer_id = render_targets.color_buffer_ids[index]; + color_buffer_id = FindColorBuffer(index, false); + if (!color_buffer_id) { + LOG_ERROR(HW_GPU, "Invalidating invalid color buffer in index={}", index); + return; + } + // When invalidating a color buffer, the old contents are no longer relevant + ImageView& color_buffer = slot_image_views[color_buffer_id]; + Image& image = slot_images[color_buffer.image_id]; + image.flags &= ~ImageFlagBits::CpuModified; + image.flags &= ~ImageFlagBits::GpuModified; - if (params.num_levels > 1) { - // We can't handle mipmaps in 3D textures yet, better fallback to LLE approach - return std::nullopt; - } + runtime.InvalidateColorBuffer(color_buffer, index); +} - if (overlaps.size() == 1) { - const auto& surface = overlaps[0]; - const SurfaceParams& overlap_params = surface->GetSurfaceParams(); - // Don't attempt to render to textures with more than one level for now - // The texture has to be to the right or the sample address if we want to render to it - if (overlap_params.num_levels == 1 && cpu_addr >= surface->GetCpuAddr()) { - const u32 offset = static_cast<u32>(cpu_addr - surface->GetCpuAddr()); - const u32 slice = std::get<2>(params.GetBlockOffsetXYZ(offset)); - if (slice < overlap_params.depth) { - auto view = surface->Emplace3DView(slice, params.depth, 0, 1); - return std::make_pair(std::move(surface), std::move(view)); - } - } - } +template <class P> +void TextureCache<P>::InvalidateDepthBuffer() { + ImageViewId& depth_buffer_id = render_targets.depth_buffer_id; + depth_buffer_id = FindDepthBuffer(false); + if (!depth_buffer_id) { + LOG_ERROR(HW_GPU, "Invalidating invalid depth buffer"); + return; + } + // When invalidating the depth buffer, the old contents are no longer relevant + ImageBase& image = slot_images[slot_image_views[depth_buffer_id].image_id]; + image.flags &= ~ImageFlagBits::CpuModified; + image.flags &= ~ImageFlagBits::GpuModified; - TSurface new_surface = GetUncachedSurface(gpu_addr, params); - bool modified = false; + ImageView& depth_buffer = slot_image_views[depth_buffer_id]; + runtime.InvalidateDepthBuffer(depth_buffer); +} - for (auto& surface : overlaps) { - const SurfaceParams& src_params = surface->GetSurfaceParams(); - if (src_params.target != SurfaceTarget::Texture2D || - src_params.height != params.height || - src_params.block_depth != params.block_depth || - src_params.block_height != params.block_height) { - return std::nullopt; - } - modified |= surface->IsModified(); - - const u32 offset = static_cast<u32>(surface->GetCpuAddr() - cpu_addr); - const u32 slice = 
std::get<2>(params.GetBlockOffsetXYZ(offset)); - const u32 width = params.width; - const u32 height = params.height; - const CopyParams copy_params(0, 0, 0, 0, 0, slice, 0, 0, width, height, 1); - TryCopyImage(surface, new_surface, copy_params); +template <class P> +typename P::ImageView* TextureCache<P>::TryFindFramebufferImageView(VAddr cpu_addr) { + // TODO: Properly implement this + const auto it = page_table.find(cpu_addr >> PAGE_BITS); + if (it == page_table.end()) { + return nullptr; + } + const auto& image_ids = it->second; + for (const ImageId image_id : image_ids) { + const ImageBase& image = slot_images[image_id]; + if (image.cpu_addr != cpu_addr) { + continue; } - for (const auto& surface : overlaps) { - Unregister(surface); + if (image.image_view_ids.empty()) { + continue; } - new_surface->MarkAsModified(modified, Tick()); - Register(new_surface); - - TView view = new_surface->GetMainView(); - return std::make_pair(std::move(new_surface), std::move(view)); - } - - /** - * Gets the starting address and parameters of a candidate surface and tries - * to find a matching surface within the cache. This is done in 3 big steps: - * - * 1. Check the 1st Level Cache in order to find an exact match, if we fail, we move to step 2. - * - * 2. Check if there are any overlaps at all, if there are none, we just load the texture from - * memory else we move to step 3. - * - * 3. Consists of figuring out the relationship between the candidate texture and the - * overlaps. We divide the scenarios depending if there's 1 or many overlaps. If - * there's many, we just try to reconstruct a new surface out of them based on the - * candidate's parameters, if we fail, we recycle. When there's only 1 overlap then we - * have to check if the candidate is a view (layer/mipmap) of the overlap or if the - * registered surface is a mipmap/layer of the candidate. In this last case we reconstruct - * a new surface. - * - * @param gpu_addr The starting address of the candidate surface. - * @param params The parameters on the candidate surface. - * @param preserve_contents Indicates that the new surface should be loaded from memory or - * left blank. - * @param is_render Whether or not the surface is a render target. - **/ - std::pair<TSurface, TView> GetSurface(const GPUVAddr gpu_addr, const VAddr cpu_addr, - const SurfaceParams& params, bool preserve_contents, - bool is_render) { - // Step 1 - // Check Level 1 Cache for a fast structural match. If candidate surface - // matches at certain level we are pretty much done. 
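// Editorial sketch: the page_table lookup TryFindFramebufferImageView above performs,
// as standalone code. IdentityHash mirrors the member declaration earlier in the diff
// (page keys are already well distributed, so hashing is skipped); the PAGE_BITS value
// is an assumption.
#include <cstddef>
#include <cstdint>
#include <unordered_map>
#include <vector>

constexpr uint64_t PAGE_BITS = 20; // assumed page granularity

template <typename T>
struct IdentityHash {
    size_t operator()(T value) const noexcept {
        return static_cast<size_t>(value);
    }
};

using ImageId = uint32_t;
std::unordered_map<uint64_t, std::vector<ImageId>, IdentityHash<uint64_t>> page_table;

// Return the ids of the images registered on the page containing cpu_addr, if any.
const std::vector<ImageId>* FindPage(uint64_t cpu_addr) {
    const auto it = page_table.find(cpu_addr >> PAGE_BITS);
    return it != page_table.end() ? &it->second : nullptr;
}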
- if (const auto iter = l1_cache.find(cpu_addr); iter != l1_cache.end()) { - TSurface& current_surface = iter->second; - const auto topological_result = current_surface->MatchesTopology(params); - if (topological_result != MatchTopologyResult::FullMatch) { - VectorSurface overlaps{current_surface}; - return RecycleSurface(overlaps, params, gpu_addr, preserve_contents, - topological_result); - } + return &slot_image_views[image.image_view_ids.at(0)]; + } + return nullptr; +} - const auto struct_result = current_surface->MatchesStructure(params); - if (struct_result != MatchStructureResult::None) { - const auto& old_params = current_surface->GetSurfaceParams(); - const bool not_3d = params.target != SurfaceTarget::Texture3D && - old_params.target != SurfaceTarget::Texture3D; - if (not_3d || current_surface->MatchTarget(params.target)) { - if (struct_result == MatchStructureResult::FullMatch) { - return ManageStructuralMatch(current_surface, params, is_render); - } else { - return RebuildSurface(current_surface, params, is_render); - } - } - } - } +template <class P> +bool TextureCache<P>::HasUncommittedFlushes() const noexcept { + return !uncommitted_downloads.empty(); +} - // Step 2 - // Obtain all possible overlaps in the memory region - const std::size_t candidate_size = params.GetGuestSizeInBytes(); - auto overlaps{GetSurfacesInRegion(cpu_addr, candidate_size)}; +template <class P> +bool TextureCache<P>::ShouldWaitAsyncFlushes() const noexcept { + return !committed_downloads.empty() && !committed_downloads.front().empty(); +} - // If none are found, we are done. we just load the surface and create it. - if (overlaps.empty()) { - return InitializeSurface(gpu_addr, params, preserve_contents); - } +template <class P> +void TextureCache<P>::CommitAsyncFlushes() { + // This is intentionally passing the value by copy + committed_downloads.push(uncommitted_downloads); + uncommitted_downloads.clear(); +} - // Step 3 - // Now we need to figure the relationship between the texture and its overlaps - // we do a topological test to ensure we can find some relationship. 
If it fails - // immediately recycle the texture - for (const auto& surface : overlaps) { - const auto topological_result = surface->MatchesTopology(params); - if (topological_result != MatchTopologyResult::FullMatch) { - return RecycleSurface(overlaps, params, gpu_addr, preserve_contents, - topological_result); - } - } +template <class P> +void TextureCache<P>::PopAsyncFlushes() { + if (committed_downloads.empty()) { + return; + } + const std::span<const ImageId> download_ids = committed_downloads.front(); + if (download_ids.empty()) { + committed_downloads.pop(); + return; + } + size_t total_size_bytes = 0; + for (const ImageId image_id : download_ids) { + total_size_bytes += slot_images[image_id].unswizzled_size_bytes; + } + auto download_map = runtime.MapDownloadBuffer(total_size_bytes); + size_t buffer_offset = 0; + for (const ImageId image_id : download_ids) { + Image& image = slot_images[image_id]; + const auto copies = FullDownloadCopies(image.info); + image.DownloadMemory(download_map, buffer_offset, copies); + buffer_offset += image.unswizzled_size_bytes; + } + // Wait for downloads to finish + runtime.Finish(); + + buffer_offset = 0; + const std::span<u8> download_span = download_map.Span(); + for (const ImageId image_id : download_ids) { + const ImageBase& image = slot_images[image_id]; + const auto copies = FullDownloadCopies(image.info); + const std::span<u8> image_download_span = download_span.subspan(buffer_offset); + SwizzleImage(gpu_memory, image.gpu_addr, image.info, copies, image_download_span); + buffer_offset += image.unswizzled_size_bytes; + } + committed_downloads.pop(); +} - // Manage 3D textures - if (params.block_depth > 0) { - auto surface = - Manage3DSurfaces(overlaps, params, gpu_addr, cpu_addr, preserve_contents); - if (surface) { - return *surface; - } +template <class P> +bool TextureCache<P>::IsRegionGpuModified(VAddr addr, size_t size) { + bool is_modified = false; + ForEachImageInRegion(addr, size, [&is_modified](ImageId, ImageBase& image) { + if (False(image.flags & ImageFlagBits::GpuModified)) { + return false; } + is_modified = true; + return true; + }); + return is_modified; +} - // Split cases between 1 overlap or many. - if (overlaps.size() == 1) { - TSurface current_surface = overlaps[0]; - // First check if the surface is within the overlap. If not, it means - // two things either the candidate surface is a supertexture of the overlap - // or they don't match in any known way. - if (!current_surface->IsInside(gpu_addr, gpu_addr + candidate_size)) { - const std::optional view = TryReconstructSurface(overlaps, params, gpu_addr); - if (view) { - return *view; - } - return RecycleSurface(overlaps, params, gpu_addr, preserve_contents, - MatchTopologyResult::FullMatch); - } - // Now we check if the candidate is a mipmap/layer of the overlap - std::optional<TView> view = - current_surface->EmplaceView(params, gpu_addr, candidate_size); - if (view) { - const bool is_mirage = !current_surface->MatchFormat(params.pixel_format); - if (is_mirage) { - // On a mirage view, we need to recreate the surface under this new view - // and then obtain a view again. 
- SurfaceParams new_params = current_surface->GetSurfaceParams(); - const u32 wh = SurfaceParams::ConvertWidth( - new_params.width, new_params.pixel_format, params.pixel_format); - const u32 hh = SurfaceParams::ConvertHeight( - new_params.height, new_params.pixel_format, params.pixel_format); - new_params.width = wh; - new_params.height = hh; - new_params.pixel_format = params.pixel_format; - std::pair<TSurface, TView> pair = - RebuildSurface(current_surface, new_params, is_render); - std::optional<TView> mirage_view = - pair.first->EmplaceView(params, gpu_addr, candidate_size); - if (mirage_view) - return {pair.first, *mirage_view}; - return RecycleSurface(overlaps, params, gpu_addr, preserve_contents, - MatchTopologyResult::FullMatch); - } - return {current_surface, *view}; - } - } else { - // If there are many overlaps, odds are they are subtextures of the candidate - // surface. We try to construct a new surface based on the candidate parameters, - // using the overlaps. If a single overlap fails, this will fail. - std::optional<std::pair<TSurface, TView>> view = - TryReconstructSurface(overlaps, params, gpu_addr); - if (view) { - return *view; - } - } - // We failed all the tests, recycle the overlaps into a new texture. - return RecycleSurface(overlaps, params, gpu_addr, preserve_contents, - MatchTopologyResult::FullMatch); - } - - /** - * Gets the starting address and parameters of a candidate surface and tries to find a - * matching surface within the cache that's similar to it. If there are many textures - * or the texture found if entirely incompatible, it will fail. If no texture is found, the - * blit will be unsuccessful. - * - * @param gpu_addr The starting address of the candidate surface. - * @param params The parameters on the candidate surface. 
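// Editorial sketch: the "mirage" rebuild above and the new cache's IsViewCompatible
// checks (in FindImage/JoinImages further down) answer the same question: may memory
// written with one format be read through another? A common rule, assumed here rather
// than taken from the diff, is to allow it between formats of equal block size unless
// the driver reports broken texture views (HasBrokenTextureViewFormats).
#include <cstdint>

enum class Format { A8B8G8R8_UNORM, B8G8R8A8_UNORM, R32_FLOAT, BC1_RGBA };

constexpr uint32_t BytesPerBlock(Format format) {
    return format == Format::BC1_RGBA ? 8 : 4;
}

constexpr bool IsViewCompatibleSketch(Format lhs, Format rhs, bool broken_views) {
    if (lhs == rhs) {
        return true; // identical formats are always fine
    }
    if (broken_views) {
        return false; // driver workaround: avoid reinterpreting views entirely
    }
    return BytesPerBlock(lhs) == BytesPerBlock(rhs);
}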
- **/ - Deduction DeduceSurface(const GPUVAddr gpu_addr, const SurfaceParams& params) { - const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); - - if (!cpu_addr) { - Deduction result{}; - result.type = DeductionType::DeductionFailed; - return result; - } +template <class P> +void TextureCache<P>::RefreshContents(Image& image) { + if (False(image.flags & ImageFlagBits::CpuModified)) { + // Only upload modified images + return; + } + image.flags &= ~ImageFlagBits::CpuModified; + TrackImage(image); - if (const auto iter = l1_cache.find(*cpu_addr); iter != l1_cache.end()) { - TSurface& current_surface = iter->second; - const auto topological_result = current_surface->MatchesTopology(params); - if (topological_result != MatchTopologyResult::FullMatch) { - Deduction result{}; - result.type = DeductionType::DeductionFailed; - return result; - } - const auto struct_result = current_surface->MatchesStructure(params); - if (struct_result != MatchStructureResult::None && - current_surface->MatchTarget(params.target)) { - Deduction result{}; - result.type = DeductionType::DeductionComplete; - result.surface = current_surface; - return result; - } - } + if (image.info.num_samples > 1) { + LOG_WARNING(HW_GPU, "MSAA image uploads are not implemented"); + return; + } + auto map = runtime.MapUploadBuffer(MapSizeBytes(image)); + UploadImageContents(image, map, 0); + runtime.InsertUploadMemoryBarrier(); +} - const std::size_t candidate_size = params.GetGuestSizeInBytes(); - auto overlaps{GetSurfacesInRegion(*cpu_addr, candidate_size)}; +template <class P> +template <typename MapBuffer> +void TextureCache<P>::UploadImageContents(Image& image, MapBuffer& map, size_t buffer_offset) { + const std::span<u8> mapped_span = map.Span().subspan(buffer_offset); + const GPUVAddr gpu_addr = image.gpu_addr; + + if (True(image.flags & ImageFlagBits::AcceleratedUpload)) { + gpu_memory.ReadBlockUnsafe(gpu_addr, mapped_span.data(), mapped_span.size_bytes()); + const auto uploads = FullUploadSwizzles(image.info); + runtime.AccelerateImageUpload(image, map, buffer_offset, uploads); + } else if (True(image.flags & ImageFlagBits::Converted)) { + std::vector<u8> unswizzled_data(image.unswizzled_size_bytes); + auto copies = UnswizzleImage(gpu_memory, gpu_addr, image.info, unswizzled_data); + ConvertImage(unswizzled_data, image.info, mapped_span, copies); + image.UploadMemory(map, buffer_offset, copies); + } else if (image.info.type == ImageType::Buffer) { + const std::array copies{UploadBufferCopy(gpu_memory, gpu_addr, image, mapped_span)}; + image.UploadMemory(map, buffer_offset, copies); + } else { + const auto copies = UnswizzleImage(gpu_memory, gpu_addr, image.info, mapped_span); + image.UploadMemory(map, buffer_offset, copies); + } +} - if (overlaps.empty()) { - Deduction result{}; - result.type = DeductionType::DeductionIncomplete; - return result; - } +template <class P> +ImageViewId TextureCache<P>::FindImageView(const TICEntry& config) { + if (!IsValidAddress(gpu_memory, config)) { + return NULL_IMAGE_VIEW_ID; + } + const auto [pair, is_new] = image_views.try_emplace(config); + ImageViewId& image_view_id = pair->second; + if (is_new) { + image_view_id = CreateImageView(config); + } + return image_view_id; +} - if (overlaps.size() > 1) { - Deduction result{}; - result.type = DeductionType::DeductionFailed; - return result; - } else { - Deduction result{}; - result.type = DeductionType::DeductionComplete; - result.surface = overlaps[0]; - return result; - } +template <class P> +ImageViewId 
TextureCache<P>::CreateImageView(const TICEntry& config) { + const ImageInfo info(config); + const GPUVAddr image_gpu_addr = config.Address() - config.BaseLayer() * info.layer_stride; + const ImageId image_id = FindOrInsertImage(info, image_gpu_addr); + if (!image_id) { + return NULL_IMAGE_VIEW_ID; } + ImageBase& image = slot_images[image_id]; + const SubresourceBase base = image.TryFindBase(config.Address()).value(); + ASSERT(base.level == 0); + const ImageViewInfo view_info(config, base.layer); + const ImageViewId image_view_id = FindOrEmplaceImageView(image_id, view_info); + ImageViewBase& image_view = slot_image_views[image_view_id]; + image_view.flags |= ImageViewFlagBits::Strong; + image.flags |= ImageFlagBits::Strong; + return image_view_id; +} - /** - * Gets a null surface based on a target texture. - * @param target The target of the null surface. - */ - TView GetNullSurface(SurfaceTarget target) { - const u32 i_target = static_cast<u32>(target); - if (const auto it = invalid_cache.find(i_target); it != invalid_cache.end()) { - return it->second->GetMainView(); - } - SurfaceParams params{}; - params.target = target; - params.is_tiled = false; - params.srgb_conversion = false; - params.is_layered = - target == SurfaceTarget::Texture1DArray || target == SurfaceTarget::Texture2DArray || - target == SurfaceTarget::TextureCubemap || target == SurfaceTarget::TextureCubeArray; - params.block_width = 0; - params.block_height = 0; - params.block_depth = 0; - params.tile_width_spacing = 1; - params.width = 1; - params.height = 1; - params.depth = 1; - if (target == SurfaceTarget::TextureCubemap || target == SurfaceTarget::TextureCubeArray) { - params.depth = 6; - } - params.pitch = 4; - params.num_levels = 1; - params.emulated_levels = 1; - params.pixel_format = VideoCore::Surface::PixelFormat::R8_UNORM; - params.type = VideoCore::Surface::SurfaceType::ColorTexture; - auto surface = CreateSurface(0ULL, params); - invalid_memory.resize(surface->GetHostSizeInBytes(), 0U); - surface->UploadTexture(invalid_memory); - surface->MarkAsModified(false, Tick()); - invalid_cache.emplace(i_target, surface); - return surface->GetMainView(); - } - - /** - * Gets the a source and destination starting address and parameters, - * and tries to deduce if they are supposed to be depth textures. If so, their - * parameters are modified and fixed into so. - * - * @param src_params The parameters of the candidate surface. - * @param dst_params The parameters of the destination surface. - * @param src_gpu_addr The starting address of the candidate surface. - * @param dst_gpu_addr The starting address of the destination surface. 
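// Editorial sketch: the address arithmetic in CreateImageView above. A TIC descriptor
// may point at a layer inside a layered image; subtracting BaseLayer() * layer_stride
// recovers the start of the whole image, and TryFindBase later maps the descriptor
// address back to that layer. The concrete values below are made up for the worked
// example.
#include <cassert>
#include <cstdint>

int main() {
    const uint64_t image_base = 0x1000'0000; // hypothetical image start
    const uint64_t layer_stride = 0x2'0000;  // bytes between array layers
    const uint32_t base_layer = 3;

    // What the descriptor holds: the address of layer 3.
    const uint64_t tic_address = image_base + base_layer * layer_stride;
    // What CreateImageView reconstructs: the start of the whole image.
    assert(tic_address - base_layer * layer_stride == image_base);
    // What TryFindBase recovers from the descriptor address: the layer index.
    assert((tic_address - image_base) / layer_stride == base_layer);
    return 0;
}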
- **/ - void DeduceBestBlit(SurfaceParams& src_params, SurfaceParams& dst_params, - const GPUVAddr src_gpu_addr, const GPUVAddr dst_gpu_addr) { - auto deduced_src = DeduceSurface(src_gpu_addr, src_params); - auto deduced_dst = DeduceSurface(dst_gpu_addr, dst_params); - if (deduced_src.Failed() || deduced_dst.Failed()) { - return; +template <class P> +ImageId TextureCache<P>::FindOrInsertImage(const ImageInfo& info, GPUVAddr gpu_addr, + RelaxedOptions options) { + if (const ImageId image_id = FindImage(info, gpu_addr, options); image_id) { + return image_id; + } + return InsertImage(info, gpu_addr, options); +} + +template <class P> +ImageId TextureCache<P>::FindImage(const ImageInfo& info, GPUVAddr gpu_addr, + RelaxedOptions options) { + const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); + if (!cpu_addr) { + return ImageId{}; + } + const bool broken_views = runtime.HasBrokenTextureViewFormats(); + ImageId image_id; + const auto lambda = [&](ImageId existing_image_id, ImageBase& existing_image) { + if (info.type == ImageType::Linear || existing_image.info.type == ImageType::Linear) { + const bool strict_size = False(options & RelaxedOptions::Size) && + True(existing_image.flags & ImageFlagBits::Strong); + const ImageInfo& existing = existing_image.info; + if (existing_image.gpu_addr == gpu_addr && existing.type == info.type && + existing.pitch == info.pitch && + IsPitchLinearSameSize(existing, info, strict_size) && + IsViewCompatible(existing.format, info.format, broken_views)) { + image_id = existing_image_id; + return true; + } + } else if (IsSubresource(info, existing_image, gpu_addr, options, broken_views)) { + image_id = existing_image_id; + return true; } + return false; + }; + ForEachImageInRegion(*cpu_addr, CalculateGuestSizeInBytes(info), lambda); + return image_id; +} - const bool incomplete_src = deduced_src.Incomplete(); - const bool incomplete_dst = deduced_dst.Incomplete(); +template <class P> +ImageId TextureCache<P>::InsertImage(const ImageInfo& info, GPUVAddr gpu_addr, + RelaxedOptions options) { + const std::optional<VAddr> cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); + ASSERT_MSG(cpu_addr, "Tried to insert an image to an invalid gpu_addr=0x{:x}", gpu_addr); + const ImageId image_id = JoinImages(info, gpu_addr, *cpu_addr); + const Image& image = slot_images[image_id]; + // Using "image.gpu_addr" instead of "gpu_addr" is important because it might be different + const auto [it, is_new] = image_allocs_table.try_emplace(image.gpu_addr); + if (is_new) { + it->second = slot_image_allocs.insert(); + } + slot_image_allocs[it->second].images.push_back(image_id); + return image_id; +} - if (incomplete_src && incomplete_dst) { +template <class P> +ImageId TextureCache<P>::JoinImages(const ImageInfo& info, GPUVAddr gpu_addr, VAddr cpu_addr) { + ImageInfo new_info = info; + const size_t size_bytes = CalculateGuestSizeInBytes(new_info); + const bool broken_views = runtime.HasBrokenTextureViewFormats(); + std::vector<ImageId> overlap_ids; + std::vector<ImageId> left_aliased_ids; + std::vector<ImageId> right_aliased_ids; + ForEachImageInRegion(cpu_addr, size_bytes, [&](ImageId overlap_id, ImageBase& overlap) { + if (info.type != overlap.info.type) { return; } - - const bool any_incomplete = incomplete_src || incomplete_dst; - - if (!any_incomplete) { - if (!(deduced_src.IsDepth() && deduced_dst.IsDepth())) { - return; - } - } else { - if (incomplete_src && !(deduced_dst.IsDepth())) { - return; - } - - if (incomplete_dst && !(deduced_src.IsDepth())) { - 
return; + if (info.type == ImageType::Linear) { + if (info.pitch == overlap.info.pitch && gpu_addr == overlap.gpu_addr) { + // Alias linear images with the same pitch + left_aliased_ids.push_back(overlap_id); } + return; } - - const auto inherit_format = [](SurfaceParams& to, TSurface from) { - const SurfaceParams& params = from->GetSurfaceParams(); - to.pixel_format = params.pixel_format; - to.type = params.type; - }; - // Now we got the cases where one or both is Depth and the other is not known - if (!incomplete_src) { - inherit_format(src_params, deduced_src.surface); - } else { - inherit_format(src_params, deduced_dst.surface); + static constexpr bool strict_size = true; + const std::optional<OverlapResult> solution = + ResolveOverlap(new_info, gpu_addr, cpu_addr, overlap, strict_size, broken_views); + if (solution) { + gpu_addr = solution->gpu_addr; + cpu_addr = solution->cpu_addr; + new_info.resources = solution->resources; + overlap_ids.push_back(overlap_id); + return; } - if (!incomplete_dst) { - inherit_format(dst_params, deduced_dst.surface); + static constexpr auto options = RelaxedOptions::Size | RelaxedOptions::Format; + const ImageBase new_image_base(new_info, gpu_addr, cpu_addr); + if (IsSubresource(new_info, overlap, gpu_addr, options, broken_views)) { + left_aliased_ids.push_back(overlap_id); + } else if (IsSubresource(overlap.info, new_image_base, overlap.gpu_addr, options, + broken_views)) { + right_aliased_ids.push_back(overlap_id); + } + }); + const ImageId new_image_id = slot_images.insert(runtime, new_info, gpu_addr, cpu_addr); + Image& new_image = slot_images[new_image_id]; + + // TODO: Only upload what we need + RefreshContents(new_image); + + for (const ImageId overlap_id : overlap_ids) { + Image& overlap = slot_images[overlap_id]; + if (overlap.info.num_samples != new_image.info.num_samples) { + LOG_WARNING(HW_GPU, "Copying between images with different samples is not implemented"); } else { - inherit_format(dst_params, deduced_src.surface); + const SubresourceBase base = new_image.TryFindBase(overlap.gpu_addr).value(); + const auto copies = MakeShrinkImageCopies(new_info, overlap.info, base); + runtime.CopyImage(new_image, overlap, copies); } + if (True(overlap.flags & ImageFlagBits::Tracked)) { + UntrackImage(overlap); + } + UnregisterImage(overlap_id); + DeleteImage(overlap_id); + } + ImageBase& new_image_base = new_image; + for (const ImageId aliased_id : right_aliased_ids) { + ImageBase& aliased = slot_images[aliased_id]; + AddImageAlias(new_image_base, aliased, new_image_id, aliased_id); + } + for (const ImageId aliased_id : left_aliased_ids) { + ImageBase& aliased = slot_images[aliased_id]; + AddImageAlias(aliased, new_image_base, aliased_id, new_image_id); } + RegisterImage(new_image_id); + return new_image_id; +} - std::pair<TSurface, TView> InitializeSurface(GPUVAddr gpu_addr, const SurfaceParams& params, - bool preserve_contents) { - auto new_surface{GetUncachedSurface(gpu_addr, params)}; - Register(new_surface); - if (preserve_contents) { - LoadSurface(new_surface); - } - return {new_surface, new_surface->GetMainView()}; +template <class P> +typename TextureCache<P>::BlitImages TextureCache<P>::GetBlitImages( + const Tegra::Engines::Fermi2D::Surface& dst, const Tegra::Engines::Fermi2D::Surface& src) { + static constexpr auto FIND_OPTIONS = RelaxedOptions::Format | RelaxedOptions::Samples; + const GPUVAddr dst_addr = dst.Address(); + const GPUVAddr src_addr = src.Address(); + ImageInfo dst_info(dst); + ImageInfo src_info(src); + ImageId dst_id; + 
ImageId src_id; + do { + has_deleted_images = false; + dst_id = FindImage(dst_info, dst_addr, FIND_OPTIONS); + src_id = FindImage(src_info, src_addr, FIND_OPTIONS); + const ImageBase* const dst_image = dst_id ? &slot_images[dst_id] : nullptr; + const ImageBase* const src_image = src_id ? &slot_images[src_id] : nullptr; + DeduceBlitImages(dst_info, src_info, dst_image, src_image); + if (GetFormatType(dst_info.format) != GetFormatType(src_info.format)) { + continue; + } + if (!dst_id) { + dst_id = InsertImage(dst_info, dst_addr, RelaxedOptions{}); + } + if (!src_id) { + src_id = InsertImage(src_info, src_addr, RelaxedOptions{}); + } + } while (has_deleted_images); + return BlitImages{ + .dst_id = dst_id, + .src_id = src_id, + .dst_format = dst_info.format, + .src_format = src_info.format, + }; +} + +template <class P> +SamplerId TextureCache<P>::FindSampler(const TSCEntry& config) { + if (std::ranges::all_of(config.raw, [](u64 value) { return value == 0; })) { + return NULL_SAMPLER_ID; + } + const auto [pair, is_new] = samplers.try_emplace(config); + if (is_new) { + pair->second = slot_samplers.insert(runtime, config); } + return pair->second; +} - void LoadSurface(const TSurface& surface) { - staging_cache.GetBuffer(0).resize(surface->GetHostSizeInBytes()); - surface->LoadBuffer(gpu_memory, staging_cache); - surface->UploadTexture(staging_cache.GetBuffer(0)); - surface->MarkAsModified(false, Tick()); +template <class P> +ImageViewId TextureCache<P>::FindColorBuffer(size_t index, bool is_clear) { + const auto& regs = maxwell3d.regs; + if (index >= regs.rt_control.count) { + return ImageViewId{}; + } + const auto& rt = regs.rt[index]; + const GPUVAddr gpu_addr = rt.Address(); + if (gpu_addr == 0) { + return ImageViewId{}; + } + if (rt.format == Tegra::RenderTargetFormat::NONE) { + return ImageViewId{}; } + const ImageInfo info(regs, index); + return FindRenderTargetView(info, gpu_addr, is_clear); +} - void FlushSurface(const TSurface& surface) { - if (!surface->IsModified()) { - return; - } - staging_cache.GetBuffer(0).resize(surface->GetHostSizeInBytes()); - surface->DownloadTexture(staging_cache.GetBuffer(0)); - surface->FlushBuffer(gpu_memory, staging_cache); - surface->MarkAsModified(false, Tick()); - } - - void RegisterInnerCache(TSurface& surface) { - const VAddr cpu_addr = surface->GetCpuAddr(); - VAddr start = cpu_addr >> registry_page_bits; - const VAddr end = (surface->GetCpuAddrEnd() - 1) >> registry_page_bits; - l1_cache[cpu_addr] = surface; - while (start <= end) { - registry[start].push_back(surface); - start++; - } +template <class P> +ImageViewId TextureCache<P>::FindDepthBuffer(bool is_clear) { + const auto& regs = maxwell3d.regs; + if (!regs.zeta_enable) { + return ImageViewId{}; + } + const GPUVAddr gpu_addr = regs.zeta.Address(); + if (gpu_addr == 0) { + return ImageViewId{}; } + const ImageInfo info(regs); + return FindRenderTargetView(info, gpu_addr, is_clear); +} - void UnregisterInnerCache(TSurface& surface) { - const VAddr cpu_addr = surface->GetCpuAddr(); - VAddr start = cpu_addr >> registry_page_bits; - const VAddr end = (surface->GetCpuAddrEnd() - 1) >> registry_page_bits; - l1_cache.erase(cpu_addr); - while (start <= end) { - auto& reg{registry[start]}; - reg.erase(std::find(reg.begin(), reg.end(), surface)); - start++; - } +template <class P> +ImageViewId TextureCache<P>::FindRenderTargetView(const ImageInfo& info, GPUVAddr gpu_addr, + bool is_clear) { + const auto options = is_clear ? 
RelaxedOptions::Samples : RelaxedOptions{}; + const ImageId image_id = FindOrInsertImage(info, gpu_addr, options); + if (!image_id) { + return NULL_IMAGE_VIEW_ID; + } + Image& image = slot_images[image_id]; + const ImageViewType view_type = RenderTargetImageViewType(info); + SubresourceBase base; + if (image.info.type == ImageType::Linear) { + base = SubresourceBase{.level = 0, .layer = 0}; + } else { + base = image.TryFindBase(gpu_addr).value(); } + const s32 layers = image.info.type == ImageType::e3D ? info.size.depth : info.resources.layers; + const SubresourceRange range{ + .base = base, + .extent = {.levels = 1, .layers = layers}, + }; + return FindOrEmplaceImageView(image_id, ImageViewInfo(view_type, info.format, range)); +} - VectorSurface GetSurfacesInRegion(const VAddr cpu_addr, const std::size_t size) { - if (size == 0) { - return {}; +template <class P> +template <typename Func> +void TextureCache<P>::ForEachImageInRegion(VAddr cpu_addr, size_t size, Func&& func) { + using FuncReturn = typename std::invoke_result<Func, ImageId, Image&>::type; + static constexpr bool BOOL_BREAK = std::is_same_v<FuncReturn, bool>; + boost::container::small_vector<ImageId, 32> images; + ForEachPage(cpu_addr, size, [this, &images, cpu_addr, size, func](u64 page) { + const auto it = page_table.find(page); + if (it == page_table.end()) { + if constexpr (BOOL_BREAK) { + return false; + } else { + return; + } } - const VAddr cpu_addr_end = cpu_addr + size; - const VAddr end = (cpu_addr_end - 1) >> registry_page_bits; - VectorSurface surfaces; - for (VAddr start = cpu_addr >> registry_page_bits; start <= end; ++start) { - const auto it = registry.find(start); - if (it == registry.end()) { + for (const ImageId image_id : it->second) { + Image& image = slot_images[image_id]; + if (True(image.flags & ImageFlagBits::Picked)) { continue; } - for (auto& surface : it->second) { - if (surface->IsPicked() || !surface->Overlaps(cpu_addr, cpu_addr_end)) { - continue; + if (!image.Overlaps(cpu_addr, size)) { + continue; + } + image.flags |= ImageFlagBits::Picked; + images.push_back(image_id); + if constexpr (BOOL_BREAK) { + if (func(image_id, image)) { + return true; } - surface->MarkAsPicked(true); - surfaces.push_back(surface); + } else { + func(image_id, image); } } - for (auto& surface : surfaces) { - surface->MarkAsPicked(false); + if constexpr (BOOL_BREAK) { + return false; } - return surfaces; + }); + for (const ImageId image_id : images) { + slot_images[image_id].flags &= ~ImageFlagBits::Picked; } +} - void ReserveSurface(const SurfaceParams& params, TSurface surface) { - surface_reserve[params].push_back(std::move(surface)); +template <class P> +ImageViewId TextureCache<P>::FindOrEmplaceImageView(ImageId image_id, const ImageViewInfo& info) { + Image& image = slot_images[image_id]; + if (const ImageViewId image_view_id = image.FindView(info); image_view_id) { + return image_view_id; } + const ImageViewId image_view_id = slot_image_views.insert(runtime, info, image_id, image); + image.InsertView(info, image_view_id); + return image_view_id; +} + +template <class P> +void TextureCache<P>::RegisterImage(ImageId image_id) { + ImageBase& image = slot_images[image_id]; + ASSERT_MSG(False(image.flags & ImageFlagBits::Registered), + "Trying to register an already registered image"); + image.flags |= ImageFlagBits::Registered; + ForEachPage(image.cpu_addr, image.guest_size_bytes, + [this, image_id](u64 page) { page_table[page].push_back(image_id); }); +} - TSurface TryGetReservedSurface(const SurfaceParams& params) 
{
-        auto search{surface_reserve.find(params)};
-        if (search == surface_reserve.end()) {
-            return {};
+template <class P>
+void TextureCache<P>::UnregisterImage(ImageId image_id) {
+    Image& image = slot_images[image_id];
+    ASSERT_MSG(True(image.flags & ImageFlagBits::Registered),
+               "Trying to unregister an image that is not registered");
+    image.flags &= ~ImageFlagBits::Registered;
+    ForEachPage(image.cpu_addr, image.guest_size_bytes, [this, image_id](u64 page) {
+        const auto page_it = page_table.find(page);
+        if (page_it == page_table.end()) {
+            UNREACHABLE_MSG("Unregistering unregistered page=0x{:x}", page << PAGE_BITS);
+            return;
+        }
-        for (auto& surface : search->second) {
-            if (!surface->IsRegistered()) {
-                return surface;
-            }
+        std::vector<ImageId>& image_ids = page_it->second;
+        const auto vector_it = std::ranges::find(image_ids, image_id);
+        if (vector_it == image_ids.end()) {
+            UNREACHABLE_MSG("Unregistering unregistered image in page=0x{:x}", page << PAGE_BITS);
+            return;
+        }
-        return {};
-    }
+        image_ids.erase(vector_it);
+    });
+}
-    /// Try to do an image copy logging when formats are incompatible.
-    void TryCopyImage(TSurface& src, TSurface& dst, const CopyParams& copy) {
-        const SurfaceParams& src_params = src->GetSurfaceParams();
-        const SurfaceParams& dst_params = dst->GetSurfaceParams();
-        if (!format_compatibility.TestCopy(src_params.pixel_format, dst_params.pixel_format)) {
-            LOG_ERROR(HW_GPU, "Illegal copy between formats={{{}, {}}}",
-                      static_cast<int>(dst_params.pixel_format),
-                      static_cast<int>(src_params.pixel_format));
-            return;
+template <class P>
+void TextureCache<P>::TrackImage(ImageBase& image) {
+    ASSERT(False(image.flags & ImageFlagBits::Tracked));
+    image.flags |= ImageFlagBits::Tracked;
+    rasterizer.UpdatePagesCachedCount(image.cpu_addr, image.guest_size_bytes, 1);
+}
+
+template <class P>
+void TextureCache<P>::UntrackImage(ImageBase& image) {
+    ASSERT(True(image.flags & ImageFlagBits::Tracked));
+    image.flags &= ~ImageFlagBits::Tracked;
+    rasterizer.UpdatePagesCachedCount(image.cpu_addr, image.guest_size_bytes, -1);
+}
+
+template <class P>
+void TextureCache<P>::DeleteImage(ImageId image_id) {
+    ImageBase& image = slot_images[image_id];
+    const GPUVAddr gpu_addr = image.gpu_addr;
+    const auto alloc_it = image_allocs_table.find(gpu_addr);
+    if (alloc_it == image_allocs_table.end()) {
+        UNREACHABLE_MSG("Trying to delete an image alloc that does not exist at address 0x{:x}",
+                        gpu_addr);
+        return;
+    }
+    const ImageAllocId alloc_id = alloc_it->second;
+    std::vector<ImageId>& alloc_images = slot_image_allocs[alloc_id].images;
+    const auto alloc_image_it = std::ranges::find(alloc_images, image_id);
+    if (alloc_image_it == alloc_images.end()) {
+        UNREACHABLE_MSG("Trying to delete an image that does not exist");
+        return;
+    }
+    ASSERT_MSG(False(image.flags & ImageFlagBits::Tracked), "Image was not untracked");
+    ASSERT_MSG(False(image.flags & ImageFlagBits::Registered), "Image was not unregistered");
+
+    // Mark render targets as dirty
+    auto& dirty = maxwell3d.dirty.flags;
+    dirty[Dirty::RenderTargets] = true;
+    dirty[Dirty::ZetaBuffer] = true;
+    for (size_t rt = 0; rt < NUM_RT; ++rt) {
+        dirty[Dirty::ColorBuffer0 + rt] = true;
+    }
+    const std::span<const ImageViewId> image_view_ids = image.image_view_ids;
+    for (const ImageViewId image_view_id : image_view_ids) {
+        std::ranges::replace(render_targets.color_buffer_ids, image_view_id, ImageViewId{});
+        if (render_targets.depth_buffer_id == image_view_id) {
+            render_targets.depth_buffer_id = ImageViewId{};
+        }
-
ImageCopy(src, dst, copy); } + RemoveImageViewReferences(image_view_ids); + RemoveFramebuffers(image_view_ids); + + for (const AliasedImage& alias : image.aliased_images) { + ImageBase& other_image = slot_images[alias.id]; + [[maybe_unused]] const size_t num_removed_aliases = + std::erase_if(other_image.aliased_images, [image_id](const AliasedImage& other_alias) { + return other_alias.id == image_id; + }); + ASSERT_MSG(num_removed_aliases == 1, "Invalid number of removed aliases: {}", + num_removed_aliases); + } + for (const ImageViewId image_view_id : image_view_ids) { + sentenced_image_view.Push(std::move(slot_image_views[image_view_id])); + slot_image_views.erase(image_view_id); + } + sentenced_images.Push(std::move(slot_images[image_id])); + slot_images.erase(image_id); - constexpr PixelFormat GetSiblingFormat(PixelFormat format) const { - return siblings_table[static_cast<std::size_t>(format)]; + alloc_images.erase(alloc_image_it); + if (alloc_images.empty()) { + image_allocs_table.erase(alloc_it); } + if constexpr (ENABLE_VALIDATION) { + std::ranges::fill(graphics_image_view_ids, CORRUPT_ID); + std::ranges::fill(compute_image_view_ids, CORRUPT_ID); + } + graphics_image_table.Invalidate(); + compute_image_table.Invalidate(); + has_deleted_images = true; +} - /// Returns true the shader sampler entry is compatible with the TIC texture type. - static bool IsTypeCompatible(Tegra::Texture::TextureType tic_type, - const VideoCommon::Shader::Sampler& entry) { - const auto shader_type = entry.type; - switch (tic_type) { - case Tegra::Texture::TextureType::Texture1D: - case Tegra::Texture::TextureType::Texture1DArray: - return shader_type == Tegra::Shader::TextureType::Texture1D; - case Tegra::Texture::TextureType::Texture1DBuffer: - // TODO(Rodrigo): Assume as valid for now - return true; - case Tegra::Texture::TextureType::Texture2D: - case Tegra::Texture::TextureType::Texture2DNoMipmap: - return shader_type == Tegra::Shader::TextureType::Texture2D; - case Tegra::Texture::TextureType::Texture2DArray: - return shader_type == Tegra::Shader::TextureType::Texture2D || - shader_type == Tegra::Shader::TextureType::TextureCube; - case Tegra::Texture::TextureType::Texture3D: - return shader_type == Tegra::Shader::TextureType::Texture3D; - case Tegra::Texture::TextureType::TextureCubeArray: - case Tegra::Texture::TextureType::TextureCubemap: - if (shader_type == Tegra::Shader::TextureType::TextureCube) { - return true; - } - return shader_type == Tegra::Shader::TextureType::Texture2D && entry.is_array; +template <class P> +void TextureCache<P>::RemoveImageViewReferences(std::span<const ImageViewId> removed_views) { + auto it = image_views.begin(); + while (it != image_views.end()) { + const auto found = std::ranges::find(removed_views, it->second); + if (found != removed_views.end()) { + it = image_views.erase(it); + } else { + ++it; } - UNREACHABLE(); - return true; } +} - struct FramebufferTargetInfo { - TSurface target; - TView view; - }; - - void AsyncFlushSurface(TSurface& surface) { - if (!uncommitted_flushes) { - uncommitted_flushes = std::make_shared<std::list<TSurface>>(); +template <class P> +void TextureCache<P>::RemoveFramebuffers(std::span<const ImageViewId> removed_views) { + auto it = framebuffers.begin(); + while (it != framebuffers.end()) { + if (it->first.Contains(removed_views)) { + it = framebuffers.erase(it); + } else { + ++it; } - uncommitted_flushes->push_back(surface); } +} - VideoCore::RasterizerInterface& rasterizer; - Tegra::Engines::Maxwell3D& maxwell3d; - 
Tegra::MemoryManager& gpu_memory; - - FormatLookupTable format_lookup_table; - FormatCompatibility format_compatibility; - - u64 ticks{}; - - // Guards the cache for protection conflicts. - bool guard_render_targets{}; - bool guard_samplers{}; - - // The siblings table is for formats that can inter exchange with one another - // without causing issues. This is only valid when a conflict occurs on a non - // rendering use. - std::array<PixelFormat, static_cast<std::size_t>(PixelFormat::Max)> siblings_table; - - // The internal Cache is different for the Texture Cache. It's based on buckets - // of 1MB. This fits better for the purpose of this cache as textures are normaly - // large in size. - static constexpr u64 registry_page_bits{20}; - static constexpr u64 registry_page_size{1 << registry_page_bits}; - std::unordered_map<VAddr, std::vector<TSurface>> registry; +template <class P> +void TextureCache<P>::MarkModification(ImageBase& image) noexcept { + image.flags |= ImageFlagBits::GpuModified; + image.modification_tick = ++modification_tick; +} - static constexpr u32 DEPTH_RT = 8; - static constexpr u32 NO_RT = 0xFFFFFFFF; +template <class P> +void TextureCache<P>::SynchronizeAliases(ImageId image_id) { + boost::container::small_vector<const AliasedImage*, 1> aliased_images; + ImageBase& image = slot_images[image_id]; + u64 most_recent_tick = image.modification_tick; + for (const AliasedImage& aliased : image.aliased_images) { + ImageBase& aliased_image = slot_images[aliased.id]; + if (image.modification_tick < aliased_image.modification_tick) { + most_recent_tick = std::max(most_recent_tick, aliased_image.modification_tick); + aliased_images.push_back(&aliased); + } + } + if (aliased_images.empty()) { + return; + } + image.modification_tick = most_recent_tick; + std::ranges::sort(aliased_images, [this](const AliasedImage* lhs, const AliasedImage* rhs) { + const ImageBase& lhs_image = slot_images[lhs->id]; + const ImageBase& rhs_image = slot_images[rhs->id]; + return lhs_image.modification_tick < rhs_image.modification_tick; + }); + for (const AliasedImage* const aliased : aliased_images) { + CopyImage(image_id, aliased->id, aliased->copies); + } +} - // The L1 Cache is used for fast texture lookup before checking the overlaps - // This avoids calculating size and other stuffs. - std::unordered_map<VAddr, TSurface> l1_cache; +template <class P> +void TextureCache<P>::PrepareImage(ImageId image_id, bool is_modification, bool invalidate) { + Image& image = slot_images[image_id]; + if (invalidate) { + image.flags &= ~(ImageFlagBits::CpuModified | ImageFlagBits::GpuModified); + if (False(image.flags & ImageFlagBits::Tracked)) { + TrackImage(image); + } + } else { + RefreshContents(image); + SynchronizeAliases(image_id); + } + if (is_modification) { + MarkModification(image); + } + image.frame_tick = frame_tick; +} - /// The surface reserve is a "backup" cache, this is where we put unique surfaces that have - /// previously been used. This is to prevent surfaces from being constantly created and - /// destroyed when used with different surface parameters. 
- std::unordered_map<SurfaceParams, std::vector<TSurface>> surface_reserve; - std::array<FramebufferTargetInfo, Tegra::Engines::Maxwell3D::Regs::NumRenderTargets> - render_targets; - FramebufferTargetInfo depth_buffer; +template <class P> +void TextureCache<P>::PrepareImageView(ImageViewId image_view_id, bool is_modification, + bool invalidate) { + if (!image_view_id) { + return; + } + const ImageViewBase& image_view = slot_image_views[image_view_id]; + PrepareImage(image_view.image_id, is_modification, invalidate); +} - std::vector<TSurface> sampled_textures; +template <class P> +void TextureCache<P>::CopyImage(ImageId dst_id, ImageId src_id, std::span<const ImageCopy> copies) { + Image& dst = slot_images[dst_id]; + Image& src = slot_images[src_id]; + const auto dst_format_type = GetFormatType(dst.info.format); + const auto src_format_type = GetFormatType(src.info.format); + if (src_format_type == dst_format_type) { + if constexpr (HAS_EMULATED_COPIES) { + if (!runtime.CanImageBeCopied(dst, src)) { + return runtime.EmulateCopyImage(dst, src, copies); + } + } + return runtime.CopyImage(dst, src, copies); + } + UNIMPLEMENTED_IF(dst.info.type != ImageType::e2D); + UNIMPLEMENTED_IF(src.info.type != ImageType::e2D); + for (const ImageCopy& copy : copies) { + UNIMPLEMENTED_IF(copy.dst_subresource.num_layers != 1); + UNIMPLEMENTED_IF(copy.src_subresource.num_layers != 1); + UNIMPLEMENTED_IF(copy.src_offset != Offset3D{}); + UNIMPLEMENTED_IF(copy.dst_offset != Offset3D{}); + + const SubresourceBase dst_base{ + .level = copy.dst_subresource.base_level, + .layer = copy.dst_subresource.base_layer, + }; + const SubresourceBase src_base{ + .level = copy.src_subresource.base_level, + .layer = copy.src_subresource.base_layer, + }; + const SubresourceExtent dst_extent{.levels = 1, .layers = 1}; + const SubresourceExtent src_extent{.levels = 1, .layers = 1}; + const SubresourceRange dst_range{.base = dst_base, .extent = dst_extent}; + const SubresourceRange src_range{.base = src_base, .extent = src_extent}; + const ImageViewInfo dst_view_info(ImageViewType::e2D, dst.info.format, dst_range); + const ImageViewInfo src_view_info(ImageViewType::e2D, src.info.format, src_range); + const auto [dst_framebuffer_id, dst_view_id] = RenderTargetFromImage(dst_id, dst_view_info); + Framebuffer* const dst_framebuffer = &slot_framebuffers[dst_framebuffer_id]; + const ImageViewId src_view_id = FindOrEmplaceImageView(src_id, src_view_info); + ImageView& dst_view = slot_image_views[dst_view_id]; + ImageView& src_view = slot_image_views[src_view_id]; + [[maybe_unused]] const Extent3D expected_size{ + .width = std::min(dst_view.size.width, src_view.size.width), + .height = std::min(dst_view.size.height, src_view.size.height), + .depth = std::min(dst_view.size.depth, src_view.size.depth), + }; + UNIMPLEMENTED_IF(copy.extent != expected_size); - /// This cache stores null surfaces in order to be used as a placeholder - /// for invalid texture calls. 
- std::unordered_map<u32, TSurface> invalid_cache; - std::vector<u8> invalid_memory; + runtime.ConvertImage(dst_framebuffer, dst_view, src_view); + } +} - std::list<TSurface> marked_for_unregister; +template <class P> +void TextureCache<P>::BindRenderTarget(ImageViewId* old_id, ImageViewId new_id) { + if (*old_id == new_id) { + return; + } + if (*old_id) { + const ImageViewBase& old_view = slot_image_views[*old_id]; + if (True(old_view.flags & ImageViewFlagBits::PreemtiveDownload)) { + uncommitted_downloads.push_back(old_view.image_id); + } + } + *old_id = new_id; +} - std::shared_ptr<std::list<TSurface>> uncommitted_flushes{}; - std::list<std::shared_ptr<std::list<TSurface>>> committed_flushes; +template <class P> +std::pair<FramebufferId, ImageViewId> TextureCache<P>::RenderTargetFromImage( + ImageId image_id, const ImageViewInfo& view_info) { + const ImageViewId view_id = FindOrEmplaceImageView(image_id, view_info); + const ImageBase& image = slot_images[image_id]; + const bool is_color = GetFormatType(image.info.format) == SurfaceType::ColorTexture; + const ImageViewId color_view_id = is_color ? view_id : ImageViewId{}; + const ImageViewId depth_view_id = is_color ? ImageViewId{} : view_id; + const Extent3D extent = MipSize(image.info.size, view_info.range.base.level); + const u32 num_samples = image.info.num_samples; + const auto [samples_x, samples_y] = SamplesLog2(num_samples); + const FramebufferId framebuffer_id = GetFramebufferId(RenderTargets{ + .color_buffer_ids = {color_view_id}, + .depth_buffer_id = depth_view_id, + .size = {extent.width >> samples_x, extent.height >> samples_y}, + }); + return {framebuffer_id, view_id}; +} - StagingCache staging_cache; - std::recursive_mutex mutex; -}; +template <class P> +bool TextureCache<P>::IsFullClear(ImageViewId id) { + if (!id) { + return true; + } + const ImageViewBase& image_view = slot_image_views[id]; + const ImageBase& image = slot_images[image_view.image_id]; + const Extent3D size = image_view.size; + const auto& regs = maxwell3d.regs; + const auto& scissor = regs.scissor_test[0]; + if (image.info.resources.levels > 1 || image.info.resources.layers > 1) { + // Images with multiple resources can't be cleared in a single call + return false; + } + if (regs.clear_flags.scissor == 0) { + // If scissor testing is disabled, the clear is always full + return true; + } + // Make sure the clear covers all texels in the subresource + return scissor.min_x == 0 && scissor.min_y == 0 && scissor.max_x >= size.width && + scissor.max_y >= size.height; +} } // namespace VideoCommon diff --git a/src/video_core/texture_cache/types.h b/src/video_core/texture_cache/types.h new file mode 100644 index 000000000..2ad2d72a6 --- /dev/null +++ b/src/video_core/texture_cache/types.h @@ -0,0 +1,140 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
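+//
+// Editorial note (annotation, not part of the original commit): this header
+// gathers the plain-old-data vocabulary of the rewritten texture cache: the
+// SlotVector ID aliases, image/view type enums, offset and extent structs, and
+// the subresource and copy descriptors consumed by the helpers in util.cpp.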
+ +#pragma once + +#include "common/common_funcs.h" +#include "common/common_types.h" +#include "video_core/texture_cache/slot_vector.h" + +namespace VideoCommon { + +constexpr size_t NUM_RT = 8; +constexpr size_t MAX_MIP_LEVELS = 14; + +constexpr SlotId CORRUPT_ID{0xfffffffe}; + +using ImageId = SlotId; +using ImageViewId = SlotId; +using ImageAllocId = SlotId; +using SamplerId = SlotId; +using FramebufferId = SlotId; + +enum class ImageType : u32 { + e1D, + e2D, + e3D, + Linear, + Buffer, +}; + +enum class ImageViewType : u32 { + e1D, + e2D, + Cube, + e3D, + e1DArray, + e2DArray, + CubeArray, + Rect, + Buffer, +}; +constexpr size_t NUM_IMAGE_VIEW_TYPES = 9; + +enum class RelaxedOptions : u32 { + Size = 1 << 0, + Format = 1 << 1, + Samples = 1 << 2, +}; +DECLARE_ENUM_FLAG_OPERATORS(RelaxedOptions) + +struct Offset2D { + constexpr auto operator<=>(const Offset2D&) const noexcept = default; + + s32 x; + s32 y; +}; + +struct Offset3D { + constexpr auto operator<=>(const Offset3D&) const noexcept = default; + + s32 x; + s32 y; + s32 z; +}; + +struct Extent2D { + constexpr auto operator<=>(const Extent2D&) const noexcept = default; + + u32 width; + u32 height; +}; + +struct Extent3D { + constexpr auto operator<=>(const Extent3D&) const noexcept = default; + + u32 width; + u32 height; + u32 depth; +}; + +struct SubresourceLayers { + s32 base_level = 0; + s32 base_layer = 0; + s32 num_layers = 1; +}; + +struct SubresourceBase { + constexpr auto operator<=>(const SubresourceBase&) const noexcept = default; + + s32 level = 0; + s32 layer = 0; +}; + +struct SubresourceExtent { + constexpr auto operator<=>(const SubresourceExtent&) const noexcept = default; + + s32 levels = 1; + s32 layers = 1; +}; + +struct SubresourceRange { + constexpr auto operator<=>(const SubresourceRange&) const noexcept = default; + + SubresourceBase base; + SubresourceExtent extent; +}; + +struct ImageCopy { + SubresourceLayers src_subresource; + SubresourceLayers dst_subresource; + Offset3D src_offset; + Offset3D dst_offset; + Extent3D extent; +}; + +struct BufferImageCopy { + size_t buffer_offset; + size_t buffer_size; + u32 buffer_row_length; + u32 buffer_image_height; + SubresourceLayers image_subresource; + Offset3D image_offset; + Extent3D image_extent; +}; + +struct BufferCopy { + size_t src_offset; + size_t dst_offset; + size_t size; +}; + +struct SwizzleParameters { + Extent3D num_tiles; + Extent3D block; + size_t buffer_offset; + s32 level; +}; + +} // namespace VideoCommon diff --git a/src/video_core/texture_cache/util.cpp b/src/video_core/texture_cache/util.cpp new file mode 100644 index 000000000..279932778 --- /dev/null +++ b/src/video_core/texture_cache/util.cpp @@ -0,0 +1,1233 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
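+//
+// Editorial note (annotation, not part of the original commit): this file
+// implements the guest-memory layout math for the new cache: block-linear and
+// pitch-linear size calculations, mip level offset tables, overlap resolution
+// between aliasing images, and the swizzle/unswizzle copy builders.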
+
+// This file contains code from Ryujinx
+// A copy of the code can be obtained from https://github.com/Ryujinx/Ryujinx
+// The sections using code from Ryujinx are marked with a link to the original version
+
+// MIT License
+//
+// Copyright (c) Ryujinx Team and Contributors
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+// associated documentation files (the "Software"), to deal in the Software without restriction,
+// including without limitation the rights to use, copy, modify, merge, publish, distribute,
+// sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all copies or
+// substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
+// NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+//
+
+#include <algorithm>
+#include <array>
+#include <numeric>
+#include <optional>
+#include <span>
+#include <vector>
+
+#include "common/alignment.h"
+#include "common/assert.h"
+#include "common/bit_util.h"
+#include "common/common_types.h"
+#include "common/div_ceil.h"
+#include "video_core/compatible_formats.h"
+#include "video_core/engines/maxwell_3d.h"
+#include "video_core/memory_manager.h"
+#include "video_core/surface.h"
+#include "video_core/texture_cache/decode_bc4.h"
+#include "video_core/texture_cache/format_lookup_table.h"
+#include "video_core/texture_cache/formatter.h"
+#include "video_core/texture_cache/samples_helper.h"
+#include "video_core/texture_cache/util.h"
+#include "video_core/textures/astc.h"
+#include "video_core/textures/decoders.h"
+
+namespace VideoCommon {
+
+namespace {
+
+using Tegra::Texture::GOB_SIZE;
+using Tegra::Texture::GOB_SIZE_SHIFT;
+using Tegra::Texture::GOB_SIZE_X;
+using Tegra::Texture::GOB_SIZE_X_SHIFT;
+using Tegra::Texture::GOB_SIZE_Y;
+using Tegra::Texture::GOB_SIZE_Y_SHIFT;
+using Tegra::Texture::GOB_SIZE_Z;
+using Tegra::Texture::GOB_SIZE_Z_SHIFT;
+using Tegra::Texture::MsaaMode;
+using Tegra::Texture::SwizzleTexture;
+using Tegra::Texture::TextureFormat;
+using Tegra::Texture::TextureType;
+using Tegra::Texture::TICEntry;
+using Tegra::Texture::UnswizzleTexture;
+using VideoCore::Surface::BytesPerBlock;
+using VideoCore::Surface::DefaultBlockHeight;
+using VideoCore::Surface::DefaultBlockWidth;
+using VideoCore::Surface::IsCopyCompatible;
+using VideoCore::Surface::IsPixelFormatASTC;
+using VideoCore::Surface::IsViewCompatible;
+using VideoCore::Surface::PixelFormatFromDepthFormat;
+using VideoCore::Surface::PixelFormatFromRenderTargetFormat;
+using VideoCore::Surface::SurfaceType;
+
+constexpr u32 CONVERTED_BYTES_PER_BLOCK = BytesPerBlock(PixelFormat::A8B8G8R8_UNORM);
+
+struct LevelInfo {
+    Extent3D size;
+    Extent3D block;
+    Extent2D tile_size;
+    u32 bpp_log2;
+    u32 tile_width_spacing;
+};
+
+[[nodiscard]] constexpr u32 AdjustTileSize(u32 shift, u32 unit_factor, u32 dimension) {
+    if (shift == 0) {
+        return 0;
+    }
+    u32 x = unit_factor << (shift - 1);
+    if (x >= dimension) {
+        while (--shift) {
+ x >>= 1; + if (x < dimension) { + break; + } + } + } + return shift; +} + +[[nodiscard]] constexpr u32 AdjustMipSize(u32 size, u32 level) { + return std::max<u32>(size >> level, 1); +} + +[[nodiscard]] constexpr Extent3D AdjustMipSize(Extent3D size, s32 level) { + return Extent3D{ + .width = AdjustMipSize(size.width, level), + .height = AdjustMipSize(size.height, level), + .depth = AdjustMipSize(size.depth, level), + }; +} + +[[nodiscard]] Extent3D AdjustSamplesSize(Extent3D size, s32 num_samples) { + const auto [samples_x, samples_y] = SamplesLog2(num_samples); + return Extent3D{ + .width = size.width >> samples_x, + .height = size.height >> samples_y, + .depth = size.depth, + }; +} + +template <u32 GOB_EXTENT> +[[nodiscard]] constexpr u32 AdjustMipBlockSize(u32 num_tiles, u32 block_size, u32 level) { + do { + while (block_size > 0 && num_tiles <= (1U << (block_size - 1)) * GOB_EXTENT) { + --block_size; + } + } while (level--); + return block_size; +} + +[[nodiscard]] constexpr Extent3D AdjustMipBlockSize(Extent3D num_tiles, Extent3D block_size, + u32 level) { + return { + .width = AdjustMipBlockSize<GOB_SIZE_X>(num_tiles.width, block_size.width, level), + .height = AdjustMipBlockSize<GOB_SIZE_Y>(num_tiles.height, block_size.height, level), + .depth = AdjustMipBlockSize<GOB_SIZE_Z>(num_tiles.depth, block_size.depth, level), + }; +} + +[[nodiscard]] constexpr Extent3D AdjustTileSize(Extent3D size, Extent2D tile_size) { + return { + .width = Common::DivCeil(size.width, tile_size.width), + .height = Common::DivCeil(size.height, tile_size.height), + .depth = size.depth, + }; +} + +[[nodiscard]] constexpr u32 BytesPerBlockLog2(u32 bytes_per_block) { + return std::countl_zero(bytes_per_block) ^ 0x1F; +} + +[[nodiscard]] constexpr u32 BytesPerBlockLog2(PixelFormat format) { + return BytesPerBlockLog2(BytesPerBlock(format)); +} + +[[nodiscard]] constexpr u32 NumBlocks(Extent3D size, Extent2D tile_size) { + const Extent3D num_blocks = AdjustTileSize(size, tile_size); + return num_blocks.width * num_blocks.height * num_blocks.depth; +} + +[[nodiscard]] constexpr u32 AdjustSize(u32 size, u32 level, u32 block_size) { + return Common::DivCeil(AdjustMipSize(size, level), block_size); +} + +[[nodiscard]] constexpr u32 LayerSize(const TICEntry& config, PixelFormat format) { + return config.Width() * config.Height() * BytesPerBlock(format); +} + +[[nodiscard]] constexpr bool HasTwoDimsPerLayer(TextureType type) { + switch (type) { + case TextureType::Texture2D: + case TextureType::Texture2DArray: + case TextureType::Texture2DNoMipmap: + case TextureType::Texture3D: + case TextureType::TextureCubeArray: + case TextureType::TextureCubemap: + return true; + case TextureType::Texture1D: + case TextureType::Texture1DArray: + case TextureType::Texture1DBuffer: + return false; + } + return false; +} + +[[nodiscard]] constexpr bool HasTwoDimsPerLayer(ImageType type) { + switch (type) { + case ImageType::e2D: + case ImageType::e3D: + case ImageType::Linear: + return true; + case ImageType::e1D: + case ImageType::Buffer: + return false; + } + UNREACHABLE_MSG("Invalid image type={}", static_cast<int>(type)); +} + +[[nodiscard]] constexpr std::pair<int, int> Samples(int num_samples) { + switch (num_samples) { + case 1: + return {1, 1}; + case 2: + return {2, 1}; + case 4: + return {2, 2}; + case 8: + return {4, 2}; + case 16: + return {4, 4}; + } + UNREACHABLE_MSG("Invalid number of samples={}", num_samples); + return {1, 1}; +} + +[[nodiscard]] constexpr Extent2D DefaultBlockSize(PixelFormat format) { + return 
{DefaultBlockWidth(format), DefaultBlockHeight(format)}; +} + +[[nodiscard]] constexpr Extent3D NumLevelBlocks(const LevelInfo& info, u32 level) { + return Extent3D{ + .width = AdjustSize(info.size.width, level, info.tile_size.width) << info.bpp_log2, + .height = AdjustSize(info.size.height, level, info.tile_size.height), + .depth = AdjustMipSize(info.size.depth, level), + }; +} + +[[nodiscard]] constexpr Extent3D TileShift(const LevelInfo& info, u32 level) { + const Extent3D blocks = NumLevelBlocks(info, level); + return Extent3D{ + .width = AdjustTileSize(info.block.width, GOB_SIZE_X, blocks.width), + .height = AdjustTileSize(info.block.height, GOB_SIZE_Y, blocks.height), + .depth = AdjustTileSize(info.block.depth, GOB_SIZE_Z, blocks.depth), + }; +} + +[[nodiscard]] constexpr Extent2D GobSize(u32 bpp_log2, u32 block_height, u32 tile_width_spacing) { + return Extent2D{ + .width = GOB_SIZE_X_SHIFT - bpp_log2 + tile_width_spacing, + .height = GOB_SIZE_Y_SHIFT + block_height, + }; +} + +[[nodiscard]] constexpr bool IsSmallerThanGobSize(Extent3D num_tiles, Extent2D gob, + u32 block_depth) { + return num_tiles.width <= (1U << gob.width) || num_tiles.height <= (1U << gob.height) || + num_tiles.depth < (1U << block_depth); +} + +[[nodiscard]] constexpr u32 StrideAlignment(Extent3D num_tiles, Extent3D block, Extent2D gob, + u32 bpp_log2) { + if (IsSmallerThanGobSize(num_tiles, gob, block.depth)) { + return GOB_SIZE_X_SHIFT - bpp_log2; + } else { + return gob.width; + } +} + +[[nodiscard]] constexpr u32 StrideAlignment(Extent3D num_tiles, Extent3D block, u32 bpp_log2, + u32 tile_width_spacing) { + const Extent2D gob = GobSize(bpp_log2, block.height, tile_width_spacing); + return StrideAlignment(num_tiles, block, gob, bpp_log2); +} + +[[nodiscard]] constexpr Extent2D NumGobs(const LevelInfo& info, u32 level) { + const Extent3D blocks = NumLevelBlocks(info, level); + const Extent2D gobs{ + .width = Common::DivCeilLog2(blocks.width, GOB_SIZE_X_SHIFT), + .height = Common::DivCeilLog2(blocks.height, GOB_SIZE_Y_SHIFT), + }; + const Extent2D gob = GobSize(info.bpp_log2, info.block.height, info.tile_width_spacing); + const bool is_small = IsSmallerThanGobSize(blocks, gob, info.block.depth); + const u32 alignment = is_small ? 
0 : info.tile_width_spacing; + return Extent2D{ + .width = Common::AlignBits(gobs.width, alignment), + .height = gobs.height, + }; +} + +[[nodiscard]] constexpr Extent3D LevelTiles(const LevelInfo& info, u32 level) { + const Extent3D blocks = NumLevelBlocks(info, level); + const Extent3D tile_shift = TileShift(info, level); + const Extent2D gobs = NumGobs(info, level); + return Extent3D{ + .width = Common::DivCeilLog2(gobs.width, tile_shift.width), + .height = Common::DivCeilLog2(gobs.height, tile_shift.height), + .depth = Common::DivCeilLog2(blocks.depth, tile_shift.depth), + }; +} + +[[nodiscard]] constexpr u32 CalculateLevelSize(const LevelInfo& info, u32 level) { + const Extent3D tile_shift = TileShift(info, level); + const Extent3D tiles = LevelTiles(info, level); + const u32 num_tiles = tiles.width * tiles.height * tiles.depth; + const u32 shift = GOB_SIZE_SHIFT + tile_shift.width + tile_shift.height + tile_shift.depth; + return num_tiles << shift; +} + +[[nodiscard]] constexpr std::array<u32, MAX_MIP_LEVELS> CalculateLevelSizes(const LevelInfo& info, + u32 num_levels) { + ASSERT(num_levels <= MAX_MIP_LEVELS); + std::array<u32, MAX_MIP_LEVELS> sizes{}; + for (u32 level = 0; level < num_levels; ++level) { + sizes[level] = CalculateLevelSize(info, level); + } + return sizes; +} + +[[nodiscard]] constexpr LevelInfo MakeLevelInfo(PixelFormat format, Extent3D size, Extent3D block, + u32 num_samples, u32 tile_width_spacing) { + const auto [samples_x, samples_y] = Samples(num_samples); + const u32 bytes_per_block = BytesPerBlock(format); + return { + .size = + { + .width = size.width * samples_x, + .height = size.height * samples_y, + .depth = size.depth, + }, + .block = block, + .tile_size = DefaultBlockSize(format), + .bpp_log2 = BytesPerBlockLog2(bytes_per_block), + .tile_width_spacing = tile_width_spacing, + }; +} + +[[nodiscard]] constexpr LevelInfo MakeLevelInfo(const ImageInfo& info) { + return MakeLevelInfo(info.format, info.size, info.block, info.num_samples, + info.tile_width_spacing); +} + +[[nodiscard]] constexpr u32 CalculateLevelOffset(PixelFormat format, Extent3D size, Extent3D block, + u32 num_samples, u32 tile_width_spacing, + u32 level) { + const LevelInfo info = MakeLevelInfo(format, size, block, num_samples, tile_width_spacing); + u32 offset = 0; + for (u32 current_level = 0; current_level < level; ++current_level) { + offset += CalculateLevelSize(info, current_level); + } + return offset; +} + +[[nodiscard]] constexpr u32 AlignLayerSize(u32 size_bytes, Extent3D size, Extent3D block, + u32 tile_size_y, u32 tile_width_spacing) { + // https://github.com/Ryujinx/Ryujinx/blob/1c9aba6de1520aea5480c032e0ff5664ac1bb36f/Ryujinx.Graphics.Texture/SizeCalculator.cs#L134 + if (tile_width_spacing > 0) { + const u32 alignment_log2 = GOB_SIZE_SHIFT + tile_width_spacing + block.height + block.depth; + return Common::AlignBits(size_bytes, alignment_log2); + } + const u32 aligned_height = Common::AlignUp(size.height, tile_size_y); + while (block.height != 0 && aligned_height <= (1U << (block.height - 1)) * GOB_SIZE_Y) { + --block.height; + } + while (block.depth != 0 && size.depth <= (1U << (block.depth - 1))) { + --block.depth; + } + const u32 block_shift = GOB_SIZE_SHIFT + block.height + block.depth; + const u32 num_blocks = size_bytes >> block_shift; + if (size_bytes != num_blocks << block_shift) { + return (num_blocks + 1) << block_shift; + } + return size_bytes; +} + +[[nodiscard]] std::optional<SubresourceExtent> ResolveOverlapEqualAddress(const ImageInfo& new_info, + const 
ImageBase& overlap, + bool strict_size) { + const ImageInfo& info = overlap.info; + if (!IsBlockLinearSizeCompatible(new_info, info, 0, 0, strict_size)) { + return std::nullopt; + } + if (new_info.block != info.block) { + return std::nullopt; + } + const SubresourceExtent resources = new_info.resources; + return SubresourceExtent{ + .levels = std::max(resources.levels, info.resources.levels), + .layers = std::max(resources.layers, info.resources.layers), + }; +} + +[[nodiscard]] std::optional<SubresourceExtent> ResolveOverlapRightAddress3D( + const ImageInfo& new_info, GPUVAddr gpu_addr, const ImageBase& overlap, bool strict_size) { + const std::vector<u32> slice_offsets = CalculateSliceOffsets(new_info); + const u32 diff = static_cast<u32>(overlap.gpu_addr - gpu_addr); + const auto it = std::ranges::find(slice_offsets, diff); + if (it == slice_offsets.end()) { + return std::nullopt; + } + const std::vector subresources = CalculateSliceSubresources(new_info); + const SubresourceBase base = subresources[std::distance(slice_offsets.begin(), it)]; + const ImageInfo& info = overlap.info; + if (!IsBlockLinearSizeCompatible(new_info, info, base.level, 0, strict_size)) { + return std::nullopt; + } + const u32 mip_depth = std::max(1U, new_info.size.depth << base.level); + if (mip_depth < info.size.depth + base.layer) { + return std::nullopt; + } + if (MipBlockSize(new_info, base.level) != info.block) { + return std::nullopt; + } + return SubresourceExtent{ + .levels = std::max(new_info.resources.levels, info.resources.levels + base.level), + .layers = 1, + }; +} + +[[nodiscard]] std::optional<SubresourceExtent> ResolveOverlapRightAddress2D( + const ImageInfo& new_info, GPUVAddr gpu_addr, const ImageBase& overlap, bool strict_size) { + const u32 layer_stride = new_info.layer_stride; + const s32 new_size = layer_stride * new_info.resources.layers; + const s32 diff = static_cast<s32>(overlap.gpu_addr - gpu_addr); + if (diff > new_size) { + return std::nullopt; + } + const s32 base_layer = diff / layer_stride; + const s32 mip_offset = diff % layer_stride; + const std::array offsets = CalculateMipLevelOffsets(new_info); + const auto end = offsets.begin() + new_info.resources.levels; + const auto it = std::find(offsets.begin(), end, mip_offset); + if (it == end) { + // Mipmap is not aligned to any valid size + return std::nullopt; + } + const SubresourceBase base{ + .level = static_cast<s32>(std::distance(offsets.begin(), it)), + .layer = base_layer, + }; + const ImageInfo& info = overlap.info; + if (!IsBlockLinearSizeCompatible(new_info, info, base.level, 0, strict_size)) { + return std::nullopt; + } + if (MipBlockSize(new_info, base.level) != info.block) { + return std::nullopt; + } + return SubresourceExtent{ + .levels = std::max(new_info.resources.levels, info.resources.levels + base.level), + .layers = std::max(new_info.resources.layers, info.resources.layers + base.layer), + }; +} + +[[nodiscard]] std::optional<OverlapResult> ResolveOverlapRightAddress(const ImageInfo& new_info, + GPUVAddr gpu_addr, + VAddr cpu_addr, + const ImageBase& overlap, + bool strict_size) { + std::optional<SubresourceExtent> resources; + if (new_info.type != ImageType::e3D) { + resources = ResolveOverlapRightAddress2D(new_info, gpu_addr, overlap, strict_size); + } else { + resources = ResolveOverlapRightAddress3D(new_info, gpu_addr, overlap, strict_size); + } + if (!resources) { + return std::nullopt; + } + return OverlapResult{ + .gpu_addr = gpu_addr, + .cpu_addr = cpu_addr, + .resources = *resources, + }; +} + 
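+// Editorial sketch (annotation with assumed numbers, not part of the original
+// commit): in ResolveOverlapRightAddress2D above, suppose the new image has
+// layer_stride = 0x20000 and mip offsets {0x0, 0x18000, 0x1e000}. An overlap
+// that begins 0x38000 bytes past gpu_addr yields base_layer = 0x38000 /
+// 0x20000 = 1 and mip_offset = 0x38000 % 0x20000 = 0x18000, which matches the
+// offset of level 1, so the overlap maps to subresource {level = 1, layer = 1}.
+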
+[[nodiscard]] std::optional<OverlapResult> ResolveOverlapLeftAddress(const ImageInfo& new_info, + GPUVAddr gpu_addr, + VAddr cpu_addr, + const ImageBase& overlap, + bool strict_size) { + const std::optional<SubresourceBase> base = overlap.TryFindBase(gpu_addr); + if (!base) { + return std::nullopt; + } + const ImageInfo& info = overlap.info; + if (!IsBlockLinearSizeCompatible(new_info, info, base->level, 0, strict_size)) { + return std::nullopt; + } + if (new_info.block != MipBlockSize(info, base->level)) { + return std::nullopt; + } + const SubresourceExtent resources = new_info.resources; + s32 layers = 1; + if (info.type != ImageType::e3D) { + layers = std::max(resources.layers, info.resources.layers + base->layer); + } + return OverlapResult{ + .gpu_addr = overlap.gpu_addr, + .cpu_addr = overlap.cpu_addr, + .resources = + { + .levels = std::max(resources.levels + base->level, info.resources.levels), + .layers = layers, + }, + }; +} + +[[nodiscard]] Extent2D PitchLinearAlignedSize(const ImageInfo& info) { + // https://github.com/Ryujinx/Ryujinx/blob/1c9aba6de1520aea5480c032e0ff5664ac1bb36f/Ryujinx.Graphics.Texture/SizeCalculator.cs#L212 + static constexpr u32 STRIDE_ALIGNMENT = 32; + ASSERT(info.type == ImageType::Linear); + const Extent2D num_tiles{ + .width = Common::DivCeil(info.size.width, DefaultBlockWidth(info.format)), + .height = Common::DivCeil(info.size.height, DefaultBlockHeight(info.format)), + }; + const u32 width_alignment = STRIDE_ALIGNMENT / BytesPerBlock(info.format); + return Extent2D{ + .width = Common::AlignUp(num_tiles.width, width_alignment), + .height = num_tiles.height, + }; +} + +[[nodiscard]] Extent3D BlockLinearAlignedSize(const ImageInfo& info, u32 level) { + // https://github.com/Ryujinx/Ryujinx/blob/1c9aba6de1520aea5480c032e0ff5664ac1bb36f/Ryujinx.Graphics.Texture/SizeCalculator.cs#L176 + ASSERT(info.type != ImageType::Linear); + const Extent3D size = AdjustMipSize(info.size, level); + const Extent3D num_tiles{ + .width = Common::DivCeil(size.width, DefaultBlockWidth(info.format)), + .height = Common::DivCeil(size.height, DefaultBlockHeight(info.format)), + .depth = size.depth, + }; + const u32 bpp_log2 = BytesPerBlockLog2(info.format); + const u32 alignment = StrideAlignment(num_tiles, info.block, bpp_log2, info.tile_width_spacing); + const Extent3D mip_block = AdjustMipBlockSize(num_tiles, info.block, 0); + return Extent3D{ + .width = Common::AlignBits(num_tiles.width, alignment), + .height = Common::AlignBits(num_tiles.height, GOB_SIZE_Y_SHIFT + mip_block.height), + .depth = Common::AlignBits(num_tiles.depth, GOB_SIZE_Z_SHIFT + mip_block.depth), + }; +} + +[[nodiscard]] constexpr u32 NumBlocksPerLayer(const ImageInfo& info, Extent2D tile_size) noexcept { + u32 num_blocks = 0; + for (s32 level = 0; level < info.resources.levels; ++level) { + const Extent3D mip_size = AdjustMipSize(info.size, level); + num_blocks += NumBlocks(mip_size, tile_size); + } + return num_blocks; +} + +[[nodiscard]] u32 NumSlices(const ImageInfo& info) noexcept { + ASSERT(info.type == ImageType::e3D); + u32 num_slices = 0; + for (s32 level = 0; level < info.resources.levels; ++level) { + num_slices += AdjustMipSize(info.size.depth, level); + } + return num_slices; +} + +void SwizzlePitchLinearImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, + const ImageInfo& info, const BufferImageCopy& copy, + std::span<const u8> memory) { + ASSERT(copy.image_offset.z == 0); + ASSERT(copy.image_extent.depth == 1); + ASSERT(copy.image_subresource.base_level == 0); + 
ASSERT(copy.image_subresource.base_layer == 0); + ASSERT(copy.image_subresource.num_layers == 1); + + const u32 bytes_per_block = BytesPerBlock(info.format); + const u32 row_length = copy.image_extent.width * bytes_per_block; + const u32 guest_offset_x = copy.image_offset.x * bytes_per_block; + + for (u32 line = 0; line < copy.image_extent.height; ++line) { + const u32 host_offset_y = line * info.pitch; + const u32 guest_offset_y = (copy.image_offset.y + line) * info.pitch; + const u32 guest_offset = guest_offset_x + guest_offset_y; + gpu_memory.WriteBlockUnsafe(gpu_addr + guest_offset, memory.data() + host_offset_y, + row_length); + } +} + +void SwizzleBlockLinearImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, + const ImageInfo& info, const BufferImageCopy& copy, + std::span<const u8> input) { + const Extent3D size = info.size; + const LevelInfo level_info = MakeLevelInfo(info); + const Extent2D tile_size = DefaultBlockSize(info.format); + const u32 bytes_per_block = BytesPerBlock(info.format); + + const s32 level = copy.image_subresource.base_level; + const Extent3D level_size = AdjustMipSize(size, level); + const u32 num_blocks_per_layer = NumBlocks(level_size, tile_size); + const u32 host_bytes_per_layer = num_blocks_per_layer * bytes_per_block; + + UNIMPLEMENTED_IF(info.tile_width_spacing > 0); + + UNIMPLEMENTED_IF(copy.image_offset.x != 0); + UNIMPLEMENTED_IF(copy.image_offset.y != 0); + UNIMPLEMENTED_IF(copy.image_offset.z != 0); + UNIMPLEMENTED_IF(copy.image_extent != level_size); + + const Extent3D num_tiles = AdjustTileSize(level_size, tile_size); + const Extent3D block = AdjustMipBlockSize(num_tiles, level_info.block, level); + + size_t host_offset = copy.buffer_offset; + + const u32 num_levels = info.resources.levels; + const std::array sizes = CalculateLevelSizes(level_info, num_levels); + size_t guest_offset = std::reduce(sizes.begin(), sizes.begin() + level, 0); + const size_t layer_stride = + AlignLayerSize(std::reduce(sizes.begin(), sizes.begin() + num_levels, 0), size, + level_info.block, tile_size.height, info.tile_width_spacing); + const size_t subresource_size = sizes[level]; + + const auto dst_data = std::make_unique<u8[]>(subresource_size); + const std::span<u8> dst(dst_data.get(), subresource_size); + + for (s32 layer = 0; layer < info.resources.layers; ++layer) { + const std::span<const u8> src = input.subspan(host_offset); + SwizzleTexture(dst, src, bytes_per_block, num_tiles.width, num_tiles.height, + num_tiles.depth, block.height, block.depth); + + gpu_memory.WriteBlockUnsafe(gpu_addr + guest_offset, dst.data(), dst.size_bytes()); + + host_offset += host_bytes_per_layer; + guest_offset += layer_stride; + } + ASSERT(host_offset - copy.buffer_offset == copy.buffer_size); +} + +} // Anonymous namespace + +u32 CalculateGuestSizeInBytes(const ImageInfo& info) noexcept { + if (info.type == ImageType::Buffer) { + return info.size.width * BytesPerBlock(info.format); + } + if (info.type == ImageType::Linear) { + return info.pitch * Common::DivCeil(info.size.height, DefaultBlockHeight(info.format)); + } + if (info.resources.layers > 1) { + ASSERT(info.layer_stride != 0); + return info.layer_stride * info.resources.layers; + } else { + return CalculateLayerSize(info); + } +} + +u32 CalculateUnswizzledSizeBytes(const ImageInfo& info) noexcept { + if (info.type == ImageType::Buffer) { + return info.size.width * BytesPerBlock(info.format); + } + if (info.num_samples > 1) { + // Multisample images can't be uploaded or downloaded to the host + return 0; + } + if 
(info.type == ImageType::Linear) { + return info.pitch * Common::DivCeil(info.size.height, DefaultBlockHeight(info.format)); + } + const Extent2D tile_size = DefaultBlockSize(info.format); + return NumBlocksPerLayer(info, tile_size) * info.resources.layers * BytesPerBlock(info.format); +} + +u32 CalculateConvertedSizeBytes(const ImageInfo& info) noexcept { + if (info.type == ImageType::Buffer) { + return info.size.width * BytesPerBlock(info.format); + } + static constexpr Extent2D TILE_SIZE{1, 1}; + return NumBlocksPerLayer(info, TILE_SIZE) * info.resources.layers * CONVERTED_BYTES_PER_BLOCK; +} + +u32 CalculateLayerStride(const ImageInfo& info) noexcept { + ASSERT(info.type != ImageType::Linear); + const u32 layer_size = CalculateLayerSize(info); + const Extent3D size = info.size; + const Extent3D block = info.block; + const u32 tile_size_y = DefaultBlockHeight(info.format); + return AlignLayerSize(layer_size, size, block, tile_size_y, info.tile_width_spacing); +} + +u32 CalculateLayerSize(const ImageInfo& info) noexcept { + ASSERT(info.type != ImageType::Linear); + return CalculateLevelOffset(info.format, info.size, info.block, info.num_samples, + info.tile_width_spacing, info.resources.levels); +} + +std::array<u32, MAX_MIP_LEVELS> CalculateMipLevelOffsets(const ImageInfo& info) noexcept { + ASSERT(info.resources.levels <= MAX_MIP_LEVELS); + const LevelInfo level_info = MakeLevelInfo(info); + std::array<u32, MAX_MIP_LEVELS> offsets{}; + u32 offset = 0; + for (s32 level = 0; level < info.resources.levels; ++level) { + offsets[level] = offset; + offset += CalculateLevelSize(level_info, level); + } + return offsets; +} + +std::vector<u32> CalculateSliceOffsets(const ImageInfo& info) { + ASSERT(info.type == ImageType::e3D); + std::vector<u32> offsets; + offsets.reserve(NumSlices(info)); + + const LevelInfo level_info = MakeLevelInfo(info); + u32 mip_offset = 0; + for (s32 level = 0; level < info.resources.levels; ++level) { + const Extent3D tile_shift = TileShift(level_info, level); + const Extent3D tiles = LevelTiles(level_info, level); + const u32 gob_size_shift = tile_shift.height + GOB_SIZE_SHIFT; + const u32 slice_size = (tiles.width * tiles.height) << gob_size_shift; + const u32 z_mask = (1U << tile_shift.depth) - 1; + const u32 depth = AdjustMipSize(info.size.depth, level); + for (u32 slice = 0; slice < depth; ++slice) { + const u32 z_low = slice & z_mask; + const u32 z_high = slice & ~z_mask; + offsets.push_back(mip_offset + (z_low << gob_size_shift) + (z_high * slice_size)); + } + mip_offset += CalculateLevelSize(level_info, level); + } + return offsets; +} + +std::vector<SubresourceBase> CalculateSliceSubresources(const ImageInfo& info) { + ASSERT(info.type == ImageType::e3D); + std::vector<SubresourceBase> subresources; + subresources.reserve(NumSlices(info)); + for (s32 level = 0; level < info.resources.levels; ++level) { + const s32 depth = AdjustMipSize(info.size.depth, level); + for (s32 slice = 0; slice < depth; ++slice) { + subresources.emplace_back(SubresourceBase{ + .level = level, + .layer = slice, + }); + } + } + return subresources; +} + +u32 CalculateLevelStrideAlignment(const ImageInfo& info, u32 level) { + const Extent2D tile_size = DefaultBlockSize(info.format); + const Extent3D level_size = AdjustMipSize(info.size, level); + const Extent3D num_tiles = AdjustTileSize(level_size, tile_size); + const Extent3D block = AdjustMipBlockSize(num_tiles, info.block, level); + const u32 bpp_log2 = BytesPerBlockLog2(info.format); + return StrideAlignment(num_tiles, block, 
bpp_log2, info.tile_width_spacing); +} + +PixelFormat PixelFormatFromTIC(const TICEntry& config) noexcept { + return PixelFormatFromTextureInfo(config.format, config.r_type, config.g_type, config.b_type, + config.a_type, config.srgb_conversion); +} + +ImageViewType RenderTargetImageViewType(const ImageInfo& info) noexcept { + switch (info.type) { + case ImageType::e2D: + return info.resources.layers > 1 ? ImageViewType::e2DArray : ImageViewType::e2D; + case ImageType::e3D: + return ImageViewType::e2DArray; + case ImageType::Linear: + return ImageViewType::e2D; + default: + UNIMPLEMENTED_MSG("Unimplemented image type={}", static_cast<int>(info.type)); + return ImageViewType{}; + } +} + +std::vector<ImageCopy> MakeShrinkImageCopies(const ImageInfo& dst, const ImageInfo& src, + SubresourceBase base) { + ASSERT(dst.resources.levels >= src.resources.levels); + ASSERT(dst.num_samples == src.num_samples); + + const bool is_dst_3d = dst.type == ImageType::e3D; + if (is_dst_3d) { + ASSERT(src.type == ImageType::e3D); + ASSERT(src.resources.levels == 1); + } + + std::vector<ImageCopy> copies; + copies.reserve(src.resources.levels); + for (s32 level = 0; level < src.resources.levels; ++level) { + ImageCopy& copy = copies.emplace_back(); + copy.src_subresource = SubresourceLayers{ + .base_level = level, + .base_layer = 0, + .num_layers = src.resources.layers, + }; + copy.dst_subresource = SubresourceLayers{ + .base_level = base.level + level, + .base_layer = is_dst_3d ? 0 : base.layer, + .num_layers = is_dst_3d ? 1 : src.resources.layers, + }; + copy.src_offset = Offset3D{ + .x = 0, + .y = 0, + .z = 0, + }; + copy.dst_offset = Offset3D{ + .x = 0, + .y = 0, + .z = is_dst_3d ? base.layer : 0, + }; + const Extent3D mip_size = AdjustMipSize(dst.size, base.level + level); + copy.extent = AdjustSamplesSize(mip_size, dst.num_samples); + if (is_dst_3d) { + copy.extent.depth = src.size.depth; + } + } + return copies; +} + +bool IsValidAddress(const Tegra::MemoryManager& gpu_memory, const TICEntry& config) { + if (config.Address() == 0) { + return false; + } + if (config.Address() > (u64(1) << 48)) { + return false; + } + return gpu_memory.GpuToCpuAddress(config.Address()).has_value(); +} + +std::vector<BufferImageCopy> UnswizzleImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, + const ImageInfo& info, std::span<u8> output) { + const size_t guest_size_bytes = CalculateGuestSizeInBytes(info); + const u32 bpp_log2 = BytesPerBlockLog2(info.format); + const Extent3D size = info.size; + + if (info.type == ImageType::Linear) { + gpu_memory.ReadBlockUnsafe(gpu_addr, output.data(), guest_size_bytes); + + ASSERT((info.pitch >> bpp_log2) << bpp_log2 == info.pitch); + return {{ + .buffer_offset = 0, + .buffer_size = guest_size_bytes, + .buffer_row_length = info.pitch >> bpp_log2, + .buffer_image_height = size.height, + .image_subresource = + { + .base_level = 0, + .base_layer = 0, + .num_layers = 1, + }, + .image_offset = {0, 0, 0}, + .image_extent = size, + }}; + } + const auto input_data = std::make_unique<u8[]>(guest_size_bytes); + gpu_memory.ReadBlockUnsafe(gpu_addr, input_data.get(), guest_size_bytes); + const std::span<const u8> input(input_data.get(), guest_size_bytes); + + const LevelInfo level_info = MakeLevelInfo(info); + const s32 num_layers = info.resources.layers; + const s32 num_levels = info.resources.levels; + const Extent2D tile_size = DefaultBlockSize(info.format); + const std::array level_sizes = CalculateLevelSizes(level_info, num_levels); + const Extent2D gob = GobSize(bpp_log2, 
info.block.height, info.tile_width_spacing); + const u32 layer_size = std::reduce(level_sizes.begin(), level_sizes.begin() + num_levels, 0); + const u32 layer_stride = AlignLayerSize(layer_size, size, level_info.block, tile_size.height, + info.tile_width_spacing); + size_t guest_offset = 0; + u32 host_offset = 0; + std::vector<BufferImageCopy> copies(num_levels); + + for (s32 level = 0; level < num_levels; ++level) { + const Extent3D level_size = AdjustMipSize(size, level); + const u32 num_blocks_per_layer = NumBlocks(level_size, tile_size); + const u32 host_bytes_per_layer = num_blocks_per_layer << bpp_log2; + copies[level] = BufferImageCopy{ + .buffer_offset = host_offset, + .buffer_size = static_cast<size_t>(host_bytes_per_layer) * num_layers, + .buffer_row_length = Common::AlignUp(level_size.width, tile_size.width), + .buffer_image_height = Common::AlignUp(level_size.height, tile_size.height), + .image_subresource = + { + .base_level = level, + .base_layer = 0, + .num_layers = info.resources.layers, + }, + .image_offset = {0, 0, 0}, + .image_extent = level_size, + }; + const Extent3D num_tiles = AdjustTileSize(level_size, tile_size); + const Extent3D block = AdjustMipBlockSize(num_tiles, level_info.block, level); + const u32 stride_alignment = StrideAlignment(num_tiles, info.block, gob, bpp_log2); + size_t guest_layer_offset = 0; + + for (s32 layer = 0; layer < info.resources.layers; ++layer) { + const std::span<u8> dst = output.subspan(host_offset); + const std::span<const u8> src = input.subspan(guest_offset + guest_layer_offset); + UnswizzleTexture(dst, src, 1U << bpp_log2, num_tiles.width, num_tiles.height, + num_tiles.depth, block.height, block.depth, stride_alignment); + guest_layer_offset += layer_stride; + host_offset += host_bytes_per_layer; + } + guest_offset += level_sizes[level]; + } + return copies; +} + +BufferCopy UploadBufferCopy(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, + const ImageBase& image, std::span<u8> output) { + gpu_memory.ReadBlockUnsafe(gpu_addr, output.data(), image.guest_size_bytes); + return BufferCopy{ + .src_offset = 0, + .dst_offset = 0, + .size = image.guest_size_bytes, + }; +} + +void ConvertImage(std::span<const u8> input, const ImageInfo& info, std::span<u8> output, + std::span<BufferImageCopy> copies) { + u32 output_offset = 0; + + const Extent2D tile_size = DefaultBlockSize(info.format); + for (BufferImageCopy& copy : copies) { + const u32 level = copy.image_subresource.base_level; + const Extent3D mip_size = AdjustMipSize(info.size, level); + ASSERT(copy.image_offset == Offset3D{}); + ASSERT(copy.image_subresource.base_layer == 0); + ASSERT(copy.image_extent == mip_size); + ASSERT(copy.buffer_row_length == Common::AlignUp(mip_size.width, tile_size.width)); + ASSERT(copy.buffer_image_height == Common::AlignUp(mip_size.height, tile_size.height)); + + if (IsPixelFormatASTC(info.format)) { + ASSERT(copy.image_extent.depth == 1); + Tegra::Texture::ASTC::Decompress(input.subspan(copy.buffer_offset), + copy.image_extent.width, copy.image_extent.height, + copy.image_subresource.num_layers, tile_size.width, + tile_size.height, output.subspan(output_offset)); + } else { + DecompressBC4(input.subspan(copy.buffer_offset), copy.image_extent, + output.subspan(output_offset)); + } + copy.buffer_offset = output_offset; + copy.buffer_row_length = mip_size.width; + copy.buffer_image_height = mip_size.height; + + output_offset += copy.image_extent.width * copy.image_extent.height * + copy.image_subresource.num_layers * CONVERTED_BYTES_PER_BLOCK; + } +} 
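The copies built above keep one invariant: host offsets advance by the tight linear size of each level (blocks times bytes per block), while guest offsets advance by the swizzled, GOB-padded level size. Below is a minimal standalone sketch of the host-side accumulation, assuming a plain RGBA8 image and scalar stand-ins for ImageInfo/Extent3D; it is an illustration, not code from this commit.

#include <algorithm>
#include <cstdio>

int main() {
    constexpr unsigned bytes_per_block = 4; // e.g. A8B8G8R8_UNORM, 4 bytes per texel
    constexpr unsigned width = 1024;
    constexpr unsigned height = 1024;
    unsigned host_offset = 0;
    for (unsigned level = 0; level < 4; ++level) {
        // AdjustMipSize: each mip level halves every extent, clamped to at least 1
        const unsigned w = std::max(width >> level, 1u);
        const unsigned h = std::max(height >> level, 1u);
        const unsigned level_bytes = w * h * bytes_per_block;
        std::printf("level %u: host_offset=0x%06x size=0x%06x\n", level, host_offset, level_bytes);
        host_offset += level_bytes; // tight linear packing on the host side
    }
}

The printed offsets (0x0, 0x400000, 0x500000, 0x540000) match the first A8B8G8R8_UNORM offsets asserted near the end of this file; the deepest levels diverge there because block-linear levels are padded to whole GOBs.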
+ +std::vector<BufferImageCopy> FullDownloadCopies(const ImageInfo& info) { + const Extent3D size = info.size; + const u32 bytes_per_block = BytesPerBlock(info.format); + if (info.type == ImageType::Linear) { + ASSERT(info.pitch % bytes_per_block == 0); + return {{ + .buffer_offset = 0, + .buffer_size = static_cast<size_t>(info.pitch) * size.height, + .buffer_row_length = info.pitch / bytes_per_block, + .buffer_image_height = size.height, + .image_subresource = + { + .base_level = 0, + .base_layer = 0, + .num_layers = 1, + }, + .image_offset = {0, 0, 0}, + .image_extent = size, + }}; + } + UNIMPLEMENTED_IF(info.tile_width_spacing > 0); + + const s32 num_layers = info.resources.layers; + const s32 num_levels = info.resources.levels; + const Extent2D tile_size = DefaultBlockSize(info.format); + + u32 host_offset = 0; + + std::vector<BufferImageCopy> copies(num_levels); + for (s32 level = 0; level < num_levels; ++level) { + const Extent3D level_size = AdjustMipSize(size, level); + const u32 num_blocks_per_layer = NumBlocks(level_size, tile_size); + const u32 host_bytes_per_level = num_blocks_per_layer * bytes_per_block * num_layers; + copies[level] = BufferImageCopy{ + .buffer_offset = host_offset, + .buffer_size = host_bytes_per_level, + .buffer_row_length = level_size.width, + .buffer_image_height = level_size.height, + .image_subresource = + { + .base_level = level, + .base_layer = 0, + .num_layers = info.resources.layers, + }, + .image_offset = {0, 0, 0}, + .image_extent = level_size, + }; + host_offset += host_bytes_per_level; + } + return copies; +} + +Extent3D MipSize(Extent3D size, u32 level) { + return AdjustMipSize(size, level); +} + +Extent3D MipBlockSize(const ImageInfo& info, u32 level) { + const LevelInfo level_info = MakeLevelInfo(info); + const Extent2D tile_size = DefaultBlockSize(info.format); + const Extent3D level_size = AdjustMipSize(info.size, level); + const Extent3D num_tiles = AdjustTileSize(level_size, tile_size); + return AdjustMipBlockSize(num_tiles, level_info.block, level); +} + +std::vector<SwizzleParameters> FullUploadSwizzles(const ImageInfo& info) { + const Extent2D tile_size = DefaultBlockSize(info.format); + if (info.type == ImageType::Linear) { + return std::vector{SwizzleParameters{ + .num_tiles = AdjustTileSize(info.size, tile_size), + .block = {}, + .buffer_offset = 0, + .level = 0, + }}; + } + const LevelInfo level_info = MakeLevelInfo(info); + const Extent3D size = info.size; + const s32 num_levels = info.resources.levels; + + u32 guest_offset = 0; + std::vector<SwizzleParameters> params(num_levels); + for (s32 level = 0; level < num_levels; ++level) { + const Extent3D level_size = AdjustMipSize(size, level); + const Extent3D num_tiles = AdjustTileSize(level_size, tile_size); + const Extent3D block = AdjustMipBlockSize(num_tiles, level_info.block, level); + params[level] = SwizzleParameters{ + .num_tiles = num_tiles, + .block = block, + .buffer_offset = guest_offset, + .level = level, + }; + guest_offset += CalculateLevelSize(level_info, level); + } + return params; +} + +void SwizzleImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, const ImageInfo& info, + std::span<const BufferImageCopy> copies, std::span<const u8> memory) { + const bool is_pitch_linear = info.type == ImageType::Linear; + for (const BufferImageCopy& copy : copies) { + if (is_pitch_linear) { + SwizzlePitchLinearImage(gpu_memory, gpu_addr, info, copy, memory); + } else { + SwizzleBlockLinearImage(gpu_memory, gpu_addr, info, copy, memory); + } + } +} + +bool 
IsBlockLinearSizeCompatible(const ImageInfo& lhs, const ImageInfo& rhs, u32 lhs_level, + u32 rhs_level, bool strict_size) noexcept { + ASSERT(lhs.type != ImageType::Linear); + ASSERT(rhs.type != ImageType::Linear); + if (strict_size) { + const Extent3D lhs_size = AdjustMipSize(lhs.size, lhs_level); + const Extent3D rhs_size = AdjustMipSize(rhs.size, rhs_level); + return lhs_size.width == rhs_size.width && lhs_size.height == rhs_size.height; + } else { + const Extent3D lhs_size = BlockLinearAlignedSize(lhs, lhs_level); + const Extent3D rhs_size = BlockLinearAlignedSize(rhs, rhs_level); + return lhs_size.width == rhs_size.width && lhs_size.height == rhs_size.height; + } +} + +bool IsPitchLinearSameSize(const ImageInfo& lhs, const ImageInfo& rhs, bool strict_size) noexcept { + ASSERT(lhs.type == ImageType::Linear); + ASSERT(rhs.type == ImageType::Linear); + if (strict_size) { + return lhs.size.width == rhs.size.width && lhs.size.height == rhs.size.height; + } else { + const Extent2D lhs_size = PitchLinearAlignedSize(lhs); + const Extent2D rhs_size = PitchLinearAlignedSize(rhs); + return lhs_size == rhs_size; + } +} + +std::optional<OverlapResult> ResolveOverlap(const ImageInfo& new_info, GPUVAddr gpu_addr, + VAddr cpu_addr, const ImageBase& overlap, + bool strict_size, bool broken_views) { + ASSERT(new_info.type != ImageType::Linear); + ASSERT(overlap.info.type != ImageType::Linear); + if (!IsLayerStrideCompatible(new_info, overlap.info)) { + return std::nullopt; + } + if (!IsViewCompatible(overlap.info.format, new_info.format, broken_views)) { + return std::nullopt; + } + if (gpu_addr == overlap.gpu_addr) { + const std::optional solution = ResolveOverlapEqualAddress(new_info, overlap, strict_size); + if (!solution) { + return std::nullopt; + } + return OverlapResult{ + .gpu_addr = gpu_addr, + .cpu_addr = cpu_addr, + .resources = *solution, + }; + } + if (overlap.gpu_addr > gpu_addr) { + return ResolveOverlapRightAddress(new_info, gpu_addr, cpu_addr, overlap, strict_size); + } + // if overlap.gpu_addr < gpu_addr + return ResolveOverlapLeftAddress(new_info, gpu_addr, cpu_addr, overlap, strict_size); +} + +bool IsLayerStrideCompatible(const ImageInfo& lhs, const ImageInfo& rhs) { + // If either of the layer strides is zero, we can assume they are compatible + // These images generally come from rendertargets + if (lhs.layer_stride == 0) { + return true; + } + if (rhs.layer_stride == 0) { + return true; + } + // It's definitely compatible if the layer stride matches + if (lhs.layer_stride == rhs.layer_stride) { + return true; + } + // Although we also have to compare for cases where it can be unaligned + // This can happen if the image doesn't have layers, so the stride is not aligned + if (lhs.maybe_unaligned_layer_stride == rhs.maybe_unaligned_layer_stride) { + return true; + } + return false; +} + +std::optional<SubresourceBase> FindSubresource(const ImageInfo& candidate, const ImageBase& image, + GPUVAddr candidate_addr, RelaxedOptions options, + bool broken_views) { + const std::optional<SubresourceBase> base = image.TryFindBase(candidate_addr); + if (!base) { + return std::nullopt; + } + const ImageInfo& existing = image.info; + if (False(options & RelaxedOptions::Format)) { + if (!IsViewCompatible(existing.format, candidate.format, broken_views)) { + return std::nullopt; + } + } + if (!IsLayerStrideCompatible(existing, candidate)) { + return std::nullopt; + } + if (existing.type != candidate.type) { + return std::nullopt; + } + if (False(options & RelaxedOptions::Samples)) { + if 
(existing.num_samples != candidate.num_samples) { + return std::nullopt; + } + } + if (existing.resources.levels < candidate.resources.levels + base->level) { + return std::nullopt; + } + if (existing.type == ImageType::e3D) { + const u32 mip_depth = std::max(1U, existing.size.depth >> base->level); + if (mip_depth < candidate.size.depth + base->layer) { + return std::nullopt; + } + } else { + if (existing.resources.layers < candidate.resources.layers + base->layer) { + return std::nullopt; + } + } + const bool strict_size = False(options & RelaxedOptions::Size); + if (!IsBlockLinearSizeCompatible(existing, candidate, base->level, 0, strict_size)) { + return std::nullopt; + } + // TODO: compare block sizes + return base; +} + +bool IsSubresource(const ImageInfo& candidate, const ImageBase& image, GPUVAddr candidate_addr, + RelaxedOptions options, bool broken_views) { + return FindSubresource(candidate, image, candidate_addr, options, broken_views).has_value(); +} + +void DeduceBlitImages(ImageInfo& dst_info, ImageInfo& src_info, const ImageBase* dst, + const ImageBase* src) { + if (src && GetFormatType(src->info.format) != SurfaceType::ColorTexture) { + src_info.format = src->info.format; + } + if (dst && GetFormatType(dst->info.format) != SurfaceType::ColorTexture) { + dst_info.format = dst->info.format; + } + if (!dst && src && GetFormatType(src->info.format) != SurfaceType::ColorTexture) { + dst_info.format = src->info.format; + } + if (!src && dst && GetFormatType(dst->info.format) != SurfaceType::ColorTexture) { + src_info.format = dst->info.format; + } +} + +u32 MapSizeBytes(const ImageBase& image) { + if (True(image.flags & ImageFlagBits::AcceleratedUpload)) { + return image.guest_size_bytes; + } else if (True(image.flags & ImageFlagBits::Converted)) { + return image.converted_size_bytes; + } else { + return image.unswizzled_size_bytes; + } +} + +using P = PixelFormat; + +static_assert(CalculateLevelSize(LevelInfo{{1920, 1080}, {0, 2, 0}, {1, 1}, 2, 0}, 0) == 0x7f8000); +static_assert(CalculateLevelSize(LevelInfo{{32, 32}, {0, 0, 4}, {1, 1}, 4, 0}, 0) == 0x4000); + +static_assert(CalculateLevelOffset(P::R8_SINT, {1920, 1080}, {0, 2}, 1, 0, 7) == 0x2afc00); +static_assert(CalculateLevelOffset(P::ASTC_2D_12X12_UNORM, {8192, 4096}, {0, 2}, 1, 0, 12) == + 0x50d200); + +static_assert(CalculateLevelOffset(P::A8B8G8R8_UNORM, {1024, 1024}, {0, 4}, 1, 0, 0) == 0); +static_assert(CalculateLevelOffset(P::A8B8G8R8_UNORM, {1024, 1024}, {0, 4}, 1, 0, 1) == 0x400000); +static_assert(CalculateLevelOffset(P::A8B8G8R8_UNORM, {1024, 1024}, {0, 4}, 1, 0, 2) == 0x500000); +static_assert(CalculateLevelOffset(P::A8B8G8R8_UNORM, {1024, 1024}, {0, 4}, 1, 0, 3) == 0x540000); +static_assert(CalculateLevelOffset(P::A8B8G8R8_UNORM, {1024, 1024}, {0, 4}, 1, 0, 4) == 0x550000); +static_assert(CalculateLevelOffset(P::A8B8G8R8_UNORM, {1024, 1024}, {0, 4}, 1, 0, 5) == 0x554000); +static_assert(CalculateLevelOffset(P::A8B8G8R8_UNORM, {1024, 1024}, {0, 4}, 1, 0, 6) == 0x555000); +static_assert(CalculateLevelOffset(P::A8B8G8R8_UNORM, {1024, 1024}, {0, 4}, 1, 0, 7) == 0x555400); +static_assert(CalculateLevelOffset(P::A8B8G8R8_UNORM, {1024, 1024}, {0, 4}, 1, 0, 8) == 0x555600); +static_assert(CalculateLevelOffset(P::A8B8G8R8_UNORM, {1024, 1024}, {0, 4}, 1, 0, 9) == 0x555800); + +constexpr u32 ValidateLayerSize(PixelFormat format, u32 width, u32 height, u32 block_height, + u32 tile_width_spacing, u32 level) { + const Extent3D size{width, height, 1}; + const Extent3D block{0, block_height, 0}; + const u32 offset = 
CalculateLevelOffset(format, size, block, 1, tile_width_spacing, level); + return AlignLayerSize(offset, size, block, DefaultBlockHeight(format), tile_width_spacing); +} + +static_assert(ValidateLayerSize(P::ASTC_2D_12X12_UNORM, 8192, 4096, 2, 0, 12) == 0x50d800); +static_assert(ValidateLayerSize(P::A8B8G8R8_UNORM, 1024, 1024, 2, 0, 10) == 0x556000); +static_assert(ValidateLayerSize(P::BC3_UNORM, 128, 128, 2, 0, 8) == 0x6000); + +static_assert(ValidateLayerSize(P::A8B8G8R8_UNORM, 518, 572, 4, 3, 1) == 0x190000, + "Tile width spacing is not working"); +static_assert(ValidateLayerSize(P::BC5_UNORM, 1024, 1024, 3, 4, 11) == 0x160000, + "Compressed tile width spacing is not working"); + +} // namespace VideoCommon diff --git a/src/video_core/texture_cache/util.h b/src/video_core/texture_cache/util.h new file mode 100644 index 000000000..52a9207d6 --- /dev/null +++ b/src/video_core/texture_cache/util.h @@ -0,0 +1,109 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <optional> +#include <span> + +#include "common/common_types.h" + +#include "video_core/engines/maxwell_3d.h" +#include "video_core/surface.h" +#include "video_core/texture_cache/image_base.h" +#include "video_core/texture_cache/image_view_base.h" +#include "video_core/texture_cache/types.h" +#include "video_core/textures/texture.h" + +namespace VideoCommon { + +using Tegra::Texture::TICEntry; + +struct OverlapResult { + GPUVAddr gpu_addr; + VAddr cpu_addr; + SubresourceExtent resources; +}; + +[[nodiscard]] u32 CalculateGuestSizeInBytes(const ImageInfo& info) noexcept; + +[[nodiscard]] u32 CalculateUnswizzledSizeBytes(const ImageInfo& info) noexcept; + +[[nodiscard]] u32 CalculateConvertedSizeBytes(const ImageInfo& info) noexcept; + +[[nodiscard]] u32 CalculateLayerStride(const ImageInfo& info) noexcept; + +[[nodiscard]] u32 CalculateLayerSize(const ImageInfo& info) noexcept; + +[[nodiscard]] std::array<u32, MAX_MIP_LEVELS> CalculateMipLevelOffsets( + const ImageInfo& info) noexcept; + +[[nodiscard]] std::vector<u32> CalculateSliceOffsets(const ImageInfo& info); + +[[nodiscard]] std::vector<SubresourceBase> CalculateSliceSubresources(const ImageInfo& info); + +[[nodiscard]] u32 CalculateLevelStrideAlignment(const ImageInfo& info, u32 level); + +[[nodiscard]] VideoCore::Surface::PixelFormat PixelFormatFromTIC( + const Tegra::Texture::TICEntry& config) noexcept; + +[[nodiscard]] ImageViewType RenderTargetImageViewType(const ImageInfo& info) noexcept; + +[[nodiscard]] std::vector<ImageCopy> MakeShrinkImageCopies(const ImageInfo& dst, + const ImageInfo& src, + SubresourceBase base); + +[[nodiscard]] bool IsValidAddress(const Tegra::MemoryManager& gpu_memory, const TICEntry& config); + +[[nodiscard]] std::vector<BufferImageCopy> UnswizzleImage(Tegra::MemoryManager& gpu_memory, + GPUVAddr gpu_addr, const ImageInfo& info, + std::span<u8> output); + +[[nodiscard]] BufferCopy UploadBufferCopy(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, + const ImageBase& image, std::span<u8> output); + +void ConvertImage(std::span<const u8> input, const ImageInfo& info, std::span<u8> output, + std::span<BufferImageCopy> copies); + +[[nodiscard]] std::vector<BufferImageCopy> FullDownloadCopies(const ImageInfo& info); + +[[nodiscard]] Extent3D MipSize(Extent3D size, u32 level); + +[[nodiscard]] Extent3D MipBlockSize(const ImageInfo& info, u32 level); + +[[nodiscard]] std::vector<SwizzleParameters> FullUploadSwizzles(const ImageInfo& info); 
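MipBlockSize above captures a block-linear detail that is easy to miss: an image's base block height only applies while a level is tall enough to fill it, and smaller mips collapse to smaller blocks. The following standalone sketch models that rule in one dimension, assuming the simplified form "halve the block while the whole level fits in half of it"; plain integers stand in for Extent3D and for the real AdjustMipBlockSize logic in util.cpp.

#include <algorithm>
#include <cstdio>

int main() {
    constexpr unsigned gob_lines = 8;       // GOB_SIZE_Y: texture lines per GOB
    constexpr unsigned base_block_log2 = 4; // base block of 16 GOBs (log2), as in TICEntry
    constexpr unsigned height = 1024;
    for (unsigned level = 0; level < 8; ++level) {
        const unsigned mip_lines = std::max(height >> level, 1u);
        unsigned block_log2 = base_block_log2;
        // Assumed shrink rule: halve the block while the mip fits in half of it
        while (block_log2 > 0 && mip_lines <= (gob_lines << (block_log2 - 1))) {
            --block_log2;
        }
        std::printf("level %u: block_height_log2=%u\n", level, block_log2);
    }
}

This collapse is why UnswizzleImage and FullUploadSwizzles recompute the block per level with AdjustMipBlockSize instead of reusing ImageInfo::block directly.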
+ +void SwizzleImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr, const ImageInfo& info, + std::span<const BufferImageCopy> copies, std::span<const u8> memory); + +[[nodiscard]] bool IsBlockLinearSizeCompatible(const ImageInfo& new_info, + const ImageInfo& overlap_info, u32 new_level, + u32 overlap_level, bool strict_size) noexcept; + +[[nodiscard]] bool IsPitchLinearSameSize(const ImageInfo& lhs, const ImageInfo& rhs, + bool strict_size) noexcept; + +[[nodiscard]] std::optional<OverlapResult> ResolveOverlap(const ImageInfo& new_info, + GPUVAddr gpu_addr, VAddr cpu_addr, + const ImageBase& overlap, + bool strict_size, bool broken_views); + +[[nodiscard]] bool IsLayerStrideCompatible(const ImageInfo& lhs, const ImageInfo& rhs); + +[[nodiscard]] std::optional<SubresourceBase> FindSubresource(const ImageInfo& candidate, + const ImageBase& image, + GPUVAddr candidate_addr, + RelaxedOptions options, + bool broken_views); + +[[nodiscard]] bool IsSubresource(const ImageInfo& candidate, const ImageBase& image, + GPUVAddr candidate_addr, RelaxedOptions options, + bool broken_views); + +void DeduceBlitImages(ImageInfo& dst_info, ImageInfo& src_info, const ImageBase* dst, + const ImageBase* src); + +[[nodiscard]] u32 MapSizeBytes(const ImageBase& image); + +} // namespace VideoCommon diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp index 365bde2f1..acd5bdd78 100644 --- a/src/video_core/textures/astc.cpp +++ b/src/video_core/textures/astc.cpp @@ -18,6 +18,7 @@ #include <algorithm> #include <cassert> #include <cstring> +#include <span> #include <vector> #include <boost/container/static_vector.hpp> @@ -600,7 +601,7 @@ static TexelWeightParams DecodeBlockInfo(InputBitStream& strm) { return params; } -static void FillVoidExtentLDR(InputBitStream& strm, u32* const outBuf, u32 blockWidth, +static void FillVoidExtentLDR(InputBitStream& strm, std::span<u32> outBuf, u32 blockWidth, u32 blockHeight) { // Don't actually care about the void extent, just read the bits... for (s32 i = 0; i < 4; ++i) { @@ -623,7 +624,7 @@ static void FillVoidExtentLDR(InputBitStream& strm, u32* const outBuf, u32 block } } -static void FillError(u32* outBuf, u32 blockWidth, u32 blockHeight) { +static void FillError(std::span<u32> outBuf, u32 blockWidth, u32 blockHeight) { for (u32 j = 0; j < blockHeight; j++) { for (u32 i = 0; i < blockWidth; i++) { outBuf[j * blockWidth + i] = 0xFFFF00FF; @@ -1438,9 +1439,9 @@ static void ComputeEndpos32s(Pixel& ep1, Pixel& ep2, const u32*& colorValues, #undef READ_INT_VALUES } -static void DecompressBlock(const u8 inBuf[16], const u32 blockWidth, const u32 blockHeight, - u32* outBuf) { - InputBitStream strm(inBuf); +static void DecompressBlock(std::span<const u8, 16> inBuf, const u32 blockWidth, + const u32 blockHeight, std::span<u32, 12 * 12> outBuf) { + InputBitStream strm(inBuf.data()); TexelWeightParams weightParams = DecodeBlockInfo(strm); // Was there an error? @@ -1601,8 +1602,8 @@ static void DecompressBlock(const u8 inBuf[16], const u32 blockWidth, const u32 } // Read the texel weight data.. 
- u8 texelWeightData[16]; - memcpy(texelWeightData, inBuf, sizeof(texelWeightData)); + std::array<u8, 16> texelWeightData; + std::ranges::copy(inBuf, texelWeightData.begin()); // Reverse everything for (u32 i = 0; i < 8; i++) { @@ -1618,14 +1619,15 @@ static void DecompressBlock(const u8 inBuf[16], const u32 blockWidth, const u32 // Make sure that higher non-texel bits are set to zero const u32 clearByteStart = (weightParams.GetPackedBitSize() >> 3) + 1; - texelWeightData[clearByteStart - 1] = - texelWeightData[clearByteStart - 1] & - static_cast<u8>((1 << (weightParams.GetPackedBitSize() % 8)) - 1); - memset(texelWeightData + clearByteStart, 0, 16 - clearByteStart); + if (clearByteStart > 0) { + texelWeightData[clearByteStart - 1] &= + static_cast<u8>((1 << (weightParams.GetPackedBitSize() % 8)) - 1); + } + std::memset(texelWeightData.data() + clearByteStart, 0, std::min(16U - clearByteStart, 16U)); IntegerEncodedVector texelWeightValues; - InputBitStream weightStream(texelWeightData); + InputBitStream weightStream(texelWeightData.data()); DecodeIntegerSequence(texelWeightValues, weightStream, weightParams.m_MaxWeight, weightParams.GetNumWeightValues()); @@ -1672,36 +1674,32 @@ static void DecompressBlock(const u8 inBuf[16], const u32 blockWidth, const u32 namespace Tegra::Texture::ASTC { -std::vector<u8> Decompress(const u8* data, u32 width, u32 height, u32 depth, u32 block_width, - u32 block_height) { - u32 blockIdx = 0; +void Decompress(std::span<const uint8_t> data, uint32_t width, uint32_t height, uint32_t depth, + uint32_t block_width, uint32_t block_height, std::span<uint8_t> output) { + u32 block_index = 0; std::size_t depth_offset = 0; - std::vector<u8> outData(height * width * depth * 4); - for (u32 k = 0; k < depth; k++) { - for (u32 j = 0; j < height; j += block_height) { - for (u32 i = 0; i < width; i += block_width) { - - const u8* blockPtr = data + blockIdx * 16; + for (u32 z = 0; z < depth; z++) { + for (u32 y = 0; y < height; y += block_height) { + for (u32 x = 0; x < width; x += block_width) { + const std::span<const u8, 16> blockPtr{data.subspan(block_index * 16, 16)}; // Blocks can be at most 12x12 - u32 uncompData[144]; + std::array<u32, 12 * 12> uncompData; ASTCC::DecompressBlock(blockPtr, block_width, block_height, uncompData); - u32 decompWidth = std::min(block_width, width - i); - u32 decompHeight = std::min(block_height, height - j); + u32 decompWidth = std::min(block_width, width - x); + u32 decompHeight = std::min(block_height, height - y); - u8* outRow = depth_offset + outData.data() + (j * width + i) * 4; + const std::span<u8> outRow = output.subspan(depth_offset + (y * width + x) * 4); for (u32 jj = 0; jj < decompHeight; jj++) { - memcpy(outRow + jj * width * 4, uncompData + jj * block_width, decompWidth * 4); + std::memcpy(outRow.data() + jj * width * 4, + uncompData.data() + jj * block_width, decompWidth * 4); } - - blockIdx++; + ++block_index; } } depth_offset += height * width * 4; } - - return outData; } } // namespace Tegra::Texture::ASTC diff --git a/src/video_core/textures/astc.h b/src/video_core/textures/astc.h index 991cdba72..9105119bc 100644 --- a/src/video_core/textures/astc.h +++ b/src/video_core/textures/astc.h @@ -5,11 +5,10 @@ #pragma once #include <cstdint> -#include <vector> namespace Tegra::Texture::ASTC { -std::vector<uint8_t> Decompress(const uint8_t* data, uint32_t width, uint32_t height, - uint32_t depth, uint32_t block_width, uint32_t block_height); +void Decompress(std::span<const uint8_t> data, uint32_t width, uint32_t height, 
uint32_t depth, + uint32_t block_width, uint32_t block_height, std::span<uint8_t> output); } // namespace Tegra::Texture::ASTC diff --git a/src/video_core/textures/convert.cpp b/src/video_core/textures/convert.cpp deleted file mode 100644 index 962921483..000000000 --- a/src/video_core/textures/convert.cpp +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include <algorithm> -#include <cstring> -#include <tuple> -#include <vector> - -#include "common/assert.h" -#include "common/common_types.h" -#include "common/logging/log.h" -#include "video_core/surface.h" -#include "video_core/textures/astc.h" -#include "video_core/textures/convert.h" - -namespace Tegra::Texture { - -using VideoCore::Surface::PixelFormat; - -template <bool reverse> -void SwapS8Z24ToZ24S8(u8* data, u32 width, u32 height) { - union S8Z24 { - BitField<0, 24, u32> z24; - BitField<24, 8, u32> s8; - }; - static_assert(sizeof(S8Z24) == 4, "S8Z24 is incorrect size"); - - union Z24S8 { - BitField<0, 8, u32> s8; - BitField<8, 24, u32> z24; - }; - static_assert(sizeof(Z24S8) == 4, "Z24S8 is incorrect size"); - - S8Z24 s8z24_pixel{}; - Z24S8 z24s8_pixel{}; - constexpr auto bpp{ - VideoCore::Surface::GetBytesPerPixel(VideoCore::Surface::PixelFormat::S8_UINT_D24_UNORM)}; - for (std::size_t y = 0; y < height; ++y) { - for (std::size_t x = 0; x < width; ++x) { - const std::size_t offset{bpp * (y * width + x)}; - if constexpr (reverse) { - std::memcpy(&z24s8_pixel, &data[offset], sizeof(Z24S8)); - s8z24_pixel.s8.Assign(z24s8_pixel.s8); - s8z24_pixel.z24.Assign(z24s8_pixel.z24); - std::memcpy(&data[offset], &s8z24_pixel, sizeof(S8Z24)); - } else { - std::memcpy(&s8z24_pixel, &data[offset], sizeof(S8Z24)); - z24s8_pixel.s8.Assign(s8z24_pixel.s8); - z24s8_pixel.z24.Assign(s8z24_pixel.z24); - std::memcpy(&data[offset], &z24s8_pixel, sizeof(Z24S8)); - } - } - } -} - -static void ConvertS8Z24ToZ24S8(u8* data, u32 width, u32 height) { - SwapS8Z24ToZ24S8<false>(data, width, height); -} - -static void ConvertZ24S8ToS8Z24(u8* data, u32 width, u32 height) { - SwapS8Z24ToZ24S8<true>(data, width, height); -} - -void ConvertFromGuestToHost(u8* in_data, u8* out_data, PixelFormat pixel_format, u32 width, - u32 height, u32 depth, bool convert_astc, bool convert_s8z24) { - if (convert_astc && IsPixelFormatASTC(pixel_format)) { - // Convert ASTC pixel formats to RGBA8, as most desktop GPUs do not support ASTC. 
- u32 block_width{}; - u32 block_height{}; - std::tie(block_width, block_height) = GetASTCBlockSize(pixel_format); - const std::vector<u8> rgba8_data = Tegra::Texture::ASTC::Decompress( - in_data, width, height, depth, block_width, block_height); - std::copy(rgba8_data.begin(), rgba8_data.end(), out_data); - - } else if (convert_s8z24 && pixel_format == PixelFormat::S8_UINT_D24_UNORM) { - Tegra::Texture::ConvertS8Z24ToZ24S8(in_data, width, height); - } -} - -void ConvertFromHostToGuest(u8* data, PixelFormat pixel_format, u32 width, u32 height, u32 depth, - bool convert_astc, bool convert_s8z24) { - if (convert_astc && IsPixelFormatASTC(pixel_format)) { - LOG_CRITICAL(HW_GPU, "Conversion of format {} after texture flushing is not implemented", - static_cast<u32>(pixel_format)); - UNREACHABLE(); - - } else if (convert_s8z24 && pixel_format == PixelFormat::S8_UINT_D24_UNORM) { - Tegra::Texture::ConvertZ24S8ToS8Z24(data, width, height); - } -} - -} // namespace Tegra::Texture diff --git a/src/video_core/textures/convert.h b/src/video_core/textures/convert.h deleted file mode 100644 index d5d6c77bb..000000000 --- a/src/video_core/textures/convert.h +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright 2019 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include "common/common_types.h" - -namespace VideoCore::Surface { -enum class PixelFormat; -} - -namespace Tegra::Texture { - -void ConvertFromGuestToHost(u8* in_data, u8* out_data, VideoCore::Surface::PixelFormat pixel_format, - u32 width, u32 height, u32 depth, bool convert_astc, - bool convert_s8z24); - -void ConvertFromHostToGuest(u8* data, VideoCore::Surface::PixelFormat pixel_format, u32 width, - u32 height, u32 depth, bool convert_astc, bool convert_s8z24); - -} // namespace Tegra::Texture diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp index 16d46a018..9f5181318 100644 --- a/src/video_core/textures/decoders.cpp +++ b/src/video_core/textures/decoders.cpp @@ -2,204 +2,111 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include <array> #include <cmath> #include <cstring> +#include <span> +#include <utility> + #include "common/alignment.h" #include "common/assert.h" #include "common/bit_util.h" +#include "common/div_ceil.h" #include "video_core/gpu.h" #include "video_core/textures/decoders.h" #include "video_core/textures/texture.h" namespace Tegra::Texture { -namespace { +namespace { /** - * This table represents the internal swizzle of a gob, - * in format 16 bytes x 2 sector packing. + * This table represents the internal swizzle of a gob, in format 16 bytes x 2 sector packing. * Calculates the offset of an (x, y) position within a swizzled texture. * Taken from the Tegra X1 Technical Reference Manual. 
pages 1187-1188 */ -template <std::size_t N, std::size_t M, u32 Align> -struct alignas(64) SwizzleTable { - static_assert(M * Align == 64, "Swizzle Table does not align to GOB"); - constexpr SwizzleTable() { - for (u32 y = 0; y < N; ++y) { - for (u32 x = 0; x < M; ++x) { - const u32 x2 = x * Align; - values[y][x] = static_cast<u16>(((x2 % 64) / 32) * 256 + ((y % 8) / 2) * 64 + - ((x2 % 32) / 16) * 32 + (y % 2) * 16 + (x2 % 16)); - } +constexpr SwizzleTable MakeSwizzleTableConst() { + SwizzleTable table{}; + for (u32 y = 0; y < table.size(); ++y) { + for (u32 x = 0; x < table[0].size(); ++x) { + table[y][x] = ((x % 64) / 32) * 256 + ((y % 8) / 2) * 64 + ((x % 32) / 16) * 32 + + (y % 2) * 16 + (x % 16); } } - const std::array<u16, M>& operator[](std::size_t index) const { - return values[index]; - } - std::array<std::array<u16, M>, N> values{}; -}; + return table; +} -constexpr u32 FAST_SWIZZLE_ALIGN = 16; +constexpr SwizzleTable SWIZZLE_TABLE = MakeSwizzleTableConst(); -constexpr auto LEGACY_SWIZZLE_TABLE = SwizzleTable<GOB_SIZE_X, GOB_SIZE_X, GOB_SIZE_Z>(); -constexpr auto FAST_SWIZZLE_TABLE = SwizzleTable<GOB_SIZE_Y, 4, FAST_SWIZZLE_ALIGN>(); +template <bool TO_LINEAR> +void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width, + u32 height, u32 depth, u32 block_height, u32 block_depth, u32 stride_alignment) { + // The origin of the transformation can be configured here, leave it as zero as the current API + // doesn't expose it. + static constexpr u32 origin_x = 0; + static constexpr u32 origin_y = 0; + static constexpr u32 origin_z = 0; -/** - * This function manages ALL the GOBs(Group of Bytes) Inside a single block. - * Instead of going gob by gob, we map the coordinates inside a block and manage from - * those. Block_Width is assumed to be 1. - */ -void PreciseProcessBlock(u8* const swizzled_data, u8* const unswizzled_data, const bool unswizzle, - const u32 x_start, const u32 y_start, const u32 z_start, const u32 x_end, - const u32 y_end, const u32 z_end, const u32 tile_offset, - const u32 xy_block_size, const u32 layer_z, const u32 stride_x, - const u32 bytes_per_pixel, const u32 out_bytes_per_pixel) { - std::array<u8*, 2> data_ptrs; - u32 z_address = tile_offset; - - for (u32 z = z_start; z < z_end; z++) { - u32 y_address = z_address; - u32 pixel_base = layer_z * z + y_start * stride_x; - for (u32 y = y_start; y < y_end; y++) { - const auto& table = LEGACY_SWIZZLE_TABLE[y % GOB_SIZE_Y]; - for (u32 x = x_start; x < x_end; x++) { - const u32 swizzle_offset{y_address + table[x * bytes_per_pixel % GOB_SIZE_X]}; - const u32 pixel_index{x * out_bytes_per_pixel + pixel_base}; - data_ptrs[unswizzle] = swizzled_data + swizzle_offset; - data_ptrs[!unswizzle] = unswizzled_data + pixel_index; - std::memcpy(data_ptrs[0], data_ptrs[1], bytes_per_pixel); - } - pixel_base += stride_x; - if ((y + 1) % GOB_SIZE_Y == 0) - y_address += GOB_SIZE; - } - z_address += xy_block_size; - } -} + // We can configure here a custom pitch + // As it's not exposed 'width * bpp' will be the expected pitch. + const u32 pitch = width * bytes_per_pixel; + const u32 stride = Common::AlignBits(width, stride_alignment) * bytes_per_pixel; -/** - * This function manages ALL the GOBs(Group of Bytes) Inside a single block. - * Instead of going gob by gob, we map the coordinates inside a block and manage from - * those. Block_Width is assumed to be 1. 
- */ -void FastProcessBlock(u8* const swizzled_data, u8* const unswizzled_data, const bool unswizzle, - const u32 x_start, const u32 y_start, const u32 z_start, const u32 x_end, - const u32 y_end, const u32 z_end, const u32 tile_offset, - const u32 xy_block_size, const u32 layer_z, const u32 stride_x, - const u32 bytes_per_pixel, const u32 out_bytes_per_pixel) { - std::array<u8*, 2> data_ptrs; - u32 z_address = tile_offset; - const u32 x_startb = x_start * bytes_per_pixel; - const u32 x_endb = x_end * bytes_per_pixel; - - for (u32 z = z_start; z < z_end; z++) { - u32 y_address = z_address; - u32 pixel_base = layer_z * z + y_start * stride_x; - for (u32 y = y_start; y < y_end; y++) { - const auto& table = FAST_SWIZZLE_TABLE[y % GOB_SIZE_Y]; - for (u32 xb = x_startb; xb < x_endb; xb += FAST_SWIZZLE_ALIGN) { - const u32 swizzle_offset{y_address + table[(xb / FAST_SWIZZLE_ALIGN) % 4]}; - const u32 out_x = xb * out_bytes_per_pixel / bytes_per_pixel; - const u32 pixel_index{out_x + pixel_base}; - data_ptrs[unswizzle ? 1 : 0] = swizzled_data + swizzle_offset; - data_ptrs[unswizzle ? 0 : 1] = unswizzled_data + pixel_index; - std::memcpy(data_ptrs[0], data_ptrs[1], FAST_SWIZZLE_ALIGN); - } - pixel_base += stride_x; - if ((y + 1) % GOB_SIZE_Y == 0) - y_address += GOB_SIZE; - } - z_address += xy_block_size; - } -} + const u32 gobs_in_x = Common::DivCeilLog2(stride, GOB_SIZE_X_SHIFT); + const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block_height + block_depth); + const u32 slice_size = + Common::DivCeilLog2(height, block_height + GOB_SIZE_Y_SHIFT) * block_size; -/** - * This function unswizzles or swizzles a texture by mapping Linear to BlockLinear Textue. - * The body of this function takes care of splitting the swizzled texture into blocks, - * and managing the extents of it. Once all the parameters of a single block are obtained, - * the function calls 'ProcessBlock' to process that particular Block. 
- * - * Documentation for the memory layout and decoding can be found at: - * https://envytools.readthedocs.io/en/latest/hw/memory/g80-surface.html#blocklinear-surfaces - */ -template <bool fast> -void SwizzledData(u8* const swizzled_data, u8* const unswizzled_data, const bool unswizzle, - const u32 width, const u32 height, const u32 depth, const u32 bytes_per_pixel, - const u32 out_bytes_per_pixel, const u32 block_height, const u32 block_depth, - const u32 width_spacing) { - auto div_ceil = [](const u32 x, const u32 y) { return ((x + y - 1) / y); }; - const u32 stride_x = width * out_bytes_per_pixel; - const u32 layer_z = height * stride_x; - const u32 gob_elements_x = GOB_SIZE_X / bytes_per_pixel; - constexpr u32 gob_elements_y = GOB_SIZE_Y; - constexpr u32 gob_elements_z = GOB_SIZE_Z; - const u32 block_x_elements = gob_elements_x; - const u32 block_y_elements = gob_elements_y * block_height; - const u32 block_z_elements = gob_elements_z * block_depth; - const u32 aligned_width = Common::AlignUp(width, gob_elements_x * width_spacing); - const u32 blocks_on_x = div_ceil(aligned_width, block_x_elements); - const u32 blocks_on_y = div_ceil(height, block_y_elements); - const u32 blocks_on_z = div_ceil(depth, block_z_elements); - const u32 xy_block_size = GOB_SIZE * block_height; - const u32 block_size = xy_block_size * block_depth; - u32 tile_offset = 0; - for (u32 zb = 0; zb < blocks_on_z; zb++) { - const u32 z_start = zb * block_z_elements; - const u32 z_end = std::min(depth, z_start + block_z_elements); - for (u32 yb = 0; yb < blocks_on_y; yb++) { - const u32 y_start = yb * block_y_elements; - const u32 y_end = std::min(height, y_start + block_y_elements); - for (u32 xb = 0; xb < blocks_on_x; xb++) { - const u32 x_start = xb * block_x_elements; - const u32 x_end = std::min(width, x_start + block_x_elements); - if constexpr (fast) { - FastProcessBlock(swizzled_data, unswizzled_data, unswizzle, x_start, y_start, - z_start, x_end, y_end, z_end, tile_offset, xy_block_size, - layer_z, stride_x, bytes_per_pixel, out_bytes_per_pixel); - } else { - PreciseProcessBlock(swizzled_data, unswizzled_data, unswizzle, x_start, y_start, - z_start, x_end, y_end, z_end, tile_offset, xy_block_size, - layer_z, stride_x, bytes_per_pixel, out_bytes_per_pixel); - } - tile_offset += block_size; + const u32 block_height_mask = (1U << block_height) - 1; + const u32 block_depth_mask = (1U << block_depth) - 1; + const u32 x_shift = GOB_SIZE_SHIFT + block_height + block_depth; + + for (u32 slice = 0; slice < depth; ++slice) { + const u32 z = slice + origin_z; + const u32 offset_z = (z >> block_depth) * slice_size + + ((z & block_depth_mask) << (GOB_SIZE_SHIFT + block_height)); + for (u32 line = 0; line < height; ++line) { + const u32 y = line + origin_y; + const auto& table = SWIZZLE_TABLE[y % GOB_SIZE_Y]; + + const u32 block_y = y >> GOB_SIZE_Y_SHIFT; + const u32 offset_y = (block_y >> block_height) * block_size + + ((block_y & block_height_mask) << GOB_SIZE_SHIFT); + + for (u32 column = 0; column < width; ++column) { + const u32 x = (column + origin_x) * bytes_per_pixel; + const u32 offset_x = (x >> GOB_SIZE_X_SHIFT) << x_shift; + + const u32 base_swizzled_offset = offset_z + offset_y + offset_x; + const u32 swizzled_offset = base_swizzled_offset + table[x % GOB_SIZE_X]; + + const u32 unswizzled_offset = + slice * pitch * height + line * pitch + column * bytes_per_pixel; + + u8* const dst = &output[TO_LINEAR ? swizzled_offset : unswizzled_offset]; + const u8* const src = &input[TO_LINEAR ? 
unswizzled_offset : swizzled_offset]; + std::memcpy(dst, src, bytes_per_pixel); } } } } - } // Anonymous namespace -void CopySwizzledData(u32 width, u32 height, u32 depth, u32 bytes_per_pixel, - u32 out_bytes_per_pixel, u8* const swizzled_data, u8* const unswizzled_data, - bool unswizzle, u32 block_height, u32 block_depth, u32 width_spacing) { - const u32 block_height_size{1U << block_height}; - const u32 block_depth_size{1U << block_depth}; - if (bytes_per_pixel % 3 != 0 && (width * bytes_per_pixel) % FAST_SWIZZLE_ALIGN == 0) { - SwizzledData<true>(swizzled_data, unswizzled_data, unswizzle, width, height, depth, - bytes_per_pixel, out_bytes_per_pixel, block_height_size, - block_depth_size, width_spacing); - } else { - SwizzledData<false>(swizzled_data, unswizzled_data, unswizzle, width, height, depth, - bytes_per_pixel, out_bytes_per_pixel, block_height_size, - block_depth_size, width_spacing); - } +SwizzleTable MakeSwizzleTable() { + return SWIZZLE_TABLE; } -void UnswizzleTexture(u8* const unswizzled_data, u8* address, u32 tile_size_x, u32 tile_size_y, - u32 bytes_per_pixel, u32 width, u32 height, u32 depth, u32 block_height, - u32 block_depth, u32 width_spacing) { - CopySwizzledData((width + tile_size_x - 1) / tile_size_x, - (height + tile_size_y - 1) / tile_size_y, depth, bytes_per_pixel, - bytes_per_pixel, address, unswizzled_data, true, block_height, block_depth, - width_spacing); +void UnswizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, + u32 width, u32 height, u32 depth, u32 block_height, u32 block_depth, + u32 stride_alignment) { + Swizzle<false>(output, input, bytes_per_pixel, width, height, depth, block_height, block_depth, + stride_alignment); } -std::vector<u8> UnswizzleTexture(u8* address, u32 tile_size_x, u32 tile_size_y, u32 bytes_per_pixel, - u32 width, u32 height, u32 depth, u32 block_height, - u32 block_depth, u32 width_spacing) { - std::vector<u8> unswizzled_data(width * height * depth * bytes_per_pixel); - UnswizzleTexture(unswizzled_data.data(), address, tile_size_x, tile_size_y, bytes_per_pixel, - width, height, depth, block_height, block_depth, width_spacing); - return unswizzled_data; +void SwizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width, + u32 height, u32 depth, u32 block_height, u32 block_depth, + u32 stride_alignment) { + Swizzle<true>(output, input, bytes_per_pixel, width, height, depth, block_height, block_depth, + stride_alignment); } void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width, @@ -213,7 +120,7 @@ void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 const u32 gob_address_y = (dst_y / (GOB_SIZE_Y * block_height)) * GOB_SIZE * block_height * image_width_in_gobs + ((dst_y % (GOB_SIZE_Y * block_height)) / GOB_SIZE_Y) * GOB_SIZE; - const auto& table = LEGACY_SWIZZLE_TABLE[dst_y % GOB_SIZE_Y]; + const auto& table = SWIZZLE_TABLE[dst_y % GOB_SIZE_Y]; for (u32 x = 0; x < subrect_width; ++x) { const u32 dst_x = x + offset_x; const u32 gob_address = @@ -235,11 +142,11 @@ void UnswizzleSubrect(u32 line_length_in, u32 line_count, u32 pitch, u32 width, const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block_height); const u32 block_height_mask = (1U << block_height) - 1; - const u32 x_shift = static_cast<u32>(GOB_SIZE_SHIFT) + block_height; + const u32 x_shift = GOB_SIZE_SHIFT + block_height; for (u32 line = 0; line < line_count; ++line) { const u32 src_y = line + origin_y; - const auto& table = 
LEGACY_SWIZZLE_TABLE[src_y % GOB_SIZE_Y]; + const auto& table = SWIZZLE_TABLE[src_y % GOB_SIZE_Y]; const u32 block_y = src_y >> GOB_SIZE_Y_SHIFT; const u32 src_offset_y = (block_y >> block_height) * block_size + @@ -270,7 +177,7 @@ void SwizzleSliceToVoxel(u32 line_length_in, u32 line_count, u32 pitch, u32 widt const u32 x_shift = static_cast<u32>(GOB_SIZE_SHIFT) + block_height + block_depth; for (u32 line = 0; line < line_count; ++line) { - const auto& table = LEGACY_SWIZZLE_TABLE[line % GOB_SIZE_Y]; + const auto& table = SWIZZLE_TABLE[line % GOB_SIZE_Y]; const u32 block_y = line / GOB_SIZE_Y; const u32 dst_offset_y = (block_y >> block_height) * block_size + (block_y & block_height_mask) * GOB_SIZE; @@ -293,7 +200,7 @@ void SwizzleKepler(const u32 width, const u32 height, const u32 dst_x, const u32 const std::size_t gob_address_y = (y / (GOB_SIZE_Y * block_height)) * GOB_SIZE * block_height * image_width_in_gobs + ((y % (GOB_SIZE_Y * block_height)) / GOB_SIZE_Y) * GOB_SIZE; - const auto& table = LEGACY_SWIZZLE_TABLE[y % GOB_SIZE_Y]; + const auto& table = SWIZZLE_TABLE[y % GOB_SIZE_Y]; for (std::size_t x = dst_x; x < width && count < copy_size; ++x) { const std::size_t gob_address = gob_address_y + (x / GOB_SIZE_X) * GOB_SIZE * block_height; diff --git a/src/video_core/textures/decoders.h b/src/video_core/textures/decoders.h index 01e156bc8..d7cdc81e8 100644 --- a/src/video_core/textures/decoders.h +++ b/src/video_core/textures/decoders.h @@ -4,7 +4,8 @@ #pragma once -#include <vector> +#include <span> + #include "common/common_types.h" #include "video_core/textures/texture.h" @@ -15,28 +16,25 @@ constexpr u32 GOB_SIZE_Y = 8; constexpr u32 GOB_SIZE_Z = 1; constexpr u32 GOB_SIZE = GOB_SIZE_X * GOB_SIZE_Y * GOB_SIZE_Z; -constexpr std::size_t GOB_SIZE_X_SHIFT = 6; -constexpr std::size_t GOB_SIZE_Y_SHIFT = 3; -constexpr std::size_t GOB_SIZE_Z_SHIFT = 0; -constexpr std::size_t GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHIFT; - -/// Unswizzles a swizzled texture without changing its format. -void UnswizzleTexture(u8* unswizzled_data, u8* address, u32 tile_size_x, u32 tile_size_y, - u32 bytes_per_pixel, u32 width, u32 height, u32 depth, - u32 block_height = TICEntry::DefaultBlockHeight, - u32 block_depth = TICEntry::DefaultBlockHeight, u32 width_spacing = 0); - -/// Unswizzles a swizzled texture without changing its format. -std::vector<u8> UnswizzleTexture(u8* address, u32 tile_size_x, u32 tile_size_y, u32 bytes_per_pixel, - u32 width, u32 height, u32 depth, - u32 block_height = TICEntry::DefaultBlockHeight, - u32 block_depth = TICEntry::DefaultBlockHeight, - u32 width_spacing = 0); - -/// Copies texture data from a buffer and performs swizzling/unswizzling as necessary. -void CopySwizzledData(u32 width, u32 height, u32 depth, u32 bytes_per_pixel, - u32 out_bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, - bool unswizzle, u32 block_height, u32 block_depth, u32 width_spacing); +constexpr u32 GOB_SIZE_X_SHIFT = 6; +constexpr u32 GOB_SIZE_Y_SHIFT = 3; +constexpr u32 GOB_SIZE_Z_SHIFT = 0; +constexpr u32 GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHIFT; + +using SwizzleTable = std::array<std::array<u32, GOB_SIZE_X>, GOB_SIZE_Y>; + +/// Returns a z-order swizzle table +SwizzleTable MakeSwizzleTable(); + +/// Unswizzles a block linear texture into linear memory. 
+void UnswizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, + u32 width, u32 height, u32 depth, u32 block_height, u32 block_depth, + u32 stride_alignment = 1); + +/// Swizzles linear memory into a block linear texture. +void SwizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width, + u32 height, u32 depth, u32 block_height, u32 block_depth, + u32 stride_alignment = 1); /// This function calculates the correct size of a texture depending if it's tiled or not. std::size_t CalculateSize(bool tiled, u32 bytes_per_pixel, u32 width, u32 height, u32 depth, diff --git a/src/video_core/textures/texture.cpp b/src/video_core/textures/texture.cpp index 4171e3ef2..ae5621a7d 100644 --- a/src/video_core/textures/texture.cpp +++ b/src/video_core/textures/texture.cpp @@ -5,9 +5,13 @@ #include <algorithm> #include <array> +#include "common/cityhash.h" #include "core/settings.h" #include "video_core/textures/texture.h" +using Tegra::Texture::TICEntry; +using Tegra::Texture::TSCEntry; + namespace Tegra::Texture { namespace { @@ -65,7 +69,7 @@ unsigned SettingsMinimumAnisotropy() noexcept { } // Anonymous namespace -std::array<float, 4> TSCEntry::GetBorderColor() const noexcept { +std::array<float, 4> TSCEntry::BorderColor() const noexcept { if (!srgb_conversion) { return border_color; } @@ -73,8 +77,16 @@ std::array<float, 4> TSCEntry::GetBorderColor() const noexcept { SRGB_CONVERSION_LUT[srgb_border_color_b], border_color[3]}; } -float TSCEntry::GetMaxAnisotropy() const noexcept { +float TSCEntry::MaxAnisotropy() const noexcept { return static_cast<float>(std::max(1U << max_anisotropy, SettingsMinimumAnisotropy())); } } // namespace Tegra::Texture + +size_t std::hash<TICEntry>::operator()(const TICEntry& tic) const noexcept { + return Common::CityHash64(reinterpret_cast<const char*>(&tic), sizeof tic); +} + +size_t std::hash<TSCEntry>::operator()(const TSCEntry& tsc) const noexcept { + return Common::CityHash64(reinterpret_cast<const char*>(&tsc), sizeof tsc); +} diff --git a/src/video_core/textures/texture.h b/src/video_core/textures/texture.h index 0574fef12..c1d14335e 100644 --- a/src/video_core/textures/texture.h +++ b/src/video_core/textures/texture.h @@ -53,27 +53,27 @@ enum class TextureFormat : u32 { BC4 = 0x27, BC5 = 0x28, S8D24 = 0x29, - X8Z24 = 0x2a, + X8D24 = 0x2a, D24S8 = 0x2b, - X4V4Z24__COV4R4V = 0x2c, - X4V4Z24__COV8R8V = 0x2d, - V8Z24__COV4R12V = 0x2e, + X4V4D24__COV4R4V = 0x2c, + X4V4D24__COV8R8V = 0x2d, + V8D24__COV4R12V = 0x2e, D32 = 0x2f, D32S8 = 0x30, - X8Z24_X20V4S8__COV4R4V = 0x31, - X8Z24_X20V4S8__COV8R8V = 0x32, - ZF32_X20V4X8__COV4R4V = 0x33, - ZF32_X20V4X8__COV8R8V = 0x34, - ZF32_X20V4S8__COV4R4V = 0x35, - ZF32_X20V4S8__COV8R8V = 0x36, - X8Z24_X16V8S8__COV4R12V = 0x37, - ZF32_X16V8X8__COV4R12V = 0x38, - ZF32_X16V8S8__COV4R12V = 0x39, + X8D24_X20V4S8__COV4R4V = 0x31, + X8D24_X20V4S8__COV8R8V = 0x32, + D32_X20V4X8__COV4R4V = 0x33, + D32_X20V4X8__COV8R8V = 0x34, + D32_X20V4S8__COV4R4V = 0x35, + D32_X20V4S8__COV8R8V = 0x36, + X8D24_X16V8S8__COV4R12V = 0x37, + D32_X16V8X8__COV4R12V = 0x38, + D32_X16V8S8__COV4R12V = 0x39, D16 = 0x3a, - V8Z24__COV8R24V = 0x3b, - X8Z24_X16V8S8__COV8R24V = 0x3c, - ZF32_X16V8X8__COV8R24V = 0x3d, - ZF32_X16V8S8__COV8R24V = 0x3e, + V8D24__COV8R24V = 0x3b, + X8D24_X16V8S8__COV8R24V = 0x3c, + D32_X16V8X8__COV8R24V = 0x3d, + D32_X16V8S8__COV8R24V = 0x3e, ASTC_2D_4X4 = 0x40, ASTC_2D_5X5 = 0x41, ASTC_2D_6X6 = 0x42, @@ -146,7 +146,7 @@ enum class MsaaMode : u32 { }; union TextureHandle { - 
TextureHandle(u32 raw) : raw{raw} {} + /* implicit */ constexpr TextureHandle(u32 raw_) : raw{raw_} {} u32 raw; BitField<0, 20, u32> tic_id; @@ -155,124 +155,124 @@ union TextureHandle { static_assert(sizeof(TextureHandle) == 4, "TextureHandle has wrong size"); struct TICEntry { - static constexpr u32 DefaultBlockHeight = 16; - static constexpr u32 DefaultBlockDepth = 1; - - union { - u32 raw; - BitField<0, 7, TextureFormat> format; - BitField<7, 3, ComponentType> r_type; - BitField<10, 3, ComponentType> g_type; - BitField<13, 3, ComponentType> b_type; - BitField<16, 3, ComponentType> a_type; - - BitField<19, 3, SwizzleSource> x_source; - BitField<22, 3, SwizzleSource> y_source; - BitField<25, 3, SwizzleSource> z_source; - BitField<28, 3, SwizzleSource> w_source; - }; - u32 address_low; union { - BitField<0, 16, u32> address_high; - BitField<21, 3, TICHeaderVersion> header_version; - }; - union { - BitField<0, 3, u32> block_width; - BitField<3, 3, u32> block_height; - BitField<6, 3, u32> block_depth; + struct { + union { + BitField<0, 7, TextureFormat> format; + BitField<7, 3, ComponentType> r_type; + BitField<10, 3, ComponentType> g_type; + BitField<13, 3, ComponentType> b_type; + BitField<16, 3, ComponentType> a_type; + + BitField<19, 3, SwizzleSource> x_source; + BitField<22, 3, SwizzleSource> y_source; + BitField<25, 3, SwizzleSource> z_source; + BitField<28, 3, SwizzleSource> w_source; + }; + u32 address_low; + union { + BitField<0, 16, u32> address_high; + BitField<16, 5, u32> layer_base_3_7; + BitField<21, 3, TICHeaderVersion> header_version; + BitField<24, 1, u32> load_store_hint; + BitField<25, 4, u32> view_coherency_hash; + BitField<29, 3, u32> layer_base_8_10; + }; + union { + BitField<0, 3, u32> block_width; + BitField<3, 3, u32> block_height; + BitField<6, 3, u32> block_depth; - BitField<10, 3, u32> tile_width_spacing; + BitField<10, 3, u32> tile_width_spacing; - // High 16 bits of the pitch value - BitField<0, 16, u32> pitch_high; - BitField<26, 1, u32> use_header_opt_control; - BitField<27, 1, u32> depth_texture; - BitField<28, 4, u32> max_mip_level; + // High 16 bits of the pitch value + BitField<0, 16, u32> pitch_high; + BitField<26, 1, u32> use_header_opt_control; + BitField<27, 1, u32> depth_texture; + BitField<28, 4, u32> max_mip_level; - BitField<0, 16, u32> buffer_high_width_minus_one; - }; - union { - BitField<0, 16, u32> width_minus_1; - BitField<22, 1, u32> srgb_conversion; - BitField<23, 4, TextureType> texture_type; - BitField<29, 3, u32> border_size; + BitField<0, 16, u32> buffer_high_width_minus_one; + }; + union { + BitField<0, 16, u32> width_minus_one; + BitField<16, 3, u32> layer_base_0_2; + BitField<22, 1, u32> srgb_conversion; + BitField<23, 4, TextureType> texture_type; + BitField<29, 3, u32> border_size; - BitField<0, 16, u32> buffer_low_width_minus_one; - }; - union { - BitField<0, 16, u32> height_minus_1; - BitField<16, 14, u32> depth_minus_1; - }; - union { - BitField<6, 13, u32> mip_lod_bias; - BitField<27, 3, u32> max_anisotropy; + BitField<0, 16, u32> buffer_low_width_minus_one; + }; + union { + BitField<0, 16, u32> height_minus_1; + BitField<16, 14, u32> depth_minus_1; + BitField<30, 1, u32> is_sparse; + BitField<31, 1, u32> normalized_coords; + }; + union { + BitField<6, 13, u32> mip_lod_bias; + BitField<27, 3, u32> max_anisotropy; + }; + union { + BitField<0, 4, u32> res_min_mip_level; + BitField<4, 4, u32> res_max_mip_level; + BitField<8, 4, MsaaMode> msaa_mode; + BitField<12, 12, u32> min_lod_clamp; + }; + }; + std::array<u64, 4> raw; }; - 
union { - BitField<0, 4, u32> res_min_mip_level; - BitField<4, 4, u32> res_max_mip_level; - BitField<8, 4, MsaaMode> msaa_mode; - BitField<12, 12, u32> min_lod_clamp; - }; + constexpr bool operator==(const TICEntry& rhs) const noexcept { + return raw == rhs.raw; + } - GPUVAddr Address() const { + constexpr bool operator!=(const TICEntry& rhs) const noexcept { + return raw != rhs.raw; + } + + constexpr GPUVAddr Address() const { return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | address_low); } - u32 Pitch() const { + constexpr u32 Pitch() const { ASSERT(header_version == TICHeaderVersion::Pitch || header_version == TICHeaderVersion::PitchColorKey); // The pitch value is 21 bits, and is 32B aligned. return pitch_high << 5; } - u32 Width() const { + constexpr u32 Width() const { if (header_version != TICHeaderVersion::OneDBuffer) { - return width_minus_1 + 1; + return width_minus_one + 1; } - return ((buffer_high_width_minus_one << 16) | buffer_low_width_minus_one) + 1; + return (buffer_high_width_minus_one << 16 | buffer_low_width_minus_one) + 1; } - u32 Height() const { + constexpr u32 Height() const { return height_minus_1 + 1; } - u32 Depth() const { + constexpr u32 Depth() const { return depth_minus_1 + 1; } - u32 BlockWidth() const { - ASSERT(IsTiled()); - return block_width; - } - - u32 BlockHeight() const { - ASSERT(IsTiled()); - return block_height; - } - - u32 BlockDepth() const { - ASSERT(IsTiled()); - return block_depth; + constexpr u32 BaseLayer() const { + return layer_base_0_2 | layer_base_3_7 << 3 | layer_base_8_10 << 8; } - bool IsTiled() const { + constexpr bool IsBlockLinear() const { return header_version == TICHeaderVersion::BlockLinear || header_version == TICHeaderVersion::BlockLinearColorKey; } - bool IsLineal() const { + constexpr bool IsPitchLinear() const { return header_version == TICHeaderVersion::Pitch || header_version == TICHeaderVersion::PitchColorKey; } - bool IsBuffer() const { + constexpr bool IsBuffer() const { return header_version == TICHeaderVersion::OneDBuffer; } - - bool IsSrgbConversionEnabled() const { - return srgb_conversion != 0; - } }; static_assert(sizeof(TICEntry) == 0x20, "TICEntry has wrong size"); @@ -309,6 +309,12 @@ enum class TextureMipmapFilter : u32 { Linear = 3, }; +enum class SamplerReduction : u32 { + WeightedAverage = 0, + Min = 1, + Max = 2, +}; + enum class Anisotropy { Default, Filter2x, @@ -333,8 +339,12 @@ struct TSCEntry { BitField<0, 2, TextureFilter> mag_filter; BitField<4, 2, TextureFilter> min_filter; BitField<6, 2, TextureMipmapFilter> mipmap_filter; + BitField<8, 1, u32> cubemap_anisotropy; BitField<9, 1, u32> cubemap_interface_filtering; + BitField<10, 2, SamplerReduction> reduction_filter; BitField<12, 13, u32> mip_lod_bias; + BitField<25, 1, u32> float_coord_normalization; + BitField<26, 5, u32> trilin_opt; }; union { BitField<0, 12, u32> min_lod_clamp; @@ -347,32 +357,45 @@ struct TSCEntry { }; std::array<f32, 4> border_color; }; - std::array<u8, 0x20> raw; + std::array<u64, 4> raw; }; - std::array<float, 4> GetBorderColor() const noexcept; + constexpr bool operator==(const TSCEntry& rhs) const noexcept { + return raw == rhs.raw; + } + + constexpr bool operator!=(const TSCEntry& rhs) const noexcept { + return raw != rhs.raw; + } + + std::array<float, 4> BorderColor() const noexcept; - float GetMaxAnisotropy() const noexcept; + float MaxAnisotropy() const noexcept; - float GetMinLod() const { + float MinLod() const { return static_cast<float>(min_lod_clamp) / 256.0f; } - float GetMaxLod() 
const { + float MaxLod() const { return static_cast<float>(max_lod_clamp) / 256.0f; } - float GetLodBias() const { + float LodBias() const { // Sign extend the 13-bit value. - constexpr u32 mask = 1U << (13 - 1); + static constexpr u32 mask = 1U << (13 - 1); return static_cast<float>(static_cast<s32>((mip_lod_bias ^ mask) - mask)) / 256.0f; } }; static_assert(sizeof(TSCEntry) == 0x20, "TSCEntry has wrong size"); -struct FullTextureInfo { - TICEntry tic; - TSCEntry tsc; +} // namespace Tegra::Texture + +template <> +struct std::hash<Tegra::Texture::TICEntry> { + size_t operator()(const Tegra::Texture::TICEntry& tic) const noexcept; }; -} // namespace Tegra::Texture +template <> +struct std::hash<Tegra::Texture::TSCEntry> { + size_t operator()(const Tegra::Texture::TSCEntry& tsc) const noexcept; +}; diff --git a/src/video_core/video_core.cpp b/src/video_core/video_core.cpp index a14df06a3..53444e945 100644 --- a/src/video_core/video_core.cpp +++ b/src/video_core/video_core.cpp @@ -7,13 +7,9 @@ #include "common/logging/log.h" #include "core/core.h" #include "core/settings.h" -#include "video_core/gpu_asynch.h" -#include "video_core/gpu_synch.h" #include "video_core/renderer_base.h" #include "video_core/renderer_opengl/renderer_opengl.h" -#ifdef HAS_VULKAN #include "video_core/renderer_vulkan/renderer_vulkan.h" -#endif #include "video_core/video_core.h" namespace { @@ -28,11 +24,9 @@ std::unique_ptr<VideoCore::RendererBase> CreateRenderer( case Settings::RendererBackend::OpenGL: return std::make_unique<OpenGL::RendererOpenGL>(telemetry_session, emu_window, cpu_memory, gpu, std::move(context)); -#ifdef HAS_VULKAN case Settings::RendererBackend::Vulkan: return std::make_unique<Vulkan::RendererVulkan>(telemetry_session, emu_window, cpu_memory, gpu, std::move(context)); -#endif default: return nullptr; } @@ -43,12 +37,9 @@ std::unique_ptr<VideoCore::RendererBase> CreateRenderer( namespace VideoCore { std::unique_ptr<Tegra::GPU> CreateGPU(Core::Frontend::EmuWindow& emu_window, Core::System& system) { - std::unique_ptr<Tegra::GPU> gpu; - if (Settings::values.use_asynchronous_gpu_emulation.GetValue()) { - gpu = std::make_unique<VideoCommon::GPUAsynch>(system); - } else { - gpu = std::make_unique<VideoCommon::GPUSynch>(system); - } + const bool use_nvdec = Settings::values.use_nvdec_emulation.GetValue(); + std::unique_ptr<Tegra::GPU> gpu = std::make_unique<Tegra::GPU>( + system, Settings::values.use_asynchronous_gpu_emulation.GetValue(), use_nvdec); auto context = emu_window.CreateSharedContext(); const auto scope = context->Acquire(); diff --git a/src/video_core/renderer_vulkan/nsight_aftermath_tracker.cpp b/src/video_core/vulkan_common/nsight_aftermath_tracker.cpp index 5b01020ec..8d10ac29e 100644 --- a/src/video_core/renderer_vulkan/nsight_aftermath_tracker.cpp +++ b/src/video_core/vulkan_common/nsight_aftermath_tracker.cpp @@ -32,20 +32,11 @@ namespace Vulkan { static constexpr char AFTERMATH_LIB_NAME[] = "GFSDK_Aftermath_Lib.x64.dll"; -NsightAftermathTracker::NsightAftermathTracker() = default; - -NsightAftermathTracker::~NsightAftermathTracker() { - if (initialized) { - (void)GFSDK_Aftermath_DisableGpuCrashDumps(); - } -} - -bool NsightAftermathTracker::Initialize() { +NsightAftermathTracker::NsightAftermathTracker() { if (!dl.Open(AFTERMATH_LIB_NAME)) { LOG_ERROR(Render_Vulkan, "Failed to load Nsight Aftermath DLL"); - return false; + return; } - if (!dl.GetSymbol("GFSDK_Aftermath_DisableGpuCrashDumps", &GFSDK_Aftermath_DisableGpuCrashDumps) || 
!dl.GetSymbol("GFSDK_Aftermath_EnableGpuCrashDumps", @@ -64,27 +55,28 @@ bool NsightAftermathTracker::Initialize() { LOG_ERROR(Render_Vulkan, "Failed to load Nsight Aftermath function pointers"); return false; } - dump_dir = Common::FS::GetUserPath(Common::FS::UserPath::LogDir) + "gpucrash"; - (void)Common::FS::DeleteDirRecursively(dump_dir); + void(Common::FS::DeleteDirRecursively(dump_dir)); if (!Common::FS::CreateDir(dump_dir)) { LOG_ERROR(Render_Vulkan, "Failed to create Nsight Aftermath dump directory"); - return false; + return; } - if (!GFSDK_Aftermath_SUCCEED(GFSDK_Aftermath_EnableGpuCrashDumps( GFSDK_Aftermath_Version_API, GFSDK_Aftermath_GpuCrashDumpWatchedApiFlags_Vulkan, GFSDK_Aftermath_GpuCrashDumpFeatureFlags_Default, GpuCrashDumpCallback, ShaderDebugInfoCallback, CrashDumpDescriptionCallback, this))) { LOG_ERROR(Render_Vulkan, "GFSDK_Aftermath_EnableGpuCrashDumps failed"); - return false; + return; } - LOG_INFO(Render_Vulkan, "Nsight Aftermath dump directory is \"{}\"", dump_dir); - initialized = true; - return true; +} + +NsightAftermathTracker::~NsightAftermathTracker() { + if (initialized) { + (void)GFSDK_Aftermath_DisableGpuCrashDumps(); + } } void NsightAftermathTracker::SaveShader(const std::vector<u32>& spirv) const { diff --git a/src/video_core/renderer_vulkan/nsight_aftermath_tracker.h b/src/video_core/vulkan_common/nsight_aftermath_tracker.h index afe7ae99e..cee3847fb 100644 --- a/src/video_core/renderer_vulkan/nsight_aftermath_tracker.h +++ b/src/video_core/vulkan_common/nsight_aftermath_tracker.h @@ -34,8 +34,6 @@ public: NsightAftermathTracker(NsightAftermathTracker&&) = delete; NsightAftermathTracker& operator=(NsightAftermathTracker&&) = delete; - bool Initialize(); - void SaveShader(const std::vector<u32>& spirv) const; private: @@ -78,9 +76,6 @@ private: #ifndef HAS_NSIGHT_AFTERMATH inline NsightAftermathTracker::NsightAftermathTracker() = default; inline NsightAftermathTracker::~NsightAftermathTracker() = default; -inline bool NsightAftermathTracker::Initialize() { - return false; -} inline void NsightAftermathTracker::SaveShader(const std::vector<u32>&) const {} #endif diff --git a/src/video_core/vulkan_common/vulkan_debug_callback.cpp b/src/video_core/vulkan_common/vulkan_debug_callback.cpp new file mode 100644 index 000000000..ea7af8ad4 --- /dev/null +++ b/src/video_core/vulkan_common/vulkan_debug_callback.cpp @@ -0,0 +1,45 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#include <string_view> +#include "common/logging/log.h" +#include "video_core/vulkan_common/vulkan_debug_callback.h" + +namespace Vulkan { +namespace { +VkBool32 Callback(VkDebugUtilsMessageSeverityFlagBitsEXT severity, + VkDebugUtilsMessageTypeFlagsEXT type, + const VkDebugUtilsMessengerCallbackDataEXT* data, + [[maybe_unused]] void* user_data) { + const std::string_view message{data->pMessage}; + if (severity & VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT) { + LOG_CRITICAL(Render_Vulkan, "{}", message); + } else if (severity & VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT) { + LOG_WARNING(Render_Vulkan, "{}", message); + } else if (severity & VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT) { + LOG_INFO(Render_Vulkan, "{}", message); + } else if (severity & VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT) { + LOG_DEBUG(Render_Vulkan, "{}", message); + } + return VK_FALSE; +} +} // Anonymous namespace + +vk::DebugUtilsMessenger CreateDebugCallback(const vk::Instance& instance) { + return instance.CreateDebugUtilsMessenger(VkDebugUtilsMessengerCreateInfoEXT{ + .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT, + .pNext = nullptr, + .flags = 0, + .messageSeverity = VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT | + VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT | + VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT | + VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT, + .messageType = VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT | + VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT | + VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT, + .pfnUserCallback = Callback, + }); +} + +} // namespace Vulkan diff --git a/src/video_core/vulkan_common/vulkan_debug_callback.h b/src/video_core/vulkan_common/vulkan_debug_callback.h new file mode 100644 index 000000000..2efcd244c --- /dev/null +++ b/src/video_core/vulkan_common/vulkan_debug_callback.h @@ -0,0 +1,11 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
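// Editor's usage sketch for CreateDebugCallback above; "instance" is assumed
// to be a live vk::Instance and the setting mirrors the renderer_debug flag
// used elsewhere in this diff. The returned vk::DebugUtilsMessenger is a RAII
// wrapper, so keeping it alive keeps the callback registered.
vk::DebugUtilsMessenger debug_callback;
if (Settings::values.renderer_debug) {
    debug_callback = CreateDebugCallback(instance);
}
// When "debug_callback" is destroyed (or reset), the messenger is unregistered
// before the instance goes away.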
+ +#include "video_core/vulkan_common/vulkan_wrapper.h" + +namespace Vulkan { + +vk::DebugUtilsMessenger CreateDebugCallback(const vk::Instance& instance); + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp index 05e31f1de..75173324e 100644 --- a/src/video_core/renderer_vulkan/vk_device.cpp +++ b/src/video_core/vulkan_common/vulkan_device.cpp @@ -13,8 +13,9 @@ #include "common/assert.h" #include "core/settings.h" -#include "video_core/renderer_vulkan/vk_device.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/nsight_aftermath_tracker.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { @@ -38,11 +39,15 @@ constexpr std::array Depth16UnormS8_UINT{ constexpr std::array REQUIRED_EXTENSIONS{ VK_KHR_SWAPCHAIN_EXTENSION_NAME, + VK_KHR_MAINTENANCE1_EXTENSION_NAME, + VK_KHR_STORAGE_BUFFER_STORAGE_CLASS_EXTENSION_NAME, + VK_KHR_SHADER_DRAW_PARAMETERS_EXTENSION_NAME, VK_KHR_16BIT_STORAGE_EXTENSION_NAME, VK_KHR_8BIT_STORAGE_EXTENSION_NAME, VK_KHR_DRIVER_PROPERTIES_EXTENSION_NAME, VK_KHR_DESCRIPTOR_UPDATE_TEMPLATE_EXTENSION_NAME, VK_KHR_TIMELINE_SEMAPHORE_EXTENSION_NAME, + VK_KHR_SAMPLER_MIRROR_CLAMP_TO_EDGE_EXTENSION_NAME, VK_EXT_VERTEX_ATTRIBUTE_DIVISOR_EXTENSION_NAME, VK_EXT_SHADER_SUBGROUP_BALLOT_EXTENSION_NAME, VK_EXT_SHADER_SUBGROUP_VOTE_EXTENSION_NAME, @@ -79,6 +84,21 @@ VkFormatFeatureFlags GetFormatFeatures(VkFormatProperties properties, FormatType } } +[[nodiscard]] bool IsRDNA(std::string_view device_name, VkDriverIdKHR driver_id) { + static constexpr std::array RDNA_DEVICES{ + "5700", + "5600", + "5500", + "5300", + }; + if (driver_id != VK_DRIVER_ID_AMD_PROPRIETARY_KHR) { + return false; + } + return std::any_of(RDNA_DEVICES.begin(), RDNA_DEVICES.end(), [device_name](const char* name) { + return device_name.find(name) != std::string_view::npos; + }); +} + std::unordered_map<VkFormat, VkFormatProperties> GetFormatProperties( vk::PhysicalDevice physical, const vk::InstanceDispatch& dld) { static constexpr std::array formats{ @@ -104,6 +124,7 @@ std::unordered_map<VkFormat, VkFormatProperties> GetFormatProperties( VK_FORMAT_R16G16_UNORM, VK_FORMAT_R16G16_SNORM, VK_FORMAT_R16G16_SFLOAT, + VK_FORMAT_R16G16_SINT, VK_FORMAT_R16_UNORM, VK_FORMAT_R16_UINT, VK_FORMAT_R8G8B8A8_SRGB, @@ -143,18 +164,32 @@ std::unordered_map<VkFormat, VkFormatProperties> GetFormatProperties( VK_FORMAT_BC2_SRGB_BLOCK, VK_FORMAT_BC3_SRGB_BLOCK, VK_FORMAT_BC7_SRGB_BLOCK, + VK_FORMAT_ASTC_4x4_UNORM_BLOCK, VK_FORMAT_ASTC_4x4_SRGB_BLOCK, - VK_FORMAT_ASTC_8x8_SRGB_BLOCK, - VK_FORMAT_ASTC_8x5_SRGB_BLOCK, + VK_FORMAT_ASTC_5x4_UNORM_BLOCK, VK_FORMAT_ASTC_5x4_SRGB_BLOCK, VK_FORMAT_ASTC_5x5_UNORM_BLOCK, VK_FORMAT_ASTC_5x5_SRGB_BLOCK, - VK_FORMAT_ASTC_10x8_UNORM_BLOCK, - VK_FORMAT_ASTC_10x8_SRGB_BLOCK, + VK_FORMAT_ASTC_6x5_UNORM_BLOCK, + VK_FORMAT_ASTC_6x5_SRGB_BLOCK, VK_FORMAT_ASTC_6x6_UNORM_BLOCK, VK_FORMAT_ASTC_6x6_SRGB_BLOCK, + VK_FORMAT_ASTC_8x5_UNORM_BLOCK, + VK_FORMAT_ASTC_8x5_SRGB_BLOCK, + VK_FORMAT_ASTC_8x6_UNORM_BLOCK, + VK_FORMAT_ASTC_8x6_SRGB_BLOCK, + VK_FORMAT_ASTC_8x8_UNORM_BLOCK, + VK_FORMAT_ASTC_8x8_SRGB_BLOCK, + VK_FORMAT_ASTC_10x5_UNORM_BLOCK, + VK_FORMAT_ASTC_10x5_SRGB_BLOCK, + VK_FORMAT_ASTC_10x6_UNORM_BLOCK, + VK_FORMAT_ASTC_10x6_SRGB_BLOCK, + VK_FORMAT_ASTC_10x8_UNORM_BLOCK, + VK_FORMAT_ASTC_10x8_SRGB_BLOCK, VK_FORMAT_ASTC_10x10_UNORM_BLOCK, VK_FORMAT_ASTC_10x10_SRGB_BLOCK, + VK_FORMAT_ASTC_12x10_UNORM_BLOCK, + 
VK_FORMAT_ASTC_12x10_SRGB_BLOCK, VK_FORMAT_ASTC_12x12_UNORM_BLOCK, VK_FORMAT_ASTC_12x12_SRGB_BLOCK, VK_FORMAT_ASTC_8x6_UNORM_BLOCK, @@ -172,17 +207,14 @@ std::unordered_map<VkFormat, VkFormatProperties> GetFormatProperties( } // Anonymous namespace -VKDevice::VKDevice(VkInstance instance, vk::PhysicalDevice physical, VkSurfaceKHR surface, - const vk::InstanceDispatch& dld) - : dld{dld}, physical{physical}, properties{physical.GetProperties()}, +Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR surface, + const vk::InstanceDispatch& dld_) + : instance{instance_}, dld{dld_}, physical{physical_}, properties{physical.GetProperties()}, format_properties{GetFormatProperties(physical, dld)} { + CheckSuitability(); SetupFamilies(surface); SetupFeatures(); -} - -VKDevice::~VKDevice() = default; -bool VKDevice::Create() { const auto queue_cis = GetDeviceQueueCreateInfos(); const std::vector extensions = LoadExtensions(); @@ -196,7 +228,7 @@ bool VKDevice::Create() { features2.features = { .robustBufferAccess = false, .fullDrawIndexUint32 = false, - .imageCubeArray = false, + .imageCubeArray = true, .independentBlend = true, .geometryShader = true, .tessellationShader = true, @@ -224,7 +256,7 @@ bool VKDevice::Create() { .shaderTessellationAndGeometryPointSize = false, .shaderImageGatherExtended = true, .shaderStorageImageExtendedFormats = false, - .shaderStorageImageMultisample = false, + .shaderStorageImageMultisample = true, .shaderStorageImageReadWithoutFormat = is_formatless_image_load_supported, .shaderStorageImageWriteWithoutFormat = true, .shaderUniformBufferArrayDynamicIndexing = false, @@ -250,7 +282,6 @@ bool VKDevice::Create() { .variableMultisampleRate = false, .inheritedQueries = false, }; - VkPhysicalDeviceTimelineSemaphoreFeaturesKHR timeline_semaphore{ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_FEATURES_KHR, .pNext = nullptr, @@ -362,13 +393,27 @@ bool VKDevice::Create() { LOG_INFO(Render_Vulkan, "Device doesn't support extended dynamic state"); } + VkPhysicalDeviceRobustness2FeaturesEXT robustness2; + if (ext_robustness2) { + robustness2 = { + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ROBUSTNESS_2_FEATURES_EXT, + .pNext = nullptr, + .robustBufferAccess2 = false, + .robustImageAccess2 = true, + .nullDescriptor = true, + }; + SetNext(next, robustness2); + } else { + LOG_INFO(Render_Vulkan, "Device doesn't support robustness2"); + } + if (!ext_depth_range_unrestricted) { LOG_INFO(Render_Vulkan, "Device doesn't support depth range unrestricted"); } VkDeviceDiagnosticsConfigCreateInfoNV diagnostics_nv; if (nv_device_diagnostics_config) { - nsight_aftermath_tracker.Initialize(); + nsight_aftermath_tracker = std::make_unique<NsightAftermathTracker>(); diagnostics_nv = { .sType = VK_STRUCTURE_TYPE_DEVICE_DIAGNOSTICS_CONFIG_CREATE_INFO_NV, @@ -379,20 +424,23 @@ bool VKDevice::Create() { }; first_next = &diagnostics_nv; } - logical = vk::Device::Create(physical, queue_cis, extensions, first_next, dld); - if (!logical) { - LOG_ERROR(Render_Vulkan, "Failed to create logical device"); - return false; - } CollectTelemetryParameters(); + CollectToolingInfo(); - if (ext_extended_dynamic_state && driver_id == VK_DRIVER_ID_AMD_PROPRIETARY_KHR) { - // AMD's proprietary driver supports VK_EXT_extended_dynamic_state but the <stride> field - // seems to be bugged. Blacklisting it for now. 
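// Editor's sketch of the pNext feature-chaining idea used above with
// SetNext(next, robustness2), written against raw Vulkan 1.1 calls; the
// function name and variables are illustrative.
#include <vulkan/vulkan.h>

bool QueryNullDescriptorSupport(VkPhysicalDevice physical_device) {
    VkPhysicalDeviceRobustness2FeaturesEXT robustness2{};
    robustness2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ROBUSTNESS_2_FEATURES_EXT;

    VkPhysicalDeviceFeatures2 features2{};
    features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
    features2.pNext = &robustness2; // chain the extension struct into the query

    // One call fills every structure linked through pNext.
    vkGetPhysicalDeviceFeatures2(physical_device, &features2);
    return robustness2.nullDescriptor == VK_TRUE;
}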
- LOG_WARNING(Render_Vulkan, - "Blacklisting AMD proprietary from VK_EXT_extended_dynamic_state"); + if (ext_extended_dynamic_state && driver_id == VK_DRIVER_ID_MESA_RADV) { + LOG_WARNING( + Render_Vulkan, + "Blacklisting RADV for VK_EXT_extended_dynamic_state, likely due to a bug in yuzu"); + ext_extended_dynamic_state = false; + } + if (ext_extended_dynamic_state && IsRDNA(properties.deviceName, driver_id)) { + // AMD's proprietary driver supports VK_EXT_extended_dynamic_state but on RDNA devices it + // seems to cause stability issues + LOG_WARNING( + Render_Vulkan, + "Blacklisting AMD proprietary on RDNA devices from VK_EXT_extended_dynamic_state"); + ext_extended_dynamic_state = false; + } @@ -400,11 +448,12 @@ bool VKDevice::Create() { present_queue = logical.GetQueue(present_family); use_asynchronous_shaders = Settings::values.use_asynchronous_shaders.GetValue(); - return true; } -VkFormat VKDevice::GetSupportedFormat(VkFormat wanted_format, VkFormatFeatureFlags wanted_usage, - FormatType format_type) const { +Device::~Device() = default; + +VkFormat Device::GetSupportedFormat(VkFormat wanted_format, VkFormatFeatureFlags wanted_usage, + FormatType format_type) const { if (IsFormatSupported(wanted_format, wanted_usage, format_type)) { return wanted_format; } @@ -435,18 +484,20 @@ VkFormat VKDevice::GetSupportedFormat(VkFormat wanted_format, VkFormatFeatureFla return wanted_format; } -void VKDevice::ReportLoss() const { +void Device::ReportLoss() const { LOG_CRITICAL(Render_Vulkan, "Device loss occurred!"); // Wait for the log to flush and for Nsight Aftermath to dump the results - std::this_thread::sleep_for(std::chrono::seconds{3}); + std::this_thread::sleep_for(std::chrono::seconds{15}); } -void VKDevice::SaveShader(const std::vector<u32>& spirv) const { - nsight_aftermath_tracker.SaveShader(spirv); +void Device::SaveShader(const std::vector<u32>& spirv) const { + if (nsight_aftermath_tracker) { + nsight_aftermath_tracker->SaveShader(spirv); + } } -bool VKDevice::IsOptimalAstcSupported(const VkPhysicalDeviceFeatures& features) const { +bool Device::IsOptimalAstcSupported(const VkPhysicalDeviceFeatures& features) const { // Disable for now to avoid converting ASTC twice.
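// Editor's sketch of the all-bits-required test that the ASTC loop below and
// TestDepthStencilBlits both rely on: masking with the required flags and
// comparing against the full mask, rather than testing for non-zero, rejects
// formats that support only some of the requested features. Names are
// illustrative.
#include <vulkan/vulkan.h>

bool SupportsOptimalBlit(VkPhysicalDevice physical, VkFormat format) {
    constexpr VkFormatFeatureFlags required =
        VK_FORMAT_FEATURE_BLIT_SRC_BIT | VK_FORMAT_FEATURE_BLIT_DST_BIT;
    VkFormatProperties props;
    vkGetPhysicalDeviceFormatProperties(physical, format, &props);
    return (props.optimalTilingFeatures & required) == required;
}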
static constexpr std::array astc_formats = { VK_FORMAT_ASTC_4x4_UNORM_BLOCK, VK_FORMAT_ASTC_4x4_SRGB_BLOCK, @@ -472,16 +523,26 @@ bool VKDevice::IsOptimalAstcSupported(const VkPhysicalDeviceFeatures& features) VK_FORMAT_FEATURE_BLIT_DST_BIT | VK_FORMAT_FEATURE_TRANSFER_SRC_BIT | VK_FORMAT_FEATURE_TRANSFER_DST_BIT}; for (const auto format : astc_formats) { - const auto format_properties{physical.GetFormatProperties(format)}; - if (!(format_properties.optimalTilingFeatures & format_feature_usage)) { + const auto physical_format_properties{physical.GetFormatProperties(format)}; + if ((physical_format_properties.optimalTilingFeatures & format_feature_usage) == 0) { return false; } } return true; } -bool VKDevice::IsFormatSupported(VkFormat wanted_format, VkFormatFeatureFlags wanted_usage, - FormatType format_type) const { +bool Device::TestDepthStencilBlits() const { + static constexpr VkFormatFeatureFlags required_features = + VK_FORMAT_FEATURE_BLIT_SRC_BIT | VK_FORMAT_FEATURE_BLIT_DST_BIT; + const auto test_features = [](VkFormatProperties props) { + return (props.optimalTilingFeatures & required_features) == required_features; + }; + return test_features(format_properties.at(VK_FORMAT_D32_SFLOAT_S8_UINT)) && + test_features(format_properties.at(VK_FORMAT_D24_UNORM_S8_UINT)); +} + +bool Device::IsFormatSupported(VkFormat wanted_format, VkFormatFeatureFlags wanted_usage, + FormatType format_type) const { const auto it = format_properties.find(wanted_format); if (it == format_properties.end()) { UNIMPLEMENTED_MSG("Unimplemented format query={}", wanted_format); @@ -491,65 +552,47 @@ bool VKDevice::IsFormatSupported(VkFormat wanted_format, VkFormatFeatureFlags wa return (supported_usage & wanted_usage) == wanted_usage; } -bool VKDevice::IsSuitable(vk::PhysicalDevice physical, VkSurfaceKHR surface) { - bool is_suitable = true; +void Device::CheckSuitability() const { std::bitset<REQUIRED_EXTENSIONS.size()> available_extensions; - - for (const auto& prop : physical.EnumerateDeviceExtensionProperties()) { + for (const VkExtensionProperties& property : physical.EnumerateDeviceExtensionProperties()) { for (std::size_t i = 0; i < REQUIRED_EXTENSIONS.size(); ++i) { if (available_extensions[i]) { continue; } - const std::string_view name{prop.extensionName}; + const std::string_view name{property.extensionName}; available_extensions[i] = name == REQUIRED_EXTENSIONS[i]; } } - if (!available_extensions.all()) { - for (std::size_t i = 0; i < REQUIRED_EXTENSIONS.size(); ++i) { - if (available_extensions[i]) { - continue; - } - LOG_ERROR(Render_Vulkan, "Missing required extension: {}", REQUIRED_EXTENSIONS[i]); - is_suitable = false; - } - } - - bool has_graphics{}, has_present{}; - const std::vector queue_family_properties = physical.GetQueueFamilyProperties(); - for (u32 i = 0; i < static_cast<u32>(queue_family_properties.size()); ++i) { - const auto& family = queue_family_properties[i]; - if (family.queueCount == 0) { + for (size_t i = 0; i < REQUIRED_EXTENSIONS.size(); ++i) { + if (available_extensions[i]) { continue; } - has_graphics |= family.queueFlags & VK_QUEUE_GRAPHICS_BIT; - has_present |= physical.GetSurfaceSupportKHR(i, surface); + LOG_ERROR(Render_Vulkan, "Missing required extension: {}", REQUIRED_EXTENSIONS[i]); + throw vk::Exception(VK_ERROR_EXTENSION_NOT_PRESENT); } - if (!has_graphics || !has_present) { - LOG_ERROR(Render_Vulkan, "Device lacks a graphics and present queue"); - is_suitable = false; - } - - // TODO(Rodrigo): Check if the device matches all requeriments. 
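// Editor's sketch of the table-driven validation that the new CheckSuitability
// below applies to device limits: each entry pairs the required minimum with
// the queried value, so adding a requirement is one more line in the table.
// The struct mirrors the LimitTuple used below; the rest is illustrative.
#include <array>
#include <cstdint>

struct LimitTuple {
    uint32_t minimum;
    uint32_t value;
    const char* name;
};

bool AllLimitsMet(const std::array<LimitTuple, 2>& limits) {
    for (const LimitTuple& tuple : limits) {
        if (tuple.value < tuple.minimum) {
            return false; // CheckSuitability logs the name and throws instead
        }
    }
    return true;
}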
- const auto properties{physical.GetProperties()}; - const auto& limits{properties.limits}; - - constexpr u32 required_ubo_size = 65536; - if (limits.maxUniformBufferRange < required_ubo_size) { - LOG_ERROR(Render_Vulkan, "Device UBO size {} is too small, {} is required", - limits.maxUniformBufferRange, required_ubo_size); - is_suitable = false; - } - - constexpr u32 required_num_viewports = 16; - if (limits.maxViewports < required_num_viewports) { - LOG_INFO(Render_Vulkan, "Device number of viewports {} is too small, {} is required", - limits.maxViewports, required_num_viewports); - is_suitable = false; + struct LimitTuple { + u32 minimum; + u32 value; + const char* name; + }; + const VkPhysicalDeviceLimits& limits{properties.limits}; + const std::array limits_report{ + LimitTuple{65536, limits.maxUniformBufferRange, "maxUniformBufferRange"}, + LimitTuple{16, limits.maxViewports, "maxViewports"}, + LimitTuple{8, limits.maxColorAttachments, "maxColorAttachments"}, + LimitTuple{8, limits.maxClipDistances, "maxClipDistances"}, + }; + for (const auto& tuple : limits_report) { + if (tuple.value < tuple.minimum) { + LOG_ERROR(Render_Vulkan, "{} has to be {} or greater but it is {}", tuple.name, + tuple.minimum, tuple.value); + throw vk::Exception(VK_ERROR_FEATURE_NOT_PRESENT); + } } - - const auto features{physical.GetFeatures()}; - const std::array feature_report = { + const VkPhysicalDeviceFeatures features{physical.GetFeatures()}; + const std::array feature_report{ std::make_pair(features.vertexPipelineStoresAndAtomics, "vertexPipelineStoresAndAtomics"), + std::make_pair(features.imageCubeArray, "imageCubeArray"), std::make_pair(features.independentBlend, "independentBlend"), std::make_pair(features.depthClamp, "depthClamp"), std::make_pair(features.samplerAnisotropy, "samplerAnisotropy"), @@ -561,40 +604,21 @@ bool VKDevice::IsSuitable(vk::PhysicalDevice physical, VkSurfaceKHR surface) { std::make_pair(features.occlusionQueryPrecise, "occlusionQueryPrecise"), std::make_pair(features.fragmentStoresAndAtomics, "fragmentStoresAndAtomics"), std::make_pair(features.shaderImageGatherExtended, "shaderImageGatherExtended"), + std::make_pair(features.shaderStorageImageMultisample, "shaderStorageImageMultisample"), std::make_pair(features.shaderStorageImageWriteWithoutFormat, "shaderStorageImageWriteWithoutFormat"), }; - for (const auto& [supported, name] : feature_report) { - if (supported) { + for (const auto& [is_supported, name] : feature_report) { + if (is_supported) { continue; } LOG_ERROR(Render_Vulkan, "Missing required feature: {}", name); - is_suitable = false; - } - - if (!is_suitable) { - LOG_ERROR(Render_Vulkan, "{} is not suitable", properties.deviceName); + throw vk::Exception(VK_ERROR_FEATURE_NOT_PRESENT); } - - return is_suitable; } -std::vector<const char*> VKDevice::LoadExtensions() { +std::vector<const char*> Device::LoadExtensions() { std::vector<const char*> extensions; - const auto Test = [&](const VkExtensionProperties& extension, - std::optional<std::reference_wrapper<bool>> status, const char* name, - bool push) { - if (extension.extensionName != std::string_view(name)) { - return; - } - if (push) { - extensions.push_back(name); - } - if (status) { - status->get() = true; - } - }; - extensions.reserve(7 + REQUIRED_EXTENSIONS.size()); extensions.insert(extensions.begin(), REQUIRED_EXTENSIONS.begin(), REQUIRED_EXTENSIONS.end()); @@ -603,36 +627,47 @@ std::vector<const char*> VKDevice::LoadExtensions() { bool has_ext_transform_feedback{}; bool has_ext_custom_border_color{}; 
bool has_ext_extended_dynamic_state{}; - for (const auto& extension : physical.EnumerateDeviceExtensionProperties()) { - Test(extension, nv_viewport_swizzle, VK_NV_VIEWPORT_SWIZZLE_EXTENSION_NAME, true); - Test(extension, khr_uniform_buffer_standard_layout, + bool has_ext_robustness2{}; + for (const VkExtensionProperties& extension : physical.EnumerateDeviceExtensionProperties()) { + const auto test = [&](std::optional<std::reference_wrapper<bool>> status, const char* name, + bool push) { + if (extension.extensionName != std::string_view(name)) { + return; + } + if (push) { + extensions.push_back(name); + } + if (status) { + status->get() = true; + } + }; + test(nv_viewport_swizzle, VK_NV_VIEWPORT_SWIZZLE_EXTENSION_NAME, true); + test(khr_uniform_buffer_standard_layout, VK_KHR_UNIFORM_BUFFER_STANDARD_LAYOUT_EXTENSION_NAME, true); - Test(extension, has_khr_shader_float16_int8, VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME, - false); - Test(extension, ext_depth_range_unrestricted, - VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME, true); - Test(extension, ext_index_type_uint8, VK_EXT_INDEX_TYPE_UINT8_EXTENSION_NAME, true); - Test(extension, ext_shader_viewport_index_layer, - VK_EXT_SHADER_VIEWPORT_INDEX_LAYER_EXTENSION_NAME, true); - Test(extension, has_ext_subgroup_size_control, VK_EXT_SUBGROUP_SIZE_CONTROL_EXTENSION_NAME, - false); - Test(extension, has_ext_transform_feedback, VK_EXT_TRANSFORM_FEEDBACK_EXTENSION_NAME, - false); - Test(extension, has_ext_custom_border_color, VK_EXT_CUSTOM_BORDER_COLOR_EXTENSION_NAME, - false); - Test(extension, has_ext_extended_dynamic_state, - VK_EXT_EXTENDED_DYNAMIC_STATE_EXTENSION_NAME, false); + test(has_khr_shader_float16_int8, VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME, false); + test(ext_depth_range_unrestricted, VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME, true); + test(ext_index_type_uint8, VK_EXT_INDEX_TYPE_UINT8_EXTENSION_NAME, true); + test(ext_sampler_filter_minmax, VK_EXT_SAMPLER_FILTER_MINMAX_EXTENSION_NAME, true); + test(ext_shader_viewport_index_layer, VK_EXT_SHADER_VIEWPORT_INDEX_LAYER_EXTENSION_NAME, + true); + test(ext_tooling_info, VK_EXT_TOOLING_INFO_EXTENSION_NAME, true); + test(ext_shader_stencil_export, VK_EXT_SHADER_STENCIL_EXPORT_EXTENSION_NAME, true); + test(has_ext_transform_feedback, VK_EXT_TRANSFORM_FEEDBACK_EXTENSION_NAME, false); + test(has_ext_custom_border_color, VK_EXT_CUSTOM_BORDER_COLOR_EXTENSION_NAME, false); + test(has_ext_extended_dynamic_state, VK_EXT_EXTENDED_DYNAMIC_STATE_EXTENSION_NAME, false); + test(has_ext_robustness2, VK_EXT_ROBUSTNESS_2_EXTENSION_NAME, false); + test(has_ext_subgroup_size_control, VK_EXT_SUBGROUP_SIZE_CONTROL_EXTENSION_NAME, false); if (Settings::values.renderer_debug) { - Test(extension, nv_device_diagnostics_config, - VK_NV_DEVICE_DIAGNOSTICS_CONFIG_EXTENSION_NAME, true); + test(nv_device_diagnostics_config, VK_NV_DEVICE_DIAGNOSTICS_CONFIG_EXTENSION_NAME, + true); } } VkPhysicalDeviceFeatures2KHR features; features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2_KHR; - VkPhysicalDeviceProperties2KHR properties; - properties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR; + VkPhysicalDeviceProperties2KHR physical_properties; + physical_properties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR; if (has_khr_shader_float16_int8) { VkPhysicalDeviceFloat16Int8FeaturesKHR float16_int8_features; @@ -657,8 +692,8 @@ std::vector<const char*> VKDevice::LoadExtensions() { subgroup_properties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_SIZE_CONTROL_PROPERTIES_EXT; 
subgroup_properties.pNext = nullptr; - properties.pNext = &subgroup_properties; - physical.GetProperties2KHR(properties); + physical_properties.pNext = &subgroup_properties; + physical.GetProperties2KHR(physical_properties); is_warp_potentially_bigger = subgroup_properties.maxSubgroupSize > GuestWarpSize; @@ -682,8 +717,8 @@ std::vector<const char*> VKDevice::LoadExtensions() { VkPhysicalDeviceTransformFeedbackPropertiesEXT tfb_properties; tfb_properties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TRANSFORM_FEEDBACK_PROPERTIES_EXT; tfb_properties.pNext = nullptr; - properties.pNext = &tfb_properties; - physical.GetProperties2KHR(properties); + physical_properties.pNext = &tfb_properties; + physical.GetProperties2KHR(physical_properties); if (tfb_features.transformFeedback && tfb_features.geometryStreams && tfb_properties.maxTransformFeedbackStreams >= 4 && @@ -720,51 +755,75 @@ std::vector<const char*> VKDevice::LoadExtensions() { } } + if (has_ext_robustness2) { + VkPhysicalDeviceRobustness2FeaturesEXT robustness2; + robustness2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ROBUSTNESS_2_FEATURES_EXT; + robustness2.pNext = nullptr; + features.pNext = &robustness2; + physical.GetFeatures2KHR(features); + if (robustness2.nullDescriptor && robustness2.robustImageAccess2) { + extensions.push_back(VK_EXT_ROBUSTNESS_2_EXTENSION_NAME); + ext_robustness2 = true; + } + } + return extensions; } -void VKDevice::SetupFamilies(VkSurfaceKHR surface) { - std::optional<u32> graphics_family_, present_family_; - +void Device::SetupFamilies(VkSurfaceKHR surface) { const std::vector queue_family_properties = physical.GetQueueFamilyProperties(); - for (u32 i = 0; i < static_cast<u32>(queue_family_properties.size()); ++i) { - if (graphics_family_ && present_family_) + std::optional<u32> graphics; + std::optional<u32> present; + for (u32 index = 0; index < static_cast<u32>(queue_family_properties.size()); ++index) { + if (graphics && (present || !surface)) { break; - - const auto& queue_family = queue_family_properties[i]; - if (queue_family.queueCount == 0) + } + const VkQueueFamilyProperties& queue_family = queue_family_properties[index]; + if (queue_family.queueCount == 0) { continue; - + } if (queue_family.queueFlags & VK_QUEUE_GRAPHICS_BIT) { - graphics_family_ = i; + graphics = index; } - if (physical.GetSurfaceSupportKHR(i, surface)) { - present_family_ = i; + if (surface && physical.GetSurfaceSupportKHR(index, surface)) { + present = index; } } - ASSERT(graphics_family_ && present_family_); - - graphics_family = *graphics_family_; - present_family = *present_family_; + if (!graphics) { + LOG_ERROR(Render_Vulkan, "Device lacks a graphics queue"); + throw vk::Exception(VK_ERROR_FEATURE_NOT_PRESENT); + } + if (surface && !present) { + LOG_ERROR(Render_Vulkan, "Device lacks a present queue"); + throw vk::Exception(VK_ERROR_FEATURE_NOT_PRESENT); + } + graphics_family = *graphics; + present_family = *present; } -void VKDevice::SetupFeatures() { +void Device::SetupFeatures() { const auto supported_features{physical.GetFeatures()}; is_formatless_image_load_supported = supported_features.shaderStorageImageReadWithoutFormat; + is_blit_depth_stencil_supported = TestDepthStencilBlits(); is_optimal_astc_supported = IsOptimalAstcSupported(supported_features); } -void VKDevice::CollectTelemetryParameters() { +void Device::CollectTelemetryParameters() { VkPhysicalDeviceDriverPropertiesKHR driver{ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRIVER_PROPERTIES_KHR, .pNext = nullptr, + .driverID = {}, + .driverName = {}, + 
.driverInfo = {}, + .conformanceVersion = {}, }; - VkPhysicalDeviceProperties2KHR properties{ + VkPhysicalDeviceProperties2KHR device_properties{ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR, .pNext = &driver, + .properties = {}, }; - physical.GetProperties2KHR(properties); + physical.GetProperties2KHR(device_properties); driver_id = driver.driverID; vendor_name = driver.driverName; @@ -776,7 +835,33 @@ void VKDevice::CollectTelemetryParameters() { } } -std::vector<VkDeviceQueueCreateInfo> VKDevice::GetDeviceQueueCreateInfos() const { +void Device::CollectToolingInfo() { + if (!ext_tooling_info) { + return; + } + const auto vkGetPhysicalDeviceToolPropertiesEXT = + reinterpret_cast<PFN_vkGetPhysicalDeviceToolPropertiesEXT>( + dld.vkGetInstanceProcAddr(instance, "vkGetPhysicalDeviceToolPropertiesEXT")); + if (!vkGetPhysicalDeviceToolPropertiesEXT) { + return; + } + u32 tool_count = 0; + if (vkGetPhysicalDeviceToolPropertiesEXT(physical, &tool_count, nullptr) != VK_SUCCESS) { + return; + } + std::vector<VkPhysicalDeviceToolPropertiesEXT> tools(tool_count); + if (vkGetPhysicalDeviceToolPropertiesEXT(physical, &tool_count, tools.data()) != VK_SUCCESS) { + return; + } + for (const VkPhysicalDeviceToolPropertiesEXT& tool : tools) { + const std::string_view name = tool.name; + LOG_INFO(Render_Vulkan, "{}", name); + has_renderdoc = has_renderdoc || name == "RenderDoc"; + has_nsight_graphics = has_nsight_graphics || name == "NVIDIA Nsight Graphics"; + } +} + +std::vector<VkDeviceQueueCreateInfo> Device::GetDeviceQueueCreateInfos() const { static constexpr float QUEUE_PRIORITY = 1.0f; std::unordered_set<u32> unique_queue_families{graphics_family, present_family}; diff --git a/src/video_core/renderer_vulkan/vk_device.h b/src/video_core/vulkan_common/vulkan_device.h index 26a233db1..a973c3ce4 100644 --- a/src/video_core/renderer_vulkan/vk_device.h +++ b/src/video_core/vulkan_common/vulkan_device.h @@ -10,11 +10,12 @@ #include <vector> #include "common/common_types.h" -#include "video_core/renderer_vulkan/nsight_aftermath_tracker.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { +class NsightAftermathTracker; + /// Format usage descriptor. enum class FormatType { Linear, Optimal, Buffer }; @@ -22,14 +23,11 @@ enum class FormatType { Linear, Optimal, Buffer }; const u32 GuestWarpSize = 32; /// Handles data specific to a physical device. -class VKDevice final { +class Device final { public: - explicit VKDevice(VkInstance instance, vk::PhysicalDevice physical, VkSurfaceKHR surface, - const vk::InstanceDispatch& dld); - ~VKDevice(); - - /// Initializes the device. Returns true on success. - bool Create(); + explicit Device(VkInstance instance, vk::PhysicalDevice physical, VkSurfaceKHR surface, + const vk::InstanceDispatch& dld); + ~Device(); /** * Returns a format supported by the device for the passed requeriments. @@ -83,7 +81,7 @@ public: } /// Returns the current Vulkan API version provided in Vulkan-formatted version numbers. - u32 GetApiVersion() const { + u32 ApiVersion() const { return properties.apiVersion; } @@ -152,6 +150,11 @@ public: return is_formatless_image_load_supported; } + /// Returns true when blitting from and to depth stencil images is supported. + bool IsBlitDepthStencilSupported() const { + return is_blit_depth_stencil_supported; + } + /// Returns true if the device supports VK_NV_viewport_swizzle. 
bool IsNvViewportSwizzleSupported() const { return nv_viewport_swizzle; @@ -167,6 +170,11 @@ public: return ext_index_type_uint8; } + /// Returns true if the device supports VK_EXT_sampler_filter_minmax. + bool IsExtSamplerFilterMinmaxSupported() const { + return ext_sampler_filter_minmax; + } + /// Returns true if the device supports VK_EXT_depth_range_unrestricted. bool IsExtDepthRangeUnrestrictedSupported() const { return ext_depth_range_unrestricted; @@ -192,6 +200,16 @@ public: return ext_extended_dynamic_state; } + /// Returns true if the device supports VK_EXT_shader_stencil_export. + bool IsExtShaderStencilExportSupported() const { + return ext_shader_stencil_export; + } + + /// Returns true when a known debugging tool is attached. + bool HasDebuggingToolAttached() const { + return has_renderdoc || has_nsight_graphics; + } + /// Returns the vendor name reported from Vulkan. std::string_view GetVendorName() const { return vendor_name; @@ -207,10 +225,10 @@ public: return use_asynchronous_shaders; } +private: /// Checks if the physical device is suitable. - static bool IsSuitable(vk::PhysicalDevice physical, VkSurfaceKHR surface); + void CheckSuitability() const; -private: /// Loads extensions into a vector and stores available ones in this object. std::vector<const char*> LoadExtensions(); @@ -223,22 +241,30 @@ private: /// Collects telemetry information from the device. void CollectTelemetryParameters(); + /// Collects information about attached tools. + void CollectToolingInfo(); + /// Returns a list of queue initialization descriptors. std::vector<VkDeviceQueueCreateInfo> GetDeviceQueueCreateInfos() const; /// Returns true if ASTC textures are natively supported. bool IsOptimalAstcSupported(const VkPhysicalDeviceFeatures& features) const; + /// Returns true if the device natively supports blitting depth stencil images. + bool TestDepthStencilBlits() const; + /// Returns true if a format is supported. bool IsFormatSupported(VkFormat wanted_format, VkFormatFeatureFlags wanted_usage, FormatType format_type) const; + VkInstance instance; ///< Vulkan instance. vk::DeviceDispatch dld; ///< Device function pointers. vk::PhysicalDevice physical; ///< Physical device. VkPhysicalDeviceProperties properties; ///< Device properties. vk::Device logical; ///< Logical device. vk::Queue graphics_queue; ///< Main graphics queue. vk::Queue present_queue; ///< Main present queue. + u32 instance_version{}; ///< Vulkan instance version. u32 graphics_family{}; ///< Main graphics queue family index. u32 present_family{}; ///< Main present queue family index. VkDriverIdKHR driver_id{}; ///< Driver ID. @@ -247,15 +273,22 @@ private: bool is_float16_supported{}; ///< Support for float16 arithmetics. bool is_warp_potentially_bigger{}; ///< Host warp size can be bigger than guest. bool is_formatless_image_load_supported{}; ///< Support for shader image read without format. + bool is_blit_depth_stencil_supported{}; ///< Support for blitting from and to depth stencil. bool nv_viewport_swizzle{}; ///< Support for VK_NV_viewport_swizzle. bool khr_uniform_buffer_standard_layout{}; ///< Support for std430 on UBOs. bool ext_index_type_uint8{}; ///< Support for VK_EXT_index_type_uint8. + bool ext_sampler_filter_minmax{}; ///< Support for VK_EXT_sampler_filter_minmax. bool ext_depth_range_unrestricted{}; ///< Support for VK_EXT_depth_range_unrestricted. bool ext_shader_viewport_index_layer{}; ///< Support for VK_EXT_shader_viewport_index_layer. + bool ext_tooling_info{}; ///< Support for VK_EXT_tooling_info.
bool ext_transform_feedback{}; ///< Support for VK_EXT_transform_feedback. bool ext_custom_border_color{}; ///< Support for VK_EXT_custom_border_color. bool ext_extended_dynamic_state{}; ///< Support for VK_EXT_extended_dynamic_state. + bool ext_robustness2{}; ///< Support for VK_EXT_robustness2. + bool ext_shader_stencil_export{}; ///< Support for VK_EXT_shader_stencil_export. bool nv_device_diagnostics_config{}; ///< Support for VK_NV_device_diagnostics_config. + bool has_renderdoc{}; ///< Has RenderDoc attached + bool has_nsight_graphics{}; ///< Has Nsight Graphics attached // Asynchronous Graphics Pipeline setting bool use_asynchronous_shaders{}; ///< Setting to use asynchronous shaders/graphics pipeline @@ -268,7 +301,7 @@ private: std::unordered_map<VkFormat, VkFormatProperties> format_properties; /// Nsight Aftermath GPU crash tracker - NsightAftermathTracker nsight_aftermath_tracker; + std::unique_ptr<NsightAftermathTracker> nsight_aftermath_tracker; }; } // namespace Vulkan diff --git a/src/video_core/vulkan_common/vulkan_instance.cpp b/src/video_core/vulkan_common/vulkan_instance.cpp new file mode 100644 index 000000000..889ecda0c --- /dev/null +++ b/src/video_core/vulkan_common/vulkan_instance.cpp @@ -0,0 +1,151 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <algorithm> +#include <optional> +#include <span> +#include <utility> +#include <vector> + +#include "common/common_types.h" +#include "common/dynamic_library.h" +#include "common/logging/log.h" +#include "core/frontend/emu_window.h" +#include "video_core/vulkan_common/vulkan_instance.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" + +// Include these late to avoid polluting previous headers +#ifdef _WIN32 +#include <windows.h> +// ensure include order +#include <vulkan/vulkan_win32.h> +#endif + +#if !defined(_WIN32) && !defined(__APPLE__) +#include <X11/Xlib.h> +#include <vulkan/vulkan_wayland.h> +#include <vulkan/vulkan_xlib.h> +#endif + +namespace Vulkan { +namespace { +[[nodiscard]] std::vector<const char*> RequiredExtensions( + Core::Frontend::WindowSystemType window_type, bool enable_debug_utils) { + std::vector<const char*> extensions; + extensions.reserve(6); + switch (window_type) { + case Core::Frontend::WindowSystemType::Headless: + break; +#ifdef _WIN32 + case Core::Frontend::WindowSystemType::Windows: + extensions.push_back(VK_KHR_WIN32_SURFACE_EXTENSION_NAME); + break; +#endif +#if !defined(_WIN32) && !defined(__APPLE__) + case Core::Frontend::WindowSystemType::X11: + extensions.push_back(VK_KHR_XLIB_SURFACE_EXTENSION_NAME); + break; + case Core::Frontend::WindowSystemType::Wayland: + extensions.push_back(VK_KHR_WAYLAND_SURFACE_EXTENSION_NAME); + break; +#endif + default: + LOG_ERROR(Render_Vulkan, "Presentation not supported on this platform"); + break; + } + if (window_type != Core::Frontend::WindowSystemType::Headless) { + extensions.push_back(VK_KHR_SURFACE_EXTENSION_NAME); + } + if (enable_debug_utils) { + extensions.push_back(VK_EXT_DEBUG_UTILS_EXTENSION_NAME); + } + extensions.push_back(VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME); + return extensions; +} + +[[nodiscard]] bool AreExtensionsSupported(const vk::InstanceDispatch& dld, + std::span<const char* const> extensions) { + const std::optional properties = vk::EnumerateInstanceExtensionProperties(dld); + if (!properties) { + LOG_ERROR(Render_Vulkan, "Failed to query extension properties"); + return false; + } + for (const char* 
extension : extensions) { + const auto it = std::ranges::find_if(*properties, [extension](const auto& prop) { + return std::strcmp(extension, prop.extensionName) == 0; + }); + if (it == properties->end()) { + LOG_ERROR(Render_Vulkan, "Required instance extension {} is not available", extension); + return false; + } + } + return true; +} + +[[nodiscard]] std::vector<const char*> Layers(bool enable_layers) { + std::vector<const char*> layers; + if (enable_layers) { + layers.push_back("VK_LAYER_KHRONOS_validation"); + } + return layers; +} + +void RemoveUnavailableLayers(const vk::InstanceDispatch& dld, std::vector<const char*>& layers) { + const std::optional layer_properties = vk::EnumerateInstanceLayerProperties(dld); + if (!layer_properties) { + LOG_ERROR(Render_Vulkan, "Failed to query layer properties, disabling layers"); + layers.clear(); + } + std::erase_if(layers, [&layer_properties](const char* layer) { + const auto comp = [layer](const VkLayerProperties& layer_property) { + return std::strcmp(layer, layer_property.layerName) == 0; + }; + const auto it = std::ranges::find_if(*layer_properties, comp); + if (it == layer_properties->end()) { + LOG_ERROR(Render_Vulkan, "Layer {} not available, removing it", layer); + return true; + } + return false; + }); +} +} // Anonymous namespace + +vk::Instance CreateInstance(const Common::DynamicLibrary& library, vk::InstanceDispatch& dld, + u32 required_version, Core::Frontend::WindowSystemType window_type, + bool enable_debug_utils, bool enable_layers) { + if (!library.IsOpen()) { + LOG_ERROR(Render_Vulkan, "Vulkan library not available"); + throw vk::Exception(VK_ERROR_INITIALIZATION_FAILED); + } + if (!library.GetSymbol("vkGetInstanceProcAddr", &dld.vkGetInstanceProcAddr)) { + LOG_ERROR(Render_Vulkan, "vkGetInstanceProcAddr not present in Vulkan"); + throw vk::Exception(VK_ERROR_INITIALIZATION_FAILED); + } + if (!vk::Load(dld)) { + LOG_ERROR(Render_Vulkan, "Failed to load Vulkan function pointers"); + throw vk::Exception(VK_ERROR_INITIALIZATION_FAILED); + } + const std::vector<const char*> extensions = RequiredExtensions(window_type, enable_debug_utils); + if (!AreExtensionsSupported(dld, extensions)) { + throw vk::Exception(VK_ERROR_EXTENSION_NOT_PRESENT); + } + std::vector<const char*> layers = Layers(enable_layers); + RemoveUnavailableLayers(dld, layers); + + const u32 available_version = vk::AvailableVersion(dld); + if (available_version < required_version) { + LOG_ERROR(Render_Vulkan, "Vulkan {}.{} is not supported, {}.{} is required", + VK_VERSION_MAJOR(available_version), VK_VERSION_MINOR(available_version), + VK_VERSION_MAJOR(required_version), VK_VERSION_MINOR(required_version)); + throw vk::Exception(VK_ERROR_INCOMPATIBLE_DRIVER); + } + vk::Instance instance = vk::Instance::Create(required_version, layers, extensions, dld); + if (!vk::Load(*instance, dld)) { + LOG_ERROR(Render_Vulkan, "Failed to load Vulkan instance function pointers"); + throw vk::Exception(VK_ERROR_INITIALIZATION_FAILED); + } + return instance; +} + +} // namespace Vulkan diff --git a/src/video_core/vulkan_common/vulkan_instance.h b/src/video_core/vulkan_common/vulkan_instance.h new file mode 100644 index 000000000..e5e3a7144 --- /dev/null +++ b/src/video_core/vulkan_common/vulkan_instance.h @@ -0,0 +1,32 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
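// Editor's sketch of the std::erase_if filtering performed by
// RemoveUnavailableLayers above, written against raw Vulkan calls so it is
// self-contained; error handling is omitted and names are illustrative.
#include <algorithm>
#include <cstring>
#include <vector>
#include <vulkan/vulkan.h>

void KeepOnlyAvailableLayers(std::vector<const char*>& layers) {
    uint32_t count = 0;
    vkEnumerateInstanceLayerProperties(&count, nullptr);
    std::vector<VkLayerProperties> available(count);
    vkEnumerateInstanceLayerProperties(&count, available.data());
    // Drop every requested layer that the loader does not report.
    std::erase_if(layers, [&available](const char* layer) {
        return std::none_of(available.begin(), available.end(),
                            [layer](const VkLayerProperties& prop) {
                                return std::strcmp(layer, prop.layerName) == 0;
                            });
    });
}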
+ +#pragma once + +#include "common/common_types.h" +#include "common/dynamic_library.h" +#include "core/frontend/emu_window.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" + +namespace Vulkan { + +/** + * Create a Vulkan instance + * + * @param library Dynamic library to load the Vulkan instance from + * @param dld Dispatch table to load function pointers into + * @param required_version Required Vulkan version (for example, VK_API_VERSION_1_1) + * @param window_type Window system type's enabled extension + * @param enable_debug_utils Whether to enable VK_EXT_debug_utils_extension_name or not + * @param enable_layers Whether to enable Vulkan validation layers or not + * + * @return A new Vulkan instance + * @throw vk::Exception on failure + */ +[[nodiscard]] vk::Instance CreateInstance( + const Common::DynamicLibrary& library, vk::InstanceDispatch& dld, u32 required_version, + Core::Frontend::WindowSystemType window_type = Core::Frontend::WindowSystemType::Headless, + bool enable_debug_utils = false, bool enable_layers = false); + +} // namespace Vulkan diff --git a/src/video_core/vulkan_common/vulkan_library.cpp b/src/video_core/vulkan_common/vulkan_library.cpp new file mode 100644 index 000000000..557871d81 --- /dev/null +++ b/src/video_core/vulkan_common/vulkan_library.cpp @@ -0,0 +1,36 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <cstdlib> +#include <string> + +#include "common/dynamic_library.h" +#include "common/file_util.h" +#include "video_core/vulkan_common/vulkan_library.h" + +namespace Vulkan { + +Common::DynamicLibrary OpenLibrary() { + Common::DynamicLibrary library; +#ifdef __APPLE__ + // Check if a path to a specific Vulkan library has been specified. + char* const libvulkan_env = std::getenv("LIBVULKAN_PATH"); + if (!libvulkan_env || !library.Open(libvulkan_env)) { + // Use the libvulkan.dylib from the application bundle. + const std::string filename = + Common::FS::GetBundleDirectory() + "/Contents/Frameworks/libvulkan.dylib"; + void(library.Open(filename.c_str())); + } +#else + std::string filename = Common::DynamicLibrary::GetVersionedFilename("vulkan", 1); + if (!library.Open(filename.c_str())) { + // Android devices may not have libvulkan.so.1, only libvulkan.so. + filename = Common::DynamicLibrary::GetVersionedFilename("vulkan"); + void(library.Open(filename.c_str())); + } +#endif + return library; +} + +} // namespace Vulkan diff --git a/src/video_core/vulkan_common/vulkan_library.h b/src/video_core/vulkan_common/vulkan_library.h new file mode 100644 index 000000000..8b28b0e17 --- /dev/null +++ b/src/video_core/vulkan_common/vulkan_library.h @@ -0,0 +1,13 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include "common/dynamic_library.h" + +namespace Vulkan { + +Common::DynamicLibrary OpenLibrary(); + +} // namespace Vulkan diff --git a/src/video_core/vulkan_common/vulkan_surface.cpp b/src/video_core/vulkan_common/vulkan_surface.cpp new file mode 100644 index 000000000..3c3238f96 --- /dev/null +++ b/src/video_core/vulkan_common/vulkan_surface.cpp @@ -0,0 +1,81 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
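// Hypothetical end-to-end usage of the helpers declared above (OpenLibrary
// plus CreateInstance); the window type and both flags depend on the frontend,
// so the values here are placeholders rather than the commit's defaults.
Common::DynamicLibrary library = OpenLibrary();
vk::InstanceDispatch dld;
vk::Instance instance =
    CreateInstance(library, dld, VK_API_VERSION_1_1,
                   Core::Frontend::WindowSystemType::Headless,
                   /*enable_debug_utils=*/true, /*enable_layers=*/true);
// CreateInstance throws vk::Exception on failure instead of returning a null
// handle, so "instance" is guaranteed to be valid past this point.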
+ +#include "common/logging/log.h" +#include "core/frontend/emu_window.h" +#include "video_core/vulkan_common/vulkan_surface.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" + +// Include these late to avoid polluting previous headers +#ifdef _WIN32 +#include <windows.h> +// ensure include order +#include <vulkan/vulkan_win32.h> +#endif + +#if !defined(_WIN32) && !defined(__APPLE__) +#include <X11/Xlib.h> +#include <vulkan/vulkan_wayland.h> +#include <vulkan/vulkan_xlib.h> +#endif + +namespace Vulkan { + +vk::SurfaceKHR CreateSurface(const vk::Instance& instance, + const Core::Frontend::EmuWindow& emu_window) { + [[maybe_unused]] const vk::InstanceDispatch& dld = instance.Dispatch(); + [[maybe_unused]] const auto& window_info = emu_window.GetWindowInfo(); + VkSurfaceKHR unsafe_surface = nullptr; + +#ifdef _WIN32 + if (window_info.type == Core::Frontend::WindowSystemType::Windows) { + const HWND hWnd = static_cast<HWND>(window_info.render_surface); + const VkWin32SurfaceCreateInfoKHR win32_ci{VK_STRUCTURE_TYPE_WIN32_SURFACE_CREATE_INFO_KHR, + nullptr, 0, nullptr, hWnd}; + const auto vkCreateWin32SurfaceKHR = reinterpret_cast<PFN_vkCreateWin32SurfaceKHR>( + dld.vkGetInstanceProcAddr(*instance, "vkCreateWin32SurfaceKHR")); + if (!vkCreateWin32SurfaceKHR || + vkCreateWin32SurfaceKHR(*instance, &win32_ci, nullptr, &unsafe_surface) != VK_SUCCESS) { + LOG_ERROR(Render_Vulkan, "Failed to initialize Win32 surface"); + throw vk::Exception(VK_ERROR_INITIALIZATION_FAILED); + } + } +#endif +#if !defined(_WIN32) && !defined(__APPLE__) + if (window_info.type == Core::Frontend::WindowSystemType::X11) { + const VkXlibSurfaceCreateInfoKHR xlib_ci{ + VK_STRUCTURE_TYPE_XLIB_SURFACE_CREATE_INFO_KHR, nullptr, 0, + static_cast<Display*>(window_info.display_connection), + reinterpret_cast<Window>(window_info.render_surface)}; + const auto vkCreateXlibSurfaceKHR = reinterpret_cast<PFN_vkCreateXlibSurfaceKHR>( + dld.vkGetInstanceProcAddr(*instance, "vkCreateXlibSurfaceKHR")); + if (!vkCreateXlibSurfaceKHR || + vkCreateXlibSurfaceKHR(*instance, &xlib_ci, nullptr, &unsafe_surface) != VK_SUCCESS) { + LOG_ERROR(Render_Vulkan, "Failed to initialize Xlib surface"); + throw vk::Exception(VK_ERROR_INITIALIZATION_FAILED); + } + } + if (window_info.type == Core::Frontend::WindowSystemType::Wayland) { + const VkWaylandSurfaceCreateInfoKHR wayland_ci{ + VK_STRUCTURE_TYPE_WAYLAND_SURFACE_CREATE_INFO_KHR, nullptr, 0, + static_cast<wl_display*>(window_info.display_connection), + static_cast<wl_surface*>(window_info.render_surface)}; + const auto vkCreateWaylandSurfaceKHR = reinterpret_cast<PFN_vkCreateWaylandSurfaceKHR>( + dld.vkGetInstanceProcAddr(*instance, "vkCreateWaylandSurfaceKHR")); + if (!vkCreateWaylandSurfaceKHR || + vkCreateWaylandSurfaceKHR(*instance, &wayland_ci, nullptr, &unsafe_surface) != + VK_SUCCESS) { + LOG_ERROR(Render_Vulkan, "Failed to initialize Wayland surface"); + throw vk::Exception(VK_ERROR_INITIALIZATION_FAILED); + } + } +#endif + if (!unsafe_surface) { + LOG_ERROR(Render_Vulkan, "Presentation not supported on this platform"); + throw vk::Exception(VK_ERROR_INITIALIZATION_FAILED); + } + return vk::SurfaceKHR(unsafe_surface, *instance, dld); +} + +} // namespace Vulkan diff --git a/src/video_core/vulkan_common/vulkan_surface.h b/src/video_core/vulkan_common/vulkan_surface.h new file mode 100644 index 000000000..05a169e32 --- /dev/null +++ b/src/video_core/vulkan_common/vulkan_surface.h @@ -0,0 +1,18 @@ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// 
Refer to the license.txt file included. + +#pragma once + +#include "video_core/vulkan_common/vulkan_wrapper.h" + +namespace Core::Frontend { +class EmuWindow; +} + +namespace Vulkan { + +[[nodiscard]] vk::SurfaceKHR CreateSurface(const vk::Instance& instance, + const Core::Frontend::EmuWindow& emu_window); + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/wrapper.cpp b/src/video_core/vulkan_common/vulkan_wrapper.cpp index 1fb14e190..5e15ad607 100644 --- a/src/video_core/renderer_vulkan/wrapper.cpp +++ b/src/video_core/vulkan_common/vulkan_wrapper.cpp @@ -6,32 +6,55 @@ #include <exception> #include <memory> #include <optional> +#include <string_view> #include <utility> #include <vector> #include "common/common_types.h" +#include "common/logging/log.h" -#include "video_core/renderer_vulkan/wrapper.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan::vk { namespace { +template <typename Func> +void SortPhysicalDevices(std::vector<VkPhysicalDevice>& devices, const InstanceDispatch& dld, + Func&& func) { + // Calling GetProperties calls Vulkan more than needed. But they are supposed to be cheap + // functions. + std::stable_sort(devices.begin(), devices.end(), + [&dld, &func](VkPhysicalDevice lhs, VkPhysicalDevice rhs) { + return func(vk::PhysicalDevice(lhs, dld).GetProperties(), + vk::PhysicalDevice(rhs, dld).GetProperties()); + }); +} + +void SortPhysicalDevicesPerVendor(std::vector<VkPhysicalDevice>& devices, + const InstanceDispatch& dld, + std::initializer_list<u32> vendor_ids) { + for (auto it = vendor_ids.end(); it != vendor_ids.begin();) { + --it; + SortPhysicalDevices(devices, dld, [id = *it](const auto& lhs, const auto& rhs) { + return lhs.vendorID == id && rhs.vendorID != id; + }); + } +} + void SortPhysicalDevices(std::vector<VkPhysicalDevice>& devices, const InstanceDispatch& dld) { - std::stable_sort(devices.begin(), devices.end(), [&](auto lhs, auto rhs) { - // This will call Vulkan more than needed, but these calls are cheap. - const auto lhs_properties = vk::PhysicalDevice(lhs, dld).GetProperties(); - const auto rhs_properties = vk::PhysicalDevice(rhs, dld).GetProperties(); - - // Prefer discrete GPUs, Nvidia over AMD, AMD over Intel, Intel over the rest. - const bool preferred = - (lhs_properties.deviceType == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU && - rhs_properties.deviceType != VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU) || - (lhs_properties.vendorID == 0x10DE && rhs_properties.vendorID != 0x10DE) || - (lhs_properties.vendorID == 0x1002 && rhs_properties.vendorID != 0x1002) || - (lhs_properties.vendorID == 0x8086 && rhs_properties.vendorID != 0x8086); - return !preferred; + // Sort by name, this will set a base and make GPUs with higher numbers appear first + // (e.g. GTX 1650 will intentionally be listed before a GTX 1080). + SortPhysicalDevices(devices, dld, [](const auto& lhs, const auto& rhs) { + return std::string_view{lhs.deviceName} > std::string_view{rhs.deviceName}; + }); + // Prefer discrete over non-discrete + SortPhysicalDevices(devices, dld, [](const auto& lhs, const auto& rhs) { + return lhs.deviceType == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU && + rhs.deviceType != VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU; }); + // Prefer Nvidia over AMD, AMD over Intel, Intel over the rest. 
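// Editor's note on the multi-pass trick used here and by the vendor pass
// below: std::stable_sort preserves the relative order of equivalent
// elements, so sorting by the least significant key first and the most
// significant key last yields a combined lexicographic ordering. A miniature
// illustration with made-up types:
#include <algorithm>
#include <string>
#include <vector>

struct GpuEntry {
    std::string name;
    bool discrete;
};

void SortLikeAbove(std::vector<GpuEntry>& gpus) {
    // Pass 1: the tie-breaker (name, descending).
    std::stable_sort(gpus.begin(), gpus.end(),
                     [](const GpuEntry& l, const GpuEntry& r) { return l.name > r.name; });
    // Pass 2: the dominant key; ties keep their pass-1 order.
    std::stable_sort(gpus.begin(), gpus.end(),
                     [](const GpuEntry& l, const GpuEntry& r) { return l.discrete && !r.discrete; });
}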
+ SortPhysicalDevicesPerVendor(devices, dld, {0x10DE, 0x1002, 0x8086}); } template <typename T> @@ -58,6 +81,7 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept { X(vkCmdBeginQuery); X(vkCmdBeginRenderPass); X(vkCmdBeginTransformFeedbackEXT); + X(vkCmdBeginDebugUtilsLabelEXT); X(vkCmdBindDescriptorSets); X(vkCmdBindIndexBuffer); X(vkCmdBindPipeline); @@ -75,6 +99,7 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept { X(vkCmdEndQuery); X(vkCmdEndRenderPass); X(vkCmdEndTransformFeedbackEXT); + X(vkCmdEndDebugUtilsLabelEXT); X(vkCmdFillBuffer); X(vkCmdPipelineBarrier); X(vkCmdPushConstants); @@ -98,6 +123,7 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept { X(vkCmdSetPrimitiveTopologyEXT); X(vkCmdSetStencilOpEXT); X(vkCmdSetStencilTestEnableEXT); + X(vkCmdResolveImage); X(vkCreateBuffer); X(vkCreateBufferView); X(vkCreateCommandPool); @@ -153,6 +179,8 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept { X(vkQueueSubmit); X(vkResetFences); X(vkResetQueryPoolEXT); + X(vkSetDebugUtilsObjectNameEXT); + X(vkSetDebugUtilsObjectTagEXT); X(vkUnmapMemory); X(vkUpdateDescriptorSetWithTemplateKHR); X(vkUpdateDescriptorSets); @@ -161,6 +189,19 @@ #undef X } +template <typename T> +void SetObjectName(const DeviceDispatch* dld, VkDevice device, T handle, VkObjectType type, + const char* name) { + const VkDebugUtilsObjectNameInfoEXT name_info{ + .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_OBJECT_NAME_INFO_EXT, + .pNext = nullptr, + .objectType = type, + .objectHandle = reinterpret_cast<u64>(handle), + .pObjectName = name, + }; + Check(dld->vkSetDebugUtilsObjectNameEXT(device, &name_info)); +} + } // Anonymous namespace bool Load(InstanceDispatch& dld) noexcept { @@ -393,18 +434,17 @@ VkResult Free(VkDevice device, VkCommandPool handle, Span<VkCommandBuffer> buffe return VK_SUCCESS; } -Instance Instance::Create(Span<const char*> layers, Span<const char*> extensions, - InstanceDispatch& dld) noexcept { - static constexpr VkApplicationInfo application_info{ +Instance Instance::Create(u32 version, Span<const char*> layers, Span<const char*> extensions, + InstanceDispatch& dispatch) { + const VkApplicationInfo application_info{ .sType = VK_STRUCTURE_TYPE_APPLICATION_INFO, .pNext = nullptr, .pApplicationName = "yuzu Emulator", .applicationVersion = VK_MAKE_VERSION(0, 1, 0), .pEngineName = "yuzu Emulator", .engineVersion = VK_MAKE_VERSION(0, 1, 0), - .apiVersion = VK_API_VERSION_1_1, + .apiVersion = version, }; - const VkInstanceCreateInfo ci{ .sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO, .pNext = nullptr, @@ -415,66 +455,68 @@ Instance Instance::Create(Span<const char*> layers, Span<const char*> extensions .enabledExtensionCount = extensions.size(), .ppEnabledExtensionNames = extensions.data(), }; - VkInstance instance; - if (dld.vkCreateInstance(&ci, nullptr, &instance) != VK_SUCCESS) { - // Failed to create the instance. - return {}; - } - if (!Proc(dld.vkDestroyInstance, dld, "vkDestroyInstance", instance)) { + Check(dispatch.vkCreateInstance(&ci, nullptr, &instance)); + if (!Proc(dispatch.vkDestroyInstance, dispatch, "vkDestroyInstance", instance)) { // We successfully created an instance but the destroy function couldn't be loaded. // This is a good moment to panic.
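// Editor's sketch of the Vulkan "call twice" enumeration idiom used by
// EnumeratePhysicalDevices below and by CollectToolingInfo earlier: the first
// call reports only the element count, the second fills a buffer sized from
// it. The function name is illustrative.
#include <vector>
#include <vulkan/vulkan.h>

std::vector<VkPhysicalDevice> ListPhysicalDevices(VkInstance instance) {
    uint32_t count = 0;
    if (vkEnumeratePhysicalDevices(instance, &count, nullptr) != VK_SUCCESS) {
        return {};
    }
    std::vector<VkPhysicalDevice> devices(count);
    if (vkEnumeratePhysicalDevices(instance, &count, devices.data()) != VK_SUCCESS) {
        return {};
    }
    return devices;
}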
- return {}; + throw vk::Exception(VK_ERROR_INITIALIZATION_FAILED); } - - return Instance(instance, dld); + return Instance(instance, dispatch); } -std::optional<std::vector<VkPhysicalDevice>> Instance::EnumeratePhysicalDevices() { +std::vector<VkPhysicalDevice> Instance::EnumeratePhysicalDevices() const { u32 num; - if (dld->vkEnumeratePhysicalDevices(handle, &num, nullptr) != VK_SUCCESS) { - return std::nullopt; - } + Check(dld->vkEnumeratePhysicalDevices(handle, &num, nullptr)); std::vector<VkPhysicalDevice> physical_devices(num); - if (dld->vkEnumeratePhysicalDevices(handle, &num, physical_devices.data()) != VK_SUCCESS) { - return std::nullopt; - } + Check(dld->vkEnumeratePhysicalDevices(handle, &num, physical_devices.data())); SortPhysicalDevices(physical_devices, *dld); - return std::make_optional(std::move(physical_devices)); + return physical_devices; } -DebugCallback Instance::TryCreateDebugCallback( - PFN_vkDebugUtilsMessengerCallbackEXT callback) noexcept { - const VkDebugUtilsMessengerCreateInfoEXT ci{ - .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT, - .pNext = nullptr, - .flags = 0, - .messageSeverity = VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT | - VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT | - VK_DEBUG_UTILS_MESSAGE_SEVERITY_INFO_BIT_EXT | - VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT, - .messageType = VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT | - VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT | - VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT, - .pfnUserCallback = callback, - .pUserData = nullptr, - }; - - VkDebugUtilsMessengerEXT messenger; - if (dld->vkCreateDebugUtilsMessengerEXT(handle, &ci, nullptr, &messenger) != VK_SUCCESS) { - return {}; - } - return DebugCallback(messenger, handle, *dld); +DebugUtilsMessenger Instance::CreateDebugUtilsMessenger( + const VkDebugUtilsMessengerCreateInfoEXT& create_info) const { + VkDebugUtilsMessengerEXT object; + Check(dld->vkCreateDebugUtilsMessengerEXT(handle, &create_info, nullptr, &object)); + return DebugUtilsMessenger(object, handle, *dld); } void Buffer::BindMemory(VkDeviceMemory memory, VkDeviceSize offset) const { Check(dld->vkBindBufferMemory(owner, handle, memory, offset)); } +void Buffer::SetObjectNameEXT(const char* name) const { + SetObjectName(dld, owner, handle, VK_OBJECT_TYPE_BUFFER, name); +} + +void BufferView::SetObjectNameEXT(const char* name) const { + SetObjectName(dld, owner, handle, VK_OBJECT_TYPE_BUFFER_VIEW, name); +} + void Image::BindMemory(VkDeviceMemory memory, VkDeviceSize offset) const { Check(dld->vkBindImageMemory(owner, handle, memory, offset)); } +void Image::SetObjectNameEXT(const char* name) const { + SetObjectName(dld, owner, handle, VK_OBJECT_TYPE_IMAGE, name); +} + +void ImageView::SetObjectNameEXT(const char* name) const { + SetObjectName(dld, owner, handle, VK_OBJECT_TYPE_IMAGE_VIEW, name); +} + +void DeviceMemory::SetObjectNameEXT(const char* name) const { + SetObjectName(dld, owner, handle, VK_OBJECT_TYPE_DEVICE_MEMORY, name); +} + +void Fence::SetObjectNameEXT(const char* name) const { + SetObjectName(dld, owner, handle, VK_OBJECT_TYPE_FENCE, name); +} + +void Framebuffer::SetObjectNameEXT(const char* name) const { + SetObjectName(dld, owner, handle, VK_OBJECT_TYPE_FRAMEBUFFER, name); +} + DescriptorSets DescriptorPool::Allocate(const VkDescriptorSetAllocateInfo& ai) const { const std::size_t num = ai.descriptorSetCount; std::unique_ptr sets = std::make_unique<VkDescriptorSet[]>(num); @@ -488,6 +530,10 @@ DescriptorSets DescriptorPool::Allocate(const 
+void DescriptorPool::SetObjectNameEXT(const char* name) const { + SetObjectName(dld, owner, handle, VK_OBJECT_TYPE_DESCRIPTOR_POOL, name); +} + CommandBuffers CommandPool::Allocate(std::size_t num_buffers, VkCommandBufferLevel level) const { const VkCommandBufferAllocateInfo ai{ .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO, @@ -508,6 +554,10 @@ CommandBuffers CommandPool::Allocate(std::size_t num_buffers, VkCommandBufferLev } } +void CommandPool::SetObjectNameEXT(const char* name) const { + SetObjectName(dld, owner, handle, VK_OBJECT_TYPE_COMMAND_POOL, name); +} + std::vector<VkImage> SwapchainKHR::GetImages() const { u32 num; Check(dld->vkGetSwapchainImagesKHR(owner, handle, &num, nullptr)); @@ -516,9 +566,21 @@ std::vector<VkImage> SwapchainKHR::GetImages() const { return images; } +void Event::SetObjectNameEXT(const char* name) const { + SetObjectName(dld, owner, handle, VK_OBJECT_TYPE_EVENT, name); +} + +void ShaderModule::SetObjectNameEXT(const char* name) const { + SetObjectName(dld, owner, handle, VK_OBJECT_TYPE_SHADER_MODULE, name); +} + +void Semaphore::SetObjectNameEXT(const char* name) const { + SetObjectName(dld, owner, handle, VK_OBJECT_TYPE_SEMAPHORE, name); +} + Device Device::Create(VkPhysicalDevice physical_device, Span<VkDeviceQueueCreateInfo> queues_ci, Span<const char*> enabled_extensions, const void* next, - DeviceDispatch& dld) noexcept { + DeviceDispatch& dispatch) { const VkDeviceCreateInfo ci{ .sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO, .pNext = next, @@ -531,13 +593,10 @@ Device Device::Create(VkPhysicalDevice physical_device, Span<VkDeviceQueueCreate .ppEnabledExtensionNames = enabled_extensions.data(), .pEnabledFeatures = nullptr, }; VkDevice device; - if (dld.vkCreateDevice(physical_device, &ci, nullptr, &device) != VK_SUCCESS) { - return {}; - } - Load(device, dld); - return Device(device, dld); + Check(dispatch.vkCreateDevice(physical_device, &ci, nullptr, &device)); + Load(device, dispatch); + return Device(device, dispatch); } Queue Device::GetQueue(u32 family_index) const noexcept { @@ -796,6 +855,21 @@ VkPhysicalDeviceMemoryProperties PhysicalDevice::GetMemoryProperties() const noe return properties; } +u32 AvailableVersion(const InstanceDispatch& dld) noexcept { + PFN_vkEnumerateInstanceVersion vkEnumerateInstanceVersion; + if (!Proc(vkEnumerateInstanceVersion, dld, "vkEnumerateInstanceVersion")) { + // If the procedure is not found, Vulkan 1.0 is assumed + return VK_API_VERSION_1_0; + } + u32 version; + if (const VkResult result = vkEnumerateInstanceVersion(&version); result != VK_SUCCESS) { + LOG_ERROR(Render_Vulkan, "vkEnumerateInstanceVersion returned {}, assuming Vulkan 1.1", + ToString(result)); + return VK_API_VERSION_1_1; + } + return version; +} + std::optional<std::vector<VkExtensionProperties>> EnumerateInstanceExtensionProperties( const InstanceDispatch& dld) { u32 num; @@ -807,7 +881,7 @@ std::optional<std::vector<VkExtensionProperties>> EnumerateInstanceExtensionProp VK_SUCCESS) { return std::nullopt; } - return std::move(properties); + return properties; } std::optional<std::vector<VkLayerProperties>> EnumerateInstanceLayerProperties( diff --git a/src/video_core/renderer_vulkan/wrapper.h b/src/video_core/vulkan_common/vulkan_wrapper.h index 234e01693..912cab46c 100644 --- a/src/video_core/renderer_vulkan/wrapper.h +++ b/src/video_core/vulkan_common/vulkan_wrapper.h @@ -9,6 +9,7 @@ #include <limits> #include <memory> #include <optional> +#include <span> #include <type_traits> #include <utility>
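AvailableVersion() exists because vkEnumerateInstanceVersion was only added with Vulkan 1.1: on a pure 1.0 loader the symbol cannot be resolved, which is the first fallback above. A sketch of how it pairs with the versioned Instance::Create(); the extension array is a placeholder, and vk::Load(dld) is assumed to have populated the instance-independent entry points beforehand:

vk::InstanceDispatch dld;
// dld.vkGetInstanceProcAddr assigned and vk::Load(dld) called beforehand (assumed)
const u32 version = vk::AvailableVersion(dld);
static constexpr const char* extensions[] = {VK_EXT_DEBUG_UTILS_EXTENSION_NAME};
vk::Instance instance = vk::Instance::Create(version, {}, extensions, dld);

Failure is now reported as a thrown vk::Exception rather than an empty handle, so the old "operator bool" checks at every call site are no longer needed.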
#include <vector> @@ -18,6 +19,10 @@ #include "common/common_types.h" +#ifdef _MSC_VER +#pragma warning(disable : 26812) // Disable prefer enum class over enum +#endif + namespace Vulkan::vk { /** @@ -41,6 +46,9 @@ public: /// Construct an empty span. constexpr Span() noexcept = default; + /// Construct an empty span from a null pointer. + constexpr Span(std::nullptr_t) noexcept {} + /// Construct a span from a single element. constexpr Span(const T& value) noexcept : ptr{&value}, num{1} {} @@ -52,7 +60,7 @@ public: /// Construct a span from a pointer and a size. /// This is intended for subranges. - constexpr Span(const T* ptr, std::size_t num) noexcept : ptr{ptr}, num{num} {} + constexpr Span(const T* ptr_, std::size_t num_) noexcept : ptr{ptr_}, num{num_} {} /// Returns the data pointer of the span. constexpr const T* data() const noexcept { @@ -177,6 +185,7 @@ struct DeviceDispatch : public InstanceDispatch { PFN_vkCmdBeginQuery vkCmdBeginQuery; PFN_vkCmdBeginRenderPass vkCmdBeginRenderPass; PFN_vkCmdBeginTransformFeedbackEXT vkCmdBeginTransformFeedbackEXT; + PFN_vkCmdBeginDebugUtilsLabelEXT vkCmdBeginDebugUtilsLabelEXT; PFN_vkCmdBindDescriptorSets vkCmdBindDescriptorSets; PFN_vkCmdBindIndexBuffer vkCmdBindIndexBuffer; PFN_vkCmdBindPipeline vkCmdBindPipeline; @@ -194,6 +203,7 @@ struct DeviceDispatch : public InstanceDispatch { PFN_vkCmdEndQuery vkCmdEndQuery; PFN_vkCmdEndRenderPass vkCmdEndRenderPass; PFN_vkCmdEndTransformFeedbackEXT vkCmdEndTransformFeedbackEXT; + PFN_vkCmdEndDebugUtilsLabelEXT vkCmdEndDebugUtilsLabelEXT; PFN_vkCmdFillBuffer vkCmdFillBuffer; PFN_vkCmdPipelineBarrier vkCmdPipelineBarrier; PFN_vkCmdPushConstants vkCmdPushConstants; @@ -217,6 +227,7 @@ struct DeviceDispatch : public InstanceDispatch { PFN_vkCmdSetPrimitiveTopologyEXT vkCmdSetPrimitiveTopologyEXT; PFN_vkCmdSetStencilOpEXT vkCmdSetStencilOpEXT; PFN_vkCmdSetStencilTestEnableEXT vkCmdSetStencilTestEnableEXT; + PFN_vkCmdResolveImage vkCmdResolveImage; PFN_vkCreateBuffer vkCreateBuffer; PFN_vkCreateBufferView vkCreateBufferView; PFN_vkCreateCommandPool vkCreateCommandPool; @@ -272,6 +283,8 @@ struct DeviceDispatch : public InstanceDispatch { PFN_vkQueueSubmit vkQueueSubmit; PFN_vkResetFences vkResetFences; PFN_vkResetQueryPoolEXT vkResetQueryPoolEXT; + PFN_vkSetDebugUtilsObjectNameEXT vkSetDebugUtilsObjectNameEXT; + PFN_vkSetDebugUtilsObjectTagEXT vkSetDebugUtilsObjectTagEXT; PFN_vkUnmapMemory vkUnmapMemory; PFN_vkUpdateDescriptorSetWithTemplateKHR vkUpdateDescriptorSetWithTemplateKHR; PFN_vkUpdateDescriptorSets vkUpdateDescriptorSets; @@ -469,9 +482,10 @@ public: PoolAllocations() = default; /// Construct an allocation. Errors are reported through IsOutOfPoolMemory(). - explicit PoolAllocations(std::unique_ptr<AllocationType[]> allocations, std::size_t num, - VkDevice device, PoolType pool, const DeviceDispatch& dld) noexcept - : allocations{std::move(allocations)}, num{num}, device{device}, pool{pool}, dld{&dld} {} + explicit PoolAllocations(std::unique_ptr<AllocationType[]> allocations_, std::size_t num_, + VkDevice device_, PoolType pool_, const DeviceDispatch& dld_) noexcept + : allocations{std::move(allocations_)}, num{num_}, device{device_}, pool{pool_}, + dld{&dld_} {} /// Copying Vulkan allocations is not supported and will never be.
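What the Span constructors above buy at call sites, in one short illustration; Fill() is a hypothetical consumer, not part of the wrapper:

void Fill(vk::Span<VkClearValue> values); // hypothetical consumer

VkClearValue one{};
VkClearValue many[4]{};
Fill(one);     // Span(const T&): views a single element, num == 1
Fill(many);    // range constructor (unchanged by this diff): num == 4
Fill(nullptr); // new std::nullptr_t constructor: empty span
Fill({});      // default constructor: also an empty span

The std::nullptr_t overload mirrors how the C Vulkan API treats null pointers as "no entries", so callers no longer have to spell out an empty Span explicitly.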
PoolAllocations(const PoolAllocations&) = delete; @@ -541,18 +555,14 @@ private: const DeviceDispatch* dld = nullptr; }; -using BufferView = Handle<VkBufferView, VkDevice, DeviceDispatch>; -using DebugCallback = Handle<VkDebugUtilsMessengerEXT, VkInstance, InstanceDispatch>; +using DebugUtilsMessenger = Handle<VkDebugUtilsMessengerEXT, VkInstance, InstanceDispatch>; using DescriptorSetLayout = Handle<VkDescriptorSetLayout, VkDevice, DeviceDispatch>; using DescriptorUpdateTemplateKHR = Handle<VkDescriptorUpdateTemplateKHR, VkDevice, DeviceDispatch>; -using Framebuffer = Handle<VkFramebuffer, VkDevice, DeviceDispatch>; -using ImageView = Handle<VkImageView, VkDevice, DeviceDispatch>; using Pipeline = Handle<VkPipeline, VkDevice, DeviceDispatch>; using PipelineLayout = Handle<VkPipelineLayout, VkDevice, DeviceDispatch>; using QueryPool = Handle<VkQueryPool, VkDevice, DeviceDispatch>; using RenderPass = Handle<VkRenderPass, VkDevice, DeviceDispatch>; using Sampler = Handle<VkSampler, VkDevice, DeviceDispatch>; -using ShaderModule = Handle<VkShaderModule, VkDevice, DeviceDispatch>; using SurfaceKHR = Handle<VkSurfaceKHR, VkInstance, InstanceDispatch>; using DescriptorSets = PoolAllocations<VkDescriptorSet, VkDescriptorPool>; @@ -563,16 +573,25 @@ class Instance : public Handle<VkInstance, NoOwner, InstanceDispatch> { using Handle<VkInstance, NoOwner, InstanceDispatch>::Handle; public: - /// Creates a Vulkan instance. Use "operator bool" for error handling. - static Instance Create(Span<const char*> layers, Span<const char*> extensions, - InstanceDispatch& dld) noexcept; + /// Creates a Vulkan instance. + /// @throw Exception on initialization error. + static Instance Create(u32 version, Span<const char*> layers, Span<const char*> extensions, + InstanceDispatch& dispatch); /// Enumerates physical devices. - /// @return Physical devices and an empty handle on failure. - std::optional<std::vector<VkPhysicalDevice>> EnumeratePhysicalDevices(); + /// @return Physical devices. + /// @throw Exception on Vulkan error. + std::vector<VkPhysicalDevice> EnumeratePhysicalDevices() const; + + /// Creates a debug callback messenger. + /// @throw Exception on creation failure. + DebugUtilsMessenger CreateDebugUtilsMessenger( + const VkDebugUtilsMessengerCreateInfoEXT& create_info) const; - /// Tries to create a debug callback messenger. Returns an empty handle on failure. - DebugCallback TryCreateDebugCallback(PFN_vkDebugUtilsMessengerCallbackEXT callback) noexcept; + /// Returns the dispatch table. + const InstanceDispatch& Dispatch() const noexcept { + return *dld; + } }; class Queue { @@ -581,7 +600,8 @@ public: constexpr Queue() noexcept = default; /// Construct a queue handle. - constexpr Queue(VkQueue queue, const DeviceDispatch& dld) noexcept : queue{queue}, dld{&dld} {} + constexpr Queue(VkQueue queue_, const DeviceDispatch& dld_) noexcept + : queue{queue_}, dld{&dld_} {} VkResult Submit(Span<VkSubmitInfo> submit_infos, VkFence fence = VK_NULL_HANDLE) const noexcept { @@ -603,6 +623,17 @@ class Buffer : public Handle<VkBuffer, VkDevice, DeviceDispatch> { public: /// Attaches a memory allocation. void BindMemory(VkDeviceMemory memory, VkDeviceSize offset) const; + + /// Set object name. + void SetObjectNameEXT(const char* name) const; +}; + +class BufferView : public Handle<VkBufferView, VkDevice, DeviceDispatch> { + using Handle<VkBufferView, VkDevice, DeviceDispatch>::Handle; + +public: + /// Set object name. + void SetObjectNameEXT(const char* name) const; };
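Queue::Submit() takes Span<VkSubmitInfo>, so a single submission rides on the one-element Span conversion. A sketch, assuming the wrapper's usual operator* accessor on Fence and the address() helper shown further down on CommandBuffer:

const VkSubmitInfo submit_info{
    .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
    .pNext = nullptr,
    .waitSemaphoreCount = 0,
    .pWaitSemaphores = nullptr,
    .pWaitDstStageMask = nullptr,
    .commandBufferCount = 1,
    .pCommandBuffers = cmdbuf.address(),
    .signalSemaphoreCount = 0,
    .pSignalSemaphores = nullptr,
};
// Submit() returns the raw VkResult, so the caller decides how to react
// to device loss or out-of-memory instead of catching an exception.
if (queue.Submit(submit_info, *fence) != VK_SUCCESS) {
    // handle the error
}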
class Image : public Handle<VkImage, VkDevice, DeviceDispatch> { @@ -611,12 +642,26 @@ public: /// Attaches a memory allocation. void BindMemory(VkDeviceMemory memory, VkDeviceSize offset) const; + + /// Set object name. + void SetObjectNameEXT(const char* name) const; +}; + +class ImageView : public Handle<VkImageView, VkDevice, DeviceDispatch> { + using Handle<VkImageView, VkDevice, DeviceDispatch>::Handle; + +public: + /// Set object name. + void SetObjectNameEXT(const char* name) const; }; class DeviceMemory : public Handle<VkDeviceMemory, VkDevice, DeviceDispatch> { using Handle<VkDeviceMemory, VkDevice, DeviceDispatch>::Handle; public: + /// Set object name. + void SetObjectNameEXT(const char* name) const; + u8* Map(VkDeviceSize offset, VkDeviceSize size) const { void* data; Check(dld->vkMapMemory(owner, handle, offset, size, 0, &data)); @@ -632,6 +677,9 @@ class Fence : public Handle<VkFence, VkDevice, DeviceDispatch> { using Handle<VkFence, VkDevice, DeviceDispatch>::Handle; public: + /// Set object name. + void SetObjectNameEXT(const char* name) const; + VkResult Wait(u64 timeout = std::numeric_limits<u64>::max()) const noexcept { return dld->vkWaitForFences(owner, 1, &handle, true, timeout); } @@ -645,11 +693,22 @@ public: } } +class Framebuffer : public Handle<VkFramebuffer, VkDevice, DeviceDispatch> { + using Handle<VkFramebuffer, VkDevice, DeviceDispatch>::Handle; + +public: + /// Set object name. + void SetObjectNameEXT(const char* name) const; +}; + class DescriptorPool : public Handle<VkDescriptorPool, VkDevice, DeviceDispatch> { using Handle<VkDescriptorPool, VkDevice, DeviceDispatch>::Handle; public: DescriptorSets Allocate(const VkDescriptorSetAllocateInfo& ai) const; + + /// Set object name. + void SetObjectNameEXT(const char* name) const; }; class CommandPool : public Handle<VkCommandPool, VkDevice, DeviceDispatch> { @@ -658,6 +717,9 @@ class CommandPool : public Handle<VkCommandPool, VkDevice, DeviceDispatch> { public: CommandBuffers Allocate(std::size_t num_buffers, VkCommandBufferLevel level = VK_COMMAND_BUFFER_LEVEL_PRIMARY) const; + + /// Set object name. + void SetObjectNameEXT(const char* name) const; }; class SwapchainKHR : public Handle<VkSwapchainKHR, VkDevice, DeviceDispatch> { @@ -671,15 +733,29 @@ class Event : public Handle<VkEvent, VkDevice, DeviceDispatch> { using Handle<VkEvent, VkDevice, DeviceDispatch>::Handle; public: + /// Set object name. + void SetObjectNameEXT(const char* name) const; + VkResult GetStatus() const noexcept { return dld->vkGetEventStatus(owner, handle); } }; +class ShaderModule : public Handle<VkShaderModule, VkDevice, DeviceDispatch> { + using Handle<VkShaderModule, VkDevice, DeviceDispatch>::Handle; + +public: + /// Set object name. + void SetObjectNameEXT(const char* name) const; +}; + class Semaphore : public Handle<VkSemaphore, VkDevice, DeviceDispatch> { using Handle<VkSemaphore, VkDevice, DeviceDispatch>::Handle; public: + /// Set object name. + void SetObjectNameEXT(const char* name) const;
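DeviceMemory::Map() and Fence::Wait() above illustrate the wrapper's error-handling split: Map() funnels failure through Check(), which throws, while Wait() returns the raw VkResult so that VK_TIMEOUT stays a normal, non-exceptional outcome. A usage sketch; upload, size and Unmap() are assumptions, since the hunk cuts DeviceMemory off before the rest of the class:

u8* const data = memory.Map(0, size);
std::memcpy(data, upload.data(), size); // requires <cstring>
memory.Unmap(); // assumed counterpart to Map() in the full class

switch (fence.Wait(1'000'000'000)) { // one second, in nanoseconds
case VK_SUCCESS:
    break;
case VK_TIMEOUT:
    break; // still executing: poll again or keep waiting
default:
    break; // VK_ERROR_DEVICE_LOST and friends
}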
+ [[nodiscard]] u64 GetCounter() const { u64 value; Check(dld->vkGetSemaphoreCounterValueKHR(owner, handle, &value)); @@ -720,7 +796,7 @@ class Device : public Handle<VkDevice, NoOwner, DeviceDispatch> { public: static Device Create(VkPhysicalDevice physical_device, Span<VkDeviceQueueCreateInfo> queues_ci, Span<const char*> enabled_extensions, const void* next, - DeviceDispatch& dld) noexcept; + DeviceDispatch& dispatch); Queue GetQueue(u32 family_index) const noexcept; @@ -809,8 +885,9 @@ class PhysicalDevice { public: constexpr PhysicalDevice() noexcept = default; - constexpr PhysicalDevice(VkPhysicalDevice physical_device, const InstanceDispatch& dld) noexcept - : physical_device{physical_device}, dld{&dld} {} + constexpr PhysicalDevice(VkPhysicalDevice physical_device_, + const InstanceDispatch& dld_) noexcept + : physical_device{physical_device_}, dld{&dld_} {} constexpr operator VkPhysicalDevice() const noexcept { return physical_device; @@ -849,8 +926,8 @@ class CommandBuffer { public: CommandBuffer() noexcept = default; - explicit CommandBuffer(VkCommandBuffer handle, const DeviceDispatch& dld) noexcept - : handle{handle}, dld{&dld} {} + explicit CommandBuffer(VkCommandBuffer handle_, const DeviceDispatch& dld_) noexcept + : handle{handle_}, dld{&dld_} {} const VkCommandBuffer* address() const noexcept { return &handle; @@ -929,6 +1006,12 @@ public: regions.data(), filter); } + void ResolveImage(VkImage src_image, VkImageLayout src_layout, VkImage dst_image, + VkImageLayout dst_layout, Span<VkImageResolve> regions) { + dld->vkCmdResolveImage(handle, src_image, src_layout, dst_image, dst_layout, regions.size(), + regions.data()); + } + void Dispatch(u32 x, u32 y, u32 z) const noexcept { dld->vkCmdDispatch(handle, x, y, z); } @@ -943,6 +1026,23 @@ public: image_barriers.size(), image_barriers.data()); } + void PipelineBarrier(VkPipelineStageFlags src_stage_mask, VkPipelineStageFlags dst_stage_mask, + VkDependencyFlags dependency_flags = 0) const noexcept { + PipelineBarrier(src_stage_mask, dst_stage_mask, dependency_flags, {}, {}, {}); + } + + void PipelineBarrier(VkPipelineStageFlags src_stage_mask, VkPipelineStageFlags dst_stage_mask, + VkDependencyFlags dependency_flags, + const VkBufferMemoryBarrier& buffer_barrier) const noexcept { + PipelineBarrier(src_stage_mask, dst_stage_mask, dependency_flags, {}, buffer_barrier, {}); + } + + void PipelineBarrier(VkPipelineStageFlags src_stage_mask, VkPipelineStageFlags dst_stage_mask, + VkDependencyFlags dependency_flags, + const VkImageMemoryBarrier& image_barrier) const noexcept { + PipelineBarrier(src_stage_mask, dst_stage_mask, dependency_flags, {}, {}, image_barrier); + } + void CopyBufferToImage(VkBuffer src_buffer, VkImage dst_image, VkImageLayout dst_image_layout, Span<VkBufferImageCopy> regions) const noexcept { dld->vkCmdCopyBufferToImage(handle, src_buffer, dst_image, dst_image_layout, regions.size(), @@ -976,6 +1076,13 @@ public: dld->vkCmdPushConstants(handle, layout, flags, offset, size, values); } + template <typename T> + void PushConstants(VkPipelineLayout layout, VkShaderStageFlags flags, + const T& data) const noexcept { + static_assert(std::is_trivially_copyable_v<T>, "<data> is not trivially copyable"); + dld->vkCmdPushConstants(handle, layout, flags, 0, static_cast<u32>(sizeof(T)), &data); + } + void SetViewport(u32 first, Span<VkViewport> viewports) const noexcept { dld->vkCmdSetViewport(handle, first, viewports.size(), viewports.data()); } @@ -1085,11 +1192,27 @@ public: counter_buffers, counter_buffer_offsets); }
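Two of the CommandBuffer additions above trim call-site boilerplate: the templated PushConstants() derives offset 0 and sizeof(T) from any trivially copyable struct, and the single-barrier PipelineBarrier() overloads forward through the one-element Span conversion. A sketch using only core Vulkan types; the push-constant struct, layout and image are placeholders:

struct BlitParams { // placeholder push-constant block
    float scale[2];
    float offset[2];
};
const BlitParams params{{1.0f, 1.0f}, {0.0f, 0.0f}};
cmdbuf.PushConstants(layout, VK_SHADER_STAGE_VERTEX_BIT, params);

const VkImageMemoryBarrier barrier{
    .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER,
    .pNext = nullptr,
    .srcAccessMask = VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT,
    .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT,
    .oldLayout = VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
    .newLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL,
    .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
    .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
    .image = image,
    .subresourceRange{VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1},
};
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT,
                       VK_PIPELINE_STAGE_TRANSFER_BIT, 0, barrier);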
+ void BeginDebugUtilsLabelEXT(const char* label, std::span<float, 4> color) const noexcept { + const VkDebugUtilsLabelEXT label_info{ + .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT, + .pNext = nullptr, + .pLabelName = label, + .color{color[0], color[1], color[2], color[3]}, + }; + dld->vkCmdBeginDebugUtilsLabelEXT(handle, &label_info); + } + + void EndDebugUtilsLabelEXT() const noexcept { + dld->vkCmdEndDebugUtilsLabelEXT(handle); + } + private: VkCommandBuffer handle; const DeviceDispatch* dld; }; +u32 AvailableVersion(const InstanceDispatch& dld) noexcept; + std::optional<std::vector<VkExtensionProperties>> EnumerateInstanceExtensionProperties( const InstanceDispatch& dld);
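BeginDebugUtilsLabelEXT() and EndDebugUtilsLabelEXT() must stay balanced for tools such as RenderDoc to nest command-buffer regions correctly. A small RAII helper, purely illustrative and not part of the wrapper, keeps that invariant across early returns:

class ScopedDebugLabel { // hypothetical convenience type
public:
    ScopedDebugLabel(const vk::CommandBuffer& cmdbuf_, const char* label) : cmdbuf{cmdbuf_} {
        float color[4]{0.0f, 1.0f, 0.0f, 1.0f}; // a float[4] binds to std::span<float, 4>
        cmdbuf.BeginDebugUtilsLabelEXT(label, color);
    }
    ScopedDebugLabel(const ScopedDebugLabel&) = delete;
    ScopedDebugLabel& operator=(const ScopedDebugLabel&) = delete;
    ~ScopedDebugLabel() {
        cmdbuf.EndDebugUtilsLabelEXT();
    }

private:
    const vk::CommandBuffer& cmdbuf;
};

Both wrapper methods call straight through the dispatch table without null checks, so this only works when VK_EXT_debug_utils was actually enabled on the instance.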