diff options
author | Fernando S <fsahmkow27@gmail.com> | 2022-10-06 21:29:53 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-10-06 21:29:53 +0200 |
commit | 1effa578f12f79d7816e3543291f302f126cc1d2 (patch) | |
tree | 14803b31b6817294d40d57446f6fa94c5ff3fe9a /src/video_core/engines | |
parent | Merge pull request #9025 from FernandoS27/slava-ukrayini (diff) | |
parent | vulkan_blitter: Fix pool allocation double free. (diff) | |
download | yuzu-1effa578f12f79d7816e3543291f302f126cc1d2.tar yuzu-1effa578f12f79d7816e3543291f302f126cc1d2.tar.gz yuzu-1effa578f12f79d7816e3543291f302f126cc1d2.tar.bz2 yuzu-1effa578f12f79d7816e3543291f302f126cc1d2.tar.lz yuzu-1effa578f12f79d7816e3543291f302f126cc1d2.tar.xz yuzu-1effa578f12f79d7816e3543291f302f126cc1d2.tar.zst yuzu-1effa578f12f79d7816e3543291f302f126cc1d2.zip |
Diffstat (limited to 'src/video_core/engines')
-rw-r--r-- | src/video_core/engines/engine_upload.cpp | 46 | ||||
-rw-r--r-- | src/video_core/engines/engine_upload.h | 6 | ||||
-rw-r--r-- | src/video_core/engines/kepler_compute.cpp | 13 | ||||
-rw-r--r-- | src/video_core/engines/kepler_memory.cpp | 13 | ||||
-rw-r--r-- | src/video_core/engines/maxwell_3d.cpp | 43 | ||||
-rw-r--r-- | src/video_core/engines/maxwell_dma.cpp | 111 | ||||
-rw-r--r-- | src/video_core/engines/maxwell_dma.h | 6 | ||||
-rw-r--r-- | src/video_core/engines/puller.cpp | 306 | ||||
-rw-r--r-- | src/video_core/engines/puller.h | 177 |
9 files changed, 648 insertions, 73 deletions
diff --git a/src/video_core/engines/engine_upload.cpp b/src/video_core/engines/engine_upload.cpp index 6ff5b1eca..a34819234 100644 --- a/src/video_core/engines/engine_upload.cpp +++ b/src/video_core/engines/engine_upload.cpp @@ -3,6 +3,7 @@ #include <cstring> +#include "common/algorithm.h" #include "common/assert.h" #include "video_core/engines/engine_upload.h" #include "video_core/memory_manager.h" @@ -34,21 +35,48 @@ void State::ProcessData(const u32 data, const bool is_last_call) { if (!is_last_call) { return; } + ProcessData(inner_buffer); +} + +void State::ProcessData(const u32* data, size_t num_data) { + std::span<const u8> read_buffer(reinterpret_cast<const u8*>(data), num_data * sizeof(u32)); + ProcessData(read_buffer); +} + +void State::ProcessData(std::span<const u8> read_buffer) { const GPUVAddr address{regs.dest.Address()}; if (is_linear) { - rasterizer->AccelerateInlineToMemory(address, copy_size, inner_buffer); + if (regs.line_count == 1) { + rasterizer->AccelerateInlineToMemory(address, copy_size, read_buffer); + } else { + for (u32 line = 0; line < regs.line_count; ++line) { + const GPUVAddr dest_line = address + static_cast<size_t>(line) * regs.dest.pitch; + memory_manager.WriteBlockUnsafe( + dest_line, read_buffer.data() + static_cast<size_t>(line) * regs.line_length_in, + regs.line_length_in); + } + memory_manager.InvalidateRegion(address, regs.dest.pitch * regs.line_count); + } } else { - UNIMPLEMENTED_IF(regs.dest.z != 0); - UNIMPLEMENTED_IF(regs.dest.depth != 1); - UNIMPLEMENTED_IF(regs.dest.BlockWidth() != 0); - UNIMPLEMENTED_IF(regs.dest.BlockDepth() != 0); + u32 width = regs.dest.width; + u32 x_elements = regs.line_length_in; + u32 x_offset = regs.dest.x; + const u32 bpp_shift = Common::FoldRight( + 4U, [](u32 x, u32 y) { return std::min(x, static_cast<u32>(std::countr_zero(y))); }, + width, x_elements, x_offset, static_cast<u32>(address)); + width >>= bpp_shift; + x_elements >>= bpp_shift; + x_offset >>= bpp_shift; + const u32 bytes_per_pixel = 1U << bpp_shift; const std::size_t dst_size = Tegra::Texture::CalculateSize( - true, 1, regs.dest.width, regs.dest.height, 1, regs.dest.BlockHeight(), 0); + true, bytes_per_pixel, width, regs.dest.height, regs.dest.depth, + regs.dest.BlockHeight(), regs.dest.BlockDepth()); tmp_buffer.resize(dst_size); memory_manager.ReadBlock(address, tmp_buffer.data(), dst_size); - Tegra::Texture::SwizzleKepler(regs.dest.width, regs.dest.height, regs.dest.x, regs.dest.y, - regs.dest.BlockHeight(), copy_size, inner_buffer.data(), - tmp_buffer.data()); + Tegra::Texture::SwizzleSubrect(tmp_buffer, read_buffer, bytes_per_pixel, width, + regs.dest.height, regs.dest.depth, x_offset, regs.dest.y, + x_elements, regs.line_count, regs.dest.BlockHeight(), + regs.dest.BlockDepth(), regs.line_length_in); memory_manager.WriteBlock(address, tmp_buffer.data(), dst_size); } } diff --git a/src/video_core/engines/engine_upload.h b/src/video_core/engines/engine_upload.h index 94ff3314a..f08f6e36a 100644 --- a/src/video_core/engines/engine_upload.h +++ b/src/video_core/engines/engine_upload.h @@ -3,6 +3,7 @@ #pragma once +#include <span> #include <vector> #include "common/bit_field.h" #include "common/common_types.h" @@ -33,7 +34,7 @@ struct Registers { u32 width; u32 height; u32 depth; - u32 z; + u32 layer; u32 x; u32 y; @@ -62,11 +63,14 @@ public: void ProcessExec(bool is_linear_); void ProcessData(u32 data, bool is_last_call); + void ProcessData(const u32* data, size_t num_data); /// Binds a rasterizer to this engine. void BindRasterizer(VideoCore::RasterizerInterface* rasterizer); private: + void ProcessData(std::span<const u8> read_buffer); + u32 write_offset = 0; u32 copy_size = 0; std::vector<u8> inner_buffer; diff --git a/src/video_core/engines/kepler_compute.cpp b/src/video_core/engines/kepler_compute.cpp index 5db254d94..7c50bdbe0 100644 --- a/src/video_core/engines/kepler_compute.cpp +++ b/src/video_core/engines/kepler_compute.cpp @@ -36,8 +36,6 @@ void KeplerCompute::CallMethod(u32 method, u32 method_argument, bool is_last_cal } case KEPLER_COMPUTE_REG_INDEX(data_upload): { upload_state.ProcessData(method_argument, is_last_call); - if (is_last_call) { - } break; } case KEPLER_COMPUTE_REG_INDEX(launch): @@ -50,8 +48,15 @@ void KeplerCompute::CallMethod(u32 method, u32 method_argument, bool is_last_cal void KeplerCompute::CallMultiMethod(u32 method, const u32* base_start, u32 amount, u32 methods_pending) { - for (std::size_t i = 0; i < amount; i++) { - CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1); + switch (method) { + case KEPLER_COMPUTE_REG_INDEX(data_upload): + upload_state.ProcessData(base_start, static_cast<size_t>(amount)); + return; + default: + for (std::size_t i = 0; i < amount; i++) { + CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1); + } + break; } } diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp index e2b029542..a3fbab1e5 100644 --- a/src/video_core/engines/kepler_memory.cpp +++ b/src/video_core/engines/kepler_memory.cpp @@ -33,8 +33,6 @@ void KeplerMemory::CallMethod(u32 method, u32 method_argument, bool is_last_call } case KEPLERMEMORY_REG_INDEX(data): { upload_state.ProcessData(method_argument, is_last_call); - if (is_last_call) { - } break; } } @@ -42,8 +40,15 @@ void KeplerMemory::CallMethod(u32 method, u32 method_argument, bool is_last_call void KeplerMemory::CallMultiMethod(u32 method, const u32* base_start, u32 amount, u32 methods_pending) { - for (std::size_t i = 0; i < amount; i++) { - CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1); + switch (method) { + case KEPLERMEMORY_REG_INDEX(data): + upload_state.ProcessData(base_start, static_cast<size_t>(amount)); + return; + default: + for (std::size_t i = 0; i < amount; i++) { + CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1); + } + break; } } diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 3a4646289..3c6e44a25 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -219,6 +219,8 @@ void Maxwell3D::ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argume regs.index_array.count = regs.small_index_2.count; regs.index_array.first = regs.small_index_2.first; dirty.flags[VideoCommon::Dirty::IndexBuffer] = true; + // a macro calls this one over and over, should it increase instancing? + // Used by Hades and likely other Vulkan games. return DrawArrays(); case MAXWELL3D_REG_INDEX(topology_override): use_topology_override = true; @@ -237,11 +239,12 @@ void Maxwell3D::ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argume return upload_state.ProcessExec(regs.exec_upload.linear != 0); case MAXWELL3D_REG_INDEX(data_upload): upload_state.ProcessData(argument, is_last_call); - if (is_last_call) { - } return; case MAXWELL3D_REG_INDEX(fragment_barrier): return rasterizer->FragmentBarrier(); + case MAXWELL3D_REG_INDEX(invalidate_texture_data_cache): + rasterizer->InvalidateGPUCache(); + return rasterizer->WaitForIdle(); case MAXWELL3D_REG_INDEX(tiled_cache_barrier): return rasterizer->TiledCacheBarrier(); } @@ -311,6 +314,9 @@ void Maxwell3D::CallMultiMethod(u32 method, const u32* base_start, u32 amount, case MAXWELL3D_REG_INDEX(const_buffer.cb_data) + 15: ProcessCBMultiData(base_start, amount); break; + case MAXWELL3D_REG_INDEX(data_upload): + upload_state.ProcessData(base_start, static_cast<size_t>(amount)); + return; default: for (std::size_t i = 0; i < amount; i++) { CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1); @@ -447,18 +453,10 @@ void Maxwell3D::ProcessFirmwareCall4() { } void Maxwell3D::StampQueryResult(u64 payload, bool long_query) { - struct LongQueryResult { - u64_le value; - u64_le timestamp; - }; - static_assert(sizeof(LongQueryResult) == 16, "LongQueryResult has wrong size"); const GPUVAddr sequence_address{regs.query.QueryAddress()}; if (long_query) { - // Write the 128-bit result structure in long mode. Note: We emulate an infinitely fast - // GPU, this command may actually take a while to complete in real hardware due to GPU - // wait queues. - LongQueryResult query_result{payload, system.GPU().GetTicks()}; - memory_manager.WriteBlock(sequence_address, &query_result, sizeof(query_result)); + memory_manager.Write<u64>(sequence_address + sizeof(u64), system.GPU().GetTicks()); + memory_manager.Write<u64>(sequence_address, payload); } else { memory_manager.Write<u32>(sequence_address, static_cast<u32>(payload)); } @@ -472,10 +470,25 @@ void Maxwell3D::ProcessQueryGet() { switch (regs.query.query_get.operation) { case Regs::QueryOperation::Release: - if (regs.query.query_get.fence == 1) { - rasterizer->SignalSemaphore(regs.query.QueryAddress(), regs.query.query_sequence); + if (regs.query.query_get.fence == 1 || regs.query.query_get.short_query != 0) { + const GPUVAddr sequence_address{regs.query.QueryAddress()}; + const u32 payload = regs.query.query_sequence; + std::function<void()> operation([this, sequence_address, payload] { + memory_manager.Write<u32>(sequence_address, payload); + }); + rasterizer->SignalFence(std::move(operation)); } else { - StampQueryResult(regs.query.query_sequence, regs.query.query_get.short_query == 0); + struct LongQueryResult { + u64_le value; + u64_le timestamp; + }; + const GPUVAddr sequence_address{regs.query.QueryAddress()}; + const u32 payload = regs.query.query_sequence; + std::function<void()> operation([this, sequence_address, payload] { + memory_manager.Write<u64>(sequence_address + sizeof(u64), system.GPU().GetTicks()); + memory_manager.Write<u64>(sequence_address, payload); + }); + rasterizer->SyncOperation(std::move(operation)); } break; case Regs::QueryOperation::Acquire: diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index 0efe58282..3909d36c1 100644 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp @@ -1,6 +1,7 @@ // SPDX-FileCopyrightText: Copyright 2018 yuzu Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later +#include "common/algorithm.h" #include "common/assert.h" #include "common/logging/log.h" #include "common/microprofile.h" @@ -54,8 +55,6 @@ void MaxwellDMA::Launch() { const LaunchDMA& launch = regs.launch_dma; ASSERT(launch.interrupt_type == LaunchDMA::InterruptType::NONE); ASSERT(launch.data_transfer_type == LaunchDMA::DataTransferType::NON_PIPELINED); - ASSERT(regs.dst_params.origin.x == 0); - ASSERT(regs.dst_params.origin.y == 0); const bool is_src_pitch = launch.src_memory_layout == LaunchDMA::MemoryLayout::PITCH; const bool is_dst_pitch = launch.dst_memory_layout == LaunchDMA::MemoryLayout::PITCH; @@ -121,23 +120,40 @@ void MaxwellDMA::CopyPitchToPitch() { void MaxwellDMA::CopyBlockLinearToPitch() { UNIMPLEMENTED_IF(regs.src_params.block_size.width != 0); - UNIMPLEMENTED_IF(regs.src_params.block_size.depth != 0); UNIMPLEMENTED_IF(regs.src_params.layer != 0); + const bool is_remapping = regs.launch_dma.remap_enable != 0; + // Optimized path for micro copies. const size_t dst_size = static_cast<size_t>(regs.pitch_out) * regs.line_count; - if (dst_size < GOB_SIZE && regs.pitch_out <= GOB_SIZE_X && + if (!is_remapping && dst_size < GOB_SIZE && regs.pitch_out <= GOB_SIZE_X && regs.src_params.height > GOB_SIZE_Y) { FastCopyBlockLinearToPitch(); return; } // Deswizzle the input and copy it over. - UNIMPLEMENTED_IF(regs.launch_dma.remap_enable != 0); - const u32 bytes_per_pixel = - regs.launch_dma.remap_enable ? regs.pitch_out / regs.line_length_in : 1; const Parameters& src_params = regs.src_params; - const u32 width = src_params.width; + + const u32 num_remap_components = regs.remap_const.num_dst_components_minus_one + 1; + const u32 remap_components_size = regs.remap_const.component_size_minus_one + 1; + + const u32 base_bpp = !is_remapping ? 1U : num_remap_components * remap_components_size; + + u32 width = src_params.width; + u32 x_elements = regs.line_length_in; + u32 x_offset = src_params.origin.x; + u32 bpp_shift = 0U; + if (!is_remapping) { + bpp_shift = Common::FoldRight( + 4U, [](u32 x, u32 y) { return std::min(x, static_cast<u32>(std::countr_zero(y))); }, + width, x_elements, x_offset, static_cast<u32>(regs.offset_in)); + width >>= bpp_shift; + x_elements >>= bpp_shift; + x_offset >>= bpp_shift; + } + + const u32 bytes_per_pixel = base_bpp << bpp_shift; const u32 height = src_params.height; const u32 depth = src_params.depth; const u32 block_height = src_params.block_size.height; @@ -155,30 +171,45 @@ void MaxwellDMA::CopyBlockLinearToPitch() { memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size); memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size); - UnswizzleSubrect(regs.line_length_in, regs.line_count, regs.pitch_out, width, bytes_per_pixel, - block_height, src_params.origin.x, src_params.origin.y, write_buffer.data(), - read_buffer.data()); + UnswizzleSubrect(write_buffer, read_buffer, bytes_per_pixel, width, height, depth, x_offset, + src_params.origin.y, x_elements, regs.line_count, block_height, block_depth, + regs.pitch_out); memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size); } void MaxwellDMA::CopyPitchToBlockLinear() { UNIMPLEMENTED_IF_MSG(regs.dst_params.block_size.width != 0, "Block width is not one"); - UNIMPLEMENTED_IF(regs.launch_dma.remap_enable != 0); + UNIMPLEMENTED_IF(regs.dst_params.layer != 0); + + const bool is_remapping = regs.launch_dma.remap_enable != 0; + const u32 num_remap_components = regs.remap_const.num_dst_components_minus_one + 1; + const u32 remap_components_size = regs.remap_const.component_size_minus_one + 1; const auto& dst_params = regs.dst_params; - const u32 bytes_per_pixel = - regs.launch_dma.remap_enable ? regs.pitch_in / regs.line_length_in : 1; - const u32 width = dst_params.width; + + const u32 base_bpp = !is_remapping ? 1U : num_remap_components * remap_components_size; + + u32 width = dst_params.width; + u32 x_elements = regs.line_length_in; + u32 x_offset = dst_params.origin.x; + u32 bpp_shift = 0U; + if (!is_remapping) { + bpp_shift = Common::FoldRight( + 4U, [](u32 x, u32 y) { return std::min(x, static_cast<u32>(std::countr_zero(y))); }, + width, x_elements, x_offset, static_cast<u32>(regs.offset_out)); + width >>= bpp_shift; + x_elements >>= bpp_shift; + x_offset >>= bpp_shift; + } + + const u32 bytes_per_pixel = base_bpp << bpp_shift; const u32 height = dst_params.height; const u32 depth = dst_params.depth; const u32 block_height = dst_params.block_size.height; const u32 block_depth = dst_params.block_size.depth; const size_t dst_size = CalculateSize(true, bytes_per_pixel, width, height, depth, block_height, block_depth); - const size_t dst_layer_size = - CalculateSize(true, bytes_per_pixel, width, height, 1, block_height, block_depth); - const size_t src_size = static_cast<size_t>(regs.pitch_in) * regs.line_count; if (read_buffer.size() < src_size) { @@ -188,32 +219,23 @@ void MaxwellDMA::CopyPitchToBlockLinear() { write_buffer.resize(dst_size); } + memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size); if (Settings::IsGPULevelExtreme()) { - memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size); memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size); } else { - memory_manager.ReadBlockUnsafe(regs.offset_in, read_buffer.data(), src_size); memory_manager.ReadBlockUnsafe(regs.offset_out, write_buffer.data(), dst_size); } // If the input is linear and the output is tiled, swizzle the input and copy it over. - if (regs.dst_params.block_size.depth > 0) { - ASSERT(dst_params.layer == 0); - SwizzleSliceToVoxel(regs.line_length_in, regs.line_count, regs.pitch_in, width, height, - bytes_per_pixel, block_height, block_depth, dst_params.origin.x, - dst_params.origin.y, write_buffer.data(), read_buffer.data()); - } else { - SwizzleSubrect(regs.line_length_in, regs.line_count, regs.pitch_in, width, bytes_per_pixel, - write_buffer.data() + dst_layer_size * dst_params.layer, read_buffer.data(), - block_height, dst_params.origin.x, dst_params.origin.y); - } + SwizzleSubrect(write_buffer, read_buffer, bytes_per_pixel, width, height, depth, x_offset, + dst_params.origin.y, x_elements, regs.line_count, block_height, block_depth, + regs.pitch_in); memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size); } void MaxwellDMA::FastCopyBlockLinearToPitch() { - const u32 bytes_per_pixel = - regs.launch_dma.remap_enable ? regs.pitch_out / regs.line_length_in : 1; + const u32 bytes_per_pixel = 1U; const size_t src_size = GOB_SIZE; const size_t dst_size = static_cast<size_t>(regs.pitch_out) * regs.line_count; u32 pos_x = regs.src_params.origin.x; @@ -239,9 +261,10 @@ void MaxwellDMA::FastCopyBlockLinearToPitch() { memory_manager.ReadBlockUnsafe(regs.offset_out, write_buffer.data(), dst_size); } - UnswizzleSubrect(regs.line_length_in, regs.line_count, regs.pitch_out, regs.src_params.width, - bytes_per_pixel, regs.src_params.block_size.height, pos_x, pos_y, - write_buffer.data(), read_buffer.data()); + UnswizzleSubrect(write_buffer, read_buffer, bytes_per_pixel, regs.src_params.width, + regs.src_params.height, 1, pos_x, pos_y, regs.line_length_in, regs.line_count, + regs.src_params.block_size.height, regs.src_params.block_size.depth, + regs.pitch_out); memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size); } @@ -249,16 +272,24 @@ void MaxwellDMA::FastCopyBlockLinearToPitch() { void MaxwellDMA::ReleaseSemaphore() { const auto type = regs.launch_dma.semaphore_type; const GPUVAddr address = regs.semaphore.address; + const u32 payload = regs.semaphore.payload; switch (type) { case LaunchDMA::SemaphoreType::NONE: break; - case LaunchDMA::SemaphoreType::RELEASE_ONE_WORD_SEMAPHORE: - memory_manager.Write<u32>(address, regs.semaphore.payload); + case LaunchDMA::SemaphoreType::RELEASE_ONE_WORD_SEMAPHORE: { + std::function<void()> operation( + [this, address, payload] { memory_manager.Write<u32>(address, payload); }); + rasterizer->SignalFence(std::move(operation)); break; - case LaunchDMA::SemaphoreType::RELEASE_FOUR_WORD_SEMAPHORE: - memory_manager.Write<u64>(address, static_cast<u64>(regs.semaphore.payload)); - memory_manager.Write<u64>(address + 8, system.GPU().GetTicks()); + } + case LaunchDMA::SemaphoreType::RELEASE_FOUR_WORD_SEMAPHORE: { + std::function<void()> operation([this, address, payload] { + memory_manager.Write<u64>(address + sizeof(u64), system.GPU().GetTicks()); + memory_manager.Write<u64>(address, payload); + }); + rasterizer->SignalFence(std::move(operation)); break; + } default: ASSERT_MSG(false, "Unknown semaphore type: {}", static_cast<u32>(type.Value())); } diff --git a/src/video_core/engines/maxwell_dma.h b/src/video_core/engines/maxwell_dma.h index 074bac92c..bc48320ce 100644 --- a/src/video_core/engines/maxwell_dma.h +++ b/src/video_core/engines/maxwell_dma.h @@ -189,10 +189,16 @@ public: BitField<4, 3, Swizzle> dst_y; BitField<8, 3, Swizzle> dst_z; BitField<12, 3, Swizzle> dst_w; + BitField<0, 12, u32> dst_components_raw; BitField<16, 2, u32> component_size_minus_one; BitField<20, 2, u32> num_src_components_minus_one; BitField<24, 2, u32> num_dst_components_minus_one; }; + + Swizzle GetComponent(size_t i) const { + const u32 raw = dst_components_raw; + return static_cast<Swizzle>((raw >> (i * 3)) & 0x7); + } }; static_assert(sizeof(RemapConst) == 12); diff --git a/src/video_core/engines/puller.cpp b/src/video_core/engines/puller.cpp new file mode 100644 index 000000000..cca890792 --- /dev/null +++ b/src/video_core/engines/puller.cpp @@ -0,0 +1,306 @@ +// SPDX-FileCopyrightText: 2022 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "common/assert.h" +#include "common/logging/log.h" +#include "common/settings.h" +#include "core/core.h" +#include "video_core/control/channel_state.h" +#include "video_core/dma_pusher.h" +#include "video_core/engines/fermi_2d.h" +#include "video_core/engines/kepler_compute.h" +#include "video_core/engines/kepler_memory.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/engines/maxwell_dma.h" +#include "video_core/engines/puller.h" +#include "video_core/gpu.h" +#include "video_core/memory_manager.h" +#include "video_core/rasterizer_interface.h" + +namespace Tegra::Engines { + +Puller::Puller(GPU& gpu_, MemoryManager& memory_manager_, DmaPusher& dma_pusher_, + Control::ChannelState& channel_state_) + : gpu{gpu_}, memory_manager{memory_manager_}, dma_pusher{dma_pusher_}, channel_state{ + channel_state_} {} + +Puller::~Puller() = default; + +void Puller::ProcessBindMethod(const MethodCall& method_call) { + // Bind the current subchannel to the desired engine id. + LOG_DEBUG(HW_GPU, "Binding subchannel {} to engine {}", method_call.subchannel, + method_call.argument); + const auto engine_id = static_cast<EngineID>(method_call.argument); + bound_engines[method_call.subchannel] = static_cast<EngineID>(engine_id); + switch (engine_id) { + case EngineID::FERMI_TWOD_A: + dma_pusher.BindSubchannel(channel_state.fermi_2d.get(), method_call.subchannel); + break; + case EngineID::MAXWELL_B: + dma_pusher.BindSubchannel(channel_state.maxwell_3d.get(), method_call.subchannel); + break; + case EngineID::KEPLER_COMPUTE_B: + dma_pusher.BindSubchannel(channel_state.kepler_compute.get(), method_call.subchannel); + break; + case EngineID::MAXWELL_DMA_COPY_A: + dma_pusher.BindSubchannel(channel_state.maxwell_dma.get(), method_call.subchannel); + break; + case EngineID::KEPLER_INLINE_TO_MEMORY_B: + dma_pusher.BindSubchannel(channel_state.kepler_memory.get(), method_call.subchannel); + break; + default: + UNIMPLEMENTED_MSG("Unimplemented engine {:04X}", engine_id); + } +} + +void Puller::ProcessFenceActionMethod() { + switch (regs.fence_action.op) { + case Puller::FenceOperation::Acquire: + // UNIMPLEMENTED_MSG("Channel Scheduling pending."); + // WaitFence(regs.fence_action.syncpoint_id, regs.fence_value); + rasterizer->ReleaseFences(); + break; + case Puller::FenceOperation::Increment: + rasterizer->SignalSyncPoint(regs.fence_action.syncpoint_id); + break; + default: + UNIMPLEMENTED_MSG("Unimplemented operation {}", regs.fence_action.op.Value()); + } +} + +void Puller::ProcessSemaphoreTriggerMethod() { + const auto semaphoreOperationMask = 0xF; + const auto op = + static_cast<GpuSemaphoreOperation>(regs.semaphore_trigger & semaphoreOperationMask); + if (op == GpuSemaphoreOperation::WriteLong) { + const GPUVAddr sequence_address{regs.semaphore_address.SemaphoreAddress()}; + const u32 payload = regs.semaphore_sequence; + std::function<void()> operation([this, sequence_address, payload] { + memory_manager.Write<u64>(sequence_address + sizeof(u64), gpu.GetTicks()); + memory_manager.Write<u64>(sequence_address, payload); + }); + rasterizer->SignalFence(std::move(operation)); + } else { + do { + const u32 word{memory_manager.Read<u32>(regs.semaphore_address.SemaphoreAddress())}; + regs.acquire_source = true; + regs.acquire_value = regs.semaphore_sequence; + if (op == GpuSemaphoreOperation::AcquireEqual) { + regs.acquire_active = true; + regs.acquire_mode = false; + if (word != regs.acquire_value) { + rasterizer->ReleaseFences(); + continue; + } + } else if (op == GpuSemaphoreOperation::AcquireGequal) { + regs.acquire_active = true; + regs.acquire_mode = true; + if (word < regs.acquire_value) { + rasterizer->ReleaseFences(); + continue; + } + } else if (op == GpuSemaphoreOperation::AcquireMask) { + if (word && regs.semaphore_sequence == 0) { + rasterizer->ReleaseFences(); + continue; + } + } else { + LOG_ERROR(HW_GPU, "Invalid semaphore operation"); + } + } while (false); + } +} + +void Puller::ProcessSemaphoreRelease() { + const GPUVAddr sequence_address{regs.semaphore_address.SemaphoreAddress()}; + const u32 payload = regs.semaphore_release; + std::function<void()> operation([this, sequence_address, payload] { + memory_manager.Write<u32>(sequence_address, payload); + }); + rasterizer->SyncOperation(std::move(operation)); +} + +void Puller::ProcessSemaphoreAcquire() { + u32 word = memory_manager.Read<u32>(regs.semaphore_address.SemaphoreAddress()); + const auto value = regs.semaphore_acquire; + while (word != value) { + regs.acquire_active = true; + regs.acquire_value = value; + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + rasterizer->ReleaseFences(); + word = memory_manager.Read<u32>(regs.semaphore_address.SemaphoreAddress()); + // TODO(kemathe73) figure out how to do the acquire_timeout + regs.acquire_mode = false; + regs.acquire_source = false; + } +} + +/// Calls a GPU puller method. +void Puller::CallPullerMethod(const MethodCall& method_call) { + regs.reg_array[method_call.method] = method_call.argument; + const auto method = static_cast<BufferMethods>(method_call.method); + + switch (method) { + case BufferMethods::BindObject: { + ProcessBindMethod(method_call); + break; + } + case BufferMethods::Nop: + case BufferMethods::SemaphoreAddressHigh: + case BufferMethods::SemaphoreAddressLow: + case BufferMethods::SemaphoreSequencePayload: + case BufferMethods::SyncpointPayload: + break; + case BufferMethods::WrcacheFlush: + case BufferMethods::RefCnt: + rasterizer->SignalReference(); + break; + case BufferMethods::SyncpointOperation: + ProcessFenceActionMethod(); + break; + case BufferMethods::WaitForIdle: + rasterizer->WaitForIdle(); + break; + case BufferMethods::SemaphoreOperation: { + ProcessSemaphoreTriggerMethod(); + break; + } + case BufferMethods::NonStallInterrupt: { + LOG_ERROR(HW_GPU, "Special puller engine method NonStallInterrupt not implemented"); + break; + } + case BufferMethods::MemOpA: { + LOG_ERROR(HW_GPU, "Memory Operation A"); + break; + } + case BufferMethods::MemOpB: { + // Implement this better. + rasterizer->InvalidateGPUCache(); + break; + } + case BufferMethods::MemOpC: + case BufferMethods::MemOpD: { + LOG_ERROR(HW_GPU, "Memory Operation C,D"); + break; + } + case BufferMethods::SemaphoreAcquire: { + ProcessSemaphoreAcquire(); + break; + } + case BufferMethods::SemaphoreRelease: { + ProcessSemaphoreRelease(); + break; + } + case BufferMethods::Yield: { + // TODO(Kmather73): Research and implement this method. + LOG_ERROR(HW_GPU, "Special puller engine method Yield not implemented"); + break; + } + default: + LOG_ERROR(HW_GPU, "Special puller engine method {:X} not implemented", method); + break; + } +} + +/// Calls a GPU engine method. +void Puller::CallEngineMethod(const MethodCall& method_call) { + const EngineID engine = bound_engines[method_call.subchannel]; + + switch (engine) { + case EngineID::FERMI_TWOD_A: + channel_state.fermi_2d->CallMethod(method_call.method, method_call.argument, + method_call.IsLastCall()); + break; + case EngineID::MAXWELL_B: + channel_state.maxwell_3d->CallMethod(method_call.method, method_call.argument, + method_call.IsLastCall()); + break; + case EngineID::KEPLER_COMPUTE_B: + channel_state.kepler_compute->CallMethod(method_call.method, method_call.argument, + method_call.IsLastCall()); + break; + case EngineID::MAXWELL_DMA_COPY_A: + channel_state.maxwell_dma->CallMethod(method_call.method, method_call.argument, + method_call.IsLastCall()); + break; + case EngineID::KEPLER_INLINE_TO_MEMORY_B: + channel_state.kepler_memory->CallMethod(method_call.method, method_call.argument, + method_call.IsLastCall()); + break; + default: + UNIMPLEMENTED_MSG("Unimplemented engine"); + } +} + +/// Calls a GPU engine multivalue method. +void Puller::CallEngineMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount, + u32 methods_pending) { + const EngineID engine = bound_engines[subchannel]; + + switch (engine) { + case EngineID::FERMI_TWOD_A: + channel_state.fermi_2d->CallMultiMethod(method, base_start, amount, methods_pending); + break; + case EngineID::MAXWELL_B: + channel_state.maxwell_3d->CallMultiMethod(method, base_start, amount, methods_pending); + break; + case EngineID::KEPLER_COMPUTE_B: + channel_state.kepler_compute->CallMultiMethod(method, base_start, amount, methods_pending); + break; + case EngineID::MAXWELL_DMA_COPY_A: + channel_state.maxwell_dma->CallMultiMethod(method, base_start, amount, methods_pending); + break; + case EngineID::KEPLER_INLINE_TO_MEMORY_B: + channel_state.kepler_memory->CallMultiMethod(method, base_start, amount, methods_pending); + break; + default: + UNIMPLEMENTED_MSG("Unimplemented engine"); + } +} + +/// Calls a GPU method. +void Puller::CallMethod(const MethodCall& method_call) { + LOG_TRACE(HW_GPU, "Processing method {:08X} on subchannel {}", method_call.method, + method_call.subchannel); + + ASSERT(method_call.subchannel < bound_engines.size()); + + if (ExecuteMethodOnEngine(method_call.method)) { + CallEngineMethod(method_call); + } else { + CallPullerMethod(method_call); + } +} + +/// Calls a GPU multivalue method. +void Puller::CallMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount, + u32 methods_pending) { + LOG_TRACE(HW_GPU, "Processing method {:08X} on subchannel {}", method, subchannel); + + ASSERT(subchannel < bound_engines.size()); + + if (ExecuteMethodOnEngine(method)) { + CallEngineMultiMethod(method, subchannel, base_start, amount, methods_pending); + } else { + for (std::size_t i = 0; i < amount; i++) { + CallPullerMethod(MethodCall{ + method, + base_start[i], + subchannel, + methods_pending - static_cast<u32>(i), + }); + } + } +} + +void Puller::BindRasterizer(VideoCore::RasterizerInterface* rasterizer_) { + rasterizer = rasterizer_; +} + +/// Determines where the method should be executed. +[[nodiscard]] bool Puller::ExecuteMethodOnEngine(u32 method) { + const auto buffer_method = static_cast<BufferMethods>(method); + return buffer_method >= BufferMethods::NonPullerMethods; +} + +} // namespace Tegra::Engines diff --git a/src/video_core/engines/puller.h b/src/video_core/engines/puller.h new file mode 100644 index 000000000..d4175ee94 --- /dev/null +++ b/src/video_core/engines/puller.h @@ -0,0 +1,177 @@ +// SPDX-FileCopyrightText: 2022 yuzu Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + +#pragma once + +#include <array> +#include <cstddef> +#include <vector> +#include "common/bit_field.h" +#include "common/common_funcs.h" +#include "common/common_types.h" +#include "video_core/engines/engine_interface.h" + +namespace Core { +class System; +} + +namespace Tegra { +class MemoryManager; +class DmaPusher; + +enum class EngineID { + FERMI_TWOD_A = 0x902D, // 2D Engine + MAXWELL_B = 0xB197, // 3D Engine + KEPLER_COMPUTE_B = 0xB1C0, + KEPLER_INLINE_TO_MEMORY_B = 0xA140, + MAXWELL_DMA_COPY_A = 0xB0B5, +}; + +namespace Control { +struct ChannelState; +} +} // namespace Tegra + +namespace VideoCore { +class RasterizerInterface; +} + +namespace Tegra::Engines { + +class Puller final { +public: + struct MethodCall { + u32 method{}; + u32 argument{}; + u32 subchannel{}; + u32 method_count{}; + + explicit MethodCall(u32 method_, u32 argument_, u32 subchannel_ = 0, u32 method_count_ = 0) + : method(method_), argument(argument_), subchannel(subchannel_), + method_count(method_count_) {} + + [[nodiscard]] bool IsLastCall() const { + return method_count <= 1; + } + }; + + enum class FenceOperation : u32 { + Acquire = 0, + Increment = 1, + }; + + union FenceAction { + u32 raw; + BitField<0, 1, FenceOperation> op; + BitField<8, 24, u32> syncpoint_id; + }; + + explicit Puller(GPU& gpu_, MemoryManager& memory_manager_, DmaPusher& dma_pusher, + Control::ChannelState& channel_state); + ~Puller(); + + void CallMethod(const MethodCall& method_call); + + void CallMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount, + u32 methods_pending); + + void BindRasterizer(VideoCore::RasterizerInterface* rasterizer); + + void CallPullerMethod(const MethodCall& method_call); + + void CallEngineMethod(const MethodCall& method_call); + + void CallEngineMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount, + u32 methods_pending); + +private: + Tegra::GPU& gpu; + + MemoryManager& memory_manager; + DmaPusher& dma_pusher; + Control::ChannelState& channel_state; + VideoCore::RasterizerInterface* rasterizer = nullptr; + + static constexpr std::size_t NUM_REGS = 0x800; + struct Regs { + static constexpr size_t NUM_REGS = 0x40; + + union { + struct { + INSERT_PADDING_WORDS_NOINIT(0x4); + struct { + u32 address_high; + u32 address_low; + + [[nodiscard]] GPUVAddr SemaphoreAddress() const { + return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | + address_low); + } + } semaphore_address; + + u32 semaphore_sequence; + u32 semaphore_trigger; + INSERT_PADDING_WORDS_NOINIT(0xC); + + // The pusher and the puller share the reference counter, the pusher only has read + // access + u32 reference_count; + INSERT_PADDING_WORDS_NOINIT(0x5); + + u32 semaphore_acquire; + u32 semaphore_release; + u32 fence_value; + FenceAction fence_action; + INSERT_PADDING_WORDS_NOINIT(0xE2); + + // Puller state + u32 acquire_mode; + u32 acquire_source; + u32 acquire_active; + u32 acquire_timeout; + u32 acquire_value; + }; + std::array<u32, NUM_REGS> reg_array; + }; + } regs{}; + + void ProcessBindMethod(const MethodCall& method_call); + void ProcessFenceActionMethod(); + void ProcessSemaphoreAcquire(); + void ProcessSemaphoreRelease(); + void ProcessSemaphoreTriggerMethod(); + [[nodiscard]] bool ExecuteMethodOnEngine(u32 method); + + /// Mapping of command subchannels to their bound engine ids + std::array<EngineID, 8> bound_engines{}; + + enum class GpuSemaphoreOperation { + AcquireEqual = 0x1, + WriteLong = 0x2, + AcquireGequal = 0x4, + AcquireMask = 0x8, + }; + +#define ASSERT_REG_POSITION(field_name, position) \ + static_assert(offsetof(Regs, field_name) == position * 4, \ + "Field " #field_name " has invalid position") + + ASSERT_REG_POSITION(semaphore_address, 0x4); + ASSERT_REG_POSITION(semaphore_sequence, 0x6); + ASSERT_REG_POSITION(semaphore_trigger, 0x7); + ASSERT_REG_POSITION(reference_count, 0x14); + ASSERT_REG_POSITION(semaphore_acquire, 0x1A); + ASSERT_REG_POSITION(semaphore_release, 0x1B); + ASSERT_REG_POSITION(fence_value, 0x1C); + ASSERT_REG_POSITION(fence_action, 0x1D); + + ASSERT_REG_POSITION(acquire_mode, 0x100); + ASSERT_REG_POSITION(acquire_source, 0x101); + ASSERT_REG_POSITION(acquire_active, 0x102); + ASSERT_REG_POSITION(acquire_timeout, 0x103); + ASSERT_REG_POSITION(acquire_value, 0x104); + +#undef ASSERT_REG_POSITION +}; + +} // namespace Tegra::Engines |