diff options
Diffstat (limited to 'src/video_core')
49 files changed, 1343 insertions, 591 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 1e31a2900..1e010e4da 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -3,6 +3,8 @@ add_library(video_core STATIC dma_pusher.h debug_utils/debug_utils.cpp debug_utils/debug_utils.h + engines/engine_upload.cpp + engines/engine_upload.h engines/fermi_2d.cpp engines/fermi_2d.h engines/kepler_compute.cpp @@ -36,6 +38,8 @@ add_library(video_core STATIC renderer_base.h renderer_opengl/gl_buffer_cache.cpp renderer_opengl/gl_buffer_cache.h + renderer_opengl/gl_device.cpp + renderer_opengl/gl_device.h renderer_opengl/gl_global_cache.cpp renderer_opengl/gl_global_cache.h renderer_opengl/gl_primitive_assembler.cpp diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp index 046d047cb..036e66f05 100644 --- a/src/video_core/dma_pusher.cpp +++ b/src/video_core/dma_pusher.cpp @@ -57,8 +57,8 @@ bool DmaPusher::Step() { // Push buffer non-empty, read a word command_headers.resize(command_list_header.size); - gpu.MemoryManager().ReadBlock(dma_get, command_headers.data(), - command_list_header.size * sizeof(u32)); + gpu.MemoryManager().ReadBlockUnsafe(dma_get, command_headers.data(), + command_list_header.size * sizeof(u32)); for (const CommandHeader& command_header : command_headers) { @@ -105,6 +105,8 @@ bool DmaPusher::Step() { dma_state.non_incrementing = false; dma_increment_once = true; break; + default: + break; } } } diff --git a/src/video_core/engines/engine_upload.cpp b/src/video_core/engines/engine_upload.cpp new file mode 100644 index 000000000..f8aa4ff55 --- /dev/null +++ b/src/video_core/engines/engine_upload.cpp @@ -0,0 +1,48 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include "common/assert.h" +#include "video_core/engines/engine_upload.h" +#include "video_core/memory_manager.h" +#include "video_core/textures/decoders.h" + +namespace Tegra::Engines::Upload { + +State::State(MemoryManager& memory_manager, Registers& regs) + : memory_manager(memory_manager), regs(regs) {} + +void State::ProcessExec(const bool is_linear) { + write_offset = 0; + copy_size = regs.line_length_in * regs.line_count; + inner_buffer.resize(copy_size); + this->is_linear = is_linear; +} + +void State::ProcessData(const u32 data, const bool is_last_call) { + const u32 sub_copy_size = std::min(4U, copy_size - write_offset); + std::memcpy(&inner_buffer[write_offset], &data, sub_copy_size); + write_offset += sub_copy_size; + if (!is_last_call) { + return; + } + const GPUVAddr address{regs.dest.Address()}; + if (is_linear) { + memory_manager.WriteBlock(address, inner_buffer.data(), copy_size); + } else { + UNIMPLEMENTED_IF(regs.dest.z != 0); + UNIMPLEMENTED_IF(regs.dest.depth != 1); + UNIMPLEMENTED_IF(regs.dest.BlockWidth() != 1); + UNIMPLEMENTED_IF(regs.dest.BlockDepth() != 1); + const std::size_t dst_size = Tegra::Texture::CalculateSize( + true, 1, regs.dest.width, regs.dest.height, 1, regs.dest.BlockHeight(), 1); + tmp_buffer.resize(dst_size); + memory_manager.ReadBlock(address, tmp_buffer.data(), dst_size); + Tegra::Texture::SwizzleKepler(regs.dest.width, regs.dest.height, regs.dest.x, regs.dest.y, + regs.dest.BlockHeight(), copy_size, inner_buffer.data(), + tmp_buffer.data()); + memory_manager.WriteBlock(address, tmp_buffer.data(), dst_size); + } +} + +} // namespace Tegra::Engines::Upload diff --git a/src/video_core/engines/engine_upload.h b/src/video_core/engines/engine_upload.h new file mode 100644 index 000000000..9c6e0d21c --- /dev/null +++ b/src/video_core/engines/engine_upload.h @@ -0,0 +1,75 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <cstddef> +#include <vector> +#include "common/bit_field.h" +#include "common/common_funcs.h" +#include "common/common_types.h" + +namespace Tegra { +class MemoryManager; +} + +namespace Tegra::Engines::Upload { + +struct Registers { + u32 line_length_in; + u32 line_count; + + struct { + u32 address_high; + u32 address_low; + u32 pitch; + union { + BitField<0, 4, u32> block_width; + BitField<4, 4, u32> block_height; + BitField<8, 4, u32> block_depth; + }; + u32 width; + u32 height; + u32 depth; + u32 z; + u32 x; + u32 y; + + GPUVAddr Address() const { + return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | address_low); + } + + u32 BlockWidth() const { + return 1U << block_width.Value(); + } + + u32 BlockHeight() const { + return 1U << block_height.Value(); + } + + u32 BlockDepth() const { + return 1U << block_depth.Value(); + } + } dest; +}; + +class State { +public: + State(MemoryManager& memory_manager, Registers& regs); + ~State() = default; + + void ProcessExec(const bool is_linear); + void ProcessData(const u32 data, const bool is_last_call); + +private: + u32 write_offset = 0; + u32 copy_size = 0; + std::vector<u8> inner_buffer; + std::vector<u8> tmp_buffer; + bool is_linear = false; + Registers& regs; + MemoryManager& memory_manager; +}; + +} // namespace Tegra::Engines::Upload diff --git a/src/video_core/engines/fermi_2d.h b/src/video_core/engines/fermi_2d.h index 2e51b7f13..45f59a4d9 100644 --- a/src/video_core/engines/fermi_2d.h +++ b/src/video_core/engines/fermi_2d.h @@ -21,6 +21,12 @@ class RasterizerInterface; namespace Tegra::Engines { +/** + * This Engine is known as G80_2D. Documentation can be found in: + * https://github.com/envytools/envytools/blob/master/rnndb/graph/g80_2d.xml + * https://cgit.freedesktop.org/mesa/mesa/tree/src/gallium/drivers/nouveau/nv50/nv50_2d.xml.h + */ + #define FERMI2D_REG_INDEX(field_name) \ (offsetof(Tegra::Engines::Fermi2D::Regs, field_name) / sizeof(u32)) diff --git a/src/video_core/engines/kepler_compute.cpp b/src/video_core/engines/kepler_compute.cpp index b1d950460..7404a8163 100644 --- a/src/video_core/engines/kepler_compute.cpp +++ b/src/video_core/engines/kepler_compute.cpp @@ -4,12 +4,21 @@ #include "common/assert.h" #include "common/logging/log.h" +#include "core/core.h" #include "video_core/engines/kepler_compute.h" +#include "video_core/engines/maxwell_3d.h" #include "video_core/memory_manager.h" +#include "video_core/rasterizer_interface.h" +#include "video_core/renderer_base.h" +#include "video_core/textures/decoders.h" namespace Tegra::Engines { -KeplerCompute::KeplerCompute(MemoryManager& memory_manager) : memory_manager{memory_manager} {} +KeplerCompute::KeplerCompute(Core::System& system, VideoCore::RasterizerInterface& rasterizer, + MemoryManager& memory_manager) + : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager}, upload_state{ + memory_manager, + regs.upload} {} KeplerCompute::~KeplerCompute() = default; @@ -20,14 +29,34 @@ void KeplerCompute::CallMethod(const GPU::MethodCall& method_call) { regs.reg_array[method_call.method] = method_call.argument; switch (method_call.method) { + case KEPLER_COMPUTE_REG_INDEX(exec_upload): { + upload_state.ProcessExec(regs.exec_upload.linear != 0); + break; + } + case KEPLER_COMPUTE_REG_INDEX(data_upload): { + const bool is_last_call = method_call.IsLastCall(); + upload_state.ProcessData(method_call.argument, is_last_call); + if (is_last_call) { + system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite(); + } + break; + } case KEPLER_COMPUTE_REG_INDEX(launch): - // Abort execution since compute shaders can be used to alter game memory (e.g. CUDA - // kernels) - UNREACHABLE_MSG("Compute shaders are not implemented"); + ProcessLaunch(); break; default: break; } } +void KeplerCompute::ProcessLaunch() { + + const GPUVAddr launch_desc_loc = regs.launch_desc_loc.Address(); + memory_manager.ReadBlockUnsafe(launch_desc_loc, &launch_description, + LaunchParams::NUM_LAUNCH_PARAMETERS * sizeof(u32)); + + const GPUVAddr code_loc = regs.code_loc.Address() + launch_description.program_start; + LOG_WARNING(HW_GPU, "Compute Kernel Execute at Address 0x{:016x}, STUBBED", code_loc); +} + } // namespace Tegra::Engines diff --git a/src/video_core/engines/kepler_compute.h b/src/video_core/engines/kepler_compute.h index fb6cdf432..5250b8d9b 100644 --- a/src/video_core/engines/kepler_compute.h +++ b/src/video_core/engines/kepler_compute.h @@ -6,22 +6,40 @@ #include <array> #include <cstddef> +#include <vector> +#include "common/bit_field.h" #include "common/common_funcs.h" #include "common/common_types.h" +#include "video_core/engines/engine_upload.h" #include "video_core/gpu.h" +namespace Core { +class System; +} + namespace Tegra { class MemoryManager; } +namespace VideoCore { +class RasterizerInterface; +} + namespace Tegra::Engines { +/** + * This Engine is known as GK104_Compute. Documentation can be found in: + * https://github.com/envytools/envytools/blob/master/rnndb/graph/gk104_compute.xml + * https://cgit.freedesktop.org/mesa/mesa/tree/src/gallium/drivers/nouveau/nvc0/nve4_compute.xml.h + */ + #define KEPLER_COMPUTE_REG_INDEX(field_name) \ (offsetof(Tegra::Engines::KeplerCompute::Regs, field_name) / sizeof(u32)) class KeplerCompute final { public: - explicit KeplerCompute(MemoryManager& memory_manager); + explicit KeplerCompute(Core::System& system, VideoCore::RasterizerInterface& rasterizer, + MemoryManager& memory_manager); ~KeplerCompute(); static constexpr std::size_t NumConstBuffers = 8; @@ -31,30 +49,181 @@ public: union { struct { - INSERT_PADDING_WORDS(0xAF); + INSERT_PADDING_WORDS(0x60); + + Upload::Registers upload; + + struct { + union { + BitField<0, 1, u32> linear; + }; + } exec_upload; + + u32 data_upload; + + INSERT_PADDING_WORDS(0x3F); + + struct { + u32 address; + GPUVAddr Address() const { + return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address) << 8)); + } + } launch_desc_loc; + + INSERT_PADDING_WORDS(0x1); u32 launch; - INSERT_PADDING_WORDS(0xC48); + INSERT_PADDING_WORDS(0x4A7); + + struct { + u32 address_high; + u32 address_low; + u32 limit; + GPUVAddr Address() const { + return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | + address_low); + } + } tsc; + + INSERT_PADDING_WORDS(0x3); + + struct { + u32 address_high; + u32 address_low; + u32 limit; + GPUVAddr Address() const { + return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | + address_low); + } + } tic; + + INSERT_PADDING_WORDS(0x22); + + struct { + u32 address_high; + u32 address_low; + GPUVAddr Address() const { + return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | + address_low); + } + } code_loc; + + INSERT_PADDING_WORDS(0x3FE); + + u32 texture_const_buffer_index; + + INSERT_PADDING_WORDS(0x374); }; std::array<u32, NUM_REGS> reg_array; }; } regs{}; + + struct LaunchParams { + static constexpr std::size_t NUM_LAUNCH_PARAMETERS = 0x40; + + INSERT_PADDING_WORDS(0x8); + + u32 program_start; + + INSERT_PADDING_WORDS(0x2); + + BitField<30, 1, u32> linked_tsc; + + BitField<0, 31, u32> grid_dim_x; + union { + BitField<0, 16, u32> grid_dim_y; + BitField<16, 16, u32> grid_dim_z; + }; + + INSERT_PADDING_WORDS(0x3); + + BitField<0, 16, u32> shared_alloc; + + BitField<0, 31, u32> block_dim_x; + union { + BitField<0, 16, u32> block_dim_y; + BitField<16, 16, u32> block_dim_z; + }; + + union { + BitField<0, 8, u32> const_buffer_enable_mask; + BitField<29, 2, u32> cache_layout; + } memory_config; + + INSERT_PADDING_WORDS(0x8); + + struct { + u32 address_low; + union { + BitField<0, 8, u32> address_high; + BitField<15, 17, u32> size; + }; + GPUVAddr Address() const { + return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high.Value()) << 32) | + address_low); + } + } const_buffer_config[8]; + + union { + BitField<0, 20, u32> local_pos_alloc; + BitField<27, 5, u32> barrier_alloc; + }; + + union { + BitField<0, 20, u32> local_neg_alloc; + BitField<24, 5, u32> gpr_alloc; + }; + + INSERT_PADDING_WORDS(0x11); + } launch_description; + + struct { + u32 write_offset = 0; + u32 copy_size = 0; + std::vector<u8> inner_buffer; + } state{}; + static_assert(sizeof(Regs) == Regs::NUM_REGS * sizeof(u32), "KeplerCompute Regs has wrong size"); + static_assert(sizeof(LaunchParams) == LaunchParams::NUM_LAUNCH_PARAMETERS * sizeof(u32), + "KeplerCompute LaunchParams has wrong size"); + /// Write the value to the register identified by method. void CallMethod(const GPU::MethodCall& method_call); private: + Core::System& system; + VideoCore::RasterizerInterface& rasterizer; MemoryManager& memory_manager; + Upload::State upload_state; + + void ProcessLaunch(); }; #define ASSERT_REG_POSITION(field_name, position) \ static_assert(offsetof(KeplerCompute::Regs, field_name) == position * 4, \ "Field " #field_name " has invalid position") +#define ASSERT_LAUNCH_PARAM_POSITION(field_name, position) \ + static_assert(offsetof(KeplerCompute::LaunchParams, field_name) == position * 4, \ + "Field " #field_name " has invalid position") + +ASSERT_REG_POSITION(upload, 0x60); +ASSERT_REG_POSITION(exec_upload, 0x6C); +ASSERT_REG_POSITION(data_upload, 0x6D); ASSERT_REG_POSITION(launch, 0xAF); +ASSERT_REG_POSITION(tsc, 0x557); +ASSERT_REG_POSITION(tic, 0x55D); +ASSERT_REG_POSITION(code_loc, 0x582); +ASSERT_REG_POSITION(texture_const_buffer_index, 0x982); +ASSERT_LAUNCH_PARAM_POSITION(program_start, 0x8); +ASSERT_LAUNCH_PARAM_POSITION(grid_dim_x, 0xC); +ASSERT_LAUNCH_PARAM_POSITION(shared_alloc, 0x11); +ASSERT_LAUNCH_PARAM_POSITION(block_dim_x, 0x12); +ASSERT_LAUNCH_PARAM_POSITION(memory_config, 0x14); +ASSERT_LAUNCH_PARAM_POSITION(const_buffer_config, 0x1D); #undef ASSERT_REG_POSITION diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp index cd51a31d7..0561f676c 100644 --- a/src/video_core/engines/kepler_memory.cpp +++ b/src/video_core/engines/kepler_memory.cpp @@ -10,12 +10,12 @@ #include "video_core/memory_manager.h" #include "video_core/rasterizer_interface.h" #include "video_core/renderer_base.h" +#include "video_core/textures/decoders.h" namespace Tegra::Engines { -KeplerMemory::KeplerMemory(Core::System& system, VideoCore::RasterizerInterface& rasterizer, - MemoryManager& memory_manager) - : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager} {} +KeplerMemory::KeplerMemory(Core::System& system, MemoryManager& memory_manager) + : system{system}, memory_manager{memory_manager}, upload_state{memory_manager, regs.upload} {} KeplerMemory::~KeplerMemory() = default; @@ -27,30 +27,18 @@ void KeplerMemory::CallMethod(const GPU::MethodCall& method_call) { switch (method_call.method) { case KEPLERMEMORY_REG_INDEX(exec): { - state.write_offset = 0; + upload_state.ProcessExec(regs.exec.linear != 0); break; } case KEPLERMEMORY_REG_INDEX(data): { - ProcessData(method_call.argument); + const bool is_last_call = method_call.IsLastCall(); + upload_state.ProcessData(method_call.argument, is_last_call); + if (is_last_call) { + system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite(); + } break; } } } -void KeplerMemory::ProcessData(u32 data) { - ASSERT_MSG(regs.exec.linear, "Non-linear uploads are not supported"); - ASSERT(regs.dest.x == 0 && regs.dest.y == 0 && regs.dest.z == 0); - - // We have to invalidate the destination region to evict any outdated surfaces from the cache. - // We do this before actually writing the new data because the destination address might - // contain a dirty surface that will have to be written back to memory. - const GPUVAddr address{regs.dest.Address() + state.write_offset * sizeof(u32)}; - rasterizer.InvalidateRegion(ToCacheAddr(memory_manager.GetPointer(address)), sizeof(u32)); - memory_manager.Write<u32>(address, data); - - system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite(); - - state.write_offset++; -} - } // namespace Tegra::Engines diff --git a/src/video_core/engines/kepler_memory.h b/src/video_core/engines/kepler_memory.h index 78b6c3e45..f3bc675a9 100644 --- a/src/video_core/engines/kepler_memory.h +++ b/src/video_core/engines/kepler_memory.h @@ -6,9 +6,11 @@ #include <array> #include <cstddef> +#include <vector> #include "common/bit_field.h" #include "common/common_funcs.h" #include "common/common_types.h" +#include "video_core/engines/engine_upload.h" #include "video_core/gpu.h" namespace Core { @@ -19,19 +21,20 @@ namespace Tegra { class MemoryManager; } -namespace VideoCore { -class RasterizerInterface; -} - namespace Tegra::Engines { +/** + * This Engine is known as P2MF. Documentation can be found in: + * https://github.com/envytools/envytools/blob/master/rnndb/graph/gk104_p2mf.xml + * https://cgit.freedesktop.org/mesa/mesa/tree/src/gallium/drivers/nouveau/nvc0/nve4_p2mf.xml.h + */ + #define KEPLERMEMORY_REG_INDEX(field_name) \ (offsetof(Tegra::Engines::KeplerMemory::Regs, field_name) / sizeof(u32)) class KeplerMemory final { public: - KeplerMemory(Core::System& system, VideoCore::RasterizerInterface& rasterizer, - MemoryManager& memory_manager); + KeplerMemory(Core::System& system, MemoryManager& memory_manager); ~KeplerMemory(); /// Write the value to the register identified by method. @@ -44,26 +47,7 @@ public: struct { INSERT_PADDING_WORDS(0x60); - u32 line_length_in; - u32 line_count; - - struct { - u32 address_high; - u32 address_low; - u32 pitch; - u32 block_dimensions; - u32 width; - u32 height; - u32 depth; - u32 z; - u32 x; - u32 y; - - GPUVAddr Address() const { - return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | - address_low); - } - } dest; + Upload::Registers upload; struct { union { @@ -79,25 +63,17 @@ public: }; } regs{}; - struct { - u32 write_offset = 0; - } state{}; - private: Core::System& system; - VideoCore::RasterizerInterface& rasterizer; MemoryManager& memory_manager; - - void ProcessData(u32 data); + Upload::State upload_state; }; #define ASSERT_REG_POSITION(field_name, position) \ static_assert(offsetof(KeplerMemory::Regs, field_name) == position * 4, \ "Field " #field_name " has invalid position") -ASSERT_REG_POSITION(line_length_in, 0x60); -ASSERT_REG_POSITION(line_count, 0x61); -ASSERT_REG_POSITION(dest, 0x62); +ASSERT_REG_POSITION(upload, 0x60); ASSERT_REG_POSITION(exec, 0x6C); ASSERT_REG_POSITION(data, 0x6D); #undef ASSERT_REG_POSITION diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index b198793bc..d7b586db9 100644 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -20,8 +20,8 @@ constexpr u32 MacroRegistersStart = 0xE00; Maxwell3D::Maxwell3D(Core::System& system, VideoCore::RasterizerInterface& rasterizer, MemoryManager& memory_manager) - : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager}, macro_interpreter{ - *this} { + : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager}, + macro_interpreter{*this}, upload_state{memory_manager, regs.upload} { InitializeRegisterDefaults(); } @@ -253,6 +253,18 @@ void Maxwell3D::CallMethod(const GPU::MethodCall& method_call) { ProcessSyncPoint(); break; } + case MAXWELL3D_REG_INDEX(exec_upload): { + upload_state.ProcessExec(regs.exec_upload.linear != 0); + break; + } + case MAXWELL3D_REG_INDEX(data_upload): { + const bool is_last_call = method_call.IsLastCall(); + upload_state.ProcessData(method_call.argument, is_last_call); + if (is_last_call) { + dirty_flags.OnMemoryWrite(); + } + break; + } default: break; } @@ -418,7 +430,7 @@ Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const { const GPUVAddr tic_address_gpu{regs.tic.TICAddress() + tic_index * sizeof(Texture::TICEntry)}; Texture::TICEntry tic_entry; - memory_manager.ReadBlock(tic_address_gpu, &tic_entry, sizeof(Texture::TICEntry)); + memory_manager.ReadBlockUnsafe(tic_address_gpu, &tic_entry, sizeof(Texture::TICEntry)); ASSERT_MSG(tic_entry.header_version == Texture::TICHeaderVersion::BlockLinear || tic_entry.header_version == Texture::TICHeaderVersion::Pitch, @@ -439,7 +451,7 @@ Texture::TSCEntry Maxwell3D::GetTSCEntry(u32 tsc_index) const { const GPUVAddr tsc_address_gpu{regs.tsc.TSCAddress() + tsc_index * sizeof(Texture::TSCEntry)}; Texture::TSCEntry tsc_entry; - memory_manager.ReadBlock(tsc_address_gpu, &tsc_entry, sizeof(Texture::TSCEntry)); + memory_manager.ReadBlockUnsafe(tsc_address_gpu, &tsc_entry, sizeof(Texture::TSCEntry)); return tsc_entry; } diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index cc2424d38..4883b582a 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -14,6 +14,7 @@ #include "common/common_funcs.h" #include "common/common_types.h" #include "common/math_util.h" +#include "video_core/engines/engine_upload.h" #include "video_core/gpu.h" #include "video_core/macro_interpreter.h" #include "video_core/textures/texture.h" @@ -32,6 +33,12 @@ class RasterizerInterface; namespace Tegra::Engines { +/** + * This Engine is known as GF100_3D. Documentation can be found in: + * https://github.com/envytools/envytools/blob/master/rnndb/graph/gf100_3d.xml + * https://cgit.freedesktop.org/mesa/mesa/tree/src/gallium/drivers/nouveau/nvc0/nvc0_3d.xml.h + */ + #define MAXWELL3D_REG_INDEX(field_name) \ (offsetof(Tegra::Engines::Maxwell3D::Regs, field_name) / sizeof(u32)) @@ -243,9 +250,10 @@ public: return "10_10_10_2"; case Size::Size_11_11_10: return "11_11_10"; + default: + UNREACHABLE(); + return {}; } - UNREACHABLE(); - return {}; } std::string TypeString() const { @@ -579,7 +587,18 @@ public: u32 bind; } macros; - INSERT_PADDING_WORDS(0x69); + INSERT_PADDING_WORDS(0x17); + + Upload::Registers upload; + struct { + union { + BitField<0, 1, u32> linear; + }; + } exec_upload; + + u32 data_upload; + + INSERT_PADDING_WORDS(0x44); struct { union { @@ -1175,6 +1194,8 @@ private: /// Interpreter for the macro codes uploaded to the GPU. MacroInterpreter macro_interpreter; + Upload::State upload_state; + /// Retrieves information about a specific TIC entry from the TIC buffer. Texture::TICEntry GetTICEntry(u32 tic_index) const; @@ -1218,6 +1239,9 @@ private: "Field " #field_name " has invalid position") ASSERT_REG_POSITION(macros, 0x45); +ASSERT_REG_POSITION(upload, 0x60); +ASSERT_REG_POSITION(exec_upload, 0x6C); +ASSERT_REG_POSITION(data_upload, 0x6D); ASSERT_REG_POSITION(sync_info, 0xB2); ASSERT_REG_POSITION(tfb_enabled, 0x1D1); ASSERT_REG_POSITION(rt, 0x200); diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index 2426d0067..3a5dfef0c 100644 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp @@ -83,57 +83,66 @@ void MaxwellDMA::HandleCopy() { ASSERT(regs.exec.enable_2d == 1); - const std::size_t copy_size = regs.x_count * regs.y_count; + if (regs.exec.is_dst_linear && !regs.exec.is_src_linear) { + ASSERT(regs.src_params.size_z == 1); + // If the input is tiled and the output is linear, deswizzle the input and copy it over. + const u32 src_bytes_per_pixel = regs.src_pitch / regs.src_params.size_x; + const std::size_t src_size = Texture::CalculateSize( + true, src_bytes_per_pixel, regs.src_params.size_x, regs.src_params.size_y, + regs.src_params.size_z, regs.src_params.BlockHeight(), regs.src_params.BlockDepth()); - auto source_ptr{memory_manager.GetPointer(source)}; - auto dst_ptr{memory_manager.GetPointer(dest)}; + const std::size_t dst_size = regs.dst_pitch * regs.y_count; - if (!source_ptr) { - LOG_ERROR(HW_GPU, "source_ptr is invalid"); - return; - } + if (read_buffer.size() < src_size) { + read_buffer.resize(src_size); + } - if (!dst_ptr) { - LOG_ERROR(HW_GPU, "dst_ptr is invalid"); - return; - } + if (write_buffer.size() < dst_size) { + write_buffer.resize(dst_size); + } - const auto FlushAndInvalidate = [&](u32 src_size, u64 dst_size) { - // TODO(Subv): For now, manually flush the regions until we implement GPU-accelerated - // copying. - rasterizer.FlushRegion(ToCacheAddr(source_ptr), src_size); + memory_manager.ReadBlock(source, read_buffer.data(), src_size); + memory_manager.ReadBlock(dest, write_buffer.data(), dst_size); - // We have to invalidate the destination region to evict any outdated surfaces from the - // cache. We do this before actually writing the new data because the destination address - // might contain a dirty surface that will have to be written back to memory. - rasterizer.InvalidateRegion(ToCacheAddr(dst_ptr), dst_size); - }; + Texture::UnswizzleSubrect(regs.x_count, regs.y_count, regs.dst_pitch, + regs.src_params.size_x, src_bytes_per_pixel, read_buffer.data(), + write_buffer.data(), regs.src_params.BlockHeight(), + regs.src_params.pos_x, regs.src_params.pos_y); - if (regs.exec.is_dst_linear && !regs.exec.is_src_linear) { - ASSERT(regs.src_params.size_z == 1); - // If the input is tiled and the output is linear, deswizzle the input and copy it over. + memory_manager.WriteBlock(dest, write_buffer.data(), dst_size); + } else { + ASSERT(regs.dst_params.BlockDepth() == 1); - const u32 src_bytes_per_pixel = regs.src_pitch / regs.src_params.size_x; + const u32 src_bytes_per_pixel = regs.src_pitch / regs.x_count; - FlushAndInvalidate(regs.src_pitch * regs.src_params.size_y, - copy_size * src_bytes_per_pixel); + const std::size_t dst_size = Texture::CalculateSize( + true, src_bytes_per_pixel, regs.dst_params.size_x, regs.dst_params.size_y, + regs.dst_params.size_z, regs.dst_params.BlockHeight(), regs.dst_params.BlockDepth()); - Texture::UnswizzleSubrect(regs.x_count, regs.y_count, regs.dst_pitch, - regs.src_params.size_x, src_bytes_per_pixel, source_ptr, dst_ptr, - regs.src_params.BlockHeight(), regs.src_params.pos_x, - regs.src_params.pos_y); - } else { - ASSERT(regs.dst_params.size_z == 1); - ASSERT(regs.src_pitch == regs.x_count); + const std::size_t dst_layer_size = Texture::CalculateSize( + true, src_bytes_per_pixel, regs.dst_params.size_x, regs.dst_params.size_y, 1, + regs.dst_params.BlockHeight(), regs.dst_params.BlockDepth()); - const u32 src_bpp = regs.src_pitch / regs.x_count; + const std::size_t src_size = regs.src_pitch * regs.y_count; - FlushAndInvalidate(regs.src_pitch * regs.y_count, - regs.dst_params.size_x * regs.dst_params.size_y * src_bpp); + if (read_buffer.size() < src_size) { + read_buffer.resize(src_size); + } + + if (write_buffer.size() < dst_size) { + write_buffer.resize(dst_size); + } + + memory_manager.ReadBlock(source, read_buffer.data(), src_size); + memory_manager.ReadBlock(dest, write_buffer.data(), dst_size); // If the input is linear and the output is tiled, swizzle the input and copy it over. Texture::SwizzleSubrect(regs.x_count, regs.y_count, regs.src_pitch, regs.dst_params.size_x, - src_bpp, dst_ptr, source_ptr, regs.dst_params.BlockHeight()); + src_bytes_per_pixel, + write_buffer.data() + dst_layer_size * regs.dst_params.pos_z, + read_buffer.data(), regs.dst_params.BlockHeight()); + + memory_manager.WriteBlock(dest, write_buffer.data(), dst_size); } } diff --git a/src/video_core/engines/maxwell_dma.h b/src/video_core/engines/maxwell_dma.h index c6b649842..e5942f671 100644 --- a/src/video_core/engines/maxwell_dma.h +++ b/src/video_core/engines/maxwell_dma.h @@ -6,6 +6,7 @@ #include <array> #include <cstddef> +#include <vector> #include "common/bit_field.h" #include "common/common_funcs.h" #include "common/common_types.h" @@ -25,6 +26,11 @@ class RasterizerInterface; namespace Tegra::Engines { +/** + * This Engine is known as GK104_Copy. Documentation can be found in: + * https://github.com/envytools/envytools/blob/master/rnndb/fifo/gk104_copy.xml + */ + class MaxwellDMA final { public: explicit MaxwellDMA(Core::System& system, VideoCore::RasterizerInterface& rasterizer, @@ -63,6 +69,16 @@ public: static_assert(sizeof(Parameters) == 24, "Parameters has wrong size"); + enum class ComponentMode : u32 { + Src0 = 0, + Src1 = 1, + Src2 = 2, + Src3 = 3, + Const0 = 4, + Const1 = 5, + Zero = 6, + }; + enum class CopyMode : u32 { None = 0, Unk1 = 1, @@ -128,7 +144,26 @@ public: u32 x_count; u32 y_count; - INSERT_PADDING_WORDS(0xBB); + INSERT_PADDING_WORDS(0xB8); + + u32 const0; + u32 const1; + union { + BitField<0, 4, ComponentMode> component0; + BitField<4, 4, ComponentMode> component1; + BitField<8, 4, ComponentMode> component2; + BitField<12, 4, ComponentMode> component3; + BitField<16, 2, u32> component_size; + BitField<20, 3, u32> src_num_components; + BitField<24, 3, u32> dst_num_components; + + u32 SrcBytePerPixel() const { + return src_num_components.Value() * component_size.Value(); + } + u32 DstBytePerPixel() const { + return dst_num_components.Value() * component_size.Value(); + } + } swizzle_config; Parameters dst_params; @@ -149,6 +184,9 @@ private: MemoryManager& memory_manager; + std::vector<u8> read_buffer; + std::vector<u8> write_buffer; + /// Performs the copy from the source buffer to the destination buffer as configured in the /// registers. void HandleCopy(); @@ -165,6 +203,9 @@ ASSERT_REG_POSITION(src_pitch, 0x104); ASSERT_REG_POSITION(dst_pitch, 0x105); ASSERT_REG_POSITION(x_count, 0x106); ASSERT_REG_POSITION(y_count, 0x107); +ASSERT_REG_POSITION(const0, 0x1C0); +ASSERT_REG_POSITION(const1, 0x1C1); +ASSERT_REG_POSITION(swizzle_config, 0x1C2); ASSERT_REG_POSITION(dst_params, 0x1C3); ASSERT_REG_POSITION(src_params, 0x1CA); diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h index fce9733b9..e5b4eadea 100644 --- a/src/video_core/engines/shader_bytecode.h +++ b/src/video_core/engines/shader_bytecode.h @@ -937,21 +937,34 @@ union Instruction { } iset; union { - BitField<8, 2, Register::Size> dest_size; - BitField<10, 2, Register::Size> src_size; - BitField<12, 1, u64> is_output_signed; - BitField<13, 1, u64> is_input_signed; - BitField<41, 2, u64> selector; + BitField<41, 2, u64> selector; // i2i and i2f only BitField<45, 1, u64> negate_a; BitField<49, 1, u64> abs_a; + BitField<10, 2, Register::Size> src_size; + BitField<13, 1, u64> is_input_signed; + BitField<8, 2, Register::Size> dst_size; + BitField<12, 1, u64> is_output_signed; + + union { + BitField<39, 2, u64> tab5cb8_2; + } i2f; union { BitField<39, 2, F2iRoundingOp> rounding; } f2i; union { - BitField<39, 4, F2fRoundingOp> rounding; + BitField<8, 2, Register::Size> src_size; + BitField<10, 2, Register::Size> dst_size; + BitField<39, 4, u64> rounding; + // H0, H1 extract for F16 missing + BitField<41, 1, u64> selector; // Guessed as some games set it, TODO: reverse this value + F2fRoundingOp GetRoundingMode() const { + constexpr u64 rounding_mask = 0x0B; + return static_cast<F2fRoundingOp>(rounding.Value() & rounding_mask); + } } f2f; + } conversion; union { @@ -1734,7 +1747,7 @@ private: INST("0011100-00101---", Id::SHR_IMM, Type::Shift, "SHR_IMM"), INST("0100110011100---", Id::I2I_C, Type::Conversion, "I2I_C"), INST("0101110011100---", Id::I2I_R, Type::Conversion, "I2I_R"), - INST("01110001-1000---", Id::I2I_IMM, Type::Conversion, "I2I_IMM"), + INST("0011101-11100---", Id::I2I_IMM, Type::Conversion, "I2I_IMM"), INST("0100110010111---", Id::I2F_C, Type::Conversion, "I2F_C"), INST("0101110010111---", Id::I2F_R, Type::Conversion, "I2F_R"), INST("0011100-10111---", Id::I2F_IMM, Type::Conversion, "I2F_IMM"), diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index 4461083ff..52706505b 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -35,9 +35,9 @@ GPU::GPU(Core::System& system, VideoCore::RendererBase& renderer) : renderer{ren dma_pusher = std::make_unique<Tegra::DmaPusher>(*this); maxwell_3d = std::make_unique<Engines::Maxwell3D>(system, rasterizer, *memory_manager); fermi_2d = std::make_unique<Engines::Fermi2D>(rasterizer, *memory_manager); - kepler_compute = std::make_unique<Engines::KeplerCompute>(*memory_manager); + kepler_compute = std::make_unique<Engines::KeplerCompute>(system, rasterizer, *memory_manager); maxwell_dma = std::make_unique<Engines::MaxwellDMA>(system, rasterizer, *memory_manager); - kepler_memory = std::make_unique<Engines::KeplerMemory>(system, rasterizer, *memory_manager); + kepler_memory = std::make_unique<Engines::KeplerMemory>(system, *memory_manager); } GPU::~GPU() = default; diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp index 0f4e820aa..6c98c6701 100644 --- a/src/video_core/memory_manager.cpp +++ b/src/video_core/memory_manager.cpp @@ -199,7 +199,15 @@ const u8* MemoryManager::GetPointer(GPUVAddr addr) const { return {}; } -void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, std::size_t size) const { +bool MemoryManager::IsBlockContinous(const GPUVAddr start, const std::size_t size) { + const GPUVAddr end = start + size; + const auto host_ptr_start = reinterpret_cast<std::uintptr_t>(GetPointer(start)); + const auto host_ptr_end = reinterpret_cast<std::uintptr_t>(GetPointer(end)); + const std::size_t range = static_cast<std::size_t>(host_ptr_end - host_ptr_start); + return range == size; +} + +void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, const std::size_t size) const { std::size_t remaining_size{size}; std::size_t page_index{src_addr >> page_bits}; std::size_t page_offset{src_addr & page_mask}; @@ -226,7 +234,30 @@ void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, std::size_t } } -void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, std::size_t size) { +void MemoryManager::ReadBlockUnsafe(GPUVAddr src_addr, void* dest_buffer, + const std::size_t size) const { + std::size_t remaining_size{size}; + std::size_t page_index{src_addr >> page_bits}; + std::size_t page_offset{src_addr & page_mask}; + + while (remaining_size > 0) { + const std::size_t copy_amount{ + std::min(static_cast<std::size_t>(page_size) - page_offset, remaining_size)}; + const u8* page_pointer = page_table.pointers[page_index]; + if (page_pointer) { + const u8* src_ptr{page_pointer + page_offset}; + std::memcpy(dest_buffer, src_ptr, copy_amount); + } else { + std::memset(dest_buffer, 0, copy_amount); + } + page_index++; + page_offset = 0; + dest_buffer = static_cast<u8*>(dest_buffer) + copy_amount; + remaining_size -= copy_amount; + } +} + +void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, const std::size_t size) { std::size_t remaining_size{size}; std::size_t page_index{dest_addr >> page_bits}; std::size_t page_offset{dest_addr & page_mask}; @@ -253,7 +284,28 @@ void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, std:: } } -void MemoryManager::CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t size) { +void MemoryManager::WriteBlockUnsafe(GPUVAddr dest_addr, const void* src_buffer, + const std::size_t size) { + std::size_t remaining_size{size}; + std::size_t page_index{dest_addr >> page_bits}; + std::size_t page_offset{dest_addr & page_mask}; + + while (remaining_size > 0) { + const std::size_t copy_amount{ + std::min(static_cast<std::size_t>(page_size) - page_offset, remaining_size)}; + u8* page_pointer = page_table.pointers[page_index]; + if (page_pointer) { + u8* dest_ptr{page_pointer + page_offset}; + std::memcpy(dest_ptr, src_buffer, copy_amount); + } + page_index++; + page_offset = 0; + src_buffer = static_cast<const u8*>(src_buffer) + copy_amount; + remaining_size -= copy_amount; + } +} + +void MemoryManager::CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, const std::size_t size) { std::size_t remaining_size{size}; std::size_t page_index{src_addr >> page_bits}; std::size_t page_offset{src_addr & page_mask}; @@ -281,6 +333,12 @@ void MemoryManager::CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t } } +void MemoryManager::CopyBlockUnsafe(GPUVAddr dest_addr, GPUVAddr src_addr, const std::size_t size) { + std::vector<u8> tmp_buffer(size); + ReadBlockUnsafe(src_addr, tmp_buffer.data(), size); + WriteBlockUnsafe(dest_addr, tmp_buffer.data(), size); +} + void MemoryManager::MapPages(GPUVAddr base, u64 size, u8* memory, Common::PageType type, VAddr backing_addr) { LOG_DEBUG(HW_GPU, "Mapping {} onto {:016X}-{:016X}", fmt::ptr(memory), base * page_size, diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h index 647cbf93a..e4f0c4bd6 100644 --- a/src/video_core/memory_manager.h +++ b/src/video_core/memory_manager.h @@ -65,9 +65,32 @@ public: u8* GetPointer(GPUVAddr addr); const u8* GetPointer(GPUVAddr addr) const; - void ReadBlock(GPUVAddr src_addr, void* dest_buffer, std::size_t size) const; - void WriteBlock(GPUVAddr dest_addr, const void* src_buffer, std::size_t size); - void CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t size); + // Returns true if the block is continous in host memory, false otherwise + bool IsBlockContinous(const GPUVAddr start, const std::size_t size); + + /** + * ReadBlock and WriteBlock are full read and write operations over virtual + * GPU Memory. It's important to use these when GPU memory may not be continous + * in the Host Memory counterpart. Note: This functions cause Host GPU Memory + * Flushes and Invalidations, respectively to each operation. + */ + void ReadBlock(GPUVAddr src_addr, void* dest_buffer, const std::size_t size) const; + void WriteBlock(GPUVAddr dest_addr, const void* src_buffer, const std::size_t size); + void CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, const std::size_t size); + + /** + * ReadBlockUnsafe and WriteBlockUnsafe are special versions of ReadBlock and + * WriteBlock respectively. In this versions, no flushing or invalidation is actually + * done and their performance is similar to a memcpy. This functions can be used + * on either of this 2 scenarios instead of their safe counterpart: + * - Memory which is sure to never be represented in the Host GPU. + * - Memory Managed by a Cache Manager. Example: Texture Flushing should use + * WriteBlockUnsafe instead of WriteBlock since it shouldn't invalidate the texture + * being flushed. + */ + void ReadBlockUnsafe(GPUVAddr src_addr, void* dest_buffer, const std::size_t size) const; + void WriteBlockUnsafe(GPUVAddr dest_addr, const void* src_buffer, const std::size_t size); + void CopyBlockUnsafe(GPUVAddr dest_addr, GPUVAddr src_addr, const std::size_t size); private: using VMAMap = std::map<GPUVAddr, VirtualMemoryArea>; diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp new file mode 100644 index 000000000..b6d9e0ddb --- /dev/null +++ b/src/video_core/renderer_opengl/gl_device.cpp @@ -0,0 +1,45 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include <cstddef> +#include <glad/glad.h> + +#include "common/logging/log.h" +#include "video_core/renderer_opengl/gl_device.h" + +namespace OpenGL { + +namespace { +template <typename T> +T GetInteger(GLenum pname) { + GLint temporary; + glGetIntegerv(pname, &temporary); + return static_cast<T>(temporary); +} +} // Anonymous namespace + +Device::Device() { + uniform_buffer_alignment = GetInteger<std::size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT); + has_variable_aoffi = TestVariableAoffi(); +} + +bool Device::TestVariableAoffi() { + const GLchar* AOFFI_TEST = R"(#version 430 core +uniform sampler2D tex; +uniform ivec2 variable_offset; +void main() { + gl_Position = textureOffset(tex, vec2(0), variable_offset); +} +)"; + const GLuint shader{glCreateShaderProgramv(GL_VERTEX_SHADER, 1, &AOFFI_TEST)}; + GLint link_status{}; + glGetProgramiv(shader, GL_LINK_STATUS, &link_status); + glDeleteProgram(shader); + + const bool supported{link_status == GL_TRUE}; + LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", supported); + return supported; +} + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h new file mode 100644 index 000000000..78ff5ee58 --- /dev/null +++ b/src/video_core/renderer_opengl/gl_device.h @@ -0,0 +1,30 @@ +// Copyright 2019 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <cstddef> + +namespace OpenGL { + +class Device { +public: + Device(); + + std::size_t GetUniformBufferAlignment() const { + return uniform_buffer_alignment; + } + + bool HasVariableAoffi() const { + return has_variable_aoffi; + } + +private: + static bool TestVariableAoffi(); + + std::size_t uniform_buffer_alignment{}; + bool has_variable_aoffi{}; +}; + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 6034dc489..3cc945235 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -99,7 +99,7 @@ struct FramebufferCacheKey { }; RasterizerOpenGL::RasterizerOpenGL(Core::System& system, ScreenInfo& info) - : res_cache{*this}, shader_cache{*this, system}, global_cache{*this}, system{system}, + : res_cache{*this}, shader_cache{*this, system, device}, global_cache{*this}, system{system}, screen_info{info}, buffer_cache(*this, STREAM_BUFFER_SIZE) { OpenGLState::ApplyDefaultState(); @@ -107,8 +107,6 @@ RasterizerOpenGL::RasterizerOpenGL(Core::System& system, ScreenInfo& info) state.draw.shader_program = 0; state.Apply(); - glGetIntegerv(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT, &uniform_buffer_alignment); - LOG_DEBUG(Render_OpenGL, "Sync fixed function OpenGL state here"); CheckExtensions(); } @@ -307,6 +305,8 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { case Maxwell::ShaderProgram::Geometry: shader_program_manager->UseTrivialGeometryShader(); break; + default: + break; } continue; } @@ -315,8 +315,8 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) { GLShader::MaxwellUniformData ubo{}; ubo.SetFromRegs(gpu, stage); - const GLintptr offset = buffer_cache.UploadHostMemory( - &ubo, sizeof(ubo), static_cast<std::size_t>(uniform_buffer_alignment)); + const GLintptr offset = + buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment()); // Bind the emulation info buffer bind_ubo_pushbuffer.Push(buffer_cache.GetHandle(), offset, @@ -700,23 +700,24 @@ void RasterizerOpenGL::DrawArrays() { // Add space for index buffer (keeping in mind non-core primitives) switch (regs.draw.topology) { case Maxwell::PrimitiveTopology::Quads: - buffer_size = Common::AlignUp<std::size_t>(buffer_size, 4) + + buffer_size = Common::AlignUp(buffer_size, 4) + primitive_assembler.CalculateQuadSize(regs.vertex_buffer.count); break; default: if (is_indexed) { - buffer_size = Common::AlignUp<std::size_t>(buffer_size, 4) + CalculateIndexBufferSize(); + buffer_size = Common::AlignUp(buffer_size, 4) + CalculateIndexBufferSize(); } break; } // Uniform space for the 5 shader stages - buffer_size = - Common::AlignUp<std::size_t>(buffer_size, 4) + - (sizeof(GLShader::MaxwellUniformData) + uniform_buffer_alignment) * Maxwell::MaxShaderStage; + buffer_size = Common::AlignUp<std::size_t>(buffer_size, 4) + + (sizeof(GLShader::MaxwellUniformData) + device.GetUniformBufferAlignment()) * + Maxwell::MaxShaderStage; // Add space for at least 18 constant buffers - buffer_size += Maxwell::MaxConstBuffers * (MaxConstbufferSize + uniform_buffer_alignment); + buffer_size += + Maxwell::MaxConstBuffers * (MaxConstbufferSize + device.GetUniformBufferAlignment()); const bool invalidate = buffer_cache.Map(buffer_size); if (invalidate) { @@ -848,8 +849,8 @@ void RasterizerOpenGL::SetupConstBuffers(Tegra::Engines::Maxwell3D::Regs::Shader size = Common::AlignUp(size, sizeof(GLvec4)); ASSERT_MSG(size <= MaxConstbufferSize, "Constbuffer too big"); - const GLintptr const_buffer_offset = buffer_cache.UploadMemory( - buffer.address, size, static_cast<std::size_t>(uniform_buffer_alignment)); + const GLintptr const_buffer_offset = + buffer_cache.UploadMemory(buffer.address, size, device.GetUniformBufferAlignment()); bind_ubo_pushbuffer.Push(buffer_cache.GetHandle(), const_buffer_offset, size); } @@ -921,8 +922,8 @@ void RasterizerOpenGL::SyncViewport(OpenGLState& current_state) { viewport.y = viewport_rect.bottom; viewport.width = viewport_rect.GetWidth(); viewport.height = viewport_rect.GetHeight(); - viewport.depth_range_far = regs.viewports[i].depth_range_far; - viewport.depth_range_near = regs.viewports[i].depth_range_near; + viewport.depth_range_far = src.depth_range_far; + viewport.depth_range_near = src.depth_range_near; } state.depth_clamp.far_plane = regs.view_volume_clip_control.depth_clamp_far != 0; state.depth_clamp.near_plane = regs.view_volume_clip_control.depth_clamp_near != 0; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index a0e056142..71b9c5ead 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -21,6 +21,7 @@ #include "video_core/rasterizer_cache.h" #include "video_core/rasterizer_interface.h" #include "video_core/renderer_opengl/gl_buffer_cache.h" +#include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_global_cache.h" #include "video_core/renderer_opengl/gl_primitive_assembler.h" #include "video_core/renderer_opengl/gl_rasterizer_cache.h" @@ -172,6 +173,7 @@ private: /// but are needed for correct emulation void CheckExtensions(); + const Device device; OpenGLState state; RasterizerCacheOpenGL res_cache; @@ -180,7 +182,6 @@ private: SamplerCacheOpenGL sampler_cache; Core::System& system; - ScreenInfo& screen_info; std::unique_ptr<GLShader::ProgramManager> shader_program_manager; @@ -196,7 +197,6 @@ private: static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024; OGLBufferCache buffer_cache; PrimitiveAssembler primitive_assembler{buffer_cache}; - GLint uniform_buffer_alignment; BindBuffersRangePushBuffer bind_ubo_pushbuffer{GL_UNIFORM_BUFFER}; BindBuffersRangePushBuffer bind_ssbo_pushbuffer{GL_SHADER_STORAGE_BUFFER}; diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp index 67f99a568..7d8fb670e 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp @@ -642,13 +642,16 @@ void CachedSurface::LoadGLBuffer(RasterizerTemporaryMemory& res_cache_tmp_mem) { SwizzleFunc(MortonSwizzleMode::MortonToLinear, params, gl_buffer[i], i); } else { const u32 bpp = params.GetFormatBpp() / 8; - const u32 copy_size = params.width * bpp; + const u32 copy_size = (params.width * bpp + GetDefaultBlockWidth(params.pixel_format) - 1) / + GetDefaultBlockWidth(params.pixel_format); if (params.pitch == copy_size) { std::memcpy(gl_buffer[0].data(), params.host_ptr, params.size_in_bytes_gl); } else { + const u32 height = (params.height + GetDefaultBlockHeight(params.pixel_format) - 1) / + GetDefaultBlockHeight(params.pixel_format); const u8* start{params.host_ptr}; u8* write_to = gl_buffer[0].data(); - for (u32 h = params.height; h > 0; h--) { + for (u32 h = height; h > 0; h--) { std::memcpy(write_to, start, copy_size); start += params.pitch; write_to += copy_size; diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 99f67494c..b1c8f7c35 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp @@ -38,13 +38,15 @@ GPUVAddr GetShaderAddress(Maxwell::ShaderProgram program) { } /// Gets the shader program code from memory for the specified address -ProgramCode GetShaderCode(const u8* host_ptr) { +ProgramCode GetShaderCode(Tegra::MemoryManager& memory_manager, const GPUVAddr gpu_addr, + const u8* host_ptr) { ProgramCode program_code(VideoCommon::Shader::MAX_PROGRAM_LENGTH); ASSERT_OR_EXECUTE(host_ptr != nullptr, { std::fill(program_code.begin(), program_code.end(), 0); return program_code; }); - std::memcpy(program_code.data(), host_ptr, program_code.size() * sizeof(u64)); + memory_manager.ReadBlockUnsafe(gpu_addr, program_code.data(), + program_code.size() * sizeof(u64)); return program_code; } @@ -134,8 +136,8 @@ u64 GetUniqueIdentifier(Maxwell::ShaderProgram program_type, const ProgramCode& } /// Creates an unspecialized program from code streams -GLShader::ProgramResult CreateProgram(Maxwell::ShaderProgram program_type, ProgramCode program_code, - ProgramCode program_code_b) { +GLShader::ProgramResult CreateProgram(const Device& device, Maxwell::ShaderProgram program_type, + ProgramCode program_code, ProgramCode program_code_b) { GLShader::ShaderSetup setup(program_code); if (program_type == Maxwell::ShaderProgram::VertexA) { // VertexB is always enabled, so when VertexA is enabled, we have two vertex shaders. @@ -149,11 +151,11 @@ GLShader::ProgramResult CreateProgram(Maxwell::ShaderProgram program_type, Progr switch (program_type) { case Maxwell::ShaderProgram::VertexA: case Maxwell::ShaderProgram::VertexB: - return GLShader::GenerateVertexShader(setup); + return GLShader::GenerateVertexShader(device, setup); case Maxwell::ShaderProgram::Geometry: - return GLShader::GenerateGeometryShader(setup); + return GLShader::GenerateGeometryShader(device, setup); case Maxwell::ShaderProgram::Fragment: - return GLShader::GenerateFragmentShader(setup); + return GLShader::GenerateFragmentShader(device, setup); default: LOG_CRITICAL(HW_GPU, "Unimplemented program_type={}", static_cast<u32>(program_type)); UNREACHABLE(); @@ -212,22 +214,20 @@ std::set<GLenum> GetSupportedFormats() { return supported_formats; } -} // namespace +} // Anonymous namespace -CachedShader::CachedShader(VAddr cpu_addr, u64 unique_identifier, +CachedShader::CachedShader(const Device& device, VAddr cpu_addr, u64 unique_identifier, Maxwell::ShaderProgram program_type, ShaderDiskCacheOpenGL& disk_cache, const PrecompiledPrograms& precompiled_programs, ProgramCode&& program_code, ProgramCode&& program_code_b, u8* host_ptr) : RasterizerCacheObject{host_ptr}, host_ptr{host_ptr}, cpu_addr{cpu_addr}, unique_identifier{unique_identifier}, program_type{program_type}, disk_cache{disk_cache}, precompiled_programs{precompiled_programs} { - - const std::size_t code_size = CalculateProgramSize(program_code); - const std::size_t code_size_b = - program_code_b.empty() ? 0 : CalculateProgramSize(program_code_b); - - GLShader::ProgramResult program_result = - CreateProgram(program_type, program_code, program_code_b); + const std::size_t code_size{CalculateProgramSize(program_code)}; + const std::size_t code_size_b{program_code_b.empty() ? 0 + : CalculateProgramSize(program_code_b)}; + GLShader::ProgramResult program_result{ + CreateProgram(device, program_type, program_code, program_code_b)}; if (program_result.first.empty()) { // TODO(Rodrigo): Unimplemented shader stages hit here, avoid using these for now return; @@ -251,7 +251,6 @@ CachedShader::CachedShader(VAddr cpu_addr, u64 unique_identifier, : RasterizerCacheObject{host_ptr}, cpu_addr{cpu_addr}, unique_identifier{unique_identifier}, program_type{program_type}, disk_cache{disk_cache}, precompiled_programs{ precompiled_programs} { - code = std::move(result.first); entries = result.second; shader_length = entries.shader_length; @@ -344,8 +343,9 @@ ShaderDiskCacheUsage CachedShader::GetUsage(GLenum primitive_mode, return {unique_identifier, base_bindings, primitive_mode}; } -ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system) - : RasterizerCache{rasterizer}, disk_cache{system} {} +ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system, + const Device& device) + : RasterizerCache{rasterizer}, disk_cache{system}, device{device} {} void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, const VideoCore::DiskResourceLoadCallback& callback) { @@ -363,6 +363,10 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, if (stop_loading) return; + // Track if precompiled cache was altered during loading to know if we have to serialize the + // virtual precompiled cache file back to the hard drive + bool precompiled_cache_altered = false; + // Build shaders if (callback) callback(VideoCore::LoadCallbackStage::Build, 0, usages.size()); @@ -384,6 +388,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, if (!shader) { // Invalidate the precompiled cache if a shader dumped shader was rejected disk_cache.InvalidatePrecompiled(); + precompiled_cache_altered = true; dumps.clear(); } } @@ -405,8 +410,13 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, if (dumps.find(usage) == dumps.end()) { const auto& program = precompiled_programs.at(usage); disk_cache.SaveDump(usage, program->handle); + precompiled_cache_altered = true; } } + + if (precompiled_cache_altered) { + disk_cache.SaveVirtualPrecompiledFile(); + } } CachedProgram ShaderCacheOpenGL::GeneratePrecompiledProgram( @@ -439,17 +449,18 @@ std::unordered_map<u64, UnspecializedShader> ShaderCacheOpenGL::GenerateUnspecia const std::unordered_map<u64, ShaderDiskCacheDecompiled>& decompiled) { std::unordered_map<u64, UnspecializedShader> unspecialized; - if (callback) + if (callback) { callback(VideoCore::LoadCallbackStage::Decompile, 0, raws.size()); + } for (std::size_t i = 0; i < raws.size(); ++i) { - if (stop_loading) + if (stop_loading) { return {}; - + } const auto& raw{raws[i]}; - const u64 unique_identifier = raw.GetUniqueIdentifier(); - const u64 calculated_hash = - GetUniqueIdentifier(raw.GetProgramType(), raw.GetProgramCode(), raw.GetProgramCodeB()); + const u64 unique_identifier{raw.GetUniqueIdentifier()}; + const u64 calculated_hash{ + GetUniqueIdentifier(raw.GetProgramType(), raw.GetProgramCode(), raw.GetProgramCodeB())}; if (unique_identifier != calculated_hash) { LOG_ERROR( Render_OpenGL, @@ -466,8 +477,8 @@ std::unordered_map<u64, UnspecializedShader> ShaderCacheOpenGL::GenerateUnspecia result = {stored_decompiled.code, stored_decompiled.entries}; } else { // Otherwise decompile the shader at boot and save the result to the decompiled file - result = - CreateProgram(raw.GetProgramType(), raw.GetProgramCode(), raw.GetProgramCodeB()); + result = CreateProgram(device, raw.GetProgramType(), raw.GetProgramCode(), + raw.GetProgramCodeB()); disk_cache.SaveDecompiled(unique_identifier, result.first, result.second); } @@ -477,8 +488,9 @@ std::unordered_map<u64, UnspecializedShader> ShaderCacheOpenGL::GenerateUnspecia {raw.GetUniqueIdentifier(), {std::move(result.first), std::move(result.second), raw.GetProgramType()}}); - if (callback) + if (callback) { callback(VideoCore::LoadCallbackStage::Decompile, i, raws.size()); + } } return unspecialized; } @@ -497,11 +509,12 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) { if (!shader) { // No shader found - create a new one - ProgramCode program_code{GetShaderCode(host_ptr)}; + ProgramCode program_code{GetShaderCode(memory_manager, program_addr, host_ptr)}; ProgramCode program_code_b; if (program == Maxwell::ShaderProgram::VertexA) { - program_code_b = GetShaderCode( - memory_manager.GetPointer(GetShaderAddress(Maxwell::ShaderProgram::VertexB))); + const GPUVAddr program_addr_b{GetShaderAddress(Maxwell::ShaderProgram::VertexB)}; + program_code_b = GetShaderCode(memory_manager, program_addr_b, + memory_manager.GetPointer(program_addr_b)); } const u64 unique_identifier = GetUniqueIdentifier(program, program_code, program_code_b); const VAddr cpu_addr{*memory_manager.GpuToCpuAddress(program_addr)}; @@ -512,7 +525,7 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) { precompiled_programs, found->second, host_ptr); } else { shader = std::make_shared<CachedShader>( - cpu_addr, unique_identifier, program, disk_cache, precompiled_programs, + device, cpu_addr, unique_identifier, program, disk_cache, precompiled_programs, std::move(program_code), std::move(program_code_b), host_ptr); } Register(shader); diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h index eae771c08..31b979987 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_cache.h @@ -27,6 +27,7 @@ class System; namespace OpenGL { class CachedShader; +class Device; class RasterizerOpenGL; struct UnspecializedShader; @@ -38,7 +39,7 @@ using PrecompiledShaders = std::unordered_map<u64, GLShader::ProgramResult>; class CachedShader final : public RasterizerCacheObject { public: - explicit CachedShader(VAddr cpu_addr, u64 unique_identifier, + explicit CachedShader(const Device& device, VAddr cpu_addr, u64 unique_identifier, Maxwell::ShaderProgram program_type, ShaderDiskCacheOpenGL& disk_cache, const PrecompiledPrograms& precompiled_programs, ProgramCode&& program_code, ProgramCode&& program_code_b, u8* host_ptr); @@ -109,7 +110,8 @@ private: class ShaderCacheOpenGL final : public RasterizerCache<Shader> { public: - explicit ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system); + explicit ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system, + const Device& device); /// Loads disk cache for the current game void LoadDiskCache(const std::atomic_bool& stop_loading, @@ -131,6 +133,8 @@ private: CachedProgram GeneratePrecompiledProgram(const ShaderDiskCacheDump& dump, const std::set<GLenum>& supported_formats); + const Device& device; + std::array<Shader, Maxwell::MaxShaderProgram> last_shaders; ShaderDiskCacheOpenGL disk_cache; diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index 445048daf..ef1a1995f 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -15,6 +15,7 @@ #include "common/assert.h" #include "common/common_types.h" #include "video_core/engines/maxwell_3d.h" +#include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_rasterizer.h" #include "video_core/renderer_opengl/gl_shader_decompiler.h" #include "video_core/shader/shader_ir.h" @@ -119,14 +120,10 @@ std::string GetTopologyName(Tegra::Shader::OutputTopology topology) { /// Returns true if an object has to be treated as precise bool IsPrecise(Operation operand) { - const auto& meta = operand.GetMeta(); - + const auto& meta{operand.GetMeta()}; if (const auto arithmetic = std::get_if<MetaArithmetic>(&meta)) { return arithmetic->precise; } - if (const auto half_arithmetic = std::get_if<MetaHalfArithmetic>(&meta)) { - return half_arithmetic->precise; - } return false; } @@ -139,8 +136,9 @@ bool IsPrecise(Node node) { class GLSLDecompiler final { public: - explicit GLSLDecompiler(const ShaderIR& ir, ShaderStage stage, std::string suffix) - : ir{ir}, stage{stage}, suffix{suffix}, header{ir.GetHeader()} {} + explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, ShaderStage stage, + std::string suffix) + : device{device}, ir{ir}, stage{stage}, suffix{suffix}, header{ir.GetHeader()} {} void Decompile() { DeclareVertex(); @@ -627,28 +625,7 @@ private: } std::string VisitOperand(Operation operation, std::size_t operand_index, Type type) { - std::string value = VisitOperand(operation, operand_index); - switch (type) { - case Type::HalfFloat: { - const auto half_meta = std::get_if<MetaHalfArithmetic>(&operation.GetMeta()); - if (!half_meta) { - value = "toHalf2(" + value + ')'; - } - - switch (half_meta->types.at(operand_index)) { - case Tegra::Shader::HalfType::H0_H1: - return "toHalf2(" + value + ')'; - case Tegra::Shader::HalfType::F32: - return "vec2(" + value + ')'; - case Tegra::Shader::HalfType::H0_H0: - return "vec2(toHalf2(" + value + ")[0])"; - case Tegra::Shader::HalfType::H1_H1: - return "vec2(toHalf2(" + value + ")[1])"; - } - } - default: - return CastOperand(value, type); - } + return CastOperand(VisitOperand(operation, operand_index), type); } std::string CastOperand(const std::string& value, Type type) const { @@ -662,9 +639,7 @@ private: case Type::Uint: return "ftou(" + value + ')'; case Type::HalfFloat: - // Can't be handled as a stand-alone value - UNREACHABLE(); - return value; + return "toHalf2(" + value + ')'; } UNREACHABLE(); return value; @@ -829,8 +804,12 @@ private: // Inline the string as an immediate integer in GLSL (AOFFI arguments are required // to be constant by the standard). expr += std::to_string(static_cast<s32>(immediate->GetValue())); - } else { + } else if (device.HasVariableAoffi()) { + // Avoid using variable AOFFI on unsupported devices. expr += "ftoi(" + Visit(operand) + ')'; + } else { + // Insert 0 on devices not supporting variable AOFFI. + expr += '0'; } if (index + 1 < aoffi.size()) { expr += ", "; @@ -1083,13 +1062,40 @@ private: return BitwiseCastResult(value, Type::HalfFloat); } + std::string HClamp(Operation operation) { + const std::string value = VisitOperand(operation, 0, Type::HalfFloat); + const std::string min = VisitOperand(operation, 1, Type::Float); + const std::string max = VisitOperand(operation, 2, Type::Float); + const std::string clamped = "clamp(" + value + ", vec2(" + min + "), vec2(" + max + "))"; + return ApplyPrecise(operation, BitwiseCastResult(clamped, Type::HalfFloat)); + } + + std::string HUnpack(Operation operation) { + const std::string operand{VisitOperand(operation, 0, Type::HalfFloat)}; + const auto value = [&]() -> std::string { + switch (std::get<Tegra::Shader::HalfType>(operation.GetMeta())) { + case Tegra::Shader::HalfType::H0_H1: + return operand; + case Tegra::Shader::HalfType::F32: + return "vec2(fromHalf2(" + operand + "))"; + case Tegra::Shader::HalfType::H0_H0: + return "vec2(" + operand + "[0])"; + case Tegra::Shader::HalfType::H1_H1: + return "vec2(" + operand + "[1])"; + } + UNREACHABLE(); + return "0"; + }(); + return "fromHalf2(" + value + ')'; + } + std::string HMergeF32(Operation operation) { return "float(toHalf2(" + Visit(operation[0]) + ")[0])"; } std::string HMergeH0(Operation operation) { - return "fromHalf2(vec2(toHalf2(" + Visit(operation[0]) + ")[1], toHalf2(" + - Visit(operation[1]) + ")[0]))"; + return "fromHalf2(vec2(toHalf2(" + Visit(operation[1]) + ")[0], toHalf2(" + + Visit(operation[0]) + ")[1]))"; } std::string HMergeH1(Operation operation) { @@ -1189,34 +1195,46 @@ private: return GenerateUnary(operation, "any", Type::Bool, Type::Bool2); } + template <bool with_nan> + std::string GenerateHalfComparison(Operation operation, std::string compare_op) { + std::string comparison{GenerateBinaryCall(operation, compare_op, Type::Bool2, + Type::HalfFloat, Type::HalfFloat)}; + if constexpr (!with_nan) { + return comparison; + } + return "halfFloatNanComparison(" + comparison + ", " + + VisitOperand(operation, 0, Type::HalfFloat) + ", " + + VisitOperand(operation, 1, Type::HalfFloat) + ')'; + } + + template <bool with_nan> std::string Logical2HLessThan(Operation operation) { - return GenerateBinaryCall(operation, "lessThan", Type::Bool2, Type::HalfFloat, - Type::HalfFloat); + return GenerateHalfComparison<with_nan>(operation, "lessThan"); } + template <bool with_nan> std::string Logical2HEqual(Operation operation) { - return GenerateBinaryCall(operation, "equal", Type::Bool2, Type::HalfFloat, - Type::HalfFloat); + return GenerateHalfComparison<with_nan>(operation, "equal"); } + template <bool with_nan> std::string Logical2HLessEqual(Operation operation) { - return GenerateBinaryCall(operation, "lessThanEqual", Type::Bool2, Type::HalfFloat, - Type::HalfFloat); + return GenerateHalfComparison<with_nan>(operation, "lessThanEqual"); } + template <bool with_nan> std::string Logical2HGreaterThan(Operation operation) { - return GenerateBinaryCall(operation, "greaterThan", Type::Bool2, Type::HalfFloat, - Type::HalfFloat); + return GenerateHalfComparison<with_nan>(operation, "greaterThan"); } + template <bool with_nan> std::string Logical2HNotEqual(Operation operation) { - return GenerateBinaryCall(operation, "notEqual", Type::Bool2, Type::HalfFloat, - Type::HalfFloat); + return GenerateHalfComparison<with_nan>(operation, "notEqual"); } + template <bool with_nan> std::string Logical2HGreaterEqual(Operation operation) { - return GenerateBinaryCall(operation, "greaterThanEqual", Type::Bool2, Type::HalfFloat, - Type::HalfFloat); + return GenerateHalfComparison<with_nan>(operation, "greaterThanEqual"); } std::string Texture(Operation operation) { @@ -1505,6 +1523,8 @@ private: &GLSLDecompiler::Fma<Type::HalfFloat>, &GLSLDecompiler::Absolute<Type::HalfFloat>, &GLSLDecompiler::HNegate, + &GLSLDecompiler::HClamp, + &GLSLDecompiler::HUnpack, &GLSLDecompiler::HMergeF32, &GLSLDecompiler::HMergeH0, &GLSLDecompiler::HMergeH1, @@ -1541,12 +1561,18 @@ private: &GLSLDecompiler::LogicalNotEqual<Type::Uint>, &GLSLDecompiler::LogicalGreaterEqual<Type::Uint>, - &GLSLDecompiler::Logical2HLessThan, - &GLSLDecompiler::Logical2HEqual, - &GLSLDecompiler::Logical2HLessEqual, - &GLSLDecompiler::Logical2HGreaterThan, - &GLSLDecompiler::Logical2HNotEqual, - &GLSLDecompiler::Logical2HGreaterEqual, + &GLSLDecompiler::Logical2HLessThan<false>, + &GLSLDecompiler::Logical2HEqual<false>, + &GLSLDecompiler::Logical2HLessEqual<false>, + &GLSLDecompiler::Logical2HGreaterThan<false>, + &GLSLDecompiler::Logical2HNotEqual<false>, + &GLSLDecompiler::Logical2HGreaterEqual<false>, + &GLSLDecompiler::Logical2HLessThan<true>, + &GLSLDecompiler::Logical2HEqual<true>, + &GLSLDecompiler::Logical2HLessEqual<true>, + &GLSLDecompiler::Logical2HGreaterThan<true>, + &GLSLDecompiler::Logical2HNotEqual<true>, + &GLSLDecompiler::Logical2HGreaterEqual<true>, &GLSLDecompiler::Texture, &GLSLDecompiler::TextureLod, @@ -1625,6 +1651,7 @@ private: return name + '_' + std::to_string(index) + '_' + suffix; } + const Device& device; const ShaderIR& ir; const ShaderStage stage; const std::string suffix; @@ -1647,11 +1674,18 @@ std::string GetCommonDeclarations() { "}\n\n" "vec2 toHalf2(float value) {\n" " return unpackHalf2x16(ftou(value));\n" + "}\n\n" + "bvec2 halfFloatNanComparison(bvec2 comparison, vec2 pair1, vec2 pair2) {\n" + " bvec2 is_nan1 = isnan(pair1);\n" + " bvec2 is_nan2 = isnan(pair2);\n" + " return bvec2(comparison.x || is_nan1.x || is_nan2.x, comparison.y || is_nan1.y || " + "is_nan2.y);\n" "}\n"; } -ProgramResult Decompile(const ShaderIR& ir, Maxwell::ShaderStage stage, const std::string& suffix) { - GLSLDecompiler decompiler(ir, stage, suffix); +ProgramResult Decompile(const Device& device, const ShaderIR& ir, Maxwell::ShaderStage stage, + const std::string& suffix) { + GLSLDecompiler decompiler(device, ir, stage, suffix); decompiler.Decompile(); return {decompiler.GetResult(), decompiler.GetShaderEntries()}; } diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.h b/src/video_core/renderer_opengl/gl_shader_decompiler.h index 74032d237..c1569e737 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.h +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h @@ -12,6 +12,10 @@ #include "video_core/engines/maxwell_3d.h" #include "video_core/shader/shader_ir.h" +namespace OpenGL { +class Device; +} + namespace VideoCommon::Shader { class ShaderIR; } @@ -77,7 +81,7 @@ struct ShaderEntries { std::string GetCommonDeclarations(); -ProgramResult Decompile(const VideoCommon::Shader::ShaderIR& ir, Maxwell::ShaderStage stage, - const std::string& suffix); +ProgramResult Decompile(const Device& device, const VideoCommon::Shader::ShaderIR& ir, + Maxwell::ShaderStage stage, const std::string& suffix); } // namespace OpenGL::GLShader diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp index 53752b38d..254c0d499 100644 --- a/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.cpp @@ -104,7 +104,8 @@ bool ShaderDiskCacheRaw::Save(FileUtil::IOFile& file) const { return true; } -ShaderDiskCacheOpenGL::ShaderDiskCacheOpenGL(Core::System& system) : system{system} {} +ShaderDiskCacheOpenGL::ShaderDiskCacheOpenGL(Core::System& system) + : system{system}, precompiled_cache_virtual_file_offset{0} {} std::optional<std::pair<std::vector<ShaderDiskCacheRaw>, std::vector<ShaderDiskCacheUsage>>> ShaderDiskCacheOpenGL::LoadTransferable() { @@ -177,6 +178,7 @@ ShaderDiskCacheOpenGL::LoadTransferable() { return {}; } } + return {{raws, usages}}; } @@ -208,59 +210,64 @@ ShaderDiskCacheOpenGL::LoadPrecompiled() { std::optional<std::pair<std::unordered_map<u64, ShaderDiskCacheDecompiled>, std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump>>> ShaderDiskCacheOpenGL::LoadPrecompiledFile(FileUtil::IOFile& file) { + // Read compressed file from disk and decompress to virtual precompiled cache file + std::vector<u8> compressed(file.GetSize()); + file.ReadBytes(compressed.data(), compressed.size()); + const std::vector<u8> decompressed = Common::Compression::DecompressDataZSTD(compressed); + SaveArrayToPrecompiled(decompressed.data(), decompressed.size()); + precompiled_cache_virtual_file_offset = 0; + ShaderCacheVersionHash file_hash{}; - if (file.ReadArray(file_hash.data(), file_hash.size()) != file_hash.size()) { + if (!LoadArrayFromPrecompiled(file_hash.data(), file_hash.size())) { + precompiled_cache_virtual_file_offset = 0; return {}; } if (GetShaderCacheVersionHash() != file_hash) { LOG_INFO(Render_OpenGL, "Precompiled cache is from another version of the emulator"); + precompiled_cache_virtual_file_offset = 0; return {}; } std::unordered_map<u64, ShaderDiskCacheDecompiled> decompiled; std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump> dumps; - while (file.Tell() < file.GetSize()) { + while (precompiled_cache_virtual_file_offset < precompiled_cache_virtual_file.GetSize()) { PrecompiledEntryKind kind{}; - if (file.ReadBytes(&kind, sizeof(u32)) != sizeof(u32)) { + if (!LoadObjectFromPrecompiled(kind)) { return {}; } switch (kind) { case PrecompiledEntryKind::Decompiled: { u64 unique_identifier{}; - if (file.ReadBytes(&unique_identifier, sizeof(u64)) != sizeof(u64)) + if (!LoadObjectFromPrecompiled(unique_identifier)) { return {}; + } - const auto entry = LoadDecompiledEntry(file); - if (!entry) + const auto entry = LoadDecompiledEntry(); + if (!entry) { return {}; + } decompiled.insert({unique_identifier, std::move(*entry)}); break; } case PrecompiledEntryKind::Dump: { ShaderDiskCacheUsage usage; - if (file.ReadBytes(&usage, sizeof(usage)) != sizeof(usage)) + if (!LoadObjectFromPrecompiled(usage)) { return {}; + } ShaderDiskCacheDump dump; - if (file.ReadBytes(&dump.binary_format, sizeof(u32)) != sizeof(u32)) - return {}; - - u32 binary_length{}; - u32 compressed_size{}; - if (file.ReadBytes(&binary_length, sizeof(u32)) != sizeof(u32) || - file.ReadBytes(&compressed_size, sizeof(u32)) != sizeof(u32)) { + if (!LoadObjectFromPrecompiled(dump.binary_format)) { return {}; } - std::vector<u8> compressed_binary(compressed_size); - if (file.ReadArray(compressed_binary.data(), compressed_binary.size()) != - compressed_binary.size()) { + u32 binary_length{}; + if (!LoadObjectFromPrecompiled(binary_length)) { return {}; } - dump.binary = Common::Compression::DecompressDataZSTD(compressed_binary); - if (dump.binary.empty()) { + dump.binary.resize(binary_length); + if (!LoadArrayFromPrecompiled(dump.binary.data(), dump.binary.size())) { return {}; } @@ -274,45 +281,41 @@ ShaderDiskCacheOpenGL::LoadPrecompiledFile(FileUtil::IOFile& file) { return {{decompiled, dumps}}; } -std::optional<ShaderDiskCacheDecompiled> ShaderDiskCacheOpenGL::LoadDecompiledEntry( - FileUtil::IOFile& file) { +std::optional<ShaderDiskCacheDecompiled> ShaderDiskCacheOpenGL::LoadDecompiledEntry() { u32 code_size{}; - u32 compressed_code_size{}; - if (file.ReadBytes(&code_size, sizeof(u32)) != sizeof(u32) || - file.ReadBytes(&compressed_code_size, sizeof(u32)) != sizeof(u32)) { + if (!LoadObjectFromPrecompiled(code_size)) { return {}; } - std::vector<u8> compressed_code(compressed_code_size); - if (file.ReadArray(compressed_code.data(), compressed_code.size()) != compressed_code.size()) { + std::vector<u8> code(code_size); + if (!LoadArrayFromPrecompiled(code.data(), code.size())) { return {}; } - const std::vector<u8> code = Common::Compression::DecompressDataZSTD(compressed_code); - if (code.empty()) { - return {}; - } ShaderDiskCacheDecompiled entry; entry.code = std::string(reinterpret_cast<const char*>(code.data()), code_size); u32 const_buffers_count{}; - if (file.ReadBytes(&const_buffers_count, sizeof(u32)) != sizeof(u32)) + if (!LoadObjectFromPrecompiled(const_buffers_count)) { return {}; + } + for (u32 i = 0; i < const_buffers_count; ++i) { u32 max_offset{}; u32 index{}; u8 is_indirect{}; - if (file.ReadBytes(&max_offset, sizeof(u32)) != sizeof(u32) || - file.ReadBytes(&index, sizeof(u32)) != sizeof(u32) || - file.ReadBytes(&is_indirect, sizeof(u8)) != sizeof(u8)) { + if (!LoadObjectFromPrecompiled(max_offset) || !LoadObjectFromPrecompiled(index) || + !LoadObjectFromPrecompiled(is_indirect)) { return {}; } entry.entries.const_buffers.emplace_back(max_offset, is_indirect != 0, index); } u32 samplers_count{}; - if (file.ReadBytes(&samplers_count, sizeof(u32)) != sizeof(u32)) + if (!LoadObjectFromPrecompiled(samplers_count)) { return {}; + } + for (u32 i = 0; i < samplers_count; ++i) { u64 offset{}; u64 index{}; @@ -320,12 +323,9 @@ std::optional<ShaderDiskCacheDecompiled> ShaderDiskCacheOpenGL::LoadDecompiledEn u8 is_array{}; u8 is_shadow{}; u8 is_bindless{}; - if (file.ReadBytes(&offset, sizeof(u64)) != sizeof(u64) || - file.ReadBytes(&index, sizeof(u64)) != sizeof(u64) || - file.ReadBytes(&type, sizeof(u32)) != sizeof(u32) || - file.ReadBytes(&is_array, sizeof(u8)) != sizeof(u8) || - file.ReadBytes(&is_shadow, sizeof(u8)) != sizeof(u8) || - file.ReadBytes(&is_bindless, sizeof(u8)) != sizeof(u8)) { + if (!LoadObjectFromPrecompiled(offset) || !LoadObjectFromPrecompiled(index) || + !LoadObjectFromPrecompiled(type) || !LoadObjectFromPrecompiled(is_array) || + !LoadObjectFromPrecompiled(is_shadow) || !LoadObjectFromPrecompiled(is_bindless)) { return {}; } entry.entries.samplers.emplace_back(static_cast<std::size_t>(offset), @@ -335,17 +335,17 @@ std::optional<ShaderDiskCacheDecompiled> ShaderDiskCacheOpenGL::LoadDecompiledEn } u32 global_memory_count{}; - if (file.ReadBytes(&global_memory_count, sizeof(u32)) != sizeof(u32)) + if (!LoadObjectFromPrecompiled(global_memory_count)) { return {}; + } + for (u32 i = 0; i < global_memory_count; ++i) { u32 cbuf_index{}; u32 cbuf_offset{}; u8 is_read{}; u8 is_written{}; - if (file.ReadBytes(&cbuf_index, sizeof(u32)) != sizeof(u32) || - file.ReadBytes(&cbuf_offset, sizeof(u32)) != sizeof(u32) || - file.ReadBytes(&is_read, sizeof(u8)) != sizeof(u8) || - file.ReadBytes(&is_written, sizeof(u8)) != sizeof(u8)) { + if (!LoadObjectFromPrecompiled(cbuf_index) || !LoadObjectFromPrecompiled(cbuf_offset) || + !LoadObjectFromPrecompiled(is_read) || !LoadObjectFromPrecompiled(is_written)) { return {}; } entry.entries.global_memory_entries.emplace_back(cbuf_index, cbuf_offset, is_read != 0, @@ -354,74 +354,81 @@ std::optional<ShaderDiskCacheDecompiled> ShaderDiskCacheOpenGL::LoadDecompiledEn for (auto& clip_distance : entry.entries.clip_distances) { u8 clip_distance_raw{}; - if (file.ReadBytes(&clip_distance_raw, sizeof(u8)) != sizeof(u8)) + if (!LoadObjectFromPrecompiled(clip_distance_raw)) return {}; clip_distance = clip_distance_raw != 0; } u64 shader_length{}; - if (file.ReadBytes(&shader_length, sizeof(u64)) != sizeof(u64)) + if (!LoadObjectFromPrecompiled(shader_length)) { return {}; + } + entry.entries.shader_length = static_cast<std::size_t>(shader_length); return entry; } -bool ShaderDiskCacheOpenGL::SaveDecompiledFile(FileUtil::IOFile& file, u64 unique_identifier, - const std::string& code, - const std::vector<u8>& compressed_code, +bool ShaderDiskCacheOpenGL::SaveDecompiledFile(u64 unique_identifier, const std::string& code, const GLShader::ShaderEntries& entries) { - if (file.WriteObject(static_cast<u32>(PrecompiledEntryKind::Decompiled)) != 1 || - file.WriteObject(unique_identifier) != 1 || - file.WriteObject(static_cast<u32>(code.size())) != 1 || - file.WriteObject(static_cast<u32>(compressed_code.size())) != 1 || - file.WriteArray(compressed_code.data(), compressed_code.size()) != compressed_code.size()) { + if (!SaveObjectToPrecompiled(static_cast<u32>(PrecompiledEntryKind::Decompiled)) || + !SaveObjectToPrecompiled(unique_identifier) || + !SaveObjectToPrecompiled(static_cast<u32>(code.size())) || + !SaveArrayToPrecompiled(code.data(), code.size())) { return false; } - if (file.WriteObject(static_cast<u32>(entries.const_buffers.size())) != 1) + if (!SaveObjectToPrecompiled(static_cast<u32>(entries.const_buffers.size()))) { return false; + } for (const auto& cbuf : entries.const_buffers) { - if (file.WriteObject(static_cast<u32>(cbuf.GetMaxOffset())) != 1 || - file.WriteObject(static_cast<u32>(cbuf.GetIndex())) != 1 || - file.WriteObject(static_cast<u8>(cbuf.IsIndirect() ? 1 : 0)) != 1) { + if (!SaveObjectToPrecompiled(static_cast<u32>(cbuf.GetMaxOffset())) || + !SaveObjectToPrecompiled(static_cast<u32>(cbuf.GetIndex())) || + !SaveObjectToPrecompiled(static_cast<u8>(cbuf.IsIndirect() ? 1 : 0))) { return false; } } - if (file.WriteObject(static_cast<u32>(entries.samplers.size())) != 1) + if (!SaveObjectToPrecompiled(static_cast<u32>(entries.samplers.size()))) { return false; + } for (const auto& sampler : entries.samplers) { - if (file.WriteObject(static_cast<u64>(sampler.GetOffset())) != 1 || - file.WriteObject(static_cast<u64>(sampler.GetIndex())) != 1 || - file.WriteObject(static_cast<u32>(sampler.GetType())) != 1 || - file.WriteObject(static_cast<u8>(sampler.IsArray() ? 1 : 0)) != 1 || - file.WriteObject(static_cast<u8>(sampler.IsShadow() ? 1 : 0)) != 1 || - file.WriteObject(static_cast<u8>(sampler.IsBindless() ? 1 : 0)) != 1) { + if (!SaveObjectToPrecompiled(static_cast<u64>(sampler.GetOffset())) || + !SaveObjectToPrecompiled(static_cast<u64>(sampler.GetIndex())) || + !SaveObjectToPrecompiled(static_cast<u32>(sampler.GetType())) || + !SaveObjectToPrecompiled(static_cast<u8>(sampler.IsArray() ? 1 : 0)) || + !SaveObjectToPrecompiled(static_cast<u8>(sampler.IsShadow() ? 1 : 0)) || + !SaveObjectToPrecompiled(static_cast<u8>(sampler.IsBindless() ? 1 : 0))) { return false; } } - if (file.WriteObject(static_cast<u32>(entries.global_memory_entries.size())) != 1) + if (!SaveObjectToPrecompiled(static_cast<u32>(entries.global_memory_entries.size()))) { return false; + } for (const auto& gmem : entries.global_memory_entries) { - if (file.WriteObject(static_cast<u32>(gmem.GetCbufIndex())) != 1 || - file.WriteObject(static_cast<u32>(gmem.GetCbufOffset())) != 1 || - file.WriteObject(static_cast<u8>(gmem.IsRead() ? 1 : 0)) != 1 || - file.WriteObject(static_cast<u8>(gmem.IsWritten() ? 1 : 0)) != 1) { + if (!SaveObjectToPrecompiled(static_cast<u32>(gmem.GetCbufIndex())) || + !SaveObjectToPrecompiled(static_cast<u32>(gmem.GetCbufOffset())) || + !SaveObjectToPrecompiled(static_cast<u8>(gmem.IsRead() ? 1 : 0)) || + !SaveObjectToPrecompiled(static_cast<u8>(gmem.IsWritten() ? 1 : 0))) { return false; } } for (const bool clip_distance : entries.clip_distances) { - if (file.WriteObject(static_cast<u8>(clip_distance ? 1 : 0)) != 1) + if (!SaveObjectToPrecompiled(static_cast<u8>(clip_distance ? 1 : 0))) { return false; + } } - return file.WriteObject(static_cast<u64>(entries.shader_length)) == 1; + if (!SaveObjectToPrecompiled(static_cast<u64>(entries.shader_length))) { + return false; + } + + return true; } -void ShaderDiskCacheOpenGL::InvalidateTransferable() const { +void ShaderDiskCacheOpenGL::InvalidateTransferable() { if (!FileUtil::Delete(GetTransferablePath())) { LOG_ERROR(Render_OpenGL, "Failed to invalidate transferable file={}", GetTransferablePath()); @@ -429,7 +436,10 @@ void ShaderDiskCacheOpenGL::InvalidateTransferable() const { InvalidatePrecompiled(); } -void ShaderDiskCacheOpenGL::InvalidatePrecompiled() const { +void ShaderDiskCacheOpenGL::InvalidatePrecompiled() { + // Clear virtaul precompiled cache file + precompiled_cache_virtual_file.Resize(0); + if (!FileUtil::Delete(GetPrecompiledPath())) { LOG_ERROR(Render_OpenGL, "Failed to invalidate precompiled file={}", GetPrecompiledPath()); } @@ -465,7 +475,10 @@ void ShaderDiskCacheOpenGL::SaveUsage(const ShaderDiskCacheUsage& usage) { ASSERT_MSG(it != transferable.end(), "Saving shader usage without storing raw previously"); auto& usages{it->second}; - ASSERT(usages.find(usage) == usages.end()); + if (usages.find(usage) != usages.end()) { + // Skip this variant since the shader is already stored. + return; + } usages.insert(usage); FileUtil::IOFile file = AppendTransferableFile(); @@ -485,22 +498,13 @@ void ShaderDiskCacheOpenGL::SaveDecompiled(u64 unique_identifier, const std::str if (!IsUsable()) return; - const std::vector<u8> compressed_code{Common::Compression::CompressDataZSTDDefault( - reinterpret_cast<const u8*>(code.data()), code.size())}; - if (compressed_code.empty()) { - LOG_ERROR(Render_OpenGL, "Failed to compress GLSL code - skipping shader {:016x}", - unique_identifier); - return; + if (precompiled_cache_virtual_file.GetSize() == 0) { + SavePrecompiledHeaderToVirtualPrecompiledCache(); } - FileUtil::IOFile file = AppendPrecompiledFile(); - if (!file.IsOpen()) - return; - - if (!SaveDecompiledFile(file, unique_identifier, code, compressed_code, entries)) { + if (!SaveDecompiledFile(unique_identifier, code, entries)) { LOG_ERROR(Render_OpenGL, "Failed to save decompiled entry to the precompiled file - removing"); - file.Close(); InvalidatePrecompiled(); } } @@ -516,28 +520,13 @@ void ShaderDiskCacheOpenGL::SaveDump(const ShaderDiskCacheUsage& usage, GLuint p std::vector<u8> binary(binary_length); glGetProgramBinary(program, binary_length, nullptr, &binary_format, binary.data()); - const std::vector<u8> compressed_binary = - Common::Compression::CompressDataZSTDDefault(binary.data(), binary.size()); - - if (compressed_binary.empty()) { - LOG_ERROR(Render_OpenGL, "Failed to compress binary program in shader={:016x}", - usage.unique_identifier); - return; - } - - FileUtil::IOFile file = AppendPrecompiledFile(); - if (!file.IsOpen()) - return; - - if (file.WriteObject(static_cast<u32>(PrecompiledEntryKind::Dump)) != 1 || - file.WriteObject(usage) != 1 || file.WriteObject(static_cast<u32>(binary_format)) != 1 || - file.WriteObject(static_cast<u32>(binary_length)) != 1 || - file.WriteObject(static_cast<u32>(compressed_binary.size())) != 1 || - file.WriteArray(compressed_binary.data(), compressed_binary.size()) != - compressed_binary.size()) { + if (!SaveObjectToPrecompiled(static_cast<u32>(PrecompiledEntryKind::Dump)) || + !SaveObjectToPrecompiled(usage) || + !SaveObjectToPrecompiled(static_cast<u32>(binary_format)) || + !SaveObjectToPrecompiled(static_cast<u32>(binary_length)) || + !SaveArrayToPrecompiled(binary.data(), binary.size())) { LOG_ERROR(Render_OpenGL, "Failed to save binary program file in shader={:016x} - removing", usage.unique_identifier); - file.Close(); InvalidatePrecompiled(); return; } @@ -570,28 +559,33 @@ FileUtil::IOFile ShaderDiskCacheOpenGL::AppendTransferableFile() const { return file; } -FileUtil::IOFile ShaderDiskCacheOpenGL::AppendPrecompiledFile() const { - if (!EnsureDirectories()) - return {}; +void ShaderDiskCacheOpenGL::SavePrecompiledHeaderToVirtualPrecompiledCache() { + const auto hash{GetShaderCacheVersionHash()}; + if (!SaveArrayToPrecompiled(hash.data(), hash.size())) { + LOG_ERROR( + Render_OpenGL, + "Failed to write precompiled cache version hash to virtual precompiled cache file"); + } +} + +void ShaderDiskCacheOpenGL::SaveVirtualPrecompiledFile() { + precompiled_cache_virtual_file_offset = 0; + const std::vector<u8>& uncompressed = precompiled_cache_virtual_file.ReadAllBytes(); + const std::vector<u8>& compressed = + Common::Compression::CompressDataZSTDDefault(uncompressed.data(), uncompressed.size()); const auto precompiled_path{GetPrecompiledPath()}; - const bool existed = FileUtil::Exists(precompiled_path); + FileUtil::IOFile file(precompiled_path, "wb"); - FileUtil::IOFile file(precompiled_path, "ab"); if (!file.IsOpen()) { LOG_ERROR(Render_OpenGL, "Failed to open precompiled cache in path={}", precompiled_path); - return {}; + return; } - - if (!existed || file.GetSize() == 0) { - const auto hash{GetShaderCacheVersionHash()}; - if (file.WriteArray(hash.data(), hash.size()) != hash.size()) { - LOG_ERROR(Render_OpenGL, "Failed to write precompiled cache version hash in path={}", - precompiled_path); - return {}; - } + if (file.WriteBytes(compressed.data(), compressed.size()) != compressed.size()) { + LOG_ERROR(Render_OpenGL, "Failed to write precompiled cache version in path={}", + precompiled_path); + return; } - return file; } bool ShaderDiskCacheOpenGL::EnsureDirectories() const { diff --git a/src/video_core/renderer_opengl/gl_shader_disk_cache.h b/src/video_core/renderer_opengl/gl_shader_disk_cache.h index 6be0c0547..0142b2e3b 100644 --- a/src/video_core/renderer_opengl/gl_shader_disk_cache.h +++ b/src/video_core/renderer_opengl/gl_shader_disk_cache.h @@ -16,6 +16,7 @@ #include "common/assert.h" #include "common/common_types.h" +#include "core/file_sys/vfs_vector.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/renderer_opengl/gl_shader_gen.h" @@ -172,10 +173,10 @@ public: LoadPrecompiled(); /// Removes the transferable (and precompiled) cache file. - void InvalidateTransferable() const; + void InvalidateTransferable(); - /// Removes the precompiled cache file. - void InvalidatePrecompiled() const; + /// Removes the precompiled cache file and clears virtual precompiled cache file. + void InvalidatePrecompiled(); /// Saves a raw dump to the transferable file. Checks for collisions. void SaveRaw(const ShaderDiskCacheRaw& entry); @@ -190,18 +191,21 @@ public: /// Saves a dump entry to the precompiled file. Does not check for collisions. void SaveDump(const ShaderDiskCacheUsage& usage, GLuint program); + /// Serializes virtual precompiled shader cache file to real file + void SaveVirtualPrecompiledFile(); + private: /// Loads the transferable cache. Returns empty on failure. std::optional<std::pair<std::unordered_map<u64, ShaderDiskCacheDecompiled>, std::unordered_map<ShaderDiskCacheUsage, ShaderDiskCacheDump>>> LoadPrecompiledFile(FileUtil::IOFile& file); - /// Loads a decompiled cache entry from the passed file. Returns empty on failure. - std::optional<ShaderDiskCacheDecompiled> LoadDecompiledEntry(FileUtil::IOFile& file); + /// Loads a decompiled cache entry from m_precompiled_cache_virtual_file. Returns empty on + /// failure. + std::optional<ShaderDiskCacheDecompiled> LoadDecompiledEntry(); /// Saves a decompiled entry to the passed file. Returns true on success. - bool SaveDecompiledFile(FileUtil::IOFile& file, u64 unique_identifier, const std::string& code, - const std::vector<u8>& compressed_code, + bool SaveDecompiledFile(u64 unique_identifier, const std::string& code, const GLShader::ShaderEntries& entries); /// Returns if the cache can be used @@ -210,8 +214,8 @@ private: /// Opens current game's transferable file and write it's header if it doesn't exist FileUtil::IOFile AppendTransferableFile() const; - /// Opens current game's precompiled file and write it's header if it doesn't exist - FileUtil::IOFile AppendPrecompiledFile() const; + /// Save precompiled header to precompiled_cache_in_memory + void SavePrecompiledHeaderToVirtualPrecompiledCache(); /// Create shader disk cache directories. Returns true on success. bool EnsureDirectories() const; @@ -234,10 +238,42 @@ private: /// Get current game's title id std::string GetTitleID() const; + template <typename T> + bool SaveArrayToPrecompiled(const T* data, std::size_t length) { + const std::size_t write_length = precompiled_cache_virtual_file.WriteArray( + data, length, precompiled_cache_virtual_file_offset); + precompiled_cache_virtual_file_offset += write_length; + return write_length == sizeof(T) * length; + } + + template <typename T> + bool LoadArrayFromPrecompiled(T* data, std::size_t length) { + const std::size_t read_length = precompiled_cache_virtual_file.ReadArray( + data, length, precompiled_cache_virtual_file_offset); + precompiled_cache_virtual_file_offset += read_length; + return read_length == sizeof(T) * length; + } + + template <typename T> + bool SaveObjectToPrecompiled(const T& object) { + return SaveArrayToPrecompiled(&object, 1); + } + + template <typename T> + bool LoadObjectFromPrecompiled(T& object) { + return LoadArrayFromPrecompiled(&object, 1); + } + // Copre system Core::System& system; // Stored transferable shaders std::map<u64, std::unordered_set<ShaderDiskCacheUsage>> transferable; + // Stores whole precompiled cache which will be read from or saved to the precompiled chache + // file + FileSys::VectorVfsFile precompiled_cache_virtual_file; + // Stores the current offset of the precompiled cache file for IO purposes + std::size_t precompiled_cache_virtual_file_offset; + // The cache has been loaded at boot bool tried_to_load{}; }; diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp index 8763d9c71..6abf948f8 100644 --- a/src/video_core/renderer_opengl/gl_shader_gen.cpp +++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp @@ -16,7 +16,7 @@ using VideoCommon::Shader::ShaderIR; static constexpr u32 PROGRAM_OFFSET{10}; -ProgramResult GenerateVertexShader(const ShaderSetup& setup) { +ProgramResult GenerateVertexShader(const Device& device, const ShaderSetup& setup) { const std::string id = fmt::format("{:016x}", setup.program.unique_identifier); std::string out = "#extension GL_ARB_separate_shader_objects : enable\n\n"; @@ -34,14 +34,15 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform vs_config { )"; ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET); - ProgramResult program = Decompile(program_ir, Maxwell3D::Regs::ShaderStage::Vertex, "vertex"); + ProgramResult program = + Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Vertex, "vertex"); out += program.first; if (setup.IsDualProgram()) { ShaderIR program_ir_b(setup.program.code_b, PROGRAM_OFFSET); ProgramResult program_b = - Decompile(program_ir_b, Maxwell3D::Regs::ShaderStage::Vertex, "vertex_b"); + Decompile(device, program_ir_b, Maxwell3D::Regs::ShaderStage::Vertex, "vertex_b"); out += program_b.first; } @@ -57,6 +58,9 @@ void main() { } out += R"( + + // Set Position Y direction + position.y *= utof(config_pack[2]); // Check if the flip stage is VertexB // Config pack's second value is flip_stage if (config_pack[1] == 1) { @@ -75,7 +79,7 @@ void main() { return {out, program.second}; } -ProgramResult GenerateGeometryShader(const ShaderSetup& setup) { +ProgramResult GenerateGeometryShader(const Device& device, const ShaderSetup& setup) { const std::string id = fmt::format("{:016x}", setup.program.unique_identifier); std::string out = "#extension GL_ARB_separate_shader_objects : enable\n\n"; @@ -95,7 +99,7 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform gs_config { )"; ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET); ProgramResult program = - Decompile(program_ir, Maxwell3D::Regs::ShaderStage::Geometry, "geometry"); + Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Geometry, "geometry"); out += program.first; out += R"( @@ -106,7 +110,7 @@ void main() { return {out, program.second}; } -ProgramResult GenerateFragmentShader(const ShaderSetup& setup) { +ProgramResult GenerateFragmentShader(const Device& device, const ShaderSetup& setup) { const std::string id = fmt::format("{:016x}", setup.program.unique_identifier); std::string out = "#extension GL_ARB_separate_shader_objects : enable\n\n"; @@ -158,7 +162,7 @@ bool AlphaFunc(in float value) { )"; ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET); ProgramResult program = - Decompile(program_ir, Maxwell3D::Regs::ShaderStage::Fragment, "fragment"); + Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Fragment, "fragment"); out += program.first; diff --git a/src/video_core/renderer_opengl/gl_shader_gen.h b/src/video_core/renderer_opengl/gl_shader_gen.h index fad346b48..0536c8a03 100644 --- a/src/video_core/renderer_opengl/gl_shader_gen.h +++ b/src/video_core/renderer_opengl/gl_shader_gen.h @@ -10,6 +10,10 @@ #include "video_core/renderer_opengl/gl_shader_decompiler.h" #include "video_core/shader/shader_ir.h" +namespace OpenGL { +class Device; +} + namespace OpenGL::GLShader { using VideoCommon::Shader::ProgramCode; @@ -39,22 +43,13 @@ private: bool has_program_b{}; }; -/** - * Generates the GLSL vertex shader program source code for the given VS program - * @returns String of the shader source code - */ -ProgramResult GenerateVertexShader(const ShaderSetup& setup); - -/** - * Generates the GLSL geometry shader program source code for the given GS program - * @returns String of the shader source code - */ -ProgramResult GenerateGeometryShader(const ShaderSetup& setup); - -/** - * Generates the GLSL fragment shader program source code for the given FS program - * @returns String of the shader source code - */ -ProgramResult GenerateFragmentShader(const ShaderSetup& setup); +/// Generates the GLSL vertex shader program source code for the given VS program +ProgramResult GenerateVertexShader(const Device& device, const ShaderSetup& setup); + +/// Generates the GLSL geometry shader program source code for the given GS program +ProgramResult GenerateGeometryShader(const Device& device, const ShaderSetup& setup); + +/// Generates the GLSL fragment shader program source code for the given FS program +ProgramResult GenerateFragmentShader(const Device& device, const ShaderSetup& setup); } // namespace OpenGL::GLShader diff --git a/src/video_core/renderer_opengl/gl_state.cpp b/src/video_core/renderer_opengl/gl_state.cpp index 52d569a1b..7425fbe5d 100644 --- a/src/video_core/renderer_opengl/gl_state.cpp +++ b/src/video_core/renderer_opengl/gl_state.cpp @@ -471,8 +471,9 @@ void OpenGLState::ApplyTextures() const { const auto& texture_unit = texture_units[i]; auto& cur_state_texture_unit = cur_state.texture_units[i]; textures[i] = texture_unit.texture; - if (cur_state_texture_unit.texture == textures[i]) + if (cur_state_texture_unit.texture == textures[i]) { continue; + } cur_state_texture_unit.texture = textures[i]; if (!has_delta) { first = i; @@ -493,10 +494,11 @@ void OpenGLState::ApplySamplers() const { std::array<GLuint, Maxwell::NumTextureSamplers> samplers; for (std::size_t i = 0; i < std::size(samplers); ++i) { - if (cur_state.texture_units[i].sampler == texture_units[i].sampler) + samplers[i] = texture_units[i].sampler; + if (cur_state.texture_units[i].sampler == texture_units[i].sampler) { continue; + } cur_state.texture_units[i].sampler = texture_units[i].sampler; - samplers[i] = texture_units[i].sampler; if (!has_delta) { first = i; has_delta = true; diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h index a8833c06e..95b773135 100644 --- a/src/video_core/renderer_opengl/maxwell_to_gl.h +++ b/src/video_core/renderer_opengl/maxwell_to_gl.h @@ -27,8 +27,7 @@ using Maxwell = Tegra::Engines::Maxwell3D::Regs; inline GLenum VertexType(Maxwell::VertexAttribute attrib) { switch (attrib.type) { case Maxwell::VertexAttribute::Type::UnsignedInt: - case Maxwell::VertexAttribute::Type::UnsignedNorm: { - + case Maxwell::VertexAttribute::Type::UnsignedNorm: switch (attrib.size) { case Maxwell::VertexAttribute::Size::Size_8: case Maxwell::VertexAttribute::Size::Size_8_8: @@ -47,16 +46,13 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) { return GL_UNSIGNED_INT; case Maxwell::VertexAttribute::Size::Size_10_10_10_2: return GL_UNSIGNED_INT_2_10_10_10_REV; + default: + LOG_CRITICAL(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); + UNREACHABLE(); + return {}; } - - LOG_CRITICAL(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); - UNREACHABLE(); - return {}; - } - case Maxwell::VertexAttribute::Type::SignedInt: - case Maxwell::VertexAttribute::Type::SignedNorm: { - + case Maxwell::VertexAttribute::Type::SignedNorm: switch (attrib.size) { case Maxwell::VertexAttribute::Size::Size_8: case Maxwell::VertexAttribute::Size::Size_8_8: @@ -75,14 +71,12 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) { return GL_INT; case Maxwell::VertexAttribute::Size::Size_10_10_10_2: return GL_INT_2_10_10_10_REV; + default: + LOG_CRITICAL(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); + UNREACHABLE(); + return {}; } - - LOG_CRITICAL(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); - UNREACHABLE(); - return {}; - } - - case Maxwell::VertexAttribute::Type::Float: { + case Maxwell::VertexAttribute::Type::Float: switch (attrib.size) { case Maxwell::VertexAttribute::Size::Size_16: case Maxwell::VertexAttribute::Size::Size_16_16: @@ -94,13 +88,16 @@ inline GLenum VertexType(Maxwell::VertexAttribute attrib) { case Maxwell::VertexAttribute::Size::Size_32_32_32: case Maxwell::VertexAttribute::Size::Size_32_32_32_32: return GL_FLOAT; + default: + LOG_CRITICAL(Render_OpenGL, "Unimplemented vertex size={}", attrib.SizeString()); + UNREACHABLE(); + return {}; } + default: + LOG_CRITICAL(Render_OpenGL, "Unimplemented vertex type={}", attrib.TypeString()); + UNREACHABLE(); + return {}; } - } - - LOG_CRITICAL(Render_OpenGL, "Unimplemented vertex type={}", attrib.TypeString()); - UNREACHABLE(); - return {}; } inline GLenum IndexFormat(Maxwell::IndexFormat index_format) { @@ -129,10 +126,11 @@ inline GLenum PrimitiveTopology(Maxwell::PrimitiveTopology topology) { return GL_TRIANGLES; case Maxwell::PrimitiveTopology::TriangleStrip: return GL_TRIANGLE_STRIP; + default: + LOG_CRITICAL(Render_OpenGL, "Unimplemented topology={}", static_cast<u32>(topology)); + UNREACHABLE(); + return {}; } - LOG_CRITICAL(Render_OpenGL, "Unimplemented topology={}", static_cast<u32>(topology)); - UNREACHABLE(); - return {}; } inline GLenum TextureFilterMode(Tegra::Texture::TextureFilter filter_mode, @@ -186,9 +184,10 @@ inline GLenum WrapMode(Tegra::Texture::WrapMode wrap_mode) { } else { return GL_MIRROR_CLAMP_TO_EDGE; } + default: + LOG_ERROR(Render_OpenGL, "Unimplemented texture wrap mode={}", static_cast<u32>(wrap_mode)); + return GL_REPEAT; } - LOG_ERROR(Render_OpenGL, "Unimplemented texture wrap mode={}", static_cast<u32>(wrap_mode)); - return GL_REPEAT; } inline GLenum DepthCompareFunc(Tegra::Texture::DepthCompareFunc func) { diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp index 34bf26ff2..9fe1e3280 100644 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp @@ -62,9 +62,10 @@ vk::SamplerAddressMode WrapMode(Tegra::Texture::WrapMode wrap_mode) { case Tegra::Texture::WrapMode::MirrorOnceBorder: UNIMPLEMENTED(); return vk::SamplerAddressMode::eMirrorClampToEdge; + default: + UNIMPLEMENTED_MSG("Unimplemented wrap mode={}", static_cast<u32>(wrap_mode)); + return {}; } - UNIMPLEMENTED_MSG("Unimplemented wrap mode={}", static_cast<u32>(wrap_mode)); - return {}; } vk::CompareOp DepthCompareFunction(Tegra::Texture::DepthCompareFunc depth_compare_func) { @@ -225,9 +226,10 @@ vk::PrimitiveTopology PrimitiveTopology(Maxwell::PrimitiveTopology topology) { return vk::PrimitiveTopology::eTriangleList; case Maxwell::PrimitiveTopology::TriangleStrip: return vk::PrimitiveTopology::eTriangleStrip; + default: + UNIMPLEMENTED_MSG("Unimplemented topology={}", static_cast<u32>(topology)); + return {}; } - UNIMPLEMENTED_MSG("Unimplemented topology={}", static_cast<u32>(topology)); - return {}; } vk::Format VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttribute::Size size) { diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp index 25500f9a3..23d9b10db 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp @@ -76,14 +76,10 @@ constexpr u32 GetGenericAttributeLocation(Attribute::Index attribute) { /// Returns true if an object has to be treated as precise bool IsPrecise(Operation operand) { - const auto& meta = operand.GetMeta(); - + const auto& meta{operand.GetMeta()}; if (std::holds_alternative<MetaArithmetic>(meta)) { return std::get<MetaArithmetic>(meta).precise; } - if (std::holds_alternative<MetaHalfArithmetic>(meta)) { - return std::get<MetaHalfArithmetic>(meta).precise; - } return false; } @@ -746,6 +742,16 @@ private: return {}; } + Id HClamp(Operation operation) { + UNIMPLEMENTED(); + return {}; + } + + Id HUnpack(Operation operation) { + UNIMPLEMENTED(); + return {}; + } + Id HMergeF32(Operation operation) { UNIMPLEMENTED(); return {}; @@ -1218,6 +1224,8 @@ private: &SPIRVDecompiler::Ternary<&Module::OpFma, Type::HalfFloat>, &SPIRVDecompiler::Unary<&Module::OpFAbs, Type::HalfFloat>, &SPIRVDecompiler::HNegate, + &SPIRVDecompiler::HClamp, + &SPIRVDecompiler::HUnpack, &SPIRVDecompiler::HMergeF32, &SPIRVDecompiler::HMergeH0, &SPIRVDecompiler::HMergeH1, @@ -1260,6 +1268,13 @@ private: &SPIRVDecompiler::Binary<&Module::OpFOrdGreaterThan, Type::Bool, Type::HalfFloat>, &SPIRVDecompiler::Binary<&Module::OpFOrdNotEqual, Type::Bool, Type::HalfFloat>, &SPIRVDecompiler::Binary<&Module::OpFOrdGreaterThanEqual, Type::Bool, Type::HalfFloat>, + // TODO(Rodrigo): Should these use the OpFUnord* variants? + &SPIRVDecompiler::Binary<&Module::OpFOrdLessThan, Type::Bool, Type::HalfFloat>, + &SPIRVDecompiler::Binary<&Module::OpFOrdEqual, Type::Bool, Type::HalfFloat>, + &SPIRVDecompiler::Binary<&Module::OpFOrdLessThanEqual, Type::Bool, Type::HalfFloat>, + &SPIRVDecompiler::Binary<&Module::OpFOrdGreaterThan, Type::Bool, Type::HalfFloat>, + &SPIRVDecompiler::Binary<&Module::OpFOrdNotEqual, Type::Bool, Type::HalfFloat>, + &SPIRVDecompiler::Binary<&Module::OpFOrdGreaterThanEqual, Type::Bool, Type::HalfFloat>, &SPIRVDecompiler::Texture, &SPIRVDecompiler::TextureLod, diff --git a/src/video_core/shader/decode.cpp b/src/video_core/shader/decode.cpp index e4c438792..2da595c0d 100644 --- a/src/video_core/shader/decode.cpp +++ b/src/video_core/shader/decode.cpp @@ -116,6 +116,8 @@ ExitMethod ShaderIR::Scan(u32 begin, u32 end, std::set<u32>& labels) { // Continue scanning for an exit method. break; } + default: + break; } } return exit_method = ExitMethod::AlwaysReturn; @@ -206,4 +208,4 @@ u32 ShaderIR::DecodeInstr(NodeBlock& bb, u32 pc) { return pc + 1; } -} // namespace VideoCommon::Shader
\ No newline at end of file +} // namespace VideoCommon::Shader diff --git a/src/video_core/shader/decode/arithmetic_half.cpp b/src/video_core/shader/decode/arithmetic_half.cpp index baee89107..2098c1170 100644 --- a/src/video_core/shader/decode/arithmetic_half.cpp +++ b/src/video_core/shader/decode/arithmetic_half.cpp @@ -9,6 +9,7 @@ namespace VideoCommon::Shader { +using Tegra::Shader::HalfType; using Tegra::Shader::Instruction; using Tegra::Shader::OpCode; @@ -18,48 +19,50 @@ u32 ShaderIR::DecodeArithmeticHalf(NodeBlock& bb, u32 pc) { if (opcode->get().GetId() == OpCode::Id::HADD2_C || opcode->get().GetId() == OpCode::Id::HADD2_R) { - UNIMPLEMENTED_IF(instr.alu_half.ftz != 0); + if (instr.alu_half.ftz != 0) { + LOG_WARNING(HW_GPU, "{} FTZ not implemented", opcode->get().GetName()); + } } - UNIMPLEMENTED_IF_MSG(instr.alu_half.saturate != 0, "Half float saturation not implemented"); const bool negate_a = opcode->get().GetId() != OpCode::Id::HMUL2_R && instr.alu_half.negate_a != 0; const bool negate_b = opcode->get().GetId() != OpCode::Id::HMUL2_C && instr.alu_half.negate_b != 0; - const Node op_a = GetOperandAbsNegHalf(GetRegister(instr.gpr8), instr.alu_half.abs_a, negate_a); - - // instr.alu_half.type_a + Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.alu_half.type_a); + op_a = GetOperandAbsNegHalf(op_a, instr.alu_half.abs_a, negate_a); - Node op_b = [&]() { + auto [type_b, op_b] = [&]() -> std::tuple<HalfType, Node> { switch (opcode->get().GetId()) { case OpCode::Id::HADD2_C: case OpCode::Id::HMUL2_C: - return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset()); + return {HalfType::F32, GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset())}; case OpCode::Id::HADD2_R: case OpCode::Id::HMUL2_R: - return GetRegister(instr.gpr20); + return {instr.alu_half.type_b, GetRegister(instr.gpr20)}; default: UNREACHABLE(); - return Immediate(0); + return {HalfType::F32, Immediate(0)}; } }(); - op_b = GetOperandAbsNegHalf(op_b, instr.alu_half.abs_b, negate_b); + op_b = UnpackHalfFloat(op_b, type_b); + // redeclaration to avoid a bug in clang with reusing local bindings in lambdas + Node op_b_alt = GetOperandAbsNegHalf(op_b, instr.alu_half.abs_b, negate_b); Node value = [&]() { - MetaHalfArithmetic meta{true, {instr.alu_half_imm.type_a, instr.alu_half.type_b}}; switch (opcode->get().GetId()) { case OpCode::Id::HADD2_C: case OpCode::Id::HADD2_R: - return Operation(OperationCode::HAdd, meta, op_a, op_b); + return Operation(OperationCode::HAdd, PRECISE, op_a, op_b_alt); case OpCode::Id::HMUL2_C: case OpCode::Id::HMUL2_R: - return Operation(OperationCode::HMul, meta, op_a, op_b); + return Operation(OperationCode::HMul, PRECISE, op_a, op_b_alt); default: UNIMPLEMENTED_MSG("Unhandled half float instruction: {}", opcode->get().GetName()); return Immediate(0); } }(); + value = GetSaturatedHalfFloat(value, instr.alu_half.saturate); value = HalfMerge(GetRegister(instr.gpr0), value, instr.alu_half.merge); SetRegister(bb, instr.gpr0, value); @@ -67,4 +70,4 @@ u32 ShaderIR::DecodeArithmeticHalf(NodeBlock& bb, u32 pc) { return pc; } -} // namespace VideoCommon::Shader
\ No newline at end of file +} // namespace VideoCommon::Shader diff --git a/src/video_core/shader/decode/arithmetic_half_immediate.cpp b/src/video_core/shader/decode/arithmetic_half_immediate.cpp index c2164ba50..fbcd35b18 100644 --- a/src/video_core/shader/decode/arithmetic_half_immediate.cpp +++ b/src/video_core/shader/decode/arithmetic_half_immediate.cpp @@ -17,34 +17,33 @@ u32 ShaderIR::DecodeArithmeticHalfImmediate(NodeBlock& bb, u32 pc) { const auto opcode = OpCode::Decode(instr); if (opcode->get().GetId() == OpCode::Id::HADD2_IMM) { - UNIMPLEMENTED_IF(instr.alu_half_imm.ftz != 0); + if (instr.alu_half_imm.ftz != 0) { + LOG_WARNING(HW_GPU, "{} FTZ not implemented", opcode->get().GetName()); + } } else { UNIMPLEMENTED_IF(instr.alu_half_imm.precision != Tegra::Shader::HalfPrecision::None); } - UNIMPLEMENTED_IF_MSG(instr.alu_half_imm.saturate != 0, - "Half float immediate saturation not implemented"); - Node op_a = GetRegister(instr.gpr8); + Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.alu_half_imm.type_a); op_a = GetOperandAbsNegHalf(op_a, instr.alu_half_imm.abs_a, instr.alu_half_imm.negate_a); const Node op_b = UnpackHalfImmediate(instr, true); Node value = [&]() { - MetaHalfArithmetic meta{true, {instr.alu_half_imm.type_a}}; switch (opcode->get().GetId()) { case OpCode::Id::HADD2_IMM: - return Operation(OperationCode::HAdd, meta, op_a, op_b); + return Operation(OperationCode::HAdd, PRECISE, op_a, op_b); case OpCode::Id::HMUL2_IMM: - return Operation(OperationCode::HMul, meta, op_a, op_b); + return Operation(OperationCode::HMul, PRECISE, op_a, op_b); default: UNREACHABLE(); return Immediate(0); } }(); - value = HalfMerge(GetRegister(instr.gpr0), value, instr.alu_half_imm.merge); + value = GetSaturatedHalfFloat(value, instr.alu_half_imm.saturate); + value = HalfMerge(GetRegister(instr.gpr0), value, instr.alu_half_imm.merge); SetRegister(bb, instr.gpr0, value); - return pc; } diff --git a/src/video_core/shader/decode/conversion.cpp b/src/video_core/shader/decode/conversion.cpp index 55a6fbbf2..b5ec9a6f5 100644 --- a/src/video_core/shader/decode/conversion.cpp +++ b/src/video_core/shader/decode/conversion.cpp @@ -18,13 +18,29 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) { const auto opcode = OpCode::Decode(instr); switch (opcode->get().GetId()) { - case OpCode::Id::I2I_R: { + case OpCode::Id::I2I_R: + case OpCode::Id::I2I_C: + case OpCode::Id::I2I_IMM: { UNIMPLEMENTED_IF(instr.conversion.selector); + UNIMPLEMENTED_IF(instr.conversion.dst_size != Register::Size::Word); + UNIMPLEMENTED_IF(instr.alu.saturate_d); const bool input_signed = instr.conversion.is_input_signed; const bool output_signed = instr.conversion.is_output_signed; - Node value = GetRegister(instr.gpr20); + Node value = [&]() { + switch (opcode->get().GetId()) { + case OpCode::Id::I2I_R: + return GetRegister(instr.gpr20); + case OpCode::Id::I2I_C: + return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset()); + case OpCode::Id::I2I_IMM: + return Immediate(instr.alu.GetSignedImm20_20()); + default: + UNREACHABLE(); + return Immediate(0); + } + }(); value = ConvertIntegerSize(value, instr.conversion.src_size, input_signed); value = GetOperandAbsNegInteger(value, instr.conversion.abs_a, instr.conversion.negate_a, @@ -38,17 +54,24 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) { break; } case OpCode::Id::I2F_R: - case OpCode::Id::I2F_C: { - UNIMPLEMENTED_IF(instr.conversion.dest_size != Register::Size::Word); + case OpCode::Id::I2F_C: + case OpCode::Id::I2F_IMM: { + UNIMPLEMENTED_IF(instr.conversion.dst_size != Register::Size::Word); UNIMPLEMENTED_IF(instr.conversion.selector); UNIMPLEMENTED_IF_MSG(instr.generates_cc, "Condition codes generation in I2F is not implemented"); Node value = [&]() { - if (instr.is_b_gpr) { + switch (opcode->get().GetId()) { + case OpCode::Id::I2F_R: return GetRegister(instr.gpr20); - } else { + case OpCode::Id::I2F_C: return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset()); + case OpCode::Id::I2F_IMM: + return Immediate(instr.alu.GetSignedImm20_20()); + default: + UNREACHABLE(); + return Immediate(0); } }(); const bool input_signed = instr.conversion.is_input_signed; @@ -62,24 +85,31 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) { break; } case OpCode::Id::F2F_R: - case OpCode::Id::F2F_C: { - UNIMPLEMENTED_IF(instr.conversion.dest_size != Register::Size::Word); - UNIMPLEMENTED_IF(instr.conversion.src_size != Register::Size::Word); + case OpCode::Id::F2F_C: + case OpCode::Id::F2F_IMM: { + UNIMPLEMENTED_IF(instr.conversion.f2f.dst_size != Register::Size::Word); + UNIMPLEMENTED_IF(instr.conversion.f2f.src_size != Register::Size::Word); UNIMPLEMENTED_IF_MSG(instr.generates_cc, "Condition codes generation in F2F is not implemented"); Node value = [&]() { - if (instr.is_b_gpr) { + switch (opcode->get().GetId()) { + case OpCode::Id::F2F_R: return GetRegister(instr.gpr20); - } else { + case OpCode::Id::F2F_C: return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset()); + case OpCode::Id::F2F_IMM: + return GetImmediate19(instr); + default: + UNREACHABLE(); + return Immediate(0); } }(); value = GetOperandAbsNegFloat(value, instr.conversion.abs_a, instr.conversion.negate_a); value = [&]() { - switch (instr.conversion.f2f.rounding) { + switch (instr.conversion.f2f.GetRoundingMode()) { case Tegra::Shader::F2fRoundingOp::None: return value; case Tegra::Shader::F2fRoundingOp::Round: @@ -90,10 +120,11 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) { return Operation(OperationCode::FCeil, PRECISE, value); case Tegra::Shader::F2fRoundingOp::Trunc: return Operation(OperationCode::FTrunc, PRECISE, value); + default: + UNIMPLEMENTED_MSG("Unimplemented F2F rounding mode {}", + static_cast<u32>(instr.conversion.f2f.rounding.Value())); + return Immediate(0); } - UNIMPLEMENTED_MSG("Unimplemented F2F rounding mode {}", - static_cast<u32>(instr.conversion.f2f.rounding.Value())); - return Immediate(0); }(); value = GetSaturatedFloat(value, instr.alu.saturate_d); @@ -102,15 +133,22 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) { break; } case OpCode::Id::F2I_R: - case OpCode::Id::F2I_C: { + case OpCode::Id::F2I_C: + case OpCode::Id::F2I_IMM: { UNIMPLEMENTED_IF(instr.conversion.src_size != Register::Size::Word); UNIMPLEMENTED_IF_MSG(instr.generates_cc, "Condition codes generation in F2I is not implemented"); Node value = [&]() { - if (instr.is_b_gpr) { + switch (opcode->get().GetId()) { + case OpCode::Id::F2I_R: return GetRegister(instr.gpr20); - } else { + case OpCode::Id::F2I_C: return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset()); + case OpCode::Id::F2I_IMM: + return GetImmediate19(instr); + default: + UNREACHABLE(); + return Immediate(0); } }(); @@ -134,7 +172,7 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) { }(); const bool is_signed = instr.conversion.is_output_signed; value = SignedOperation(OperationCode::ICastFloat, is_signed, PRECISE, value); - value = ConvertIntegerSize(value, instr.conversion.dest_size, is_signed); + value = ConvertIntegerSize(value, instr.conversion.dst_size, is_signed); SetRegister(bb, instr.gpr0, value); break; diff --git a/src/video_core/shader/decode/half_set.cpp b/src/video_core/shader/decode/half_set.cpp index 748368555..1dd94bf9d 100644 --- a/src/video_core/shader/decode/half_set.cpp +++ b/src/video_core/shader/decode/half_set.cpp @@ -18,11 +18,13 @@ u32 ShaderIR::DecodeHalfSet(NodeBlock& bb, u32 pc) { const Instruction instr = {program_code[pc]}; const auto opcode = OpCode::Decode(instr); - UNIMPLEMENTED_IF(instr.hset2.ftz != 0); + if (instr.hset2.ftz != 0) { + LOG_WARNING(HW_GPU, "{} FTZ not implemented", opcode->get().GetName()); + } + + Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hset2.type_a); + op_a = GetOperandAbsNegHalf(op_a, instr.hset2.abs_a, instr.hset2.negate_a); - // instr.hset2.type_a - // instr.hset2.type_b - Node op_a = GetRegister(instr.gpr8); Node op_b = [&]() { switch (opcode->get().GetId()) { case OpCode::Id::HSET2_R: @@ -32,14 +34,12 @@ u32 ShaderIR::DecodeHalfSet(NodeBlock& bb, u32 pc) { return Immediate(0); } }(); - - op_a = GetOperandAbsNegHalf(op_a, instr.hset2.abs_a, instr.hset2.negate_a); + op_b = UnpackHalfFloat(op_b, instr.hset2.type_b); op_b = GetOperandAbsNegHalf(op_b, instr.hset2.abs_b, instr.hset2.negate_b); const Node second_pred = GetPredicate(instr.hset2.pred39, instr.hset2.neg_pred); - MetaHalfArithmetic meta{false, {instr.hset2.type_a, instr.hset2.type_b}}; - const Node comparison_pair = GetPredicateComparisonHalf(instr.hset2.cond, meta, op_a, op_b); + const Node comparison_pair = GetPredicateComparisonHalf(instr.hset2.cond, op_a, op_b); const OperationCode combiner = GetPredicateCombiner(instr.hset2.op); diff --git a/src/video_core/shader/decode/half_set_predicate.cpp b/src/video_core/shader/decode/half_set_predicate.cpp index e68512692..6e59eb650 100644 --- a/src/video_core/shader/decode/half_set_predicate.cpp +++ b/src/video_core/shader/decode/half_set_predicate.cpp @@ -19,10 +19,10 @@ u32 ShaderIR::DecodeHalfSetPredicate(NodeBlock& bb, u32 pc) { UNIMPLEMENTED_IF(instr.hsetp2.ftz != 0); - Node op_a = GetRegister(instr.gpr8); + Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hsetp2.type_a); op_a = GetOperandAbsNegHalf(op_a, instr.hsetp2.abs_a, instr.hsetp2.negate_a); - const Node op_b = [&]() { + Node op_b = [&]() { switch (opcode->get().GetId()) { case OpCode::Id::HSETP2_R: return GetOperandAbsNegHalf(GetRegister(instr.gpr20), instr.hsetp2.abs_a, @@ -32,6 +32,7 @@ u32 ShaderIR::DecodeHalfSetPredicate(NodeBlock& bb, u32 pc) { return Immediate(0); } }(); + op_b = UnpackHalfFloat(op_b, instr.hsetp2.type_b); // We can't use the constant predicate as destination. ASSERT(instr.hsetp2.pred3 != static_cast<u64>(Pred::UnusedIndex)); @@ -42,8 +43,7 @@ u32 ShaderIR::DecodeHalfSetPredicate(NodeBlock& bb, u32 pc) { const OperationCode pair_combiner = instr.hsetp2.h_and ? OperationCode::LogicalAll2 : OperationCode::LogicalAny2; - MetaHalfArithmetic meta = {false, {instr.hsetp2.type_a, instr.hsetp2.type_b}}; - const Node comparison = GetPredicateComparisonHalf(instr.hsetp2.cond, meta, op_a, op_b); + const Node comparison = GetPredicateComparisonHalf(instr.hsetp2.cond, op_a, op_b); const Node first_pred = Operation(pair_combiner, comparison); // Set the primary predicate to the result of Predicate OP SecondPredicate diff --git a/src/video_core/shader/decode/hfma2.cpp b/src/video_core/shader/decode/hfma2.cpp index 7a07c5ec6..a425f9eb7 100644 --- a/src/video_core/shader/decode/hfma2.cpp +++ b/src/video_core/shader/decode/hfma2.cpp @@ -27,10 +27,6 @@ u32 ShaderIR::DecodeHfma2(NodeBlock& bb, u32 pc) { } constexpr auto identity = HalfType::H0_H1; - - const HalfType type_a = instr.hfma2.type_a; - const Node op_a = GetRegister(instr.gpr8); - bool neg_b{}, neg_c{}; auto [saturate, type_b, op_b, type_c, op_c] = [&]() -> std::tuple<bool, HalfType, Node, HalfType, Node> { @@ -38,15 +34,14 @@ u32 ShaderIR::DecodeHfma2(NodeBlock& bb, u32 pc) { case OpCode::Id::HFMA2_CR: neg_b = instr.hfma2.negate_b; neg_c = instr.hfma2.negate_c; - return {instr.hfma2.saturate, instr.hfma2.type_b, + return {instr.hfma2.saturate, HalfType::F32, GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset()), instr.hfma2.type_reg39, GetRegister(instr.gpr39)}; case OpCode::Id::HFMA2_RC: neg_b = instr.hfma2.negate_b; neg_c = instr.hfma2.negate_c; return {instr.hfma2.saturate, instr.hfma2.type_reg39, GetRegister(instr.gpr39), - instr.hfma2.type_b, - GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset())}; + HalfType::F32, GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset())}; case OpCode::Id::HFMA2_RR: neg_b = instr.hfma2.rr.negate_b; neg_c = instr.hfma2.rr.negate_c; @@ -60,13 +55,13 @@ u32 ShaderIR::DecodeHfma2(NodeBlock& bb, u32 pc) { return {false, identity, Immediate(0), identity, Immediate(0)}; } }(); - UNIMPLEMENTED_IF_MSG(saturate, "HFMA2 saturation is not implemented"); - op_b = GetOperandAbsNegHalf(op_b, false, neg_b); - op_c = GetOperandAbsNegHalf(op_c, false, neg_c); + const Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hfma2.type_a); + op_b = GetOperandAbsNegHalf(UnpackHalfFloat(op_b, type_b), false, neg_b); + op_c = GetOperandAbsNegHalf(UnpackHalfFloat(op_c, type_c), false, neg_c); - MetaHalfArithmetic meta{true, {type_a, type_b, type_c}}; - Node value = Operation(OperationCode::HFma, meta, op_a, op_b, op_c); + Node value = Operation(OperationCode::HFma, PRECISE, op_a, op_b, op_c); + value = GetSaturatedHalfFloat(value, saturate); value = HalfMerge(GetRegister(instr.gpr0), value, instr.hfma2.merge); SetRegister(bb, instr.gpr0, value); @@ -74,4 +69,4 @@ u32 ShaderIR::DecodeHfma2(NodeBlock& bb, u32 pc) { return pc; } -} // namespace VideoCommon::Shader
\ No newline at end of file +} // namespace VideoCommon::Shader diff --git a/src/video_core/shader/decode/texture.cpp b/src/video_core/shader/decode/texture.cpp index fa65ac9a9..8b574d4e5 100644 --- a/src/video_core/shader/decode/texture.cpp +++ b/src/video_core/shader/decode/texture.cpp @@ -296,7 +296,7 @@ const Sampler& ShaderIR::GetBindlessSampler(const Tegra::Shader::Register& reg, ASSERT(cbuf_offset_imm != nullptr); const auto cbuf_offset = cbuf_offset_imm->GetValue(); const auto cbuf_index = cbuf->GetIndex(); - const u64 cbuf_key = (cbuf_index << 32) | cbuf_offset; + const auto cbuf_key = (static_cast<u64>(cbuf_index) << 32) | static_cast<u64>(cbuf_offset); // If this sampler has already been used, return the existing mapping. const auto itr = @@ -541,7 +541,6 @@ Node4 ShaderIR::GetTld4Code(Instruction instr, TextureType texture_type, bool de bool is_array, bool is_aoffi) { const std::size_t coord_count = GetCoordCount(texture_type); const std::size_t total_coord_count = coord_count + (is_array ? 1 : 0); - const std::size_t total_reg_count = total_coord_count + (depth_compare ? 1 : 0); // If enabled arrays index is always stored in the gpr8 field const u64 array_register = instr.gpr8.Value(); diff --git a/src/video_core/shader/decode/xmad.cpp b/src/video_core/shader/decode/xmad.cpp index db15c0718..04a776398 100644 --- a/src/video_core/shader/decode/xmad.cpp +++ b/src/video_core/shader/decode/xmad.cpp @@ -56,9 +56,10 @@ u32 ShaderIR::DecodeXmad(NodeBlock& bb, u32 pc) { instr.xmad.mode, Immediate(static_cast<u32>(instr.xmad.imm20_16)), GetRegister(instr.gpr39)}; + default: + UNIMPLEMENTED_MSG("Unhandled XMAD instruction: {}", opcode->get().GetName()); + return {false, false, false, Tegra::Shader::XmadMode::None, Immediate(0), Immediate(0)}; } - UNIMPLEMENTED_MSG("Unhandled XMAD instruction: {}", opcode->get().GetName()); - return {false, false, false, Tegra::Shader::XmadMode::None, Immediate(0), Immediate(0)}; }(); op_a = BitfieldExtract(op_a, instr.xmad.high_a ? 16 : 0, 16); diff --git a/src/video_core/shader/shader_ir.cpp b/src/video_core/shader/shader_ir.cpp index ac5112d78..e4eb0dfd9 100644 --- a/src/video_core/shader/shader_ir.cpp +++ b/src/video_core/shader/shader_ir.cpp @@ -189,7 +189,11 @@ Node ShaderIR::UnpackHalfImmediate(Instruction instr, bool has_negation) { const Node first_negate = GetPredicate(instr.half_imm.first_negate != 0); const Node second_negate = GetPredicate(instr.half_imm.second_negate != 0); - return Operation(OperationCode::HNegate, HALF_NO_PRECISE, value, first_negate, second_negate); + return Operation(OperationCode::HNegate, NO_PRECISE, value, first_negate, second_negate); +} + +Node ShaderIR::UnpackHalfFloat(Node value, Tegra::Shader::HalfType type) { + return Operation(OperationCode::HUnpack, type, value); } Node ShaderIR::HalfMerge(Node dest, Node src, Tegra::Shader::HalfMerge merge) { @@ -209,17 +213,26 @@ Node ShaderIR::HalfMerge(Node dest, Node src, Tegra::Shader::HalfMerge merge) { Node ShaderIR::GetOperandAbsNegHalf(Node value, bool absolute, bool negate) { if (absolute) { - value = Operation(OperationCode::HAbsolute, HALF_NO_PRECISE, value); + value = Operation(OperationCode::HAbsolute, NO_PRECISE, value); } if (negate) { - value = Operation(OperationCode::HNegate, HALF_NO_PRECISE, value, GetPredicate(true), + value = Operation(OperationCode::HNegate, NO_PRECISE, value, GetPredicate(true), GetPredicate(true)); } return value; } +Node ShaderIR::GetSaturatedHalfFloat(Node value, bool saturate) { + if (!saturate) { + return value; + } + const Node positive_zero = Immediate(std::copysignf(0, 1)); + const Node positive_one = Immediate(1.0f); + return Operation(OperationCode::HClamp, NO_PRECISE, value, positive_zero, positive_one); +} + Node ShaderIR::GetPredicateComparisonFloat(PredCondition condition, Node op_a, Node op_b) { - static const std::unordered_map<PredCondition, OperationCode> PredicateComparisonTable = { + const std::unordered_map<PredCondition, OperationCode> PredicateComparisonTable = { {PredCondition::LessThan, OperationCode::LogicalFLessThan}, {PredCondition::Equal, OperationCode::LogicalFEqual}, {PredCondition::LessEqual, OperationCode::LogicalFLessEqual}, @@ -255,7 +268,7 @@ Node ShaderIR::GetPredicateComparisonFloat(PredCondition condition, Node op_a, N Node ShaderIR::GetPredicateComparisonInteger(PredCondition condition, bool is_signed, Node op_a, Node op_b) { - static const std::unordered_map<PredCondition, OperationCode> PredicateComparisonTable = { + const std::unordered_map<PredCondition, OperationCode> PredicateComparisonTable = { {PredCondition::LessThan, OperationCode::LogicalILessThan}, {PredCondition::Equal, OperationCode::LogicalIEqual}, {PredCondition::LessEqual, OperationCode::LogicalILessEqual}, @@ -283,40 +296,32 @@ Node ShaderIR::GetPredicateComparisonInteger(PredCondition condition, bool is_si return predicate; } -Node ShaderIR::GetPredicateComparisonHalf(Tegra::Shader::PredCondition condition, - const MetaHalfArithmetic& meta, Node op_a, Node op_b) { - - UNIMPLEMENTED_IF_MSG(condition == PredCondition::LessThanWithNan || - condition == PredCondition::NotEqualWithNan || - condition == PredCondition::LessEqualWithNan || - condition == PredCondition::GreaterThanWithNan || - condition == PredCondition::GreaterEqualWithNan, - "Unimplemented NaN comparison for half floats"); - - static const std::unordered_map<PredCondition, OperationCode> PredicateComparisonTable = { +Node ShaderIR::GetPredicateComparisonHalf(Tegra::Shader::PredCondition condition, Node op_a, + Node op_b) { + const std::unordered_map<PredCondition, OperationCode> PredicateComparisonTable = { {PredCondition::LessThan, OperationCode::Logical2HLessThan}, {PredCondition::Equal, OperationCode::Logical2HEqual}, {PredCondition::LessEqual, OperationCode::Logical2HLessEqual}, {PredCondition::GreaterThan, OperationCode::Logical2HGreaterThan}, {PredCondition::NotEqual, OperationCode::Logical2HNotEqual}, {PredCondition::GreaterEqual, OperationCode::Logical2HGreaterEqual}, - {PredCondition::LessThanWithNan, OperationCode::Logical2HLessThan}, - {PredCondition::NotEqualWithNan, OperationCode::Logical2HNotEqual}, - {PredCondition::LessEqualWithNan, OperationCode::Logical2HLessEqual}, - {PredCondition::GreaterThanWithNan, OperationCode::Logical2HGreaterThan}, - {PredCondition::GreaterEqualWithNan, OperationCode::Logical2HGreaterEqual}}; + {PredCondition::LessThanWithNan, OperationCode::Logical2HLessThanWithNan}, + {PredCondition::NotEqualWithNan, OperationCode::Logical2HNotEqualWithNan}, + {PredCondition::LessEqualWithNan, OperationCode::Logical2HLessEqualWithNan}, + {PredCondition::GreaterThanWithNan, OperationCode::Logical2HGreaterThanWithNan}, + {PredCondition::GreaterEqualWithNan, OperationCode::Logical2HGreaterEqualWithNan}}; const auto comparison{PredicateComparisonTable.find(condition)}; UNIMPLEMENTED_IF_MSG(comparison == PredicateComparisonTable.end(), "Unknown predicate comparison operation"); - const Node predicate = Operation(comparison->second, meta, op_a, op_b); + const Node predicate = Operation(comparison->second, NO_PRECISE, op_a, op_b); return predicate; } OperationCode ShaderIR::GetPredicateCombiner(PredOperation operation) { - static const std::unordered_map<PredOperation, OperationCode> PredicateOperationTable = { + const std::unordered_map<PredOperation, OperationCode> PredicateOperationTable = { {PredOperation::And, OperationCode::LogicalAnd}, {PredOperation::Or, OperationCode::LogicalOr}, {PredOperation::Xor, OperationCode::LogicalXor}, @@ -434,11 +439,14 @@ Node ShaderIR::BitfieldExtract(Node value, u32 offset, u32 bits) { return OperationCode::LogicalUGreaterEqual; case OperationCode::INegate: UNREACHABLE_MSG("Can't negate an unsigned integer"); + return {}; case OperationCode::IAbsolute: UNREACHABLE_MSG("Can't apply absolute to an unsigned integer"); + return {}; + default: + UNREACHABLE_MSG("Unknown signed operation with code={}", static_cast<u32>(operation_code)); + return {}; } - UNREACHABLE_MSG("Unknown signed operation with code={}", static_cast<u32>(operation_code)); - return {}; } -} // namespace VideoCommon::Shader
\ No newline at end of file +} // namespace VideoCommon::Shader diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h index 57af8b10f..65f1e1de9 100644 --- a/src/video_core/shader/shader_ir.h +++ b/src/video_core/shader/shader_ir.h @@ -109,11 +109,13 @@ enum class OperationCode { UBitfieldExtract, /// (MetaArithmetic, uint value, int offset, int offset) -> uint UBitCount, /// (MetaArithmetic, uint) -> uint - HAdd, /// (MetaHalfArithmetic, f16vec2 a, f16vec2 b) -> f16vec2 - HMul, /// (MetaHalfArithmetic, f16vec2 a, f16vec2 b) -> f16vec2 - HFma, /// (MetaHalfArithmetic, f16vec2 a, f16vec2 b, f16vec2 c) -> f16vec2 + HAdd, /// (MetaArithmetic, f16vec2 a, f16vec2 b) -> f16vec2 + HMul, /// (MetaArithmetic, f16vec2 a, f16vec2 b) -> f16vec2 + HFma, /// (MetaArithmetic, f16vec2 a, f16vec2 b, f16vec2 c) -> f16vec2 HAbsolute, /// (f16vec2 a) -> f16vec2 HNegate, /// (f16vec2 a, bool first, bool second) -> f16vec2 + HClamp, /// (f16vec2 src, float min, float max) -> f16vec2 + HUnpack, /// (Tegra::Shader::HalfType, T value) -> f16vec2 HMergeF32, /// (f16vec2 src) -> float HMergeH0, /// (f16vec2 dest, f16vec2 src) -> f16vec2 HMergeH1, /// (f16vec2 dest, f16vec2 src) -> f16vec2 @@ -150,12 +152,18 @@ enum class OperationCode { LogicalUNotEqual, /// (uint a, uint b) -> bool LogicalUGreaterEqual, /// (uint a, uint b) -> bool - Logical2HLessThan, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 - Logical2HEqual, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 - Logical2HLessEqual, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 - Logical2HGreaterThan, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 - Logical2HNotEqual, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 - Logical2HGreaterEqual, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 + Logical2HLessThan, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 + Logical2HEqual, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 + Logical2HLessEqual, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 + Logical2HGreaterThan, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 + Logical2HNotEqual, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 + Logical2HGreaterEqual, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 + Logical2HLessThanWithNan, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 + Logical2HEqualWithNan, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 + Logical2HLessEqualWithNan, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 + Logical2HGreaterThanWithNan, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 + Logical2HNotEqualWithNan, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 + Logical2HGreaterEqualWithNan, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 Texture, /// (MetaTexture, float[N] coords) -> float4 TextureLod, /// (MetaTexture, float[N] coords) -> float4 @@ -243,8 +251,9 @@ public: } bool operator<(const Sampler& rhs) const { - return std::tie(offset, index, type, is_array, is_shadow) < - std::tie(rhs.offset, rhs.index, rhs.type, rhs.is_array, rhs.is_shadow); + return std::tie(index, offset, type, is_array, is_shadow, is_bindless) < + std::tie(rhs.index, rhs.offset, rhs.type, rhs.is_array, rhs.is_shadow, + rhs.is_bindless); } private: @@ -308,13 +317,6 @@ struct MetaArithmetic { bool precise{}; }; -struct MetaHalfArithmetic { - bool precise{}; - std::array<Tegra::Shader::HalfType, 3> types = {Tegra::Shader::HalfType::H0_H1, - Tegra::Shader::HalfType::H0_H1, - Tegra::Shader::HalfType::H0_H1}; -}; - struct MetaTexture { const Sampler& sampler; Node array{}; @@ -326,11 +328,10 @@ struct MetaTexture { u32 element{}; }; -constexpr MetaArithmetic PRECISE = {true}; -constexpr MetaArithmetic NO_PRECISE = {false}; -constexpr MetaHalfArithmetic HALF_NO_PRECISE = {false}; +inline constexpr MetaArithmetic PRECISE = {true}; +inline constexpr MetaArithmetic NO_PRECISE = {false}; -using Meta = std::variant<MetaArithmetic, MetaHalfArithmetic, MetaTexture>; +using Meta = std::variant<MetaArithmetic, MetaTexture, Tegra::Shader::HalfType>; /// Holds any kind of operation that can be done in the IR class OperationNode final { @@ -734,10 +735,14 @@ private: /// Unpacks a half immediate from an instruction Node UnpackHalfImmediate(Tegra::Shader::Instruction instr, bool has_negation); + /// Unpacks a binary value into a half float pair with a type format + Node UnpackHalfFloat(Node value, Tegra::Shader::HalfType type); /// Merges a half pair into another value Node HalfMerge(Node dest, Node src, Tegra::Shader::HalfMerge merge); /// Conditionally absolute/negated half float pair. Absolute is applied first Node GetOperandAbsNegHalf(Node value, bool absolute, bool negate); + /// Conditionally saturates a half float pair + Node GetSaturatedHalfFloat(Node value, bool saturate = true); /// Returns a predicate comparing two floats Node GetPredicateComparisonFloat(Tegra::Shader::PredCondition condition, Node op_a, Node op_b); @@ -745,8 +750,7 @@ private: Node GetPredicateComparisonInteger(Tegra::Shader::PredCondition condition, bool is_signed, Node op_a, Node op_b); /// Returns a predicate comparing two half floats. meta consumes how both pairs will be compared - Node GetPredicateComparisonHalf(Tegra::Shader::PredCondition condition, - const MetaHalfArithmetic& meta, Node op_a, Node op_b); + Node GetPredicateComparisonHalf(Tegra::Shader::PredCondition condition, Node op_a, Node op_b); /// Returns a predicate combiner operation OperationCode GetPredicateCombiner(Tegra::Shader::PredOperation operation); diff --git a/src/video_core/surface.cpp b/src/video_core/surface.cpp index 3b022a456..6384fa8d2 100644 --- a/src/video_core/surface.cpp +++ b/src/video_core/surface.cpp @@ -178,39 +178,44 @@ PixelFormat PixelFormatFromTextureFormat(Tegra::Texture::TextureFormat format, return PixelFormat::ABGR8S; case Tegra::Texture::ComponentType::UINT: return PixelFormat::ABGR8UI; + default: + break; } - LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type)); - UNREACHABLE(); + break; case Tegra::Texture::TextureFormat::B5G6R5: switch (component_type) { case Tegra::Texture::ComponentType::UNORM: return PixelFormat::B5G6R5U; + default: + break; } - LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type)); - UNREACHABLE(); + break; case Tegra::Texture::TextureFormat::A2B10G10R10: switch (component_type) { case Tegra::Texture::ComponentType::UNORM: return PixelFormat::A2B10G10R10U; + default: + break; } - LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type)); - UNREACHABLE(); + break; case Tegra::Texture::TextureFormat::A1B5G5R5: switch (component_type) { case Tegra::Texture::ComponentType::UNORM: return PixelFormat::A1B5G5R5U; + default: + break; } - LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type)); - UNREACHABLE(); + break; case Tegra::Texture::TextureFormat::R8: switch (component_type) { case Tegra::Texture::ComponentType::UNORM: return PixelFormat::R8U; case Tegra::Texture::ComponentType::UINT: return PixelFormat::R8UI; + default: + break; } - LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type)); - UNREACHABLE(); + break; case Tegra::Texture::TextureFormat::G8R8: // TextureFormat::G8R8 is actually ordered red then green, as such we can use // PixelFormat::RG8U and PixelFormat::RG8S. This was tested with The Legend of Zelda: Breath @@ -220,50 +225,55 @@ PixelFormat PixelFormatFromTextureFormat(Tegra::Texture::TextureFormat format, return PixelFormat::RG8U; case Tegra::Texture::ComponentType::SNORM: return PixelFormat::RG8S; + default: + break; } - LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type)); - UNREACHABLE(); + break; case Tegra::Texture::TextureFormat::R16_G16_B16_A16: switch (component_type) { case Tegra::Texture::ComponentType::UNORM: return PixelFormat::RGBA16U; case Tegra::Texture::ComponentType::FLOAT: return PixelFormat::RGBA16F; + default: + break; } - LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type)); - UNREACHABLE(); + break; case Tegra::Texture::TextureFormat::BF10GF11RF11: switch (component_type) { case Tegra::Texture::ComponentType::FLOAT: return PixelFormat::R11FG11FB10F; + default: + break; } - LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type)); - UNREACHABLE(); case Tegra::Texture::TextureFormat::R32_G32_B32_A32: switch (component_type) { case Tegra::Texture::ComponentType::FLOAT: return PixelFormat::RGBA32F; case Tegra::Texture::ComponentType::UINT: return PixelFormat::RGBA32UI; + default: + break; } - LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type)); - UNREACHABLE(); + break; case Tegra::Texture::TextureFormat::R32_G32: switch (component_type) { case Tegra::Texture::ComponentType::FLOAT: return PixelFormat::RG32F; case Tegra::Texture::ComponentType::UINT: return PixelFormat::RG32UI; + default: + break; } - LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type)); - UNREACHABLE(); + break; case Tegra::Texture::TextureFormat::R32_G32_B32: switch (component_type) { case Tegra::Texture::ComponentType::FLOAT: return PixelFormat::RGB32F; + default: + break; } - LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type)); - UNREACHABLE(); + break; case Tegra::Texture::TextureFormat::R16: switch (component_type) { case Tegra::Texture::ComponentType::FLOAT: @@ -276,18 +286,20 @@ PixelFormat PixelFormatFromTextureFormat(Tegra::Texture::TextureFormat format, return PixelFormat::R16UI; case Tegra::Texture::ComponentType::SINT: return PixelFormat::R16I; + default: + break; } - LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type)); - UNREACHABLE(); + break; case Tegra::Texture::TextureFormat::R32: switch (component_type) { case Tegra::Texture::ComponentType::FLOAT: return PixelFormat::R32F; case Tegra::Texture::ComponentType::UINT: return PixelFormat::R32UI; + default: + break; } - LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type)); - UNREACHABLE(); + break; case Tegra::Texture::TextureFormat::ZF32: return PixelFormat::Z32F; case Tegra::Texture::TextureFormat::Z16: @@ -310,9 +322,10 @@ PixelFormat PixelFormatFromTextureFormat(Tegra::Texture::TextureFormat format, return PixelFormat::DXN2UNORM; case Tegra::Texture::ComponentType::SNORM: return PixelFormat::DXN2SNORM; + default: + break; } - LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type)); - UNREACHABLE(); + break; case Tegra::Texture::TextureFormat::BC7U: return is_srgb ? PixelFormat::BC7U_SRGB : PixelFormat::BC7U; case Tegra::Texture::TextureFormat::BC6H_UF16: @@ -343,15 +356,17 @@ PixelFormat PixelFormatFromTextureFormat(Tegra::Texture::TextureFormat format, return PixelFormat::RG16UI; case Tegra::Texture::ComponentType::SINT: return PixelFormat::RG16I; + default: + break; } - LOG_CRITICAL(HW_GPU, "Unimplemented component_type={}", static_cast<u32>(component_type)); - UNREACHABLE(); + break; default: - LOG_CRITICAL(HW_GPU, "Unimplemented format={}, component_type={}", static_cast<u32>(format), - static_cast<u32>(component_type)); - UNREACHABLE(); - return PixelFormat::ABGR8U; + break; } + LOG_CRITICAL(HW_GPU, "Unimplemented format={}, component_type={}", static_cast<u32>(format), + static_cast<u32>(component_type)); + UNREACHABLE(); + return PixelFormat::ABGR8U; } ComponentType ComponentTypeFromTexture(Tegra::Texture::ComponentType type) { @@ -513,8 +528,9 @@ bool IsFormatBCn(PixelFormat format) { case PixelFormat::DXT45_SRGB: case PixelFormat::BC7U_SRGB: return true; + default: + return false; } - return false; } } // namespace VideoCore::Surface diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp index b508d64e9..eafb6b73a 100644 --- a/src/video_core/textures/astc.cpp +++ b/src/video_core/textures/astc.cpp @@ -1616,6 +1616,7 @@ namespace Tegra::Texture::ASTC { std::vector<uint8_t> Decompress(const uint8_t* data, uint32_t width, uint32_t height, uint32_t depth, uint32_t block_width, uint32_t block_height) { uint32_t blockIdx = 0; + std::size_t depth_offset = 0; std::vector<uint8_t> outData(height * width * depth * 4); for (uint32_t k = 0; k < depth; k++) { for (uint32_t j = 0; j < height; j += block_height) { @@ -1630,7 +1631,7 @@ std::vector<uint8_t> Decompress(const uint8_t* data, uint32_t width, uint32_t he uint32_t decompWidth = std::min(block_width, width - i); uint32_t decompHeight = std::min(block_height, height - j); - uint8_t* outRow = outData.data() + (j * width + i) * 4; + uint8_t* outRow = depth_offset + outData.data() + (j * width + i) * 4; for (uint32_t jj = 0; jj < decompHeight; jj++) { memcpy(outRow + jj * width * 4, uncompData + jj * block_width, decompWidth * 4); } @@ -1638,6 +1639,7 @@ std::vector<uint8_t> Decompress(const uint8_t* data, uint32_t width, uint32_t he blockIdx++; } } + depth_offset += height * width * 4; } return outData; diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp index 995d0e068..217805386 100644 --- a/src/video_core/textures/decoders.cpp +++ b/src/video_core/textures/decoders.cpp @@ -288,6 +288,29 @@ void UnswizzleSubrect(u32 subrect_width, u32 subrect_height, u32 dest_pitch, u32 } } +void SwizzleKepler(const u32 width, const u32 height, const u32 dst_x, const u32 dst_y, + const u32 block_height, const std::size_t copy_size, const u8* source_data, + u8* swizzle_data) { + const u32 image_width_in_gobs{(width + gob_size_x - 1) / gob_size_x}; + std::size_t count = 0; + for (std::size_t y = dst_y; y < height && count < copy_size; ++y) { + const std::size_t gob_address_y = + (y / (gob_size_y * block_height)) * gob_size * block_height * image_width_in_gobs + + ((y % (gob_size_y * block_height)) / gob_size_y) * gob_size; + const auto& table = legacy_swizzle_table[y % gob_size_y]; + for (std::size_t x = dst_x; x < width && count < copy_size; ++x) { + const std::size_t gob_address = + gob_address_y + (x / gob_size_x) * gob_size * block_height; + const std::size_t swizzled_offset = gob_address + table[x % gob_size_x]; + const u8* source_line = source_data + count; + u8* dest_addr = swizzle_data + swizzled_offset; + count++; + + std::memcpy(dest_addr, source_line, 1); + } + } +} + std::vector<u8> DecodeTexture(const std::vector<u8>& texture_data, TextureFormat format, u32 width, u32 height) { std::vector<u8> rgba_data; diff --git a/src/video_core/textures/decoders.h b/src/video_core/textures/decoders.h index e078fa274..e072d8401 100644 --- a/src/video_core/textures/decoders.h +++ b/src/video_core/textures/decoders.h @@ -51,4 +51,8 @@ void UnswizzleSubrect(u32 subrect_width, u32 subrect_height, u32 dest_pitch, u32 u32 bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, u32 block_height, u32 offset_x, u32 offset_y); +void SwizzleKepler(const u32 width, const u32 height, const u32 dst_x, const u32 dst_y, + const u32 block_height, const std::size_t copy_size, const u8* source_data, + u8* swizzle_data); + } // namespace Tegra::Texture |