diff options
-rw-r--r-- | src/common/page_table.cpp | 12 | ||||
-rw-r--r-- | src/common/page_table.h | 15 | ||||
-rw-r--r-- | src/video_core/engines/shader_bytecode.h | 11 | ||||
-rw-r--r-- | src/video_core/gpu.h | 1 | ||||
-rw-r--r-- | src/video_core/memory_manager.h | 2 | ||||
-rw-r--r-- | src/video_core/morton.cpp | 2 | ||||
-rw-r--r-- | src/video_core/renderer_opengl/gl_shader_decompiler.cpp | 15 | ||||
-rw-r--r-- | src/video_core/renderer_opengl/gl_texture_cache.cpp | 1 | ||||
-rw-r--r-- | src/video_core/renderer_opengl/renderer_opengl.cpp | 288 | ||||
-rw-r--r-- | src/video_core/renderer_vulkan/maxwell_to_vk.cpp | 7 | ||||
-rw-r--r-- | src/video_core/renderer_vulkan/vk_device.cpp | 1 | ||||
-rw-r--r-- | src/video_core/renderer_vulkan/vk_rasterizer.cpp | 2 | ||||
-rw-r--r-- | src/video_core/renderer_vulkan/vk_shader_decompiler.cpp | 2 | ||||
-rw-r--r-- | src/video_core/shader/decode/bfe.cpp | 69 | ||||
-rw-r--r-- | src/video_core/shader/node_helper.cpp | 2 | ||||
-rw-r--r-- | src/video_core/surface.cpp | 2 | ||||
-rw-r--r-- | src/video_core/surface.h | 142 | ||||
-rw-r--r-- | src/video_core/texture_cache/format_lookup_table.cpp | 3 |
18 files changed, 349 insertions, 228 deletions
diff --git a/src/common/page_table.cpp b/src/common/page_table.cpp index 69b7abc54..566b57b62 100644 --- a/src/common/page_table.cpp +++ b/src/common/page_table.cpp @@ -16,7 +16,6 @@ void PageTable::Resize(std::size_t address_space_width_in_bits) { pointers.resize(num_page_table_entries); attributes.resize(num_page_table_entries); - backing_addr.resize(num_page_table_entries); // The default is a 39-bit address space, which causes an initial 1GB allocation size. If the // vector size is subsequently decreased (via resize), the vector might not automatically @@ -25,6 +24,17 @@ void PageTable::Resize(std::size_t address_space_width_in_bits) { pointers.shrink_to_fit(); attributes.shrink_to_fit(); +} + +BackingPageTable::BackingPageTable(std::size_t page_size_in_bits) : PageTable{page_size_in_bits} {} + +BackingPageTable::~BackingPageTable() = default; + +void BackingPageTable::Resize(std::size_t address_space_width_in_bits) { + PageTable::Resize(address_space_width_in_bits); + const std::size_t num_page_table_entries = 1ULL + << (address_space_width_in_bits - page_size_in_bits); + backing_addr.resize(num_page_table_entries); backing_addr.shrink_to_fit(); } diff --git a/src/common/page_table.h b/src/common/page_table.h index 8b8ff0bb8..dbc272ab7 100644 --- a/src/common/page_table.h +++ b/src/common/page_table.h @@ -76,9 +76,20 @@ struct PageTable { */ std::vector<PageType> attributes; - std::vector<u64> backing_addr; - const std::size_t page_size_in_bits{}; }; +/** + * A more advanced Page Table with the ability to save a backing address when using it + * depends on another MMU. + */ +struct BackingPageTable : PageTable { + explicit BackingPageTable(std::size_t page_size_in_bits); + ~BackingPageTable(); + + void Resize(std::size_t address_space_width_in_bits); + + std::vector<u64> backing_addr; +}; + } // namespace Common diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h index c9bc83cd7..eba42deb4 100644 --- a/src/video_core/engines/shader_bytecode.h +++ b/src/video_core/engines/shader_bytecode.h @@ -911,14 +911,9 @@ union Instruction { } fadd32i; union { - BitField<20, 8, u64> shift_position; - BitField<28, 8, u64> shift_length; - BitField<48, 1, u64> negate_b; - BitField<49, 1, u64> negate_a; - - u64 GetLeftShiftValue() const { - return 32 - (shift_position + shift_length); - } + BitField<40, 1, u64> brev; + BitField<47, 1, u64> rd_cc; + BitField<48, 1, u64> is_signed; } bfe; union { diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index ba8c9d665..64acb17df 100644 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -39,6 +39,7 @@ enum class RenderTargetFormat : u32 { RGBA32_FLOAT = 0xC0, RGBA32_UINT = 0xC2, RGBA16_UNORM = 0xC6, + RGBA16_SNORM = 0xC7, RGBA16_UINT = 0xC9, RGBA16_FLOAT = 0xCA, RG32_FLOAT = 0xCB, diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h index aea010087..073bdb491 100644 --- a/src/video_core/memory_manager.h +++ b/src/video_core/memory_manager.h @@ -174,7 +174,7 @@ private: /// End of address space, based on address space in bits. static constexpr GPUVAddr address_space_end{1ULL << address_space_width}; - Common::PageTable page_table{page_bits}; + Common::BackingPageTable page_table{page_bits}; VMAMap vma_map; VideoCore::RasterizerInterface& rasterizer; diff --git a/src/video_core/morton.cpp b/src/video_core/morton.cpp index f2c83266e..6d522c318 100644 --- a/src/video_core/morton.cpp +++ b/src/video_core/morton.cpp @@ -51,6 +51,7 @@ static constexpr ConversionArray morton_to_linear_fns = { MortonCopy<true, PixelFormat::R8UI>, MortonCopy<true, PixelFormat::RGBA16F>, MortonCopy<true, PixelFormat::RGBA16U>, + MortonCopy<true, PixelFormat::RGBA16S>, MortonCopy<true, PixelFormat::RGBA16UI>, MortonCopy<true, PixelFormat::R11FG11FB10F>, MortonCopy<true, PixelFormat::RGBA32UI>, @@ -131,6 +132,7 @@ static constexpr ConversionArray linear_to_morton_fns = { MortonCopy<false, PixelFormat::R8U>, MortonCopy<false, PixelFormat::R8UI>, MortonCopy<false, PixelFormat::RGBA16F>, + MortonCopy<false, PixelFormat::RGBA16S>, MortonCopy<false, PixelFormat::RGBA16U>, MortonCopy<false, PixelFormat::RGBA16UI>, MortonCopy<false, PixelFormat::R11FG11FB10F>, diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index 3adf7f0cb..849839fe3 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -2009,16 +2009,19 @@ private: expr += GetSampler(meta->sampler); expr += ", "; - expr += constructors.at(operation.GetOperandsCount() - 1); + expr += constructors.at(operation.GetOperandsCount() + (meta->array ? 1 : 0) - 1); expr += '('; for (std::size_t i = 0; i < count; ++i) { - expr += VisitOperand(operation, i).AsInt(); - const std::size_t next = i + 1; - if (next == count) - expr += ')'; - else if (next < count) + if (i > 0) { expr += ", "; + } + expr += VisitOperand(operation, i).AsInt(); } + if (meta->array) { + expr += ", "; + expr += Visit(meta->array).AsInt(); + } + expr += ')'; if (meta->lod && !meta->sampler.IsBuffer()) { expr += ", "; diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 2d3838a7a..f424e3000 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -53,6 +53,7 @@ constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> tex_format {GL_R8UI, GL_RED_INTEGER, GL_UNSIGNED_BYTE, false}, // R8UI {GL_RGBA16F, GL_RGBA, GL_HALF_FLOAT, false}, // RGBA16F {GL_RGBA16, GL_RGBA, GL_UNSIGNED_SHORT, false}, // RGBA16U + {GL_RGBA16_SNORM, GL_RGBA, GL_SHORT, false}, // RGBA16S {GL_RGBA16UI, GL_RGBA_INTEGER, GL_UNSIGNED_SHORT, false}, // RGBA16UI {GL_R11F_G11F_B10F, GL_RGB, GL_UNSIGNED_INT_10F_11F_11F_REV, false}, // R11FG11FB10F {GL_RGBA32UI, GL_RGBA_INTEGER, GL_UNSIGNED_INT, false}, // RGBA32UI diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index 12333e8c9..fca5e3ec0 100644 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -5,8 +5,11 @@ #include <algorithm> #include <cstddef> #include <cstdlib> +#include <cstring> #include <memory> + #include <glad/glad.h> + #include "common/assert.h" #include "common/logging/log.h" #include "common/microprofile.h" @@ -25,6 +28,8 @@ namespace OpenGL { +namespace { + // If the size of this is too small, it ends up creating a soft cap on FPS as the renderer will have // to wait on available presentation frames. constexpr std::size_t SWAP_CHAIN_SIZE = 3; @@ -41,124 +46,6 @@ struct Frame { bool is_srgb{}; /// Framebuffer is sRGB or RGB }; -/** - * For smooth Vsync rendering, we want to always present the latest frame that the core generates, - * but also make sure that rendering happens at the pace that the frontend dictates. This is a - * helper class that the renderer uses to sync frames between the render thread and the presentation - * thread - */ -class FrameMailbox { -public: - std::mutex swap_chain_lock; - std::condition_variable present_cv; - std::array<Frame, SWAP_CHAIN_SIZE> swap_chain{}; - std::queue<Frame*> free_queue; - std::deque<Frame*> present_queue; - Frame* previous_frame{}; - - FrameMailbox() { - for (auto& frame : swap_chain) { - free_queue.push(&frame); - } - } - - ~FrameMailbox() { - // lock the mutex and clear out the present and free_queues and notify any people who are - // blocked to prevent deadlock on shutdown - std::scoped_lock lock{swap_chain_lock}; - std::queue<Frame*>().swap(free_queue); - present_queue.clear(); - present_cv.notify_all(); - } - - void ReloadPresentFrame(Frame* frame, u32 height, u32 width) { - frame->present.Release(); - frame->present.Create(); - GLint previous_draw_fbo{}; - glGetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &previous_draw_fbo); - glBindFramebuffer(GL_FRAMEBUFFER, frame->present.handle); - glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER, - frame->color.handle); - if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) { - LOG_CRITICAL(Render_OpenGL, "Failed to recreate present FBO!"); - } - glBindFramebuffer(GL_DRAW_FRAMEBUFFER, previous_draw_fbo); - frame->color_reloaded = false; - } - - void ReloadRenderFrame(Frame* frame, u32 width, u32 height) { - // Recreate the color texture attachment - frame->color.Release(); - frame->color.Create(); - const GLenum internal_format = frame->is_srgb ? GL_SRGB8 : GL_RGB8; - glNamedRenderbufferStorage(frame->color.handle, internal_format, width, height); - - // Recreate the FBO for the render target - frame->render.Release(); - frame->render.Create(); - glBindFramebuffer(GL_FRAMEBUFFER, frame->render.handle); - glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER, - frame->color.handle); - if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) { - LOG_CRITICAL(Render_OpenGL, "Failed to recreate render FBO!"); - } - - frame->width = width; - frame->height = height; - frame->color_reloaded = true; - } - - Frame* GetRenderFrame() { - std::unique_lock lock{swap_chain_lock}; - - // If theres no free frames, we will reuse the oldest render frame - if (free_queue.empty()) { - auto frame = present_queue.back(); - present_queue.pop_back(); - return frame; - } - - Frame* frame = free_queue.front(); - free_queue.pop(); - return frame; - } - - void ReleaseRenderFrame(Frame* frame) { - std::unique_lock lock{swap_chain_lock}; - present_queue.push_front(frame); - present_cv.notify_one(); - } - - Frame* TryGetPresentFrame(int timeout_ms) { - std::unique_lock lock{swap_chain_lock}; - // wait for new entries in the present_queue - present_cv.wait_for(lock, std::chrono::milliseconds(timeout_ms), - [&] { return !present_queue.empty(); }); - if (present_queue.empty()) { - // timed out waiting for a frame to draw so return the previous frame - return previous_frame; - } - - // free the previous frame and add it back to the free queue - if (previous_frame) { - free_queue.push(previous_frame); - } - - // the newest entries are pushed to the front of the queue - Frame* frame = present_queue.front(); - present_queue.pop_front(); - // remove all old entries from the present queue and move them back to the free_queue - for (auto f : present_queue) { - free_queue.push(f); - } - present_queue.clear(); - previous_frame = frame; - return frame; - } -}; - -namespace { - constexpr char VERTEX_SHADER[] = R"( #version 430 core @@ -211,6 +98,24 @@ struct ScreenRectVertex { std::array<GLfloat, 2> tex_coord; }; +/// Returns true if any debug tool is attached +bool HasDebugTool() { + const bool nsight = std::getenv("NVTX_INJECTION64_PATH") || std::getenv("NSIGHT_LAUNCHED"); + if (nsight) { + return true; + } + + GLint num_extensions; + glGetIntegerv(GL_NUM_EXTENSIONS, &num_extensions); + for (GLuint index = 0; index < static_cast<GLuint>(num_extensions); ++index) { + const auto name = reinterpret_cast<const char*>(glGetStringi(GL_EXTENSIONS, index)); + if (!std::strcmp(name, "GL_EXT_debug_tool")) { + return true; + } + } + return false; +} + /** * Defines a 1:1 pixel ortographic projection matrix with (0,0) on the top-left * corner and (width, height) on the lower-bottom. @@ -294,6 +199,153 @@ void APIENTRY DebugHandler(GLenum source, GLenum type, GLuint id, GLenum severit } // Anonymous namespace +/** + * For smooth Vsync rendering, we want to always present the latest frame that the core generates, + * but also make sure that rendering happens at the pace that the frontend dictates. This is a + * helper class that the renderer uses to sync frames between the render thread and the presentation + * thread + */ +class FrameMailbox { +public: + std::mutex swap_chain_lock; + std::condition_variable present_cv; + std::array<Frame, SWAP_CHAIN_SIZE> swap_chain{}; + std::queue<Frame*> free_queue; + std::deque<Frame*> present_queue; + Frame* previous_frame{}; + + FrameMailbox() : has_debug_tool{HasDebugTool()} { + for (auto& frame : swap_chain) { + free_queue.push(&frame); + } + } + + ~FrameMailbox() { + // lock the mutex and clear out the present and free_queues and notify any people who are + // blocked to prevent deadlock on shutdown + std::scoped_lock lock{swap_chain_lock}; + std::queue<Frame*>().swap(free_queue); + present_queue.clear(); + present_cv.notify_all(); + } + + void ReloadPresentFrame(Frame* frame, u32 height, u32 width) { + frame->present.Release(); + frame->present.Create(); + GLint previous_draw_fbo{}; + glGetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &previous_draw_fbo); + glBindFramebuffer(GL_FRAMEBUFFER, frame->present.handle); + glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER, + frame->color.handle); + if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) { + LOG_CRITICAL(Render_OpenGL, "Failed to recreate present FBO!"); + } + glBindFramebuffer(GL_DRAW_FRAMEBUFFER, previous_draw_fbo); + frame->color_reloaded = false; + } + + void ReloadRenderFrame(Frame* frame, u32 width, u32 height) { + // Recreate the color texture attachment + frame->color.Release(); + frame->color.Create(); + const GLenum internal_format = frame->is_srgb ? GL_SRGB8 : GL_RGB8; + glNamedRenderbufferStorage(frame->color.handle, internal_format, width, height); + + // Recreate the FBO for the render target + frame->render.Release(); + frame->render.Create(); + glBindFramebuffer(GL_FRAMEBUFFER, frame->render.handle); + glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_RENDERBUFFER, + frame->color.handle); + if (glCheckFramebufferStatus(GL_FRAMEBUFFER) != GL_FRAMEBUFFER_COMPLETE) { + LOG_CRITICAL(Render_OpenGL, "Failed to recreate render FBO!"); + } + + frame->width = width; + frame->height = height; + frame->color_reloaded = true; + } + + Frame* GetRenderFrame() { + std::unique_lock lock{swap_chain_lock}; + + // If theres no free frames, we will reuse the oldest render frame + if (free_queue.empty()) { + auto frame = present_queue.back(); + present_queue.pop_back(); + return frame; + } + + Frame* frame = free_queue.front(); + free_queue.pop(); + return frame; + } + + void ReleaseRenderFrame(Frame* frame) { + std::unique_lock lock{swap_chain_lock}; + present_queue.push_front(frame); + present_cv.notify_one(); + + DebugNotifyNextFrame(); + } + + Frame* TryGetPresentFrame(int timeout_ms) { + DebugWaitForNextFrame(); + + std::unique_lock lock{swap_chain_lock}; + // wait for new entries in the present_queue + present_cv.wait_for(lock, std::chrono::milliseconds(timeout_ms), + [&] { return !present_queue.empty(); }); + if (present_queue.empty()) { + // timed out waiting for a frame to draw so return the previous frame + return previous_frame; + } + + // free the previous frame and add it back to the free queue + if (previous_frame) { + free_queue.push(previous_frame); + } + + // the newest entries are pushed to the front of the queue + Frame* frame = present_queue.front(); + present_queue.pop_front(); + // remove all old entries from the present queue and move them back to the free_queue + for (auto f : present_queue) { + free_queue.push(f); + } + present_queue.clear(); + previous_frame = frame; + return frame; + } + +private: + std::mutex debug_synch_mutex; + std::condition_variable debug_synch_condition; + std::atomic_int frame_for_debug{}; + const bool has_debug_tool; // When true, using a GPU debugger, so keep frames in lock-step + + /// Signal that a new frame is available (called from GPU thread) + void DebugNotifyNextFrame() { + if (!has_debug_tool) { + return; + } + frame_for_debug++; + std::lock_guard lock{debug_synch_mutex}; + debug_synch_condition.notify_one(); + } + + /// Wait for a new frame to be available (called from presentation thread) + void DebugWaitForNextFrame() { + if (!has_debug_tool) { + return; + } + const int last_frame = frame_for_debug; + std::unique_lock lock{debug_synch_mutex}; + debug_synch_condition.wait(lock, + [this, last_frame] { return frame_for_debug > last_frame; }); + } +}; + RendererOpenGL::RendererOpenGL(Core::Frontend::EmuWindow& emu_window, Core::System& system) : VideoCore::RendererBase{emu_window}, emu_window{emu_window}, system{system}, frame_mailbox{std::make_unique<FrameMailbox>()} {} diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp index df3ac707c..0e2e5e6c7 100644 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp @@ -125,6 +125,7 @@ struct FormatTuple { {vk::Format::eR8Uint, Attachable | Storage}, // R8UI {vk::Format::eR16G16B16A16Sfloat, Attachable | Storage}, // RGBA16F {vk::Format::eR16G16B16A16Unorm, Attachable | Storage}, // RGBA16U + {vk::Format::eR16G16B16A16Snorm, Attachable | Storage}, // RGBA16S {vk::Format::eR16G16B16A16Uint, Attachable | Storage}, // RGBA16UI {vk::Format::eB10G11R11UfloatPack32, Attachable | Storage}, // R11FG11FB10F {vk::Format::eR32G32B32A32Uint, Attachable | Storage}, // RGBA32UI @@ -331,6 +332,8 @@ vk::Format VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttr return vk::Format::eR16G16B16Unorm; case Maxwell::VertexAttribute::Size::Size_16_16_16_16: return vk::Format::eR16G16B16A16Unorm; + case Maxwell::VertexAttribute::Size::Size_10_10_10_2: + return vk::Format::eA2B10G10R10UnormPack32; default: break; } @@ -364,6 +367,10 @@ vk::Format VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttr return vk::Format::eR8G8B8A8Uint; case Maxwell::VertexAttribute::Size::Size_32: return vk::Format::eR32Uint; + case Maxwell::VertexAttribute::Size::Size_32_32: + return vk::Format::eR32G32Uint; + case Maxwell::VertexAttribute::Size::Size_32_32_32: + return vk::Format::eR32G32B32Uint; case Maxwell::VertexAttribute::Size::Size_32_32_32_32: return vk::Format::eR32G32B32A32Uint; default: diff --git a/src/video_core/renderer_vulkan/vk_device.cpp b/src/video_core/renderer_vulkan/vk_device.cpp index 3847bd722..28d2fbc4f 100644 --- a/src/video_core/renderer_vulkan/vk_device.cpp +++ b/src/video_core/renderer_vulkan/vk_device.cpp @@ -535,6 +535,7 @@ std::unordered_map<vk::Format, vk::FormatProperties> VKDevice::GetFormatProperti vk::Format::eR32G32Sfloat, vk::Format::eR32G32Uint, vk::Format::eR16G16B16A16Uint, + vk::Format::eR16G16B16A16Snorm, vk::Format::eR16G16B16A16Unorm, vk::Format::eR16G16Unorm, vk::Format::eR16G16Snorm, diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 8636967df..755aad643 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -1157,7 +1157,7 @@ std::size_t RasterizerVulkan::CalculateVertexArraysSize() const { // This implementation assumes that all attributes are used in the shader. const GPUVAddr start{regs.vertex_array[index].StartAddress()}; const GPUVAddr end{regs.vertex_array_limit[index].LimitAddress()}; - DEBUG_ASSERT(end > start); + DEBUG_ASSERT(end >= start); size += (end - start + 1) * regs.vertex_array[index].enable; } diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp index b2c298051..51ecb5567 100644 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp @@ -837,7 +837,7 @@ private: Decorate(id, spv::Decoration::Offset, static_cast<u32>(tfb->offset)); } - element += static_cast<u8>(num_components); + element = static_cast<u8>(static_cast<std::size_t>(element) + num_components); } } diff --git a/src/video_core/shader/decode/bfe.cpp b/src/video_core/shader/decode/bfe.cpp index e02bcd097..8e3b46e8e 100644 --- a/src/video_core/shader/decode/bfe.cpp +++ b/src/video_core/shader/decode/bfe.cpp @@ -17,33 +17,60 @@ u32 ShaderIR::DecodeBfe(NodeBlock& bb, u32 pc) { const Instruction instr = {program_code[pc]}; const auto opcode = OpCode::Decode(instr); - UNIMPLEMENTED_IF(instr.bfe.negate_b); - Node op_a = GetRegister(instr.gpr8); - op_a = GetOperandAbsNegInteger(op_a, false, instr.bfe.negate_a, false); - - switch (opcode->get().GetId()) { - case OpCode::Id::BFE_IMM: { - UNIMPLEMENTED_IF_MSG(instr.generates_cc, - "Condition codes generation in BFE is not implemented"); + Node op_b = [&] { + switch (opcode->get().GetId()) { + case OpCode::Id::BFE_R: + return GetRegister(instr.gpr20); + case OpCode::Id::BFE_C: + return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset()); + case OpCode::Id::BFE_IMM: + return Immediate(instr.alu.GetSignedImm20_20()); + default: + UNREACHABLE(); + return Immediate(0); + } + }(); - const Node inner_shift_imm = Immediate(static_cast<u32>(instr.bfe.GetLeftShiftValue())); - const Node outer_shift_imm = - Immediate(static_cast<u32>(instr.bfe.GetLeftShiftValue() + instr.bfe.shift_position)); + UNIMPLEMENTED_IF_MSG(instr.bfe.rd_cc, "Condition codes in BFE is not implemented"); - const Node inner_shift = - Operation(OperationCode::ILogicalShiftLeft, NO_PRECISE, op_a, inner_shift_imm); - const Node outer_shift = - Operation(OperationCode::ILogicalShiftRight, NO_PRECISE, inner_shift, outer_shift_imm); + const bool is_signed = instr.bfe.is_signed; - SetInternalFlagsFromInteger(bb, outer_shift, instr.generates_cc); - SetRegister(bb, instr.gpr0, outer_shift); - break; - } - default: - UNIMPLEMENTED_MSG("Unhandled BFE instruction: {}", opcode->get().GetName()); + // using reverse parallel method in + // https://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel + // note for later if possible to implement faster method. + if (instr.bfe.brev) { + const auto swap = [&](u32 s, u32 mask) { + Node v1 = + SignedOperation(OperationCode::ILogicalShiftRight, is_signed, op_a, Immediate(s)); + if (mask != 0) { + v1 = SignedOperation(OperationCode::IBitwiseAnd, is_signed, std::move(v1), + Immediate(mask)); + } + Node v2 = op_a; + if (mask != 0) { + v2 = SignedOperation(OperationCode::IBitwiseAnd, is_signed, std::move(v2), + Immediate(mask)); + } + v2 = SignedOperation(OperationCode::ILogicalShiftLeft, is_signed, std::move(v2), + Immediate(s)); + return SignedOperation(OperationCode::IBitwiseOr, is_signed, std::move(v1), + std::move(v2)); + }; + op_a = swap(1, 0x55555555U); + op_a = swap(2, 0x33333333U); + op_a = swap(4, 0x0F0F0F0FU); + op_a = swap(8, 0x00FF00FFU); + op_a = swap(16, 0); } + const auto offset = SignedOperation(OperationCode::IBitfieldExtract, is_signed, op_b, + Immediate(0), Immediate(8)); + const auto bits = SignedOperation(OperationCode::IBitfieldExtract, is_signed, op_b, + Immediate(8), Immediate(8)); + auto result = SignedOperation(OperationCode::IBitfieldExtract, is_signed, op_a, offset, bits); + SetRegister(bb, instr.gpr0, std::move(result)); + return pc; } diff --git a/src/video_core/shader/node_helper.cpp b/src/video_core/shader/node_helper.cpp index b3dcd291c..76c56abb5 100644 --- a/src/video_core/shader/node_helper.cpp +++ b/src/video_core/shader/node_helper.cpp @@ -68,6 +68,8 @@ OperationCode SignedToUnsignedCode(OperationCode operation_code, bool is_signed) return OperationCode::UBitwiseXor; case OperationCode::IBitwiseNot: return OperationCode::UBitwiseNot; + case OperationCode::IBitfieldExtract: + return OperationCode::UBitfieldExtract; case OperationCode::IBitfieldInsert: return OperationCode::UBitfieldInsert; case OperationCode::IBitCount: diff --git a/src/video_core/surface.cpp b/src/video_core/surface.cpp index 9707c353d..cc7181229 100644 --- a/src/video_core/surface.cpp +++ b/src/video_core/surface.cpp @@ -111,6 +111,8 @@ PixelFormat PixelFormatFromRenderTargetFormat(Tegra::RenderTargetFormat format) return PixelFormat::RGBA16F; case Tegra::RenderTargetFormat::RGBA16_UNORM: return PixelFormat::RGBA16U; + case Tegra::RenderTargetFormat::RGBA16_SNORM: + return PixelFormat::RGBA16S; case Tegra::RenderTargetFormat::RGBA16_UINT: return PixelFormat::RGBA16UI; case Tegra::RenderTargetFormat::RGBA32_FLOAT: diff --git a/src/video_core/surface.h b/src/video_core/surface.h index d88109e5a..ae8817465 100644 --- a/src/video_core/surface.h +++ b/src/video_core/surface.h @@ -25,82 +25,83 @@ enum class PixelFormat { R8UI = 7, RGBA16F = 8, RGBA16U = 9, - RGBA16UI = 10, - R11FG11FB10F = 11, - RGBA32UI = 12, - DXT1 = 13, - DXT23 = 14, - DXT45 = 15, - DXN1 = 16, // This is also known as BC4 - DXN2UNORM = 17, - DXN2SNORM = 18, - BC7U = 19, - BC6H_UF16 = 20, - BC6H_SF16 = 21, - ASTC_2D_4X4 = 22, - BGRA8 = 23, - RGBA32F = 24, - RG32F = 25, - R32F = 26, - R16F = 27, - R16U = 28, - R16S = 29, - R16UI = 30, - R16I = 31, - RG16 = 32, - RG16F = 33, - RG16UI = 34, - RG16I = 35, - RG16S = 36, - RGB32F = 37, - RGBA8_SRGB = 38, - RG8U = 39, - RG8S = 40, - RG32UI = 41, - RGBX16F = 42, - R32UI = 43, - R32I = 44, - ASTC_2D_8X8 = 45, - ASTC_2D_8X5 = 46, - ASTC_2D_5X4 = 47, - BGRA8_SRGB = 48, - DXT1_SRGB = 49, - DXT23_SRGB = 50, - DXT45_SRGB = 51, - BC7U_SRGB = 52, - R4G4B4A4U = 53, - ASTC_2D_4X4_SRGB = 54, - ASTC_2D_8X8_SRGB = 55, - ASTC_2D_8X5_SRGB = 56, - ASTC_2D_5X4_SRGB = 57, - ASTC_2D_5X5 = 58, - ASTC_2D_5X5_SRGB = 59, - ASTC_2D_10X8 = 60, - ASTC_2D_10X8_SRGB = 61, - ASTC_2D_6X6 = 62, - ASTC_2D_6X6_SRGB = 63, - ASTC_2D_10X10 = 64, - ASTC_2D_10X10_SRGB = 65, - ASTC_2D_12X12 = 66, - ASTC_2D_12X12_SRGB = 67, - ASTC_2D_8X6 = 68, - ASTC_2D_8X6_SRGB = 69, - ASTC_2D_6X5 = 70, - ASTC_2D_6X5_SRGB = 71, - E5B9G9R9F = 72, + RGBA16S = 10, + RGBA16UI = 11, + R11FG11FB10F = 12, + RGBA32UI = 13, + DXT1 = 14, + DXT23 = 15, + DXT45 = 16, + DXN1 = 17, // This is also known as BC4 + DXN2UNORM = 18, + DXN2SNORM = 19, + BC7U = 20, + BC6H_UF16 = 21, + BC6H_SF16 = 22, + ASTC_2D_4X4 = 23, + BGRA8 = 24, + RGBA32F = 25, + RG32F = 26, + R32F = 27, + R16F = 28, + R16U = 29, + R16S = 30, + R16UI = 31, + R16I = 32, + RG16 = 33, + RG16F = 34, + RG16UI = 35, + RG16I = 36, + RG16S = 37, + RGB32F = 38, + RGBA8_SRGB = 39, + RG8U = 40, + RG8S = 41, + RG32UI = 42, + RGBX16F = 43, + R32UI = 44, + R32I = 45, + ASTC_2D_8X8 = 46, + ASTC_2D_8X5 = 47, + ASTC_2D_5X4 = 48, + BGRA8_SRGB = 49, + DXT1_SRGB = 50, + DXT23_SRGB = 51, + DXT45_SRGB = 52, + BC7U_SRGB = 53, + R4G4B4A4U = 54, + ASTC_2D_4X4_SRGB = 55, + ASTC_2D_8X8_SRGB = 56, + ASTC_2D_8X5_SRGB = 57, + ASTC_2D_5X4_SRGB = 58, + ASTC_2D_5X5 = 59, + ASTC_2D_5X5_SRGB = 60, + ASTC_2D_10X8 = 61, + ASTC_2D_10X8_SRGB = 62, + ASTC_2D_6X6 = 63, + ASTC_2D_6X6_SRGB = 64, + ASTC_2D_10X10 = 65, + ASTC_2D_10X10_SRGB = 66, + ASTC_2D_12X12 = 67, + ASTC_2D_12X12_SRGB = 68, + ASTC_2D_8X6 = 69, + ASTC_2D_8X6_SRGB = 70, + ASTC_2D_6X5 = 71, + ASTC_2D_6X5_SRGB = 72, + E5B9G9R9F = 73, MaxColorFormat, // Depth formats - Z32F = 73, - Z16 = 74, + Z32F = 74, + Z16 = 75, MaxDepthFormat, // DepthStencil formats - Z24S8 = 75, - S8Z24 = 76, - Z32FS8 = 77, + Z24S8 = 76, + S8Z24 = 77, + Z32FS8 = 78, MaxDepthStencilFormat, @@ -138,6 +139,7 @@ constexpr std::array<u32, MaxPixelFormat> compression_factor_shift_table = {{ 0, // R8UI 0, // RGBA16F 0, // RGBA16U + 0, // RGBA16S 0, // RGBA16UI 0, // R11FG11FB10F 0, // RGBA32UI @@ -235,6 +237,7 @@ constexpr std::array<u32, MaxPixelFormat> block_width_table = {{ 1, // R8UI 1, // RGBA16F 1, // RGBA16U + 1, // RGBA16S 1, // RGBA16UI 1, // R11FG11FB10F 1, // RGBA32UI @@ -324,6 +327,7 @@ constexpr std::array<u32, MaxPixelFormat> block_height_table = {{ 1, // R8UI 1, // RGBA16F 1, // RGBA16U + 1, // RGBA16S 1, // RGBA16UI 1, // R11FG11FB10F 1, // RGBA32UI @@ -413,6 +417,7 @@ constexpr std::array<u32, MaxPixelFormat> bpp_table = {{ 8, // R8UI 64, // RGBA16F 64, // RGBA16U + 64, // RGBA16S 64, // RGBA16UI 32, // R11FG11FB10F 128, // RGBA32UI @@ -517,6 +522,7 @@ constexpr std::array<SurfaceCompression, MaxPixelFormat> compression_type_table SurfaceCompression::None, // R8UI SurfaceCompression::None, // RGBA16F SurfaceCompression::None, // RGBA16U + SurfaceCompression::None, // RGBA16S SurfaceCompression::None, // RGBA16UI SurfaceCompression::None, // R11FG11FB10F SurfaceCompression::None, // RGBA32UI diff --git a/src/video_core/texture_cache/format_lookup_table.cpp b/src/video_core/texture_cache/format_lookup_table.cpp index cc3ad8417..e151c26c4 100644 --- a/src/video_core/texture_cache/format_lookup_table.cpp +++ b/src/video_core/texture_cache/format_lookup_table.cpp @@ -41,7 +41,7 @@ struct Table { ComponentType alpha_component; bool is_srgb; }; -constexpr std::array<Table, 75> DefinitionTable = {{ +constexpr std::array<Table, 76> DefinitionTable = {{ {TextureFormat::A8R8G8B8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ABGR8U}, {TextureFormat::A8R8G8B8, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::ABGR8S}, {TextureFormat::A8R8G8B8, C, UINT, UINT, UINT, UINT, PixelFormat::ABGR8UI}, @@ -61,6 +61,7 @@ constexpr std::array<Table, 75> DefinitionTable = {{ {TextureFormat::G8R8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::RG8U}, {TextureFormat::G8R8, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::RG8S}, + {TextureFormat::R16_G16_B16_A16, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::RGBA16S}, {TextureFormat::R16_G16_B16_A16, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::RGBA16U}, {TextureFormat::R16_G16_B16_A16, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::RGBA16F}, {TextureFormat::R16_G16_B16_A16, C, UINT, UINT, UINT, UINT, PixelFormat::RGBA16UI}, |