diff options
Diffstat (limited to 'src')
28 files changed, 391 insertions, 441 deletions
diff --git a/src/audio_core/CMakeLists.txt b/src/audio_core/CMakeLists.txt index 869da5e83..a965af291 100644 --- a/src/audio_core/CMakeLists.txt +++ b/src/audio_core/CMakeLists.txt @@ -4,6 +4,7 @@ set(SRCS hle/dsp.cpp hle/filter.cpp hle/pipe.cpp + interpolate.cpp ) set(HEADERS @@ -13,9 +14,13 @@ set(HEADERS hle/dsp.h hle/filter.h hle/pipe.h + interpolate.h sink.h ) +include_directories(../../externals/soundtouch/include) + create_directory_groups(${SRCS} ${HEADERS}) -add_library(audio_core STATIC ${SRCS} ${HEADERS})
\ No newline at end of file +add_library(audio_core STATIC ${SRCS} ${HEADERS}) +target_link_libraries(audio_core SoundTouch) diff --git a/src/audio_core/audio_core.h b/src/audio_core/audio_core.h index 64c330914..b349895ea 100644 --- a/src/audio_core/audio_core.h +++ b/src/audio_core/audio_core.h @@ -10,8 +10,6 @@ class VMManager; namespace AudioCore { -constexpr int num_sources = 24; -constexpr int samples_per_frame = 160; ///< Samples per audio frame at native sample rate constexpr int native_sample_rate = 32728; ///< 32kHz /// Initialise Audio Core diff --git a/src/audio_core/hle/common.h b/src/audio_core/hle/common.h index 37d441eb2..7910f42ae 100644 --- a/src/audio_core/hle/common.h +++ b/src/audio_core/hle/common.h @@ -7,18 +7,19 @@ #include <algorithm> #include <array> -#include "audio_core/audio_core.h" - #include "common/common_types.h" namespace DSP { namespace HLE { +constexpr int num_sources = 24; +constexpr int samples_per_frame = 160; ///< Samples per audio frame at native sample rate + /// The final output to the speakers is stereo. Preprocessing output in Source is also stereo. -using StereoFrame16 = std::array<std::array<s16, 2>, AudioCore::samples_per_frame>; +using StereoFrame16 = std::array<std::array<s16, 2>, samples_per_frame>; /// The DSP is quadraphonic internally. -using QuadFrame32 = std::array<std::array<s32, 4>, AudioCore::samples_per_frame>; +using QuadFrame32 = std::array<std::array<s32, 4>, samples_per_frame>; /** * This performs the filter operation defined by FilterT::ProcessSample on the frame in-place. diff --git a/src/audio_core/hle/dsp.h b/src/audio_core/hle/dsp.h index f94ec9467..f0f125284 100644 --- a/src/audio_core/hle/dsp.h +++ b/src/audio_core/hle/dsp.h @@ -8,7 +8,7 @@ #include <cstddef> #include <type_traits> -#include "audio_core/audio_core.h" +#include "audio_core/hle/common.h" #include "common/bit_field.h" #include "common/common_funcs.h" @@ -305,7 +305,7 @@ struct SourceConfiguration { u16_le buffer_id; }; - Configuration config[AudioCore::num_sources]; + Configuration config[num_sources]; }; ASSERT_DSP_STRUCT(SourceConfiguration::Configuration, 192); ASSERT_DSP_STRUCT(SourceConfiguration::Configuration::Buffer, 20); @@ -320,7 +320,7 @@ struct SourceStatus { INSERT_PADDING_DSPWORDS(1); }; - Status status[AudioCore::num_sources]; + Status status[num_sources]; }; ASSERT_DSP_STRUCT(SourceStatus::Status, 12); @@ -413,7 +413,7 @@ ASSERT_DSP_STRUCT(DspConfiguration::ReverbEffect, 52); struct AdpcmCoefficients { /// Coefficients are signed fixed point with 11 fractional bits. /// Each source has 16 coefficients associated with it. - s16_le coeff[AudioCore::num_sources][16]; + s16_le coeff[num_sources][16]; }; ASSERT_DSP_STRUCT(AdpcmCoefficients, 768); @@ -427,7 +427,7 @@ ASSERT_DSP_STRUCT(DspStatus, 32); /// Final mixed output in PCM16 stereo format, what you hear out of the speakers. /// When the application writes to this region it has no effect. struct FinalMixSamples { - s16_le pcm16[2 * AudioCore::samples_per_frame]; + s16_le pcm16[2 * samples_per_frame]; }; ASSERT_DSP_STRUCT(FinalMixSamples, 640); @@ -437,7 +437,7 @@ ASSERT_DSP_STRUCT(FinalMixSamples, 640); /// Values that exceed s16 range will be clipped by the DSP after further processing. struct IntermediateMixSamples { struct Samples { - s32_le pcm32[4][AudioCore::samples_per_frame]; ///< Little-endian as opposed to DSP middle-endian. + s32_le pcm32[4][samples_per_frame]; ///< Little-endian as opposed to DSP middle-endian. }; Samples mix1; diff --git a/src/audio_core/interpolate.cpp b/src/audio_core/interpolate.cpp new file mode 100644 index 000000000..fcd3aa066 --- /dev/null +++ b/src/audio_core/interpolate.cpp @@ -0,0 +1,85 @@ +// Copyright 2016 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include "audio_core/interpolate.h" + +#include "common/assert.h" +#include "common/math_util.h" + +namespace AudioInterp { + +// Calculations are done in fixed point with 24 fractional bits. +// (This is not verified. This was chosen for minimal error.) +constexpr u64 scale_factor = 1 << 24; +constexpr u64 scale_mask = scale_factor - 1; + +/// Here we step over the input in steps of rate_multiplier, until we consume all of the input. +/// Three adjacent samples are passed to fn each step. +template <typename Function> +static StereoBuffer16 StepOverSamples(State& state, const StereoBuffer16& input, float rate_multiplier, Function fn) { + ASSERT(rate_multiplier > 0); + + if (input.size() < 2) + return {}; + + StereoBuffer16 output; + output.reserve(static_cast<size_t>(input.size() / rate_multiplier)); + + u64 step_size = static_cast<u64>(rate_multiplier * scale_factor); + + u64 fposition = 0; + const u64 max_fposition = input.size() * scale_factor; + + while (fposition < 1 * scale_factor) { + u64 fraction = fposition & scale_mask; + + output.push_back(fn(fraction, state.xn2, state.xn1, input[0])); + + fposition += step_size; + } + + while (fposition < 2 * scale_factor) { + u64 fraction = fposition & scale_mask; + + output.push_back(fn(fraction, state.xn1, input[0], input[1])); + + fposition += step_size; + } + + while (fposition < max_fposition) { + u64 fraction = fposition & scale_mask; + + size_t index = static_cast<size_t>(fposition / scale_factor); + output.push_back(fn(fraction, input[index - 2], input[index - 1], input[index])); + + fposition += step_size; + } + + state.xn2 = input[input.size() - 2]; + state.xn1 = input[input.size() - 1]; + + return output; +} + +StereoBuffer16 None(State& state, const StereoBuffer16& input, float rate_multiplier) { + return StepOverSamples(state, input, rate_multiplier, [](u64 fraction, const auto& x0, const auto& x1, const auto& x2) { + return x0; + }); +} + +StereoBuffer16 Linear(State& state, const StereoBuffer16& input, float rate_multiplier) { + // Note on accuracy: Some values that this produces are +/- 1 from the actual firmware. + return StepOverSamples(state, input, rate_multiplier, [](u64 fraction, const auto& x0, const auto& x1, const auto& x2) { + // This is a saturated subtraction. (Verified by black-box fuzzing.) + s64 delta0 = MathUtil::Clamp<s64>(x1[0] - x0[0], -32768, 32767); + s64 delta1 = MathUtil::Clamp<s64>(x1[1] - x0[1], -32768, 32767); + + return std::array<s16, 2> { + static_cast<s16>(x0[0] + fraction * delta0 / scale_factor), + static_cast<s16>(x0[1] + fraction * delta1 / scale_factor) + }; + }); +} + +} // namespace AudioInterp diff --git a/src/audio_core/interpolate.h b/src/audio_core/interpolate.h new file mode 100644 index 000000000..a4c0a453d --- /dev/null +++ b/src/audio_core/interpolate.h @@ -0,0 +1,41 @@ +// Copyright 2016 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include <array> +#include <vector> + +#include "common/common_types.h" + +namespace AudioInterp { + +/// A variable length buffer of signed PCM16 stereo samples. +using StereoBuffer16 = std::vector<std::array<s16, 2>>; + +struct State { + // Two historical samples. + std::array<s16, 2> xn1 = {}; ///< x[n-1] + std::array<s16, 2> xn2 = {}; ///< x[n-2] +}; + +/** + * No interpolation. This is equivalent to a zero-order hold. There is a two-sample predelay. + * @param input Input buffer. + * @param rate_multiplier Stretch factor. Must be a positive non-zero value. + * rate_multiplier > 1.0 performs decimation and rate_multipler < 1.0 performs upsampling. + * @return The resampled audio buffer. + */ +StereoBuffer16 None(State& state, const StereoBuffer16& input, float rate_multiplier); + +/** + * Linear interpolation. This is equivalent to a first-order hold. There is a two-sample predelay. + * @param input Input buffer. + * @param rate_multiplier Stretch factor. Must be a positive non-zero value. + * rate_multiplier > 1.0 performs decimation and rate_multipler < 1.0 performs upsampling. + * @return The resampled audio buffer. + */ +StereoBuffer16 Linear(State& state, const StereoBuffer16& input, float rate_multiplier); + +} // namespace AudioInterp diff --git a/src/citra_qt/bootmanager.cpp b/src/citra_qt/bootmanager.cpp index 8e60b9cad..01b81c11c 100644 --- a/src/citra_qt/bootmanager.cpp +++ b/src/citra_qt/bootmanager.cpp @@ -71,7 +71,9 @@ void EmuThread::run() { // Shutdown the core emulation System::Shutdown(); +#if MICROPROFILE_ENABLED MicroProfileOnThreadExit(); +#endif render_window->moveContext(); } diff --git a/src/citra_qt/debugger/profiler.cpp b/src/citra_qt/debugger/profiler.cpp index 4f6ba0e1f..7bb010f77 100644 --- a/src/citra_qt/debugger/profiler.cpp +++ b/src/citra_qt/debugger/profiler.cpp @@ -9,13 +9,16 @@ #include "citra_qt/debugger/profiler.h" #include "citra_qt/util/util.h" +#include "common/common_types.h" #include "common/microprofile.h" #include "common/profiler_reporting.h" // Include the implementation of the UI in this file. This isn't in microprofile.cpp because the // non-Qt frontends don't need it (and don't implement the UI drawing hooks either). +#if MICROPROFILE_ENABLED #define MICROPROFILEUI_IMPL 1 #include "common/microprofileui.h" +#endif using namespace Common::Profiling; @@ -34,21 +37,9 @@ static QVariant GetDataForColumn(int col, const AggregatedDuration& duration) } } -static const TimingCategoryInfo* GetCategoryInfo(int id) -{ - const auto& categories = GetProfilingManager().GetTimingCategoriesInfo(); - if ((size_t)id >= categories.size()) { - return nullptr; - } else { - return &categories[id]; - } -} - ProfilerModel::ProfilerModel(QObject* parent) : QAbstractItemModel(parent) { updateProfilingInfo(); - const auto& categories = GetProfilingManager().GetTimingCategoriesInfo(); - results.time_per_category.resize(categories.size()); } QVariant ProfilerModel::headerData(int section, Qt::Orientation orientation, int role) const @@ -85,7 +76,7 @@ int ProfilerModel::rowCount(const QModelIndex& parent) const if (parent.isValid()) { return 0; } else { - return static_cast<int>(results.time_per_category.size() + 2); + return 2; } } @@ -104,17 +95,6 @@ QVariant ProfilerModel::data(const QModelIndex& index, int role) const } else { return GetDataForColumn(index.column(), results.interframe_time); } - } else { - if (index.column() == 0) { - const TimingCategoryInfo* info = GetCategoryInfo(index.row() - 2); - return info != nullptr ? QString(info->name) : QVariant(); - } else { - if (index.row() - 2 < (int)results.time_per_category.size()) { - return GetDataForColumn(index.column(), results.time_per_category[index.row() - 2]); - } else { - return QVariant(); - } - } } } @@ -148,6 +128,8 @@ void ProfilerWidget::setProfilingInfoUpdateEnabled(bool enable) } } +#if MICROPROFILE_ENABLED + class MicroProfileWidget : public QWidget { public: MicroProfileWidget(QWidget* parent = nullptr); @@ -171,6 +153,8 @@ private: QTimer update_timer; }; +#endif + MicroProfileDialog::MicroProfileDialog(QWidget* parent) : QWidget(parent, Qt::Dialog) { @@ -180,6 +164,8 @@ MicroProfileDialog::MicroProfileDialog(QWidget* parent) // Remove the "?" button from the titlebar and enable the maximize button setWindowFlags(windowFlags() & ~Qt::WindowContextHelpButtonHint | Qt::WindowMaximizeButtonHint); +#if MICROPROFILE_ENABLED + MicroProfileWidget* widget = new MicroProfileWidget(this); QLayout* layout = new QVBoxLayout(this); @@ -191,6 +177,7 @@ MicroProfileDialog::MicroProfileDialog(QWidget* parent) setFocusProxy(widget); widget->setFocusPolicy(Qt::StrongFocus); widget->setFocus(); +#endif } QAction* MicroProfileDialog::toggleViewAction() { @@ -218,6 +205,9 @@ void MicroProfileDialog::hideEvent(QHideEvent* ev) { QWidget::hideEvent(ev); } + +#if MICROPROFILE_ENABLED + /// There's no way to pass a user pointer to MicroProfile, so this variable is used to make the /// QPainter available inside the drawing callbacks. static QPainter* mp_painter = nullptr; @@ -337,3 +327,4 @@ void MicroProfileDrawLine2D(u32 vertices_length, float* vertices, u32 hex_color) mp_painter->drawPolyline(point_buf.data(), vertices_length); point_buf.clear(); } +#endif diff --git a/src/citra_qt/debugger/profiler.h b/src/citra_qt/debugger/profiler.h index 036054740..3b38ed8ec 100644 --- a/src/citra_qt/debugger/profiler.h +++ b/src/citra_qt/debugger/profiler.h @@ -7,8 +7,10 @@ #include <QAbstractItemModel> #include <QDockWidget> #include <QTimer> + #include "ui_profiler.h" +#include "common/microprofile.h" #include "common/profiler_reporting.h" class ProfilerModel : public QAbstractItemModel @@ -49,6 +51,7 @@ private: QTimer update_timer; }; + class MicroProfileDialog : public QWidget { Q_OBJECT diff --git a/src/citra_qt/main.cpp b/src/citra_qt/main.cpp index 2ca1e51f6..f1ab29755 100644 --- a/src/citra_qt/main.cpp +++ b/src/citra_qt/main.cpp @@ -69,8 +69,10 @@ GMainWindow::GMainWindow() : config(new Config()), emu_thread(nullptr) addDockWidget(Qt::BottomDockWidgetArea, profilerWidget); profilerWidget->hide(); +#if MICROPROFILE_ENABLED microProfileDialog = new MicroProfileDialog(this); microProfileDialog->hide(); +#endif disasmWidget = new DisassemblerWidget(this, emu_thread.get()); addDockWidget(Qt::BottomDockWidgetArea, disasmWidget); @@ -110,7 +112,9 @@ GMainWindow::GMainWindow() : config(new Config()), emu_thread(nullptr) QMenu* debug_menu = ui.menu_View->addMenu(tr("Debugging")); debug_menu->addAction(profilerWidget->toggleViewAction()); +#if MICROPROFILE_ENABLED debug_menu->addAction(microProfileDialog->toggleViewAction()); +#endif debug_menu->addAction(disasmWidget->toggleViewAction()); debug_menu->addAction(registersWidget->toggleViewAction()); debug_menu->addAction(callstackWidget->toggleViewAction()); @@ -136,8 +140,10 @@ GMainWindow::GMainWindow() : config(new Config()), emu_thread(nullptr) restoreGeometry(UISettings::values.geometry); restoreState(UISettings::values.state); render_window->restoreGeometry(UISettings::values.renderwindow_geometry); +#if MICROPROFILE_ENABLED microProfileDialog->restoreGeometry(UISettings::values.microprofile_geometry); microProfileDialog->setVisible(UISettings::values.microprofile_visible); +#endif game_list->LoadInterfaceLayout(); @@ -511,9 +517,10 @@ void GMainWindow::closeEvent(QCloseEvent* event) { UISettings::values.geometry = saveGeometry(); UISettings::values.state = saveState(); UISettings::values.renderwindow_geometry = render_window->saveGeometry(); +#if MICROPROFILE_ENABLED UISettings::values.microprofile_geometry = microProfileDialog->saveGeometry(); UISettings::values.microprofile_visible = microProfileDialog->isVisible(); - +#endif UISettings::values.single_window_mode = ui.action_Single_Window_Mode->isChecked(); UISettings::values.display_titlebar = ui.actionDisplay_widget_title_bars->isChecked(); UISettings::values.first_start = false; diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt index c839ce173..aa6eee2a3 100644 --- a/src/common/CMakeLists.txt +++ b/src/common/CMakeLists.txt @@ -47,7 +47,6 @@ set(HEADERS microprofile.h microprofileui.h platform.h - profiler.h profiler_reporting.h scm_rev.h scope_exit.h diff --git a/src/common/microprofile.h b/src/common/microprofile.h index d3b6cb97c..ef312c6e1 100644 --- a/src/common/microprofile.h +++ b/src/common/microprofile.h @@ -4,6 +4,10 @@ #pragma once +// Uncomment this to disable microprofile. This will get you cleaner profiles when using +// external sampling profilers like "Very Sleepy", and will improve performance somewhat. +// #define MICROPROFILE_ENABLED 0 + // Customized Citra settings. // This file wraps the MicroProfile header so that these are consistent everywhere. #define MICROPROFILE_WEBSERVER 0 diff --git a/src/common/microprofileui.h b/src/common/microprofileui.h index 97c369bd9..41abe6b75 100644 --- a/src/common/microprofileui.h +++ b/src/common/microprofileui.h @@ -13,4 +13,7 @@ #define MICROPROFILE_HELP_ALT "Right-Click" #define MICROPROFILE_HELP_MOD "Ctrl" +// This isn't included by microprofileui.h :( +#include <cstdlib> // For std::abs + #include <microprofileui.h> diff --git a/src/common/profiler.cpp b/src/common/profiler.cpp index 7792edd2f..49eb3f40c 100644 --- a/src/common/profiler.cpp +++ b/src/common/profiler.cpp @@ -7,71 +7,16 @@ #include <vector> #include "common/assert.h" -#include "common/profiler.h" #include "common/profiler_reporting.h" #include "common/synchronized_wrapper.h" -#if defined(_MSC_VER) && _MSC_VER <= 1800 // MSVC 2013. - #define WIN32_LEAN_AND_MEAN - #include <Windows.h> // For QueryPerformanceCounter/Frequency -#endif - namespace Common { namespace Profiling { -#if ENABLE_PROFILING -thread_local Timer* Timer::current_timer = nullptr; -#endif - -#if defined(_MSC_VER) && _MSC_VER <= 1800 // MSVC 2013 -QPCClock::time_point QPCClock::now() { - static LARGE_INTEGER freq; - // Use this dummy local static to ensure this gets initialized once. - static BOOL dummy = QueryPerformanceFrequency(&freq); - - LARGE_INTEGER ticks; - QueryPerformanceCounter(&ticks); - - // This is prone to overflow when multiplying, which is why I'm using micro instead of nano. The - // correct way to approach this would be to just return ticks as a time_point and then subtract - // and do this conversion when creating a duration from two time_points, however, as far as I - // could tell the C++ requirements for these types are incompatible with this approach. - return time_point(duration(ticks.QuadPart * std::micro::den / freq.QuadPart)); -} -#endif - -TimingCategory::TimingCategory(const char* name, TimingCategory* parent) - : accumulated_duration(0) { - - ProfilingManager& manager = GetProfilingManager(); - category_id = manager.RegisterTimingCategory(this, name); - if (parent != nullptr) - manager.SetTimingCategoryParent(category_id, parent->category_id); -} - ProfilingManager::ProfilingManager() : last_frame_end(Clock::now()), this_frame_start(Clock::now()) { } -unsigned int ProfilingManager::RegisterTimingCategory(TimingCategory* category, const char* name) { - TimingCategoryInfo info; - info.category = category; - info.name = name; - info.parent = TimingCategoryInfo::NO_PARENT; - - unsigned int id = (unsigned int)timing_categories.size(); - timing_categories.push_back(std::move(info)); - - return id; -} - -void ProfilingManager::SetTimingCategoryParent(unsigned int category, unsigned int parent) { - ASSERT(category < timing_categories.size()); - ASSERT(parent < timing_categories.size()); - - timing_categories[category].parent = parent; -} - void ProfilingManager::BeginFrame() { this_frame_start = Clock::now(); } @@ -82,11 +27,6 @@ void ProfilingManager::FinishFrame() { results.interframe_time = now - last_frame_end; results.frame_time = now - this_frame_start; - results.time_per_category.resize(timing_categories.size()); - for (size_t i = 0; i < timing_categories.size(); ++i) { - results.time_per_category[i] = timing_categories[i].category->GetAccumulatedTime(); - } - last_frame_end = now; } @@ -100,26 +40,9 @@ void TimingResultsAggregator::Clear() { window_size = cursor = 0; } -void TimingResultsAggregator::SetNumberOfCategories(size_t n) { - size_t old_size = times_per_category.size(); - if (n == old_size) - return; - - times_per_category.resize(n); - - for (size_t i = old_size; i < n; ++i) { - times_per_category[i].resize(max_window_size, Duration::zero()); - } -} - void TimingResultsAggregator::AddFrame(const ProfilingFrameResult& frame_result) { - SetNumberOfCategories(frame_result.time_per_category.size()); - interframe_times[cursor] = frame_result.interframe_time; frame_times[cursor] = frame_result.frame_time; - for (size_t i = 0; i < frame_result.time_per_category.size(); ++i) { - times_per_category[i][cursor] = frame_result.time_per_category[i]; - } ++cursor; if (cursor == max_window_size) @@ -162,11 +85,6 @@ AggregatedFrameResult TimingResultsAggregator::GetAggregatedResults() const { result.fps = 0.0f; } - result.time_per_category.resize(times_per_category.size()); - for (size_t i = 0; i < times_per_category.size(); ++i) { - result.time_per_category[i] = AggregateField(times_per_category[i], window_size); - } - return result; } diff --git a/src/common/profiler.h b/src/common/profiler.h deleted file mode 100644 index 3e967b4bc..000000000 --- a/src/common/profiler.h +++ /dev/null @@ -1,152 +0,0 @@ -// Copyright 2015 Citra Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -#include <atomic> -#include <chrono> - -#include "common/assert.h" -#include "common/thread.h" - -namespace Common { -namespace Profiling { - -// If this is defined to 0, it turns all Timers into no-ops. -#ifndef ENABLE_PROFILING -#define ENABLE_PROFILING 1 -#endif - -#if defined(_MSC_VER) && _MSC_VER <= 1800 // MSVC 2013 -// MSVC up to 2013 doesn't use QueryPerformanceCounter for high_resolution_clock, so it has bad -// precision. We manually implement a clock based on QPC to get good results. - -struct QPCClock { - using duration = std::chrono::microseconds; - using time_point = std::chrono::time_point<QPCClock>; - using rep = duration::rep; - using period = duration::period; - static const bool is_steady = false; - - static time_point now(); -}; - -using Clock = QPCClock; -#else -using Clock = std::chrono::high_resolution_clock; -#endif - -using Duration = Clock::duration; - -/** - * Represents a timing category that measured time can be accounted towards. Should be declared as a - * global variable and passed to Timers. - */ -class TimingCategory final { -public: - TimingCategory(const char* name, TimingCategory* parent = nullptr); - - unsigned int GetCategoryId() const { - return category_id; - } - - /// Adds some time to this category. Can safely be called from multiple threads at the same time. - void AddTime(Duration amount) { - std::atomic_fetch_add_explicit( - &accumulated_duration, amount.count(), - std::memory_order_relaxed); - } - - /** - * Atomically retrieves the accumulated measured time for this category and resets the counter - * to zero. Can be safely called concurrently with AddTime. - */ - Duration GetAccumulatedTime() { - return Duration(std::atomic_exchange_explicit( - &accumulated_duration, (Duration::rep)0, - std::memory_order_relaxed)); - } - -private: - unsigned int category_id; - std::atomic<Duration::rep> accumulated_duration; -}; - -/** - * Measures time elapsed between a call to Start and a call to Stop and attributes it to the given - * TimingCategory. Start/Stop can be called multiple times on the same timer, but each call must be - * appropriately paired. - * - * When a Timer is started, it automatically pauses a previously running timer on the same thread, - * which is resumed when it is stopped. As such, no special action needs to be taken to avoid - * double-accounting of time on two categories. - */ -class Timer { -public: - Timer(TimingCategory& category) : category(category) { - } - - void Start() { -#if ENABLE_PROFILING - ASSERT(!running); - previous_timer = current_timer; - current_timer = this; - if (previous_timer != nullptr) - previous_timer->StopTiming(); - - StartTiming(); -#endif - } - - void Stop() { -#if ENABLE_PROFILING - ASSERT(running); - StopTiming(); - - if (previous_timer != nullptr) - previous_timer->StartTiming(); - current_timer = previous_timer; -#endif - } - -private: -#if ENABLE_PROFILING - void StartTiming() { - start = Clock::now(); - running = true; - } - - void StopTiming() { - auto duration = Clock::now() - start; - running = false; - category.AddTime(std::chrono::duration_cast<Duration>(duration)); - } - - Clock::time_point start; - bool running = false; - - Timer* previous_timer; - static thread_local Timer* current_timer; -#endif - - TimingCategory& category; -}; - -/** - * A Timer that automatically starts timing when created and stops at the end of the scope. Should - * be used in the majority of cases. - */ -class ScopeTimer : public Timer { -public: - ScopeTimer(TimingCategory& category) : Timer(category) { - Start(); - } - - ~ScopeTimer() { - Stop(); - } -}; - -} // namespace Profiling -} // namespace Common diff --git a/src/common/profiler_reporting.h b/src/common/profiler_reporting.h index df98e05b7..fa1ac883f 100644 --- a/src/common/profiler_reporting.h +++ b/src/common/profiler_reporting.h @@ -4,22 +4,17 @@ #pragma once +#include <chrono> #include <cstddef> #include <vector> -#include "common/profiler.h" #include "common/synchronized_wrapper.h" namespace Common { namespace Profiling { -struct TimingCategoryInfo { - static const unsigned int NO_PARENT = -1; - - TimingCategory* category; - const char* name; - unsigned int parent; -}; +using Clock = std::chrono::high_resolution_clock; +using Duration = Clock::duration; struct ProfilingFrameResult { /// Time since the last delivered frame @@ -27,22 +22,12 @@ struct ProfilingFrameResult { /// Time spent processing a frame, excluding VSync Duration frame_time; - - /// Total amount of time spent inside each category in this frame. Indexed by the category id - std::vector<Duration> time_per_category; }; class ProfilingManager final { public: ProfilingManager(); - unsigned int RegisterTimingCategory(TimingCategory* category, const char* name); - void SetTimingCategoryParent(unsigned int category, unsigned int parent); - - const std::vector<TimingCategoryInfo>& GetTimingCategoriesInfo() const { - return timing_categories; - } - /// This should be called after swapping screen buffers. void BeginFrame(); /// This should be called before swapping screen buffers. @@ -54,7 +39,6 @@ public: } private: - std::vector<TimingCategoryInfo> timing_categories; Clock::time_point last_frame_end; Clock::time_point this_frame_start; @@ -73,9 +57,6 @@ struct AggregatedFrameResult { AggregatedDuration frame_time; float fps; - - /// Total amount of time spent inside each category in this frame. Indexed by the category id - std::vector<AggregatedDuration> time_per_category; }; class TimingResultsAggregator final { @@ -83,7 +64,6 @@ public: TimingResultsAggregator(size_t window_size); void Clear(); - void SetNumberOfCategories(size_t n); void AddFrame(const ProfilingFrameResult& frame_result); @@ -95,7 +75,6 @@ public: std::vector<Duration> interframe_times; std::vector<Duration> frame_times; - std::vector<std::vector<Duration>> times_per_category; }; ProfilingManager& GetProfilingManager(); diff --git a/src/core/arm/dyncom/arm_dyncom_interpreter.cpp b/src/core/arm/dyncom/arm_dyncom_interpreter.cpp index 647784208..8d4b26815 100644 --- a/src/core/arm/dyncom/arm_dyncom_interpreter.cpp +++ b/src/core/arm/dyncom/arm_dyncom_interpreter.cpp @@ -10,7 +10,6 @@ #include "common/common_types.h" #include "common/logging/log.h" #include "common/microprofile.h" -#include "common/profiler.h" #include "core/memory.h" #include "core/hle/svc.h" @@ -25,9 +24,6 @@ #include "core/gdbstub/gdbstub.h" -Common::Profiling::TimingCategory profile_execute("DynCom::Execute"); -Common::Profiling::TimingCategory profile_decode("DynCom::Decode"); - enum { COND = (1 << 0), NON_BRANCH = (1 << 1), @@ -3496,7 +3492,6 @@ static unsigned int InterpreterTranslateInstruction(const ARMul_State* cpu, cons } static int InterpreterTranslateBlock(ARMul_State* cpu, int& bb_start, u32 addr) { - Common::Profiling::ScopeTimer timer_decode(profile_decode); MICROPROFILE_SCOPE(DynCom_Decode); // Decode instruction, get index @@ -3530,7 +3525,6 @@ static int InterpreterTranslateBlock(ARMul_State* cpu, int& bb_start, u32 addr) } static int InterpreterTranslateSingle(ARMul_State* cpu, int& bb_start, u32 addr) { - Common::Profiling::ScopeTimer timer_decode(profile_decode); MICROPROFILE_SCOPE(DynCom_Decode); ARM_INST_PTR inst_base = nullptr; @@ -3565,7 +3559,6 @@ static int clz(unsigned int x) { MICROPROFILE_DEFINE(DynCom_Execute, "DynCom", "Execute", MP_RGB(255, 0, 0)); unsigned InterpreterMainLoop(ARMul_State* cpu) { - Common::Profiling::ScopeTimer timer_execute(profile_execute); MICROPROFILE_SCOPE(DynCom_Execute); GDBStub::BreakpointAddress breakpoint_data; diff --git a/src/core/hle/service/gsp_gpu.cpp b/src/core/hle/service/gsp_gpu.cpp index 211fcf599..233592d7f 100644 --- a/src/core/hle/service/gsp_gpu.cpp +++ b/src/core/hle/service/gsp_gpu.cpp @@ -4,7 +4,6 @@ #include "common/bit_field.h" #include "common/microprofile.h" -#include "common/profiler.h" #include "core/memory.h" #include "core/hle/kernel/event.h" diff --git a/src/core/hle/svc.cpp b/src/core/hle/svc.cpp index ae54afb1c..a9a1a3244 100644 --- a/src/core/hle/svc.cpp +++ b/src/core/hle/svc.cpp @@ -6,7 +6,6 @@ #include "common/logging/log.h" #include "common/microprofile.h" -#include "common/profiler.h" #include "common/string_util.h" #include "common/symbols.h" @@ -1031,8 +1030,6 @@ static const FunctionDef SVC_Table[] = { {0x7D, HLE::Wrap<QueryProcessMemory>, "QueryProcessMemory"}, }; -Common::Profiling::TimingCategory profiler_svc("SVC Calls"); - static const FunctionDef* GetSVCInfo(u32 func_num) { if (func_num >= ARRAY_SIZE(SVC_Table)) { LOG_ERROR(Kernel_SVC, "unknown svc=0x%02X", func_num); @@ -1044,7 +1041,6 @@ static const FunctionDef* GetSVCInfo(u32 func_num) { MICROPROFILE_DEFINE(Kernel_SVC, "Kernel", "SVC", MP_RGB(70, 200, 70)); void CallSVC(u32 immediate) { - Common::Profiling::ScopeTimer timer_svc(profiler_svc); MICROPROFILE_SCOPE(Kernel_SVC); const FunctionDef* info = GetSVCInfo(immediate); diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 76cfd4f7d..de4082b1f 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -16,6 +16,7 @@ set(SRCS shader/shader_interpreter.cpp swrasterizer.cpp utils.cpp + vertex_loader.cpp video_core.cpp ) @@ -43,6 +44,7 @@ set(HEADERS shader/shader_interpreter.h swrasterizer.h utils.h + vertex_loader.h video_core.h ) diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp index 3abe79c09..58883e374 100644 --- a/src/video_core/command_processor.cpp +++ b/src/video_core/command_processor.cpp @@ -7,7 +7,6 @@ #include "common/alignment.h" #include "common/microprofile.h" -#include "common/profiler.h" #include "core/settings.h" #include "core/hle/service/gsp_gpu.h" @@ -22,6 +21,7 @@ #include "video_core/video_core.h" #include "video_core/debug_utils/debug_utils.h" #include "video_core/shader/shader_interpreter.h" +#include "video_core/vertex_loader.h" namespace Pica { @@ -35,8 +35,6 @@ static int default_attr_counter = 0; static u32 default_attr_write_buffer[3]; -Common::Profiling::TimingCategory category_drawing("Drawing"); - // Expand a 4-bit mask to 4-byte mask, e.g. 0b0101 -> 0x00FF00FF static const u32 expand_bits_to_bytes[] = { 0x00000000, 0x000000ff, 0x0000ff00, 0x0000ffff, @@ -186,60 +184,19 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { case PICA_REG_INDEX(trigger_draw): case PICA_REG_INDEX(trigger_draw_indexed): { - Common::Profiling::ScopeTimer scope_timer(category_drawing); MICROPROFILE_SCOPE(GPU_Drawing); #if PICA_LOG_TEV DebugUtils::DumpTevStageConfig(regs.GetTevStages()); #endif - if (g_debug_context) g_debug_context->OnEvent(DebugContext::Event::IncomingPrimitiveBatch, nullptr); - const auto& attribute_config = regs.vertex_attributes; - const u32 base_address = attribute_config.GetPhysicalBaseAddress(); - - // Information about internal vertex attributes - u32 vertex_attribute_sources[16]; - boost::fill(vertex_attribute_sources, 0xdeadbeef); - u32 vertex_attribute_strides[16] = {}; - Regs::VertexAttributeFormat vertex_attribute_formats[16] = {}; - - u32 vertex_attribute_elements[16] = {}; - u32 vertex_attribute_element_size[16] = {}; - - // Setup attribute data from loaders - for (int loader = 0; loader < 12; ++loader) { - const auto& loader_config = attribute_config.attribute_loaders[loader]; - - u32 offset = 0; - - // TODO: What happens if a loader overwrites a previous one's data? - for (unsigned component = 0; component < loader_config.component_count; ++component) { - if (component >= 12) { - LOG_ERROR(HW_GPU, "Overflow in the vertex attribute loader %u trying to load component %u", loader, component); - continue; - } - - u32 attribute_index = loader_config.GetComponent(component); - if (attribute_index < 12) { - int element_size = attribute_config.GetElementSizeInBytes(attribute_index); - offset = Common::AlignUp(offset, element_size); - vertex_attribute_sources[attribute_index] = base_address + loader_config.data_offset + offset; - vertex_attribute_strides[attribute_index] = static_cast<u32>(loader_config.byte_count); - vertex_attribute_formats[attribute_index] = attribute_config.GetFormat(attribute_index); - vertex_attribute_elements[attribute_index] = attribute_config.GetNumElements(attribute_index); - vertex_attribute_element_size[attribute_index] = element_size; - offset += attribute_config.GetStride(attribute_index); - } else if (attribute_index < 16) { - // Attribute ids 12, 13, 14 and 15 signify 4, 8, 12 and 16-byte paddings, respectively - offset = Common::AlignUp(offset, 4); - offset += (attribute_index - 11) * 4; - } else { - UNREACHABLE(); // This is truly unreachable due to the number of bits for each component - } - } - } + // Processes information about internal vertex attributes to figure out how a vertex is loaded. + // Later, these can be compiled and cached. + VertexLoader loader; + const u32 base_address = regs.vertex_attributes.GetPhysicalBaseAddress(); + loader.Setup(regs); // Load vertices bool is_indexed = (id == PICA_REG_INDEX(trigger_draw_indexed)); @@ -263,32 +220,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { } } - class { - /// Combine overlapping and close ranges - void SimplifyRanges() { - for (auto it = ranges.begin(); it != ranges.end(); ++it) { - // NOTE: We add 32 to the range end address to make sure "close" ranges are combined, too - auto it2 = std::next(it); - while (it2 != ranges.end() && it->first + it->second + 32 >= it2->first) { - it->second = std::max(it->second, it2->first + it2->second - it->first); - it2 = ranges.erase(it2); - } - } - } - - public: - /// Record a particular memory access in the list - void AddAccess(u32 paddr, u32 size) { - // Create new range or extend existing one - ranges[paddr] = std::max(ranges[paddr], size); - - // Simplify ranges... - SimplifyRanges(); - } - - /// Map of accessed ranges (mapping start address to range size) - std::map<u32, u32> ranges; - } memory_accesses; + DebugUtils::MemoryAccessTracker memory_accesses; // Simple circular-replacement vertex cache // The size has been tuned for optimal balance between hit-rate and the cost of lookup @@ -332,60 +264,13 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { if (!vertex_cache_hit) { // Initialize data for the current vertex Shader::InputVertex input; - - for (int i = 0; i < attribute_config.GetNumTotalAttributes(); ++i) { - if (vertex_attribute_elements[i] != 0) { - // Default attribute values set if array elements have < 4 components. This - // is *not* carried over from the default attribute settings even if they're - // enabled for this attribute. - static const float24 zero = float24::FromFloat32(0.0f); - static const float24 one = float24::FromFloat32(1.0f); - input.attr[i] = Math::Vec4<float24>(zero, zero, zero, one); - - // Load per-vertex data from the loader arrays - for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) { - u32 source_addr = vertex_attribute_sources[i] + vertex_attribute_strides[i] * vertex + comp * vertex_attribute_element_size[i]; - const u8* srcdata = Memory::GetPhysicalPointer(source_addr); - - if (g_debug_context && Pica::g_debug_context->recorder) { - memory_accesses.AddAccess(source_addr, - (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::FLOAT) ? 4 - : (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::SHORT) ? 2 : 1); - } - - const float srcval = - (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::BYTE) ? *reinterpret_cast<const s8*>(srcdata) : - (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::UBYTE) ? *reinterpret_cast<const u8*>(srcdata) : - (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::SHORT) ? *reinterpret_cast<const s16*>(srcdata) : - *reinterpret_cast<const float*>(srcdata); - - input.attr[i][comp] = float24::FromFloat32(srcval); - LOG_TRACE(HW_GPU, "Loaded component %x of attribute %x for vertex %x (index %x) from 0x%08x + 0x%08x + 0x%04x: %f", - comp, i, vertex, index, - attribute_config.GetPhysicalBaseAddress(), - vertex_attribute_sources[i] - base_address, - vertex_attribute_strides[i] * vertex + comp * vertex_attribute_element_size[i], - input.attr[i][comp].ToFloat32()); - } - } else if (attribute_config.IsDefaultAttribute(i)) { - // Load the default attribute if we're configured to do so - input.attr[i] = g_state.vs.default_attributes[i]; - LOG_TRACE(HW_GPU, "Loaded default attribute %x for vertex %x (index %x): (%f, %f, %f, %f)", - i, vertex, index, - input.attr[i][0].ToFloat32(), input.attr[i][1].ToFloat32(), - input.attr[i][2].ToFloat32(), input.attr[i][3].ToFloat32()); - } else { - // TODO(yuriks): In this case, no data gets loaded and the vertex - // remains with the last value it had. This isn't currently maintained - // as global state, however, and so won't work in Citra yet. - } - } + loader.LoadVertex(base_address, index, vertex, input, memory_accesses); if (g_debug_context) g_debug_context->OnEvent(DebugContext::Event::VertexLoaded, (void*)&input); // Send to vertex shader - output = Shader::Run(shader_unit, input, attribute_config.GetNumTotalAttributes()); + output = Shader::Run(shader_unit, input, loader.GetNumTotalAttributes()); if (is_indexed) { vertex_cache[vertex_cache_pos] = output; diff --git a/src/video_core/debug_utils/debug_utils.h b/src/video_core/debug_utils/debug_utils.h index 56f9bd958..dd0828cee 100644 --- a/src/video_core/debug_utils/debug_utils.h +++ b/src/video_core/debug_utils/debug_utils.h @@ -216,6 +216,36 @@ void DumpTexture(const Pica::Regs::TextureConfig& texture_config, u8* data); void DumpTevStageConfig(const std::array<Pica::Regs::TevStageConfig,6>& stages); +/** + * Used in the vertex loader to merge access records. TODO: Investigate if actually useful. + */ +class MemoryAccessTracker { + /// Combine overlapping and close ranges + void SimplifyRanges() { + for (auto it = ranges.begin(); it != ranges.end(); ++it) { + // NOTE: We add 32 to the range end address to make sure "close" ranges are combined, too + auto it2 = std::next(it); + while (it2 != ranges.end() && it->first + it->second + 32 >= it2->first) { + it->second = std::max(it->second, it2->first + it2->second - it->first); + it2 = ranges.erase(it2); + } + } + } + +public: + /// Record a particular memory access in the list + void AddAccess(u32 paddr, u32 size) { + // Create new range or extend existing one + ranges[paddr] = std::max(ranges[paddr], size); + + // Simplify ranges... + SimplifyRanges(); + } + + /// Map of accessed ranges (mapping start address to range size) + std::map<u32, u32> ranges; +}; + } // namespace } // namespace diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp index 0434ad05a..9cf77b1f2 100644 --- a/src/video_core/rasterizer.cpp +++ b/src/video_core/rasterizer.cpp @@ -9,7 +9,6 @@ #include "common/common_types.h" #include "common/math_util.h" #include "common/microprofile.h" -#include "common/profiler.h" #include "core/memory.h" #include "core/hw/gpu.h" @@ -287,7 +286,6 @@ static int SignedArea (const Math::Vec2<Fix12P4>& vtx1, return Math::Cross(vec1, vec2).z; }; -static Common::Profiling::TimingCategory rasterization_category("Rasterization"); MICROPROFILE_DEFINE(GPU_Rasterization, "GPU", "Rasterization", MP_RGB(50, 50, 240)); /** @@ -300,7 +298,6 @@ static void ProcessTriangleInternal(const Shader::OutputVertex& v0, bool reversed = false) { const auto& regs = g_state.regs; - Common::Profiling::ScopeTimer timer(rasterization_category); MICROPROFILE_SCOPE(GPU_Rasterization); // vertex positions in rasterizer coordinates diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 30187d4cf..a8c775c80 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -11,7 +11,6 @@ #include "common/file_util.h" #include "common/math_util.h" #include "common/microprofile.h" -#include "common/profiler.h" #include "core/memory.h" #include "core/settings.h" diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp index 75301accd..043e99190 100644 --- a/src/video_core/shader/shader.cpp +++ b/src/video_core/shader/shader.cpp @@ -9,7 +9,6 @@ #include "common/hash.h" #include "common/microprofile.h" -#include "common/profiler.h" #include "video_core/debug_utils/debug_utils.h" #include "video_core/pica.h" @@ -57,13 +56,11 @@ void Shutdown() { #endif // ARCHITECTURE_x86_64 } -static Common::Profiling::TimingCategory shader_category("Vertex Shader"); MICROPROFILE_DEFINE(GPU_VertexShader, "GPU", "Vertex Shader", MP_RGB(50, 50, 240)); OutputVertex Run(UnitState<false>& state, const InputVertex& input, int num_attributes) { auto& config = g_state.regs.vs; - Common::Profiling::ScopeTimer timer(shader_category); MICROPROFILE_SCOPE(GPU_VertexShader); state.program_counter = config.main_offset; diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h index 9c5bd97bd..9ce9344d2 100644 --- a/src/video_core/shader/shader.h +++ b/src/video_core/shader/shader.h @@ -25,7 +25,7 @@ namespace Pica { namespace Shader { struct InputVertex { - Math::Vec4<float24> attr[16]; + alignas(16) Math::Vec4<float24> attr[16]; }; struct OutputVertex { diff --git a/src/video_core/vertex_loader.cpp b/src/video_core/vertex_loader.cpp new file mode 100644 index 000000000..8a3d91896 --- /dev/null +++ b/src/video_core/vertex_loader.cpp @@ -0,0 +1,140 @@ +#include <cmath> +#include <string> + +#include "boost/range/algorithm/fill.hpp" + +#include "common/assert.h" +#include "common/alignment.h" +#include "common/bit_field.h" +#include "common/common_funcs.h" +#include "common/common_types.h" +#include "common/logging/log.h" + +#include "core/memory.h" + +#include "video_core/debug_utils/debug_utils.h" +#include "video_core/pica.h" +#include "video_core/pica_state.h" +#include "video_core/pica_types.h" +#include "video_core/vertex_loader.h" + +namespace Pica { + +void VertexLoader::Setup(const Pica::Regs& regs) { + const auto& attribute_config = regs.vertex_attributes; + num_total_attributes = attribute_config.GetNumTotalAttributes(); + + boost::fill(vertex_attribute_sources, 0xdeadbeef); + + for (int i = 0; i < 16; i++) { + vertex_attribute_is_default[i] = attribute_config.IsDefaultAttribute(i); + } + + // Setup attribute data from loaders + for (int loader = 0; loader < 12; ++loader) { + const auto& loader_config = attribute_config.attribute_loaders[loader]; + + u32 offset = 0; + + // TODO: What happens if a loader overwrites a previous one's data? + for (unsigned component = 0; component < loader_config.component_count; ++component) { + if (component >= 12) { + LOG_ERROR(HW_GPU, "Overflow in the vertex attribute loader %u trying to load component %u", loader, component); + continue; + } + + u32 attribute_index = loader_config.GetComponent(component); + if (attribute_index < 12) { + offset = Common::AlignUp(offset, attribute_config.GetElementSizeInBytes(attribute_index)); + vertex_attribute_sources[attribute_index] = loader_config.data_offset + offset; + vertex_attribute_strides[attribute_index] = static_cast<u32>(loader_config.byte_count); + vertex_attribute_formats[attribute_index] = attribute_config.GetFormat(attribute_index); + vertex_attribute_elements[attribute_index] = attribute_config.GetNumElements(attribute_index); + offset += attribute_config.GetStride(attribute_index); + } else if (attribute_index < 16) { + // Attribute ids 12, 13, 14 and 15 signify 4, 8, 12 and 16-byte paddings, respectively + offset = Common::AlignUp(offset, 4); + offset += (attribute_index - 11) * 4; + } else { + UNREACHABLE(); // This is truly unreachable due to the number of bits for each component + } + } + } +} + +void VertexLoader::LoadVertex(u32 base_address, int index, int vertex, Shader::InputVertex& input, DebugUtils::MemoryAccessTracker& memory_accesses) { + for (int i = 0; i < num_total_attributes; ++i) { + if (vertex_attribute_elements[i] != 0) { + // Load per-vertex data from the loader arrays + u32 source_addr = base_address + vertex_attribute_sources[i] + vertex_attribute_strides[i] * vertex; + + if (g_debug_context && Pica::g_debug_context->recorder) { + memory_accesses.AddAccess(source_addr, vertex_attribute_elements[i] * ( + (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::FLOAT) ? 4 + : (vertex_attribute_formats[i] == Regs::VertexAttributeFormat::SHORT) ? 2 : 1)); + } + + switch (vertex_attribute_formats[i]) { + case Regs::VertexAttributeFormat::BYTE: + { + const s8* srcdata = reinterpret_cast<const s8*>(Memory::GetPhysicalPointer(source_addr)); + for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) { + input.attr[i][comp] = float24::FromFloat32(srcdata[comp]); + } + break; + } + case Regs::VertexAttributeFormat::UBYTE: + { + const u8* srcdata = reinterpret_cast<const u8*>(Memory::GetPhysicalPointer(source_addr)); + for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) { + input.attr[i][comp] = float24::FromFloat32(srcdata[comp]); + } + break; + } + case Regs::VertexAttributeFormat::SHORT: + { + const s16* srcdata = reinterpret_cast<const s16*>(Memory::GetPhysicalPointer(source_addr)); + for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) { + input.attr[i][comp] = float24::FromFloat32(srcdata[comp]); + } + break; + } + case Regs::VertexAttributeFormat::FLOAT: + { + const float* srcdata = reinterpret_cast<const float*>(Memory::GetPhysicalPointer(source_addr)); + for (unsigned int comp = 0; comp < vertex_attribute_elements[i]; ++comp) { + input.attr[i][comp] = float24::FromFloat32(srcdata[comp]); + } + break; + } + } + + // Default attribute values set if array elements have < 4 components. This + // is *not* carried over from the default attribute settings even if they're + // enabled for this attribute. + for (unsigned int comp = vertex_attribute_elements[i]; comp < 4; ++comp) { + input.attr[i][comp] = comp == 3 ? float24::FromFloat32(1.0f) : float24::FromFloat32(0.0f); + } + + LOG_TRACE(HW_GPU, "Loaded %d components of attribute %x for vertex %x (index %x) from 0x%08x + 0x%08x + 0x%04x: %f %f %f %f", + vertex_attribute_elements[i], i, vertex, index, + base_address, + vertex_attribute_sources[i], + vertex_attribute_strides[i] * vertex, + input.attr[i][0].ToFloat32(), input.attr[i][1].ToFloat32(), input.attr[i][2].ToFloat32(), input.attr[i][3].ToFloat32()); + } else if (vertex_attribute_is_default[i]) { + // Load the default attribute if we're configured to do so + input.attr[i] = g_state.vs.default_attributes[i]; + LOG_TRACE(HW_GPU, "Loaded default attribute %x for vertex %x (index %x): (%f, %f, %f, %f)", + i, vertex, index, + input.attr[i][0].ToFloat32(), input.attr[i][1].ToFloat32(), + input.attr[i][2].ToFloat32(), input.attr[i][3].ToFloat32()); + } else { + // TODO(yuriks): In this case, no data gets loaded and the vertex + // remains with the last value it had. This isn't currently maintained + // as global state, however, and so won't work in Citra yet. + } + } +} + +} // namespace Pica
\ No newline at end of file diff --git a/src/video_core/vertex_loader.h b/src/video_core/vertex_loader.h new file mode 100644 index 000000000..ff42d1596 --- /dev/null +++ b/src/video_core/vertex_loader.h @@ -0,0 +1,28 @@ +#pragma once + +#include <iterator> +#include <algorithm> + +#include "video_core/pica.h" +#include "video_core/shader/shader.h" +#include "video_core/debug_utils/debug_utils.h" + +namespace Pica { + +class VertexLoader { +public: + void Setup(const Pica::Regs& regs); + void LoadVertex(u32 base_address, int index, int vertex, Shader::InputVertex& input, DebugUtils::MemoryAccessTracker& memory_accesses); + + int GetNumTotalAttributes() const { return num_total_attributes; } + +private: + u32 vertex_attribute_sources[16]; + u32 vertex_attribute_strides[16] = {}; + Regs::VertexAttributeFormat vertex_attribute_formats[16] = {}; + u32 vertex_attribute_elements[16] = {}; + bool vertex_attribute_is_default[16]; + int num_total_attributes; +}; + +} // namespace Pica |