From 5b5e60ffeca1a718cd980e74f0528d6ab91788cf Mon Sep 17 00:00:00 2001
From: Fernando Sahmkow <fsahmkow27@gmail.com>
Date: Wed, 25 Sep 2019 19:43:23 -0400
Subject: GPU_Async: Correct fences, display events and more.

This commit uses guest fences on the vSync event instead of the artificial
fake fence we had.
It also keeps signaling display events while the game is loading, as the
OS is supposed to send buffers to vSync during that time.
---
 src/core/hle/service/nvflinger/nvflinger.cpp | 21 +++++++++++++++++++--
 src/core/hle/service/nvflinger/nvflinger.h   |  2 ++
 src/video_core/gpu.cpp                       | 13 +++++++++++++
 src/video_core/gpu.h                         |  3 +++
 src/video_core/gpu_thread.cpp                | 14 +-------------
 src/video_core/gpu_thread.h                  |  6 ------
 6 files changed, 38 insertions(+), 21 deletions(-)

diff --git a/src/core/hle/service/nvflinger/nvflinger.cpp b/src/core/hle/service/nvflinger/nvflinger.cpp
index 3b251f8c8..86a90526c 100644
--- a/src/core/hle/service/nvflinger/nvflinger.cpp
+++ b/src/core/hle/service/nvflinger/nvflinger.cpp
@@ -36,6 +36,10 @@ NVFlinger::NVFlinger(Core::System& system) : system(system) {
     displays.emplace_back(3, "Internal", system);
     displays.emplace_back(4, "Null", system);
 
+    for (auto& display : displays) {
+        display.SignalVSyncEvent();
+    }
+
     // Schedule the screen composition events
     composition_event = system.CoreTiming().RegisterEvent(
         "ScreenComposition", [this](u64 userdata, s64 cycles_late) {
@@ -173,7 +177,13 @@ void NVFlinger::Compose() {
         bool trigger_event = false;
         // Trigger vsync for this display at the end of drawing
         SCOPE_EXIT({
-            if (trigger_event) {
+            // TODO(Blinkhawk): Correctly send buffers through nvflinger while
+            // loading the game through the OS.
+            // During loading, the OS takes care of sending buffers to vsync,
+            // so the event triggers. Since this is not properly emulated due to
+            // HLE complications, we allow it to signal until the game enqueues
+            // its first buffer.
+            if (trigger_event || !first_buffer_enqueued) {
                 display.SignalVSyncEvent();
             }
         });
@@ -193,13 +203,20 @@ void NVFlinger::Compose() {
 
         if (!buffer) {
             // There was no queued buffer to draw, render previous frame
-            system.GetPerfStats().EndGameFrame();
             system.GPU().SwapBuffers({});
             continue;
         }
 
         const auto& igbp_buffer = buffer->get().igbp_buffer;
         trigger_event = true;
+        first_buffer_enqueued = true;
+
+        const auto& gpu = system.GPU();
+        const auto& multi_fence = buffer->get().multi_fence;
+        for (u32 fence_id = 0; fence_id < multi_fence.num_fences; fence_id++) {
+            const auto& fence = multi_fence.fences[fence_id];
+            gpu.WaitFence(fence.id, fence.value);
+        }
 
         // Now send the buffer to the GPU for drawing.
         // TODO(Subv): Support more than just disp0. The display device selection is probably based
diff --git a/src/core/hle/service/nvflinger/nvflinger.h b/src/core/hle/service/nvflinger/nvflinger.h
index 5d7e3bfb8..95d7278f5 100644
--- a/src/core/hle/service/nvflinger/nvflinger.h
+++ b/src/core/hle/service/nvflinger/nvflinger.h
@@ -102,6 +102,8 @@ private:
 
     u32 swap_interval = 1;
 
+    bool first_buffer_enqueued{};
+
     /// Event that handles screen composition.
     Core::Timing::EventType* composition_event;
 
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index 76cfe8107..d94be9c9d 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -3,6 +3,7 @@
 // Refer to the license.txt file included.
 
 #include "common/assert.h"
+#include "common/microprofile.h"
 #include "core/core.h"
 #include "core/core_timing.h"
 #include "core/memory.h"
@@ -17,6 +18,8 @@
 
 namespace Tegra {
 
+MICROPROFILE_DEFINE(GPU_wait, "GPU", "Wait for the GPU", MP_RGB(128, 128, 192));
+
 GPU::GPU(Core::System& system, VideoCore::RendererBase& renderer, bool is_async)
     : system{system}, renderer{renderer}, is_async{is_async} {
     auto& rasterizer{renderer.Rasterizer()};
@@ -63,6 +66,16 @@ const DmaPusher& GPU::DmaPusher() const {
     return *dma_pusher;
 }
 
+void GPU::WaitFence(u32 syncpoint_id, u32 value) const {
+    // A synchronous GPU is always in sync, so there is nothing to wait for
+    if (!is_async) {
+        return;
+    }
+    MICROPROFILE_SCOPE(GPU_wait);
+    while (syncpoints[syncpoint_id].load() < value) {
+    }
+}
+
 void GPU::IncrementSyncPoint(const u32 syncpoint_id) {
     syncpoints[syncpoint_id]++;
     std::lock_guard lock{sync_mutex};
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index 29fa8e95b..e20b0687a 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -177,6 +177,9 @@ public:
     /// Returns a reference to the GPU DMA pusher.
     Tegra::DmaPusher& DmaPusher();
 
+    /// Allows the CPU/NvFlinger to wait on the GPU before presenting a frame.
+    void WaitFence(u32 syncpoint_id, u32 value) const;
+
     void IncrementSyncPoint(u32 syncpoint_id);
 
     u32 GetSyncpointValue(u32 syncpoint_id) const;
diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp
index 5f039e4fd..d7048b6ae 100644
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -5,8 +5,6 @@
 #include "common/assert.h"
 #include "common/microprofile.h"
 #include "core/core.h"
-#include "core/core_timing.h"
-#include "core/core_timing_util.h"
 #include "core/frontend/scope_acquire_window_context.h"
 #include "video_core/dma_pusher.h"
 #include "video_core/gpu.h"
@@ -68,14 +66,10 @@ ThreadManager::~ThreadManager() {
 
 void ThreadManager::StartThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_pusher) {
     thread = std::thread{RunThread, std::ref(renderer), std::ref(dma_pusher), std::ref(state)};
-    synchronization_event = system.CoreTiming().RegisterEvent(
-        "GPUThreadSynch", [this](u64 fence, s64) { state.WaitForSynchronization(fence); });
 }
 
 void ThreadManager::SubmitList(Tegra::CommandList&& entries) {
-    const u64 fence{PushCommand(SubmitListCommand(std::move(entries)))};
-    const s64 synchronization_ticks{Core::Timing::usToCycles(std::chrono::microseconds{9000})};
-    system.CoreTiming().ScheduleEvent(synchronization_ticks, synchronization_event, fence);
+    PushCommand(SubmitListCommand(std::move(entries)));
 }
 
 void ThreadManager::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) {
@@ -102,10 +96,4 @@ u64 ThreadManager::PushCommand(CommandData&& command_data) {
     return fence;
 }
 
-MICROPROFILE_DEFINE(GPU_wait, "GPU", "Wait for the GPU", MP_RGB(128, 128, 192));
-void SynchState::WaitForSynchronization(u64 fence) {
-    while (signaled_fence.load() < fence)
-        ;
-}
-
 } // namespace VideoCommon::GPUThread
diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h
index 3ae0ec9f3..108f456bd 100644
--- a/src/video_core/gpu_thread.h
+++ b/src/video_core/gpu_thread.h
@@ -21,9 +21,6 @@ class DmaPusher;
 
 namespace Core {
 class System;
-namespace Timing {
-struct EventType;
-} // namespace Timing
 } // namespace Core
 
 namespace VideoCommon::GPUThread {
@@ -89,8 +86,6 @@ struct CommandDataContainer {
 struct SynchState final {
     std::atomic_bool is_running{true};
 
-    void WaitForSynchronization(u64 fence);
-
     using CommandQueue = Common::SPSCQueue<CommandDataContainer>;
     CommandQueue queue;
     u64 last_fence{};
@@ -128,7 +123,6 @@ private:
 private:
     SynchState state;
     Core::System& system;
-    Core::Timing::EventType* synchronization_event{};
     std::thread thread;
     std::thread::id thread_id;
 };
-- 
cgit v1.2.3