53 files changed, 949 insertions, 357 deletions
diff --git a/src/core/core.cpp b/src/core/core.cpp
index ab7181a05..eba2177d1 100644
--- a/src/core/core.cpp
+++ b/src/core/core.cpp
@@ -36,7 +36,8 @@
 #include "frontend/applets/software_keyboard.h"
 #include "frontend/applets/web_browser.h"
 #include "video_core/debug_utils/debug_utils.h"
-#include "video_core/gpu.h"
+#include "video_core/gpu_asynch.h"
+#include "video_core/gpu_synch.h"
 #include "video_core/renderer_base.h"
 #include "video_core/video_core.h"
 
@@ -78,6 +79,7 @@ FileSys::VirtualFile GetGameFileFromPath(const FileSys::VirtualFilesystem& vfs,
     return vfs->OpenFile(path, FileSys::Mode::Read);
 }
 struct System::Impl {
+    explicit Impl(System& system) : kernel{system} {}
 
     Cpu& CurrentCpuCore() {
         return cpu_core_manager.GetCurrentCore();
@@ -95,7 +97,7 @@ struct System::Impl {
         LOG_DEBUG(HW_Memory, "initialized OK");
 
         core_timing.Initialize();
-        kernel.Initialize(core_timing);
+        kernel.Initialize();
 
         const auto current_time = std::chrono::duration_cast<std::chrono::seconds>(
             std::chrono::system_clock::now().time_since_epoch());
@@ -128,10 +130,16 @@ struct System::Impl {
             return ResultStatus::ErrorVideoCore;
         }
 
-        gpu_core = std::make_unique<Tegra::GPU>(system, renderer->Rasterizer());
+        is_powered_on = true;
+
+        if (Settings::values.use_asynchronous_gpu_emulation) {
+            gpu_core = std::make_unique<VideoCommon::GPUAsynch>(system, *renderer);
+        } else {
+            gpu_core = std::make_unique<VideoCommon::GPUSynch>(system, *renderer);
+        }
 
         cpu_core_manager.Initialize(system);
-        is_powered_on = true;
+
         LOG_DEBUG(Core, "Initialized OK");
 
         // Reset counters and set time origin to current frame
@@ -182,13 +190,13 @@ struct System::Impl {
 
     void Shutdown() {
         // Log last frame performance stats
-        auto perf_results = GetAndResetPerfStats();
-        Telemetry().AddField(Telemetry::FieldType::Performance, "Shutdown_EmulationSpeed",
-                             perf_results.emulation_speed * 100.0);
-        Telemetry().AddField(Telemetry::FieldType::Performance, "Shutdown_Framerate",
-                             perf_results.game_fps);
-        Telemetry().AddField(Telemetry::FieldType::Performance, "Shutdown_Frametime",
-                             perf_results.frametime * 1000.0);
+        const auto perf_results = GetAndResetPerfStats();
+        telemetry_session->AddField(Telemetry::FieldType::Performance, "Shutdown_EmulationSpeed",
+                                    perf_results.emulation_speed * 100.0);
+        telemetry_session->AddField(Telemetry::FieldType::Performance, "Shutdown_Framerate",
+                                    perf_results.game_fps);
+        telemetry_session->AddField(Telemetry::FieldType::Performance, "Shutdown_Frametime",
+                                    perf_results.frametime * 1000.0);
 
         is_powered_on = false;
 
@@ -265,7 +273,7 @@ struct System::Impl {
     Core::FrameLimiter frame_limiter;
 };
 
-System::System() : impl{std::make_unique<Impl>()} {}
+System::System() : impl{std::make_unique<Impl>(*this)} {}
 System::~System() = default;
 
 Cpu& System::CurrentCpuCore() {
diff --git a/src/core/core.h b/src/core/core.h
index d720013f7..ba76a41d8 100644
--- a/src/core/core.h
+++ b/src/core/core.h
@@ -293,10 +293,6 @@ inline ARM_Interface& CurrentArmInterface() {
     return System::GetInstance().CurrentArmInterface();
 }
 
-inline TelemetrySession& Telemetry() {
-    return System::GetInstance().TelemetrySession();
-}
-
 inline Kernel::Process* CurrentProcess() {
     return System::GetInstance().CurrentProcess();
 }
diff --git a/src/core/hle/ipc.h b/src/core/hle/ipc.h
index ed84197b3..455d1f346 100644
--- a/src/core/hle/ipc.h
+++ b/src/core/hle/ipc.h
@@ -4,10 +4,10 @@
 
 #pragma once
 
+#include "common/bit_field.h"
+#include "common/common_funcs.h"
 #include "common/common_types.h"
 #include "common/swap.h"
-#include "core/hle/kernel/errors.h"
-#include "core/memory.h"
 
 namespace IPC {
 
diff --git a/src/core/hle/kernel/address_arbiter.cpp b/src/core/hle/kernel/address_arbiter.cpp
index a250d088d..9780a7849 100644
--- a/src/core/hle/kernel/address_arbiter.cpp
+++ b/src/core/hle/kernel/address_arbiter.cpp
@@ -9,6 +9,7 @@
 #include "common/common_types.h"
 #include "core/core.h"
 #include "core/core_cpu.h"
+#include "core/hle/kernel/address_arbiter.h"
 #include "core/hle/kernel/errors.h"
 #include "core/hle/kernel/object.h"
 #include "core/hle/kernel/process.h"
@@ -17,58 +18,16 @@
 #include "core/hle/result.h"
 #include "core/memory.h"
 
-namespace Kernel::AddressArbiter {
-
-// Performs actual address waiting logic.
-static ResultCode WaitForAddress(VAddr address, s64 timeout) {
-    SharedPtr<Thread> current_thread = GetCurrentThread();
-    current_thread->SetArbiterWaitAddress(address);
-    current_thread->SetStatus(ThreadStatus::WaitArb);
-    current_thread->InvalidateWakeupCallback();
-
-    current_thread->WakeAfterDelay(timeout);
-
-    Core::System::GetInstance().CpuCore(current_thread->GetProcessorID()).PrepareReschedule();
-    return RESULT_TIMEOUT;
-}
-
-// Gets the threads waiting on an address.
-static std::vector<SharedPtr<Thread>> GetThreadsWaitingOnAddress(VAddr address) {
-    const auto RetrieveWaitingThreads = [](std::size_t core_index,
-                                           std::vector<SharedPtr<Thread>>& waiting_threads,
-                                           VAddr arb_addr) {
-        const auto& scheduler = Core::System::GetInstance().Scheduler(core_index);
-        const auto& thread_list = scheduler.GetThreadList();
-
-        for (const auto& thread : thread_list) {
-            if (thread->GetArbiterWaitAddress() == arb_addr)
-                waiting_threads.push_back(thread);
-        }
-    };
-
-    // Retrieve all threads that are waiting for this address.
-    std::vector<SharedPtr<Thread>> threads;
-    RetrieveWaitingThreads(0, threads, address);
-    RetrieveWaitingThreads(1, threads, address);
-    RetrieveWaitingThreads(2, threads, address);
-    RetrieveWaitingThreads(3, threads, address);
-
-    // Sort them by priority, such that the highest priority ones come first.
-    std::sort(threads.begin(), threads.end(),
-              [](const SharedPtr<Thread>& lhs, const SharedPtr<Thread>& rhs) {
-                  return lhs->GetPriority() < rhs->GetPriority();
-              });
-
-    return threads;
-}
-
+namespace Kernel {
+namespace {
 // Wake up num_to_wake (or all) threads in a vector.
-static void WakeThreads(std::vector<SharedPtr<Thread>>& waiting_threads, s32 num_to_wake) {
+void WakeThreads(const std::vector<SharedPtr<Thread>>& waiting_threads, s32 num_to_wake) {
     // Only process up to 'target' threads, unless 'target' is <= 0, in which case process
     // them all.
     std::size_t last = waiting_threads.size();
-    if (num_to_wake > 0)
+    if (num_to_wake > 0) {
         last = num_to_wake;
+    }
 
     // Signal the waiting threads.
     for (std::size_t i = 0; i < last; i++) {
@@ -78,42 +37,41 @@ static void WakeThreads(std::vector<SharedPtr<Thread>>& waiting_threads, s32 num
         waiting_threads[i]->ResumeFromWait();
     }
 }
+} // Anonymous namespace
 
-// Signals an address being waited on.
-ResultCode SignalToAddress(VAddr address, s32 num_to_wake) {
-    std::vector<SharedPtr<Thread>> waiting_threads = GetThreadsWaitingOnAddress(address);
+AddressArbiter::AddressArbiter(Core::System& system) : system{system} {}
+AddressArbiter::~AddressArbiter() = default;
 
+ResultCode AddressArbiter::SignalToAddress(VAddr address, s32 num_to_wake) {
+    const std::vector<SharedPtr<Thread>> waiting_threads = GetThreadsWaitingOnAddress(address);
     WakeThreads(waiting_threads, num_to_wake);
     return RESULT_SUCCESS;
 }
 
-// Signals an address being waited on and increments its value if equal to the value argument.
-ResultCode IncrementAndSignalToAddressIfEqual(VAddr address, s32 value, s32 num_to_wake) {
+ResultCode AddressArbiter::IncrementAndSignalToAddressIfEqual(VAddr address, s32 value,
+                                                              s32 num_to_wake) {
     // Ensure that we can write to the address.
     if (!Memory::IsValidVirtualAddress(address)) {
         return ERR_INVALID_ADDRESS_STATE;
     }
 
-    if (static_cast<s32>(Memory::Read32(address)) == value) {
-        Memory::Write32(address, static_cast<u32>(value + 1));
-    } else {
+    if (static_cast<s32>(Memory::Read32(address)) != value) {
         return ERR_INVALID_STATE;
     }
 
+    Memory::Write32(address, static_cast<u32>(value + 1));
     return SignalToAddress(address, num_to_wake);
 }
 
-// Signals an address being waited on and modifies its value based on waiting thread count if equal
-// to the value argument.
-ResultCode ModifyByWaitingCountAndSignalToAddressIfEqual(VAddr address, s32 value,
-                                                         s32 num_to_wake) {
+ResultCode AddressArbiter::ModifyByWaitingCountAndSignalToAddressIfEqual(VAddr address, s32 value,
+                                                                         s32 num_to_wake) {
     // Ensure that we can write to the address.
     if (!Memory::IsValidVirtualAddress(address)) {
         return ERR_INVALID_ADDRESS_STATE;
     }
 
     // Get threads waiting on the address.
-    std::vector<SharedPtr<Thread>> waiting_threads = GetThreadsWaitingOnAddress(address);
+    const std::vector<SharedPtr<Thread>> waiting_threads = GetThreadsWaitingOnAddress(address);
 
     // Determine the modified value depending on the waiting count.
     s32 updated_value;
@@ -125,31 +83,31 @@ ResultCode ModifyByWaitingCountAndSignalToAddressIfEqual(VAddr address, s32 valu
         updated_value = value;
     }
 
-    if (static_cast<s32>(Memory::Read32(address)) == value) {
-        Memory::Write32(address, static_cast<u32>(updated_value));
-    } else {
+    if (static_cast<s32>(Memory::Read32(address)) != value) {
         return ERR_INVALID_STATE;
     }
 
+    Memory::Write32(address, static_cast<u32>(updated_value));
     WakeThreads(waiting_threads, num_to_wake);
     return RESULT_SUCCESS;
 }
 
-// Waits on an address if the value passed is less than the argument value, optionally decrementing.
-ResultCode WaitForAddressIfLessThan(VAddr address, s32 value, s64 timeout, bool should_decrement) {
+ResultCode AddressArbiter::WaitForAddressIfLessThan(VAddr address, s32 value, s64 timeout,
+                                                    bool should_decrement) {
     // Ensure that we can read the address.
     if (!Memory::IsValidVirtualAddress(address)) {
         return ERR_INVALID_ADDRESS_STATE;
     }
 
-    s32 cur_value = static_cast<s32>(Memory::Read32(address));
-    if (cur_value < value) {
-        if (should_decrement) {
-            Memory::Write32(address, static_cast<u32>(cur_value - 1));
-        }
-    } else {
+    const s32 cur_value = static_cast<s32>(Memory::Read32(address));
+    if (cur_value >= value) {
         return ERR_INVALID_STATE;
     }
+
+    if (should_decrement) {
+        Memory::Write32(address, static_cast<u32>(cur_value - 1));
+    }
+
     // Short-circuit without rescheduling, if timeout is zero.
     if (timeout == 0) {
         return RESULT_TIMEOUT;
@@ -158,8 +116,7 @@ ResultCode WaitForAddressIfLessThan(VAddr address, s32 value, s64 timeout, bool
     return WaitForAddress(address, timeout);
 }
 
-// Waits on an address if the value passed is equal to the argument value.
-ResultCode WaitForAddressIfEqual(VAddr address, s32 value, s64 timeout) {
+ResultCode AddressArbiter::WaitForAddressIfEqual(VAddr address, s32 value, s64 timeout) {
     // Ensure that we can read the address.
     if (!Memory::IsValidVirtualAddress(address)) {
         return ERR_INVALID_ADDRESS_STATE;
@@ -175,4 +132,46 @@ ResultCode WaitForAddressIfEqual(VAddr address, s32 value, s64 timeout) {
 
     return WaitForAddress(address, timeout);
 }
-} // namespace Kernel::AddressArbiter
+
+ResultCode AddressArbiter::WaitForAddress(VAddr address, s64 timeout) {
+    SharedPtr<Thread> current_thread = system.CurrentScheduler().GetCurrentThread();
+    current_thread->SetArbiterWaitAddress(address);
+    current_thread->SetStatus(ThreadStatus::WaitArb);
+    current_thread->InvalidateWakeupCallback();
+
+    current_thread->WakeAfterDelay(timeout);
+
+    system.CpuCore(current_thread->GetProcessorID()).PrepareReschedule();
+    return RESULT_TIMEOUT;
+}
+
+std::vector<SharedPtr<Thread>> AddressArbiter::GetThreadsWaitingOnAddress(VAddr address) const {
+    const auto RetrieveWaitingThreads = [this](std::size_t core_index,
+                                               std::vector<SharedPtr<Thread>>& waiting_threads,
+                                               VAddr arb_addr) {
+        const auto& scheduler = system.Scheduler(core_index);
+        const auto& thread_list = scheduler.GetThreadList();
+
+        for (const auto& thread : thread_list) {
+            if (thread->GetArbiterWaitAddress() == arb_addr) {
+                waiting_threads.push_back(thread);
+            }
+        }
+    };
+
+    // Retrieve all threads that are waiting for this address.
+    std::vector<SharedPtr<Thread>> threads;
+    RetrieveWaitingThreads(0, threads, address);
+    RetrieveWaitingThreads(1, threads, address);
+    RetrieveWaitingThreads(2, threads, address);
+    RetrieveWaitingThreads(3, threads, address);
+
+    // Sort them by priority, such that the highest priority ones come first.
+    std::sort(threads.begin(), threads.end(),
+              [](const SharedPtr<Thread>& lhs, const SharedPtr<Thread>& rhs) {
+                  return lhs->GetPriority() < rhs->GetPriority();
+              });
+
+    return threads;
+}
+} // namespace Kernel
diff --git a/src/core/hle/kernel/address_arbiter.h b/src/core/hle/kernel/address_arbiter.h
index b58f21bec..e0c36f2e3 100644
--- a/src/core/hle/kernel/address_arbiter.h
+++ b/src/core/hle/kernel/address_arbiter.h
@@ -5,28 +5,68 @@
 #pragma once
 
 #include "common/common_types.h"
+#include "core/hle/kernel/address_arbiter.h"
 
 union ResultCode;
 
-namespace Kernel::AddressArbiter {
+namespace Core {
+class System;
+}
 
-enum class ArbitrationType {
-    WaitIfLessThan = 0,
-    DecrementAndWaitIfLessThan = 1,
-    WaitIfEqual = 2,
-};
+namespace Kernel {
 
-enum class SignalType {
-    Signal = 0,
-    IncrementAndSignalIfEqual = 1,
-    ModifyByWaitingCountAndSignalIfEqual = 2,
-};
+class Thread;
+
+class AddressArbiter {
+public:
+    enum class ArbitrationType {
+        WaitIfLessThan = 0,
+        DecrementAndWaitIfLessThan = 1,
+        WaitIfEqual = 2,
+    };
+
+    enum class SignalType {
+        Signal = 0,
+        IncrementAndSignalIfEqual = 1,
+        ModifyByWaitingCountAndSignalIfEqual = 2,
+    };
+
+    explicit AddressArbiter(Core::System& system);
+    ~AddressArbiter();
+
+    AddressArbiter(const AddressArbiter&) = delete;
+    AddressArbiter& operator=(const AddressArbiter&) = delete;
+
+    AddressArbiter(AddressArbiter&&) = default;
+    AddressArbiter& operator=(AddressArbiter&&) = delete;
 
-ResultCode SignalToAddress(VAddr address, s32 num_to_wake);
-ResultCode IncrementAndSignalToAddressIfEqual(VAddr address, s32 value, s32 num_to_wake);
-ResultCode ModifyByWaitingCountAndSignalToAddressIfEqual(VAddr address, s32 value, s32 num_to_wake);
+    /// Signals an address being waited on.
+    ResultCode SignalToAddress(VAddr address, s32 num_to_wake);
 
-ResultCode WaitForAddressIfLessThan(VAddr address, s32 value, s64 timeout, bool should_decrement);
-ResultCode WaitForAddressIfEqual(VAddr address, s32 value, s64 timeout);
+    /// Signals an address being waited on and increments its value if equal to the value argument.
+    ResultCode IncrementAndSignalToAddressIfEqual(VAddr address, s32 value, s32 num_to_wake);
+
+    /// Signals an address being waited on and modifies its value based on waiting thread count if
+    /// equal to the value argument.
+    ResultCode ModifyByWaitingCountAndSignalToAddressIfEqual(VAddr address, s32 value,
+                                                             s32 num_to_wake);
+
+    /// Waits on an address if the value passed is less than the argument value,
+    /// optionally decrementing.
+    ResultCode WaitForAddressIfLessThan(VAddr address, s32 value, s64 timeout,
+                                        bool should_decrement);
+
+    /// Waits on an address if the value passed is equal to the argument value.
+    ResultCode WaitForAddressIfEqual(VAddr address, s32 value, s64 timeout);
+
+private:
+    // Waits on the given address with a timeout in nanoseconds
+    ResultCode WaitForAddress(VAddr address, s64 timeout);
+
+    // Gets the threads waiting on an address.
+    std::vector<SharedPtr<Thread>> GetThreadsWaitingOnAddress(VAddr address) const;
+
+    Core::System& system;
+};
 
-} // namespace Kernel::AddressArbiter
+} // namespace Kernel
diff --git a/src/core/hle/kernel/hle_ipc.h b/src/core/hle/kernel/hle_ipc.h
index cb1c5aff3..0107acea4 100644
--- a/src/core/hle/kernel/hle_ipc.h
+++ b/src/core/hle/kernel/hle_ipc.h
@@ -15,6 +15,8 @@
 #include "core/hle/ipc.h"
 #include "core/hle/kernel/object.h"
 
+union ResultCode;
+
 namespace Service {
 class ServiceFrameworkBase;
 }
@@ -208,14 +210,12 @@ public:
 
     template <typename T>
     SharedPtr<T> GetCopyObject(std::size_t index) {
-        ASSERT(index < copy_objects.size());
-        return DynamicObjectCast<T>(copy_objects[index]);
+        return DynamicObjectCast<T>(copy_objects.at(index));
     }
 
     template <typename T>
     SharedPtr<T> GetMoveObject(std::size_t index) {
-        ASSERT(index < move_objects.size());
-        return DynamicObjectCast<T>(move_objects[index]);
+        return DynamicObjectCast<T>(move_objects.at(index));
     }
 
     void AddMoveObject(SharedPtr<Object> object) {
@@ -232,7 +232,7 @@ public:
 
     template <typename T>
     std::shared_ptr<T> GetDomainRequestHandler(std::size_t index) const {
-        return std::static_pointer_cast<T>(domain_request_handlers[index]);
+        return std::static_pointer_cast<T>(domain_request_handlers.at(index));
     }
 
     void SetDomainRequestHandlers(
diff --git a/src/core/hle/kernel/kernel.cpp b/src/core/hle/kernel/kernel.cpp
index dd749eed4..04ea9349e 100644
--- a/src/core/hle/kernel/kernel.cpp
+++ b/src/core/hle/kernel/kernel.cpp
@@ -12,6 +12,7 @@
 
 #include "core/core.h"
 #include "core/core_timing.h"
+#include "core/hle/kernel/address_arbiter.h"
 #include "core/hle/kernel/client_port.h"
 #include "core/hle/kernel/handle_table.h"
 #include "core/hle/kernel/kernel.h"
@@ -86,11 +87,13 @@ static void ThreadWakeupCallback(u64 thread_handle, [[maybe_unused]] int cycles_
 }
 
 struct KernelCore::Impl {
-    void Initialize(KernelCore& kernel, Core::Timing::CoreTiming& core_timing) {
+    explicit Impl(Core::System& system) : address_arbiter{system}, system{system} {}
+
+    void Initialize(KernelCore& kernel) {
         Shutdown();
 
         InitializeSystemResourceLimit(kernel);
-        InitializeThreads(core_timing);
+        InitializeThreads();
     }
 
     void Shutdown() {
@@ -122,9 +125,9 @@ struct KernelCore::Impl {
         ASSERT(system_resource_limit->SetLimitValue(ResourceType::Sessions, 900).IsSuccess());
     }
 
-    void InitializeThreads(Core::Timing::CoreTiming& core_timing) {
+    void InitializeThreads() {
         thread_wakeup_event_type =
-            core_timing.RegisterEvent("ThreadWakeupCallback", ThreadWakeupCallback);
+            system.CoreTiming().RegisterEvent("ThreadWakeupCallback", ThreadWakeupCallback);
     }
 
     std::atomic<u32> next_object_id{0};
@@ -135,6 +138,8 @@ struct KernelCore::Impl {
     std::vector<SharedPtr<Process>> process_list;
     Process* current_process = nullptr;
 
+    Kernel::AddressArbiter address_arbiter;
+
     SharedPtr<ResourceLimit> system_resource_limit;
 
     Core::Timing::EventType* thread_wakeup_event_type = nullptr;
@@ -145,15 +150,18 @@ struct KernelCore::Impl {
     /// Map of named ports managed by the kernel, which can be retrieved using
     /// the ConnectToPort SVC.
     NamedPortTable named_ports;
+
+    // System context
+    Core::System& system;
 };
 
-KernelCore::KernelCore() : impl{std::make_unique<Impl>()} {}
+KernelCore::KernelCore(Core::System& system) : impl{std::make_unique<Impl>(system)} {}
 KernelCore::~KernelCore() {
     Shutdown();
 }
 
-void KernelCore::Initialize(Core::Timing::CoreTiming& core_timing) {
-    impl->Initialize(*this, core_timing);
+void KernelCore::Initialize() {
+    impl->Initialize(*this);
 }
 
 void KernelCore::Shutdown() {
@@ -184,6 +192,14 @@ const Process* KernelCore::CurrentProcess() const {
     return impl->current_process;
 }
 
+AddressArbiter& KernelCore::AddressArbiter() {
+    return impl->address_arbiter;
+}
+
+const AddressArbiter& KernelCore::AddressArbiter() const {
+    return impl->address_arbiter;
+}
+
 void KernelCore::AddNamedPort(std::string name, SharedPtr<ClientPort> port) {
     impl->named_ports.emplace(std::move(name), std::move(port));
 }
diff --git a/src/core/hle/kernel/kernel.h b/src/core/hle/kernel/kernel.h
index 154bced42..4d292aca9 100644
--- a/src/core/hle/kernel/kernel.h
+++ b/src/core/hle/kernel/kernel.h
@@ -11,6 +11,10 @@
 template <typename T>
 class ResultVal;
 
+namespace Core {
+class System;
+}
+
 namespace Core::Timing {
 class CoreTiming;
 struct EventType;
@@ -18,6 +22,7 @@ struct EventType;
 
 namespace Kernel {
 
+class AddressArbiter;
 class ClientPort;
 class HandleTable;
 class Process;
@@ -30,7 +35,14 @@ private:
     using NamedPortTable = std::unordered_map<std::string, SharedPtr<ClientPort>>;
 
 public:
-    KernelCore();
+    /// Constructs an instance of the kernel using the given System
+    /// instance as a context for any necessary system-related state,
+    /// such as threads, CPU core state, etc.
+    ///
+    /// @post After execution of the constructor, the provided System
+    ///       object *must* outlive the kernel instance itself.
+    ///
+    explicit KernelCore(Core::System& system);
     ~KernelCore();
 
     KernelCore(const KernelCore&) = delete;
@@ -40,11 +52,7 @@ public:
     KernelCore& operator=(KernelCore&&) = delete;
 
     /// Resets the kernel to a clean slate for use.
-    ///
-    /// @param core_timing CoreTiming instance used to create any necessary
-    ///                    kernel-specific callback events.
-    ///
-    void Initialize(Core::Timing::CoreTiming& core_timing);
+    void Initialize();
 
     /// Clears all resources in use by the kernel instance.
     void Shutdown();
@@ -67,6 +75,12 @@ public:
     /// Retrieves a const pointer to the current process.
     const Process* CurrentProcess() const;
 
+    /// Provides a reference to the kernel's address arbiter.
+    Kernel::AddressArbiter& AddressArbiter();
+
+    /// Provides a const reference to the kernel's address arbiter.
+    const Kernel::AddressArbiter& AddressArbiter() const;
+
     /// Adds a port to the named port table
     void AddNamedPort(std::string name, SharedPtr<ClientPort> port);
 
diff --git a/src/core/hle/kernel/svc.cpp b/src/core/hle/kernel/svc.cpp
index 223d717e2..7f5c0cc86 100644
--- a/src/core/hle/kernel/svc.cpp
+++ b/src/core/hle/kernel/svc.cpp
@@ -20,6 +20,7 @@
 #include "core/hle/kernel/address_arbiter.h"
 #include "core/hle/kernel/client_port.h"
 #include "core/hle/kernel/client_session.h"
+#include "core/hle/kernel/errors.h"
 #include "core/hle/kernel/handle_table.h"
 #include "core/hle/kernel/kernel.h"
 #include "core/hle/kernel/mutex.h"
@@ -1478,13 +1479,14 @@ static ResultCode WaitForAddress(VAddr address, u32 type, s32 value, s64 timeout
         return ERR_INVALID_ADDRESS;
     }
 
+    auto& address_arbiter = Core::System::GetInstance().Kernel().AddressArbiter();
     switch (static_cast<AddressArbiter::ArbitrationType>(type)) {
     case AddressArbiter::ArbitrationType::WaitIfLessThan:
-        return AddressArbiter::WaitForAddressIfLessThan(address, value, timeout, false);
+        return address_arbiter.WaitForAddressIfLessThan(address, value, timeout, false);
     case AddressArbiter::ArbitrationType::DecrementAndWaitIfLessThan:
-        return AddressArbiter::WaitForAddressIfLessThan(address, value, timeout, true);
+        return address_arbiter.WaitForAddressIfLessThan(address, value, timeout, true);
     case AddressArbiter::ArbitrationType::WaitIfEqual:
-        return AddressArbiter::WaitForAddressIfEqual(address, value, timeout);
+        return address_arbiter.WaitForAddressIfEqual(address, value, timeout);
     default:
         LOG_ERROR(Kernel_SVC,
                   "Invalid arbitration type, expected WaitIfLessThan, DecrementAndWaitIfLessThan "
@@ -1509,13 +1511,14 @@ static ResultCode SignalToAddress(VAddr address, u32 type, s32 value, s32 num_to
         return ERR_INVALID_ADDRESS;
     }
 
+    auto& address_arbiter = Core::System::GetInstance().Kernel().AddressArbiter();
     switch (static_cast<AddressArbiter::SignalType>(type)) {
     case AddressArbiter::SignalType::Signal:
-        return AddressArbiter::SignalToAddress(address, num_to_wake);
+        return address_arbiter.SignalToAddress(address, num_to_wake);
     case AddressArbiter::SignalType::IncrementAndSignalIfEqual:
-        return AddressArbiter::IncrementAndSignalToAddressIfEqual(address, value, num_to_wake);
+        return address_arbiter.IncrementAndSignalToAddressIfEqual(address, value, num_to_wake);
     case AddressArbiter::SignalType::ModifyByWaitingCountAndSignalIfEqual:
-        return AddressArbiter::ModifyByWaitingCountAndSignalToAddressIfEqual(address, value,
+        return address_arbiter.ModifyByWaitingCountAndSignalToAddressIfEqual(address, value,
                                                                              num_to_wake);
     default:
         LOG_ERROR(Kernel_SVC,
diff --git a/src/core/hle/result.h b/src/core/hle/result.h
index bfb77cc31..1ed144481 100644
--- a/src/core/hle/result.h
+++ b/src/core/hle/result.h
@@ -8,7 +8,6 @@
 #include <utility>
 #include "common/assert.h"
 #include "common/bit_field.h"
-#include "common/common_funcs.h"
 #include "common/common_types.h"
 
 // All the constants in this file come from http://switchbrew.org/index.php?title=Error_codes
diff --git a/src/core/hle/service/am/applets/software_keyboard.cpp b/src/core/hle/service/am/applets/software_keyboard.cpp
index f255f74b5..8c5bd6059 100644
--- a/src/core/hle/service/am/applets/software_keyboard.cpp
+++ b/src/core/hle/service/am/applets/software_keyboard.cpp
@@ -7,6 +7,7 @@
 #include "common/string_util.h"
 #include "core/core.h"
 #include "core/frontend/applets/software_keyboard.h"
+#include "core/hle/result.h"
 #include "core/hle/service/am/am.h"
 #include "core/hle/service/am/applets/software_keyboard.h"
 
diff --git a/src/core/hle/service/am/applets/software_keyboard.h b/src/core/hle/service/am/applets/software_keyboard.h
index efd5753a1..b93a30d28 100644
--- a/src/core/hle/service/am/applets/software_keyboard.h
+++ b/src/core/hle/service/am/applets/software_keyboard.h
@@ -9,10 +9,13 @@
 #include <vector>
 
 #include "common/common_funcs.h"
+#include "common/common_types.h"
 #include "common/swap.h"
 #include "core/hle/service/am/am.h"
 #include "core/hle/service/am/applets/applets.h"
 
+union ResultCode;
+
 namespace Service::AM::Applets {
 
 enum class KeysetDisable : u32 {
diff --git a/src/core/hle/service/audio/audout_u.cpp b/src/core/hle/service/audio/audout_u.cpp
index 1c04d21cf..21f5e64c7 100644
--- a/src/core/hle/service/audio/audout_u.cpp
+++ b/src/core/hle/service/audio/audout_u.cpp
@@ -107,7 +107,9 @@ private:
     void StopAudioOut(Kernel::HLERequestContext& ctx) {
         LOG_DEBUG(Service_Audio, "called");
 
-        audio_core.StopStream(stream);
+        if (stream->IsPlaying()) {
+            audio_core.StopStream(stream);
+        }
 
         IPC::ResponseBuilder rb{ctx, 2};
         rb.Push(RESULT_SUCCESS);
@@ -138,6 +140,7 @@ private:
         if (!audio_core.QueueBuffer(stream, tag, std::move(samples))) {
             IPC::ResponseBuilder rb{ctx, 2};
             rb.Push(ERR_BUFFER_COUNT_EXCEEDED);
+            return;
         }
 
         IPC::ResponseBuilder rb{ctx, 2};
diff --git a/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp b/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp
index dbe7ee6e8..20c7c39aa 100644
--- a/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvdisp_disp0.cpp
@@ -36,7 +36,7 @@ void nvdisp_disp0::flip(u32 buffer_handle, u32 offset, u32 format, u32 width, u3
 
     auto& instance = Core::System::GetInstance();
     instance.GetPerfStats().EndGameFrame();
-    instance.Renderer().SwapBuffers(framebuffer);
+    instance.GPU().SwapBuffers(framebuffer);
 }
 
 } // namespace Service::Nvidia::Devices
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp b/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp
index 466db7ccd..a34b9e753 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_as_gpu.cpp
@@ -178,7 +178,7 @@ u32 nvhost_as_gpu::UnmapBuffer(const std::vector<u8>& input, std::vector<u8>& ou
     auto& gpu = system_instance.GPU();
     auto cpu_addr = gpu.MemoryManager().GpuToCpuAddress(params.offset);
     ASSERT(cpu_addr);
-    system_instance.Renderer().Rasterizer().FlushAndInvalidateRegion(*cpu_addr, itr->second.size);
+    gpu.FlushAndInvalidateRegion(*cpu_addr, itr->second.size);
 
     params.offset = gpu.MemoryManager().UnmapBuffer(params.offset, itr->second.size);
 
diff --git a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp
index 0a650f36c..8ce7bc7a5 100644
--- a/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp
+++ b/src/core/hle/service/nvdrv/devices/nvhost_gpu.cpp
@@ -136,16 +136,6 @@ u32 nvhost_gpu::AllocateObjectContext(const std::vector<u8>& input, std::vector<
     return 0;
 }
 
-static void PushGPUEntries(Tegra::CommandList&& entries) {
-    if (entries.empty()) {
-        return;
-    }
-
-    auto& dma_pusher{Core::System::GetInstance().GPU().DmaPusher()};
-    dma_pusher.Push(std::move(entries));
-    dma_pusher.DispatchCalls();
-}
-
 u32 nvhost_gpu::SubmitGPFIFO(const std::vector<u8>& input, std::vector<u8>& output) {
     if (input.size() < sizeof(IoctlSubmitGpfifo)) {
         UNIMPLEMENTED();
@@ -163,7 +153,7 @@ u32 nvhost_gpu::SubmitGPFIFO(const std::vector<u8>& input, std::vector<u8>& outp
     std::memcpy(entries.data(), &input[sizeof(IoctlSubmitGpfifo)],
                 params.num_entries * sizeof(Tegra::CommandListHeader));
 
-    PushGPUEntries(std::move(entries));
+    Core::System::GetInstance().GPU().PushGPUEntries(std::move(entries));
 
     params.fence_out.id = 0;
     params.fence_out.value = 0;
@@ -184,7 +174,7 @@ u32 nvhost_gpu::KickoffPB(const std::vector<u8>& input, std::vector<u8>& output)
     Memory::ReadBlock(params.address, entries.data(),
                       params.num_entries * sizeof(Tegra::CommandListHeader));
 
-    PushGPUEntries(std::move(entries));
+    Core::System::GetInstance().GPU().PushGPUEntries(std::move(entries));
 
     params.fence_out.id = 0;
     params.fence_out.value = 0;
diff --git a/src/core/hle/service/nvflinger/nvflinger.cpp b/src/core/hle/service/nvflinger/nvflinger.cpp
index 56f31e2ac..fc496b654 100644
--- a/src/core/hle/service/nvflinger/nvflinger.cpp
+++ b/src/core/hle/service/nvflinger/nvflinger.cpp
@@ -186,7 +186,7 @@ void NVFlinger::Compose() {
 
             // There was no queued buffer to draw, render previous frame
             system_instance.GetPerfStats().EndGameFrame();
-            system_instance.Renderer().SwapBuffers({});
+            system_instance.GPU().SwapBuffers({});
             continue;
         }
 
diff --git a/src/core/memory.cpp b/src/core/memory.cpp
index ec279cef8..6591c45d2 100644
--- a/src/core/memory.cpp
+++ b/src/core/memory.cpp
@@ -356,16 +356,16 @@ void RasterizerFlushVirtualRegion(VAddr start, u64 size, FlushMode mode) {
         const VAddr overlap_end = std::min(end, region_end);
         const VAddr overlap_size = overlap_end - overlap_start;
 
-        auto& rasterizer = system_instance.Renderer().Rasterizer();
+        auto& gpu = system_instance.GPU();
         switch (mode) {
         case FlushMode::Flush:
-            rasterizer.FlushRegion(overlap_start, overlap_size);
+            gpu.FlushRegion(overlap_start, overlap_size);
             break;
         case FlushMode::Invalidate:
-            rasterizer.InvalidateRegion(overlap_start, overlap_size);
+            gpu.InvalidateRegion(overlap_start, overlap_size);
             break;
         case FlushMode::FlushAndInvalidate:
-            rasterizer.FlushAndInvalidateRegion(overlap_start, overlap_size);
+            gpu.FlushAndInvalidateRegion(overlap_start, overlap_size);
             break;
         }
     };
diff --git a/src/core/settings.h b/src/core/settings.h
index 7e76e0466..cdfb2f742 100644
--- a/src/core/settings.h
+++ b/src/core/settings.h
@@ -393,6 +393,7 @@ struct Values {
     u16 frame_limit;
     bool use_disk_shader_cache;
     bool use_accurate_gpu_emulation;
+    bool use_asynchronous_gpu_emulation;
 
     float bg_red;
     float bg_green;
diff --git a/src/core/telemetry_session.cpp b/src/core/telemetry_session.cpp
index 58dfcc4df..e1db06811 100644
--- a/src/core/telemetry_session.cpp
+++ b/src/core/telemetry_session.cpp
@@ -162,6 +162,8 @@ TelemetrySession::TelemetrySession() {
              Settings::values.use_disk_shader_cache);
     AddField(Telemetry::FieldType::UserConfig, "Renderer_UseAccurateGpuEmulation",
              Settings::values.use_accurate_gpu_emulation);
+    AddField(Telemetry::FieldType::UserConfig, "Renderer_UseAsynchronousGpuEmulation",
+             Settings::values.use_asynchronous_gpu_emulation);
     AddField(Telemetry::FieldType::UserConfig, "System_UseDockedMode",
              Settings::values.use_docked_mode);
 }
diff --git a/src/tests/core/arm/arm_test_common.cpp b/src/tests/core/arm/arm_test_common.cpp
index 9b8a44fa1..ea27ef90d 100644
--- a/src/tests/core/arm/arm_test_common.cpp
+++ b/src/tests/core/arm/arm_test_common.cpp
@@ -13,11 +13,11 @@
 namespace ArmTests {
 
 TestEnvironment::TestEnvironment(bool mutable_memory_)
-    : mutable_memory(mutable_memory_), test_memory(std::make_shared<TestMemory>(this)) {
-
+    : mutable_memory(mutable_memory_),
+      test_memory(std::make_shared<TestMemory>(this)), kernel{Core::System::GetInstance()} {
     auto process = Kernel::Process::Create(kernel, "");
     kernel.MakeCurrentProcess(process.get());
-    page_table = &Core::CurrentProcess()->VMManager().page_table;
+    page_table = &process->VMManager().page_table;
 
     std::fill(page_table->pointers.begin(), page_table->pointers.end(), nullptr);
     page_table->special_regions.clear();
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 3e9d2b3be..57f31cd58 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -17,6 +17,12 @@ add_library(video_core STATIC
     engines/shader_header.h
     gpu.cpp
     gpu.h
+    gpu_asynch.cpp
+    gpu_asynch.h
+    gpu_synch.cpp
+    gpu_synch.h
+    gpu_thread.cpp
+    gpu_thread.h
     macro_interpreter.cpp
     macro_interpreter.h
     memory_manager.cpp
@@ -94,6 +100,8 @@ add_library(video_core STATIC
     surface.h
     textures/astc.cpp
     textures/astc.h
+    textures/convert.cpp
+    textures/convert.h
     textures/decoders.cpp
     textures/decoders.h
     textures/texture.h
diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp
index 4f6126116..aae2a4019 100644
--- a/src/video_core/engines/kepler_memory.cpp
+++ b/src/video_core/engines/kepler_memory.cpp
@@ -48,7 +48,7 @@ void KeplerMemory::ProcessData(u32 data) {
     // We have to invalidate the destination region to evict any outdated surfaces from the cache.
     // We do this before actually writing the new data because the destination address might contain
     // a dirty surface that will have to be written back to memory.
-    rasterizer.InvalidateRegion(*dest_address, sizeof(u32));
+    Core::System::GetInstance().GPU().InvalidateRegion(*dest_address, sizeof(u32));
 
     Memory::Write32(*dest_address, data);
     system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite();
diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp
index 0474c7ba3..9dfea5999 100644
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@@ -92,12 +92,12 @@ void MaxwellDMA::HandleCopy() {
     const auto FlushAndInvalidate = [&](u32 src_size, u64 dst_size) {
         // TODO(Subv): For now, manually flush the regions until we implement GPU-accelerated
         // copying.
-        rasterizer.FlushRegion(*source_cpu, src_size);
+        Core::System::GetInstance().GPU().FlushRegion(*source_cpu, src_size);
 
         // We have to invalidate the destination region to evict any outdated surfaces from the
         // cache. We do this before actually writing the new data because the destination address
         // might contain a dirty surface that will have to be written back to memory.
-        rasterizer.InvalidateRegion(*dest_cpu, dst_size);
+        Core::System::GetInstance().GPU().InvalidateRegion(*dest_cpu, dst_size);
     };
 
     if (regs.exec.is_dst_linear && !regs.exec.is_src_linear) {
diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp
index ac30d1a89..08abf8ac9 100644
--- a/src/video_core/gpu.cpp
+++ b/src/video_core/gpu.cpp
@@ -12,7 +12,7 @@
 #include "video_core/engines/maxwell_3d.h"
 #include "video_core/engines/maxwell_dma.h"
 #include "video_core/gpu.h"
-#include "video_core/rasterizer_interface.h"
+#include "video_core/renderer_base.h"
 
 namespace Tegra {
 
@@ -28,7 +28,8 @@ u32 FramebufferConfig::BytesPerPixel(PixelFormat format) {
     UNREACHABLE();
 }
 
-GPU::GPU(Core::System& system, VideoCore::RasterizerInterface& rasterizer) {
+GPU::GPU(Core::System& system, VideoCore::RendererBase& renderer) : renderer{renderer} {
+    auto& rasterizer{renderer.Rasterizer()};
     memory_manager = std::make_unique<Tegra::MemoryManager>();
     dma_pusher = std::make_unique<Tegra::DmaPusher>(*this);
     maxwell_3d = std::make_unique<Engines::Maxwell3D>(system, rasterizer, *memory_manager);
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index 6313702f2..14a421cc1 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -16,8 +16,8 @@ class System;
 }
 
 namespace VideoCore {
-class RasterizerInterface;
-}
+class RendererBase;
+} // namespace VideoCore
 
 namespace Tegra {
 
@@ -119,9 +119,10 @@ enum class EngineID {
     MAXWELL_DMA_COPY_A = 0xB0B5,
 };
 
-class GPU final {
+class GPU {
 public:
-    explicit GPU(Core::System& system, VideoCore::RasterizerInterface& rasterizer);
+    explicit GPU(Core::System& system, VideoCore::RendererBase& renderer);
+
     ~GPU();
 
     struct MethodCall {
@@ -200,8 +201,42 @@ public:
         };
     } regs{};
 
+    /// Push GPU command entries to be processed
+    virtual void PushGPUEntries(Tegra::CommandList&& entries) = 0;
+
+    /// Swap buffers (render frame)
+    virtual void SwapBuffers(
+        std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) = 0;
+
+    /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
+    virtual void FlushRegion(VAddr addr, u64 size) = 0;
+
+    /// Notify rasterizer that any caches of the specified region should be invalidated
+    virtual void InvalidateRegion(VAddr addr, u64 size) = 0;
+
+    /// Notify rasterizer that any caches of the specified region should be flushed and invalidated
+    virtual void FlushAndInvalidateRegion(VAddr addr, u64 size) = 0;
+
 private:
+    void ProcessBindMethod(const MethodCall& method_call);
+    void ProcessSemaphoreTriggerMethod();
+    void ProcessSemaphoreRelease();
+    void ProcessSemaphoreAcquire();
+
+    /// Calls a GPU puller method.
+    void CallPullerMethod(const MethodCall& method_call);
+
+    /// Calls a GPU engine method.
+    void CallEngineMethod(const MethodCall& method_call);
+
+    /// Determines where the method should be executed.
+    bool ExecuteMethodOnEngine(const MethodCall& method_call);
+
+protected:
     std::unique_ptr<Tegra::DmaPusher> dma_pusher;
+    VideoCore::RendererBase& renderer;
+
+private:
     std::unique_ptr<Tegra::MemoryManager> memory_manager;
 
     /// Mapping of command subchannels to their bound engine ids.
@@ -217,18 +252,6 @@ private:
     std::unique_ptr<Engines::MaxwellDMA> maxwell_dma;
     /// Inline memory engine
     std::unique_ptr<Engines::KeplerMemory> kepler_memory;
-
-    void ProcessBindMethod(const MethodCall& method_call);
-    void ProcessSemaphoreTriggerMethod();
-    void ProcessSemaphoreRelease();
-    void ProcessSemaphoreAcquire();
-
-    // Calls a GPU puller method.
-    void CallPullerMethod(const MethodCall& method_call);
-    // Calls a GPU engine method.
-    void CallEngineMethod(const MethodCall& method_call);
-    // Determines where the method should be executed.
-    bool ExecuteMethodOnEngine(const MethodCall& method_call);
 };
 
 #define ASSERT_REG_POSITION(field_name, position)                                                  \
diff --git a/src/video_core/gpu_asynch.cpp b/src/video_core/gpu_asynch.cpp
new file mode 100644
index 000000000..ad0a747e3
--- /dev/null
+++ b/src/video_core/gpu_asynch.cpp
@@ -0,0 +1,37 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "video_core/gpu_asynch.h"
+#include "video_core/gpu_thread.h"
+#include "video_core/renderer_base.h"
+
+namespace VideoCommon {
+
+GPUAsynch::GPUAsynch(Core::System& system, VideoCore::RendererBase& renderer)
+    : Tegra::GPU(system, renderer), gpu_thread{renderer, *dma_pusher} {}
+
+GPUAsynch::~GPUAsynch() = default;
+
+void GPUAsynch::PushGPUEntries(Tegra::CommandList&& entries) {
+    gpu_thread.SubmitList(std::move(entries));
+}
+
+void GPUAsynch::SwapBuffers(
+    std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) {
+    gpu_thread.SwapBuffers(std::move(framebuffer));
+}
+
+void GPUAsynch::FlushRegion(VAddr addr, u64 size) {
+    gpu_thread.FlushRegion(addr, size);
+}
+
+void GPUAsynch::InvalidateRegion(VAddr addr, u64 size) {
+    gpu_thread.InvalidateRegion(addr, size);
+}
+
+void GPUAsynch::FlushAndInvalidateRegion(VAddr addr, u64 size) {
+    gpu_thread.FlushAndInvalidateRegion(addr, size);
+}
+
+} // namespace VideoCommon
diff --git a/src/video_core/gpu_asynch.h b/src/video_core/gpu_asynch.h
new file mode 100644
index 000000000..58046f3e9
--- /dev/null
+++ b/src/video_core/gpu_asynch.h
@@ -0,0 +1,37 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include "video_core/gpu.h"
+#include "video_core/gpu_thread.h"
+
+namespace VideoCore {
+class RendererBase;
+} // namespace VideoCore
+
+namespace VideoCommon {
+
+namespace GPUThread {
+class ThreadManager;
+} // namespace GPUThread
+
+/// Implementation of GPU interface that runs the GPU asynchronously
+class GPUAsynch : public Tegra::GPU {
+public:
+    explicit GPUAsynch(Core::System& system, VideoCore::RendererBase& renderer);
+    ~GPUAsynch();
+
+    void PushGPUEntries(Tegra::CommandList&& entries) override;
+    void SwapBuffers(
+        std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) override;
+    void FlushRegion(VAddr addr, u64 size) override;
+    void InvalidateRegion(VAddr addr, u64 size) override;
+    void FlushAndInvalidateRegion(VAddr addr, u64 size) override;
+
+private:
+    GPUThread::ThreadManager gpu_thread;
+};
+
+} // namespace VideoCommon
diff --git a/src/video_core/gpu_synch.cpp b/src/video_core/gpu_synch.cpp
new file mode 100644
index 000000000..4c00b96c7
--- /dev/null
+++ b/src/video_core/gpu_synch.cpp
@@ -0,0 +1,37 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "video_core/gpu_synch.h"
+#include "video_core/renderer_base.h"
+
+namespace VideoCommon {
+
+GPUSynch::GPUSynch(Core::System& system, VideoCore::RendererBase& renderer)
+    : Tegra::GPU(system, renderer) {}
+
+GPUSynch::~GPUSynch() = default;
+
+void GPUSynch::PushGPUEntries(Tegra::CommandList&& entries) {
+    dma_pusher->Push(std::move(entries));
+    dma_pusher->DispatchCalls();
+}
+
+void GPUSynch::SwapBuffers(
+    std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) {
+    renderer.SwapBuffers(std::move(framebuffer));
+}
+
+void GPUSynch::FlushRegion(VAddr addr, u64 size) {
+    renderer.Rasterizer().FlushRegion(addr, size);
+}
+
+void GPUSynch::InvalidateRegion(VAddr addr, u64 size) {
+    renderer.Rasterizer().InvalidateRegion(addr, size);
+}
+
+void GPUSynch::FlushAndInvalidateRegion(VAddr addr, u64 size) {
+    renderer.Rasterizer().FlushAndInvalidateRegion(addr, size);
+}
+
+} // namespace VideoCommon
diff --git a/src/video_core/gpu_synch.h b/src/video_core/gpu_synch.h
new file mode 100644
index 000000000..658f683e2
--- /dev/null
+++ b/src/video_core/gpu_synch.h
@@ -0,0 +1,29 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include "video_core/gpu.h"
+
+namespace VideoCore {
+class RendererBase;
+} // namespace VideoCore
+
+namespace VideoCommon {
+
+/// Implementation of GPU interface that runs the GPU synchronously
+class GPUSynch : public Tegra::GPU {
+public:
+    explicit GPUSynch(Core::System& system, VideoCore::RendererBase& renderer);
+    ~GPUSynch();
+
+    void PushGPUEntries(Tegra::CommandList&& entries) override;
+    void SwapBuffers(
+        std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) override;
+    void FlushRegion(VAddr addr, u64 size) override;
+    void InvalidateRegion(VAddr addr, u64 size) override;
+    void FlushAndInvalidateRegion(VAddr addr, u64 size) override;
+};
+
+} // namespace VideoCommon
diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp
new file mode 100644
index 000000000..c5bdd2a17
--- /dev/null
+++ b/src/video_core/gpu_thread.cpp
@@ -0,0 +1,152 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/assert.h"
+#include "common/microprofile.h"
+#include "core/frontend/scope_acquire_window_context.h"
+#include "core/settings.h"
+#include "video_core/dma_pusher.h"
+#include "video_core/gpu.h"
+#include "video_core/gpu_thread.h"
+#include "video_core/renderer_base.h"
+
+namespace VideoCommon::GPUThread {
+
+/// Executes a single GPU thread command
+static void ExecuteCommand(CommandData* command, VideoCore::RendererBase& renderer,
+                           Tegra::DmaPusher& dma_pusher) {
+    if (const auto submit_list = std::get_if<SubmitListCommand>(command)) {
+        dma_pusher.Push(std::move(submit_list->entries));
+        dma_pusher.DispatchCalls();
+    } else if (const auto data = std::get_if<SwapBuffersCommand>(command)) {
+        renderer.SwapBuffers(data->framebuffer);
+    } else if (const auto data = std::get_if<FlushRegionCommand>(command)) {
+        renderer.Rasterizer().FlushRegion(data->addr, data->size);
+    } else if (const auto data = std::get_if<InvalidateRegionCommand>(command)) {
+        renderer.Rasterizer().InvalidateRegion(data->addr, data->size);
+    } else if (const auto data = std::get_if<FlushAndInvalidateRegionCommand>(command)) {
+        renderer.Rasterizer().FlushAndInvalidateRegion(data->addr, data->size);
+    } else {
+        UNREACHABLE();
+    }
+}
+
+/// Runs the GPU thread
+static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_pusher,
+                      SynchState& state) {
+
+    MicroProfileOnThreadCreate("GpuThread");
+
+    auto WaitForWakeup = [&]() {
+        std::unique_lock<std::mutex> lock{state.signal_mutex};
+        state.signal_condition.wait(lock, [&] { return !state.is_idle || !state.is_running; });
+    };
+
+    // Wait for first GPU command before acquiring the window context
+    WaitForWakeup();
+
+    // If emulation was stopped during disk shader loading, abort before trying to acquire context
+    if (!state.is_running) {
+        return;
+    }
+
+    Core::Frontend::ScopeAcquireWindowContext acquire_context{renderer.GetRenderWindow()};
+
+    while (state.is_running) {
+        if (!state.is_running) {
+            return;
+        }
+
+        {
+            // Thread has been woken up, so make the previous write queue the next read queue
+            std::lock_guard<std::mutex> lock{state.signal_mutex};
+            std::swap(state.push_queue, state.pop_queue);
+        }
+
+        // Execute all of the GPU commands
+        while (!state.pop_queue->empty()) {
+            ExecuteCommand(&state.pop_queue->front(), renderer, dma_pusher);
+            state.pop_queue->pop();
+        }
+
+        state.UpdateIdleState();
+
+        // Signal that the GPU thread has finished processing commands
+        if (state.is_idle) {
+            state.idle_condition.notify_one();
+        }
+
+        // Wait for CPU thread to send more GPU commands
+        WaitForWakeup();
+    }
+}
+
+ThreadManager::ThreadManager(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_pusher)
+    : renderer{renderer}, dma_pusher{dma_pusher}, thread{RunThread, std::ref(renderer),
+                                                         std::ref(dma_pusher), std::ref(state)},
+      thread_id{thread.get_id()} {}
+
+ThreadManager::~ThreadManager() {
+    {
+        // Notify GPU thread that a shutdown is pending
+        std::lock_guard<std::mutex> lock{state.signal_mutex};
+        state.is_running = false;
+    }
+
+    state.signal_condition.notify_one();
+    thread.join();
+}
+
+void ThreadManager::SubmitList(Tegra::CommandList&& entries) {
+    if (entries.empty()) {
+        return;
+    }
+
+    PushCommand(SubmitListCommand(std::move(entries)), false, false);
+}
+
+void ThreadManager::SwapBuffers(
+    std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) {
+    PushCommand(SwapBuffersCommand(std::move(framebuffer)), true, false);
+}
+
+void ThreadManager::FlushRegion(VAddr addr, u64 size) {
+    // Block the CPU when using accurate emulation
+    PushCommand(FlushRegionCommand(addr, size), Settings::values.use_accurate_gpu_emulation, false);
+}
+
+void ThreadManager::InvalidateRegion(VAddr addr, u64 size) {
+    PushCommand(InvalidateRegionCommand(addr, size), true, true);
+}
+
+void ThreadManager::FlushAndInvalidateRegion(VAddr addr, u64 size) {
+    InvalidateRegion(addr, size);
+}
+
+void ThreadManager::PushCommand(CommandData&& command_data, bool wait_for_idle, bool allow_on_cpu) {
+    {
+        std::lock_guard<std::mutex> lock{state.signal_mutex};
+
+        if ((allow_on_cpu && state.is_idle) || IsGpuThread()) {
+            // Execute the command synchronously on the current thread
+            ExecuteCommand(&command_data, renderer, dma_pusher);
+            return;
+        }
+
+        // Push the command to the GPU thread
+        state.UpdateIdleState();
+        state.push_queue->emplace(command_data);
+    }
+
+    // Signal the GPU thread that commands are pending
+    state.signal_condition.notify_one();
+
+    if (wait_for_idle) {
+        // Wait for the GPU to be idle (all commands to be executed)
+        std::unique_lock<std::mutex> lock{state.idle_mutex};
+        state.idle_condition.wait(lock, [this] { return static_cast<bool>(state.is_idle); });
+    }
+}
+
+} // namespace VideoCommon::GPUThread
diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h
new file mode 100644
index 000000000..2ad8214cc
--- /dev/null
+++ b/src/video_core/gpu_thread.h
@@ -0,0 +1,136 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <atomic>
+#include <condition_variable>
+#include <memory>
+#include <mutex>
+#include <optional>
+#include <thread>
+#include <variant>
+
+namespace Tegra {
+struct FramebufferConfig;
+class DmaPusher;
+} // namespace Tegra
+
+namespace VideoCore {
+class RendererBase;
+} // namespace VideoCore
+
+namespace VideoCommon::GPUThread {
+
+/// Command to signal to the GPU thread that a command list is ready for processing
+struct SubmitListCommand final {
+    explicit SubmitListCommand(Tegra::CommandList&& entries) : entries{std::move(entries)} {}
+
+    Tegra::CommandList entries;
+};
+
+/// Command to signal to the GPU thread that a swap buffers is pending
+struct SwapBuffersCommand final {
+    explicit SwapBuffersCommand(std::optional<const Tegra::FramebufferConfig> framebuffer)
+        : framebuffer{std::move(framebuffer)} {}
+
+    std::optional<const Tegra::FramebufferConfig> framebuffer;
+};
+
+/// Command to signal to the GPU thread to flush a region
+struct FlushRegionCommand final {
+    explicit constexpr FlushRegionCommand(VAddr addr, u64 size) : addr{addr}, size{size} {}
+
+    const VAddr addr;
+    const u64 size;
+};
+
+/// Command to signal to the GPU thread to invalidate a region
+struct InvalidateRegionCommand final {
+    explicit constexpr InvalidateRegionCommand(VAddr addr, u64 size) : addr{addr}, size{size} {}
+
+    const VAddr addr;
+    const u64 size;
+};
+
+/// Command to signal to the GPU thread to flush and invalidate a region
+struct FlushAndInvalidateRegionCommand final {
+    explicit constexpr FlushAndInvalidateRegionCommand(VAddr addr, u64 size)
+        : addr{addr}, size{size} {}
+
+    const VAddr addr;
+    const u64 size;
+};
+
+using CommandData = std::variant<SubmitListCommand, SwapBuffersCommand, FlushRegionCommand,
+                                 InvalidateRegionCommand, FlushAndInvalidateRegionCommand>;
+
+/// Struct used to synchronize the GPU thread
+struct SynchState final {
+    std::atomic<bool> is_running{true};
+    std::atomic<bool> is_idle{true};
+    std::condition_variable signal_condition;
+    std::mutex signal_mutex;
+    std::condition_variable idle_condition;
+    std::mutex idle_mutex;
+
+    // We use two queues for sending commands to the GPU thread, one for writing (push_queue) to and
+    // one for reading from (pop_queue). These are swapped whenever the current pop_queue becomes
+    // empty. This allows for efficient thread-safe access, as it does not require any copies.
+
+    using CommandQueue = std::queue<CommandData>;
+    std::array<CommandQueue, 2> command_queues;
+    CommandQueue* push_queue{&command_queues[0]};
+    CommandQueue* pop_queue{&command_queues[1]};
+
+    void UpdateIdleState() {
+        std::lock_guard<std::mutex> lock{idle_mutex};
+        is_idle = command_queues[0].empty() && command_queues[1].empty();
+    }
+};
+
+/// Class used to manage the GPU thread
+class ThreadManager final {
+public:
+    explicit ThreadManager(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_pusher);
+    ~ThreadManager();
+
+    /// Push GPU command entries to be processed
+    void SubmitList(Tegra::CommandList&& entries);
+
+    /// Swap buffers (render frame)
+    void SwapBuffers(
+        std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer);
+
+    /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory
+    void FlushRegion(VAddr addr, u64 size);
+
+    /// Notify rasterizer that any caches of the specified region should be invalidated
+    void InvalidateRegion(VAddr addr, u64 size);
+
+    /// Notify rasterizer that any caches of the specified region should be flushed and invalidated
+    void FlushAndInvalidateRegion(VAddr addr, u64 size);
+
+    /// Waits the caller until the GPU thread is idle, used for synchronization
+    void WaitForIdle();
+
+private:
+    /// Pushes a command to be executed by the GPU thread
+    void PushCommand(CommandData&& command_data, bool wait_for_idle, bool allow_on_cpu);
+
+    /// Returns true if this is called by the GPU thread
+    bool IsGpuThread() const {
+        return std::this_thread::get_id() == thread_id;
+    }
+
+private:
+    SynchState state;
+    std::thread thread;
+    std::thread::id thread_id;
+    VideoCore::RendererBase& renderer;
+    Tegra::DmaPusher& dma_pusher;
+};
+
+} // namespace VideoCommon::GPUThread
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 321d9dd3d..168288088 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -749,11 +749,7 @@ void RasterizerOpenGL::FlushAll() {}
 
 void RasterizerOpenGL::FlushRegion(VAddr addr, u64 size) {
     MICROPROFILE_SCOPE(OpenGL_CacheManagement);
-
-    if (Settings::values.use_accurate_gpu_emulation) {
-        // Only flush if use_accurate_gpu_emulation is enabled, as it incurs a performance hit
-        res_cache.FlushRegion(addr, size);
-    }
+    res_cache.FlushRegion(addr, size);
 }
 
 void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) {
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
index b5a9722f9..876698b37 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
@@ -21,7 +21,7 @@
 #include "video_core/renderer_opengl/gl_rasterizer_cache.h"
 #include "video_core/renderer_opengl/utils.h"
 #include "video_core/surface.h"
-#include "video_core/textures/astc.h"
+#include "video_core/textures/convert.h"
 #include "video_core/textures/decoders.h"
 
 namespace OpenGL {
@@ -597,103 +597,6 @@ CachedSurface::CachedSurface(const SurfaceParams& params)
     }
 }
 
-static void ConvertS8Z24ToZ24S8(std::vector<u8>& data, u32 width, u32 height, bool reverse) {
-    union S8Z24 {
-        BitField<0, 24, u32> z24;
-        BitField<24, 8, u32> s8;
-    };
-    static_assert(sizeof(S8Z24) == 4, "S8Z24 is incorrect size");
-
-    union Z24S8 {
-        BitField<0, 8, u32> s8;
-        BitField<8, 24, u32> z24;
-    };
-    static_assert(sizeof(Z24S8) == 4, "Z24S8 is incorrect size");
-
-    S8Z24 s8z24_pixel{};
-    Z24S8 z24s8_pixel{};
-    constexpr auto bpp{GetBytesPerPixel(PixelFormat::S8Z24)};
-    for (std::size_t y = 0; y < height; ++y) {
-        for (std::size_t x = 0; x < width; ++x) {
-            const std::size_t offset{bpp * (y * width + x)};
-            if (reverse) {
-                std::memcpy(&z24s8_pixel, &data[offset], sizeof(Z24S8));
-                s8z24_pixel.s8.Assign(z24s8_pixel.s8);
-                s8z24_pixel.z24.Assign(z24s8_pixel.z24);
-                std::memcpy(&data[offset], &s8z24_pixel, sizeof(S8Z24));
-            } else {
-                std::memcpy(&s8z24_pixel, &data[offset], sizeof(S8Z24));
-                z24s8_pixel.s8.Assign(s8z24_pixel.s8);
-                z24s8_pixel.z24.Assign(s8z24_pixel.z24);
-                std::memcpy(&data[offset], &z24s8_pixel, sizeof(Z24S8));
-            }
-        }
-    }
-}
-
-/**
- * Helper function to perform software conversion (as needed) when loading a buffer from Switch
- * memory. This is for Maxwell pixel formats that cannot be represented as-is in OpenGL or with
- * typical desktop GPUs.
- */
-static void ConvertFormatAsNeeded_LoadGLBuffer(std::vector<u8>& data, PixelFormat pixel_format,
-                                               u32 width, u32 height, u32 depth) {
-    switch (pixel_format) {
-    case PixelFormat::ASTC_2D_4X4:
-    case PixelFormat::ASTC_2D_8X8:
-    case PixelFormat::ASTC_2D_8X5:
-    case PixelFormat::ASTC_2D_5X4:
-    case PixelFormat::ASTC_2D_5X5:
-    case PixelFormat::ASTC_2D_4X4_SRGB:
-    case PixelFormat::ASTC_2D_8X8_SRGB:
-    case PixelFormat::ASTC_2D_8X5_SRGB:
-    case PixelFormat::ASTC_2D_5X4_SRGB:
-    case PixelFormat::ASTC_2D_5X5_SRGB:
-    case PixelFormat::ASTC_2D_10X8:
-    case PixelFormat::ASTC_2D_10X8_SRGB: {
-        // Convert ASTC pixel formats to RGBA8, as most desktop GPUs do not support ASTC.
-        u32 block_width{};
-        u32 block_height{};
-        std::tie(block_width, block_height) = GetASTCBlockSize(pixel_format);
-        data =
-            Tegra::Texture::ASTC::Decompress(data, width, height, depth, block_width, block_height);
-        break;
-    }
-    case PixelFormat::S8Z24:
-        // Convert the S8Z24 depth format to Z24S8, as OpenGL does not support S8Z24.
-        ConvertS8Z24ToZ24S8(data, width, height, false);
-        break;
-    }
-}
-
-/**
- * Helper function to perform software conversion (as needed) when flushing a buffer from OpenGL to
- * Switch memory. This is for Maxwell pixel formats that cannot be represented as-is in OpenGL or
- * with typical desktop GPUs.
- */
-static void ConvertFormatAsNeeded_FlushGLBuffer(std::vector<u8>& data, PixelFormat pixel_format,
-                                                u32 width, u32 height) {
-    switch (pixel_format) {
-    case PixelFormat::ASTC_2D_4X4:
-    case PixelFormat::ASTC_2D_8X8:
-    case PixelFormat::ASTC_2D_4X4_SRGB:
-    case PixelFormat::ASTC_2D_8X8_SRGB:
-    case PixelFormat::ASTC_2D_5X5:
-    case PixelFormat::ASTC_2D_5X5_SRGB:
-    case PixelFormat::ASTC_2D_10X8:
-    case PixelFormat::ASTC_2D_10X8_SRGB: {
-        LOG_CRITICAL(HW_GPU, "Conversion of format {} after texture flushing is not implemented",
-                     static_cast<u32>(pixel_format));
-        UNREACHABLE();
-        break;
-    }
-    case PixelFormat::S8Z24:
-        // Convert the Z24S8 depth format to S8Z24, as OpenGL does not support S8Z24.
-        ConvertS8Z24ToZ24S8(data, width, height, true);
-        break;
-    }
-}
-
 MICROPROFILE_DEFINE(OpenGL_SurfaceLoad, "OpenGL", "Surface Load", MP_RGB(128, 192, 64));
 void CachedSurface::LoadGLBuffer() {
     MICROPROFILE_SCOPE(OpenGL_SurfaceLoad);
@@ -722,8 +625,16 @@ void CachedSurface::LoadGLBuffer() {
         }
     }
     for (u32 i = 0; i < params.max_mip_level; i++) {
-        ConvertFormatAsNeeded_LoadGLBuffer(gl_buffer[i], params.pixel_format, params.MipWidth(i),
-                                           params.MipHeight(i), params.MipDepth(i));
+        const u32 width = params.MipWidth(i);
+        const u32 height = params.MipHeight(i);
+        const u32 depth = params.MipDepth(i);
+        if (VideoCore::Surface::IsPixelFormatASTC(params.pixel_format)) {
+            // Reserve size for RGBA8 conversion
+            constexpr std::size_t rgba_bpp = 4;
+            gl_buffer[i].resize(std::max(gl_buffer[i].size(), width * height * depth * rgba_bpp));
+        }
+        Tegra::Texture::ConvertFromGuestToHost(gl_buffer[i].data(), params.pixel_format, width,
+                                               height, depth, true, true);
     }
 }
 
@@ -746,8 +657,8 @@ void CachedSurface::FlushGLBuffer() {
     glGetTextureImage(texture.handle, 0, tuple.format, tuple.type,
                       static_cast<GLsizei>(gl_buffer[0].size()), gl_buffer[0].data());
     glPixelStorei(GL_PACK_ROW_LENGTH, 0);
-    ConvertFormatAsNeeded_FlushGLBuffer(gl_buffer[0], params.pixel_format, params.width,
-                                        params.height);
+    Tegra::Texture::ConvertFromHostToGuest(gl_buffer[0].data(), params.pixel_format, params.width,
+                                           params.height, params.depth, true, true);
     const u8* const texture_src_data = Memory::GetPointer(params.addr);
     ASSERT(texture_src_data);
     if (params.is_tiled) {
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index e60b2eb44..8b510b6ae 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -244,6 +244,21 @@ void RendererOpenGL::InitOpenGLObjects() {
     LoadColorToActiveGLTexture(0, 0, 0, 0, screen_info.texture);
 }
 
+void RendererOpenGL::AddTelemetryFields() {
+    const char* const gl_version{reinterpret_cast<char const*>(glGetString(GL_VERSION))};
+    const char* const gpu_vendor{reinterpret_cast<char const*>(glGetString(GL_VENDOR))};
+    const char* const gpu_model{reinterpret_cast<char const*>(glGetString(GL_RENDERER))};
+
+    LOG_INFO(Render_OpenGL, "GL_VERSION: {}", gl_version);
+    LOG_INFO(Render_OpenGL, "GL_VENDOR: {}", gpu_vendor);
+    LOG_INFO(Render_OpenGL, "GL_RENDERER: {}", gpu_model);
+
+    auto& telemetry_session = system.TelemetrySession();
+    telemetry_session.AddField(Telemetry::FieldType::UserSystem, "GPU_Vendor", gpu_vendor);
+    telemetry_session.AddField(Telemetry::FieldType::UserSystem, "GPU_Model", gpu_model);
+    telemetry_session.AddField(Telemetry::FieldType::UserSystem, "GPU_OpenGL_Version", gl_version);
+}
+
 void RendererOpenGL::CreateRasterizer() {
     if (rasterizer) {
         return;
@@ -466,17 +481,7 @@ bool RendererOpenGL::Init() {
         glDebugMessageCallback(DebugHandler, nullptr);
     }
 
-    const char* gl_version{reinterpret_cast<char const*>(glGetString(GL_VERSION))};
-    const char* gpu_vendor{reinterpret_cast<char const*>(glGetString(GL_VENDOR))};
-    const char* gpu_model{reinterpret_cast<char const*>(glGetString(GL_RENDERER))};
-
-    LOG_INFO(Render_OpenGL, "GL_VERSION: {}", gl_version);
-    LOG_INFO(Render_OpenGL, "GL_VENDOR: {}", gpu_vendor);
-    LOG_INFO(Render_OpenGL, "GL_RENDERER: {}", gpu_model);
-
-    Core::Telemetry().AddField(Telemetry::FieldType::UserSystem, "GPU_Vendor", gpu_vendor);
-    Core::Telemetry().AddField(Telemetry::FieldType::UserSystem, "GPU_Model", gpu_model);
-    Core::Telemetry().AddField(Telemetry::FieldType::UserSystem, "GPU_OpenGL_Version", gl_version);
+    AddTelemetryFields();
 
     if (!GLAD_GL_VERSION_4_3) {
         return false;
diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h
index c168fa89e..6cbf9d2cb 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.h
+++ b/src/video_core/renderer_opengl/renderer_opengl.h
@@ -60,6 +60,7 @@ public:
 
 private:
     void InitOpenGLObjects();
+    void AddTelemetryFields();
     void CreateRasterizer();
 
     void ConfigureFramebufferTexture(TextureInfo& texture,
diff --git a/src/video_core/textures/astc.cpp b/src/video_core/textures/astc.cpp
index bc50a4876..b508d64e9 100644
--- a/src/video_core/textures/astc.cpp
+++ b/src/video_core/textures/astc.cpp
@@ -23,28 +23,12 @@
 
 #include "video_core/textures/astc.h"
 
-class BitStream {
+class InputBitStream {
 public:
-    explicit BitStream(unsigned char* ptr, int nBits = 0, int start_offset = 0)
+    explicit InputBitStream(const unsigned char* ptr, int nBits = 0, int start_offset = 0)
         : m_NumBits(nBits), m_CurByte(ptr), m_NextBit(start_offset % 8) {}
 
-    ~BitStream() = default;
-
-    int GetBitsWritten() const {
-        return m_BitsWritten;
-    }
-
-    void WriteBitsR(unsigned int val, unsigned int nBits) {
-        for (unsigned int i = 0; i < nBits; i++) {
-            WriteBit((val >> (nBits - i - 1)) & 1);
-        }
-    }
-
-    void WriteBits(unsigned int val, unsigned int nBits) {
-        for (unsigned int i = 0; i < nBits; i++) {
-            WriteBit((val >> i) & 1);
-        }
-    }
+    ~InputBitStream() = default;
 
     int GetBitsRead() const {
         return m_BitsRead;
@@ -71,6 +55,38 @@ public:
     }
 
 private:
+    const int m_NumBits;
+    const unsigned char* m_CurByte;
+    int m_NextBit = 0;
+    int m_BitsRead = 0;
+
+    bool done = false;
+};
+
+class OutputBitStream {
+public:
+    explicit OutputBitStream(unsigned char* ptr, int nBits = 0, int start_offset = 0)
+        : m_NumBits(nBits), m_CurByte(ptr), m_NextBit(start_offset % 8) {}
+
+    ~OutputBitStream() = default;
+
+    int GetBitsWritten() const {
+        return m_BitsWritten;
+    }
+
+    void WriteBitsR(unsigned int val, unsigned int nBits) {
+        for (unsigned int i = 0; i < nBits; i++) {
+            WriteBit((val >> (nBits - i - 1)) & 1);
+        }
+    }
+
+    void WriteBits(unsigned int val, unsigned int nBits) {
+        for (unsigned int i = 0; i < nBits; i++) {
+            WriteBit((val >> i) & 1);
+        }
+    }
+
+private:
     void WriteBit(int b) {
 
         if (done)
@@ -238,8 +254,8 @@ public:
     // Fills result with the values that are encoded in the given
     // bitstream. We must know beforehand what the maximum possible
     // value is, and how many values we're decoding.
-    static void DecodeIntegerSequence(std::vector<IntegerEncodedValue>& result, BitStream& bits,
-                                      uint32_t maxRange, uint32_t nValues) {
+    static void DecodeIntegerSequence(std::vector<IntegerEncodedValue>& result,
+                                      InputBitStream& bits, uint32_t maxRange, uint32_t nValues) {
         // Determine encoding parameters
         IntegerEncodedValue val = IntegerEncodedValue::CreateEncoding(maxRange);
 
@@ -267,7 +283,7 @@ public:
     }
 
 private:
-    static void DecodeTritBlock(BitStream& bits, std::vector<IntegerEncodedValue>& result,
+    static void DecodeTritBlock(InputBitStream& bits, std::vector<IntegerEncodedValue>& result,
                                 uint32_t nBitsPerValue) {
         // Implement the algorithm in section C.2.12
         uint32_t m[5];
@@ -327,7 +343,7 @@ private:
         }
     }
 
-    static void DecodeQuintBlock(BitStream& bits, std::vector<IntegerEncodedValue>& result,
+    static void DecodeQuintBlock(InputBitStream& bits, std::vector<IntegerEncodedValue>& result,
                                  uint32_t nBitsPerValue) {
         // Implement the algorithm in section C.2.12
         uint32_t m[3];
@@ -406,7 +422,7 @@ struct TexelWeightParams {
     }
 };
 
-static TexelWeightParams DecodeBlockInfo(BitStream& strm) {
+static TexelWeightParams DecodeBlockInfo(InputBitStream& strm) {
     TexelWeightParams params;
 
     // Read the entire block mode all at once
@@ -605,7 +621,7 @@ static TexelWeightParams DecodeBlockInfo(BitStream& strm) {
     return params;
 }
 
-static void FillVoidExtentLDR(BitStream& strm, uint32_t* const outBuf, uint32_t blockWidth,
+static void FillVoidExtentLDR(InputBitStream& strm, uint32_t* const outBuf, uint32_t blockWidth,
                               uint32_t blockHeight) {
     // Don't actually care about the void extent, just read the bits...
     for (int i = 0; i < 4; ++i) {
@@ -821,7 +837,7 @@ static void DecodeColorValues(uint32_t* out, uint8_t* data, const uint32_t* mode
 
     // We now have enough to decode our integer sequence.
     std::vector<IntegerEncodedValue> decodedColorValues;
-    BitStream colorStream(data);
+    InputBitStream colorStream(data);
     IntegerEncodedValue::DecodeIntegerSequence(decodedColorValues, colorStream, range, nValues);
 
     // Once we have the decoded values, we need to dequantize them to the 0-255 range
@@ -1365,9 +1381,9 @@ static void ComputeEndpoints(Pixel& ep1, Pixel& ep2, const uint32_t*& colorValue
 #undef READ_INT_VALUES
 }
 
-static void DecompressBlock(uint8_t inBuf[16], const uint32_t blockWidth,
+static void DecompressBlock(const uint8_t inBuf[16], const uint32_t blockWidth,
                             const uint32_t blockHeight, uint32_t* outBuf) {
-    BitStream strm(inBuf);
+    InputBitStream strm(inBuf);
     TexelWeightParams weightParams = DecodeBlockInfo(strm);
 
     // Was there an error?
@@ -1421,7 +1437,7 @@ static void DecompressBlock(uint8_t inBuf[16], const uint32_t blockWidth,
     // Define color data.
     uint8_t colorEndpointData[16];
     memset(colorEndpointData, 0, sizeof(colorEndpointData));
-    BitStream colorEndpointStream(colorEndpointData, 16 * 8, 0);
+    OutputBitStream colorEndpointStream(colorEndpointData, 16 * 8, 0);
 
     // Read extra config data...
     uint32_t baseCEM = 0;
@@ -1549,7 +1565,7 @@ static void DecompressBlock(uint8_t inBuf[16], const uint32_t blockWidth,
     memset(texelWeightData + clearByteStart, 0, 16 - clearByteStart);
 
     std::vector<IntegerEncodedValue> texelWeightValues;
-    BitStream weightStream(texelWeightData);
+    InputBitStream weightStream(texelWeightData);
 
     IntegerEncodedValue::DecodeIntegerSequence(texelWeightValues, weightStream,
                                                weightParams.m_MaxWeight,
@@ -1597,7 +1613,7 @@ static void DecompressBlock(uint8_t inBuf[16], const uint32_t blockWidth,
 
 namespace Tegra::Texture::ASTC {
 
-std::vector<uint8_t> Decompress(std::vector<uint8_t>& data, uint32_t width, uint32_t height,
+std::vector<uint8_t> Decompress(const uint8_t* data, uint32_t width, uint32_t height,
                                 uint32_t depth, uint32_t block_width, uint32_t block_height) {
     uint32_t blockIdx = 0;
     std::vector<uint8_t> outData(height * width * depth * 4);
@@ -1605,7 +1621,7 @@ std::vector<uint8_t> Decompress(std::vector<uint8_t>& data, uint32_t width, uint
         for (uint32_t j = 0; j < height; j += block_height) {
             for (uint32_t i = 0; i < width; i += block_width) {
 
-                uint8_t* blockPtr = data.data() + blockIdx * 16;
+                const uint8_t* blockPtr = data + blockIdx * 16;
 
                 // Blocks can be at most 12x12
                 uint32_t uncompData[144];
diff --git a/src/video_core/textures/astc.h b/src/video_core/textures/astc.h
index d419dd025..991cdba72 100644
--- a/src/video_core/textures/astc.h
+++ b/src/video_core/textures/astc.h
@@ -9,7 +9,7 @@
 
 namespace Tegra::Texture::ASTC {
 
-std::vector<uint8_t> Decompress(std::vector<uint8_t>& data, uint32_t width, uint32_t height,
+std::vector<uint8_t> Decompress(const uint8_t* data, uint32_t width, uint32_t height,
                                 uint32_t depth, uint32_t block_width, uint32_t block_height);
 
 } // namespace Tegra::Texture::ASTC
diff --git a/src/video_core/textures/convert.cpp b/src/video_core/textures/convert.cpp
new file mode 100644
index 000000000..5e439f036
--- /dev/null
+++ b/src/video_core/textures/convert.cpp
@@ -0,0 +1,92 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <cstring>
+#include <tuple>
+#include <vector>
+
+#include "common/assert.h"
+#include "common/common_types.h"
+#include "common/logging/log.h"
+#include "video_core/textures/astc.h"
+#include "video_core/textures/convert.h"
+
+namespace Tegra::Texture {
+
+using VideoCore::Surface::PixelFormat;
+
+template <bool reverse>
+void SwapS8Z24ToZ24S8(u8* data, u32 width, u32 height) {
+    union S8Z24 {
+        BitField<0, 24, u32> z24;
+        BitField<24, 8, u32> s8;
+    };
+    static_assert(sizeof(S8Z24) == 4, "S8Z24 is incorrect size");
+
+    union Z24S8 {
+        BitField<0, 8, u32> s8;
+        BitField<8, 24, u32> z24;
+    };
+    static_assert(sizeof(Z24S8) == 4, "Z24S8 is incorrect size");
+
+    S8Z24 s8z24_pixel{};
+    Z24S8 z24s8_pixel{};
+    constexpr auto bpp{
+        VideoCore::Surface::GetBytesPerPixel(VideoCore::Surface::PixelFormat::S8Z24)};
+    for (std::size_t y = 0; y < height; ++y) {
+        for (std::size_t x = 0; x < width; ++x) {
+            const std::size_t offset{bpp * (y * width + x)};
+            if constexpr (reverse) {
+                std::memcpy(&z24s8_pixel, &data[offset], sizeof(Z24S8));
+                s8z24_pixel.s8.Assign(z24s8_pixel.s8);
+                s8z24_pixel.z24.Assign(z24s8_pixel.z24);
+                std::memcpy(&data[offset], &s8z24_pixel, sizeof(S8Z24));
+            } else {
+                std::memcpy(&s8z24_pixel, &data[offset], sizeof(S8Z24));
+                z24s8_pixel.s8.Assign(s8z24_pixel.s8);
+                z24s8_pixel.z24.Assign(s8z24_pixel.z24);
+                std::memcpy(&data[offset], &z24s8_pixel, sizeof(Z24S8));
+            }
+        }
+    }
+}
+
+static void ConvertS8Z24ToZ24S8(u8* data, u32 width, u32 height) {
+    SwapS8Z24ToZ24S8<false>(data, width, height);
+}
+
+static void ConvertZ24S8ToS8Z24(u8* data, u32 width, u32 height) {
+    SwapS8Z24ToZ24S8<true>(data, width, height);
+}
+
+void ConvertFromGuestToHost(u8* data, PixelFormat pixel_format, u32 width, u32 height, u32 depth,
+                            bool convert_astc, bool convert_s8z24) {
+    if (convert_astc && IsPixelFormatASTC(pixel_format)) {
+        // Convert ASTC pixel formats to RGBA8, as most desktop GPUs do not support ASTC.
+        u32 block_width{};
+        u32 block_height{};
+        std::tie(block_width, block_height) = GetASTCBlockSize(pixel_format);
+        const std::vector<u8> rgba8_data =
+            Tegra::Texture::ASTC::Decompress(data, width, height, depth, block_width, block_height);
+        std::copy(rgba8_data.begin(), rgba8_data.end(), data);
+
+    } else if (convert_s8z24 && pixel_format == PixelFormat::S8Z24) {
+        Tegra::Texture::ConvertS8Z24ToZ24S8(data, width, height);
+    }
+}
+
+void ConvertFromHostToGuest(u8* data, PixelFormat pixel_format, u32 width, u32 height, u32 depth,
+                            bool convert_astc, bool convert_s8z24) {
+    if (convert_astc && IsPixelFormatASTC(pixel_format)) {
+        LOG_CRITICAL(HW_GPU, "Conversion of format {} after texture flushing is not implemented",
+                     static_cast<u32>(pixel_format));
+        UNREACHABLE();
+
+    } else if (convert_s8z24 && pixel_format == PixelFormat::S8Z24) {
+        Tegra::Texture::ConvertZ24S8ToS8Z24(data, width, height);
+    }
+}
+
+} // namespace Tegra::Texture
+\ No newline at end of file
diff --git a/src/video_core/textures/convert.h b/src/video_core/textures/convert.h
new file mode 100644
index 000000000..07cd8b5da
--- /dev/null
+++ b/src/video_core/textures/convert.h
@@ -0,0 +1,18 @@
+// Copyright 2019 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include "common/common_types.h"
+#include "video_core/surface.h"
+
+namespace Tegra::Texture {
+
+void ConvertFromGuestToHost(u8* data, VideoCore::Surface::PixelFormat pixel_format, u32 width,
+                            u32 height, u32 depth, bool convert_astc, bool convert_s8z24);
+
+void ConvertFromHostToGuest(u8* data, VideoCore::Surface::PixelFormat pixel_format, u32 width,
+                            u32 height, u32 depth, bool convert_astc, bool convert_s8z24);
+
+} // namespace Tegra::Texture
+\ No newline at end of file
diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp
index 5db75de22..cad7340f5 100644
--- a/src/video_core/textures/decoders.cpp
+++ b/src/video_core/textures/decoders.cpp
@@ -103,8 +103,8 @@ void FastProcessBlock(u8* const swizzled_data, u8* const unswizzled_data, const
                 const u32 swizzle_offset{y_address + table[(xb / fast_swizzle_align) % 4]};
                 const u32 out_x = xb * out_bytes_per_pixel / bytes_per_pixel;
                 const u32 pixel_index{out_x + pixel_base};
-                data_ptrs[unswizzle] = swizzled_data + swizzle_offset;
-                data_ptrs[!unswizzle] = unswizzled_data + pixel_index;
+                data_ptrs[unswizzle ? 1 : 0] = swizzled_data + swizzle_offset;
+                data_ptrs[unswizzle ? 0 : 1] = unswizzled_data + pixel_index;
                 std::memcpy(data_ptrs[0], data_ptrs[1], fast_swizzle_align);
             }
             pixel_base += stride_x;
@@ -154,7 +154,7 @@ void SwizzledData(u8* const swizzled_data, u8* const unswizzled_data, const bool
             for (u32 xb = 0; xb < blocks_on_x; xb++) {
                 const u32 x_start = xb * block_x_elements;
                 const u32 x_end = std::min(width, x_start + block_x_elements);
-                if (fast) {
+                if constexpr (fast) {
                     FastProcessBlock(swizzled_data, unswizzled_data, unswizzle, x_start, y_start,
                                      z_start, x_end, y_end, z_end, tile_offset, xy_block_size,
                                      layer_z, stride_x, bytes_per_pixel, out_bytes_per_pixel);
diff --git a/src/video_core/textures/decoders.h b/src/video_core/textures/decoders.h
index 85b7e9f7b..65df86890 100644
--- a/src/video_core/textures/decoders.h
+++ b/src/video_core/textures/decoders.h
@@ -16,16 +16,13 @@ inline std::size_t GetGOBSize() {
     return 512;
 }
 
-/**
- * Unswizzles a swizzled texture without changing its format.
- */
+/// Unswizzles a swizzled texture without changing its format.
 void UnswizzleTexture(u8* unswizzled_data, VAddr address, u32 tile_size_x, u32 tile_size_y,
                       u32 bytes_per_pixel, u32 width, u32 height, u32 depth,
                       u32 block_height = TICEntry::DefaultBlockHeight,
                       u32 block_depth = TICEntry::DefaultBlockHeight, u32 width_spacing = 0);
-/**
- * Unswizzles a swizzled texture without changing its format.
- */
+
+/// Unswizzles a swizzled texture without changing its format.
 std::vector<u8> UnswizzleTexture(VAddr address, u32 tile_size_x, u32 tile_size_y,
                                  u32 bytes_per_pixel, u32 width, u32 height, u32 depth,
                                  u32 block_height = TICEntry::DefaultBlockHeight,
@@ -37,15 +34,11 @@ void CopySwizzledData(u32 width, u32 height, u32 depth, u32 bytes_per_pixel,
                       u32 out_bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data,
                       bool unswizzle, u32 block_height, u32 block_depth, u32 width_spacing);
 
-/**
- * Decodes an unswizzled texture into a A8R8G8B8 texture.
- */
+/// Decodes an unswizzled texture into a A8R8G8B8 texture.
 std::vector<u8> DecodeTexture(const std::vector<u8>& texture_data, TextureFormat format, u32 width,
                               u32 height);
 
-/**
- * This function calculates the correct size of a texture depending if it's tiled or not.
- */
+/// This function calculates the correct size of a texture depending if it's tiled or not.
 std::size_t CalculateSize(bool tiled, u32 bytes_per_pixel, u32 width, u32 height, u32 depth,
                           u32 block_height, u32 block_depth);
 
@@ -53,6 +46,7 @@ std::size_t CalculateSize(bool tiled, u32 bytes_per_pixel, u32 width, u32 height
 void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width,
                     u32 bytes_per_pixel, VAddr swizzled_data, VAddr unswizzled_data,
                     u32 block_height);
+
 /// Copies a tiled subrectangle into a linear surface.
 void UnswizzleSubrect(u32 subrect_width, u32 subrect_height, u32 dest_pitch, u32 swizzled_width,
                       u32 bytes_per_pixel, VAddr swizzled_data, VAddr unswizzled_data,
diff --git a/src/yuzu/bootmanager.cpp b/src/yuzu/bootmanager.cpp
index 73b04b749..3b070bfbb 100644
--- a/src/yuzu/bootmanager.cpp
+++ b/src/yuzu/bootmanager.cpp
@@ -20,10 +20,7 @@
 EmuThread::EmuThread(GRenderWindow* render_window) : render_window(render_window) {}
 
 void EmuThread::run() {
-    if (!Settings::values.use_multi_core) {
-        // Single core mode must acquire OpenGL context for entire emulation session
-        render_window->MakeCurrent();
-    }
+    render_window->MakeCurrent();
 
     MicroProfileOnThreadCreate("EmuThread");
 
@@ -38,6 +35,11 @@ void EmuThread::run() {
 
     emit LoadProgress(VideoCore::LoadCallbackStage::Complete, 0, 0);
 
+    if (Settings::values.use_asynchronous_gpu_emulation) {
+        // Release OpenGL context for the GPU thread
+        render_window->DoneCurrent();
+    }
+
     // holds whether the cpu was running during the last iteration,
     // so that the DebugModeLeft signal can be emitted before the
     // next execution step
diff --git a/src/yuzu/compatdb.cpp b/src/yuzu/compatdb.cpp
index c09a06520..c8b0a5ec0 100644
--- a/src/yuzu/compatdb.cpp
+++ b/src/yuzu/compatdb.cpp
@@ -53,8 +53,8 @@ void CompatDB::Submit() {
     case CompatDBPage::Final:
         back();
         LOG_DEBUG(Frontend, "Compatibility Rating: {}", compatibility->checkedId());
-        Core::Telemetry().AddField(Telemetry::FieldType::UserFeedback, "Compatibility",
-                                   compatibility->checkedId());
+        Core::System::GetInstance().TelemetrySession().AddField(
+            Telemetry::FieldType::UserFeedback, "Compatibility", compatibility->checkedId());
 
         button(NextButton)->setEnabled(false);
         button(NextButton)->setText(tr("Submitting"));
diff --git a/src/yuzu/configuration/config.cpp b/src/yuzu/configuration/config.cpp
index e9546dadf..74dc6bb28 100644
--- a/src/yuzu/configuration/config.cpp
+++ b/src/yuzu/configuration/config.cpp
@@ -374,6 +374,8 @@ void Config::ReadValues() {
         qt_config->value("use_disk_shader_cache", false).toBool();
     Settings::values.use_accurate_gpu_emulation =
         qt_config->value("use_accurate_gpu_emulation", false).toBool();
+    Settings::values.use_asynchronous_gpu_emulation =
+        qt_config->value("use_asynchronous_gpu_emulation", false).toBool();
 
     Settings::values.bg_red = qt_config->value("bg_red", 0.0).toFloat();
     Settings::values.bg_green = qt_config->value("bg_green", 0.0).toFloat();
@@ -633,6 +635,8 @@ void Config::SaveValues() {
     qt_config->setValue("frame_limit", Settings::values.frame_limit);
     qt_config->setValue("use_disk_shader_cache", Settings::values.use_disk_shader_cache);
     qt_config->setValue("use_accurate_gpu_emulation", Settings::values.use_accurate_gpu_emulation);
+    qt_config->setValue("use_asynchronous_gpu_emulation",
+                        Settings::values.use_asynchronous_gpu_emulation);
 
     // Cast to double because Qt's written float values are not human-readable
     qt_config->setValue("bg_red", (double)Settings::values.bg_red);
diff --git a/src/yuzu/configuration/configure_graphics.cpp b/src/yuzu/configuration/configure_graphics.cpp
index 0f5dd534b..dd1d67488 100644
--- a/src/yuzu/configuration/configure_graphics.cpp
+++ b/src/yuzu/configuration/configure_graphics.cpp
@@ -75,6 +75,8 @@ void ConfigureGraphics::setConfiguration() {
     ui->frame_limit->setValue(Settings::values.frame_limit);
     ui->use_disk_shader_cache->setChecked(Settings::values.use_disk_shader_cache);
     ui->use_accurate_gpu_emulation->setChecked(Settings::values.use_accurate_gpu_emulation);
+    ui->use_asynchronous_gpu_emulation->setEnabled(!Core::System::GetInstance().IsPoweredOn());
+    ui->use_asynchronous_gpu_emulation->setChecked(Settings::values.use_asynchronous_gpu_emulation);
     UpdateBackgroundColorButton(QColor::fromRgbF(Settings::values.bg_red, Settings::values.bg_green,
                                                  Settings::values.bg_blue));
 }
@@ -86,6 +88,8 @@ void ConfigureGraphics::applyConfiguration() {
     Settings::values.frame_limit = ui->frame_limit->value();
     Settings::values.use_disk_shader_cache = ui->use_disk_shader_cache->isChecked();
     Settings::values.use_accurate_gpu_emulation = ui->use_accurate_gpu_emulation->isChecked();
+    Settings::values.use_asynchronous_gpu_emulation =
+        ui->use_asynchronous_gpu_emulation->isChecked();
     Settings::values.bg_red = static_cast<float>(bg_color.redF());
     Settings::values.bg_green = static_cast<float>(bg_color.greenF());
     Settings::values.bg_blue = static_cast<float>(bg_color.blueF());
diff --git a/src/yuzu/configuration/configure_graphics.ui b/src/yuzu/configuration/configure_graphics.ui
index 824f5810a..c6767e0ca 100644
--- a/src/yuzu/configuration/configure_graphics.ui
+++ b/src/yuzu/configuration/configure_graphics.ui
@@ -64,6 +64,13 @@
          </widget>
         </item>
         <item>
+         <widget class="QCheckBox" name="use_asynchronous_gpu_emulation">
+          <property name="text">
+           <string>Use asynchronous GPU emulation</string>
+          </property>
+         </widget>
+        </item>
+        <item>
          <layout class="QHBoxLayout" name="horizontalLayout">
           <item>
            <widget class="QLabel" name="label">
diff --git a/src/yuzu/debugger/wait_tree.cpp b/src/yuzu/debugger/wait_tree.cpp
index f50225d5f..06ad74ffe 100644
--- a/src/yuzu/debugger/wait_tree.cpp
+++ b/src/yuzu/debugger/wait_tree.cpp
@@ -81,9 +81,8 @@ QString WaitTreeText::GetText() const {
     return text;
 }
 
-WaitTreeMutexInfo::WaitTreeMutexInfo(VAddr mutex_address) : mutex_address(mutex_address) {
-    const auto& handle_table = Core::CurrentProcess()->GetHandleTable();
-
+WaitTreeMutexInfo::WaitTreeMutexInfo(VAddr mutex_address, const Kernel::HandleTable& handle_table)
+    : mutex_address(mutex_address) {
     mutex_value = Memory::Read32(mutex_address);
     owner_handle = static_cast<Kernel::Handle>(mutex_value & Kernel::Mutex::MutexOwnerMask);
     owner = handle_table.Get<Kernel::Thread>(owner_handle);
@@ -316,7 +315,8 @@ std::vector<std::unique_ptr<WaitTreeItem>> WaitTreeThread::GetChildren() const {
 
     const VAddr mutex_wait_address = thread.GetMutexWaitAddress();
     if (mutex_wait_address != 0) {
-        list.push_back(std::make_unique<WaitTreeMutexInfo>(mutex_wait_address));
+        const auto& handle_table = thread.GetOwnerProcess()->GetHandleTable();
+        list.push_back(std::make_unique<WaitTreeMutexInfo>(mutex_wait_address, handle_table));
     } else {
         list.push_back(std::make_unique<WaitTreeText>(tr("not waiting for mutex")));
     }
diff --git a/src/yuzu/debugger/wait_tree.h b/src/yuzu/debugger/wait_tree.h
index 365c3dbfe..62886609d 100644
--- a/src/yuzu/debugger/wait_tree.h
+++ b/src/yuzu/debugger/wait_tree.h
@@ -17,6 +17,7 @@
 class EmuThread;
 
 namespace Kernel {
+class HandleTable;
 class ReadableEvent;
 class WaitObject;
 class Thread;
@@ -72,7 +73,7 @@ public:
 class WaitTreeMutexInfo : public WaitTreeExpandableItem {
     Q_OBJECT
 public:
-    explicit WaitTreeMutexInfo(VAddr mutex_address);
+    explicit WaitTreeMutexInfo(VAddr mutex_address, const Kernel::HandleTable& handle_table);
     ~WaitTreeMutexInfo() override;
 
     QString GetText() const override;
diff --git a/src/yuzu/main.cpp b/src/yuzu/main.cpp
index 7a833798d..c6f3ab4e4 100644
--- a/src/yuzu/main.cpp
+++ b/src/yuzu/main.cpp
@@ -857,7 +857,7 @@ bool GMainWindow::LoadROM(const QString& filename) {
     }
     game_path = filename;
 
-    Core::Telemetry().AddField(Telemetry::FieldType::App, "Frontend", "Qt");
+    system.TelemetrySession().AddField(Telemetry::FieldType::App, "Frontend", "Qt");
     return true;
 }
 
diff --git a/src/yuzu_cmd/config.cpp b/src/yuzu_cmd/config.cpp
index ff05b3179..ca880dc65 100644
--- a/src/yuzu_cmd/config.cpp
+++ b/src/yuzu_cmd/config.cpp
@@ -354,6 +354,8 @@ void Config::ReadValues() {
         sdl2_config->GetBoolean("Renderer", "use_disk_shader_cache", false);
     Settings::values.use_accurate_gpu_emulation =
         sdl2_config->GetBoolean("Renderer", "use_accurate_gpu_emulation", false);
+    Settings::values.use_asynchronous_gpu_emulation =
+        sdl2_config->GetBoolean("Renderer", "use_asynchronous_gpu_emulation", false);
 
     Settings::values.bg_red = (float)sdl2_config->GetReal("Renderer", "bg_red", 0.0);
     Settings::values.bg_green = (float)sdl2_config->GetReal("Renderer", "bg_green", 0.0);
diff --git a/src/yuzu_cmd/default_ini.h b/src/yuzu_cmd/default_ini.h
index a81986f8e..6538af098 100644
--- a/src/yuzu_cmd/default_ini.h
+++ b/src/yuzu_cmd/default_ini.h
@@ -118,6 +118,10 @@ use_disk_shader_cache =
 # 0 (default): Off (fast), 1 : On (slow)
 use_accurate_gpu_emulation =
 
+# Whether to use asynchronous GPU emulation
+# 0 : Off (slow), 1 (default): On (fast)
+use_asynchronous_gpu_emulation =
+
 # The clear color for the renderer. What shows up on the sides of the bottom screen.
 # Must be in range of 0.0-1.0. Defaults to 1.0 for all.
 bg_red =
diff --git a/src/yuzu_cmd/yuzu.cpp b/src/yuzu_cmd/yuzu.cpp
index c34b5467f..c6c66a787 100644
--- a/src/yuzu_cmd/yuzu.cpp
+++ b/src/yuzu_cmd/yuzu.cpp
@@ -216,7 +216,7 @@ int main(int argc, char** argv) {
         }
     }
 
-    Core::Telemetry().AddField(Telemetry::FieldType::App, "Frontend", "SDL");
+    system.TelemetrySession().AddField(Telemetry::FieldType::App, "Frontend", "SDL");
 
     system.Renderer().Rasterizer().LoadDiskResources();