// summary | refs | log | blame | commit | diff | stats
// path: root/external/optick/optick_gpu.vulkan.cpp
// blob: 6d6f29d7017e6a365dc09e6f09fc060d33cdf433 (plain) (tree)












































































































































































































































































































































































                                                                                                                                                                                                                  
#include "optick.config.h"

#if USE_OPTICK
#if OPTICK_ENABLE_GPU_VULKAN
#include <vulkan/vulkan.h>

#include "optick_core.h"
#include "optick_gpu.h"

// Evaluates a Vulkan call exactly once and asserts that it returned VK_SUCCESS.
// The expansion deliberately has no trailing semicolon so that the macro behaves
// like a single statement (safe inside un-braced if/else); callers supply the ';'.
#define OPTICK_VK_CHECK(args) do { const VkResult optickVkResult_ = (args); OPTICK_ASSERT(optickVkResult_ == VK_SUCCESS, "Failed check"); (void)optickVkResult_; } while(false)

namespace Optick
{
	class GPUProfilerVulkan : public GPUProfiler
	{
	protected:
		struct Frame
		{
			VkCommandBuffer commandBuffer;
			VkFence fence;
			Frame() : commandBuffer(VK_NULL_HANDLE), fence(VK_NULL_HANDLE) {}
		};

		struct NodePayload
		{
			VkDevice			device;
			VkPhysicalDevice	physicalDevice;
			VkQueue				queue;
			VkQueryPool			queryPool;
			VkCommandPool		commandPool;

			array<Frame, NUM_FRAMES_DELAY> frames;

			NodePayload() : device(VK_NULL_HANDLE), physicalDevice(VK_NULL_HANDLE), queue(VK_NULL_HANDLE), queryPool(VK_NULL_HANDLE), commandPool(VK_NULL_HANDLE) {}
			~NodePayload();
		};
		vector<NodePayload*> nodePayloads;

		void ResolveTimestamps(VkCommandBuffer commandBuffer, uint32_t startIndex, uint32_t count);
		void WaitForFrame(uint64_t frameNumber);

	public:
		GPUProfilerVulkan();
		~GPUProfilerVulkan();

		void InitDevice(VkDevice* devices, VkPhysicalDevice* physicalDevices, VkQueue* cmdQueues, uint32_t* cmdQueuesFamily, uint32_t nodeCount);
		void QueryTimestamp(VkCommandBuffer commandBuffer, int64_t* outCpuTimestamp);


		// Interface implementation
		ClockSynchronization GetClockSynchronization(uint32_t nodeIndex) override;

		void QueryTimestamp(void* context, int64_t* outCpuTimestamp) override
		{
			QueryTimestamp((VkCommandBuffer)context, outCpuTimestamp);
		}

		void Flip(void* swapChain) override;
	};

	void InitGpuVulkan(void* vkDevices, void* vkPhysicalDevices, void* vkQueues, uint32_t* cmdQueuesFamily, uint32_t numQueues)
	{
		GPUProfilerVulkan* gpuProfiler = Memory::New<GPUProfilerVulkan>();
		gpuProfiler->InitDevice((VkDevice*)vkDevices, (VkPhysicalDevice*)vkPhysicalDevices, (VkQueue*)vkQueues, cmdQueuesFamily, numQueues);
		Core::Get().InitGPUProfiler(gpuProfiler);
	}

	// Nothing to set up at construction time; Vulkan objects are created in InitDevice.
	GPUProfilerVulkan::GPUProfilerVulkan() = default;

	// Creates the per-node Vulkan objects used by the profiler.
	//
	// For every node i in [0, nodeCount) this reads devices[i], physicalDevices[i],
	// cmdQueues[i] and cmdQueuesFamily[i], then creates a timestamp query pool, a
	// command pool, and one command buffer + fence per entry of NodePayload::frames.
	// The query pool is reset once on the GPU before the function returns.
	// Every Vulkan result is checked with OPTICK_VK_CHECK.
	void GPUProfilerVulkan::InitDevice(VkDevice* devices, VkPhysicalDevice* physicalDevices, VkQueue* cmdQueues, uint32_t* cmdQueuesFamily, uint32_t nodeCount)
	{
		// Value-initialise every create-info so members that are not assigned below
		// (e.g. VkQueryPoolCreateInfo::pipelineStatistics) are zero instead of indeterminate.
		VkQueryPoolCreateInfo queryPoolCreateInfo = {};
		queryPoolCreateInfo.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO;
		queryPoolCreateInfo.queryType = VK_QUERY_TYPE_TIMESTAMP;
		queryPoolCreateInfo.queryCount = MAX_QUERIES_COUNT + 1;

		VkCommandPoolCreateInfo commandPoolCreateInfo = {};
		commandPoolCreateInfo.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO;
		commandPoolCreateInfo.flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT | VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT;

		VkCommandBufferBeginInfo commandBufferBeginInfo = {};
		commandBufferBeginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
		commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;

		nodes.resize(nodeCount);
		nodePayloads.resize(nodeCount);

		for (uint32_t i = 0; i < nodeCount; ++i)
		{
			VkPhysicalDeviceProperties properties = {};
			vkGetPhysicalDeviceProperties(physicalDevices[i], &properties);
			GPUProfiler::InitNode(properties.deviceName, i);

			NodePayload* nodePayload = Memory::New<NodePayload>();
			nodePayloads[i] = nodePayload;
			nodePayload->device = devices[i];
			nodePayload->physicalDevice = physicalDevices[i];
			nodePayload->queue = cmdQueues[i];

			OPTICK_VK_CHECK(vkCreateQueryPool(nodePayload->device, &queryPoolCreateInfo, nullptr, &nodePayload->queryPool));

			commandPoolCreateInfo.queueFamilyIndex = cmdQueuesFamily[i];
			OPTICK_VK_CHECK(vkCreateCommandPool(nodePayload->device, &commandPoolCreateInfo, nullptr, &nodePayload->commandPool));

			// Identical for every frame of this node, so build it once.
			VkCommandBufferAllocateInfo allocInfo = {};
			allocInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
			allocInfo.commandPool = nodePayload->commandPool;
			allocInfo.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
			allocInfo.commandBufferCount = 1;

			for (uint32_t j = 0; j < nodePayload->frames.size(); ++j)
			{
				Frame& frame = nodePayload->frames[j];
				const bool isFirstFrame = (j == 0);

				OPTICK_VK_CHECK(vkAllocateCommandBuffers(nodePayload->device, &allocInfo, &frame.commandBuffer));

				// The first frame's fence starts unsignalled because it is used for the
				// reset submission below (and is signalled by it). All other fences start
				// signalled so the first wait on them returns immediately.
				VkFenceCreateInfo fenceCreateInfo = {};
				fenceCreateInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
				fenceCreateInfo.flags = isFirstFrame ? 0 : VK_FENCE_CREATE_SIGNALED_BIT;
				OPTICK_VK_CHECK(vkCreateFence(nodePayload->device, &fenceCreateInfo, nullptr, &frame.fence));

				if (isFirstFrame)
				{
					// Queries must be reset before their first use: reset the whole pool
					// (including the extra slot beyond MAX_QUERIES_COUNT) once on the GPU.
					OPTICK_VK_CHECK(vkBeginCommandBuffer(frame.commandBuffer, &commandBufferBeginInfo));
					vkCmdResetQueryPool(frame.commandBuffer, nodePayload->queryPool, 0, queryPoolCreateInfo.queryCount);
					OPTICK_VK_CHECK(vkEndCommandBuffer(frame.commandBuffer));

					VkSubmitInfo submitInfo = {};
					submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
					submitInfo.commandBufferCount = 1;
					submitInfo.pCommandBuffers = &frame.commandBuffer;
					OPTICK_VK_CHECK(vkQueueSubmit(nodePayload->queue, 1, &submitInfo, frame.fence));
					OPTICK_VK_CHECK(vkWaitForFences(nodePayload->device, 1, &frame.fence, VK_TRUE, UINT64_MAX));
					OPTICK_VK_CHECK(vkResetCommandBuffer(frame.commandBuffer, VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT));
				}
			}
		}
	}

	// Records a GPU timestamp write into `commandBuffer` for the current node.
	// The node hands out the query slot and remembers `outCpuTimestamp` for it.
	// Does nothing unless the profiler is in STATE_RUNNING.
	void GPUProfilerVulkan::QueryTimestamp(VkCommandBuffer commandBuffer, int64_t* outCpuTimestamp)
	{
		if (currentState != STATE_RUNNING)
			return;

		const uint32_t slot = nodes[currentNode]->QueryTimestamp(outCpuTimestamp);
		VkQueryPool pool = nodePayloads[currentNode]->queryPool;
		vkCmdWriteTimestamp(commandBuffer, VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT, pool, slot);
	}

	// Reads `count` 64-bit GPU timestamps starting at query `startIndex` into the
	// current node's queryGpuTimestamps, records a reset of that query range into
	// `commandBuffer`, and stores the converted CPU-clock values through the
	// node's queryCpuTimestamps pointers. A zero `count` is a no-op.
	// NOTE(review): vkCmdResetQueryPool requires `commandBuffer` to be in the
	// recording state - confirm at the call sites.
	void GPUProfilerVulkan::ResolveTimestamps(VkCommandBuffer commandBuffer, uint32_t startIndex, uint32_t count)
	{
		if (count == 0)
			return;

		Node* const node = nodes[currentNode];
		NodePayload* const payload = nodePayloads[currentNode];

		// Each result is one tightly packed 8-byte value.
		const uint32_t bytesPerTimestamp = 8;
		OPTICK_VK_CHECK(vkGetQueryPoolResults(payload->device, payload->queryPool, startIndex, count, bytesPerTimestamp * count, &node->queryGpuTimestamps[startIndex], bytesPerTimestamp, VK_QUERY_RESULT_64_BIT));
		vkCmdResetQueryPool(commandBuffer, payload->queryPool, startIndex, count);

		// Convert GPU timestamps => CPU Timestamps
		uint32_t slot = startIndex;
		while (slot < startIndex + count)
		{
			*node->queryCpuTimestamps[slot] = node->clock.GetCPUTimestamp(node->queryGpuTimestamps[slot]);
			++slot;
		}
	}

	void GPUProfilerVulkan::WaitForFrame(uint64_t frameNumberToWait)
	{
		OPTICK_EVENT();

		int r = VK_SUCCESS;
		do
		{
			NodePayload& payload = *nodePayloads[currentNode];
			r = vkWaitForFences(nodePayloads[currentNode]->device, 1, &payload.frames[frameNumberToWait % payload.frames.size()].fence, 1, 1000 * 30);
		} while (r != VK_SUCCESS);
	}

	void GPUProfilerVulkan::Flip(void* /*swapChain*/)
	{
		OPTICK_CATEGORY("GPUProfilerVulkan::Flip", Category::Debug);

		std::lock_guard<std::recursive_mutex> lock(updateLock);

		if (currentState == STATE_STARTING)
			currentState = STATE_RUNNING;

		if (currentState == STATE_RUNNING)
		{
			Node& node = *nodes[currentNode];
			NodePayload& payload = *nodePayloads[currentNode];

			uint32_t currentFrameIndex = frameNumber % NUM_FRAMES_DELAY;
			uint32_t nextFrameIndex = (frameNumber + 1) % NUM_FRAMES_DELAY;

			QueryFrame& currentFrame = node.queryGpuframes[currentFrameIndex];
			QueryFrame& nextFrame = node.queryGpuframes[nextFrameIndex];

			VkCommandBuffer commandBuffer = payload.frames[currentFrameIndex].commandBuffer;
			VkFence fence = payload.frames[currentFrameIndex].fence;
			VkDevice device = payload.device;
			VkQueue queue = payload.queue;

			vkWaitForFences(device, 1, &fence, 1, (uint64_t)-1);

			VkCommandBufferBeginInfo commandBufferBeginInfo;
			commandBufferBeginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
			commandBufferBeginInfo.pNext = 0;
			commandBufferBeginInfo.pInheritanceInfo = 0;
			commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
			OPTICK_VK_CHECK(vkBeginCommandBuffer(commandBuffer, &commandBufferBeginInfo));
			vkResetFences(device, 1, &fence);

			if (EventData* frameEvent = currentFrame.frameEvent)
				QueryTimestamp(commandBuffer, &frameEvent->finish);

			// Generate GPU Frame event for the next frame
			EventData& event = AddFrameEvent();
			QueryTimestamp(commandBuffer, &event.start);
			QueryTimestamp(commandBuffer, &AddFrameTag().timestamp);
			nextFrame.frameEvent = &event;

			OPTICK_VK_CHECK(vkEndCommandBuffer(commandBuffer));
			VkSubmitInfo submitInfo = {};
			submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
			submitInfo.pNext = nullptr;
			submitInfo.waitSemaphoreCount = 0;
			submitInfo.pWaitSemaphores = nullptr;
			submitInfo.commandBufferCount = 1;
			submitInfo.pCommandBuffers = &commandBuffer;
			submitInfo.signalSemaphoreCount = 0;
			submitInfo.pSignalSemaphores = nullptr;
			OPTICK_VK_CHECK(vkQueueSubmit(queue, 1, &submitInfo, fence));

			uint32_t queryBegin = currentFrame.queryIndexStart;
			uint32_t queryEnd = node.queryIndex;

			if (queryBegin != (uint32_t)-1)
			{
				currentFrame.queryIndexCount = queryEnd - queryBegin;
			}

			// Preparing Next Frame
			// Try resolve timestamps for the current frame
			if (nextFrame.queryIndexStart != (uint32_t)-1)
			{
				uint32_t startIndex = nextFrame.queryIndexStart % MAX_QUERIES_COUNT;
				uint32_t finishIndex = (startIndex + nextFrame.queryIndexCount) % MAX_QUERIES_COUNT;

				if (startIndex < finishIndex)
				{
					ResolveTimestamps(commandBuffer, startIndex, finishIndex - startIndex);
				}
				else if (startIndex > finishIndex)
				{
					ResolveTimestamps(commandBuffer, startIndex, MAX_QUERIES_COUNT - startIndex);
					ResolveTimestamps(commandBuffer, 0, finishIndex);
				}
			}

			nextFrame.queryIndexStart = queryEnd;
			nextFrame.queryIndexCount = 0;
		}

		++frameNumber;
	}

	// Samples the GPU and CPU clocks for node `nodeIndex`.
	//
	// A single timestamp is written on the GPU using the current frame slot's
	// command buffer and fence, the submission is waited for, and the result is
	// paired with GetHighPrecisionTime(). The GPU frequency is derived from
	// VkPhysicalDeviceLimits::timestampPeriod (nanoseconds per tick).
	//
	// The timestamp is written to query slot MAX_QUERIES_COUNT. The pool is created
	// with MAX_QUERIES_COUNT + 1 entries, while frame queries are addressed modulo
	// MAX_QUERIES_COUNT, so this slot cannot collide with a profiling query the
	// way slot 0 could.
	GPUProfiler::ClockSynchronization GPUProfilerVulkan::GetClockSynchronization(uint32_t nodeIndex)
	{
		GPUProfiler::ClockSynchronization clock;

		NodePayload& node = *nodePayloads[nodeIndex];
		Frame& currentFrame = node.frames[frameNumber % NUM_FRAMES_DELAY];

		const uint32_t clockQueryIndex = MAX_QUERIES_COUNT;

		VkCommandBufferBeginInfo commandBufferBeginInfo = {};
		commandBufferBeginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
		commandBufferBeginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
		VkCommandBuffer CB = currentFrame.commandBuffer;
		VkDevice Device = node.device;
		VkFence Fence = currentFrame.fence;

		// Make sure the slot's previous submission is finished before reusing it.
		OPTICK_VK_CHECK(vkWaitForFences(Device, 1, &Fence, VK_TRUE, UINT64_MAX));
		OPTICK_VK_CHECK(vkResetFences(Device, 1, &Fence));
		OPTICK_VK_CHECK(vkResetCommandBuffer(CB, VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT));
		OPTICK_VK_CHECK(vkBeginCommandBuffer(CB, &commandBufferBeginInfo));
		vkCmdResetQueryPool(CB, node.queryPool, clockQueryIndex, 1);
		vkCmdWriteTimestamp(CB, VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT, node.queryPool, clockQueryIndex);
		OPTICK_VK_CHECK(vkEndCommandBuffer(CB));

		VkSubmitInfo submitInfo = {};
		submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
		submitInfo.commandBufferCount = 1;
		submitInfo.pCommandBuffers = &CB;
		OPTICK_VK_CHECK(vkQueueSubmit(node.queue, 1, &submitInfo, Fence));
		OPTICK_VK_CHECK(vkWaitForFences(Device, 1, &Fence, VK_TRUE, UINT64_MAX));

		// The fence wait above guarantees the timestamp has been written.
		clock.timestampGPU = 0;
		OPTICK_VK_CHECK(vkGetQueryPoolResults(Device, node.queryPool, clockQueryIndex, 1, 8, &clock.timestampGPU, 8, VK_QUERY_RESULT_64_BIT));
		clock.timestampCPU = GetHighPrecisionTime();
		clock.frequencyCPU = GetHighPrecisionFrequency();

		VkPhysicalDeviceProperties Properties = {};
		vkGetPhysicalDeviceProperties(node.physicalDevice, &Properties);
		clock.frequencyGPU = (uint64_t)(1000000000ll / Properties.limits.timestampPeriod);

		return clock;
	}

	// Destroys the command pool and then the query pool that were created on `device`.
	// No custom allocation callbacks are used.
	GPUProfilerVulkan::NodePayload::~NodePayload()
	{
		const VkAllocationCallbacks* const noAllocator = nullptr;

		vkDestroyCommandPool(device, commandPool, noAllocator);
		vkDestroyQueryPool(device, queryPool, noAllocator);
	}

	// Waits for the most recent frame slot, then destroys every fence and command
	// buffer and deletes each NodePayload (whose destructor destroys the pools).
	GPUProfilerVulkan::~GPUProfilerVulkan()
	{
		// WaitForFrame dereferences nodePayloads[currentNode]; skip it when no
		// payloads exist (InitDevice never ran, or ran with zero nodes).
		if (!nodePayloads.empty())
			WaitForFrame(frameNumber - 1);

		for (NodePayload* payload : nodePayloads)
		{
			for (Frame& frame : payload->frames)
			{
				vkDestroyFence(payload->device, frame.fence, nullptr);
				vkFreeCommandBuffers(payload->device, payload->commandPool, 1, &frame.commandBuffer);
			}

			// The command and query pools are destroyed by ~NodePayload.
			Memory::Delete(payload);
		}

		nodePayloads.clear();
	}
}
#else
#include "optick_common.h"
namespace Optick
{
	// Fallback used when OPTICK_ENABLE_GPU_VULKAN is off: every argument is
	// ignored and the call is reported through OPTICK_FAILED.
	void InitGpuVulkan(
		void* /*devices*/,
		void* /*physicalDevices*/,
		void* /*cmdQueues*/,
		uint32_t* /*cmdQueuesFamily*/,
		uint32_t /*numQueues*/)
	{
		OPTICK_FAILED("OPTICK_ENABLE_GPU_VULKAN is disabled! Can't initialize GPU Profiler!");
	}
}
#endif //OPTICK_ENABLE_GPU_VULKAN
#endif //USE_OPTICK