Commit fff46e9b authored by Laszlo Agocs's avatar Laszlo Agocs

vk: Do timestamp queries when profiling is enabled

...also write more sophisticated output in the profiler
stream when it comes to frame timings.
parent d5952b1f
......@@ -1116,7 +1116,8 @@ public:
enum Feature {
MultisampleTexture = 1,
MultisampleRenderBuffer,
DebugMarkers
DebugMarkers,
Timestamps
};
enum ResourceSizeLimit {
......
......@@ -263,6 +263,8 @@ bool QRhiD3D11::isFeatureSupported(QRhi::Feature feature) const
return true;
case QRhi::DebugMarkers:
return annotations != nullptr;
case QRhi::Timestamps:
return false;
default:
Q_UNREACHABLE();
return false;
......@@ -553,6 +555,9 @@ QRhi::FrameOpResult QRhiD3D11::beginFrame(QRhiSwapChain *swapChain)
swapChainD->msaaRtv[swapChainD->currentFrameSlot] : swapChainD->rtv[swapChainD->currentFrameSlot];
swapChainD->rt.d.dsv = swapChainD->ds ? swapChainD->ds->dsv : nullptr;
QRhiProfilerPrivate *rhiP = profilerPrivateOrNull();
QRHI_PROF_F(beginSwapChainFrame(swapChain));
finishActiveReadbacks();
return QRhi::FrameOpSuccess;
......@@ -574,6 +579,10 @@ QRhi::FrameOpResult QRhiD3D11::endFrame(QRhiSwapChain *swapChain)
swapChainD->colorFormat);
}
QRhiProfilerPrivate *rhiP = profilerPrivateOrNull();
// this must be done before the Present
QRHI_PROF_F(endSwapChainFrame(swapChain, swapChainD->frameCount + 1));
const UINT swapInterval = 1;
const UINT presentFlags = 0;
HRESULT hr = swapChainD->swapChain->Present(swapInterval, presentFlags);
......@@ -583,9 +592,6 @@ QRhi::FrameOpResult QRhiD3D11::endFrame(QRhiSwapChain *swapChain)
swapChainD->currentFrameSlot = (swapChainD->currentFrameSlot + 1) % QD3D11SwapChain::BUFFER_COUNT;
swapChainD->frameCount += 1;
QRhiProfilerPrivate *rhiP = profilerPrivateOrNull();
QRHI_PROF_F(endSwapChainFrame(swapChain, swapChainD->frameCount));
contextState.currentSwapChain = nullptr;
return QRhi::FrameOpSuccess;
......
......@@ -265,6 +265,8 @@ bool QRhiGles2::isFeatureSupported(QRhi::Feature feature) const
return caps.msaaRenderBuffer;
case QRhi::DebugMarkers:
return false;
case QRhi::Timestamps:
return false;
default:
Q_UNREACHABLE();
return false;
......@@ -505,6 +507,9 @@ QRhi::FrameOpResult QRhiGles2::beginFrame(QRhiSwapChain *swapChain)
inFrame = true;
currentSwapChain = swapChainD;
QRhiProfilerPrivate *rhiP = profilerPrivateOrNull();
QRHI_PROF_F(beginSwapChainFrame(swapChain));
executeDeferredReleases();
QRHI_RES(QGles2CommandBuffer, &swapChainD->cb)->resetState();
......@@ -524,6 +529,10 @@ QRhi::FrameOpResult QRhiGles2::endFrame(QRhiSwapChain *swapChain)
executeCommandBuffer(&swapChainD->cb);
QRhiProfilerPrivate *rhiP = profilerPrivateOrNull();
// this must be done before the swap
QRHI_PROF_F(endSwapChainFrame(swapChain, swapChainD->frameCount + 1));
if (swapChainD->surface) {
ctx->swapBuffers(swapChainD->surface);
buffersSwapped = true;
......@@ -531,9 +540,6 @@ QRhi::FrameOpResult QRhiGles2::endFrame(QRhiSwapChain *swapChain)
swapChainD->frameCount += 1;
QRhiProfilerPrivate *rhiP = profilerPrivateOrNull();
QRHI_PROF_F(endSwapChainFrame(swapChain, swapChainD->frameCount));
currentSwapChain = nullptr;
return QRhi::FrameOpSuccess;
......
......@@ -361,6 +361,8 @@ bool QRhiMetal::isFeatureSupported(QRhi::Feature feature) const
Q_FALLTHROUGH();
case QRhi::DebugMarkers:
return true;
case QRhi::Timestamps:
return false;
default:
Q_UNREACHABLE();
return false;
......@@ -703,6 +705,9 @@ QRhi::FrameOpResult QRhiMetal::beginFrame(QRhiSwapChain *swapChain)
swapChainD->rtWrapper.d->fb.dsTex = swapChainD->ds ? swapChainD->ds->d->tex : nil;
swapChainD->rtWrapper.d->fb.hasStencil = swapChainD->ds ? true : false;
QRhiProfilerPrivate *rhiP = profilerPrivateOrNull();
QRHI_PROF_F(beginSwapChainFrame(swapChain));
executeDeferredReleases();
swapChainD->cbWrapper.resetState();
finishActiveReadbacks();
......
......@@ -67,6 +67,17 @@ void QRhiProfiler::addVMemAllocatorStats()
d->rhiD->sendVMemStatsToProfiler();
}
int QRhiProfiler::frameTimingWriteInterval() const
{
return d->frameTimingWriteInterval;
}
void QRhiProfiler::setFrameTimingWriteInterval(int frameCount)
{
if (frameCount > 0)
d->frameTimingWriteInterval = frameCount;
}
QRhiProfilerPrivate::~QRhiProfilerPrivate()
{
flushStream();
......@@ -306,38 +317,97 @@ void QRhiProfilerPrivate::releaseSwapChain(QRhiSwapChain *sc)
writer->endMap();
}
void QRhiProfilerPrivate::endSwapChainFrame(QRhiSwapChain *sc, int frameCount)
template<typename T>
void calcTiming(QVector<T> *vec, T *minDelta, T *maxDelta, float *avgDelta)
{
if (!swapchains.contains(sc)) {
swapchains[sc].t.start();
if (vec->isEmpty())
return;
*minDelta = *maxDelta = 0;
float totalDelta = 0;
for (T delta : qAsConst(*vec)) {
totalDelta += float(delta);
if (*minDelta == 0 || delta < *minDelta)
*minDelta = delta;
if (*maxDelta == 0 || delta > *maxDelta)
*maxDelta = delta;
}
*avgDelta = totalDelta / vec->count();
vec->clear();
}
void QRhiProfilerPrivate::beginSwapChainFrame(QRhiSwapChain *sc)
{
Sc &scd(swapchains[sc]);
scd.frameDelta[scd.n++] = scd.t.restart();
if (scd.n == Sc::FRAME_SAMPLE_SIZE) {
scd.n = 0;
qint64 minDelta = 0;
qint64 maxDelta = 0;
float totalDelta = 0;
for (int i = 0; i < Sc::FRAME_SAMPLE_SIZE; ++i) {
const qint64 delta = scd.frameDelta[i];
totalDelta += delta;
if (minDelta == 0 || delta < minDelta)
minDelta = delta;
if (maxDelta == 0 || delta > maxDelta)
maxDelta = delta;
scd.beginToEndTimer.start();
}
void QRhiProfilerPrivate::endSwapChainFrame(QRhiSwapChain *sc, int frameCount)
{
Sc &scd(swapchains[sc]);
if (!scd.frameToFrameRunning) {
scd.frameToFrameTimer.start();
scd.frameToFrameRunning = true;
return;
}
scd.frameToFrameDelta.append(scd.frameToFrameTimer.restart());
if (scd.frameToFrameDelta.count() >= frameTimingWriteInterval) {
qint64 minDelta;
qint64 maxDelta;
float avgDelta = 0;
calcTiming(&scd.frameToFrameDelta, &minDelta, &maxDelta, &avgDelta);
if (ensureStream()) {
writer->startMap();
WRITE_OP(FrameToFrameTime);
WRITE_TIMESTAMP;
WRITE_PAIR(QLatin1String("swapchain"), quint64(quintptr(sc)));
WRITE_PAIR(QLatin1String("frames_since_resize"), frameCount);
WRITE_PAIR(QLatin1String("min_ms_frame_delta"), minDelta);
WRITE_PAIR(QLatin1String("max_ms_frame_delta"), maxDelta);
WRITE_PAIR(QLatin1String("avg_ms_frame_delta"), avgDelta);
writer->endMap();
}
const float avgDelta = totalDelta / Sc::FRAME_SAMPLE_SIZE;
}
scd.beginToEndDelta.append(scd.beginToEndTimer.elapsed());
if (scd.beginToEndDelta.count() >= frameTimingWriteInterval) {
qint64 minDelta;
qint64 maxDelta;
float avgDelta = 0;
calcTiming(&scd.beginToEndDelta, &minDelta, &maxDelta, &avgDelta);
if (ensureStream()) {
writer->startMap();
WRITE_OP(FrameTime);
WRITE_OP(FrameBuildTime);
WRITE_TIMESTAMP;
WRITE_PAIR(QLatin1String("swapchain"), quint64(quintptr(sc)));
WRITE_PAIR(QLatin1String("frames_since_resize"), frameCount);
WRITE_PAIR(QLatin1String("min_ms_between_frames"), minDelta);
WRITE_PAIR(QLatin1String("max_ms_between_frames"), maxDelta);
WRITE_PAIR(QLatin1String("avg_ms_between_frames"), avgDelta);
WRITE_PAIR(QLatin1String("min_ms_frame_build"), minDelta);
WRITE_PAIR(QLatin1String("max_ms_frame_build"), maxDelta);
WRITE_PAIR(QLatin1String("avg_ms_frame_build"), avgDelta);
writer->endMap();
}
}
}
void QRhiProfilerPrivate::swapChainFrameGpuTime(QRhiSwapChain *sc, float gpuTime)
{
Sc &scd(swapchains[sc]);
scd.gpuFrameTime.append(gpuTime);
if (scd.gpuFrameTime.count() >= frameTimingWriteInterval) {
float minDelta;
float maxDelta;
float avgDelta = 0;
calcTiming(&scd.gpuFrameTime, &minDelta, &maxDelta, &avgDelta);
if (ensureStream()) {
writer->startMap();
WRITE_OP(GpuFrameTime);
WRITE_TIMESTAMP;
WRITE_PAIR(QLatin1String("swapchain"), quint64(quintptr(sc)));
WRITE_PAIR(QLatin1String("min_ms_gpu_frame_time"), minDelta);
WRITE_PAIR(QLatin1String("max_ms_gpu_frame_time"), maxDelta);
WRITE_PAIR(QLatin1String("avg_ms_gpu_frame_time"), avgDelta);
writer->endMap();
}
}
......
......@@ -63,17 +63,21 @@ public:
NewReadbackBuffer,
ReleaseReadbackBuffer,
VMemAllocStats,
FrameTime
GpuFrameTime,
FrameToFrameTime,
FrameBuildTime
};
~QRhiProfiler();
void setDevice(QIODevice *device);
void flush();
void addVMemAllocatorStats();
int frameTimingWriteInterval() const;
void setFrameTimingWriteInterval(int frameCount);
private:
Q_DISABLE_COPY(QRhiProfiler)
QRhiProfiler();
......
......@@ -73,7 +73,9 @@ public:
void resizeSwapChain(QRhiSwapChain *sc, int bufferCount, int msaaBufferCount, int sampleCount);
void releaseSwapChain(QRhiSwapChain *sc);
void beginSwapChainFrame(QRhiSwapChain *sc);
void endSwapChainFrame(QRhiSwapChain *sc, int frameCount);
void swapChainFrameGpuTime(QRhiSwapChain *sc, float gpuTimeMs);
void newReadbackBuffer(quint64 id, QRhiResource *src, quint32 size);
void releaseReadbackBuffer(quint64 id);
......@@ -89,11 +91,20 @@ public:
QCborStreamWriter *writer = nullptr;
bool active = false;
QElapsedTimer ts;
static const int DEFAULT_FRAME_TIMING_WRITE_INTERVAL = 120; // frames
int frameTimingWriteInterval = DEFAULT_FRAME_TIMING_WRITE_INTERVAL;
struct Sc {
QElapsedTimer t;
int n = 0;
static const int FRAME_SAMPLE_SIZE = 120;
qint64 frameDelta[FRAME_SAMPLE_SIZE];
Sc() {
frameToFrameDelta.reserve(DEFAULT_FRAME_TIMING_WRITE_INTERVAL);
beginToEndDelta.reserve(DEFAULT_FRAME_TIMING_WRITE_INTERVAL);
gpuFrameTime.reserve(DEFAULT_FRAME_TIMING_WRITE_INTERVAL);
}
QElapsedTimer frameToFrameTimer;
bool frameToFrameRunning = false;
QElapsedTimer beginToEndTimer;
QVector<qint64> frameToFrameDelta;
QVector<qint64> beginToEndDelta;
QVector<float> gpuFrameTime;
};
QHash<QRhiSwapChain *, Sc> swapchains;
};
......
......@@ -230,6 +230,8 @@ bool QRhiVulkan::create(QRhi::Flags flags)
return false;
}
timestampValidBits = queueFamilyProps[gfxQueueFamilyIdx].timestampValidBits;
VkDeviceQueueCreateInfo queueInfo[2];
const float prio[] = { 0 };
memset(queueInfo, 0, sizeof(queueInfo));
......@@ -349,6 +351,19 @@ bool QRhiVulkan::create(QRhi::Flags flags)
else
qWarning("Failed to create initial descriptor pool: %d", err);
VkQueryPoolCreateInfo timestampQueryPoolInfo;
memset(&timestampQueryPoolInfo, 0, sizeof(timestampQueryPoolInfo));
timestampQueryPoolInfo.sType = VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO;
timestampQueryPoolInfo.queryType = VK_QUERY_TYPE_TIMESTAMP;
timestampQueryPoolInfo.queryCount = QVK_MAX_ACTIVE_TIMESTAMP_PAIRS * 2;
err = df->vkCreateQueryPool(dev, &timestampQueryPoolInfo, nullptr, &timestampQueryPool);
if (err != VK_SUCCESS) {
qWarning("Failed to create timestamp query pool: %d", err);
return false;
}
timestampQueryPoolMap.resize(QVK_MAX_ACTIVE_TIMESTAMP_PAIRS); // 1 bit per pair
timestampQueryPoolMap.fill(false);
if (debugMarkersAvailable) {
vkCmdDebugMarkerBegin = reinterpret_cast<PFN_vkCmdDebugMarkerBeginEXT>(f->vkGetDeviceProcAddr(dev, "vkCmdDebugMarkerBeginEXT"));
vkCmdDebugMarkerEnd = reinterpret_cast<PFN_vkCmdDebugMarkerEndEXT>(f->vkGetDeviceProcAddr(dev, "vkCmdDebugMarkerEndEXT"));
......@@ -396,6 +411,11 @@ void QRhiVulkan::destroy()
descriptorPools.clear();
if (timestampQueryPool) {
df->vkDestroyQueryPool(dev, timestampQueryPool, nullptr);
timestampQueryPool = VK_NULL_HANDLE;
}
if (!importedDevPoolQueue) {
vmaDestroyAllocator(toVmaAllocator(allocator));
if (cmdPool) {
......@@ -1230,6 +1250,9 @@ QRhi::FrameOpResult QRhiVulkan::beginWrapperFrame(QRhiSwapChain *swapChain)
currentFrameSlot = w->currentFrame();
currentSwapChain = swapChainD;
QRhiProfilerPrivate *rhiP = profilerPrivateOrNull();
QRHI_PROF_F(beginSwapChainFrame(swapChain));
prepareNewFrame(&swapChainD->cbWrapper);
return QRhi::FrameOpSuccess;
......@@ -1348,6 +1371,7 @@ QRhi::FrameOpResult QRhiVulkan::beginNonWrapperFrame(QRhiSwapChain *swapChain)
{
QVkSwapChain *swapChainD = QRHI_RES(QVkSwapChain, swapChain);
QVkSwapChain::FrameResources &frame(swapChainD->frameRes[swapChainD->currentFrameSlot]);
QRhiProfilerPrivate *rhiP = profilerPrivateOrNull();
if (!frame.imageAcquired) {
// Wait if we are too far ahead, i.e. the thread gets throttled based on the presentation rate
......@@ -1389,12 +1413,56 @@ QRhi::FrameOpResult QRhiVulkan::beginNonWrapperFrame(QRhiSwapChain *swapChain)
// mess up A's in-flight commands (as they are not in flight anymore).
waitCommandCompletion(swapChainD->currentFrameSlot);
// Now is the time to read the timestamps for the previous frame for this slot.
if (frame.timestampQueryIndex >= 0) {
quint64 timestamp[2] = { 0, 0 };
VkResult err = df->vkGetQueryPoolResults(dev, timestampQueryPool, frame.timestampQueryIndex, 2,
2 * sizeof(quint64), &timestamp, 0, VK_QUERY_RESULT_64_BIT);
timestampQueryPoolMap.clearBit(frame.timestampQueryIndex / 2);
frame.timestampQueryIndex = -1;
if (err == VK_SUCCESS) {
quint64 mask = 0;
for (quint64 i = 0; i < timestampValidBits; i += 8)
mask |= 0xFFULL << i;
const quint64 ts0 = timestamp[0] & mask;
const quint64 ts1 = timestamp[1] & mask;
const float freqNs = physDevProperties.limits.timestampPeriod;
if (!qFuzzyIsNull(freqNs)) {
const float elapsedMs = float(ts1 - ts0) / freqNs / 1000000.0f;
// now we have the gpu time for the previous frame for this slot, report it
// (does not matter that it is not for this frame)
QRHI_PROF_F(swapChainFrameGpuTime(swapChain, elapsedMs));
}
} else {
qWarning("Failed to query timestamp: %d", err);
}
}
// build new draw command buffer
QVkSwapChain::ImageResources &image(swapChainD->imageRes[swapChainD->currentImageIndex]);
QRhi::FrameOpResult cbres = startCommandBuffer(&image.cmdBuf);
if (cbres != QRhi::FrameOpSuccess)
return cbres;
// when profiling is enabled, pick a free query (pair) from the pool
int timestampQueryIdx = -1;
if (profilerPrivateOrNull()) {
for (int i = 0; i < timestampQueryPoolMap.count(); ++i) {
if (!timestampQueryPoolMap.testBit(i)) {
timestampQueryPoolMap.setBit(i);
timestampQueryIdx = i * 2;
break;
}
}
}
if (timestampQueryIdx >= 0) {
df->vkCmdResetQueryPool(image.cmdBuf, timestampQueryPool, timestampQueryIdx, 2);
// record timestamp at the start of the command buffer
df->vkCmdWriteTimestamp(image.cmdBuf, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT,
timestampQueryPool, timestampQueryIdx);
frame.timestampQueryIndex = timestampQueryIdx;
}
swapChainD->cbWrapper.cb = image.cmdBuf;
swapChainD->rtWrapper.d.fb = image.fb;
......@@ -1404,6 +1472,8 @@ QRhi::FrameOpResult QRhiVulkan::beginNonWrapperFrame(QRhiSwapChain *swapChain)
if (swapChainD->ds)
swapChainD->ds->lastActiveFrameSlot = currentFrameSlot;
QRHI_PROF_F(beginSwapChainFrame(swapChain));
prepareNewFrame(&swapChainD->cbWrapper);
return QRhi::FrameOpSuccess;
......@@ -1439,6 +1509,12 @@ QRhi::FrameOpResult QRhiVulkan::endNonWrapperFrame(QRhiSwapChain *swapChain)
image.presentableLayout = true;
}
// record another timestamp, when enabled
if (frame.timestampQueryIndex >= 0) {
df->vkCmdWriteTimestamp(image.cmdBuf, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT,
timestampQueryPool, frame.timestampQueryIndex + 1);
}
// stop recording and submit to the queue
Q_ASSERT(!image.cmdFenceWaitable);
QRhi::FrameOpResult submitres = endAndSubmitCommandBuffer(image.cmdBuf,
......@@ -1451,6 +1527,10 @@ QRhi::FrameOpResult QRhiVulkan::endNonWrapperFrame(QRhiSwapChain *swapChain)
frame.imageSemWaitable = false;
image.cmdFenceWaitable = true;
QRhiProfilerPrivate *rhiP = profilerPrivateOrNull();
// this must be done before the Present
QRHI_PROF_F(endSwapChainFrame(swapChain, swapChainD->frameCount + 1));
// add the Present to the queue
VkPresentInfoKHR presInfo;
memset(&presInfo, 0, sizeof(presInfo));
......@@ -1479,9 +1559,6 @@ QRhi::FrameOpResult QRhiVulkan::endNonWrapperFrame(QRhiSwapChain *swapChain)
swapChainD->currentFrameSlot = (swapChainD->currentFrameSlot + 1) % QVK_FRAMES_IN_FLIGHT;
swapChainD->frameCount += 1;
QRhiProfilerPrivate *rhiP = profilerPrivateOrNull();
QRHI_PROF_F(endSwapChainFrame(swapChain, swapChainD->frameCount));
currentSwapChain = nullptr;
return QRhi::FrameOpSuccess;
......@@ -2675,6 +2752,8 @@ bool QRhiVulkan::isFeatureSupported(QRhi::Feature feature) const
return true;
case QRhi::DebugMarkers:
return debugMarkersAvailable;
case QRhi::Timestamps:
return timestampValidBits != 0;
default:
Q_UNREACHABLE();
return false;
......
......@@ -56,6 +56,8 @@ static const int QVK_DESC_SETS_PER_POOL = 128;
static const int QVK_UNIFORM_BUFFERS_PER_POOL = 256;
static const int QVK_COMBINED_IMAGE_SAMPLERS_PER_POOL = 256;
static const int QVK_MAX_ACTIVE_TIMESTAMP_PAIRS = 16;
// no vk_mem_alloc.h available here, void* is good enough
typedef void * QVkAlloc;
typedef void * QVkAllocator;
......@@ -314,6 +316,7 @@ struct QVkSwapChain : public QRhiSwapChain
bool imageAcquired = false;
bool imageSemWaitable = false;
quint32 imageIndex = 0;
int timestampQueryIndex = -1;
} frameRes[QVK_FRAMES_IN_FLIGHT];
quint32 currentImageIndex = 0; // index in imageRes
......@@ -473,6 +476,7 @@ public:
VkCommandPool cmdPool = VK_NULL_HANDLE;
int gfxQueueFamilyIdx = -1;
VkQueue gfxQueue = VK_NULL_HANDLE;
quint32 timestampValidBits = 0;
QVkAllocator allocator = nullptr;
QVulkanFunctions *f = nullptr;
QVulkanDeviceFunctions *df = nullptr;
......@@ -506,6 +510,9 @@ public:
};
QVector<DescriptorPoolData> descriptorPools;
VkQueryPool timestampQueryPool = VK_NULL_HANDLE;
QBitArray timestampQueryPoolMap;
VkFormat optimalDsFormat = VK_FORMAT_UNDEFINED;
QMatrix4x4 clipCorrectMatrix;
......
d3d, vk: timestamp query
d3d: timestamp query
prof report api (fed by a cbor stream)
mtl: report readback temp buf
multiwindow_threaded should demo pulling out the device and importing to another rhi
......@@ -52,6 +52,7 @@ dxc for d3d as an alternative to fxc?
hlsl -> dxc -> spirv -> spirv-cross hmmm...
+++ done
vk: timestamp query
prof: report readback temp buf
merge offscreen examples into one
mtl: max texture size
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment