LumixEngine/src/engine/profiler.cpp
Mikulas Florek 332bf38efe cleanup
2022-02-18 11:52:23 +01:00

636 lines
14 KiB
C++

#ifdef _WIN32
#define INITGUID
#define NOGDI
#define WIN32_LEAN_AND_MEAN
#include <Windows.h>
#include <evntcons.h>
#endif
#include "engine/array.h"
#include "engine/crt.h"
#include "engine/hash_map.h"
#include "engine/allocators.h"
#include "engine/atomic.h"
#include "engine/math.h"
#include "engine/string.h"
#include "engine/sync.h"
#include "engine/thread.h"
#include "engine/os.h"
#include "profiler.h"
namespace Lumix
{
namespace profiler
{
#ifdef LUMIX_DEBUG
static constexpr u32 default_context_size = 5 * 1024 * 1024;
static constexpr u32 default_global_context_size = 10 * 1024 * 1024;
#else
static constexpr u32 default_context_size = 1024 * 1024;
static constexpr u32 default_global_context_size = 2 * 1024 * 1024;
#endif
struct ThreadContext
{
ThreadContext(u32 buffer_size, IAllocator& allocator)
: buffer(allocator)
, open_blocks(allocator)
{
buffer.resize(buffer_size);
open_blocks.reserve(64);
}
Array<i32> open_blocks;
OutputMemoryStream buffer;
u32 begin = 0;
u32 end = 0;
Mutex mutex;
StaticString<64> name;
bool show_in_profiler = false;
u32 thread_id;
};
#ifdef _WIN32
#define SWITCH_CONTEXT_OPCODE 36
#pragma pack(1)
struct TraceProps
{
EVENT_TRACE_PROPERTIES base;
char name[sizeof(KERNEL_LOGGER_NAME) + 1];
};
#pragma pack()
// https://docs.microsoft.com/en-us/windows/desktop/etw/cswitch
struct CSwitch
{
u32 NewThreadId;
u32 OldThreadId;
i8 NewThreadPriority;
i8 OldThreadPriority;
u8 PreviousCState;
i8 SpareByte;
i8 OldThreadWaitReason;
i8 OldThreadWaitMode;
i8 OldThreadState;
i8 OldThreadWaitIdealProcessor;
u32 NewThreadWaitTime;
u32 Reserved;
};
struct TraceTask : Thread {
TraceTask(IAllocator& allocator);
int task() override;
static void callback(PEVENT_RECORD event);
TRACEHANDLE open_handle;
};
#else
struct TraceTask {
TraceTask(IAllocator&) {}
void destroy() {}
int open_handle;
};
void CloseTrace(int) {}
#endif
static struct Instance
{
Instance()
: contexts(allocator)
, trace_task(allocator)
, counters(allocator)
, global_context(default_global_context_size, allocator)
{
startTrace();
}
~Instance()
{
CloseTrace(trace_task.open_handle);
trace_task.destroy();
}
void startTrace()
{
#ifdef _WIN32
static TRACEHANDLE trace_handle;
static TraceProps props = {};
props.base.Wnode.BufferSize = sizeof(props);
props.base.Wnode.Flags = WNODE_FLAG_TRACED_GUID;
props.base.Wnode.ClientContext = 1;
props.base.Wnode.Guid = SystemTraceControlGuid;
props.base.LoggerNameOffset = sizeof(props.base);
props.base.EnableFlags = EVENT_TRACE_FLAG_CSWITCH;
props.base.LogFileMode = EVENT_TRACE_REAL_TIME_MODE;
strcpy_s(props.name, KERNEL_LOGGER_NAME);
TraceProps tmp = props;
ControlTrace(NULL, KERNEL_LOGGER_NAME, &tmp.base, EVENT_TRACE_CONTROL_STOP);
ULONG res = StartTrace(&trace_handle, KERNEL_LOGGER_NAME, &props.base);
switch (res) {
case ERROR_ALREADY_EXISTS:
case ERROR_ACCESS_DENIED:
case ERROR_BAD_LENGTH:
default:
context_switches_enabled = false;
break;
case ERROR_SUCCESS:
context_switches_enabled = true;
break;
}
static EVENT_TRACE_LOGFILE trace = {};
trace.LoggerName = KERNEL_LOGGER_NAME;
trace.ProcessTraceMode = PROCESS_TRACE_MODE_RAW_TIMESTAMP | PROCESS_TRACE_MODE_REAL_TIME | PROCESS_TRACE_MODE_EVENT_RECORD;
trace.EventRecordCallback = TraceTask::callback;
trace_task.open_handle = OpenTrace(&trace);
trace_task.create("profiler trace", true);
#endif
}
ThreadContext* getThreadContext()
{
thread_local ThreadContext* ctx = [&](){
ThreadContext* new_ctx = LUMIX_NEW(allocator, ThreadContext)(default_context_size, allocator);
new_ctx->thread_id = os::getCurrentThreadID();
MutexGuard lock(mutex);
contexts.push(new_ctx);
return new_ctx;
}();
return ctx;
}
DefaultAllocator allocator;
Array<Counter> counters;
Array<ThreadContext*> contexts;
Mutex mutex;
os::Timer timer;
bool paused = false;
bool context_switches_enabled = false;
u64 paused_time = 0;
u64 last_frame_duration = 0;
u64 last_frame_time = 0;
volatile i32 fiber_wait_id = 0;
TraceTask trace_task;
ThreadContext global_context;
} g_instance;
template <typename T>
void write(ThreadContext& ctx, u64 timestamp, EventType type, const T& value)
{
if (g_instance.paused && timestamp > g_instance.paused_time) return;
#pragma pack(1)
struct {
EventHeader header;
T value;
} v;
#pragma pack()
v.header.type = type;
v.header.size = sizeof(v);
v.header.time = timestamp;
v.value = value;
MutexGuard lock(ctx.mutex);
u8* buf = ctx.buffer.getMutableData();
const u32 buf_size = (u32)ctx.buffer.size();
while (sizeof(v) + ctx.end - ctx.begin > buf_size) {
const u8 size = buf[ctx.begin % buf_size];
ctx.begin += size;
}
const u32 lend = ctx.end % buf_size;
if (buf_size - lend >= sizeof(v)) {
memcpy(buf + lend, &v, sizeof(v));
}
else {
memcpy(buf + lend, &v, buf_size - lend);
memcpy(buf, ((u8*)&v) + buf_size - lend, sizeof(v) - (buf_size - lend));
}
ctx.end += sizeof(v);
};
template <typename T>
void write(ThreadContext& ctx, EventType type, const T& value)
{
if (g_instance.paused) return;
#pragma pack(1)
struct {
EventHeader header;
T value;
} v;
#pragma pack()
v.header.type = type;
v.header.size = sizeof(v);
v.header.time = os::Timer::getRawTimestamp();
v.value = value;
MutexGuard lock(ctx.mutex);
u8* buf = ctx.buffer.getMutableData();
const u32 buf_size = (u32)ctx.buffer.size();
while (sizeof(v) + ctx.end - ctx.begin > buf_size) {
const u8 size = buf[ctx.begin % buf_size];
ctx.begin += size;
}
const u32 lend = ctx.end % buf_size;
if (buf_size - lend >= sizeof(v)) {
memcpy(buf + lend, &v, sizeof(v));
}
else {
memcpy(buf + lend, &v, buf_size - lend);
memcpy(buf, ((u8*)&v) + buf_size - lend, sizeof(v) - (buf_size - lend));
}
ctx.end += sizeof(v);
};
void write(ThreadContext& ctx, EventType type, const u8* data, int size)
{
if (g_instance.paused) return;
EventHeader header;
header.type = type;
ASSERT(sizeof(header) + size <= 0xffff);
header.size = u16(sizeof(header) + size);
header.time = os::Timer::getRawTimestamp();
MutexGuard lock(ctx.mutex);
u8* buf = ctx.buffer.getMutableData();
const u32 buf_size = (u32)ctx.buffer.size();
while (header.size + ctx.end - ctx.begin > buf_size) {
const u8 size = buf[ctx.begin % buf_size];
ctx.begin += size;
}
auto cpy = [&](const u8* ptr, u32 size) {
const u32 lend = ctx.end % buf_size;
if (buf_size - lend >= size) {
memcpy(buf + lend, ptr, size);
}
else {
memcpy(buf + lend, ptr, buf_size - lend);
memcpy(buf, ((u8*)ptr) + buf_size - lend, size - (buf_size - lend));
}
ctx.end += size;
};
cpy((u8*)&header, sizeof(header));
cpy(data, size);
};
#ifdef _WIN32
TraceTask::TraceTask(IAllocator& allocator)
: Thread(allocator)
{}
int TraceTask::task() {
ProcessTrace(&open_handle, 1, nullptr, nullptr);
return 0;
}
void TraceTask::callback(PEVENT_RECORD event) {
if (event->EventHeader.EventDescriptor.Opcode != SWITCH_CONTEXT_OPCODE) return;
if (sizeof(CSwitch) != event->UserDataLength) return;
const CSwitch* cs = reinterpret_cast<CSwitch*>(event->UserData);
ContextSwitchRecord rec;
rec.timestamp = event->EventHeader.TimeStamp.QuadPart;
rec.new_thread_id = cs->NewThreadId;
rec.old_thread_id = cs->OldThreadId;
rec.reason = cs->OldThreadWaitReason;
write(g_instance.global_context, rec.timestamp, profiler::EventType::CONTEXT_SWITCH, rec);
};
#endif
u32 createCounter(const char* key_literal, float min) {
Counter& c = g_instance.counters.emplace();
copyString(Span(c.name), key_literal);
c.min = min;
return g_instance.counters.size() - 1;
}
void pushCounter(u32 counter, float value) {
CounterRecord r;
r.counter = counter;
r.value = value;
write(g_instance.global_context, EventType::COUNTER, (u8*)&r, sizeof(r));
}
void pushInt(const char* key, int value)
{
ThreadContext* ctx = g_instance.getThreadContext();
IntRecord r;
r.key = key;
r.value = value;
write(*ctx, EventType::INT, (u8*)&r, sizeof(r));
}
void pushString(const char* value)
{
ThreadContext* ctx = g_instance.getThreadContext();
write(*ctx, EventType::STRING, (u8*)value, stringLength(value) + 1);
}
void blockColor(u8 r, u8 g, u8 b)
{
const u32 color = 0xff000000 + r + (g << 8) + (b << 16);
ThreadContext* ctx = g_instance.getThreadContext();
write(*ctx, EventType::BLOCK_COLOR, color);
}
static volatile i32 last_block_id = 0;
static void continueBlock(i32 block_id) {
ThreadContext* ctx = g_instance.getThreadContext();
ctx->open_blocks.push(block_id);
write(*ctx, EventType::CONTINUE_BLOCK, block_id);
}
void beginBlock(const char* name)
{
BlockRecord r;
r.id = atomicIncrement(&last_block_id);
r.name = name;
ThreadContext* ctx = g_instance.getThreadContext();
ctx->open_blocks.push(r.id);
write(*ctx, EventType::BEGIN_BLOCK, r);
}
void beginGPUBlock(const char* name, u64 timestamp, i64 profiler_link)
{
GPUBlock data;
data.timestamp = timestamp;
copyString(data.name, name);
data.profiler_link = profiler_link;
write(g_instance.global_context, EventType::BEGIN_GPU_BLOCK, data);
}
void gpuStats(u64 primitives_generated) {
write(g_instance.global_context, EventType::GPU_STATS, primitives_generated);
}
void endGPUBlock(u64 timestamp)
{
write(g_instance.global_context, EventType::END_GPU_BLOCK, timestamp);
}
i64 createNewLinkID()
{
static i64 counter = 0;
return atomicIncrement(&counter);
}
void link(i64 link)
{
ThreadContext* ctx = g_instance.getThreadContext();
write(*ctx, EventType::LINK, link);
}
float getLastFrameDuration()
{
return float(g_instance.last_frame_duration / double(frequency()));
}
void beforeFiberSwitch()
{
ThreadContext* ctx = g_instance.getThreadContext();
while(!ctx->open_blocks.empty()) {
write(*ctx, EventType::END_BLOCK, 0);
ctx->open_blocks.pop();
}
}
void pushJobInfo(i32 signal_on_finish)
{
JobRecord r;
r.signal_on_finish = signal_on_finish;
ThreadContext* ctx = g_instance.getThreadContext();
write(*ctx, EventType::JOB_INFO, r);
}
void signalTriggered(i32 job_system_signal) {
ThreadContext* ctx = g_instance.getThreadContext();
write(*ctx, EventType::SIGNAL_TRIGGERED, job_system_signal);
}
FiberSwitchData beginFiberWait(i32 job_system_signal, bool is_mutex)
{
FiberWaitRecord r;
r.id = atomicIncrement(&g_instance.fiber_wait_id);
r.job_system_signal = job_system_signal;
r.is_mutex = is_mutex;
FiberSwitchData res;
ThreadContext* ctx = g_instance.getThreadContext();
res.count = ctx->open_blocks.size();
res.id = r.id;
res.signal = job_system_signal;
memcpy(res.blocks, ctx->open_blocks.begin(), minimum(res.count, lengthOf(res.blocks)) * sizeof(res.blocks[0]));
write(*ctx, EventType::BEGIN_FIBER_WAIT, r);
return res;
}
void endFiberWait(const FiberSwitchData& switch_data)
{
ThreadContext* ctx = g_instance.getThreadContext();
FiberWaitRecord r;
r.id = switch_data.id;
r.job_system_signal = switch_data.signal;
r.is_mutex = false;
write(*ctx, EventType::END_FIBER_WAIT, r);
const u32 count = switch_data.count;
for (u32 i = 0; i < count; ++i) {
if(i < lengthOf(switch_data.blocks)) {
continueBlock(switch_data.blocks[i]);
} else {
continueBlock(-1);
}
}
}
void endBlock()
{
ThreadContext* ctx = g_instance.getThreadContext();
if(!ctx->open_blocks.empty()) {
ctx->open_blocks.pop();
write(*ctx, EventType::END_BLOCK, 0);
}
}
u64 frequency()
{
return g_instance.timer.getFrequency();
}
bool contextSwitchesEnabled()
{
return g_instance.context_switches_enabled;
}
void frame()
{
const u64 n = os::Timer::getRawTimestamp();
if (g_instance.last_frame_time != 0) {
g_instance.last_frame_duration = n - g_instance.last_frame_time;
}
g_instance.last_frame_time = n;
write(g_instance.global_context, EventType::FRAME, 0);
}
void showInProfiler(bool show)
{
ThreadContext* ctx = g_instance.getThreadContext();
MutexGuard lock(ctx->mutex);
ctx->show_in_profiler = show;
}
void setThreadName(const char* name)
{
ThreadContext* ctx = g_instance.getThreadContext();
MutexGuard lock(ctx->mutex);
ctx->name = name;
}
template <typename T>
static void read(const ThreadContext& ctx, u32 p, T& value)
{
const u8* buf = ctx.buffer.data();
const u32 buf_size = (u32)ctx.buffer.size();
const u32 l = p % buf_size;
if (l + sizeof(value) <= buf_size) {
memcpy(&value, buf + l, sizeof(value));
return;
}
memcpy(&value, buf + l, buf_size - l);
memcpy((u8*)&value + (buf_size - l), buf, sizeof(value) - (buf_size - l));
}
static void saveStrings(OutputMemoryStream& blob) {
HashMap<const char*, const char*> map(g_instance.allocator);
map.reserve(512);
auto gather = [&](const ThreadContext& ctx){
u32 p = ctx.begin;
const u32 end = ctx.end;
while (p != end) {
profiler::EventHeader header;
read(ctx, p, header);
switch (header.type) {
case profiler::EventType::BEGIN_BLOCK: {
BlockRecord b;
read(ctx, p + sizeof(profiler::EventHeader), b);
if (!map.find(b.name).isValid()) {
map.insert(b.name, b.name);
}
break;
}
case profiler::EventType::INT: {
IntRecord r;
read(ctx, p + sizeof(profiler::EventHeader), r);
if (!map.find(r.key).isValid()) {
map.insert(r.key, r.key);
}
break;
}
default: break;
}
p += header.size;
}
};
gather(g_instance.global_context);
for (const ThreadContext* ctx : g_instance.contexts) {
gather(*ctx);
}
blob.write(map.size());
for (auto iter : map) {
blob.write((u64)(uintptr)iter);
blob.write(iter, strlen(iter) + 1);
}
}
void serialize(OutputMemoryStream& blob, ThreadContext& ctx) {
MutexGuard lock(ctx.mutex);
blob.writeString(ctx.name);
blob.write(ctx.thread_id);
blob.write(ctx.begin);
blob.write(ctx.end);
blob.write((u8)ctx.show_in_profiler);
blob.write((u32)ctx.buffer.size());
blob.write(ctx.buffer.data(), ctx.buffer.size());
}
void serialize(OutputMemoryStream& blob) {
MutexGuard lock(g_instance.mutex);
const u32 version = 0;
blob.write(version);
blob.write((u32)g_instance.counters.size());
blob.write(g_instance.counters.begin(), g_instance.counters.byte_size());
blob.write((u32)g_instance.contexts.size());
serialize(blob, g_instance.global_context);
for (ThreadContext* ctx : g_instance.contexts) {
serialize(blob, *ctx);
}
saveStrings(blob);
}
void pause(bool paused)
{
if (paused) write(g_instance.global_context, EventType::PAUSE, 0);
g_instance.paused = paused;
if (paused) g_instance.paused_time = os::Timer::getRawTimestamp();
}
} // namespace Lumix
} // namespace profiler