平台检测宏定义
// Platform flags. Every flag starts out as 0; the detection chain that follows
// re-defines the flag matching the build target to 1.
#define PLATFORM_WINDOWS 0
#define PLATFORM_LINUX 0
#define PLATFORM_APPLE 0
#define PLATFORM_IOS 0
#define PLATFORM_ANDROID 0
查阅文档发现各个平台的 RenderDoc 库名称不同,因此需要用宏来检测各个平台,这里我定义了这几个宏,分别代表 Windows、Linux 等
// Windows: any of the common Win32 macros, or the MSVC compiler macro.
#if defined(WIN32) || defined(__WIN32__) || defined(_WIN32) || defined(_MSC_VER)
#undef PLATFORM_WINDOWS
#define PLATFORM_WINDOWS 1
// Apple platforms.
#elif defined(__APPLE__)
#undef PLATFORM_APPLE
#define PLATFORM_APPLE 1
// detection of the other platforms (not shown in this excerpt)
Windows 的话,可以通过 `__WIN32__` 或者检测是否有 MSVC(`_MSC_VER`)就行;Android 用 `__ANDROID__`;Apple 平台用 `__APPLE__`;剩下的识别为 Linux。
RenderDoc API 初始化
// Aliases for the RenderDoc API struct and version enum in use, so that moving
// to a newer RenderDoc API version only requires changing these two lines.
#define RENDERDOC_API_LATEST RENDERDOC_API_1_6_0
#define eRENDERDOC_API_Version_LATEST eRENDERDOC_API_Version_1_6_0
这里定义 LATEST 宏可以方便后续更新 RenderDoc 版本
Windows 平台
// Fetch the handle of renderdoc.dll only if it is already loaded in this
// process; GetModuleHandleA never loads the module itself.
HMODULE mod = GetModuleHandleA("renderdoc.dll");
`GetModuleHandleA()` 获取已加载模块的句柄,不会加载新模块。也就是说,RenderDoc 必须已经注入到进程中,否则 `GetModuleHandleA()` 返回 NULL,也就无法获取 RenderDoc API。
// Resolve the RenderDoc API entry point exported by the module.
RENDERDOC_GetAPI = reinterpret_cast<pRENDERDOC_GetAPI>(GetProcAddress(mod, "RENDERDOC_GetAPI"));
- 获取 `RENDERDOC_GetAPI` 函数的地址,它是 RenderDoc API 的入口函数
Linux/Android/macOS 平台
// RTLD_NOW and RTLD_LAZY are mutually exclusive binding modes, so only
// RTLD_NOW is requested. RTLD_NOLOAD yields a handle only when the library is
// already resident in the process (i.e. RenderDoc has been injected) and never
// loads a fresh copy.
void *mod = dlopen("librenderdoc.so", RTLD_NOW | RTLD_NOLOAD);
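`dlopen()` 按给定的标志打开共享库。需要注意:`RTLD_NOW`(加载时立即解析全部符号)和 `RTLD_LAZY`(延迟到首次使用时才解析)是互斥的两种绑定模式,不应同时指定;RenderDoc 官方文档推荐的写法是 `RTLD_NOW | RTLD_NOLOAD`,即只获取已经注入进程的库的句柄,而不重新加载。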
// Resolve the RenderDoc API entry point exported by the shared library.
RENDERDOC_GetAPI = reinterpret_cast<pRENDERDOC_GetAPI>(dlsym(mod, "RENDERDOC_GetAPI"));
`dlsym()` 获取符号地址。
API 初始化
// Ask RenderDoc for its API table once the entry point has been resolved.
if (RENDERDOC_GetAPI)
{
    RENDERDOC_API_LATEST* api = NULL;
    const int status = RENDERDOC_GetAPI(eRENDERDOC_API_Version_LATEST, reinterpret_cast<void**>(&api));

    // RENDERDOC_GetAPI reports success with 1; also require a non-NULL table.
    const bool ready = (status == 1) && (api != NULL);
    if (ready)
    {
        // 0 = do not mute the API debug output
        api->SetCaptureOptionU32(eRENDERDOC_Option_DebugOutputMute, 0);
        NCNN_LOGE("RenderDoc API initialized successfully");
        return api;
    }
}
`RENDERDOC_GetAPI()` 获取 API,返回 1 表示成功;`eRENDERDOC_API_Version_LATEST` 是先前定义的宏,代表 API 版本;`SetCaptureOptionU32(eRENDERDOC_Option_DebugOutputMute, 0)` 表示不屏蔽调试输出,即启用调试输出。
捕获控制
// Starts a RenderDoc frame capture for the given Vulkan instance.
//
// renderdoc_api: RenderDoc API table; when NULL an error is logged and no
//                capture is started.
// instance:      Vulkan instance, converted to a RenderDoc device pointer.
static void ncnn_vulkan_begin_renderdoc_capture(RENDERDOC_API_LATEST* renderdoc_api, VkInstance instance)
{
    if (renderdoc_api != NULL)
    {
        NCNN_LOGE("Starting RenderDoc capture...");
        // The window handle argument is left NULL.
        renderdoc_api->StartFrameCapture(RENDERDOC_DEVICEPOINTER_FROM_VKINSTANCE(instance), NULL);
    }
    else
    {
        NCNN_LOGE("RenderDoc API is NULL, cannot begin capture");
    }
}
`RENDERDOC_DEVICEPOINTER_FROM_VKINSTANCE(instance)` 是一个宏,它可以将 Vulkan 实例转换为 RenderDoc 设备指针;第二个参数 `NULL` 表示不指定窗口句柄,对于计算着色器来说可以为空;`StartFrameCapture()` 开始捕获。
// Ends the RenderDoc frame capture for the given Vulkan instance.
//
// renderdoc_api: RenderDoc API table.
// instance:      Vulkan instance, converted to a RenderDoc device pointer.
static void ncnn_vulkan_end_renderdoc_capture(
RENDERDOC_API_LATEST* renderdoc_api, VkInstance instance) {
// ... (code elided in this excerpt; presumably the same NULL check and logging
// as in the begin function - confirm against the full source)
renderdoc_api->EndFrameCapture(
RENDERDOC_DEVICEPOINTER_FROM_VKINSTANCE(instance), NULL);
}
`EndFrameCapture()` 停止捕获。
配置 RenderDoc
// Configures the RenderDoc capture options and the capture file path template.
//
// rdoc_api: RenderDoc API table; the call does nothing when it is NULL.
//
// The path template is read from the NCNN_RENDERDOC_CAPTURE_PATH environment
// variable. When the variable is unset or empty, "./ncnn_capture" is used.
static void configure_renderdoc_headless(RENDERDOC_API_LATEST* rdoc_api)
{
    if (!rdoc_api) return;

    // capture options: no vsync / fullscreen, API validation and callstack
    // capture switched on, debug output not muted
    rdoc_api->SetCaptureOptionU32(eRENDERDOC_Option_AllowVSync, 0);
    rdoc_api->SetCaptureOptionU32(eRENDERDOC_Option_AllowFullscreen, 0);
    rdoc_api->SetCaptureOptionU32(eRENDERDOC_Option_APIValidation, 1);
    rdoc_api->SetCaptureOptionU32(eRENDERDOC_Option_CaptureCallstacks, 1);
    rdoc_api->SetCaptureOptionU32(eRENDERDOC_Option_DebugOutputMute, 0);

    // capture file path template
    const char* capture_path = getenv("NCNN_RENDERDOC_CAPTURE_PATH");
    if (!capture_path || capture_path[0] == '\0')
    {
        // An empty value is not a usable path template, so it is handled like
        // an unset variable instead of being passed on to RenderDoc.
        capture_path = "./ncnn_capture";
    }
    rdoc_api->SetCaptureFilePathTemplate(capture_path);
}
这里我定义了一个 `NCNN_RENDERDOC_CAPTURE_PATH` 环境变量用来指定 rdc 文件的路径模板;未设置时默认保存在当前文件夹,文件名以 `ncnn_capture` 为前缀(RenderDoc 会在模板后面追加后缀和 `.rdc` 扩展名)。
添加成员
// Holder of the global Vulkan instance state, extended with the RenderDoc API
// pointer.
class __ncnn_vulkan_instance_holder
{
public:
// existing members (omitted in this excerpt)
#if NCNN_ENABLE_RENDERDOC_PROFILING
// RenderDoc API table; this member only exists when the build enables
// NCNN_ENABLE_RENDERDOC_PROFILING
RENDERDOC_API_LATEST* renderdoc_api;
#endif
};
在全局 Vulkan 实例中添加 RenderDoc API 指针
队列选择
// Returns the index of the queue family to use for compute work.
//
// queueFamilyProperties:         queue family properties to search.
// prefer_graphics_for_renderdoc: when true, the first family supporting both
//                                compute and graphics is returned if one
//                                exists; otherwise the search falls through
//                                to the pre-existing lookup logic.
static uint32_t find_device_compute_queue(const std::vector<VkQueueFamilyProperties>& queueFamilyProperties, bool prefer_graphics_for_renderdoc = false)
{
if (prefer_graphics_for_renderdoc) {
for (uint32_t i = 0; i < queueFamilyProperties.size(); i++)
{
const VkQueueFamilyProperties& queueFamilyProperty = queueFamilyProperties[i];
// accept the family only when both capability bits are set
if ((queueFamilyProperty.queueFlags & VK_QUEUE_COMPUTE_BIT)
&& (queueFamilyProperty.queueFlags & VK_QUEUE_GRAPHICS_BIT))
{
return i;
}
}
}
// pre-existing lookup logic (not shown in this excerpt)
当 `prefer_graphics_for_renderdoc` 为 true 时,优先寻找同时支持图形和计算的队列族,即 `queueFlags` 中 `VK_QUEUE_COMPUTE_BIT` 和 `VK_QUEUE_GRAPHICS_BIT` 都被设置的队列族。
修改 CMakeLists.txt
# Build switch for the RenderDoc integration; off by default.
option(NCNN_ENABLE_RENDERDOC_PROFILING "Enables profiling with the RenderDoc tool." OFF)
if(NCNN_ENABLE_RENDERDOC_PROFILING)
# Expose the switch to the C++ sources as a preprocessor definition.
add_definitions(-DNCNN_ENABLE_RENDERDOC_PROFILING=1)
endif()
添加 CMake 构建选项,可以通过 `-DNCNN_ENABLE_RENDERDOC_PROFILING=ON` 启用 RenderDoc 支持。
公共 API
#if NCNN_ENABLE_RENDERDOC_PROFILING
// Configures RenderDoc and begins a capture on the global Vulkan instance.
// Does nothing when either the RenderDoc API or the instance is missing.
void start_renderdoc_capture()
{
    if (!g_instance.renderdoc_api || !g_instance.instance)
        return;

    configure_renderdoc_headless(g_instance.renderdoc_api);
    ncnn_vulkan_begin_renderdoc_capture(g_instance.renderdoc_api, g_instance.instance);
}

// Ends the capture on the global Vulkan instance.
// Does nothing when either the RenderDoc API or the instance is missing.
void end_renderdoc_capture()
{
    if (!g_instance.renderdoc_api || !g_instance.instance)
        return;

    ncnn_vulkan_end_renderdoc_capture(g_instance.renderdoc_api, g_instance.instance);
}
#endif
用户需要捕获调用细节时直接使用这两个函数就行了
构建
这里需要启用 NCNN_VULKAN 并关掉 NCNN_SIMPLEVK,这样 validation layer 才会生效
# Out-of-source build in ./build
mkdir -p build
cd build
# Vulkan on, simplevk off, RenderDoc profiling on
cmake -DNCNN_VULKAN=ON -DNCNN_SIMPLEVK=OFF -DNCNN_ENABLE_RENDERDOC_PROFILING=ON ..
# Build using all available cores
make -j$(nproc)
使用
在程序中添加捕获调用
// Usage example: wrap the work to be profiled between the two capture calls.
// NOTE(review): start_renderdoc_capture() does nothing unless the global Vulkan
// instance already exists (it checks g_instance.instance) - presumably the GPU
// instance must be created before this call; confirm against the full source.
int main()
{
// begin the RenderDoc capture
ncnn::start_renderdoc_capture();
// ... inference work ...
// end the RenderDoc capture
ncnn::end_renderdoc_capture();
return 0;
}
捕获 GPU 操作
NCNN_RENDERDOC_CAPTURE_PATH="/path/to/capture_file" renderdoccmd capture your_ncnn_application
我已经在benchncnn中添加了 RenderDoc 支持,跑个 benchmark 看看
$ renderdoccmd capture ./benchncnn 8 4 0 0 1
Launching './benchncnn' with params: 8 4 0 0 1
Launched as ID 38920
RenderDoc API initialized successfully
validation layer: linux_read_sorted_physical_devices:
validation layer: Original order:
validation layer: [0] llvmpipe (LLVM 20.1.8, 256 bits)
validation layer: [1] Intel(R) Iris(R) Xe Graphics (TGL GT2)
validation layer: Sorted order:
validation layer: [0] Intel(R) Iris(R) Xe Graphics (TGL GT2)
validation layer: [1] llvmpipe (LLVM 20.1.8, 256 bits)
validation layer: Copying old device 0 into new device 0
validation layer: Copying old device 1 into new device 1
validation layer: linux_read_sorted_physical_devices:
validation layer: Original order:
validation layer: [0] llvmpipe (LLVM 20.1.8, 256 bits)
validation layer: [1] Intel(R) Iris(R) Xe Graphics (TGL GT2)
validation layer: Sorted order:
validation layer: [0] Intel(R) Iris(R) Xe Graphics (TGL GT2)
validation layer: [1] llvmpipe (LLVM 20.1.8, 256 bits)
validation layer: Copying old device 0 into new device 0
validation layer: Copying old device 1 into new device 1
validation layer: linux_read_sorted_physical_devices:
validation layer: Original order:
validation layer: [0] llvmpipe (LLVM 20.1.8, 256 bits)
validation layer: [1] Intel(R) Iris(R) Xe Graphics (TGL GT2)
validation layer: Sorted order:
validation layer: [0] Intel(R) Iris(R) Xe Graphics (TGL GT2)
validation layer: [1] llvmpipe (LLVM 20.1.8, 256 bits)
validation layer: Copying old device 0 into new device 0
validation layer: Copying old device 1 into new device 1
validation layer: linux_read_sorted_physical_devices:
validation layer: Original order:
validation layer: [0] llvmpipe (LLVM 20.1.8, 256 bits)
validation layer: [1] Intel(R) Iris(R) Xe Graphics (TGL GT2)
validation layer: Sorted order:
validation layer: [0] Intel(R) Iris(R) Xe Graphics (TGL GT2)
validation layer: [1] llvmpipe (LLVM 20.1.8, 256 bits)
validation layer: Copying old device 0 into new device 0
validation layer: Copying old device 1 into new device 1
validation layer: Removing driver /usr/lib64/libvulkan_asahi.so due to not having any physical devices
validation layer: Removing driver /usr/lib64/libvulkan_virtio.so due to not having any physical devices
validation layer: Removing driver /usr/lib64/libvulkan_radeon.so due to not having any physical devices
validation layer: Removing driver /usr/lib64/libvulkan_powervr_mesa.so due to not having any physical devices
validation layer: Removing driver /usr/lib64/libvulkan_panfrost.so due to not having any physical devices
validation layer: Removing driver /usr/lib64/libvulkan_nouveau.so due to not having any physical devices
validation layer: Removing driver /usr/lib64/libvulkan_intel_hasvk.so due to not having any physical devices
validation layer: Removing driver /usr/lib64/libvulkan_freedreno.so due to not having any physical devices
validation layer: Removing driver /usr/lib64/libvulkan_broadcom.so due to not having any physical devices
[0 Intel(R) Iris(R) Xe Graphics (TGL GT2)] queueC=0[1] queueT=0[1]
[0 Intel(R) Iris(R) Xe Graphics (TGL GT2)] fp16-p/s/u/a=1/1/1/1 int8-p/s/u/a=1/1/1/1
[0 Intel(R) Iris(R) Xe Graphics (TGL GT2)] subgroup=32(8~32) ops=1/1/1/1/1/1/1/1/1/1
[0 Intel(R) Iris(R) Xe Graphics (TGL GT2)] fp16-cm=0 int8-cm=0 bf16-cm=0 fp8-cm=0
[1 llvmpipe (LLVM 20.1.8, 256 bits)] queueC=0[1] queueT=0[1]
[1 llvmpipe (LLVM 20.1.8, 256 bits)] fp16-p/s/u/a=1/1/1/1 int8-p/s/u/a=1/1/1/1
[1 llvmpipe (LLVM 20.1.8, 256 bits)] subgroup=8(8~8) ops=1/1/1/1/1/1/1/1/1/1
[1 llvmpipe (LLVM 20.1.8, 256 bits)] fp16-cm=0 int8-cm=0 bf16-cm=0 fp8-cm=0
validation layer: Inserted device layer "VK_LAYER_KHRONOS_validation" (libVkLayer_khronos_validation.so)
validation layer: Inserted device layer "VK_LAYER_RENDERDOC_Capture" (/usr/lib64/renderdoc/librenderdoc.so)
validation layer: vkCreateDevice layer callstack setup to:
validation layer: <Application>
validation layer: ||
validation layer: <Loader>
validation layer: ||
validation layer: VK_LAYER_RENDERDOC_Capture
validation layer: Type: Implicit
validation layer: Enabled By: Implicit Layer
validation layer: Disable Env Var: DISABLE_VULKAN_RENDERDOC_CAPTURE_1_39
validation layer: Manifest: /usr/share/vulkan/implicit_layer.d/renderdoc_capture.json
validation layer: Library: /usr/lib64/renderdoc/librenderdoc.so
validation layer: ||
validation layer: VK_LAYER_KHRONOS_validation
validation layer: Type: Explicit
validation layer: Enabled By: By the Application
validation layer: Manifest: /usr/share/vulkan/explicit_layer.d/VkLayer_khronos_validation.json
validation layer: Library: libVkLayer_khronos_validation.so
validation layer: ||
validation layer: <Device>
validation layer: Using "Intel(R) Iris(R) Xe Graphics (TGL GT2)" with driver: "/usr/lib64/libvulkan_intel.so"
Starting RenderDoc capture...
loop_count = 8
num_threads = 4
powersave = 0
gpu_device = 0
cooling_down = 1
fopen squeezenet.param failed
network graph not ready
squeezenet min = 0.00 max = 0.00 avg = 0.00
mobilenet min = 18.00 max = 19.46 avg = 19.02
mobilenet_v2 min = 17.87 max = 19.36 avg = 18.76
mobilenet_v3 min = 20.07 max = 24.74 avg = 21.90
fopen shufflenet.param failed
network graph not ready
shufflenet min = 0.00 max = 0.00 avg = 0.00
fopen shufflenet_v2.param failed
network graph not ready
shufflenet_v2 min = 0.00 max = 0.00 avg = 0.00
mnasnet min = 18.63 max = 24.55 avg = 20.79
proxylessnasnet min = 21.37 max = 27.65 avg = 24.58
efficientnet_b0 min = 23.01 max = 23.75 avg = 23.36
efficientnetv2_b0 min = 172.44 max = 260.64 avg = 188.77
regnety_400m min = 25.58 max = 27.10 avg = 26.13
blazeface min = 11.35 max = 11.62 avg = 11.44
googlenet min = 38.80 max = 43.13 avg = 40.21
fopen resnet18.param failed
network graph not ready
resnet18 min = 0.00 max = 0.00 avg = 0.00
alexnet min = 39.59 max = 46.78 avg = 43.20
fopen vgg16.param failed
network graph not ready
vgg16 min = 0.00 max = 0.00 avg = 0.00
fopen resnet50.param failed
network graph not ready
resnet50 min = 0.00 max = 0.00 avg = 0.00
fopen squeezenet_ssd.param failed
network graph not ready
squeezenet_ssd min = 0.00 max = 0.00 avg = 0.00
mobilenet_ssd min = 24.16 max = 29.74 avg = 26.34
mobilenet_yolo min = 26.62 max = 32.16 avg = 29.69
mobilenetv2_yolov3 min = 21.91 max = 25.67 avg = 23.68
fopen yolov4-tiny.param failed
network graph not ready
yolov4-tiny min = 0.00 max = 0.00 avg = 0.00
nanodet_m min = 45.24 max = 51.48 avg = 49.53
fopen yolo-fastest-1.1.param failed
network graph not ready
yolo-fastest-1.1 min = 0.00 max = 0.00 avg = 0.00
fopen yolo-fastestv2.param failed
network graph not ready
yolo-fastestv2 min = 0.00 max = 0.00 avg = 0.00
fopen vision_transformer.param failed
network graph not ready
vision_transformer min = 0.00 max = 0.00 avg = 0.00
FastestDet min = 15.22 max = 18.05 avg = 16.52
Ending RenderDoc capture...
使用 RenderDoc GUI 分析 rdc 文件