cached transfer test as well

This commit is contained in:
2025-11-09 22:22:48 +02:00
parent e587c5bceb
commit c40571d3a7

View File

@@ -34,9 +34,11 @@ struct Gpu {
VkCommandPool pool = nullptr; VkCommandPool pool = nullptr;
// for benchmarks // for benchmarks
VkDeviceMemory stagingMem = nullptr; VkDeviceMemory stagingMem = nullptr;
VkDeviceMemory cachedMem = nullptr;
VkDeviceMemory deviceMem = nullptr; VkDeviceMemory deviceMem = nullptr;
VkDeviceMemory pinnedMem = nullptr; VkDeviceMemory pinnedMem = nullptr;
VkBuffer stagingBuf = nullptr; VkBuffer stagingBuf = nullptr;
VkBuffer cachedBuf = nullptr;
VkBuffer deviceBuf = nullptr; VkBuffer deviceBuf = nullptr;
VkBuffer pinnedBuf = nullptr; VkBuffer pinnedBuf = nullptr;
}; };
@@ -184,6 +186,14 @@ Gpu initGpu(VkPhysicalDevice phy, uint32_t buffer_size) {
gpu.deviceMem = allocateMem(gpu.device, phy, gpu.deviceBuf, gpu.deviceMem = allocateMem(gpu.device, phy, gpu.deviceBuf,
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
gpu.cachedBuf = createBuffer(gpu.device, buffer_size,
VK_BUFFER_USAGE_TRANSFER_DST_BIT |
VK_BUFFER_USAGE_TRANSFER_SRC_BIT);
gpu.cachedMem = allocateMem(gpu.device, phy, gpu.cachedBuf,
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
VK_MEMORY_PROPERTY_HOST_CACHED_BIT);
gpu.pinnedBuf = createBuffer(gpu.device, buffer_size, gpu.pinnedBuf = createBuffer(gpu.device, buffer_size,
VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
VK_BUFFER_USAGE_TRANSFER_DST_BIT); VK_BUFFER_USAGE_TRANSFER_DST_BIT);
@@ -199,9 +209,11 @@ void cleanupGpu(Gpu &gpu) {
vkDestroyBuffer(gpu.device, gpu.stagingBuf, nullptr); vkDestroyBuffer(gpu.device, gpu.stagingBuf, nullptr);
vkDestroyBuffer(gpu.device, gpu.deviceBuf, nullptr); vkDestroyBuffer(gpu.device, gpu.deviceBuf, nullptr);
vkDestroyBuffer(gpu.device, gpu.cachedBuf, nullptr);
vkDestroyBuffer(gpu.device, gpu.pinnedBuf, nullptr); vkDestroyBuffer(gpu.device, gpu.pinnedBuf, nullptr);
vkFreeMemory(gpu.device, gpu.stagingMem, nullptr); vkFreeMemory(gpu.device, gpu.stagingMem, nullptr);
vkFreeMemory(gpu.device, gpu.deviceMem, nullptr); vkFreeMemory(gpu.device, gpu.deviceMem, nullptr);
vkFreeMemory(gpu.device, gpu.cachedMem, nullptr);
vkFreeMemory(gpu.device, gpu.pinnedMem, nullptr); vkFreeMemory(gpu.device, gpu.pinnedMem, nullptr);
vkDestroyCommandPool(gpu.device, gpu.pool, nullptr); vkDestroyCommandPool(gpu.device, gpu.pool, nullptr);
vkDestroyDevice(gpu.device, nullptr); vkDestroyDevice(gpu.device, nullptr);
@@ -254,22 +266,22 @@ BenchmarkResult runStagedBenchmark(Gpu &gpu, BenchmarkConfig config) {
return {tH2D, tD2H}; return {tH2D, tD2H};
} }
BenchmarkResult runPinnedBenchmark(Gpu &gpu, BenchmarkConfig config) { BenchmarkResult runCachedBenchmark(Gpu &gpu, BenchmarkConfig config) {
// fill staging buffer // fill staging buffer
void *mapped; void *mapped;
vkMapMemory(gpu.device, gpu.pinnedMem, 0, config.buffer_size, 0, &mapped); vkMapMemory(gpu.device, gpu.cachedMem, 0, config.buffer_size, 0, &mapped);
std::memset(mapped, 0xAB, config.buffer_size); std::memset(mapped, 0xAB, config.buffer_size);
vkUnmapMemory(gpu.device, gpu.pinnedMem); vkUnmapMemory(gpu.device, gpu.cachedMem);
// warm-up, probably not significant // warm-up, probably not significant
benchCopy(gpu.device, gpu.pool, gpu.queue, gpu.stagingBuf, gpu.deviceBuf, benchCopy(gpu.device, gpu.pool, gpu.queue, gpu.cachedBuf, gpu.deviceBuf,
config.buffer_size); config.buffer_size);
// ---- benchmark host->device ---- // ---- benchmark host->device ----
double tH2D = 0.0; double tH2D = 0.0;
for (uint32_t i = 0; i < config.iterations; ++i) for (uint32_t i = 0; i < config.iterations; ++i)
tH2D += benchCopy(gpu.device, gpu.pool, gpu.queue, gpu.pinnedBuf, tH2D += benchCopy(gpu.device, gpu.pool, gpu.queue, gpu.cachedBuf,
gpu.pinnedBuf, config.buffer_size); gpu.pinnedBuf, config.buffer_size);
tH2D /= config.iterations; tH2D /= config.iterations;
@@ -277,7 +289,7 @@ BenchmarkResult runPinnedBenchmark(Gpu &gpu, BenchmarkConfig config) {
double tD2H = 0.0; double tD2H = 0.0;
for (uint32_t i = 0; i < config.iterations; ++i) for (uint32_t i = 0; i < config.iterations; ++i)
tD2H += benchCopy(gpu.device, gpu.pool, gpu.queue, gpu.pinnedBuf, tD2H += benchCopy(gpu.device, gpu.pool, gpu.queue, gpu.pinnedBuf,
gpu.pinnedBuf, config.buffer_size); gpu.cachedBuf, config.buffer_size);
tD2H /= config.iterations; tD2H /= config.iterations;
return {tH2D, tD2H}; return {tH2D, tD2H};
@@ -331,8 +343,8 @@ auto main() -> int {
BenchmarkResult res = runStagedBenchmark(gpu, config); BenchmarkResult res = runStagedBenchmark(gpu, config);
reportBenchmark(res, gpu, config); reportBenchmark(res, gpu, config);
std::cout << "Running pinned benchmark" << std::endl; std::cout << "Running cached benchmark" << std::endl;
res = runPinnedBenchmark(gpu, config); res = runCachedBenchmark(gpu, config);
reportBenchmark(res, gpu, config); reportBenchmark(res, gpu, config);
std::cout << "--------------------" << std::endl; std::cout << "--------------------" << std::endl;