diff --git a/src/main.cpp b/src/main.cpp index 51abccc..99befd8 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -34,9 +34,11 @@ struct Gpu { VkCommandPool pool = nullptr; // for benchmarks VkDeviceMemory stagingMem = nullptr; + VkDeviceMemory cachedMem = nullptr; VkDeviceMemory deviceMem = nullptr; VkDeviceMemory pinnedMem = nullptr; VkBuffer stagingBuf = nullptr; + VkBuffer cachedBuf = nullptr; VkBuffer deviceBuf = nullptr; VkBuffer pinnedBuf = nullptr; }; @@ -184,6 +186,14 @@ Gpu initGpu(VkPhysicalDevice phy, uint32_t buffer_size) { gpu.deviceMem = allocateMem(gpu.device, phy, gpu.deviceBuf, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); + gpu.cachedBuf = createBuffer(gpu.device, buffer_size, + VK_BUFFER_USAGE_TRANSFER_DST_BIT | + VK_BUFFER_USAGE_TRANSFER_SRC_BIT); + gpu.cachedMem = allocateMem(gpu.device, phy, gpu.cachedBuf, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT | + VK_MEMORY_PROPERTY_HOST_CACHED_BIT); + gpu.pinnedBuf = createBuffer(gpu.device, buffer_size, VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT); @@ -199,9 +209,11 @@ void cleanupGpu(Gpu &gpu) { vkDestroyBuffer(gpu.device, gpu.stagingBuf, nullptr); vkDestroyBuffer(gpu.device, gpu.deviceBuf, nullptr); + vkDestroyBuffer(gpu.device, gpu.cachedBuf, nullptr); vkDestroyBuffer(gpu.device, gpu.pinnedBuf, nullptr); vkFreeMemory(gpu.device, gpu.stagingMem, nullptr); vkFreeMemory(gpu.device, gpu.deviceMem, nullptr); + vkFreeMemory(gpu.device, gpu.cachedMem, nullptr); vkFreeMemory(gpu.device, gpu.pinnedMem, nullptr); vkDestroyCommandPool(gpu.device, gpu.pool, nullptr); vkDestroyDevice(gpu.device, nullptr); @@ -254,22 +266,22 @@ BenchmarkResult runStagedBenchmark(Gpu &gpu, BenchmarkConfig config) { return {tH2D, tD2H}; } -BenchmarkResult runPinnedBenchmark(Gpu &gpu, BenchmarkConfig config) { +BenchmarkResult runCachedBenchmark(Gpu &gpu, BenchmarkConfig config) { // fill staging buffer void *mapped; - vkMapMemory(gpu.device, gpu.pinnedMem, 0, config.buffer_size, 0, &mapped); + vkMapMemory(gpu.device, gpu.cachedMem, 0, config.buffer_size, 0, &mapped); std::memset(mapped, 0xAB, config.buffer_size); - vkUnmapMemory(gpu.device, gpu.pinnedMem); + vkUnmapMemory(gpu.device, gpu.cachedMem); // warm-up, probably not significant - benchCopy(gpu.device, gpu.pool, gpu.queue, gpu.stagingBuf, gpu.deviceBuf, + benchCopy(gpu.device, gpu.pool, gpu.queue, gpu.cachedBuf, gpu.deviceBuf, config.buffer_size); // ---- benchmark host->device ---- double tH2D = 0.0; for (uint32_t i = 0; i < config.iterations; ++i) - tH2D += benchCopy(gpu.device, gpu.pool, gpu.queue, gpu.pinnedBuf, + tH2D += benchCopy(gpu.device, gpu.pool, gpu.queue, gpu.cachedBuf, gpu.pinnedBuf, config.buffer_size); tH2D /= config.iterations; @@ -277,7 +289,7 @@ BenchmarkResult runPinnedBenchmark(Gpu &gpu, BenchmarkConfig config) { double tD2H = 0.0; for (uint32_t i = 0; i < config.iterations; ++i) tD2H += benchCopy(gpu.device, gpu.pool, gpu.queue, gpu.pinnedBuf, - gpu.pinnedBuf, config.buffer_size); + gpu.cachedBuf, config.buffer_size); tD2H /= config.iterations; return {tH2D, tD2H}; @@ -331,8 +343,8 @@ auto main() -> int { BenchmarkResult res = runStagedBenchmark(gpu, config); reportBenchmark(res, gpu, config); - std::cout << "Running pinned benchmark" << std::endl; - res = runPinnedBenchmark(gpu, config); + std::cout << "Running cached benchmark" << std::endl; + res = runCachedBenchmark(gpu, config); reportBenchmark(res, gpu, config); std::cout << "--------------------" << std::endl;