From edb9e941d7e33e0c733a2ffc05e6d05185e27c9d Mon Sep 17 00:00:00 2001
From: Kasper Sauramo
Date: Fri, 7 Nov 2025 23:18:23 +0200
Subject: [PATCH] initial version

---
 .gitignore     |   6 ++
 CMakeLists.txt |  17 ++++
 src/main.cpp   | 260 +++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 283 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 CMakeLists.txt
 create mode 100644 src/main.cpp

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0468e39
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+build/
+
+.DS_Store
+.idea
+*.log
+tmp/
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..c1515ed
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,17 @@
+cmake_minimum_required(VERSION 3.19)
+project(vulkan-transfer-bench CXX)
+
+set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+add_compile_definitions(VK_ENABLE_BETA_EXTENSIONS)
+
+add_executable(vulkan-transfer-bench
+    src/main.cpp
+)
+
+target_link_libraries(vulkan-transfer-bench PRIVATE
+    vulkan
+)
+
+target_compile_options(vulkan-transfer-bench PRIVATE -Wall -Wextra)
diff --git a/src/main.cpp b/src/main.cpp
new file mode 100644
index 0000000..5027360
--- /dev/null
+++ b/src/main.cpp
@@ -0,0 +1,260 @@
+#include <vulkan/vulkan.h>
+
+#include <chrono>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <limits>
+#include <vector>
+
+/// TODO: Print available GPU's
+/// TODO: Run benchmark on all GPU's
+/// TODO: Pinned memory
+/// TODO: Plot by buffer size
+
+static constexpr uint64_t BUF_SIZE = 256ULL << 20; // 256 MiB
+static constexpr uint32_t ITERATIONS = 32;
+
+// ---------- helpers ----------
+/// Terminates the process with a diagnostic naming `what` when `res` is a
+/// Vulkan error. Error codes are negative; non-negative status codes such as
+/// VK_INCOMPLETE pass through.
+static void check(VkResult res, const char *what) {
+  if (res < VK_SUCCESS) {
+    std::cerr << what << " failed (VkResult " << static_cast<int>(res)
+              << ")\n";
+    std::exit(EXIT_FAILURE);
+  }
+}
+
+/// Debug-utils messenger callback that ignores its arguments and always
+/// returns VK_FALSE. It is not registered with any messenger in this file.
+/// NOTE(review): hook it up via VkDebugUtilsMessengerCreateInfoEXT or drop it.
+VKAPI_ATTR VkBool32 VKAPI_CALL debugCallback(
+    VkDebugUtilsMessageSeverityFlagBitsEXT, VkDebugUtilsMessageTypeFlagsEXT,
+    const VkDebugUtilsMessengerCallbackDataEXT *, void *) {
+  return VK_FALSE;
+}
+
+/// Returns the index of the first memory type of `phy` that is allowed by
+/// `typeBits` and has every flag in `props`, or UINT32_MAX when none matches.
+uint32_t findMemory(VkPhysicalDevice phy, uint32_t typeBits,
+                    VkMemoryPropertyFlags props) {
+  VkPhysicalDeviceMemoryProperties mem;
+  vkGetPhysicalDeviceMemoryProperties(phy, &mem);
+  for (uint32_t i = 0; i < mem.memoryTypeCount; ++i)
+    if ((typeBits & (1u << i)) &&
+        (mem.memoryTypes[i].propertyFlags & props) == props)
+      return i;
+  return std::numeric_limits<uint32_t>::max();
+}
+
+/// Returns the index of the first queue family of `phy` that can execute
+/// transfer commands, or UINT32_MAX when there is none. Graphics and compute
+/// families count as well because they may leave VK_QUEUE_TRANSFER_BIT unset.
+static uint32_t findTransferQueueFamily(VkPhysicalDevice phy) {
+  uint32_t count = 0;
+  vkGetPhysicalDeviceQueueFamilyProperties(phy, &count, nullptr);
+  std::vector<VkQueueFamilyProperties> families(count);
+  vkGetPhysicalDeviceQueueFamilyProperties(phy, &count, families.data());
+  constexpr VkQueueFlags capable =
+      VK_QUEUE_TRANSFER_BIT | VK_QUEUE_GRAPHICS_BIT | VK_QUEUE_COMPUTE_BIT;
+  for (uint32_t i = 0; i < count; ++i)
+    if (families[i].queueCount > 0 && (families[i].queueFlags & capable))
+      return i;
+  return std::numeric_limits<uint32_t>::max();
+}
+
+/// Creates an exclusive-mode buffer of `size` bytes with the given usage. The
+/// buffer has no memory bound yet. Exits the process if creation fails.
+VkBuffer createBuffer(VkDevice dev, VkDeviceSize size,
+                      VkBufferUsageFlags usage) {
+  VkBufferCreateInfo ci{};
+  ci.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
+  ci.size = size;
+  ci.usage = usage;
+  ci.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
+  VkBuffer buf = VK_NULL_HANDLE;
+  check(vkCreateBuffer(dev, &ci, nullptr, &buf), "vkCreateBuffer");
+  return buf;
+}
+
+/// Allocates memory with the property flags `props` that satisfies the
+/// requirements of `buf` and binds it at offset 0. Exits the process when no
+/// suitable memory type exists or when allocation or binding fails.
+VkDeviceMemory allocateMem(VkDevice dev, VkPhysicalDevice phy, VkBuffer buf,
+                           VkMemoryPropertyFlags props) {
+  VkMemoryRequirements req;
+  vkGetBufferMemoryRequirements(dev, buf, &req);
+  const uint32_t idx = findMemory(phy, req.memoryTypeBits, props);
+  if (idx == std::numeric_limits<uint32_t>::max()) {
+    // Passing the sentinel on as a memory type index would be invalid usage.
+    std::cerr << "No memory type with the requested properties\n";
+    std::exit(EXIT_FAILURE);
+  }
+
+  VkMemoryAllocateInfo ai{};
+  ai.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
+  ai.allocationSize = req.size;
+  ai.memoryTypeIndex = idx;
+  VkDeviceMemory mem = VK_NULL_HANDLE;
+  check(vkAllocateMemory(dev, &ai, nullptr, &mem), "vkAllocateMemory");
+  check(vkBindBufferMemory(dev, buf, mem, 0), "vkBindBufferMemory");
+  return mem;
+}
+
+/// Copies `size` bytes from `src` to `dst` with a one-time command buffer and
+/// returns the wall-clock time in milliseconds from queue submission until the
+/// queue is idle again. Exits the process if any Vulkan call fails.
+double benchCopy(VkDevice dev, VkCommandPool pool, VkQueue queue, VkBuffer src,
+                 VkBuffer dst, VkDeviceSize size) {
+  VkCommandBuffer cmd = VK_NULL_HANDLE;
+  VkCommandBufferAllocateInfo ai{};
+  ai.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
+  ai.commandPool = pool;
+  ai.level = VK_COMMAND_BUFFER_LEVEL_PRIMARY;
+  ai.commandBufferCount = 1;
+  check(vkAllocateCommandBuffers(dev, &ai, &cmd), "vkAllocateCommandBuffers");
+
+  VkCommandBufferBeginInfo bi{};
+  bi.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
+  bi.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
+  check(vkBeginCommandBuffer(cmd, &bi), "vkBeginCommandBuffer");
+  VkBufferCopy region{};
+  region.size = size;
+  vkCmdCopyBuffer(cmd, src, dst, 1, &region);
+  check(vkEndCommandBuffer(cmd), "vkEndCommandBuffer");
+
+  VkSubmitInfo si{};
+  si.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
+  si.commandBufferCount = 1;
+  si.pCommandBuffers = &cmd;
+
+  // Collect the results first so the checks stay outside the timed region.
+  const auto t0 = std::chrono::steady_clock::now();
+  const VkResult submitted = vkQueueSubmit(queue, 1, &si, VK_NULL_HANDLE);
+  const VkResult idle = vkQueueWaitIdle(queue);
+  const auto t1 = std::chrono::steady_clock::now();
+  check(submitted, "vkQueueSubmit");
+  check(idle, "vkQueueWaitIdle");
+
+  vkFreeCommandBuffers(dev, pool, 1, &cmd);
+  return std::chrono::duration<double, std::milli>(t1 - t0).count();
+}
+
+// ---------- main ----------
+int main() {
+  // ---- instance ----
+  VkApplicationInfo app{};
+  app.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO;
+  app.pApplicationName = "VulkanTransferBench";
+  app.apiVersion = VK_API_VERSION_1_2;
+
+  VkInstanceCreateInfo ici{};
+  ici.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
+  ici.pApplicationInfo = &app;
+
+  VkInstance inst = VK_NULL_HANDLE;
+  check(vkCreateInstance(&ici, nullptr, &inst), "vkCreateInstance");
+
+  // ---- physical device ----
+  uint32_t n = 0;
+  check(vkEnumeratePhysicalDevices(inst, &n, nullptr),
+        "vkEnumeratePhysicalDevices");
+  std::vector<VkPhysicalDevice> gpus(n);
+  check(vkEnumeratePhysicalDevices(inst, &n, gpus.data()),
+        "vkEnumeratePhysicalDevices");
+  if (n == 0) {
+    // Indexing gpus[0] below would be out of bounds without a device.
+    std::cerr << "No Vulkan physical device found\n";
+    vkDestroyInstance(inst, nullptr);
+    return EXIT_FAILURE;
+  }
+  VkPhysicalDevice phy = gpus[0];
+
+  // ---- logical device ----
+  const uint32_t family = findTransferQueueFamily(phy);
+  if (family == std::numeric_limits<uint32_t>::max()) {
+    std::cerr << "No queue family supports transfer operations\n";
+    vkDestroyInstance(inst, nullptr);
+    return EXIT_FAILURE;
+  }
+
+  float prio = 1.0f;
+  VkDeviceQueueCreateInfo qi{};
+  qi.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
+  qi.queueFamilyIndex = family;
+  qi.queueCount = 1;
+  qi.pQueuePriorities = &prio;
+
+  VkDeviceCreateInfo dci{};
+  dci.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
+  dci.queueCreateInfoCount = 1;
+  dci.pQueueCreateInfos = &qi;
+
+  VkDevice dev = VK_NULL_HANDLE;
+  check(vkCreateDevice(phy, &dci, nullptr, &dev), "vkCreateDevice");
+
+  VkQueue queue = VK_NULL_HANDLE;
+  vkGetDeviceQueue(dev, family, 0, &queue);
+
+  // ---- command pool ----
+  VkCommandPoolCreateInfo pci{};
+  pci.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO;
+  pci.queueFamilyIndex = family;
+  pci.flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT;
+  VkCommandPool pool = VK_NULL_HANDLE;
+  check(vkCreateCommandPool(dev, &pci, nullptr, &pool), "vkCreateCommandPool");
+
+  // ---- buffers ----
+  VkBuffer staging = createBuffer(dev, BUF_SIZE,
+                                  VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
+                                      VK_BUFFER_USAGE_TRANSFER_DST_BIT);
+  VkDeviceMemory stagingMem =
+      allocateMem(dev, phy, staging,
+                  VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+                      VK_MEMORY_PROPERTY_HOST_COHERENT_BIT);
+
+  VkBuffer deviceBuf = createBuffer(dev, BUF_SIZE,
+                                    VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
+                                        VK_BUFFER_USAGE_TRANSFER_DST_BIT);
+  VkDeviceMemory deviceMem =
+      allocateMem(dev, phy, deviceBuf, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
+
+  // ---- fill staging buffer ----
+  void *mapped = nullptr;
+  check(vkMapMemory(dev, stagingMem, 0, BUF_SIZE, 0, &mapped), "vkMapMemory");
+  std::memset(mapped, 0xAB, BUF_SIZE);
+  vkUnmapMemory(dev, stagingMem);
+
+  // ---- warm-up ----
+  benchCopy(dev, pool, queue, staging, deviceBuf, BUF_SIZE);
+
+  // ---- benchmark host->device ----
+  double tH2D = 0.0;
+  for (uint32_t i = 0; i < ITERATIONS; ++i)
+    tH2D += benchCopy(dev, pool, queue, staging, deviceBuf, BUF_SIZE);
+  tH2D /= ITERATIONS;
+
+  // ---- benchmark device->host ----
+  double tD2H = 0.0;
+  for (uint32_t i = 0; i < ITERATIONS; ++i)
+    tD2H += benchCopy(dev, pool, queue, deviceBuf, staging, BUF_SIZE);
+  tD2H /= ITERATIONS;
+
+  const double gib = static_cast<double>(BUF_SIZE) / (1 << 30);
+  std::cout << "Buffer size : " << BUF_SIZE / (1 << 20) << " MiB\n";
+  std::cout << "Iterations : " << ITERATIONS << "\n";
+  std::cout << "H→D average : " << (gib / (tH2D * 1e-3)) << " GiB/s\n";
+  std::cout << "D→H average : " << (gib / (tD2H * 1e-3)) << " GiB/s\n";
+
+  // ---- cleanup ----
+  vkDestroyBuffer(dev, staging, nullptr);
+  vkDestroyBuffer(dev, deviceBuf, nullptr);
+  vkFreeMemory(dev, stagingMem, nullptr);
+  vkFreeMemory(dev, deviceMem, nullptr);
+  vkDestroyCommandPool(dev, pool, nullptr);
+  vkDestroyDevice(dev, nullptr);
+  vkDestroyInstance(inst, nullptr);
+  return 0;
+}