From eb299020ae926e6d49a2211bbd5b7f7734616795 Mon Sep 17 00:00:00 2001 From: Kasper Sauramo Date: Sat, 8 Nov 2025 23:23:31 +0200 Subject: [PATCH] refactor for adding another type of test --- src/main.cpp | 255 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 160 insertions(+), 95 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 4e65ec3..0ab2c1f 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -7,17 +7,30 @@ #include #include -/// TODO: Run benchmark on all GPU's /// TODO: Pinned memory /// TODO: Plot by buffer size -static constexpr uint64_t BUF_SIZE = 256ULL << 20; // 256 MiB -static constexpr uint32_t ITERATIONS = 32; +enum class TransferType { + Memory, + Pinned, +}; + +struct BenchmarkConfig { + uint32_t buffer_size = 256ULL << 20; + uint32_t iterations = 32; + TransferType type = TransferType::Memory; +}; + +struct BenchmarkResult { + double host_to_device_time = 0.0f; + double device_to_host_time = 0.0f; +}; // ---------- helpers ---------- VKAPI_ATTR VkBool32 VKAPI_CALL debugCallback( VkDebugUtilsMessageSeverityFlagBitsEXT, VkDebugUtilsMessageTypeFlagsEXT, const VkDebugUtilsMessengerCallbackDataEXT *, void *) { + return VK_FALSE; } @@ -111,7 +124,121 @@ double benchCopy(VkDevice dev, VkCommandPool pool, VkQueue queue, VkBuffer src, return std::chrono::duration(t1 - t0).count(); } -int main() { +struct Gpu { + VkPhysicalDevice physicalDevice = nullptr; + VkPhysicalDeviceProperties properties = {}; + VkDevice device = nullptr; + VkQueue queue = nullptr; + VkCommandPool pool = nullptr; + // for benchmarks + VkBuffer stagingBuf = nullptr; + VkBuffer deviceBuf = nullptr; + VkDeviceMemory stagingMem = nullptr; + VkDeviceMemory deviceMem = nullptr; +}; + +Gpu initGpu(VkPhysicalDevice phy, uint32_t buffer_size) { + Gpu gpu = {}; + gpu.physicalDevice = phy; + + // properties with lots of nice info + vkGetPhysicalDeviceProperties(phy, &gpu.properties); + + // logical device + float prio = 1.0f; + VkDeviceQueueCreateInfo qi{}; + qi.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO; + qi.queueFamilyIndex = 0; // assume family 0 supports transfer + qi.queueCount = 1; + qi.pQueuePriorities = &prio; + + VkDeviceCreateInfo dci{}; + dci.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO; + dci.queueCreateInfoCount = 1; + dci.pQueueCreateInfos = &qi; + + vkCreateDevice(phy, &dci, nullptr, &gpu.device); + vkGetDeviceQueue(gpu.device, 0, 0, &gpu.queue); + + // command pool + VkCommandPoolCreateInfo pci{}; + pci.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; + pci.queueFamilyIndex = 0; + pci.flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT; + + vkCreateCommandPool(gpu.device, &pci, nullptr, &gpu.pool); + + // buffers + gpu.stagingBuf = createBuffer(gpu.device, buffer_size, + VK_BUFFER_USAGE_TRANSFER_SRC_BIT | + VK_BUFFER_USAGE_TRANSFER_DST_BIT); + gpu.stagingMem = allocateMem(gpu.device, phy, gpu.stagingBuf, + VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | + VK_MEMORY_PROPERTY_HOST_COHERENT_BIT); + + gpu.deviceBuf = createBuffer(gpu.device, buffer_size, + VK_BUFFER_USAGE_TRANSFER_SRC_BIT | + VK_BUFFER_USAGE_TRANSFER_DST_BIT); + gpu.deviceMem = allocateMem(gpu.device, phy, gpu.deviceBuf, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); + + return gpu; +} + +void cleanupGpu(Gpu &gpu) { + + vkDestroyBuffer(gpu.device, gpu.stagingBuf, nullptr); + vkDestroyBuffer(gpu.device, gpu.deviceBuf, nullptr); + vkFreeMemory(gpu.device, gpu.stagingMem, nullptr); + vkFreeMemory(gpu.device, gpu.deviceMem, nullptr); + vkDestroyCommandPool(gpu.device, gpu.pool, nullptr); + vkDestroyDevice(gpu.device, nullptr); +} + +void reportBenchmark(const BenchmarkResult &result, const Gpu &gpu, + BenchmarkConfig config) { + + const double gib = static_cast(config.buffer_size) / (1 << 30); + std::cout << "Device: " << gpu.properties.deviceName << std::endl; + std::cout << "Buffer size : " << config.buffer_size / (1 << 20) << " MiB\n"; + std::cout << "Iterations : " << config.iterations << "\n"; + std::cout << "H→D average : " << (gib / (result.host_to_device_time * 1e-3)) + << " GiB/s\n"; + std::cout << "D→H average : " << (gib / (result.device_to_host_time * 1e-3)) + << " GiB/s\n"; + std::cout << std::endl; +} + +BenchmarkResult runBenchmark(Gpu &gpu, BenchmarkConfig config) { + + // ---- fill staging buffer ---- + void *mapped; + vkMapMemory(gpu.device, gpu.stagingMem, 0, config.buffer_size, 0, &mapped); + std::memset(mapped, 0xAB, config.buffer_size); + vkUnmapMemory(gpu.device, gpu.stagingMem); + + // ---- warm-up ---- + benchCopy(gpu.device, gpu.pool, gpu.queue, gpu.stagingBuf, gpu.deviceBuf, + config.buffer_size); + + // ---- benchmark host->device ---- + double tH2D = 0.0; + for (uint32_t i = 0; i < config.iterations; ++i) + tH2D += benchCopy(gpu.device, gpu.pool, gpu.queue, gpu.stagingBuf, + gpu.deviceBuf, config.buffer_size); + tH2D /= config.iterations; + + // ---- benchmark device->host ---- + double tD2H = 0.0; + for (uint32_t i = 0; i < config.iterations; ++i) + tD2H += benchCopy(gpu.device, gpu.pool, gpu.queue, gpu.deviceBuf, + gpu.stagingBuf, config.buffer_size); + tD2H /= config.iterations; + + return {tH2D, tD2H}; +} + +auto main() -> int { VkApplicationInfo app{}; app.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO; @@ -125,105 +252,43 @@ int main() { VkInstance inst; vkCreateInstance(&ici, nullptr, &inst); - std::cout << "Starting benchmark..." << std::endl; + std::cout << "Starting benchmark." << std::endl; - // ---- physical device ---- + // physical devices uint32_t n = 0; vkEnumeratePhysicalDevices(inst, &n, nullptr); - std::vector gpus(n); - vkEnumeratePhysicalDevices(inst, &n, gpus.data()); - VkPhysicalDevice phy = gpus[0]; - std::cout << "Found " << gpus.size() << " gpus." << std::endl; + std::cout << "Found " << n << " gpus." << std::endl; + std::vector vulkanPhysicalDevices(n); - { - std::vector gpu_properties(n); - // Print info - for (uint32_t i = 0; i < gpus.size(); i++) { - VkPhysicalDeviceProperties *prop = &gpu_properties[i]; - vkGetPhysicalDeviceProperties(gpus[i], prop); + vkEnumeratePhysicalDevices(inst, &n, vulkanPhysicalDevices.data()); - std::cout << "GPU: [" << i << "] " << prop->deviceName << " (" - << getGpuTypeName(prop->deviceType) << ")" << std::endl; - } + // for each physical device, add initialize and add details + std::vector gpus; + + BenchmarkConfig config = {}; + + uint32_t counter = 0; + for (const auto phy : vulkanPhysicalDevices) { + gpus.emplace_back(initGpu(phy, config.buffer_size)); + + const Gpu &gpu = gpus.back(); + + std::cout << "Device: [" << counter << "] " << gpu.properties.deviceName + << " (" << getGpuTypeName(gpu.properties.deviceType) << ")" + << std::endl; + counter++; + } + std::cout << "-------------------" << std::endl; + + for (auto &gpu : gpus) { + BenchmarkResult res = runBenchmark(gpu, config); + reportBenchmark(res, gpu, config); } - // ---- logical device ---- - float prio = 1.0f; - VkDeviceQueueCreateInfo qi{}; - qi.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO; - qi.queueFamilyIndex = 0; // assume family 0 supports transfer - qi.queueCount = 1; - qi.pQueuePriorities = &prio; - - VkDeviceCreateInfo dci{}; - dci.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO; - dci.queueCreateInfoCount = 1; - dci.pQueueCreateInfos = &qi; - - VkDevice dev; - vkCreateDevice(phy, &dci, nullptr, &dev); - - VkQueue queue; - vkGetDeviceQueue(dev, 0, 0, &queue); - - // ---- command pool ---- - VkCommandPoolCreateInfo pci{}; - pci.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO; - pci.queueFamilyIndex = 0; - pci.flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT; - VkCommandPool pool; - vkCreateCommandPool(dev, &pci, nullptr, &pool); - - // ---- buffers ---- - VkBuffer staging = createBuffer(dev, BUF_SIZE, - VK_BUFFER_USAGE_TRANSFER_SRC_BIT | - VK_BUFFER_USAGE_TRANSFER_DST_BIT); - VkDeviceMemory stagingMem = - allocateMem(dev, phy, staging, - VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | - VK_MEMORY_PROPERTY_HOST_COHERENT_BIT); - - VkBuffer deviceBuf = createBuffer(dev, BUF_SIZE, - VK_BUFFER_USAGE_TRANSFER_SRC_BIT | - VK_BUFFER_USAGE_TRANSFER_DST_BIT); - VkDeviceMemory deviceMem = - allocateMem(dev, phy, deviceBuf, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT); - - // ---- fill staging buffer ---- - void *mapped; - vkMapMemory(dev, stagingMem, 0, BUF_SIZE, 0, &mapped); - std::memset(mapped, 0xAB, BUF_SIZE); - vkUnmapMemory(dev, stagingMem); - - // ---- warm-up ---- - benchCopy(dev, pool, queue, staging, deviceBuf, BUF_SIZE); - - // ---- benchmark host->device ---- - double tH2D = 0.0; - for (uint32_t i = 0; i < ITERATIONS; ++i) - tH2D += benchCopy(dev, pool, queue, staging, deviceBuf, BUF_SIZE); - tH2D /= ITERATIONS; - - // ---- benchmark device->host ---- - double tD2H = 0.0; - for (uint32_t i = 0; i < ITERATIONS; ++i) - tD2H += benchCopy(dev, pool, queue, deviceBuf, staging, BUF_SIZE); - tD2H /= ITERATIONS; - - const double gib = static_cast(BUF_SIZE) / (1 << 30); - std::cout << "Buffer size : " << BUF_SIZE / (1 << 20) << " MiB\n"; - std::cout << "Iterations : " << ITERATIONS << "\n"; - std::cout << "H→D average : " << (gib / (tH2D * 1e-3)) << " GiB/s\n"; - std::cout << "D→H average : " << (gib / (tD2H * 1e-3)) << " GiB/s\n"; - - // ---- cleanup ---- - vkDestroyBuffer(dev, staging, nullptr); - vkDestroyBuffer(dev, deviceBuf, nullptr); - vkFreeMemory(dev, stagingMem, nullptr); - vkFreeMemory(dev, deviceMem, nullptr); - vkDestroyCommandPool(dev, pool, nullptr); - vkDestroyDevice(dev, nullptr); + for (auto gpu : gpus) { + cleanupGpu(gpu); + } vkDestroyInstance(inst, nullptr); return 0; }