refactor for adding another type of test

This commit is contained in:
2025-11-08 23:23:31 +02:00
parent 776c09908e
commit eb299020ae

View File

@@ -7,17 +7,30 @@
#include <stdexcept>
#include <vector>
#include <vulkan/vulkan_core.h>
/// TODO: Run benchmark on all GPUs
/// TODO: Pinned memory
/// TODO: Plot by buffer size
/// Size in bytes of each benchmark buffer: 256 MiB.
static constexpr uint64_t BUF_SIZE = 256ULL * 1024ULL * 1024ULL;
/// Number of timed copies that are averaged per transfer direction.
static constexpr uint32_t ITERATIONS{32};
/// Selects which kind of transfer test a benchmark run is configured for.
/// NOTE(review): the exact meaning of each variant is not visible in this
/// part of the file — confirm against the code that reads it.
enum class TransferType {
  Memory = 0, ///< Default variant.
  Pinned = 1, ///< Presumably the "pinned memory" test from the TODO list.
};
/// Parameters describing one benchmark run.
struct BenchmarkConfig {
  /// Size in bytes of each benchmark buffer (default 256 MiB).
  /// Kept 64-bit so that sizes of 4 GiB and above are not silently truncated.
  uint64_t buffer_size = 256ULL << 20;
  /// Number of timed copies averaged per transfer direction.
  uint32_t iterations = 32;
  /// Transfer variant to run.
  /// NOTE(review): no code visible here reads this field yet — confirm.
  TransferType type = TransferType::Memory;
};
/// Averaged timings produced by one benchmark run.
/// Both values are the mean duration of a single buffer copy, in milliseconds.
struct BenchmarkResult {
  double host_to_device_time{0.0}; ///< staging buffer -> device buffer
  double device_to_host_time{0.0}; ///< device buffer -> staging buffer
};
// ---------- helpers ----------
/// Debug-utils messenger callback that ignores every argument it is given.
/// @return Always VK_FALSE.
VKAPI_ATTR VkBool32 VKAPI_CALL
debugCallback(VkDebugUtilsMessageSeverityFlagBitsEXT /*severity*/,
              VkDebugUtilsMessageTypeFlagsEXT /*types*/,
              const VkDebugUtilsMessengerCallbackDataEXT * /*callbackData*/,
              void * /*userData*/) {
  return VK_FALSE;
}
@@ -111,7 +124,121 @@ double benchCopy(VkDevice dev, VkCommandPool pool, VkQueue queue, VkBuffer src,
return std::chrono::duration<double, std::milli>(t1 - t0).count();
}
int main() {
/// Everything the benchmark needs for one physical device.
///
/// All handles start out as VK_NULL_HANDLE. VK_NULL_HANDLE is used instead of
/// nullptr because non-dispatchable handles (buffers, memory, command pools)
/// are plain 64-bit integers on some platforms, where nullptr does not compile.
struct Gpu {
  VkPhysicalDevice physicalDevice = VK_NULL_HANDLE;
  /// Filled by vkGetPhysicalDeviceProperties (device name, type, ...).
  VkPhysicalDeviceProperties properties = {};
  VkDevice device = VK_NULL_HANDLE; ///< Logical device created for physicalDevice.
  VkQueue queue = VK_NULL_HANDLE;   ///< Queue the copy commands are submitted to.
  VkCommandPool pool = VK_NULL_HANDLE;

  // for benchmarks
  VkBuffer stagingBuf = VK_NULL_HANDLE; ///< Host-visible, host-coherent buffer.
  VkBuffer deviceBuf = VK_NULL_HANDLE;  ///< Device-local buffer.
  VkDeviceMemory stagingMem = VK_NULL_HANDLE; ///< Memory backing stagingBuf.
  VkDeviceMemory deviceMem = VK_NULL_HANDLE;  ///< Memory backing deviceBuf.
};
Gpu initGpu(VkPhysicalDevice phy, uint32_t buffer_size) {
Gpu gpu = {};
gpu.physicalDevice = phy;
// properties with lots of nice info
vkGetPhysicalDeviceProperties(phy, &gpu.properties);
// logical device
float prio = 1.0f;
VkDeviceQueueCreateInfo qi{};
qi.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
qi.queueFamilyIndex = 0; // assume family 0 supports transfer
qi.queueCount = 1;
qi.pQueuePriorities = &prio;
VkDeviceCreateInfo dci{};
dci.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
dci.queueCreateInfoCount = 1;
dci.pQueueCreateInfos = &qi;
vkCreateDevice(phy, &dci, nullptr, &gpu.device);
vkGetDeviceQueue(gpu.device, 0, 0, &gpu.queue);
// command pool
VkCommandPoolCreateInfo pci{};
pci.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO;
pci.queueFamilyIndex = 0;
pci.flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT;
vkCreateCommandPool(gpu.device, &pci, nullptr, &gpu.pool);
// buffers
gpu.stagingBuf = createBuffer(gpu.device, buffer_size,
VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
VK_BUFFER_USAGE_TRANSFER_DST_BIT);
gpu.stagingMem = allocateMem(gpu.device, phy, gpu.stagingBuf,
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
VK_MEMORY_PROPERTY_HOST_COHERENT_BIT);
gpu.deviceBuf = createBuffer(gpu.device, buffer_size,
VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
VK_BUFFER_USAGE_TRANSFER_DST_BIT);
gpu.deviceMem = allocateMem(gpu.device, phy, gpu.deviceBuf,
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
return gpu;
}
/// Destroys every Vulkan object owned by `gpu` and resets its handles.
///
/// Safe to call on a Gpu whose device was never created or that was already
/// cleaned up: in that case nothing is done.
void cleanupGpu(Gpu &gpu) {
  // Child objects cannot be destroyed through a null device.
  if (gpu.device == VK_NULL_HANDLE) {
    return;
  }

  // Objects must not be destroyed while submitted work still references them.
  vkDeviceWaitIdle(gpu.device);

  vkDestroyBuffer(gpu.device, gpu.stagingBuf, nullptr);
  vkDestroyBuffer(gpu.device, gpu.deviceBuf, nullptr);
  vkFreeMemory(gpu.device, gpu.stagingMem, nullptr);
  vkFreeMemory(gpu.device, gpu.deviceMem, nullptr);
  vkDestroyCommandPool(gpu.device, gpu.pool, nullptr);
  vkDestroyDevice(gpu.device, nullptr);

  // Clear the stale handles so a second call becomes a no-op.
  gpu.stagingBuf = VK_NULL_HANDLE;
  gpu.deviceBuf = VK_NULL_HANDLE;
  gpu.stagingMem = VK_NULL_HANDLE;
  gpu.deviceMem = VK_NULL_HANDLE;
  gpu.pool = VK_NULL_HANDLE;
  gpu.queue = VK_NULL_HANDLE;
  gpu.device = VK_NULL_HANDLE;
}
/// Prints the device name, the run parameters and the measured bandwidth of
/// both transfer directions to std::cout.
///
/// @param result  Average copy times in milliseconds per direction.
/// @param gpu     Device the result belongs to; only its properties are read.
/// @param config  Configuration the benchmark was run with.
void reportBenchmark(const BenchmarkResult &result, const Gpu &gpu,
                     BenchmarkConfig config) {
  const double sizeGiB = static_cast<double>(config.buffer_size) / (1 << 30);
  // Timings are in milliseconds; scale to seconds to obtain GiB per second.
  const auto throughput = [sizeGiB](double millis) {
    return sizeGiB / (millis * 1e-3);
  };

  std::cout << "Device: " << gpu.properties.deviceName << std::endl;
  std::cout << "Buffer size : " << config.buffer_size / (1 << 20) << " MiB\n"
            << "Iterations : " << config.iterations << "\n"
            << "H→D average : " << throughput(result.host_to_device_time)
            << " GiB/s\n"
            << "D→H average : " << throughput(result.device_to_host_time)
            << " GiB/s\n"
            << std::endl;
}
/// Measures the average copy time between the staging and device buffers of
/// `gpu` in both directions.
///
/// The staging buffer is filled with a byte pattern, one untimed warm-up copy
/// is issued, then `config.iterations` copies are timed per direction.
///
/// @param gpu     Initialized device whose buffers hold at least
///                `config.buffer_size` bytes.
/// @param config  Buffer size and iteration count; `iterations` must be
///                non-zero, otherwise the averages are not a number.
/// @return Mean milliseconds per copy for host->device and device->host.
/// @throws std::runtime_error if the staging memory cannot be mapped.
BenchmarkResult runBenchmark(Gpu &gpu, BenchmarkConfig config) {
  // ---- fill staging buffer ----
  void *mapped = nullptr;
  const VkResult mapResult = vkMapMemory(gpu.device, gpu.stagingMem, 0,
                                         config.buffer_size, 0, &mapped);
  // Writing through an unmapped pointer would be undefined behaviour.
  if (mapResult != VK_SUCCESS || mapped == nullptr) {
    throw std::runtime_error("runBenchmark: vkMapMemory failed");
  }
  std::memset(mapped, 0xAB, config.buffer_size);
  vkUnmapMemory(gpu.device, gpu.stagingMem);

  // Times `config.iterations` copies from src to dst and returns the mean.
  const auto averageCopyMs = [&gpu, &config](VkBuffer src, VkBuffer dst) {
    double total = 0.0;
    for (uint32_t i = 0; i < config.iterations; ++i) {
      total += benchCopy(gpu.device, gpu.pool, gpu.queue, src, dst,
                         config.buffer_size);
    }
    return total / config.iterations;
  };

  // ---- warm-up (result intentionally discarded) ----
  benchCopy(gpu.device, gpu.pool, gpu.queue, gpu.stagingBuf, gpu.deviceBuf,
            config.buffer_size);

  // ---- benchmark host->device, then device->host ----
  const double tH2D = averageCopyMs(gpu.stagingBuf, gpu.deviceBuf);
  const double tD2H = averageCopyMs(gpu.deviceBuf, gpu.stagingBuf);

  return {tH2D, tD2H};
}
auto main() -> int {
VkApplicationInfo app{};
app.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO;
@@ -125,105 +252,43 @@ int main() {
VkInstance inst;
vkCreateInstance(&ici, nullptr, &inst);
std::cout << "Starting benchmark..." << std::endl;
std::cout << "Starting benchmark." << std::endl;
// ---- physical device ----
// physical devices
uint32_t n = 0;
vkEnumeratePhysicalDevices(inst, &n, nullptr);
std::vector<VkPhysicalDevice> gpus(n);
vkEnumeratePhysicalDevices(inst, &n, gpus.data());
VkPhysicalDevice phy = gpus[0];
std::cout << "Found " << gpus.size() << " gpus." << std::endl;
std::cout << "Found " << n << " gpus." << std::endl;
std::vector<VkPhysicalDevice> vulkanPhysicalDevices(n);
{
std::vector<VkPhysicalDeviceProperties> gpu_properties(n);
// Print info
for (uint32_t i = 0; i < gpus.size(); i++) {
VkPhysicalDeviceProperties *prop = &gpu_properties[i];
vkGetPhysicalDeviceProperties(gpus[i], prop);
vkEnumeratePhysicalDevices(inst, &n, vulkanPhysicalDevices.data());
std::cout << "GPU: [" << i << "] " << prop->deviceName << " ("
<< getGpuTypeName(prop->deviceType) << ")" << std::endl;
}
// for each physical device, add initialize and add details
std::vector<Gpu> gpus;
BenchmarkConfig config = {};
uint32_t counter = 0;
for (const auto phy : vulkanPhysicalDevices) {
gpus.emplace_back(initGpu(phy, config.buffer_size));
const Gpu &gpu = gpus.back();
std::cout << "Device: [" << counter << "] " << gpu.properties.deviceName
<< " (" << getGpuTypeName(gpu.properties.deviceType) << ")"
<< std::endl;
counter++;
}
std::cout << "-------------------" << std::endl;
for (auto &gpu : gpus) {
BenchmarkResult res = runBenchmark(gpu, config);
reportBenchmark(res, gpu, config);
}
// ---- logical device ----
float prio = 1.0f;
VkDeviceQueueCreateInfo qi{};
qi.sType = VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO;
qi.queueFamilyIndex = 0; // assume family 0 supports transfer
qi.queueCount = 1;
qi.pQueuePriorities = &prio;
VkDeviceCreateInfo dci{};
dci.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
dci.queueCreateInfoCount = 1;
dci.pQueueCreateInfos = &qi;
VkDevice dev;
vkCreateDevice(phy, &dci, nullptr, &dev);
VkQueue queue;
vkGetDeviceQueue(dev, 0, 0, &queue);
// ---- command pool ----
VkCommandPoolCreateInfo pci{};
pci.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO;
pci.queueFamilyIndex = 0;
pci.flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT;
VkCommandPool pool;
vkCreateCommandPool(dev, &pci, nullptr, &pool);
// ---- buffers ----
VkBuffer staging = createBuffer(dev, BUF_SIZE,
VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
VK_BUFFER_USAGE_TRANSFER_DST_BIT);
VkDeviceMemory stagingMem =
allocateMem(dev, phy, staging,
VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
VK_MEMORY_PROPERTY_HOST_COHERENT_BIT);
VkBuffer deviceBuf = createBuffer(dev, BUF_SIZE,
VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
VK_BUFFER_USAGE_TRANSFER_DST_BIT);
VkDeviceMemory deviceMem =
allocateMem(dev, phy, deviceBuf, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
// ---- fill staging buffer ----
void *mapped;
vkMapMemory(dev, stagingMem, 0, BUF_SIZE, 0, &mapped);
std::memset(mapped, 0xAB, BUF_SIZE);
vkUnmapMemory(dev, stagingMem);
// ---- warm-up ----
benchCopy(dev, pool, queue, staging, deviceBuf, BUF_SIZE);
// ---- benchmark host->device ----
double tH2D = 0.0;
for (uint32_t i = 0; i < ITERATIONS; ++i)
tH2D += benchCopy(dev, pool, queue, staging, deviceBuf, BUF_SIZE);
tH2D /= ITERATIONS;
// ---- benchmark device->host ----
double tD2H = 0.0;
for (uint32_t i = 0; i < ITERATIONS; ++i)
tD2H += benchCopy(dev, pool, queue, deviceBuf, staging, BUF_SIZE);
tD2H /= ITERATIONS;
const double gib = static_cast<double>(BUF_SIZE) / (1 << 30);
std::cout << "Buffer size : " << BUF_SIZE / (1 << 20) << " MiB\n";
std::cout << "Iterations : " << ITERATIONS << "\n";
std::cout << "H→D average : " << (gib / (tH2D * 1e-3)) << " GiB/s\n";
std::cout << "D→H average : " << (gib / (tD2H * 1e-3)) << " GiB/s\n";
// ---- cleanup ----
vkDestroyBuffer(dev, staging, nullptr);
vkDestroyBuffer(dev, deviceBuf, nullptr);
vkFreeMemory(dev, stagingMem, nullptr);
vkFreeMemory(dev, deviceMem, nullptr);
vkDestroyCommandPool(dev, pool, nullptr);
vkDestroyDevice(dev, nullptr);
for (auto gpu : gpus) {
cleanupGpu(gpu);
}
vkDestroyInstance(inst, nullptr);
return 0;
}