From dbc9719d63f3e150beac60c9d17fb066f9816061 Mon Sep 17 00:00:00 2001 From: Wenbin Lu Date: Tue, 15 Oct 2024 22:32:55 +0000 Subject: [PATCH] Fix ze_peak explicit scaling benchmark The explicit scaling code for ze_peak violates L0 spec and has no overlap between sub-devices. This PR corrects these issues. Signed-off-by: Wenbin Lu --- perf_tests/ze_peak/src/global_bw.cpp | 27 +++++------ perf_tests/ze_peak/src/transfer_bw.cpp | 16 +++--- perf_tests/ze_peak/src/ze_peak.cpp | 67 +++++++++++++++----------- 3 files changed, 61 insertions(+), 49 deletions(-) diff --git a/perf_tests/ze_peak/src/global_bw.cpp b/perf_tests/ze_peak/src/global_bw.cpp index 3c63ff6e..dff83faf 100644 --- a/perf_tests/ze_peak/src/global_bw.cpp +++ b/perf_tests/ze_peak/src/global_bw.cpp @@ -1,6 +1,6 @@ /* * - * Copyright (C) 2019 Intel Corporation + * Copyright (C) 2019-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -37,7 +37,7 @@ void ZePeak::ze_peak_global_bw(L0Context &context) { numItems = numItems - (numItems % context.sub_device_count); if (verbose) std::cout << "splitting the total work items ::" << numItems - << "across subdevices ::" << context.sub_device_count + << " across subdevices ::" << context.sub_device_count << std::endl; numItems = set_workgroups(context, numItems / context.sub_device_count, &workgroup_info); @@ -121,7 +121,6 @@ void ZePeak::ze_peak_global_bw(L0Context &context) { throw std::runtime_error("zeCommandListAppendMemoryCopy failed: " + std::to_string(result)); } - i++; } } else { result = zeCommandListAppendMemoryCopy( @@ -264,7 +263,7 @@ void ZePeak::ze_peak_global_bw(L0Context &context) { setup_function(context, global_offset_v16, "global_bandwidth_v16_global_offset", inputBuf, outputBuf); } - std::cout << "Global memory bandwidth (GBPS)\n"; + std::cout << "Global memory bandwidth (GB/s)\n"; timed = 0; timed_lo = 0; @@ -295,7 +294,7 @@ void ZePeak::ze_peak_global_bw(L0Context &context) { timed = (timed_lo < timed_go) ? timed_lo : timed_go; gbps = calculate_gbps(timed, numItems * context.sub_device_count * sizeof(float)); - std::cout << gbps << " GFLOPS\n"; + std::cout << gbps << " GB/s\n"; } else { timed_lo = run_kernel(context, local_offset_v1, workgroup_info, type); timed_go = run_kernel(context, global_offset_v1, workgroup_info, type); @@ -303,7 +302,7 @@ void ZePeak::ze_peak_global_bw(L0Context &context) { gbps = calculate_gbps(timed, numItems * sizeof(float)); - std::cout << gbps << " GBPS\n"; + std::cout << gbps << " GB/s\n"; } timed = 0; @@ -332,7 +331,7 @@ void ZePeak::ze_peak_global_bw(L0Context &context) { timed = (timed_lo < timed_go) ? timed_lo : timed_go; gbps = calculate_gbps(timed, numItems * context.sub_device_count * sizeof(float)); - std::cout << gbps << " GFLOPS\n"; + std::cout << gbps << " GB/s\n"; } else { timed_lo = run_kernel(context, local_offset_v2, workgroup_info, type); timed_go = run_kernel(context, global_offset_v2, workgroup_info, type); @@ -340,7 +339,7 @@ void ZePeak::ze_peak_global_bw(L0Context &context) { gbps = calculate_gbps(timed, numItems * sizeof(float)); - std::cout << gbps << " GBPS\n"; + std::cout << gbps << " GB/s\n"; } timed = 0; @@ -370,7 +369,7 @@ void ZePeak::ze_peak_global_bw(L0Context &context) { timed = (timed_lo < timed_go) ? timed_lo : timed_go; gbps = calculate_gbps(timed, numItems * context.sub_device_count * sizeof(float)); - std::cout << gbps << " GFLOPS\n"; + std::cout << gbps << " GB/s\n"; } else { timed_lo = run_kernel(context, local_offset_v4, workgroup_info, type); timed_go = run_kernel(context, global_offset_v4, workgroup_info, type); @@ -378,7 +377,7 @@ void ZePeak::ze_peak_global_bw(L0Context &context) { gbps = calculate_gbps(timed, numItems * sizeof(float)); - std::cout << gbps << " GBPS\n"; + std::cout << gbps << " GB/s\n"; } timed = 0; @@ -407,7 +406,7 @@ void ZePeak::ze_peak_global_bw(L0Context &context) { timed = (timed_lo < timed_go) ? timed_lo : timed_go; gbps = calculate_gbps(timed, numItems * context.sub_device_count * sizeof(float)); - std::cout << gbps << " GFLOPS\n"; + std::cout << gbps << " GB/s\n"; } else { timed_lo = run_kernel(context, local_offset_v8, workgroup_info, type); timed_go = run_kernel(context, global_offset_v8, workgroup_info, type); @@ -415,7 +414,7 @@ void ZePeak::ze_peak_global_bw(L0Context &context) { gbps = calculate_gbps(timed, numItems * sizeof(float)); - std::cout << gbps << " GBPS\n"; + std::cout << gbps << " GB/s\n"; } timed = 0; @@ -443,7 +442,7 @@ void ZePeak::ze_peak_global_bw(L0Context &context) { timed = (timed_lo < timed_go) ? timed_lo : timed_go; gbps = calculate_gbps(timed, numItems * context.sub_device_count * sizeof(float)); - std::cout << gbps << " GFLOPS\n"; + std::cout << gbps << " GB/s\n"; } else { timed_lo = run_kernel(context, local_offset_v16, workgroup_info, type); timed_go = run_kernel(context, global_offset_v16, workgroup_info, type); @@ -451,7 +450,7 @@ void ZePeak::ze_peak_global_bw(L0Context &context) { gbps = calculate_gbps(timed, numItems * sizeof(float)); - std::cout << gbps << " GBPS\n"; + std::cout << gbps << " GB/s\n"; } if (context.sub_device_count) { diff --git a/perf_tests/ze_peak/src/transfer_bw.cpp b/perf_tests/ze_peak/src/transfer_bw.cpp index 2fc985eb..103c03db 100644 --- a/perf_tests/ze_peak/src/transfer_bw.cpp +++ b/perf_tests/ze_peak/src/transfer_bw.cpp @@ -1,6 +1,6 @@ /* * - * Copyright (C) 2019 Intel Corporation + * Copyright (C) 2019-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -198,7 +198,7 @@ void ZePeak::_transfer_bw_shared_memory(L0Context &context, local_memory_size); } std::cout << "GPU Copy Host to Shared Memory : "; - std::cout << gflops << " GBPS\n"; + std::cout << gflops << " GB/s\n"; gflops = 0; if (context.sub_device_count) { @@ -215,7 +215,7 @@ void ZePeak::_transfer_bw_shared_memory(L0Context &context, local_memory_size); } std::cout << "GPU Copy Shared Memory to Host : "; - std::cout << gflops << " GBPS\n"; + std::cout << gflops << " GB/s\n"; gflops = 0; if (context.sub_device_count) { @@ -232,7 +232,7 @@ void ZePeak::_transfer_bw_shared_memory(L0Context &context, local_memory_size, true); } std::cout << "System Memory Copy to Shared Memory : "; - std::cout << gflops << " GBPS\n"; + std::cout << gflops << " GB/s\n"; gflops = 0; if (context.sub_device_count) { @@ -249,7 +249,7 @@ void ZePeak::_transfer_bw_shared_memory(L0Context &context, local_memory_size, false); } std::cout << "System Memory Copy from Shared Memory : "; - std::cout << gflops << " GBPS\n"; + std::cout << gflops << " GB/s\n"; current_sub_device_id = 0; @@ -328,7 +328,7 @@ void ZePeak::ze_peak_transfer_bw(L0Context &context) { if (verbose) std::cout << "device buffer allocated\n"; - std::cout << "Transfer Bandwidth (GBPS)\n"; + std::cout << "Transfer Bandwidth (GB/s)\n"; gflops = 0; if (context.sub_device_count) { @@ -345,7 +345,7 @@ void ZePeak::ze_peak_transfer_bw(L0Context &context) { local_memory_size); } std::cout << "enqueueWriteBuffer : "; - std::cout << gflops << " GBPS\n"; + std::cout << gflops << " GB/s\n"; gflops = 0; if (context.sub_device_count) { @@ -362,7 +362,7 @@ void ZePeak::ze_peak_transfer_bw(L0Context &context) { local_memory_size); } std::cout << "enqueueReadBuffer : "; - std::cout << gflops << " GBPS\n"; + std::cout << gflops << " GB/s\n"; current_sub_device_id = 0; diff --git a/perf_tests/ze_peak/src/ze_peak.cpp b/perf_tests/ze_peak/src/ze_peak.cpp index 9645ccad..136a2e87 100644 --- a/perf_tests/ze_peak/src/ze_peak.cpp +++ b/perf_tests/ze_peak/src/ze_peak.cpp @@ -1,6 +1,6 @@ /* * - * Copyright (C) 2019 Intel Corporation + * Copyright (C) 2019-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -40,8 +40,7 @@ std::vector L0Context::load_binary_file(const std::string &file_path) { binary_file.resize(length); stream.read(reinterpret_cast(binary_file.data()), length); if (verbose) - std::cout << "Binary file loaded" - << "\n"; + std::cout << "Binary file loaded\n"; stream.close(); return binary_file; @@ -144,6 +143,7 @@ void L0Context::print_ze_device_properties( << "\n" << " * UUID : " << id << "\n" << " * coreClockRate : " << std::dec << props.coreClockRate << "\n" + << " * maxMemAllocSize : " << props.maxMemAllocSize << " bytes\n" << std::endl; } @@ -846,8 +846,11 @@ long double ZePeak::run_kernel(L0Context context, ze_kernel_handle_t &function, if (type == TimingMeasurement::BANDWIDTH) { if (context.sub_device_count) { - SUCCESS_OR_TERMINATE( - zeCommandListReset(context.cmd_list[current_sub_device_id])); + if (current_sub_device_id == 0) { + for (uint32_t i = 0; i < context.sub_device_count; i++) { + SUCCESS_OR_TERMINATE(zeCommandListReset(context.cmd_list[i])); + } + } } else { SUCCESS_OR_TERMINATE(zeCommandListReset(context.command_list)); } @@ -864,6 +867,12 @@ long double ZePeak::run_kernel(L0Context context, ze_kernel_handle_t &function, throw std::runtime_error("zeCommandListAppendLaunchKernel failed: " + std::to_string(result)); } + result = zeCommandListAppendBarrier( + context.cmd_list[current_sub_device_id], nullptr, 0, nullptr); + if (result) { + throw std::runtime_error("zeCommandListAppendBarrier failed: " + + std::to_string(result)); + } } else { result = zeCommandListAppendLaunchKernel( context.command_list, function, @@ -894,35 +903,39 @@ long double ZePeak::run_kernel(L0Context context, ze_kernel_handle_t &function, for (uint32_t i = 0; i < warmup_iterations; i++) { run_command_queue(context); + synchronize_command_queue(context); + } + if (verbose) + std::cout << "Warmup finished\n"; - if (context.sub_device_count) { - if (context.sub_device_count == current_sub_device_id + 1) { - current_sub_device_id = 0; - while (current_sub_device_id < context.sub_device_count) { - synchronize_command_queue(context); - current_sub_device_id++; - } - current_sub_device_id = context.sub_device_count - 1; - } - } else { - synchronize_command_queue(context); + if (context.sub_device_count) { + SUCCESS_OR_TERMINATE( + zeCommandListReset(context.cmd_list[current_sub_device_id])); + for (uint32_t i = 0; i < iters; i++) { + SUCCESS_OR_TERMINATE(zeCommandListAppendLaunchKernel( + context.cmd_list[current_sub_device_id], function, + &workgroup_info.thread_group_dimensions, nullptr, 0, nullptr)); + SUCCESS_OR_TERMINATE(zeCommandListAppendBarrier( + context.cmd_list[current_sub_device_id], nullptr, 0, nullptr)); } + SUCCESS_OR_TERMINATE( + zeCommandListClose(context.cmd_list[current_sub_device_id])); } timer.start(); - for (uint32_t i = 0; i < iters; i++) { + if (context.sub_device_count) { run_command_queue(context); - - if (context.sub_device_count) { - if (context.sub_device_count == current_sub_device_id + 1) { - current_sub_device_id = 0; - while (current_sub_device_id < context.sub_device_count) { - synchronize_command_queue(context); - current_sub_device_id++; - } - current_sub_device_id = context.sub_device_count - 1; + if (context.sub_device_count == current_sub_device_id + 1) { + current_sub_device_id = 0; + while (current_sub_device_id < context.sub_device_count) { + synchronize_command_queue(context); + current_sub_device_id++; } - } else { + current_sub_device_id = context.sub_device_count - 1; + } + } else { + for (uint32_t i = 0; i < iters; i++) { + run_command_queue(context); synchronize_command_queue(context); } }