From dbc9719d63f3e150beac60c9d17fb066f9816061 Mon Sep 17 00:00:00 2001
From: Wenbin Lu <wenbin.lu@intel.com>
Date: Tue, 15 Oct 2024 22:32:55 +0000
Subject: [PATCH] Fix ze_peak explicit scaling benchmark

The explicit scaling code in ze_peak violates the L0 spec and does not
overlap work across sub-devices. This PR corrects both issues.

Signed-off-by: Wenbin Lu <wenbin.lu@intel.com>
---
 perf_tests/ze_peak/src/global_bw.cpp   | 27 +++++------
 perf_tests/ze_peak/src/transfer_bw.cpp | 16 +++---
 perf_tests/ze_peak/src/ze_peak.cpp     | 67 +++++++++++++++-----------
 3 files changed, 61 insertions(+), 49 deletions(-)

diff --git a/perf_tests/ze_peak/src/global_bw.cpp b/perf_tests/ze_peak/src/global_bw.cpp
index 3c63ff6e..dff83faf 100644
--- a/perf_tests/ze_peak/src/global_bw.cpp
+++ b/perf_tests/ze_peak/src/global_bw.cpp
@@ -1,6 +1,6 @@
 /*
  *
- * Copyright (C) 2019 Intel Corporation
+ * Copyright (C) 2019-2024 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -37,7 +37,7 @@ void ZePeak::ze_peak_global_bw(L0Context &context) {
     numItems = numItems - (numItems % context.sub_device_count);
     if (verbose)
       std::cout << "splitting the total work items ::" << numItems
-                << "across subdevices ::" << context.sub_device_count
+                << " across subdevices ::" << context.sub_device_count
                 << std::endl;
     numItems = set_workgroups(context, numItems / context.sub_device_count,
                               &workgroup_info);
@@ -121,7 +121,6 @@ void ZePeak::ze_peak_global_bw(L0Context &context) {
         throw std::runtime_error("zeCommandListAppendMemoryCopy failed: " +
                                  std::to_string(result));
       }
-      i++;
     }
   } else {
     result = zeCommandListAppendMemoryCopy(
@@ -264,7 +263,7 @@ void ZePeak::ze_peak_global_bw(L0Context &context) {
     setup_function(context, global_offset_v16,
                    "global_bandwidth_v16_global_offset", inputBuf, outputBuf);
   }
-  std::cout << "Global memory bandwidth (GBPS)\n";
+  std::cout << "Global memory bandwidth (GB/s)\n";
 
   timed = 0;
   timed_lo = 0;
@@ -295,7 +294,7 @@ void ZePeak::ze_peak_global_bw(L0Context &context) {
     timed = (timed_lo < timed_go) ? timed_lo : timed_go;
     gbps = calculate_gbps(timed,
                           numItems * context.sub_device_count * sizeof(float));
-    std::cout << gbps << " GFLOPS\n";
+    std::cout << gbps << " GB/s\n";
   } else {
     timed_lo = run_kernel(context, local_offset_v1, workgroup_info, type);
     timed_go = run_kernel(context, global_offset_v1, workgroup_info, type);
@@ -303,7 +302,7 @@ void ZePeak::ze_peak_global_bw(L0Context &context) {
 
     gbps = calculate_gbps(timed, numItems * sizeof(float));
 
-    std::cout << gbps << " GBPS\n";
+    std::cout << gbps << " GB/s\n";
   }
 
   timed = 0;
@@ -332,7 +331,7 @@ void ZePeak::ze_peak_global_bw(L0Context &context) {
     timed = (timed_lo < timed_go) ? timed_lo : timed_go;
     gbps = calculate_gbps(timed,
                           numItems * context.sub_device_count * sizeof(float));
-    std::cout << gbps << " GFLOPS\n";
+    std::cout << gbps << " GB/s\n";
   } else {
     timed_lo = run_kernel(context, local_offset_v2, workgroup_info, type);
     timed_go = run_kernel(context, global_offset_v2, workgroup_info, type);
@@ -340,7 +339,7 @@ void ZePeak::ze_peak_global_bw(L0Context &context) {
 
     gbps = calculate_gbps(timed, numItems * sizeof(float));
 
-    std::cout << gbps << " GBPS\n";
+    std::cout << gbps << " GB/s\n";
   }
 
   timed = 0;
@@ -370,7 +369,7 @@ void ZePeak::ze_peak_global_bw(L0Context &context) {
     timed = (timed_lo < timed_go) ? timed_lo : timed_go;
     gbps = calculate_gbps(timed,
                           numItems * context.sub_device_count * sizeof(float));
-    std::cout << gbps << " GFLOPS\n";
+    std::cout << gbps << " GB/s\n";
   } else {
     timed_lo = run_kernel(context, local_offset_v4, workgroup_info, type);
     timed_go = run_kernel(context, global_offset_v4, workgroup_info, type);
@@ -378,7 +377,7 @@ void ZePeak::ze_peak_global_bw(L0Context &context) {
 
     gbps = calculate_gbps(timed, numItems * sizeof(float));
 
-    std::cout << gbps << " GBPS\n";
+    std::cout << gbps << " GB/s\n";
   }
 
   timed = 0;
@@ -407,7 +406,7 @@ void ZePeak::ze_peak_global_bw(L0Context &context) {
     timed = (timed_lo < timed_go) ? timed_lo : timed_go;
     gbps = calculate_gbps(timed,
                           numItems * context.sub_device_count * sizeof(float));
-    std::cout << gbps << " GFLOPS\n";
+    std::cout << gbps << " GB/s\n";
   } else {
     timed_lo = run_kernel(context, local_offset_v8, workgroup_info, type);
     timed_go = run_kernel(context, global_offset_v8, workgroup_info, type);
@@ -415,7 +414,7 @@ void ZePeak::ze_peak_global_bw(L0Context &context) {
 
     gbps = calculate_gbps(timed, numItems * sizeof(float));
 
-    std::cout << gbps << " GBPS\n";
+    std::cout << gbps << " GB/s\n";
   }
 
   timed = 0;
@@ -443,7 +442,7 @@ void ZePeak::ze_peak_global_bw(L0Context &context) {
     timed = (timed_lo < timed_go) ? timed_lo : timed_go;
     gbps = calculate_gbps(timed,
                           numItems * context.sub_device_count * sizeof(float));
-    std::cout << gbps << " GFLOPS\n";
+    std::cout << gbps << " GB/s\n";
   } else {
     timed_lo = run_kernel(context, local_offset_v16, workgroup_info, type);
     timed_go = run_kernel(context, global_offset_v16, workgroup_info, type);
@@ -451,7 +450,7 @@ void ZePeak::ze_peak_global_bw(L0Context &context) {
 
     gbps = calculate_gbps(timed, numItems * sizeof(float));
 
-    std::cout << gbps << " GBPS\n";
+    std::cout << gbps << " GB/s\n";
   }
 
   if (context.sub_device_count) {
diff --git a/perf_tests/ze_peak/src/transfer_bw.cpp b/perf_tests/ze_peak/src/transfer_bw.cpp
index 2fc985eb..103c03db 100644
--- a/perf_tests/ze_peak/src/transfer_bw.cpp
+++ b/perf_tests/ze_peak/src/transfer_bw.cpp
@@ -1,6 +1,6 @@
 /*
  *
- * Copyright (C) 2019 Intel Corporation
+ * Copyright (C) 2019-2024 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -198,7 +198,7 @@ void ZePeak::_transfer_bw_shared_memory(L0Context &context,
                                    local_memory_size);
   }
   std::cout << "GPU Copy Host to Shared Memory : ";
-  std::cout << gflops << " GBPS\n";
+  std::cout << gflops << " GB/s\n";
 
   gflops = 0;
   if (context.sub_device_count) {
@@ -215,7 +215,7 @@ void ZePeak::_transfer_bw_shared_memory(L0Context &context,
                                    local_memory_size);
   }
   std::cout << "GPU Copy Shared Memory to Host : ";
-  std::cout << gflops << " GBPS\n";
+  std::cout << gflops << " GB/s\n";
 
   gflops = 0;
   if (context.sub_device_count) {
@@ -232,7 +232,7 @@ void ZePeak::_transfer_bw_shared_memory(L0Context &context,
                                     local_memory_size, true);
   }
   std::cout << "System Memory Copy to Shared Memory : ";
-  std::cout << gflops << " GBPS\n";
+  std::cout << gflops << " GB/s\n";
 
   gflops = 0;
   if (context.sub_device_count) {
@@ -249,7 +249,7 @@ void ZePeak::_transfer_bw_shared_memory(L0Context &context,
                                     local_memory_size, false);
   }
   std::cout << "System Memory Copy from Shared Memory : ";
-  std::cout << gflops << " GBPS\n";
+  std::cout << gflops << " GB/s\n";
 
   current_sub_device_id = 0;
 
@@ -328,7 +328,7 @@ void ZePeak::ze_peak_transfer_bw(L0Context &context) {
   if (verbose)
     std::cout << "device buffer allocated\n";
 
-  std::cout << "Transfer Bandwidth (GBPS)\n";
+  std::cout << "Transfer Bandwidth (GB/s)\n";
 
   gflops = 0;
   if (context.sub_device_count) {
@@ -345,7 +345,7 @@ void ZePeak::ze_peak_transfer_bw(L0Context &context) {
                                    local_memory_size);
   }
   std::cout << "enqueueWriteBuffer : ";
-  std::cout << gflops << " GBPS\n";
+  std::cout << gflops << " GB/s\n";
 
   gflops = 0;
   if (context.sub_device_count) {
@@ -362,7 +362,7 @@ void ZePeak::ze_peak_transfer_bw(L0Context &context) {
                                    local_memory_size);
   }
   std::cout << "enqueueReadBuffer : ";
-  std::cout << gflops << " GBPS\n";
+  std::cout << gflops << " GB/s\n";
 
   current_sub_device_id = 0;
 
diff --git a/perf_tests/ze_peak/src/ze_peak.cpp b/perf_tests/ze_peak/src/ze_peak.cpp
index 9645ccad..136a2e87 100644
--- a/perf_tests/ze_peak/src/ze_peak.cpp
+++ b/perf_tests/ze_peak/src/ze_peak.cpp
@@ -1,6 +1,6 @@
 /*
  *
- * Copyright (C) 2019 Intel Corporation
+ * Copyright (C) 2019-2024 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -40,8 +40,7 @@ std::vector<uint8_t> L0Context::load_binary_file(const std::string &file_path) {
   binary_file.resize(length);
   stream.read(reinterpret_cast<char *>(binary_file.data()), length);
   if (verbose)
-    std::cout << "Binary file loaded"
-              << "\n";
+    std::cout << "Binary file loaded\n";
   stream.close();
 
   return binary_file;
@@ -144,6 +143,7 @@ void L0Context::print_ze_device_properties(
             << "\n"
             << " * UUID : " << id << "\n"
             << " * coreClockRate : " << std::dec << props.coreClockRate << "\n"
+            << " * maxMemAllocSize : " << props.maxMemAllocSize << " bytes\n"
             << std::endl;
 }
 
@@ -846,8 +846,11 @@ long double ZePeak::run_kernel(L0Context context, ze_kernel_handle_t &function,
 
   if (type == TimingMeasurement::BANDWIDTH) {
     if (context.sub_device_count) {
-      SUCCESS_OR_TERMINATE(
-          zeCommandListReset(context.cmd_list[current_sub_device_id]));
+      if (current_sub_device_id == 0) {
+        for (uint32_t i = 0; i < context.sub_device_count; i++) {
+          SUCCESS_OR_TERMINATE(zeCommandListReset(context.cmd_list[i]));
+        }
+      }
     } else {
       SUCCESS_OR_TERMINATE(zeCommandListReset(context.command_list));
     }
@@ -864,6 +867,12 @@ long double ZePeak::run_kernel(L0Context context, ze_kernel_handle_t &function,
         throw std::runtime_error("zeCommandListAppendLaunchKernel failed: " +
                                  std::to_string(result));
       }
+      result = zeCommandListAppendBarrier(
+          context.cmd_list[current_sub_device_id], nullptr, 0, nullptr);
+      if (result) {
+        throw std::runtime_error("zeCommandListAppendBarrier failed: " +
+                                 std::to_string(result));
+      }
     } else {
       result = zeCommandListAppendLaunchKernel(
           context.command_list, function,
@@ -894,35 +903,39 @@ long double ZePeak::run_kernel(L0Context context, ze_kernel_handle_t &function,
 
     for (uint32_t i = 0; i < warmup_iterations; i++) {
       run_command_queue(context);
+      synchronize_command_queue(context);
+    }
+    if (verbose)
+      std::cout << "Warmup finished\n";
 
-      if (context.sub_device_count) {
-        if (context.sub_device_count == current_sub_device_id + 1) {
-          current_sub_device_id = 0;
-          while (current_sub_device_id < context.sub_device_count) {
-            synchronize_command_queue(context);
-            current_sub_device_id++;
-          }
-          current_sub_device_id = context.sub_device_count - 1;
-        }
-      } else {
-        synchronize_command_queue(context);
+    if (context.sub_device_count) {
+      SUCCESS_OR_TERMINATE(
+          zeCommandListReset(context.cmd_list[current_sub_device_id]));
+      for (uint32_t i = 0; i < iters; i++) {
+        SUCCESS_OR_TERMINATE(zeCommandListAppendLaunchKernel(
+            context.cmd_list[current_sub_device_id], function,
+            &workgroup_info.thread_group_dimensions, nullptr, 0, nullptr));
+        SUCCESS_OR_TERMINATE(zeCommandListAppendBarrier(
+            context.cmd_list[current_sub_device_id], nullptr, 0, nullptr));
       }
+      SUCCESS_OR_TERMINATE(
+          zeCommandListClose(context.cmd_list[current_sub_device_id]));
     }
 
     timer.start();
-    for (uint32_t i = 0; i < iters; i++) {
+    if (context.sub_device_count) {
       run_command_queue(context);
-
-      if (context.sub_device_count) {
-        if (context.sub_device_count == current_sub_device_id + 1) {
-          current_sub_device_id = 0;
-          while (current_sub_device_id < context.sub_device_count) {
-            synchronize_command_queue(context);
-            current_sub_device_id++;
-          }
-          current_sub_device_id = context.sub_device_count - 1;
+      if (context.sub_device_count == current_sub_device_id + 1) {
+        current_sub_device_id = 0;
+        while (current_sub_device_id < context.sub_device_count) {
+          synchronize_command_queue(context);
+          current_sub_device_id++;
         }
-      } else {
+        current_sub_device_id = context.sub_device_count - 1;
+      }
+    } else {
+      for (uint32_t i = 0; i < iters; i++) {
+        run_command_queue(context);
         synchronize_command_queue(context);
       }
     }