implemented async- and threaded- block partition, still not better than simpler model
adeeconometrics · May 12, 2024 · b573e3c · b573e3c
1 parent a117d56
commit b573e3c
Show file tree

Hide file tree

Showing 3 changed files with 59 additions and 14 deletions.
diff --git a/matmul/gflops.py b/matmul/gflops.py
@@ -0,0 +1,15 @@
+def gflops(n: int, time: int) -> float:
+    return (2*n**3/(time*1e-9))/(1e9)  # 2n^3 flops / time (in ns) in GFLOPS
+
+
+if __name__ == '__main__':
+    n = 2048
+    bench: dict = {
+        'iterative': 634_936_604,
+        'blocked': 19_087_074_146,
+        'threaded_blocked': 11_113_214_750,
+        'async_blocked': 10_820_800_292,
+    }
+    print(f'GFLOPS for {n}x{n} matrix multiplication')
+    for name, time in bench.items():
+        print(f'{name}: {gflops(n, time):.6f} GFLOPS')
diff --git a/matmul/include/matmul.hpp b/matmul/include/matmul.hpp
@@ -6,6 +6,7 @@
 #include "../include/utils.hpp"
 
 #include <algorithm>
+#include <future>
 #include <memory>
 #include <thread>
 #include <type_traits>
@@ -21,6 +22,7 @@ template <typename T, std::size_t M, std::size_t N,
 auto iterative(const Matrix<T, M, N> &t_lhs,
                const Matrix<T, M, N> &t_rhs) -> Matrix<T, M, N> {
   Matrix<T, M, N> result;
+
   for (std::size_t i = 0; i < M; ++i) {
     for (std::size_t j = 0; j < N; ++j) {
       T sum = 0;
@@ -38,6 +40,7 @@ template <typename T, std::size_t M, std::size_t N,
 auto loop_reorder(const Matrix<T, M, N> &t_lhs,
                   const Matrix<T, M, N> &t_rhs) -> Matrix<T, M, N> {
   Matrix<T, M, N> result;
+
   for (std::size_t i = 0; i < M; ++i) {
     for (std::size_t j = 0; j < N; ++j) {
       for (std::size_t k = 0; k < N; ++k) {
@@ -53,7 +56,8 @@ template <typename T, std::size_t N, std::size_t M,
 auto gemm(const Matrix<T, N, M> &t_lhs,
           const Matrix<T, N, M> &t_rhs) -> Matrix<T, N, M> {
   Matrix<T, N, M> result;
-  const std::size_t block_size = 16;
+  constexpr std::size_t block_size = (16 * 1024) / sizeof(T);
+
   // Loop over the blocks
   for (std::size_t i = 0; i < N; i += block_size) {
     for (std::size_t j = 0; j < N; j += block_size) {
@@ -80,9 +84,10 @@ template <typename T, std::size_t N, std::size_t M,
 auto threaded_gemm(const Matrix<T, N, M> &t_lhs,
                    const Matrix<T, N, M> &t_rhs) -> Matrix<T, N, M> {
   Matrix<T, N, M> result;
-  const std::size_t block_size = 16;
+  constexpr std::size_t block_size = (32 * 1024) / sizeof(T);
   const std::size_t num_threads = std::thread::hardware_concurrency();
   std::vector<std::thread> threads;
+
   for (std::size_t i = 0; i < num_threads; ++i) {
     threads.emplace_back([i, num_threads, block_size, &t_lhs, &t_rhs,
                           &result]() {
@@ -107,4 +112,26 @@ auto threaded_gemm(const Matrix<T, N, M> &t_lhs,
   return result;
 }
 
+/**
+ * @brief Multiplies two matrices, computing bands of result rows concurrently
+ *        with std::async.
+ *
+ * The rows of the result are split into contiguous bands, one band per task.
+ * At most std::thread::hardware_concurrency() tasks are launched (one if that
+ * value is reported as 0) instead of one task per row, so the number of
+ * threads stays bounded for large N. Every task is joined with get() before
+ * the result is returned, so the returned matrix is complete regardless of
+ * whether the compiler elides the copy of `result`, and an exception thrown
+ * inside a task is rethrown to the caller.
+ *
+ * NOTE(review): all three loop extents use N, so this is only meaningful for
+ * square matrices (N == M) - confirm against callers.
+ * NOTE(review): assumes Matrix::operator() gives plain element access so that
+ * writes to distinct rows from different tasks do not conflict - confirm.
+ *
+ * @tparam T arithmetic element type
+ * @tparam N number of rows
+ * @tparam M number of columns
+ * @param t_lhs left-hand operand
+ * @param t_rhs right-hand operand
+ * @return the product t_lhs * t_rhs
+ */
+template <typename T, std::size_t N, std::size_t M,
+          typename = typename std::enable_if_t<std::is_arithmetic_v<T>>>
+auto async_gemm(const Matrix<T, N, M> &t_lhs,
+                const Matrix<T, N, M> &t_rhs) -> Matrix<T, N, M> {
+  // Declared before the futures so it outlives them: if anything below throws,
+  // the futures' destructors block on the running tasks while `result` is
+  // still alive.
+  Matrix<T, N, M> result;
+
+  // hardware_concurrency() may return 0 when the value cannot be determined.
+  const std::size_t hw_threads = std::thread::hardware_concurrency();
+  const std::size_t num_tasks =
+      std::min<std::size_t>(N, hw_threads == 0 ? 1 : hw_threads);
+
+  std::vector<std::future<void>> futures;
+  futures.reserve(num_tasks);
+
+  for (std::size_t task = 0; task < num_tasks; ++task) {
+    // Contiguous band of rows [row_begin, row_end) owned by this task; the
+    // bands of all tasks cover [0, N) exactly once.
+    const std::size_t row_begin = task * N / num_tasks;
+    const std::size_t row_end = (task + 1) * N / num_tasks;
+
+    futures.push_back(std::async(
+        std::launch::async, [row_begin, row_end, &t_lhs, &t_rhs, &result]() {
+          for (std::size_t i = row_begin; i < row_end; ++i) {
+            for (std::size_t j = 0; j < N; ++j) {
+              T sum{};
+              for (std::size_t k = 0; k < N; ++k) {
+                sum += t_lhs(i, k) * t_rhs(k, j);
+              }
+              result(i, j) = sum;
+            }
+          }
+        }));
+  }
+
+  // Join every task before handing the result back; get() also propagates
+  // any exception raised inside a task.
+  for (auto &future : futures) {
+    future.get();
+  }
+
+  return result;
+}
+
 #endif // __MATMUL_H__
diff --git a/matmul/src/main.cxx b/matmul/src/main.cxx
@@ -12,7 +12,7 @@ auto bench(
     const std::function<Matrix<T, Rows, Cols>(
         const Matrix<T, Rows, Cols> &, const Matrix<T, Rows, Cols> &)> &t_func,
     const Matrix<T, Rows, Cols> &t_lhs, const Matrix<T, Rows, Cols> &t_rhs,
-    std::string t_name, std::size_t t_iter = 3) -> Matrix<T, Rows, Cols> {
+    std::string t_name, std::size_t t_iter = 2) -> Matrix<T, Rows, Cols> {
 
   {
     Timer timer{t_iter, t_name};
@@ -26,36 +26,39 @@ auto bench(
 auto test_matmul() -> void {
 
   std::mt19937 prng(42);
-  constexpr std::size_t Rows = 2048;
-  constexpr std::size_t Cols = 2048;
+  constexpr std::size_t Rows = 255;
+  constexpr std::size_t Cols = 255;
 
-  Matrix<float, Rows, Cols> lhs_matrix{rand_vector<float, Rows, Cols>(prng),
-                                       Rows, Cols};
-  Matrix<float, Rows, Cols> rhs_matrix{rand_vector<float, Rows, Cols>(prng),
-                                       Rows, Cols};
+  Matrix<float, Rows, Cols> lhs_matrix{rand_array<float, Rows, Cols>(prng)};
+  Matrix<float, Rows, Cols> rhs_matrix{rand_array<float, Rows, Cols>(prng)};
 
   auto iter_func = std::function<Matrix<float, Rows, Cols>(
       const Matrix<float, Rows, Cols> &, const Matrix<float, Rows, Cols> &)>(
       loop_reorder<float, Rows, Cols>);
-  auto iter_mat = bench(iter_func, lhs_matrix, rhs_matrix, "iterative", 2);
+  auto iter_mat = bench(iter_func, lhs_matrix, rhs_matrix, "iterative");
   // auto loop_reorder_func = std::function<Matrix<float, Rows, Cols>(
-  //     const Matrix<float, Rows, Cols> &, const Matrix<float, Rows, Cols> &)>(
-  //     loop_reorder<float, Rows, Cols>);
+  //     const Matrix<float, Rows, Cols> &, const Matrix<float, Rows, Cols>
+  //     &)>( loop_reorder<float, Rows, Cols>);
   // auto loop_reorder_mat =
   //     bench(loop_reorder_func, lhs_matrix, rhs_matrix, "loop_reorder");
 
   // auto blocked = block_multiply<float, Rows, Cols>(lhs_matrix, rhs_matrix);
   auto blocked_func = std::function<Matrix<float, Rows, Cols>(
       const Matrix<float, Rows, Cols> &, const Matrix<float, Rows, Cols> &)>(
       gemm<float, Rows, Cols>);
-  auto blocked = bench(blocked_func, lhs_matrix, rhs_matrix, "blocked", 2);
+  auto blocked = bench(blocked_func, lhs_matrix, rhs_matrix, "blocked");
 
   auto threaded_gemm_func = std::function<Matrix<float, Rows, Cols>(
       const Matrix<float, Rows, Cols> &, const Matrix<float, Rows, Cols> &)>(
       threaded_gemm<float, Rows, Cols>);
   auto threaded_gemm =
-      bench(threaded_gemm_func, lhs_matrix, rhs_matrix, "threaded_gemm", 2);
+      bench(threaded_gemm_func, lhs_matrix, rhs_matrix, "threaded_gemm");
 
+  auto async_gemm_func = std::function<Matrix<float, Rows, Cols>(
+      const Matrix<float, Rows, Cols> &, const Matrix<float, Rows, Cols> &)>(
+      async_gemm<float, Rows, Cols>);
+  auto async_gemm =
+      bench(async_gemm_func, lhs_matrix, rhs_matrix, "async_gemm", 2);
   assert(iter_mat == blocked);
 
   // for (std::size_t i = 0; i < Rows; ++i) {