From ec937c0c1c7830465404ddc8d0edcbf3b5a4ff3f Mon Sep 17 00:00:00 2001
From: Dave Amiana <amianism25@gmail.com>
Date: Sat, 18 May 2024 22:22:12 +0800
Subject: [PATCH] Remove gflops.py; report GFLOPS from Timer, build with -O3 and loop-vectorize hints

---
 matmul/Makefile           |  2 +-
 matmul/gflops.py          | 17 -----------------
 matmul/include/matmul.hpp |  5 +++--
 matmul/include/utils.hpp  | 10 ++++------
 matmul/src/main.cxx       |  4 ++--
 5 files changed, 10 insertions(+), 28 deletions(-)
 delete mode 100644 matmul/gflops.py
diff --git a/matmul/Makefile b/matmul/Makefile
index ff4c965..24f5770 100644
--- a/matmul/Makefile
+++ b/matmul/Makefile
@@ -4,7 +4,7 @@ BUILD_DIR := build
 CXX := g++
 
 # Compiler flags
-CXXFLAGS := -O0 -Wall -Wextra -pedantic -std=c++17 -pthread -mfpu=neon
+CXXFLAGS := -O3 -Wall -Wextra -pedantic -Xclang -std=c++17 -pthread -mfpu=neon -ffast-math
 
 # Source files
 SRCS := $(wildcard src/*.cxx)
diff --git a/matmul/gflops.py b/matmul/gflops.py
deleted file mode 100644
index 85b88ab..0000000
--- a/matmul/gflops.py
+++ /dev/null
@@ -1,17 +0,0 @@
-def gflops(n: int, time: int) -> float:
-    return (2*n**3/(time*1e-9))/(1e9)  # 2n^3 flops / time (in ns) in GFLOPS
-
-
-if __name__ == '__main__':
-    n = 1024
-    bench: dict = {
-        'iterative': 84_804_104,
-        'loop_reorder': 77_021_354,
-        'blocked': 1_201_245_771,
-        'threaded_gemm': 464_923_229,
-        'neon': 478_612_646,
-        'neon_threaded': 646_018_875,
-    }
-    print(f'GFLOPS for {n}x{n} matrix multiplication')
-    for name, time in bench.items():
-        print(f'{name}: {gflops(n, time):.6f} GFLOPS')
diff --git a/matmul/include/matmul.hpp b/matmul/include/matmul.hpp
index 768ac61..895ff4b 100644
--- a/matmul/include/matmul.hpp
+++ b/matmul/include/matmul.hpp
@@ -3,11 +3,9 @@
 
 #include "../include/matmul.hpp"
 #include "../include/matrix.hpp"
-#include "../include/utils.hpp"
 
 #include <algorithm>
 #include <future>
-#include <memory>
 #include <thread>
 #include <type_traits>
 
@@ -24,6 +22,7 @@ auto iterative(const Matrix<T, M, N> &t_lhs,
   Matrix<T, M, N> result;
 
   for (std::size_t i = 0; i < M; ++i) {
+#pragma clang loop vectorize(enable)
     for (std::size_t j = 0; j < N; ++j) {
       T sum = 0;
       for (std::size_t k = 0; k < N; ++k) {
@@ -43,6 +42,7 @@ auto loop_reorder(const Matrix<T, M, N> &t_lhs,
 
   for (std::size_t i = 0; i < M; ++i) {
     for (std::size_t j = 0; j < N; ++j) {
+#pragma clang loop vectorize(enable)
       for (std::size_t k = 0; k < N; ++k) {
         result(i, k) += t_lhs(i, j) * t_rhs(j, k);
       }
@@ -61,6 +61,7 @@ auto gemm(const Matrix<T, N, M> &t_lhs,
   // Loop over the blocks
   for (std::size_t i = 0; i < N; i += block_size) {
     for (std::size_t j = 0; j < N; j += block_size) {
+#pragma clang loop vectorize(enable)
       for (std::size_t k = 0; k < N; k += block_size) {
         // Multiply the blocks
         for (std::size_t ii = i; ii < std::min(i + block_size, N); ++ii) {
diff --git a/matmul/include/utils.hpp b/matmul/include/utils.hpp
index d28a201..833c138 100644
--- a/matmul/include/utils.hpp
+++ b/matmul/include/utils.hpp
@@ -23,12 +23,10 @@ class Timer {
         std::chrono::duration_cast<std::chrono::nanoseconds>(end_time -
                                                              start_time)
             .count();
-    const auto mean_duration = total_duration / m_iterations;
-    const float gflops =
-        (2 * 1024 * 1024 * 1024 / mean_duration / 1'000'000'000);
-    std::cout << "mean elapsed time took: " << mean_duration << " or " << gflops
-              << "GFlops"
-              << " ns for " << m_name << std::endl;
+    const double mean_duration = total_duration / m_iterations;
+    const double gflops = (2147483648 / mean_duration);
+    std::cout << "mean elapsed time took: " << mean_duration << " ns for "
+              << m_name << " or " << gflops << "GFlops" << std::endl;
   }
 
   auto start() -> void {
diff --git a/matmul/src/main.cxx b/matmul/src/main.cxx
index e4cc4d4..1d6fe07 100644
--- a/matmul/src/main.cxx
+++ b/matmul/src/main.cxx
@@ -45,7 +45,6 @@ auto test_matmul() -> void {
   auto loop_reorder_mat =
       bench(loop_reorder_func, lhs_matrix, rhs_matrix, "loop_reorder");
 
-  // auto blocked = block_multiply<float, Rows, Cols>(lhs_matrix, rhs_matrix);
   auto blocked_func = std::function<Matrix<float, Rows, Cols>(
       const Matrix<float, Rows, Cols> &, const Matrix<float, Rows, Cols> &)>(
       gemm<float, Rows, Cols>);
@@ -60,7 +59,8 @@ auto test_matmul() -> void {
   auto async_gemm_func = std::function<Matrix<float, Rows, Cols>(
       const Matrix<float, Rows, Cols> &, const Matrix<float, Rows, Cols> &)>(
       gemm_neon<float, Rows, Cols>);
-  auto async_gemm = bench(async_gemm_func, lhs_matrix, rhs_matrix, "neon", 2);
+  auto async_gemm =
+      bench(async_gemm_func, lhs_matrix, rhs_matrix, "async_gemm", 2);
 
   auto neon_gemm_func = std::function<Matrix<float, Rows, Cols>(
       const Matrix<float, Rows, Cols> &, const Matrix<float, Rows, Cols> &)>(