removed py script

adeeconometrics · May 18, 2024 · ec937c0 · ec937c0
1 parent 595c276
commit ec937c0
Show file tree

Hide file tree

Showing 5 changed files with 10 additions and 28 deletions.
diff --git a/matmul/Makefile b/matmul/Makefile
@@ -4,7 +4,7 @@ BUILD_DIR := build
 CXX := g++
 
 # Compiler flags
-CXXFLAGS := -O0 -Wall -Wextra -pedantic -std=c++17 -pthread -mfpu=neon
+CXXFLAGS := -O3 -Wall -Wextra -pedantic -Xclang -std=c++17 -pthread -mfpu=neon -ffast-math
 
 # Source files
 SRCS := $(wildcard src/*.cxx)

diff --git a/matmul/gflops.py b/matmul/gflops.py
diff --git a/matmul/include/matmul.hpp b/matmul/include/matmul.hpp
@@ -3,11 +3,9 @@
 
 #include "../include/matmul.hpp"
 #include "../include/matrix.hpp"
-#include "../include/utils.hpp"
 
 #include <algorithm>
 #include <future>
-#include <memory>
 #include <thread>
 #include <type_traits>
 
@@ -24,6 +22,7 @@ auto iterative(const Matrix<T, M, N> &t_lhs,
   Matrix<T, M, N> result;
 
   for (std::size_t i = 0; i < M; ++i) {
+#pragma clang loop vectorize(enable)
     for (std::size_t j = 0; j < N; ++j) {
       T sum = 0;
       for (std::size_t k = 0; k < N; ++k) {
@@ -43,6 +42,7 @@ auto loop_reorder(const Matrix<T, M, N> &t_lhs,
 
   for (std::size_t i = 0; i < M; ++i) {
     for (std::size_t j = 0; j < N; ++j) {
+#pragma clang loop vectorize(enable)
       for (std::size_t k = 0; k < N; ++k) {
         result(i, k) += t_lhs(i, j) * t_rhs(j, k);
       }
@@ -61,6 +61,7 @@ auto gemm(const Matrix<T, N, M> &t_lhs,
   // Loop over the blocks
   for (std::size_t i = 0; i < N; i += block_size) {
     for (std::size_t j = 0; j < N; j += block_size) {
+#pragma clang loop vectorize(enable)
       for (std::size_t k = 0; k < N; k += block_size) {
         // Multiply the blocks
         for (std::size_t ii = i; ii < std::min(i + block_size, N); ++ii) {

diff --git a/matmul/include/utils.hpp b/matmul/include/utils.hpp
@@ -23,12 +23,10 @@ class Timer {
         std::chrono::duration_cast<std::chrono::nanoseconds>(end_time -
                                                              start_time)
             .count();
-    const auto mean_duration = total_duration / m_iterations;
-    const float gflops =
-        (2 * 1024 * 1024 * 1024 / mean_duration / 1'000'000'000);
-    std::cout << "mean elapsed time took: " << mean_duration << " or " << gflops
-              << "GFlops"
-              << " ns for " << m_name << std::endl;
+    const double mean_duration = total_duration / m_iterations;
+    const double gflops = (2147483648 / mean_duration);
+    std::cout << "mean elapsed time took: " << mean_duration << " ns for "
+              << m_name << " or " << gflops << "GFlops" << std::endl;
   }
 
   auto start() -> void {

diff --git a/matmul/src/main.cxx b/matmul/src/main.cxx
@@ -45,7 +45,6 @@ auto test_matmul() -> void {
   auto loop_reorder_mat =
       bench(loop_reorder_func, lhs_matrix, rhs_matrix, "loop_reorder");
 
-  // auto blocked = block_multiply<float, Rows, Cols>(lhs_matrix, rhs_matrix);
   auto blocked_func = std::function<Matrix<float, Rows, Cols>(
       const Matrix<float, Rows, Cols> &, const Matrix<float, Rows, Cols> &)>(
       gemm<float, Rows, Cols>);
@@ -60,7 +59,8 @@ auto test_matmul() -> void {
   auto async_gemm_func = std::function<Matrix<float, Rows, Cols>(
       const Matrix<float, Rows, Cols> &, const Matrix<float, Rows, Cols> &)>(
       gemm_neon<float, Rows, Cols>);
-  auto async_gemm = bench(async_gemm_func, lhs_matrix, rhs_matrix, "neon", 2);
+  auto async_gemm =
+      bench(async_gemm_func, lhs_matrix, rhs_matrix, "async_gemm", 2);
 
   auto neon_gemm_func = std::function<Matrix<float, Rows, Cols>(
       const Matrix<float, Rows, Cols> &, const Matrix<float, Rows, Cols> &)>(