From ec937c0c1c7830465404ddc8d0edcbf3b5a4ff3f Mon Sep 17 00:00:00 2001 From: Dave Amiana Date: Sat, 18 May 2024 22:22:12 +0800 Subject: [PATCH] removed py script --- matmul/Makefile | 2 +- matmul/gflops.py | 17 ----------------- matmul/include/matmul.hpp | 5 +++-- matmul/include/utils.hpp | 10 ++++------ matmul/src/main.cxx | 4 ++-- 5 files changed, 10 insertions(+), 28 deletions(-) delete mode 100644 matmul/gflops.py diff --git a/matmul/Makefile b/matmul/Makefile index ff4c965..24f5770 100644 --- a/matmul/Makefile +++ b/matmul/Makefile @@ -4,7 +4,7 @@ BUILD_DIR := build CXX := g++ # Compiler flags -CXXFLAGS := -O0 -Wall -Wextra -pedantic -std=c++17 -pthread -mfpu=neon +CXXFLAGS := -O3 -Wall -Wextra -pedantic -Xclang -std=c++17 -pthread -mfpu=neon -ffast-math # Source files SRCS := $(wildcard src/*.cxx) diff --git a/matmul/gflops.py b/matmul/gflops.py deleted file mode 100644 index 85b88ab..0000000 --- a/matmul/gflops.py +++ /dev/null @@ -1,17 +0,0 @@ -def gflops(n: int, time: int) -> float: - return (2*n**3/(time*1e-9))/(1e9) # 2n^3 flops / time (in ns) in GFLOPS - - -if __name__ == '__main__': - n = 1024 - bench: dict = { - 'iterative': 84_804_104, - 'loop_reorder': 77_021_354, - 'blocked': 1_201_245_771, - 'threaded_gemm': 464_923_229, - 'neon': 478_612_646, - 'neon_threaded': 646_018_875, - } - print(f'GFLOPS for {n}x{n} matrix multiplication') - for name, time in bench.items(): - print(f'{name}: {gflops(n, time):.6f} GFLOPS') diff --git a/matmul/include/matmul.hpp b/matmul/include/matmul.hpp index 768ac61..895ff4b 100644 --- a/matmul/include/matmul.hpp +++ b/matmul/include/matmul.hpp @@ -3,11 +3,9 @@ #include "../include/matmul.hpp" #include "../include/matrix.hpp" -#include "../include/utils.hpp" #include #include -#include #include #include @@ -24,6 +22,7 @@ auto iterative(const Matrix &t_lhs, Matrix result; for (std::size_t i = 0; i < M; ++i) { +#pragma clang loop vectorize(enable) for (std::size_t j = 0; j < N; ++j) { T sum = 0; for (std::size_t k = 0; k < N; ++k) { @@ -43,6 +42,7 @@ auto loop_reorder(const Matrix &t_lhs, for (std::size_t i = 0; i < M; ++i) { for (std::size_t j = 0; j < N; ++j) { +#pragma clang loop vectorize(enable) for (std::size_t k = 0; k < N; ++k) { result(i, k) += t_lhs(i, j) * t_rhs(j, k); } @@ -61,6 +61,7 @@ auto gemm(const Matrix &t_lhs, // Loop over the blocks for (std::size_t i = 0; i < N; i += block_size) { for (std::size_t j = 0; j < N; j += block_size) { +#pragma clang loop vectorize(enable) for (std::size_t k = 0; k < N; k += block_size) { // Multiply the blocks for (std::size_t ii = i; ii < std::min(i + block_size, N); ++ii) { diff --git a/matmul/include/utils.hpp b/matmul/include/utils.hpp index d28a201..833c138 100644 --- a/matmul/include/utils.hpp +++ b/matmul/include/utils.hpp @@ -23,12 +23,10 @@ class Timer { std::chrono::duration_cast(end_time - start_time) .count(); - const auto mean_duration = total_duration / m_iterations; - const float gflops = - (2 * 1024 * 1024 * 1024 / mean_duration / 1'000'000'000); - std::cout << "mean elapsed time took: " << mean_duration << " or " << gflops - << "GFlops" - << " ns for " << m_name << std::endl; + const double mean_duration = total_duration / m_iterations; + const double gflops = (2147483648 / mean_duration); + std::cout << "mean elapsed time took: " << mean_duration << " ns for " + << m_name << " or " << gflops << "GFlops" << std::endl; } auto start() -> void { diff --git a/matmul/src/main.cxx b/matmul/src/main.cxx index e4cc4d4..1d6fe07 100644 --- a/matmul/src/main.cxx +++ b/matmul/src/main.cxx @@ -45,7 +45,6 @@ auto test_matmul() -> void { auto loop_reorder_mat = bench(loop_reorder_func, lhs_matrix, rhs_matrix, "loop_reorder"); - // auto blocked = block_multiply(lhs_matrix, rhs_matrix); auto blocked_func = std::function( const Matrix &, const Matrix &)>( gemm); @@ -60,7 +59,8 @@ auto test_matmul() -> void { auto async_gemm_func = std::function( const Matrix &, const Matrix &)>( gemm_neon); - auto async_gemm = bench(async_gemm_func, lhs_matrix, rhs_matrix, "neon", 2); + auto async_gemm = + bench(async_gemm_func, lhs_matrix, rhs_matrix, "async_gemm", 2); auto neon_gemm_func = std::function( const Matrix &, const Matrix &)>(