Skip to content

Commit

Permalink
implemented async- and threaded- block partition, still not better than simpler model
Browse files Browse the repository at this point in the history
  • Loading branch information
adeeconometrics committed May 12, 2024
1 parent a117d56 commit b573e3c
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 14 deletions.
15 changes: 15 additions & 0 deletions matmul/gflops.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
def gflops(n: int, time: int) -> float:
    """Return the throughput, in GFLOPS, of an n x n matrix multiplication.

    The amount of work is modelled as 2 * n**3 floating-point operations.

    Args:
        n: Dimension of the square matrices being multiplied.
        time: Elapsed time of the multiplication, in nanoseconds.

    Returns:
        Billions of floating-point operations per second.

    Raises:
        ZeroDivisionError: If ``time`` is 0.
    """
    return (2*n**3/(time*1e-9))/(1e9)  # 2n^3 flops / time (in ns) in GFLOPS


if __name__ == '__main__':
    # Matrix dimension used both for the report header and the FLOP count.
    n = 2048
    # Benchmark label -> measured time; gflops() interprets the value as
    # nanoseconds.
    bench: dict = {
        'iterative': 634_936_604,
        'blocked': 19_087_074_146,
        'threaded_blocked': 11_113_214_750,
        'async_blocked': 10_820_800_292,
    }
    print(f'GFLOPS for {n}x{n} matrix multiplication')
    # One report line per benchmark, in the insertion order of the dict.
    for name, time in bench.items():
        print(f'{name}: {gflops(n, time):.6f} GFLOPS')
31 changes: 29 additions & 2 deletions matmul/include/matmul.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include "../include/utils.hpp"

#include <algorithm>
#include <future>
#include <memory>
#include <thread>
#include <type_traits>
Expand All @@ -21,6 +22,7 @@ template <typename T, std::size_t M, std::size_t N,
auto iterative(const Matrix<T, M, N> &t_lhs,
const Matrix<T, M, N> &t_rhs) -> Matrix<T, M, N> {
Matrix<T, M, N> result;

for (std::size_t i = 0; i < M; ++i) {
for (std::size_t j = 0; j < N; ++j) {
T sum = 0;
Expand All @@ -38,6 +40,7 @@ template <typename T, std::size_t M, std::size_t N,
auto loop_reorder(const Matrix<T, M, N> &t_lhs,
const Matrix<T, M, N> &t_rhs) -> Matrix<T, M, N> {
Matrix<T, M, N> result;

for (std::size_t i = 0; i < M; ++i) {
for (std::size_t j = 0; j < N; ++j) {
for (std::size_t k = 0; k < N; ++k) {
Expand All @@ -53,7 +56,8 @@ template <typename T, std::size_t N, std::size_t M,
auto gemm(const Matrix<T, N, M> &t_lhs,
const Matrix<T, N, M> &t_rhs) -> Matrix<T, N, M> {
Matrix<T, N, M> result;
const std::size_t block_size = 16;
constexpr std::size_t block_size = (16 * 1024) / sizeof(T);

// Loop over the blocks
for (std::size_t i = 0; i < N; i += block_size) {
for (std::size_t j = 0; j < N; j += block_size) {
Expand All @@ -80,9 +84,10 @@ template <typename T, std::size_t N, std::size_t M,
auto threaded_gemm(const Matrix<T, N, M> &t_lhs,
const Matrix<T, N, M> &t_rhs) -> Matrix<T, N, M> {
Matrix<T, N, M> result;
const std::size_t block_size = 16;
constexpr std::size_t block_size = (32 * 1024) / sizeof(T);
const std::size_t num_threads = std::thread::hardware_concurrency();
std::vector<std::thread> threads;

for (std::size_t i = 0; i < num_threads; ++i) {
threads.emplace_back([i, num_threads, block_size, &t_lhs, &t_rhs,
&result]() {
Expand All @@ -107,4 +112,26 @@ auto threaded_gemm(const Matrix<T, N, M> &t_lhs,
return result;
}

// Multiplies t_lhs by t_rhs, computing every row of the result in its own
// std::async task launched with std::launch::async.
//
// All three loops are bounded by N, so the product is only meaningful when
// N == M (square matrices).
// NOTE(review): assumes Matrix::operator()(i, j) addresses row i, column j --
// confirm against the Matrix definition.
//
// Each task writes only to row i of `result`, so tasks never write to the
// same element. Any exception thrown inside a task is rethrown to the caller.
template <typename T, std::size_t N, std::size_t M,
          typename = typename std::enable_if_t<std::is_arithmetic_v<T>>>
auto async_gemm(const Matrix<T, N, M> &t_lhs,
                const Matrix<T, N, M> &t_rhs) -> Matrix<T, N, M> {
  Matrix<T, N, M> result;
  std::array<std::future<void>, N> futures;

  for (std::size_t i = 0; i < N; ++i) {
    futures[i] = std::async(std::launch::async, [i, &t_lhs, &t_rhs, &result]() {
      for (std::size_t j = 0; j < N; ++j) {
        T sum{};
        for (std::size_t k = 0; k < N; ++k) {
          sum += t_lhs(i, k) * t_rhs(k, j);
        }
        result(i, j) = sum;
      }
    });
  }

  // Join every task before `result` is read by the return statement.
  // Relying on the futures' destructors is not enough: they run only after
  // the return value has been initialised, so without copy elision `result`
  // would be copied while workers are still writing to it. get() also
  // surfaces exceptions raised inside a task instead of discarding them.
  for (auto &future : futures) {
    future.get();
  }

  return result;
}

#endif // __MATMUL_H__
27 changes: 15 additions & 12 deletions matmul/src/main.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ auto bench(
const std::function<Matrix<T, Rows, Cols>(
const Matrix<T, Rows, Cols> &, const Matrix<T, Rows, Cols> &)> &t_func,
const Matrix<T, Rows, Cols> &t_lhs, const Matrix<T, Rows, Cols> &t_rhs,
std::string t_name, std::size_t t_iter = 3) -> Matrix<T, Rows, Cols> {
std::string t_name, std::size_t t_iter = 2) -> Matrix<T, Rows, Cols> {

{
Timer timer{t_iter, t_name};
Expand All @@ -26,36 +26,39 @@ auto bench(
auto test_matmul() -> void {

std::mt19937 prng(42);
constexpr std::size_t Rows = 2048;
constexpr std::size_t Cols = 2048;
constexpr std::size_t Rows = 255;
constexpr std::size_t Cols = 255;

Matrix<float, Rows, Cols> lhs_matrix{rand_vector<float, Rows, Cols>(prng),
Rows, Cols};
Matrix<float, Rows, Cols> rhs_matrix{rand_vector<float, Rows, Cols>(prng),
Rows, Cols};
Matrix<float, Rows, Cols> lhs_matrix{rand_array<float, Rows, Cols>(prng)};
Matrix<float, Rows, Cols> rhs_matrix{rand_array<float, Rows, Cols>(prng)};

auto iter_func = std::function<Matrix<float, Rows, Cols>(
const Matrix<float, Rows, Cols> &, const Matrix<float, Rows, Cols> &)>(
loop_reorder<float, Rows, Cols>);
auto iter_mat = bench(iter_func, lhs_matrix, rhs_matrix, "iterative", 2);
auto iter_mat = bench(iter_func, lhs_matrix, rhs_matrix, "iterative");
// auto loop_reorder_func = std::function<Matrix<float, Rows, Cols>(
// const Matrix<float, Rows, Cols> &, const Matrix<float, Rows, Cols> &)>(
// loop_reorder<float, Rows, Cols>);
// const Matrix<float, Rows, Cols> &, const Matrix<float, Rows, Cols>
// &)>( loop_reorder<float, Rows, Cols>);
// auto loop_reorder_mat =
// bench(loop_reorder_func, lhs_matrix, rhs_matrix, "loop_reorder");

// auto blocked = block_multiply<float, Rows, Cols>(lhs_matrix, rhs_matrix);
auto blocked_func = std::function<Matrix<float, Rows, Cols>(
const Matrix<float, Rows, Cols> &, const Matrix<float, Rows, Cols> &)>(
gemm<float, Rows, Cols>);
auto blocked = bench(blocked_func, lhs_matrix, rhs_matrix, "blocked", 2);
auto blocked = bench(blocked_func, lhs_matrix, rhs_matrix, "blocked");

auto threaded_gemm_func = std::function<Matrix<float, Rows, Cols>(
const Matrix<float, Rows, Cols> &, const Matrix<float, Rows, Cols> &)>(
threaded_gemm<float, Rows, Cols>);
auto threaded_gemm =
bench(threaded_gemm_func, lhs_matrix, rhs_matrix, "threaded_gemm", 2);
bench(threaded_gemm_func, lhs_matrix, rhs_matrix, "threaded_gemm");

auto async_gemm_func = std::function<Matrix<float, Rows, Cols>(
const Matrix<float, Rows, Cols> &, const Matrix<float, Rows, Cols> &)>(
async_gemm<float, Rows, Cols>);
auto async_gemm =
bench(async_gemm_func, lhs_matrix, rhs_matrix, "async_gemm", 2);
assert(iter_mat == blocked);

// for (std::size_t i = 0; i < Rows; ++i) {
Expand Down

0 comments on commit b573e3c

Please sign in to comment.