From ba6c267a3a77646e124b24438dcbce578916a754 Mon Sep 17 00:00:00 2001 From: Dounia Date: Tue, 17 Jan 2023 18:40:05 -0800 Subject: [PATCH 1/8] [SYCL][Matrix]Add test for odd sizes --- SYCL/Matrix/joint_matrix_all_sizes.cpp | 23 +++ SYCL/Matrix/joint_matrix_all_sizes_impl.hpp | 191 ++++++++++++++++++++ 2 files changed, 214 insertions(+) create mode 100644 SYCL/Matrix/joint_matrix_all_sizes.cpp create mode 100644 SYCL/Matrix/joint_matrix_all_sizes_impl.hpp diff --git a/SYCL/Matrix/joint_matrix_all_sizes.cpp b/SYCL/Matrix/joint_matrix_all_sizes.cpp new file mode 100644 index 0000000000..1760694444 --- /dev/null +++ b/SYCL/Matrix/joint_matrix_all_sizes.cpp @@ -0,0 +1,23 @@ +//==-------- joint_matrix_all_sizes.cpp - DPC++ joint_matrix---------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: matrix + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +#include +#include + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental::matrix; +using bfloat16 = sycl::ext::oneapi::bfloat16; + +#define SG_SZ 16 + +#include "joint_matrix_all_sizes_impl.hpp" diff --git a/SYCL/Matrix/joint_matrix_all_sizes_impl.hpp b/SYCL/Matrix/joint_matrix_all_sizes_impl.hpp new file mode 100644 index 0000000000..65ef75034d --- /dev/null +++ b/SYCL/Matrix/joint_matrix_all_sizes_impl.hpp @@ -0,0 +1,191 @@ +#define BF16_EPSILON 0.00781250 + +template struct big_matrix { +private: + T *mat; + +public: + T *get_data() { return mat; } + void set_data(T *data) { mat = data; } + big_matrix(T *data) : mat(data) {} +}; + +template +void matrix_vnni(unsigned int rows, unsigned int cols, T *src, T *dest, + unsigned int vnniFactor) { + for (unsigned int i = 0; i < rows / vnniFactor; i++) { + for (unsigned int j = 0; j < cols; j++) { + for (unsigned int k = 0; k < vnniFactor; k++) { + dest[i * cols * vnniFactor + j * vnniFactor + k] = + src[(i * vnniFactor + k) * cols + j]; + } + } + } +} + +template +void matrix_multiply(big_matrix &C, big_matrix &A, + big_matrix &B) { + size_t NDRangeM = M / TM; + size_t NDRangeN = N / TN; + buffer bufA(A.get_data(), range<2>(M, K)); + buffer bufB(B.get_data(), range<2>(K, N)); + buffer bufC(C.get_data(), range<2>(M, N)); + + queue q; + q.submit([&](handler &cgh) { + sycl::accessor accC{bufC, cgh, sycl::read_write}; + sycl::accessor accA{bufA, cgh, sycl::read_only}; + sycl::accessor accB{bufB, cgh, sycl::read_only}; + + cgh.parallel_for( + nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}), + [=](nd_item<2> spmd_item) [[intel::reqd_sub_group_size(SG_SZ)]] + + { + // The submatrix API has to be accessed by all the workitems in a + // subgroup these functions will be called once by the subgroup no + // code divergence between the workitems + const auto global_idx = spmd_item.get_global_id(0); + const auto global_idy = spmd_item.get_global_id(1); + const auto sg_startx = global_idx - spmd_item.get_local_id(0); + const auto sg_starty = global_idy - spmd_item.get_local_id(1); + + sub_group sg = spmd_item.get_sub_group(); + joint_matrix + sub_a; + // For B, we assume B has been already VNNIed. 
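+          // Illustration, assuming vnniFactor == 2 (the bfloat16 case): the
+          // matrix_vnni() helper above stores element B[k][n] at
+          // Bvnni[k / 2][n * 2 + k % 2], so pairs of consecutive K-rows are
+          // interleaved along N, which is the packed layout sub_b is loaded
+          // with below.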
+ joint_matrix + sub_b; + joint_matrix sub_c; + + joint_matrix_load(sg, sub_c, + accC.get_pointer() + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N, layout::row_major); + for (int k = 0; k < K / TK; k += 1) { + joint_matrix_load( + sg, sub_a, accA.get_pointer() + (sg_startx * TM) * K + k * TK, + K); + joint_matrix_load(sg, sub_b, + accB.get_pointer() + (k * TK / vnniFactor) * (N * vnniFactor) + + sg_starty / SG_SZ * TN * vnniFactor, + N * vnniFactor); + sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); + } + joint_matrix_store(sg, sub_c, + accC.get_pointer() + (sg_startx * TM) * N + + sg_starty / SG_SZ * TN, + N, layout::row_major); + }); // parallel for + }).wait(); +} + +static constexpr size_t MATRIX_M = 128; +static constexpr size_t MATRIX_N = 128; +static constexpr size_t MATRIX_K = 128; + +float make_fp32(bfloat16 x) { + unsigned int y = *((int *)&x); + y = y << 16; + float *res = reinterpret_cast(&y); + return *res; +} + +template +void matrix_multiply_ref(Ta *A, Ta *B, Tc *C, int M, int N, int K) { + for (int m = 0; m < M; m++) + for (int n = 0; n < N; n++) { + for (int k = 0; k < K; k++) { + if (std::is_same_v && std::is_same_v) + C[m * N + n] += make_fp32(A[m * K + k]) * make_fp32(B[k * N + n]); + if (std::is_same_v && std::is_same_v) + C[m * N + n] += A[m * K + k] * B[k * N + n]; + } + } +} + +template +int init_and_multiply() { + Ta A[MATRIX_M][MATRIX_K]; + Ta B[MATRIX_K][MATRIX_N]; + Ta Bvnni[MATRIX_K / vnni_factor][MATRIX_N * vnni_factor]; + Tc C[MATRIX_M][MATRIX_N]; + Tc D[MATRIX_M][MATRIX_N]; + + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_K; j++) { + if (std::is_same_v && std::is_same_v) + A[i][j] = bfloat16(1.0f * (i + j)); + if (std::is_same_v && std::is_same_v) + A[i][j] = i + j; + } + } + for (int i = 0; i < MATRIX_K; i++) { + for (int j = 0; j < MATRIX_N; j++) { + if (std::is_same_v && std::is_same_v) + B[i][j] = bfloat16(2.0f * i + 3.0f * j); + if (std::is_same_v && std::is_same_v) + B[i][j] = i + 2 * j; + } + } + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) { + C[i][j] = 1; + D[i][j] = 1; + } + } + + big_matrix MC((Tc *)&C); + big_matrix MD((Tc *)&D); + big_matrix MA((Ta *)&A); + matrix_vnni(MATRIX_K, MATRIX_N, (Ta *)&B, (Ta *)&Bvnni, vnni_factor); + big_matrix MBvnni((Ta *)&Bvnni); + + matrix_multiply(MC, MA, MBvnni); + matrix_multiply_ref((Ta *)A, (Ta *)B, (Tc *)D, MATRIX_M, + MATRIX_N, MATRIX_K); + + bool res = true; + for (int i = 0; i < MATRIX_M; i++) { + for (int j = 0; j < MATRIX_N; j++) { + if constexpr(std::is_same_v && std::is_same_v) { + if ((fabs(C[i][j]) - fabs(D[i][j])) > BF16_EPSILON) { + std::cout << (res ? "passed" : "failed bfloat ") << C[i][j] <<" D is " << D[i][j]<< std::endl; + res = false; + } + } + else if (std::is_same_v && std::is_same_v) { + if (C[i][j] != D[i][j]) { + std::cout << (res ? "passed" : "failed") << C[i][j] <<" D is " << D[i][j]<< std::endl; + + res = false; + } + } + } + } + std::cout << (res ? 
"passed" : "failed") << std::endl; + return !res; +} + +int main() { + //init_and_multiply(); + //init_and_multiply(); + //init_and_multiply(); + init_and_multiply(); + //init_and_multiply(); + //init_and_multiply(); + //init_and_multiply(); + //init_and_multiply(); + + //init_and_multiply(); + //init_and_multiply(); + //init_and_multiply(); + //init_and_multiply(); + //init_and_multiply(); + //init_and_multiply(); + //init_and_multiply(); + //init_and_multiply(); + return 0; +} From 399939f42e9581e69da328941094a7bf16ef4113 Mon Sep 17 00:00:00 2001 From: Dounia Date: Tue, 17 Jan 2023 18:47:04 -0800 Subject: [PATCH 2/8] set to xfail on GPU --- SYCL/Matrix/joint_matrix_all_sizes.cpp | 2 + SYCL/Matrix/joint_matrix_all_sizes_impl.hpp | 96 +++++++++++---------- 2 files changed, 52 insertions(+), 46 deletions(-) diff --git a/SYCL/Matrix/joint_matrix_all_sizes.cpp b/SYCL/Matrix/joint_matrix_all_sizes.cpp index 1760694444..197ffdf48b 100644 --- a/SYCL/Matrix/joint_matrix_all_sizes.cpp +++ b/SYCL/Matrix/joint_matrix_all_sizes.cpp @@ -11,6 +11,8 @@ // RUN: %CPU_RUN_PLACEHOLDER %t.out // RUN: %GPU_RUN_PLACEHOLDER %t.out +// XFAIL: gpu + #include #include diff --git a/SYCL/Matrix/joint_matrix_all_sizes_impl.hpp b/SYCL/Matrix/joint_matrix_all_sizes_impl.hpp index 65ef75034d..883f3477d2 100644 --- a/SYCL/Matrix/joint_matrix_all_sizes_impl.hpp +++ b/SYCL/Matrix/joint_matrix_all_sizes_impl.hpp @@ -23,7 +23,8 @@ void matrix_vnni(unsigned int rows, unsigned int cols, T *src, T *dest, } } -template +template void matrix_multiply(big_matrix &C, big_matrix &A, big_matrix &B) { size_t NDRangeM = M / TM; @@ -52,8 +53,7 @@ void matrix_multiply(big_matrix &C, big_matrix &A, const auto sg_starty = global_idy - spmd_item.get_local_id(1); sub_group sg = spmd_item.get_sub_group(); - joint_matrix - sub_a; + joint_matrix sub_a; // For B, we assume B has been already VNNIed. 
joint_matrix @@ -64,12 +64,13 @@ void matrix_multiply(big_matrix &C, big_matrix &A, accC.get_pointer() + (sg_startx * TM) * N + sg_starty / SG_SZ * TN, N, layout::row_major); - for (int k = 0; k < K / TK; k += 1) { + for (int k = 0; k < K / TK; k += 1) { joint_matrix_load( sg, sub_a, accA.get_pointer() + (sg_startx * TM) * K + k * TK, K); joint_matrix_load(sg, sub_b, - accB.get_pointer() + (k * TK / vnniFactor) * (N * vnniFactor) + + accB.get_pointer() + + (k * TK / vnniFactor) * (N * vnniFactor) + sg_starty / SG_SZ * TN * vnniFactor, N * vnniFactor); sub_c = joint_matrix_mad(sg, sub_a, sub_b, sub_c); @@ -99,14 +100,15 @@ void matrix_multiply_ref(Ta *A, Ta *B, Tc *C, int M, int N, int K) { for (int n = 0; n < N; n++) { for (int k = 0; k < K; k++) { if (std::is_same_v && std::is_same_v) - C[m * N + n] += make_fp32(A[m * K + k]) * make_fp32(B[k * N + n]); - if (std::is_same_v && std::is_same_v) - C[m * N + n] += A[m * K + k] * B[k * N + n]; + C[m * N + n] += make_fp32(A[m * K + k]) * make_fp32(B[k * N + n]); + if (std::is_same_v && std::is_same_v) + C[m * N + n] += A[m * K + k] * B[k * N + n]; } } } -template +template int init_and_multiply() { Ta A[MATRIX_M][MATRIX_K]; Ta B[MATRIX_K][MATRIX_N]; @@ -117,17 +119,17 @@ int init_and_multiply() { for (int i = 0; i < MATRIX_M; i++) { for (int j = 0; j < MATRIX_K; j++) { if (std::is_same_v && std::is_same_v) - A[i][j] = bfloat16(1.0f * (i + j)); + A[i][j] = bfloat16(1.0f * (i + j)); if (std::is_same_v && std::is_same_v) - A[i][j] = i + j; + A[i][j] = i + j; } } for (int i = 0; i < MATRIX_K; i++) { for (int j = 0; j < MATRIX_N; j++) { if (std::is_same_v && std::is_same_v) - B[i][j] = bfloat16(2.0f * i + 3.0f * j); + B[i][j] = bfloat16(2.0f * i + 3.0f * j); if (std::is_same_v && std::is_same_v) - B[i][j] = i + 2 * j; + B[i][j] = i + 2 * j; } } for (int i = 0; i < MATRIX_M; i++) { @@ -141,27 +143,29 @@ int init_and_multiply() { big_matrix MD((Tc *)&D); big_matrix MA((Ta *)&A); matrix_vnni(MATRIX_K, MATRIX_N, (Ta *)&B, (Ta *)&Bvnni, vnni_factor); - big_matrix MBvnni((Ta *)&Bvnni); - - matrix_multiply(MC, MA, MBvnni); - matrix_multiply_ref((Ta *)A, (Ta *)B, (Tc *)D, MATRIX_M, - MATRIX_N, MATRIX_K); + big_matrix MBvnni( + (Ta *)&Bvnni); + + matrix_multiply(MC, MA, MBvnni); + matrix_multiply_ref((Ta *)A, (Ta *)B, (Tc *)D, MATRIX_M, MATRIX_N, MATRIX_K); bool res = true; for (int i = 0; i < MATRIX_M; i++) { for (int j = 0; j < MATRIX_N; j++) { - if constexpr(std::is_same_v && std::is_same_v) { - if ((fabs(C[i][j]) - fabs(D[i][j])) > BF16_EPSILON) { - std::cout << (res ? "passed" : "failed bfloat ") << C[i][j] <<" D is " << D[i][j]<< std::endl; - res = false; - } - } - else if (std::is_same_v && std::is_same_v) { - if (C[i][j] != D[i][j]) { - std::cout << (res ? "passed" : "failed") << C[i][j] <<" D is " << D[i][j]<< std::endl; - - res = false; - } + if constexpr (std::is_same_v && std::is_same_v) { + if ((fabs(C[i][j]) - fabs(D[i][j])) > BF16_EPSILON) { + std::cout << (res ? "passed" : "failed bfloat ") << C[i][j] + << " D is " << D[i][j] << std::endl; + res = false; + } + } else if (std::is_same_v && std::is_same_v) { + if (C[i][j] != D[i][j]) { + std::cout << (res ? 
"passed" : "failed") << C[i][j] << " D is " + << D[i][j] << std::endl; + + res = false; + } } } } @@ -170,22 +174,22 @@ int init_and_multiply() { } int main() { - //init_and_multiply(); - //init_and_multiply(); - //init_and_multiply(); + // init_and_multiply(); + // init_and_multiply(); + // init_and_multiply(); init_and_multiply(); - //init_and_multiply(); - //init_and_multiply(); - //init_and_multiply(); - //init_and_multiply(); - - //init_and_multiply(); - //init_and_multiply(); - //init_and_multiply(); - //init_and_multiply(); - //init_and_multiply(); - //init_and_multiply(); - //init_and_multiply(); - //init_and_multiply(); + // init_and_multiply(); + // init_and_multiply(); + // init_and_multiply(); + // init_and_multiply(); + + // init_and_multiply(); + // init_and_multiply(); + // init_and_multiply(); + // init_and_multiply(); + // init_and_multiply(); + // init_and_multiply(); + // init_and_multiply(); + // init_and_multiply(); return 0; } From c5e5e6fc6a73bb492b07f6c96006c7e0fafbe219 Mon Sep 17 00:00:00 2001 From: Dounia Date: Tue, 24 Jan 2023 07:24:04 -0800 Subject: [PATCH 3/8] Uncomment the rest of the calls --- SYCL/Matrix/joint_matrix_all_sizes_impl.hpp | 32 ++++++++++----------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/SYCL/Matrix/joint_matrix_all_sizes_impl.hpp b/SYCL/Matrix/joint_matrix_all_sizes_impl.hpp index 883f3477d2..0884a4ad77 100644 --- a/SYCL/Matrix/joint_matrix_all_sizes_impl.hpp +++ b/SYCL/Matrix/joint_matrix_all_sizes_impl.hpp @@ -174,22 +174,22 @@ int init_and_multiply() { } int main() { - // init_and_multiply(); - // init_and_multiply(); - // init_and_multiply(); + init_and_multiply(); + init_and_multiply(); + init_and_multiply(); init_and_multiply(); - // init_and_multiply(); - // init_and_multiply(); - // init_and_multiply(); - // init_and_multiply(); - - // init_and_multiply(); - // init_and_multiply(); - // init_and_multiply(); - // init_and_multiply(); - // init_and_multiply(); - // init_and_multiply(); - // init_and_multiply(); - // init_and_multiply(); + init_and_multiply(); + init_and_multiply(); + init_and_multiply(); + init_and_multiply(); + + init_and_multiply(); + init_and_multiply(); + init_and_multiply(); + init_and_multiply(); + init_and_multiply(); + init_and_multiply(); + init_and_multiply(); + init_and_multiply(); return 0; } From 642d1c48adc5f89e6b4106dfa31bc7ea3870db43 Mon Sep 17 00:00:00 2001 From: Dounia Date: Tue, 28 Feb 2023 09:28:55 -0800 Subject: [PATCH 4/8] correct the verification code --- SYCL/Matrix/joint_matrix_all_sizes_impl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SYCL/Matrix/joint_matrix_all_sizes_impl.hpp b/SYCL/Matrix/joint_matrix_all_sizes_impl.hpp index 0884a4ad77..36072f983e 100644 --- a/SYCL/Matrix/joint_matrix_all_sizes_impl.hpp +++ b/SYCL/Matrix/joint_matrix_all_sizes_impl.hpp @@ -154,7 +154,7 @@ int init_and_multiply() { for (int i = 0; i < MATRIX_M; i++) { for (int j = 0; j < MATRIX_N; j++) { if constexpr (std::is_same_v && std::is_same_v) { - if ((fabs(C[i][j]) - fabs(D[i][j])) > BF16_EPSILON) { + if (fabs(C[i][j] - D[i][j]) > BF16_EPSILON) { std::cout << (res ? 
"passed" : "failed bfloat ") << C[i][j] << " D is " << D[i][j] << std::endl; res = false; From b581424cf15c7c6b20d6a64aa3e27a48fe640c39 Mon Sep 17 00:00:00 2001 From: Yury Plyakhin Date: Thu, 2 Mar 2023 12:19:40 -0800 Subject: [PATCH 5/8] fixed M size of the big matrix to be multiplier of M size of joint matrix --- SYCL/Matrix/joint_matrix_all_sizes_impl.hpp | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/SYCL/Matrix/joint_matrix_all_sizes_impl.hpp b/SYCL/Matrix/joint_matrix_all_sizes_impl.hpp index 36072f983e..8e93592e7c 100644 --- a/SYCL/Matrix/joint_matrix_all_sizes_impl.hpp +++ b/SYCL/Matrix/joint_matrix_all_sizes_impl.hpp @@ -1,4 +1,5 @@ #define BF16_EPSILON 0.00781250 +static constexpr size_t M_MULTIPLIER = 16; template struct big_matrix { private: @@ -83,7 +84,6 @@ void matrix_multiply(big_matrix &C, big_matrix &A, }).wait(); } -static constexpr size_t MATRIX_M = 128; static constexpr size_t MATRIX_N = 128; static constexpr size_t MATRIX_K = 128; @@ -110,6 +110,10 @@ void matrix_multiply_ref(Ta *A, Ta *B, Tc *C, int M, int N, int K) { template int init_and_multiply() { + + static constexpr size_t MATRIX_M = tM * M_MULTIPLIER; + std::cout << "MATRIX_M=" << MATRIX_M << "\n"; + Ta A[MATRIX_M][MATRIX_K]; Ta B[MATRIX_K][MATRIX_N]; Ta Bvnni[MATRIX_K / vnni_factor][MATRIX_N * vnni_factor]; @@ -155,16 +159,15 @@ int init_and_multiply() { for (int j = 0; j < MATRIX_N; j++) { if constexpr (std::is_same_v && std::is_same_v) { if (fabs(C[i][j] - D[i][j]) > BF16_EPSILON) { - std::cout << (res ? "passed" : "failed bfloat ") << C[i][j] - << " D is " << D[i][j] << std::endl; res = false; + std::cout << "Failed bfloat16: C is " << C[i][j] + << ", D is " << D[i][j] << std::endl; } } else if (std::is_same_v && std::is_same_v) { if (C[i][j] != D[i][j]) { - std::cout << (res ? "passed" : "failed") << C[i][j] << " D is " - << D[i][j] << std::endl; - res = false; + std::cout << "Failed int8_t: C is " << C[i][j] + << ", D is " << D[i][j] << std::endl; } } } From c6d6053bf694831e975e4333ada9f10961a163fa Mon Sep 17 00:00:00 2001 From: Yury Plyakhin Date: Thu, 9 Mar 2023 11:06:11 -0800 Subject: [PATCH 6/8] [SYCL][Matrix][NFC] updated formatting Updated formatting. --- SYCL/Matrix/joint_matrix_all_sizes_impl.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/SYCL/Matrix/joint_matrix_all_sizes_impl.hpp b/SYCL/Matrix/joint_matrix_all_sizes_impl.hpp index 8e93592e7c..aed61bd8aa 100644 --- a/SYCL/Matrix/joint_matrix_all_sizes_impl.hpp +++ b/SYCL/Matrix/joint_matrix_all_sizes_impl.hpp @@ -160,14 +160,14 @@ int init_and_multiply() { if constexpr (std::is_same_v && std::is_same_v) { if (fabs(C[i][j] - D[i][j]) > BF16_EPSILON) { res = false; - std::cout << "Failed bfloat16: C is " << C[i][j] - << ", D is " << D[i][j] << std::endl; + std::cout << "Failed bfloat16: C is " << C[i][j] << ", D is " + << D[i][j] << std::endl; } } else if (std::is_same_v && std::is_same_v) { if (C[i][j] != D[i][j]) { res = false; - std::cout << "Failed int8_t: C is " << C[i][j] - << ", D is " << D[i][j] << std::endl; + std::cout << "Failed int8_t: C is " << C[i][j] << ", D is " << D[i][j] + << std::endl; } } } From 302efe48f2503f49dd8108f4264dcff74d687ca8 Mon Sep 17 00:00:00 2001 From: Yury Plyakhin Date: Fri, 10 Mar 2023 15:46:24 -0800 Subject: [PATCH 7/8] [SYCL][Matrix] Updated tests to pass on GPU Updated tests to pass on GPU, splitting compilation of each kernel. Added new environment variables for different optimization modes Added XMX8 variant. 
--- .../XMX8/joint_matrix_all_sizes_bfloat16.cpp | 62 +++++++++++++++++++ .../XMX8/joint_matrix_all_sizes_int8.cpp | 62 +++++++++++++++++++ SYCL/Matrix/joint_matrix_all_sizes.cpp | 25 -------- .../joint_matrix_all_sizes_bfloat16.cpp | 62 +++++++++++++++++++ SYCL/Matrix/joint_matrix_all_sizes_impl.hpp | 32 ++++++++++ SYCL/Matrix/joint_matrix_all_sizes_int8.cpp | 62 +++++++++++++++++++ 6 files changed, 280 insertions(+), 25 deletions(-) create mode 100644 SYCL/Matrix/XMX8/joint_matrix_all_sizes_bfloat16.cpp create mode 100644 SYCL/Matrix/XMX8/joint_matrix_all_sizes_int8.cpp delete mode 100644 SYCL/Matrix/joint_matrix_all_sizes.cpp create mode 100644 SYCL/Matrix/joint_matrix_all_sizes_bfloat16.cpp create mode 100644 SYCL/Matrix/joint_matrix_all_sizes_int8.cpp diff --git a/SYCL/Matrix/XMX8/joint_matrix_all_sizes_bfloat16.cpp b/SYCL/Matrix/XMX8/joint_matrix_all_sizes_bfloat16.cpp new file mode 100644 index 0000000000..f16d2e1f9a --- /dev/null +++ b/SYCL/Matrix/XMX8/joint_matrix_all_sizes_bfloat16.cpp @@ -0,0 +1,62 @@ +//==-------- joint_matrix_all_sizes.cpp - DPC++ joint_matrix---------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: matrix-xmx8 + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -DJOINT_MATRIX_TEST_ALL_BF16 +// RUN: %CPU_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -DJOINT_MATRIX_TEST_BF16_1 +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -DJOINT_MATRIX_TEST_BF16_2 +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -DJOINT_MATRIX_TEST_BF16_3 +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -DJOINT_MATRIX_TEST_BF16_4 +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -DJOINT_MATRIX_TEST_BF16_5 +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -DJOINT_MATRIX_TEST_BF16_6 +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -DJOINT_MATRIX_TEST_BF16_7 +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out + +// RUN: 
%clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -DJOINT_MATRIX_TEST_BF16_8 +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out + +#include +#include + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental::matrix; +using bfloat16 = sycl::ext::oneapi::bfloat16; + +#define SG_SZ 8 + +#include "../joint_matrix_all_sizes_impl.hpp" diff --git a/SYCL/Matrix/XMX8/joint_matrix_all_sizes_int8.cpp b/SYCL/Matrix/XMX8/joint_matrix_all_sizes_int8.cpp new file mode 100644 index 0000000000..549337f0f1 --- /dev/null +++ b/SYCL/Matrix/XMX8/joint_matrix_all_sizes_int8.cpp @@ -0,0 +1,62 @@ +//==-------- joint_matrix_all_sizes.cpp - DPC++ joint_matrix---------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: matrix-xmx8 + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -DJOINT_MATRIX_TEST_ALL_INT8 +// RUN: %CPU_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -DJOINT_MATRIX_TEST_INT8_1 +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -DJOINT_MATRIX_TEST_INT8_2 +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -DJOINT_MATRIX_TEST_INT8_3 +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -DJOINT_MATRIX_TEST_INT8_4 +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -DJOINT_MATRIX_TEST_INT8_5 +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -DJOINT_MATRIX_TEST_INT8_6 +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -DJOINT_MATRIX_TEST_INT8_7 +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -DJOINT_MATRIX_TEST_INT8_8 +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out + +#include +#include + 
+using namespace sycl; +using namespace sycl::ext::oneapi::experimental::matrix; +using bfloat16 = sycl::ext::oneapi::bfloat16; + +#define SG_SZ 8 + +#include "../joint_matrix_all_sizes_impl.hpp" diff --git a/SYCL/Matrix/joint_matrix_all_sizes.cpp b/SYCL/Matrix/joint_matrix_all_sizes.cpp deleted file mode 100644 index 197ffdf48b..0000000000 --- a/SYCL/Matrix/joint_matrix_all_sizes.cpp +++ /dev/null @@ -1,25 +0,0 @@ -//==-------- joint_matrix_all_sizes.cpp - DPC++ joint_matrix---------------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// REQUIRES: matrix - -// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -// RUN: %CPU_RUN_PLACEHOLDER %t.out -// RUN: %GPU_RUN_PLACEHOLDER %t.out - -// XFAIL: gpu - -#include -#include - -using namespace sycl; -using namespace sycl::ext::oneapi::experimental::matrix; -using bfloat16 = sycl::ext::oneapi::bfloat16; - -#define SG_SZ 16 - -#include "joint_matrix_all_sizes_impl.hpp" diff --git a/SYCL/Matrix/joint_matrix_all_sizes_bfloat16.cpp b/SYCL/Matrix/joint_matrix_all_sizes_bfloat16.cpp new file mode 100644 index 0000000000..db87153d41 --- /dev/null +++ b/SYCL/Matrix/joint_matrix_all_sizes_bfloat16.cpp @@ -0,0 +1,62 @@ +//==-------- joint_matrix_all_sizes.cpp - DPC++ joint_matrix---------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: matrix + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -DJOINT_MATRIX_TEST_ALL_BF16 +// RUN: %CPU_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -DJOINT_MATRIX_TEST_BF16_1 +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -DJOINT_MATRIX_TEST_BF16_2 +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -DJOINT_MATRIX_TEST_BF16_3 +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -DJOINT_MATRIX_TEST_BF16_4 +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -DJOINT_MATRIX_TEST_BF16_5 +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -DJOINT_MATRIX_TEST_BF16_6 +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env 
IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -DJOINT_MATRIX_TEST_BF16_7 +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -DJOINT_MATRIX_TEST_BF16_8 +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out + +#include +#include + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental::matrix; +using bfloat16 = sycl::ext::oneapi::bfloat16; + +#define SG_SZ 16 + +#include "joint_matrix_all_sizes_impl.hpp" diff --git a/SYCL/Matrix/joint_matrix_all_sizes_impl.hpp b/SYCL/Matrix/joint_matrix_all_sizes_impl.hpp index aed61bd8aa..48266165db 100644 --- a/SYCL/Matrix/joint_matrix_all_sizes_impl.hpp +++ b/SYCL/Matrix/joint_matrix_all_sizes_impl.hpp @@ -177,22 +177,54 @@ int init_and_multiply() { } int main() { +#if defined(JOINT_MATRIX_TEST_BF16_1) || defined(JOINT_MATRIX_TEST_ALL_BF16) init_and_multiply(); +#endif +#if defined(JOINT_MATRIX_TEST_BF16_2) || defined(JOINT_MATRIX_TEST_ALL_BF16) init_and_multiply(); +#endif +#if defined(JOINT_MATRIX_TEST_BF16_3) || defined(JOINT_MATRIX_TEST_ALL_BF16) init_and_multiply(); +#endif +#if defined(JOINT_MATRIX_TEST_BF16_4) || defined(JOINT_MATRIX_TEST_ALL_BF16) init_and_multiply(); +#endif +#if defined(JOINT_MATRIX_TEST_BF16_5) || defined(JOINT_MATRIX_TEST_ALL_BF16) init_and_multiply(); +#endif +#if defined(JOINT_MATRIX_TEST_BF16_6) || defined(JOINT_MATRIX_TEST_ALL_BF16) init_and_multiply(); +#endif +#if defined(JOINT_MATRIX_TEST_BF16_7) || defined(JOINT_MATRIX_TEST_ALL_BF16) init_and_multiply(); +#endif +#if defined(JOINT_MATRIX_TEST_BF16_8) || defined(JOINT_MATRIX_TEST_ALL_BF16) init_and_multiply(); +#endif +#if defined(JOINT_MATRIX_TEST_INT8_1) || defined(JOINT_MATRIX_TEST_ALL_INT8) init_and_multiply(); +#endif +#if defined(JOINT_MATRIX_TEST_INT8_2) || defined(JOINT_MATRIX_TEST_ALL_INT8) init_and_multiply(); +#endif +#if defined(JOINT_MATRIX_TEST_INT8_3) || defined(JOINT_MATRIX_TEST_ALL_INT8) init_and_multiply(); +#endif +#if defined(JOINT_MATRIX_TEST_INT8_4) || defined(JOINT_MATRIX_TEST_ALL_INT8) init_and_multiply(); +#endif +#if defined(JOINT_MATRIX_TEST_INT8_5) || defined(JOINT_MATRIX_TEST_ALL_INT8) init_and_multiply(); +#endif +#if defined(JOINT_MATRIX_TEST_INT8_6) || defined(JOINT_MATRIX_TEST_ALL_INT8) init_and_multiply(); +#endif +#if defined(JOINT_MATRIX_TEST_INT8_7) || defined(JOINT_MATRIX_TEST_ALL_INT8) init_and_multiply(); +#endif +#if defined(JOINT_MATRIX_TEST_INT8_8) || defined(JOINT_MATRIX_TEST_ALL_INT8) init_and_multiply(); +#endif return 0; } diff --git a/SYCL/Matrix/joint_matrix_all_sizes_int8.cpp b/SYCL/Matrix/joint_matrix_all_sizes_int8.cpp new file mode 100644 index 0000000000..fa227ee3bd --- /dev/null +++ b/SYCL/Matrix/joint_matrix_all_sizes_int8.cpp @@ -0,0 +1,62 @@ +//==-------- joint_matrix_all_sizes.cpp - DPC++ joint_matrix---------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: matrix + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -DJOINT_MATRIX_TEST_ALL_INT8 +// RUN: %CPU_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -DJOINT_MATRIX_TEST_INT8_1 +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -DJOINT_MATRIX_TEST_INT8_2 +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -DJOINT_MATRIX_TEST_INT8_3 +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -DJOINT_MATRIX_TEST_INT8_4 +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -DJOINT_MATRIX_TEST_INT8_5 +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -DJOINT_MATRIX_TEST_INT8_6 +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -DJOINT_MATRIX_TEST_INT8_7 +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out + +// RUN: %clangxx -fsycl %s -o %t.out -DSYCL_EXT_ONEAPI_MATRIX_VERSION=4 -DJOINT_MATRIX_TEST_INT8_8 +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=0 %GPU_RUN_PLACEHOLDER %t.out +// RUN: env IGC_JointMatrixLoadStoreOpt=1 %GPU_RUN_PLACEHOLDER %t.out + +#include +#include + +using namespace sycl; +using namespace sycl::ext::oneapi::experimental::matrix; +using bfloat16 = sycl::ext::oneapi::bfloat16; + +#define SG_SZ 16 + +#include "joint_matrix_all_sizes_impl.hpp" From 163ebf94ba5812ad839e0710ad60f7eb09dbe51b Mon Sep 17 00:00:00 2001 From: "Plyakhin, Yury" Date: Thu, 16 Mar 2023 17:51:53 +0000 Subject: [PATCH 8/8] [SYCL][Matrix] Fixed size N - it should be different depending on architecture --- SYCL/Matrix/joint_matrix_all_sizes_impl.hpp | 32 ++++++++++----------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/SYCL/Matrix/joint_matrix_all_sizes_impl.hpp b/SYCL/Matrix/joint_matrix_all_sizes_impl.hpp index 48266165db..f379f43c27 100644 --- a/SYCL/Matrix/joint_matrix_all_sizes_impl.hpp +++ b/SYCL/Matrix/joint_matrix_all_sizes_impl.hpp @@ -178,53 +178,53 @@ int init_and_multiply() { int main() { #if defined(JOINT_MATRIX_TEST_BF16_1) || defined(JOINT_MATRIX_TEST_ALL_BF16) - init_and_multiply(); + init_and_multiply(); 
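+  // Each variant now uses an N tile size chosen for the target architecture
+  // (the XMX8 drivers build with SG_SZ 8, the other drivers with SG_SZ 16).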
#endif #if defined(JOINT_MATRIX_TEST_BF16_2) || defined(JOINT_MATRIX_TEST_ALL_BF16) - init_and_multiply(); + init_and_multiply(); #endif #if defined(JOINT_MATRIX_TEST_BF16_3) || defined(JOINT_MATRIX_TEST_ALL_BF16) - init_and_multiply(); + init_and_multiply(); #endif #if defined(JOINT_MATRIX_TEST_BF16_4) || defined(JOINT_MATRIX_TEST_ALL_BF16) - init_and_multiply(); + init_and_multiply(); #endif #if defined(JOINT_MATRIX_TEST_BF16_5) || defined(JOINT_MATRIX_TEST_ALL_BF16) - init_and_multiply(); + init_and_multiply(); #endif #if defined(JOINT_MATRIX_TEST_BF16_6) || defined(JOINT_MATRIX_TEST_ALL_BF16) - init_and_multiply(); + init_and_multiply(); #endif #if defined(JOINT_MATRIX_TEST_BF16_7) || defined(JOINT_MATRIX_TEST_ALL_BF16) - init_and_multiply(); + init_and_multiply(); #endif #if defined(JOINT_MATRIX_TEST_BF16_8) || defined(JOINT_MATRIX_TEST_ALL_BF16) - init_and_multiply(); + init_and_multiply(); #endif #if defined(JOINT_MATRIX_TEST_INT8_1) || defined(JOINT_MATRIX_TEST_ALL_INT8) - init_and_multiply(); + init_and_multiply(); #endif #if defined(JOINT_MATRIX_TEST_INT8_2) || defined(JOINT_MATRIX_TEST_ALL_INT8) - init_and_multiply(); + init_and_multiply(); #endif #if defined(JOINT_MATRIX_TEST_INT8_3) || defined(JOINT_MATRIX_TEST_ALL_INT8) - init_and_multiply(); + init_and_multiply(); #endif #if defined(JOINT_MATRIX_TEST_INT8_4) || defined(JOINT_MATRIX_TEST_ALL_INT8) - init_and_multiply(); + init_and_multiply(); #endif #if defined(JOINT_MATRIX_TEST_INT8_5) || defined(JOINT_MATRIX_TEST_ALL_INT8) - init_and_multiply(); + init_and_multiply(); #endif #if defined(JOINT_MATRIX_TEST_INT8_6) || defined(JOINT_MATRIX_TEST_ALL_INT8) - init_and_multiply(); + init_and_multiply(); #endif #if defined(JOINT_MATRIX_TEST_INT8_7) || defined(JOINT_MATRIX_TEST_ALL_INT8) - init_and_multiply(); + init_and_multiply(); #endif #if defined(JOINT_MATRIX_TEST_INT8_8) || defined(JOINT_MATRIX_TEST_ALL_INT8) - init_and_multiply(); + init_and_multiply(); #endif return 0; }