From 04915be8295f0c28723f9dd6a69fc55a6f680118 Mon Sep 17 00:00:00 2001 From: Vaisakh K V Date: Thu, 3 Apr 2025 12:18:43 +0530 Subject: [PATCH 1/2] Add vector registers to clobber list to prevent compiler optimization. The SME-based SGEMMDIRECT kernel uses the vector (z) registers, and adding them to the clobber list informs the compiler not to optimize these registers. --- kernel/arm64/sgemm_direct_arm64_sme1.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/kernel/arm64/sgemm_direct_arm64_sme1.c b/kernel/arm64/sgemm_direct_arm64_sme1.c index bd7e548894..50c2a9a2dc 100644 --- a/kernel/arm64/sgemm_direct_arm64_sme1.c +++ b/kernel/arm64/sgemm_direct_arm64_sme1.c @@ -7,7 +7,6 @@ #include #include #include - #if defined(HAVE_SME) /* Function prototypes */ @@ -44,7 +43,17 @@ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\ m_mod = ceil((double)M/(double)vl_elms) * vl_elms; float *A_mod = (float *) malloc(m_mod*K*sizeof(float)); - + + /* Prevent compiler optimization by reading from memory instead + * of reading directly from vector (z) registers. 
+ * */ + asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", + "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", + "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", + "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", + "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", + "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"); + /* Pre-process the left matrix to make it suitable for matrix sum of outer-product calculation */ @@ -52,7 +61,13 @@ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\ /* Calculate C = A*B */ sgemm_direct_sme1_2VLx2VL(M, K, N, A_mod, B, R); - + + asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", + "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15", + "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", + "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", + "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", + "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"); free(A_mod); } From 4873c95f3656a2e98b4c2b5597227619b4e0d7b8 Mon Sep 17 00:00:00 2001 From: Vaisakh K V Date: Fri, 4 Apr 2025 11:12:50 +0530 Subject: [PATCH 2/2] Add ARMV9SME to AArch64 Dynamic Dispatch --- driver/others/dynamic_arm64.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 31821ae789..6243b02d59 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -157,7 +157,7 @@ extern gotoblas_t gotoblas_A64FX; #ifndef NO_SME extern gotoblas_t gotoblas_ARMV9SME; #else -#define gotoblas_ARMV9SME gotoblas_ARMV8SVE +#define gotoblas_ARMV9SME gotoblas_ARMV8 #endif extern gotoblas_t gotoblas_THUNDERX3T110; @@ -168,7 +168,7 @@ extern void openblas_warning(int verbose, const char * msg); #define FALLBACK_VERBOSE 1 #define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. 
OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n" -#define NUM_CORETYPES 18 +#define NUM_CORETYPES 19 /* * In case asm/hwcap.h is outdated on the build system, make sure @@ -207,6 +207,7 @@ static char *corename[] = { "cortexa55", "armv8sve", "a64fx", + "armv9sme", "unknown" }; @@ -229,6 +230,7 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_CORTEXA55) return corename[15]; if (gotoblas == &gotoblas_ARMV8SVE) return corename[16]; if (gotoblas == &gotoblas_A64FX) return corename[17]; + if (gotoblas == &gotoblas_ARMV9SME) return corename[18]; return corename[NUM_CORETYPES]; } @@ -266,6 +268,7 @@ static gotoblas_t *force_coretype(char *coretype) { case 15: return (&gotoblas_CORTEXA55); case 16: return (&gotoblas_ARMV8SVE); case 17: return (&gotoblas_A64FX); + case 18: return (&gotoblas_ARMV9SME); } snprintf(message, 128, "Core not found: %s\n", coretype); openblas_warning(1, message); @@ -446,7 +449,7 @@ static gotoblas_t *get_coretype(void) { openblas_warning(1, coremsg); } -#if !defined(NO_SME) && defined(HWCAP2_SME) +#ifndef NO_SME if ((getauxval(AT_HWCAP2) & HWCAP2_SME)) { return &gotoblas_ARMV9SME; }