Skip to content

Add vector registers to clobber list to prevent compiler optimization. #5203

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions driver/others/dynamic_arm64.c
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ extern gotoblas_t gotoblas_A64FX;
#ifndef NO_SME
extern gotoblas_t gotoblas_ARMV9SME;
#else
#define gotoblas_ARMV9SME gotoblas_ARMV8SVE
#define gotoblas_ARMV9SME gotoblas_ARMV8
#endif

extern gotoblas_t gotoblas_THUNDERX3T110;
Expand All @@ -168,7 +168,7 @@ extern void openblas_warning(int verbose, const char * msg);
#define FALLBACK_VERBOSE 1
#define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n"

#define NUM_CORETYPES 18
#define NUM_CORETYPES 19

/*
* In case asm/hwcap.h is outdated on the build system, make sure
Expand Down Expand Up @@ -207,6 +207,7 @@ static char *corename[] = {
"cortexa55",
"armv8sve",
"a64fx",
"armv9sme"
"unknown"
};

Expand All @@ -229,6 +230,7 @@ char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_CORTEXA55) return corename[15];
if (gotoblas == &gotoblas_ARMV8SVE) return corename[16];
if (gotoblas == &gotoblas_A64FX) return corename[17];
if (gotoblas == &gotoblas_ARMV9SME) return corename[18];
return corename[NUM_CORETYPES];
}

Expand Down Expand Up @@ -266,6 +268,7 @@ static gotoblas_t *force_coretype(char *coretype) {
case 15: return (&gotoblas_CORTEXA55);
case 16: return (&gotoblas_ARMV8SVE);
case 17: return (&gotoblas_A64FX);
case 18: return (&gotoblas_ARMV9SME);
}
snprintf(message, 128, "Core not found: %s\n", coretype);
openblas_warning(1, message);
Expand Down Expand Up @@ -446,7 +449,7 @@ static gotoblas_t *get_coretype(void) {
openblas_warning(1, coremsg);
}

#if !defined(NO_SME) && defined(HWCAP2_SME)
#ifndef NO_SME
if ((getauxval(AT_HWCAP2) & HWCAP2_SME)) {
return &gotoblas_ARMV9SME;
}
Expand Down
21 changes: 18 additions & 3 deletions kernel/arm64/sgemm_direct_arm64_sme1.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
#include <stdlib.h>
#include <inttypes.h>
#include <math.h>

#if defined(HAVE_SME)

/* Function prototypes */
Expand Down Expand Up @@ -44,15 +43,31 @@ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A,\
m_mod = ceil((double)M/(double)vl_elms) * vl_elms;

float *A_mod = (float *) malloc(m_mod*K*sizeof(float));


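/* Compiler barrier for the p0-p15 and vector (z0-z31) registers:
* listing them as clobbers of an empty asm statement stops the
* compiler from keeping values cached in those registers across
* this point, so they are re-read from memory instead.
* */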
asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7",
"p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15",
"z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7",
"z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15",
"z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23",
"z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31");

/* Pre-process the left matrix (A) into A_mod so that it is
suitable for the sum-of-outer-products matrix calculation
*/
sgemm_direct_sme1_preprocess(M, K, A, A_mod);

/* Calculate C = A*B */
sgemm_direct_sme1_2VLx2VL(M, K, N, A_mod, B, R);


asm volatile("" : : :"p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7",
"p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15",
"z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7",
"z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15",
"z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23",
"z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31");
free(A_mod);
}

Expand Down
Loading