
Commit 6b27f17

LoongArch64: Fixed LASX version of cscal and zscal
1 parent 60fd286 commit 6b27f17

File tree

1 file changed: +253 -35 lines changed


kernel/loongarch64/cscal_lasx.S

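For reference, cscal/zscal compute x[k] = (alpha_r + i*alpha_i) * x[k] over N complex elements with stride INCX. Below is a minimal C sketch of that semantics and of the special cases the alpha_r/alpha_i checks in this diff dispatch on; the label mapping in the comments is read from the diff, while the function name, the dummy2 flag name, and the float-only signature are illustrative assumptions, not the kernel's actual interface.

    #include <stddef.h>

    /* Reference sketch only: x[k] := (alpha_r + i*alpha_i) * x[k].
     * Names (cscal_ref, dummy2) are illustrative, not OpenBLAS API. */
    void cscal_ref(size_t n, float alpha_r, float alpha_i,
                   float *x, size_t incx, int dummy2)
    {
        for (size_t k = 0; k < n; k++, x += 2 * incx) {
            float xr = x[0], xi = x[1];
            if (dummy2) {
                /* called from c/zscal per the diff comment: always do the
                 * full complex multiply (.L17 / .L25 / .L998 paths) */
                x[0] = alpha_r * xr - alpha_i * xi;
                x[1] = alpha_r * xi + alpha_i * xr;
            } else if (alpha_r == 0.0f && alpha_i == 0.0f) {
                x[0] = 0.0f;                         /* .L15 / .L27 / .L995 */
                x[1] = 0.0f;
            } else if (alpha_i == 0.0f) {
                x[0] = alpha_r * xr;                 /* .L16 / .L26 / .L997 */
                x[1] = alpha_r * xi;
            } else if (alpha_r == 0.0f) {
                x[0] = -alpha_i * xi;                /* .L18 / .L28 / .L996 */
                x[1] =  alpha_i * xr;
            } else {
                x[0] = alpha_r * xr - alpha_i * xi;  /* .L17 / .L25 / .L998 */
                x[1] = alpha_r * xi + alpha_i * xr;
            }
        }
    }

As the diff reads, the new DUMMY2 argument (loaded from the stack) forces the full complex-multiply path regardless of alpha, and the .L995-.L998 scalar loops handle the remaining N % 4 (double) or N % 8 (single) elements.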
@@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ALPHAI $f1
 #define X $r7
 #define INCX $r8
+#define DUMMY2 $r9

 #define I $r12
 #define TEMP $r13
@@ -65,6 +66,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 bge $r0, N, .L999
 bge $r0, INCX, .L999
+ld.d DUMMY2, $sp, 0
 li.d TEMP, 1
 movgr2fr.d a1, $r0
 FFINT a1, a1
@@ -86,24 +88,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 bne INCX, TEMP, .L22

+/////// INCX == 1 ////////
 .L11:
-bge $r0, I, .L997
 CMPEQ $fcc0, ALPHAR, a1
 CMPEQ $fcc1, ALPHAI, a1
+bge $r0, I, .L19
+/////// INCX == 1 && N >= 4 ////////
+bnez DUMMY2, .L17 // if DUMMY2 == 1, called from c/zscal.
+
 bceqz $fcc0, .L13
 b .L14
 .align 3

 .L13:
-bceqz $fcc1, .L114 //alpha_r != 0.0 && alpha_i != 0.0
-b .L113 //alpha_r != 0.0 && alpha_i == 0.0
+bceqz $fcc1, .L17 //alpha_r != 0.0 && alpha_i != 0.0
+b .L16 //alpha_r != 0.0 && alpha_i == 0.0

 .L14:
-bceqz $fcc1, .L114 //alpha_r == 0.0 && alpha_i != 0.0
-b .L111 //alpha_r == 0.0 && alpha_i == 0.0
+bceqz $fcc1, .L18 //alpha_r == 0.0 && alpha_i != 0.0
+b .L15 //alpha_r == 0.0 && alpha_i == 0.0
 .align 3

-.L111: //alpha_r == 0.0 && alpha_i == 0.0
+.L15: //alpha_r == 0.0 && alpha_i == 0.0
 xvst VXZ, X, 0 * SIZE
 #ifdef DOUBLE
 xvst VXZ, X, 4 * SIZE
@@ -113,11 +119,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 addi.d X, X, 16 * SIZE
 #endif
 addi.d I, I, -1
-blt $r0, I, .L111
-b .L997
+blt $r0, I, .L15
+b .L19
 .align 3

-.L113: //alpha_r != 0.0 && alpha_i == 0.0
+.L16: //alpha_r != 0.0 && alpha_i == 0.0
 xvld VX0, X, 0 * SIZE
 #ifdef DOUBLE
 xvld VX1, X, 4 * SIZE
@@ -143,11 +149,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 addi.d X, X, 16 * SIZE
 #endif
 addi.d I, I, -1
-blt $r0, I, .L113
-b .L997
+blt $r0, I, .L16
+b .L19
 .align 3

-.L114: //alpha_r != 0.0 && alpha_i != 0.0
+.L17: //alpha_r != 0.0 && alpha_i != 0.0
 xvld VX0, X, 0 * SIZE
 #ifdef DOUBLE
 xvld VX1, X, 4 * SIZE
@@ -177,29 +183,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 addi.d X, X, 16 * SIZE
 #endif
 addi.d I, I, -1
-blt $r0, I, .L114
-b .L997
+blt $r0, I, .L17
+b .L19
+.align 3
+
+.L18: //alpha_r == 0.0 && alpha_i != 0.0
+xvld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+xvld VX1, X, 4 * SIZE
+xvpickev.d x1, VX1, VX0
+xvpickod.d x2, VX1, VX0
+xvfmul.d x3, VXAI, x2
+xvfsub.d x3, VXZ, x3
+xvfmul.d x4, VXAI, x1
+xvilvl.d VX2, x4 ,x3
+xvilvh.d VX3, x4, x3
+xvst VX2, X, 0 * SIZE
+xvst VX3, X, 4 * SIZE
+addi.d X, X, 8 * SIZE
+#else
+xvld VX1, X, 8 * SIZE
+xvpickev.w x1, VX1, VX0
+xvpickod.w x2, VX1, VX0
+xvfmul.s x3, VXAI, x2
+xvfsub.s x3, VXZ, x3
+xvfmul.s x4, VXAI, x1
+xvilvl.w VX2, x4 ,x3
+xvilvh.w VX3, x4, x3
+xvst VX2, X, 0 * SIZE
+xvst VX3, X, 8 * SIZE
+addi.d X, X, 16 * SIZE
+#endif
+addi.d I, I, -1
+blt $r0, I, .L18
+b .L19
+.align 3
+
+/////// INCX == 1 && N < 8 ///////
+.L19:
+#ifdef DOUBLE
+andi I, N, 3
+#else
+andi I, N, 7
+#endif
+beqz I, .L999
+bnez DUMMY2, .L998 // if DUMMY2 == 1, called from c/zscal.
+
+bceqz $fcc0, .L13_1
+b .L14_1
+
+.L13_1:
+bceqz $fcc1, .L998 // alpha_r != 0.0 && alpha_i != 0.0
+b .L997 // alpha_r != 0.0 && alpha_i == 0.0
+
+.L14_1:
+bceqz $fcc1, .L996 // alpha_r == 0.0 && alpha_i != 0.0
+b .L995 // alpha_r == 0.0 && alpha_i == 0.0
 .align 3

+/////// INCX != 1 ////////
 .L22:
-bge $r0, I, .L997
-move XX, X
 CMPEQ $fcc0, ALPHAR, a1
 CMPEQ $fcc1, ALPHAI, a1
+move XX, X
+bge $r0, I, .L29
+bnez DUMMY2, .L25 // if DUMMY2 == 1, called from c/zscal.
 bceqz $fcc0, .L23
 b .L24
 .align 3

 .L23:
-bceqz $fcc1, .L224 //alpha_r != 0.0 && alpha_i != 0.0
-b .L223 //alpha_r != 0.0 && alpha_i == 0.0
+bceqz $fcc1, .L25 //alpha_r != 0.0 && alpha_i != 0.0
+b .L26 //alpha_r != 0.0 && alpha_i == 0.0

 .L24:
-bceqz $fcc1, .L224 //alpha_r == 0.0 && alpha_i != 0.0
-b .L221 //alpha_r == 0.0 && alpha_i == 0.0
+bceqz $fcc1, .L28 //alpha_r == 0.0 && alpha_i != 0.0
+b .L27 //alpha_r == 0.0 && alpha_i == 0.0
 .align 3

-.L221: //alpha_r == 0.0 && alpha_i == 0.0
+.L27: //alpha_r == 0.0 && alpha_i == 0.0
 #ifdef DOUBLE
 xvstelm.d VXZ, X, 0, 0
 xvstelm.d VXZ, X, 1 * SIZE, 0
@@ -239,11 +301,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 add.d X, X, INCX
 addi.d I, I, -1
-blt $r0, I, .L221
-b .L997
+blt $r0, I, .L27
+b .L29
 .align 3

-.L223: //alpha_r != 0.0 && alpha_i == 0.0
+.L26: //alpha_r != 0.0 && alpha_i == 0.0
 #ifdef DOUBLE
 ld.d t1, X, 0 * SIZE
 ld.d t2, X, 1 * SIZE
@@ -350,11 +412,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 xvstelm.w x4, XX, 1 * SIZE, 7
 #endif
 add.d XX, XX, INCX
-blt $r0, I, .L223
-b .L997
+blt $r0, I, .L26
+b .L29
 .align 3

-.L224: //alpha_r != 0.0 && alpha_i != 0.0
+.L25: //alpha_r != 0.0 && alpha_i != 0.0
 #ifdef DOUBLE
 ld.d t1, X, 0 * SIZE
 ld.d t2, X, 1 * SIZE
@@ -465,20 +527,176 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 xvstelm.w x4, XX, 1 * SIZE, 7
 #endif
 add.d XX, XX, INCX
-blt $r0, I, .L224
-b .L997
+blt $r0, I, .L25
+b .L29
 .align 3

-.L997:
+.L28: //alpha_r == 0.0 && alpha_i != 0.0
 #ifdef DOUBLE
-andi I, N, 3
+ld.d t1, X, 0 * SIZE
+ld.d t2, X, 1 * SIZE
+add.d X, X, INCX
+ld.d t3, X, 0 * SIZE
+ld.d t4, X, 1 * SIZE
+add.d X, X, INCX
+xvinsgr2vr.d x1, t1, 0
+xvinsgr2vr.d x2, t2, 0
+xvinsgr2vr.d x1, t3, 1
+xvinsgr2vr.d x2, t4, 1
+ld.d t1, X, 0 * SIZE
+ld.d t2, X, 1 * SIZE
+add.d X, X, INCX
+ld.d t3, X, 0 * SIZE
+ld.d t4, X, 1 * SIZE
+xvinsgr2vr.d x1, t1, 2
+xvinsgr2vr.d x2, t2, 2
+xvinsgr2vr.d x1, t3, 3
+xvinsgr2vr.d x2, t4, 3
+add.d X, X, INCX
+
+xvfmul.d x3, VXAI, x2
+xvfsub.d x3, VXZ, x3
+xvfmul.d x4, VXAI, x1
+addi.d I, I, -1
+xvstelm.d x3, XX, 0 * SIZE, 0
+xvstelm.d x4, XX, 1 * SIZE, 0
+add.d XX, XX, INCX
+xvstelm.d x3, XX, 0 * SIZE, 1
+xvstelm.d x4, XX, 1 * SIZE, 1
+add.d XX, XX, INCX
+xvstelm.d x3, XX, 0 * SIZE, 2
+xvstelm.d x4, XX, 1 * SIZE, 2
+add.d XX, XX, INCX
+xvstelm.d x3, XX, 0 * SIZE, 3
+xvstelm.d x4, XX, 1 * SIZE, 3
 #else
-andi I, N, 7
+ld.w t1, X, 0 * SIZE
+ld.w t2, X, 1 * SIZE
+add.d X, X, INCX
+ld.w t3, X, 0 * SIZE
+ld.w t4, X, 1 * SIZE
+add.d X, X, INCX
+xvinsgr2vr.w x1, t1, 0
+xvinsgr2vr.w x2, t2, 0
+xvinsgr2vr.w x1, t3, 1
+xvinsgr2vr.w x2, t4, 1
+ld.w t1, X, 0 * SIZE
+ld.w t2, X, 1 * SIZE
+add.d X, X, INCX
+ld.w t3, X, 0 * SIZE
+ld.w t4, X, 1 * SIZE
+xvinsgr2vr.w x1, t1, 2
+xvinsgr2vr.w x2, t2, 2
+xvinsgr2vr.w x1, t3, 3
+xvinsgr2vr.w x2, t4, 3
+add.d X, X, INCX
+ld.w t1, X, 0 * SIZE
+ld.w t2, X, 1 * SIZE
+add.d X, X, INCX
+ld.w t3, X, 0 * SIZE
+ld.w t4, X, 1 * SIZE
+add.d X, X, INCX
+xvinsgr2vr.w x1, t1, 4
+xvinsgr2vr.w x2, t2, 4
+xvinsgr2vr.w x1, t3, 5
+xvinsgr2vr.w x2, t4, 5
+ld.w t1, X, 0 * SIZE
+ld.w t2, X, 1 * SIZE
+add.d X, X, INCX
+ld.w t3, X, 0 * SIZE
+ld.w t4, X, 1 * SIZE
+xvinsgr2vr.w x1, t1, 6
+xvinsgr2vr.w x2, t2, 6
+xvinsgr2vr.w x1, t3, 7
+xvinsgr2vr.w x2, t4, 7
+add.d X, X, INCX
+
+xvfmul.s x3, VXAI, x2
+xvfsub.s x3, VXZ, x3
+xvfmul.s x4, VXAI, x1
+addi.d I, I, -1
+xvstelm.w x3, XX, 0 * SIZE, 0
+xvstelm.w x4, XX, 1 * SIZE, 0
+add.d XX, XX, INCX
+xvstelm.w x3, XX, 0 * SIZE, 1
+xvstelm.w x4, XX, 1 * SIZE, 1
+add.d XX, XX, INCX
+xvstelm.w x3, XX, 0 * SIZE, 2
+xvstelm.w x4, XX, 1 * SIZE, 2
+add.d XX, XX, INCX
+xvstelm.w x3, XX, 0 * SIZE, 3
+xvstelm.w x4, XX, 1 * SIZE, 3
+add.d XX, XX, INCX
+xvstelm.w x3, XX, 0 * SIZE, 4
+xvstelm.w x4, XX, 1 * SIZE, 4
+add.d XX, XX, INCX
+xvstelm.w x3, XX, 0 * SIZE, 5
+xvstelm.w x4, XX, 1 * SIZE, 5
+add.d XX, XX, INCX
+xvstelm.w x3, XX, 0 * SIZE, 6
+xvstelm.w x4, XX, 1 * SIZE, 6
+add.d XX, XX, INCX
+xvstelm.w x3, XX, 0 * SIZE, 7
+xvstelm.w x4, XX, 1 * SIZE, 7
 #endif
-bge $r0, I, .L999
+add.d XX, XX, INCX
+blt $r0, I, .L28
+b .L29
 .align 3

-.L998:
+/////// INCX != 1 && N < 8 ///////
+.L29:
+#ifdef DOUBLE
+andi I, N, 3
+#else
+andi I, N, 7
+#endif
+beqz I, .L999
+bnez DUMMY2, .L998 // if DUMMY2 == 1, called from c/zscal.
+
+bceqz $fcc0, .L23_1
+b .L24_1
+
+.L23_1:
+bceqz $fcc1, .L998 // alpha_r != 0.0 && alpha_i != 0.0
+b .L997 // alpha_r != 0.0 && alpha_i == 0.0
+
+.L24_1:
+bceqz $fcc1, .L996 // alpha_r == 0.0 && alpha_i != 0.0
+b .L995 // alpha_r == 0.0 && alpha_i == 0.0
+.align 3
+
+.L995: // alpha_r == 0.0 && alpha_i == 0.0
+ST a1, X, 0 * SIZE
+ST a1, X, 1 * SIZE
+addi.d I, I, -1
+add.d X, X, INCX
+blt $r0, I, .L995
+b .L999
+.L996: // alpha_r == 0.0 && alpha_i != 0.0
+LD a1, X, 0 * SIZE
+LD a2, X, 1 * SIZE
+addi.d I, I, -1
+MUL s1, ALPHAI, a2
+MUL s2, ALPHAI, a1
+SUB s1, $f12, s1
+ST s1, X, 0 * SIZE
+ST s2, X, 1 * SIZE
+add.d X, X, INCX
+blt $r0, I, .L996
+b .L999
+.L997: // alpha_r != 0.0 && alpha_i == 0.0
+LD a1, X, 0 * SIZE
+LD a2, X, 1 * SIZE
+addi.d I, I, -1
+MUL s1, ALPHAR, a1
+MUL s2, ALPHAR, a2
+ST s1, X, 0 * SIZE
+ST s2, X, 1 * SIZE
+add.d X, X, INCX
+blt $r0, I, .L997
+b .L999
+.L998: // alpha_r != 0.0 && alpha_i != 0.0, one by one
 LD a1, X, 0 * SIZE
 LD a2, X, 1 * SIZE
 addi.d I, I, -1
@@ -490,7 +708,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ST s2, X, 1 * SIZE
 add.d X, X, INCX
 blt $r0, I, .L998
-.align 3
+b .L999

 .L999:
 move $r4, $r12
