
Commit 60fd286

LoongArch64: Fixed LSX version of cscal and zscal
1 parent 7c3a920 commit 60fd286

1 file changed: 231 additions, 34 deletions

kernel/loongarch64/cscal_lsx.S

+231 −34
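For orientation before the diff: cscal/zscal scale a complex vector by alpha = alpha_r + alpha_i·i, and this kernel dispatches to a specialized loop per alpha case. Below is a minimal C sketch of the intended arithmetic (illustrative only; function and variable names are mine, not the kernel's interface), with the diff's new INCX == 1 / INCX != 1 branch labels noted per case:

#include <stddef.h>

/* Reference semantics for cscal (single precision shown): x[i] *= alpha.
 * The vector is stored interleaved: { re, im, re, im, ... }. */
static void cscal_ref(size_t n, float alpha_r, float alpha_i,
                      float *x, size_t inc_x)
{
    for (size_t i = 0; i < n; i++, x += 2 * inc_x) {
        float xr = x[0], xi = x[1];
        if (alpha_r == 0.0f && alpha_i == 0.0f) {
            x[0] = 0.0f;                         /* .L15 / .L27: store zeros */
            x[1] = 0.0f;
        } else if (alpha_i == 0.0f) {
            x[0] = alpha_r * xr;                 /* .L16 / .L26: real alpha only */
            x[1] = alpha_r * xi;
        } else if (alpha_r == 0.0f) {
            x[0] = 0.0f - alpha_i * xi;          /* .L18 / .L28: imaginary alpha only */
            x[1] = alpha_i * xr;
        } else {
            x[0] = alpha_r * xr - alpha_i * xi;  /* .L17 / .L25: full multiply */
            x[1] = alpha_r * xi + alpha_i * xr;
        }
    }
}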
@@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ALPHAI $f1
 #define X      $r7
 #define INCX   $r8
+#define DUMMY2 $r9
 
 #define I      $r12
 #define TEMP   $r13
@@ -65,6 +66,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
     bge $r0, N, .L999
     bge $r0, INCX, .L999
+    ld.d DUMMY2, $sp, 0
     li.d TEMP, 1
     movgr2fr.d a1, $r0
     FFINT a1, a1
@@ -84,24 +86,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     srai.d I, N, 2
     bne INCX, TEMP, .L22
 
+/////// INCX == 1 ////////
 .L11:
-    bge $r0, I, .L997
     CMPEQ $fcc0, ALPHAR, a1
     CMPEQ $fcc1, ALPHAI, a1
+    bge $r0, I, .L19
+
+/////// INCX == 1 && N >= 4 ////////
+    bnez DUMMY2, .L17 // if DUMMY2 == 1, called from c/zscal.
+
     bceqz $fcc0, .L13
     b .L14
     .align 3
 
 .L13:
-    bceqz $fcc1, .L114 //alpha_r != 0.0 && alpha_i != 0.0
-    b .L113 //alpha_r != 0.0 && alpha_i == 0.0
+    bceqz $fcc1, .L17 //alpha_r != 0.0 && alpha_i != 0.0
+    b .L16 //alpha_r != 0.0 && alpha_i == 0.0
 
 .L14:
-    bceqz $fcc1, .L114 //alpha_r == 0.0 && alpha_i != 0.0
-    b .L111 //alpha_r == 0.0 && alpha_i == 0.0
+    bceqz $fcc1, .L18 //alpha_r == 0.0 && alpha_i != 0.0
+    b .L15 //alpha_r == 0.0 && alpha_i == 0.0
     .align 3
 
-.L111: //alpha_r == 0.0 && alpha_i == 0.0
+.L15: //alpha_r == 0.0 && alpha_i == 0.0
     vst VXZ, X, 0 * SIZE
 #ifdef DOUBLE
     vst VXZ, X, 2 * SIZE
@@ -112,11 +119,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
     addi.d X, X, 8 * SIZE
     addi.d I, I, -1
-    blt $r0, I, .L111
-    b .L997
+    blt $r0, I, .L15
+    b .L19
     .align 3
 
-.L113: //alpha_r != 0.0 && alpha_i == 0.0
+.L16: //alpha_r != 0.0 && alpha_i == 0.0
     vld VX0, X, 0 * SIZE
 #ifdef DOUBLE
     vld VX1, X, 2 * SIZE
@@ -151,11 +158,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
     addi.d X, X, 8 * SIZE
     addi.d I, I, -1
-    blt $r0, I, .L113
-    b .L997
+    blt $r0, I, .L16
+    b .L19
     .align 3
 
-.L114: //alpha_r != 0.0 && alpha_i != 0.0
+.L17: //alpha_r != 0.0 && alpha_i != 0.0
     vld VX0, X, 0 * SIZE
 #ifdef DOUBLE
     vld VX1, X, 2 * SIZE
@@ -196,29 +203,92 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
     addi.d X, X, 8 * SIZE
     addi.d I, I, -1
-    blt $r0, I, .L114
-    b .L997
+    blt $r0, I, .L17
+    b .L19
+    .align 3
+
+.L18: //alpha_r == 0.0 && alpha_i != 0.0
+    vld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    vld VX1, X, 2 * SIZE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    vfmul.d x3, VXAI, x2
+    vfsub.d x3, VXZ, x3
+    vfmul.d x4, VXAI, x1
+    vilvl.d VX2, x4, x3
+    vilvh.d VX3, x4, x3
+    vst VX2, X, 0 * SIZE
+    vst VX3, X, 2 * SIZE
+    vld VX0, X, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    vfmul.d x3, VXAI, x2
+    vfsub.d x3, VXZ, x3
+    vfmul.d x4, VXAI, x1
+    vilvl.d VX2, x4, x3
+    vilvh.d VX3, x4, x3
+    vst VX2, X, 4 * SIZE
+    vst VX3, X, 6 * SIZE
+#else
+    vld VX1, X, 4 * SIZE
+    vpickev.w x1, VX1, VX0
+    vpickod.w x2, VX1, VX0
+    vfmul.s x3, VXAI, x2
+    vfsub.s x3, VXZ, x3
+    vfmul.s x4, VXAI, x1
+    vilvl.w VX2, x4, x3
+    vilvh.w VX3, x4, x3
+    vst VX2, X, 0 * SIZE
+    vst VX3, X, 4 * SIZE
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d I, I, -1
+    blt $r0, I, .L18
+    b .L19
+    .align 3
+
+/////// INCX == 1 && N % 4 != 0 ///////
+.L19:
+    andi I, N, 3
+    beqz I, .L999
+    bnez DUMMY2, .L998 // if DUMMY2 == 1, called from c/zscal.
+
+    bceqz $fcc0, .L13_1
+    b .L14_1
+
+.L13_1:
+    bceqz $fcc1, .L998 // alpha_r != 0.0 && alpha_i != 0.0
+    b .L997 // alpha_r != 0.0 && alpha_i == 0.0
+
+.L14_1:
+    bceqz $fcc1, .L996 // alpha_r == 0.0 && alpha_i != 0.0
+    b .L995 // alpha_r == 0.0 && alpha_i == 0.0
     .align 3
 
+
+/////// INCX != 1 ////////
 .L22:
-    bge $r0, I, .L997
-    move XX, X
     CMPEQ $fcc0, ALPHAR, a1
     CMPEQ $fcc1, ALPHAI, a1
+    move XX, X
+    bge $r0, I, .L29
+    bnez DUMMY2, .L25 // if DUMMY2 == 1, called from c/zscal.
     bceqz $fcc0, .L23
     b .L24
     .align 3
 
 .L23:
-    bceqz $fcc1, .L224 //alpha_r != 0.0 && alpha_i != 0.0
-    b .L223 //alpha_r != 0.0 && alpha_i == 0.0
+    bceqz $fcc1, .L25 //alpha_r != 0.0 && alpha_i != 0.0
+    b .L26 //alpha_r != 0.0 && alpha_i == 0.0
 
 .L24:
-    bceqz $fcc1, .L224 //alpha_r == 0.0 && alpha_i != 0.0
-    b .L221 //alpha_r == 0.0 && alpha_i == 0.0
+    bceqz $fcc1, .L28 //alpha_r == 0.0 && alpha_i != 0.0
+    b .L27 //alpha_r == 0.0 && alpha_i == 0.0
     .align 3
 
-.L221: //alpha_r == 0.0 && alpha_i == 0.0
+.L27: //alpha_r == 0.0 && alpha_i == 0.0
 #ifdef DOUBLE
     vstelm.d VXZ, X, 0, 0
     vstelm.d VXZ, X, 1 * SIZE, 0
@@ -246,11 +316,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
     add.d X, X, INCX
     addi.d I, I, -1
-    blt $r0, I, .L221
-    b .L997
+    blt $r0, I, .L27
+    b .L29
     .align 3
 
-.L223: //alpha_r != 0.0 && alpha_i == 0.0
+.L26: //alpha_r != 0.0 && alpha_i == 0.0
 #ifdef DOUBLE
     ld.d t1, X, 0 * SIZE
     ld.d t2, X, 1 * SIZE
@@ -327,11 +397,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vstelm.w x4, XX, 1 * SIZE, 3
 #endif
     add.d XX, XX, INCX
-    blt $r0, I, .L223
-    b .L997
+    blt $r0, I, .L26
+    b .L29
     .align 3
 
-.L224: //alpha_r != 0.0 && alpha_i != 0.0
+.L25: //alpha_r != 0.0 && alpha_i != 0.0
 #ifdef DOUBLE
     ld.d t1, X, 0 * SIZE
     ld.d t2, X, 1 * SIZE
@@ -414,16 +484,143 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vstelm.w x4, XX, 1 * SIZE, 3
 #endif
     add.d XX, XX, INCX
-    blt $r0, I, .L224
-    b .L997
+    blt $r0, I, .L25
+    b .L29
     .align 3
 
-.L997:
-    andi I, N, 3
-    bge $r0, I, .L999
+.L28: //alpha_r == 0.0 && alpha_i != 0.0
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    vfmul.d x3, VXAI, x2
+    vfsub.d x3, VXZ, x3
+    vfmul.d x4, VXAI, x1
+    vstelm.d x3, XX, 0 * SIZE, 0
+    vstelm.d x4, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.d x3, XX, 0 * SIZE, 1
+    vstelm.d x4, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    add.d X, X, INCX
+    vfmul.d x3, VXAI, x2
+    vfsub.d x3, VXZ, x3
+    vfmul.d x4, VXAI, x1
+    addi.d I, I, -1
+    vstelm.d x3, XX, 0 * SIZE, 0
+    vstelm.d x4, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.d x3, XX, 0 * SIZE, 1
+    vstelm.d x4, XX, 1 * SIZE, 1
+#else
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 0
+    vinsgr2vr.w x2, t2, 0
+    vinsgr2vr.w x1, t3, 1
+    vinsgr2vr.w x2, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    vinsgr2vr.w x1, t1, 2
+    vinsgr2vr.w x2, t2, 2
+    vinsgr2vr.w x1, t3, 3
+    vinsgr2vr.w x2, t4, 3
+    add.d X, X, INCX
+
+    vfmul.s x3, VXAI, x2
+    vfsub.s x3, VXZ, x3
+    vfmul.s x4, VXAI, x1
+    addi.d I, I, -1
+    vstelm.w x3, XX, 0 * SIZE, 0
+    vstelm.w x4, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.w x3, XX, 0 * SIZE, 1
+    vstelm.w x4, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+    vstelm.w x3, XX, 0 * SIZE, 2
+    vstelm.w x4, XX, 1 * SIZE, 2
+    add.d XX, XX, INCX
+    vstelm.w x3, XX, 0 * SIZE, 3
+    vstelm.w x4, XX, 1 * SIZE, 3
+#endif
+    add.d XX, XX, INCX
+    blt $r0, I, .L28
+    b .L29
+    .align 3
+
+/////// INCX != 1 && N % 4 != 0 ///////
+.L29:
+    andi I, N, 3
+    beqz I, .L999
+    bnez DUMMY2, .L998 // if DUMMY2 == 1, called from c/zscal.
+
+    bceqz $fcc0, .L23_1
+    b .L24_1
+
+.L23_1:
+    bceqz $fcc1, .L998 // alpha_r != 0.0 && alpha_i != 0.0
+    b .L997 // alpha_r != 0.0 && alpha_i == 0.0
+
+.L24_1:
+    bceqz $fcc1, .L996 // alpha_r == 0.0 && alpha_i != 0.0
+    b .L995 // alpha_r == 0.0 && alpha_i == 0.0
     .align 3
 
-.L998:
+.L995: // alpha_r == 0.0 && alpha_i == 0.0
+    ST a1, X, 0 * SIZE
+    ST a1, X, 1 * SIZE
+    addi.d I, I, -1
+    add.d X, X, INCX
+    blt $r0, I, .L995
+    b .L999
+.L996: // alpha_r == 0.0 && alpha_i != 0.0
+    LD a1, X, 0 * SIZE
+    LD a2, X, 1 * SIZE
+    addi.d I, I, -1
+    MUL s1, ALPHAI, a2
+    MUL s2, ALPHAI, a1
+    SUB s1, $f12, s1
+    ST s1, X, 0 * SIZE
+    ST s2, X, 1 * SIZE
+    add.d X, X, INCX
+    blt $r0, I, .L996
+    b .L999
+.L997: // alpha_r != 0.0 && alpha_i == 0.0
+    LD a1, X, 0 * SIZE
+    LD a2, X, 1 * SIZE
+    addi.d I, I, -1
+    MUL s1, ALPHAR, a1
+    MUL s2, ALPHAR, a2
+    ST s1, X, 0 * SIZE
+    ST s2, X, 1 * SIZE
+    add.d X, X, INCX
+    blt $r0, I, .L997
+    b .L999
+.L998: // alpha_r != 0.0 && alpha_i != 0.0, one by one
     LD a1, X, 0 * SIZE
     LD a2, X, 1 * SIZE
     addi.d I, I, -1
@@ -435,7 +632,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     ST s2, X, 1 * SIZE
     add.d X, X, INCX
     blt $r0, I, .L998
-    .align 3
+    b .L999
 
 .L999:
     move $r4, $r12
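
A note on the shuffle pattern in the new .L18/.L28 blocks: vpickev/vpickod split the interleaved complex data into a vector of real parts (x1) and a vector of imaginary parts (x2), the arithmetic runs on those halves against VXAI (the alpha_i splat) and VXZ (zeros), and vilvl/vilvh re-interleave the results for the stores. A scalar model of one single-precision iteration (array and function names are mine):

/* Scalar model of one .L18 iteration over four complex floats:
 * deinterleave, compute (0 - alpha_i*im, alpha_i*re), re-interleave. */
static void scal_imag_only_block(const float in[8], float out[8], float alpha_i)
{
    float re[4], im[4];
    for (int k = 0; k < 4; k++) {                 /* vpickev.w x1 / vpickod.w x2 */
        re[k] = in[2 * k];
        im[k] = in[2 * k + 1];
    }
    for (int k = 0; k < 4; k++) {
        out[2 * k]     = 0.0f - alpha_i * im[k];  /* vfmul.s + vfsub.s -> x3 */
        out[2 * k + 1] = alpha_i * re[k];         /* vfmul.s -> x4 */
        /* vilvl.w / vilvh.w interleave x4 and x3 back before the vst */
    }
}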

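Why the new DUMMY2 flag matters (my reading, based on the diff's own comments): the alpha == 0 shortcuts store zeros or skip part of the complex product, which changes results when x contains NaN or Inf, because 0.0 * NaN is NaN rather than 0.0. The flag is loaded from the stack and, per the inline comments, is nonzero when the kernel is entered from c/zscal; in that case every path is routed to the full complex multiply (.L17, .L25, or the scalar .L998 loop) so special values propagate the way the reference BLAS does. A minimal illustration of the difference:

#include <math.h>
#include <stdio.h>

int main(void)
{
    float xr = NAN;
    float alpha_r = 0.0f;
    float shortcut = 0.0f;        /* zero-fill path: the NaN is lost   */
    float full = alpha_r * xr;    /* multiply path: the NaN propagates */
    printf("shortcut = %f, full = %f\n", shortcut, full); /* 0.000000, nan */
    return 0;
}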