@@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHAI $f1
#define X $r7
#define INCX $r8
+ #define DUMMY2 $r9
#define I $r12
#define TEMP $r13
@@ -65,6 +66,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
bge $r0, N, .L999
bge $r0, INCX, .L999
+ ld.d DUMMY2, $sp, 0
li.d TEMP, 1
movgr2fr.d a1, $r0
FFINT a1, a1
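Note: DUMMY2 is the new flag argument picked up from the stack. When it is non-zero (the kernel was entered through the c/zscal interface) the code below always takes the full complex-multiply path (.L17/.L25/.L998) and skips the alpha_r/alpha_i == 0 shortcuts, presumably so that NaN/Inf in the input still propagate. A rough C-level sketch of the selection logic (names are illustrative, not the kernel's actual prototype):

    /* Scalar equivalent of the branch structure below; from_zscal stands
     * in for the DUMMY2 flag read off the stack above.                   */
    static void zscal_dispatch_sketch(long n, double ar, double ai,
                                      double *x, long inc_x, int from_zscal)
    {
        for (long k = 0; k < n; k++, x += 2 * inc_x) {
            double xr = x[0], xi = x[1];
            if (from_zscal || (ar != 0.0 && ai != 0.0)) {
                x[0] = ar * xr - ai * xi;        /* .L17 / .L25 / .L998 */
                x[1] = ar * xi + ai * xr;
            } else if (ar == 0.0 && ai == 0.0) {
                x[0] = 0.0;                      /* .L15 / .L27 / .L995 */
                x[1] = 0.0;
            } else if (ai == 0.0) {
                x[0] = ar * xr;                  /* .L16 / .L26 / .L997 */
                x[1] = ar * xi;
            } else {                             /* ar == 0.0           */
                x[0] = -ai * xi;                 /* .L18 / .L28 / .L996 */
                x[1] =  ai * xr;
            }
        }
    }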
@@ -86,24 +88,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
bne INCX, TEMP, .L22

+ /////// INCX == 1 ////////
.L11:
- bge $r0, I, .L997
CMPEQ $fcc0, ALPHAR, a1
CMPEQ $fcc1, ALPHAI, a1
+ bge $r0, I, .L19
+ /////// INCX == 1 && N >= 4 ////////
+ bnez DUMMY2, .L17 // if DUMMY2 == 1, called from c/zscal.
+
bceqz $fcc0, .L13
b .L14
.align 3

.L13:
- bceqz $fcc1, .L114 //alpha_r != 0.0 && alpha_i != 0.0
- b .L113 //alpha_r != 0.0 && alpha_i == 0.0
+ bceqz $fcc1, .L17 //alpha_r != 0.0 && alpha_i != 0.0
+ b .L16 //alpha_r != 0.0 && alpha_i == 0.0

.L14:
- bceqz $fcc1, .L114 //alpha_r == 0.0 && alpha_i != 0.0
- b .L111 //alpha_r == 0.0 && alpha_i == 0.0
+ bceqz $fcc1, .L18 //alpha_r == 0.0 && alpha_i != 0.0
+ b .L15 //alpha_r == 0.0 && alpha_i == 0.0
.align 3

- .L111 : //alpha_r == 0.0 && alpha_i == 0.0
+ .L15 : //alpha_r == 0.0 && alpha_i == 0.0
xvst VXZ, X, 0 * SIZE
#ifdef DOUBLE
xvst VXZ, X, 4 * SIZE
@@ -113,11 +119,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi.d X, X, 16 * SIZE
#endif
addi.d I, I, -1
- blt $r0, I, .L111
- b .L997
+ blt $r0, I, .L15
+ b .L19
.align 3

- .L113 : //alpha_r != 0.0 && alpha_i == 0.0
+ .L16 : //alpha_r != 0.0 && alpha_i == 0.0
xvld VX0, X, 0 * SIZE
#ifdef DOUBLE
xvld VX1, X, 4 * SIZE
@@ -143,11 +149,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi.d X, X, 16 * SIZE
#endif
addi.d I, I, -1
- blt $r0, I, .L113
- b .L997
+ blt $r0, I, .L16
+ b .L19
.align 3

- .L114 : //alpha_r != 0.0 && alpha_i != 0.0
+ .L17 : //alpha_r != 0.0 && alpha_i != 0.0
xvld VX0, X, 0 * SIZE
#ifdef DOUBLE
xvld VX1, X, 4 * SIZE
@@ -177,29 +183,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi.d X, X, 16 * SIZE
#endif
addi.d I, I, -1
- blt $r0, I, .L114
- b .L997
+ blt $r0, I, .L17
+ b .L19
+ .align 3
+
+ .L18: //alpha_r == 0.0 && alpha_i != 0.0
+ xvld VX0, X, 0 * SIZE
+ #ifdef DOUBLE
+ xvld VX1, X, 4 * SIZE
+ xvpickev.d x1, VX1, VX0
+ xvpickod.d x2, VX1, VX0
+ xvfmul.d x3, VXAI, x2
+ xvfsub.d x3, VXZ, x3
+ xvfmul.d x4, VXAI, x1
+ xvilvl.d VX2, x4, x3
+ xvilvh.d VX3, x4, x3
+ xvst VX2, X, 0 * SIZE
+ xvst VX3, X, 4 * SIZE
+ addi.d X, X, 8 * SIZE
+ #else
+ xvld VX1, X, 8 * SIZE
+ xvpickev.w x1, VX1, VX0
+ xvpickod.w x2, VX1, VX0
+ xvfmul.s x3, VXAI, x2
+ xvfsub.s x3, VXZ, x3
+ xvfmul.s x4, VXAI, x1
+ xvilvl.w VX2, x4, x3
+ xvilvh.w VX3, x4, x3
+ xvst VX2, X, 0 * SIZE
+ xvst VX3, X, 8 * SIZE
+ addi.d X, X, 16 * SIZE
+ #endif
+ addi.d I, I, -1
+ blt $r0, I, .L18
+ b .L19
+ .align 3
+
+ /////// INCX == 1 && N < 8 ///////
+ .L19:
+ #ifdef DOUBLE
+ andi I, N, 3
+ #else
+ andi I, N, 7
+ #endif
+ beqz I, .L999
+ bnez DUMMY2, .L998 // if DUMMY2 == 1, called from c/zscal.
+
+ bceqz $fcc0, .L13_1
+ b .L14_1
+
+ .L13_1:
+ bceqz $fcc1, .L998 // alpha_r != 0.0 && alpha_i != 0.0
+ b .L997 // alpha_r != 0.0 && alpha_i == 0.0
+
+ .L14_1:
+ bceqz $fcc1, .L996 // alpha_r == 0.0 && alpha_i != 0.0
+ b .L995 // alpha_r == 0.0 && alpha_i == 0.0
.align 3

+ /////// INCX != 1 ////////
.L22:
- bge $r0, I, .L997
- move XX, X
CMPEQ $fcc0, ALPHAR, a1
CMPEQ $fcc1, ALPHAI, a1
+ move XX, X
+ bge $r0, I, .L29
+ bnez DUMMY2, .L25 // if DUMMY2 == 1, called from c/zscal.
bceqz $fcc0, .L23
b .L24
.align 3

.L23:
- bceqz $fcc1, .L224 //alpha_r != 0.0 && alpha_i != 0.0
- b .L223 //alpha_r != 0.0 && alpha_i == 0.0
+ bceqz $fcc1, .L25 //alpha_r != 0.0 && alpha_i != 0.0
+ b .L26 //alpha_r != 0.0 && alpha_i == 0.0

.L24:
- bceqz $fcc1, .L224 //alpha_r == 0.0 && alpha_i != 0.0
- b .L221 //alpha_r == 0.0 && alpha_i == 0.0
+ bceqz $fcc1, .L28 //alpha_r == 0.0 && alpha_i != 0.0
+ b .L27 //alpha_r == 0.0 && alpha_i == 0.0
.align 3

- .L221 : //alpha_r == 0.0 && alpha_i == 0.0
+ .L27 : //alpha_r == 0.0 && alpha_i == 0.0
#ifdef DOUBLE
xvstelm.d VXZ, X, 0, 0
xvstelm.d VXZ, X, 1 * SIZE, 0
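The new .L18 block follows the split/compute/merge pattern used elsewhere in this file: xvpickev/xvpickod deinterleave the packed (re, im) pairs into a vector of real parts and a vector of imaginary parts, the scaled values are computed lane-wise against VXAI (alpha_i broadcast) and VXZ (zeros), and xvilvl/xvilvh re-interleave them before the stores. Element-wise, one DOUBLE iteration amounts to the following (a sketch under those register assumptions):

    /* One iteration of .L18 (DOUBLE): four packed complex doubles in x. */
    static void l18_iteration_sketch(double *x, double alpha_i)
    {
        double re[4], im[4];
        for (int j = 0; j < 4; j++) {              /* xvpickev.d / xvpickod.d */
            re[j] = x[2 * j];
            im[j] = x[2 * j + 1];
        }
        for (int j = 0; j < 4; j++) {
            x[2 * j]     = 0.0 - alpha_i * im[j];  /* xvfmul.d + xvfsub.d     */
            x[2 * j + 1] = alpha_i * re[j];        /* xvfmul.d                */
        }                                          /* xvilvl/xvilvh + xvst    */
    }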
@@ -239,11 +301,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
add.d X, X, INCX
addi.d I, I, -1
- blt $r0, I, .L221
- b .L997
+ blt $r0, I, .L27
+ b .L29
.align 3

- .L223 : //alpha_r != 0.0 && alpha_i == 0.0
+ .L26 : //alpha_r != 0.0 && alpha_i == 0.0
#ifdef DOUBLE
ld.d t1, X, 0 * SIZE
ld.d t2, X, 1 * SIZE
@@ -350,11 +412,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvstelm.w x4, XX, 1 * SIZE, 7
#endif
add.d XX, XX, INCX
- blt $r0, I, .L223
- b .L997
+ blt $r0, I, .L26
+ b .L29
.align 3

- .L224 : //alpha_r != 0.0 && alpha_i != 0.0
+ .L25 : //alpha_r != 0.0 && alpha_i != 0.0
#ifdef DOUBLE
ld.d t1, X, 0 * SIZE
ld.d t2, X, 1 * SIZE
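For the INCX != 1 section the vector registers cannot be filled with a single load, so .L25 (and the new .L28 below) gather one complex element at a time with ld.d/ld.w plus xvinsgr2vr, do the arithmetic lane-wise, and scatter the results back with xvstelm, stepping X and XX by INCX. The access pattern for one DOUBLE iteration of the full-multiply case, sketched in C with the stride given in complex elements:

    /* Strided gather / compute / scatter, four complex doubles (.L25). */
    static void l25_iteration_sketch(double *x, long inc_x,
                                     double ar, double ai)
    {
        double re[4], im[4];
        double *p = x;
        for (int j = 0; j < 4; j++) {        /* ld.d + xvinsgr2vr.d gather */
            re[j] = p[0];
            im[j] = p[1];
            p += 2 * inc_x;
        }
        p = x;
        for (int j = 0; j < 4; j++) {        /* lane math + xvstelm.d scatter */
            p[0] = ar * re[j] - ai * im[j];
            p[1] = ar * im[j] + ai * re[j];
            p += 2 * inc_x;
        }
    }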
@@ -465,20 +527,176 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvstelm.w x4, XX, 1 * SIZE, 7
#endif
add.d XX, XX, INCX
- blt $r0, I, .L224
- b .L997
+ blt $r0, I, .L25
+ b .L29
.align 3

- .L997:
+ .L28: //alpha_r == 0.0 && alpha_i != 0.0
#ifdef DOUBLE
- andi I, N, 3
+ ld.d t1, X, 0 * SIZE
+ ld.d t2, X, 1 * SIZE
+ add.d X, X, INCX
+ ld.d t3, X, 0 * SIZE
+ ld.d t4, X, 1 * SIZE
+ add.d X, X, INCX
+ xvinsgr2vr.d x1, t1, 0
+ xvinsgr2vr.d x2, t2, 0
+ xvinsgr2vr.d x1, t3, 1
+ xvinsgr2vr.d x2, t4, 1
+ ld.d t1, X, 0 * SIZE
+ ld.d t2, X, 1 * SIZE
+ add.d X, X, INCX
+ ld.d t3, X, 0 * SIZE
+ ld.d t4, X, 1 * SIZE
+ xvinsgr2vr.d x1, t1, 2
+ xvinsgr2vr.d x2, t2, 2
+ xvinsgr2vr.d x1, t3, 3
+ xvinsgr2vr.d x2, t4, 3
+ add.d X, X, INCX
+
+ xvfmul.d x3, VXAI, x2
+ xvfsub.d x3, VXZ, x3
+ xvfmul.d x4, VXAI, x1
+ addi.d I, I, -1
+ xvstelm.d x3, XX, 0 * SIZE, 0
+ xvstelm.d x4, XX, 1 * SIZE, 0
+ add.d XX, XX, INCX
+ xvstelm.d x3, XX, 0 * SIZE, 1
+ xvstelm.d x4, XX, 1 * SIZE, 1
+ add.d XX, XX, INCX
+ xvstelm.d x3, XX, 0 * SIZE, 2
+ xvstelm.d x4, XX, 1 * SIZE, 2
+ add.d XX, XX, INCX
+ xvstelm.d x3, XX, 0 * SIZE, 3
+ xvstelm.d x4, XX, 1 * SIZE, 3
#else
- andi I, N, 7
+ ld.w t1, X, 0 * SIZE
+ ld.w t2, X, 1 * SIZE
+ add.d X, X, INCX
+ ld.w t3, X, 0 * SIZE
+ ld.w t4, X, 1 * SIZE
+ add.d X, X, INCX
+ xvinsgr2vr.w x1, t1, 0
+ xvinsgr2vr.w x2, t2, 0
+ xvinsgr2vr.w x1, t3, 1
+ xvinsgr2vr.w x2, t4, 1
+ ld.w t1, X, 0 * SIZE
+ ld.w t2, X, 1 * SIZE
+ add.d X, X, INCX
+ ld.w t3, X, 0 * SIZE
+ ld.w t4, X, 1 * SIZE
+ xvinsgr2vr.w x1, t1, 2
+ xvinsgr2vr.w x2, t2, 2
+ xvinsgr2vr.w x1, t3, 3
+ xvinsgr2vr.w x2, t4, 3
+ add.d X, X, INCX
+ ld.w t1, X, 0 * SIZE
+ ld.w t2, X, 1 * SIZE
+ add.d X, X, INCX
+ ld.w t3, X, 0 * SIZE
+ ld.w t4, X, 1 * SIZE
+ add.d X, X, INCX
+ xvinsgr2vr.w x1, t1, 4
+ xvinsgr2vr.w x2, t2, 4
+ xvinsgr2vr.w x1, t3, 5
+ xvinsgr2vr.w x2, t4, 5
+ ld.w t1, X, 0 * SIZE
+ ld.w t2, X, 1 * SIZE
+ add.d X, X, INCX
+ ld.w t3, X, 0 * SIZE
+ ld.w t4, X, 1 * SIZE
+ xvinsgr2vr.w x1, t1, 6
+ xvinsgr2vr.w x2, t2, 6
+ xvinsgr2vr.w x1, t3, 7
+ xvinsgr2vr.w x2, t4, 7
+ add.d X, X, INCX
+
+ xvfmul.s x3, VXAI, x2
+ xvfsub.s x3, VXZ, x3
+ xvfmul.s x4, VXAI, x1
+ addi.d I, I, -1
+ xvstelm.w x3, XX, 0 * SIZE, 0
+ xvstelm.w x4, XX, 1 * SIZE, 0
+ add.d XX, XX, INCX
+ xvstelm.w x3, XX, 0 * SIZE, 1
+ xvstelm.w x4, XX, 1 * SIZE, 1
+ add.d XX, XX, INCX
+ xvstelm.w x3, XX, 0 * SIZE, 2
+ xvstelm.w x4, XX, 1 * SIZE, 2
+ add.d XX, XX, INCX
+ xvstelm.w x3, XX, 0 * SIZE, 3
+ xvstelm.w x4, XX, 1 * SIZE, 3
+ add.d XX, XX, INCX
+ xvstelm.w x3, XX, 0 * SIZE, 4
+ xvstelm.w x4, XX, 1 * SIZE, 4
+ add.d XX, XX, INCX
+ xvstelm.w x3, XX, 0 * SIZE, 5
+ xvstelm.w x4, XX, 1 * SIZE, 5
+ add.d XX, XX, INCX
+ xvstelm.w x3, XX, 0 * SIZE, 6
+ xvstelm.w x4, XX, 1 * SIZE, 6
+ add.d XX, XX, INCX
+ xvstelm.w x3, XX, 0 * SIZE, 7
+ xvstelm.w x4, XX, 1 * SIZE, 7
#endif
- bge $r0, I, .L999
+ add.d XX, XX, INCX
+ blt $r0, I, .L28
+ b .L29
.align 3

- .L998:
+ /////// INCX != 1 && N < 8 ///////
+ .L29:
+ #ifdef DOUBLE
+ andi I, N, 3
+ #else
+ andi I, N, 7
+ #endif
+ beqz I, .L999
+ bnez DUMMY2, .L998 // if DUMMY2 == 1, called from c/zscal.
+
+ bceqz $fcc0, .L23_1
+ b .L24_1
+
+ .L23_1:
+ bceqz $fcc1, .L998 // alpha_r != 0.0 && alpha_i != 0.0
+ b .L997 // alpha_r != 0.0 && alpha_i == 0.0
+
+ .L24_1:
+ bceqz $fcc1, .L996 // alpha_r == 0.0 && alpha_i != 0.0
+ b .L995 // alpha_r == 0.0 && alpha_i == 0.0
+ .align 3
+
+ .L995: // alpha_r == 0.0 && alpha_i == 0.0
+ ST a1, X, 0 * SIZE
+ ST a1, X, 1 * SIZE
+ addi.d I, I, -1
+ add.d X, X, INCX
+ blt $r0, I, .L995
+ b .L999
+ .L996: // alpha_r == 0.0 && alpha_i != 0.0
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ addi.d I, I, -1
+ MUL s1, ALPHAI, a2
+ MUL s2, ALPHAI, a1
+ SUB s1, $f12, s1
+ ST s1, X, 0 * SIZE
+ ST s2, X, 1 * SIZE
+ add.d X, X, INCX
+ blt $r0, I, .L996
+ b .L999
+ .L997: // alpha_r != 0.0 && alpha_i == 0.0
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ addi.d I, I, -1
+ MUL s1, ALPHAR, a1
+ MUL s2, ALPHAR, a2
+ ST s1, X, 0 * SIZE
+ ST s2, X, 1 * SIZE
+ add.d X, X, INCX
+ blt $r0, I, .L997
+ b .L999
+ .L998: // alpha_r != 0.0 && alpha_i != 0.0, one by one
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
addi.d I, I, -1
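The remainder handling also changes: the old shared .L997/.L998 tail is replaced by one scalar loop per alpha case (.L995 through .L998), selected by the same $fcc0/$fcc1 tests, with DUMMY2 again forcing the full-multiply loop .L998. Each loop walks the N & 3 (DOUBLE) or N & 7 (single-precision) leftover elements one complex number at a time; for the .L998 case this is simply (a C sketch):

    /* Scalar tail after the vector loop, full-multiply case (.L998),
     * DOUBLE variant; single precision masks with n & 7 instead.      */
    static void scal_tail_sketch(long n, double ar, double ai,
                                 double *x, long inc_x)
    {
        for (long k = n & 3; k > 0; k--, x += 2 * inc_x) {
            double xr = x[0], xi = x[1];
            x[0] = ar * xr - ai * xi;    /* real part      */
            x[1] = ar * xi + ai * xr;    /* imaginary part */
        }
    }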
@@ -490,7 +708,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ST s2, X, 1 * SIZE
add.d X, X, INCX
blt $r0, I, .L998
- .align 3
+ b .L999

.L999:
move $r4, $r12