@@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ALPHAI $f1
#define X $r7
#define INCX $r8
+ #define DUMMY2 $r9

#define I $r12
#define TEMP $r13
@@ -65,6 +66,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

bge $r0, N, .L999
bge $r0, INCX, .L999
+ ld.d DUMMY2, $sp, 0
li.d TEMP, 1
movgr2fr.d a1, $r0
FFINT a1, a1
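DUMMY2 names the $r9 scratch register that receives an extra flag read from the stack (ld.d DUMMY2, $sp, 0), presumably an argument beyond the eight integer argument registers. Judging from the branches added below, a non-zero value (the new comments say it marks a call coming from the c/zscal interface) forces the full complex-multiply path even when ALPHAR or ALPHAI is zero. A minimal C sketch of that case split, with illustrative names only (cscal_dispatch_sketch, from_interface and cfloat are not part of the kernel's real interface):

    #include <stddef.h>

    typedef struct { float r, i; } cfloat;

    /* Illustrative sketch of the case split in this patch; the labels in the
     * comments refer to the assembly blocks below. */
    static void cscal_dispatch_sketch(size_t n, float alpha_r, float alpha_i,
                                      cfloat *x, size_t incx, int from_interface)
    {
        for (size_t k = 0; k < n; k++, x += incx) {
            float xr = x->r, xi = x->i;
            if (from_interface || (alpha_r != 0.0f && alpha_i != 0.0f)) {
                x->r = alpha_r * xr - alpha_i * xi;  /* full multiply: .L17 / .L25 / .L998 */
                x->i = alpha_r * xi + alpha_i * xr;
            } else if (alpha_r == 0.0f && alpha_i == 0.0f) {
                x->r = 0.0f;                         /* zero fill: .L15 / .L27 / .L995 */
                x->i = 0.0f;
            } else if (alpha_i == 0.0f) {
                x->r = alpha_r * xr;                 /* real-only: .L16 / .L26 / .L997 */
                x->i = alpha_r * xi;
            } else {
                x->r = -alpha_i * xi;                /* imaginary-only: .L18 / .L28 / .L996 */
                x->i =  alpha_i * xr;
            }
        }
    }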
@@ -84,24 +86,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
srai.d I, N, 2
bne INCX, TEMP, .L22

+ /////// INCX == 1 ////////
.L11:
- bge $r0, I, .L997
CMPEQ $fcc0, ALPHAR, a1
CMPEQ $fcc1, ALPHAI, a1
+ bge $r0, I, .L19
+
+ /////// INCX == 1 && N >= 4 ////////
+ bnez DUMMY2, .L17 // if DUMMY2 == 1, called from c/zscal.
+
bceqz $fcc0, .L13
b .L14
.align 3

.L13:
- bceqz $fcc1, .L114 //alpha_r != 0.0 && alpha_i != 0.0
- b .L113 //alpha_r != 0.0 && alpha_i == 0.0
+ bceqz $fcc1, .L17 //alpha_r != 0.0 && alpha_i != 0.0
+ b .L16 //alpha_r != 0.0 && alpha_i == 0.0

.L14:
- bceqz $fcc1, .L114 //alpha_r == 0.0 && alpha_i != 0.0
- b .L111 //alpha_r == 0.0 && alpha_i == 0.0
+ bceqz $fcc1, .L18 //alpha_r == 0.0 && alpha_i != 0.0
+ b .L15 //alpha_r == 0.0 && alpha_i == 0.0
.align 3

- .L111 : //alpha_r == 0.0 && alpha_i == 0.0
+ .L15 : //alpha_r == 0.0 && alpha_i == 0.0
vst VXZ, X, 0 * SIZE
#ifdef DOUBLE
vst VXZ, X, 2 * SIZE
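For reference, srai.d I, N, 2 above sets the main-loop count, so each vector iteration covers four complex elements, while the andi I, N, 3 in the new .L19/.L29 blocks leaves the remaining elements to the scalar tails. A tiny sketch of that split (assumed semantics, illustrative helper name):

    /* Sketch of the iteration split: "srai.d I, N, 2" -> four complex elements
     * per vector iteration, "andi I, N, 3" -> leftovers done one at a time. */
    static void split_iterations(long n, long *vec_iters, long *tail)
    {
        *vec_iters = n >> 2;
        *tail      = n & 3;
    }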
@@ -112,11 +119,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
addi.d X, X, 8 * SIZE
addi.d I, I, -1
- blt $r0, I, .L111
- b .L997
+ blt $r0, I, .L15
+ b .L19
.align 3

- .L113 : //alpha_r != 0.0 && alpha_i == 0.0
+ .L16 : //alpha_r != 0.0 && alpha_i == 0.0
vld VX0, X, 0 * SIZE
#ifdef DOUBLE
vld VX1, X, 2 * SIZE
@@ -151,11 +158,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
addi.d X, X, 8 * SIZE
addi.d I, I, -1
- blt $r0, I, .L113
- b .L997
+ blt $r0, I, .L16
+ b .L19
.align 3

- .L114 : //alpha_r != 0.0 && alpha_i != 0.0
+ .L17 : //alpha_r != 0.0 && alpha_i != 0.0
vld VX0, X, 0 * SIZE
#ifdef DOUBLE
vld VX1, X, 2 * SIZE
@@ -196,29 +203,92 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
addi.d X, X, 8 * SIZE
addi.d I, I, -1
- blt $r0, I, .L114
- b .L997
+ blt $r0, I, .L17
+ b .L19
+ .align 3
+
+ .L18: //alpha_r == 0.0 && alpha_i != 0.0
+ vld VX0, X, 0 * SIZE
+ #ifdef DOUBLE
+ vld VX1, X, 2 * SIZE
+ vpickev.d x1, VX1, VX0
+ vpickod.d x2, VX1, VX0
+ vfmul.d x3, VXAI, x2
+ vfsub.d x3, VXZ, x3
+ vfmul.d x4, VXAI, x1
+ vilvl.d VX2, x4 ,x3
+ vilvh.d VX3, x4, x3
+ vst VX2, X, 0 * SIZE
+ vst VX3, X, 2 * SIZE
+ vld VX0, X, 4 * SIZE
+ vld VX1, X, 6 * SIZE
+ vpickev.d x1, VX1, VX0
+ vpickod.d x2, VX1, VX0
+ vfmul.d x3, VXAI, x2
+ vfsub.d x3, VXZ, x3
+ vfmul.d x4, VXAI, x1
+ vilvl.d VX2, x4 ,x3
+ vilvh.d VX3, x4, x3
+ vst VX2, X, 4 * SIZE
+ vst VX3, X, 6 * SIZE
+ #else
+ vld VX1, X, 4 * SIZE
+ vpickev.w x1, VX1, VX0
+ vpickod.w x2, VX1, VX0
+ vfmul.s x3, VXAI, x2
+ vfsub.s x3, VXZ, x3
+ vfmul.s x4, VXAI, x1
+ vilvl.w VX2, x4 ,x3
+ vilvh.w VX3, x4, x3
+ vst VX2, X, 0 * SIZE
+ vst VX3, X, 4 * SIZE
+ #endif
+ addi.d X, X, 8 * SIZE
+ addi.d I, I, -1
+ blt $r0, I, .L18
+ b .L19
+ .align 3
+
+ /////// INCX == 1 && N < 8 ///////
+ .L19:
+ andi I, N, 3
+ beqz I, .L999
+ bnez DUMMY2, .L998 // if DUMMY2 == 1, called from c/zscal.
+
+ bceqz $fcc0, .L13_1
+ b .L14_1
+
+ .L13_1:
+ bceqz $fcc1, .L998 // alpha_r != 0.0 && alpha_i != 0.0
+ b .L997 // alpha_r != 0.0 && alpha_i == 0.0
+
+ .L14_1:
+ bceqz $fcc1, .L996 // alpha_r == 0.0 && alpha_i != 0.0
+ b .L995 // alpha_r == 0.0 && alpha_i == 0.0
.align 3

+
+ /////// INCX != 1 ////////
.L22:
- bge $r0, I, .L997
- move XX, X
CMPEQ $fcc0, ALPHAR, a1
CMPEQ $fcc1, ALPHAI, a1
+ move XX, X
+ bge $r0, I, .L29
+ bnez DUMMY2, .L25 // if DUMMY2 == 1, called from c/zscal.
bceqz $fcc0, .L23
b .L24
.align 3

.L23:
- bceqz $fcc1, .L224 //alpha_r != 0.0 && alpha_i != 0.0
- b .L223 //alpha_r != 0.0 && alpha_i == 0.0
+ bceqz $fcc1, .L25 //alpha_r != 0.0 && alpha_i != 0.0
+ b .L26 //alpha_r != 0.0 && alpha_i == 0.0

.L24:
- bceqz $fcc1, .L224 //alpha_r == 0.0 && alpha_i != 0.0
- b .L221 //alpha_r == 0.0 && alpha_i == 0.0
+ bceqz $fcc1, .L28 //alpha_r == 0.0 && alpha_i != 0.0
+ b .L27 //alpha_r == 0.0 && alpha_i == 0.0
.align 3

- .L221 : //alpha_r == 0.0 && alpha_i == 0.0
+ .L27 : //alpha_r == 0.0 && alpha_i == 0.0
#ifdef DOUBLE
vstelm.d VXZ, X, 0 , 0
vstelm.d VXZ, X, 1 * SIZE, 0
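In the new .L18 block, vpickev/vpickod split the interleaved (real, imaginary) pairs held in VX0/VX1 into a vector of real parts (x1) and a vector of imaginary parts (x2). VXAI holds ALPHAI broadcast to all lanes and VXZ holds zeros (it is what the zero-fill path .L15 stores), so x3 = 0 - alpha_i*imag and x4 = alpha_i*real, which vilvl/vilvh interleave back before the stores. A scalar C sketch of one single-precision iteration (the pointer layout and helper name are assumptions, not the kernel's interface):

    /* One .L18 iteration (alpha_r == 0.0, alpha_i != 0.0) over four interleaved
     * single-precision complex elements x[0..7] = {re0, im0, re1, im1, ...}. */
    static void cscal_imag_only_block_sketch(float *x, float alpha_i)
    {
        float re[4], im[4];
        for (int k = 0; k < 4; k++) {        /* vpickev.w / vpickod.w: deinterleave */
            re[k] = x[2 * k];
            im[k] = x[2 * k + 1];
        }
        for (int k = 0; k < 4; k++) {        /* vfmul.s, vfsub.s against VXZ (zeros) */
            x[2 * k]     = 0.0f - alpha_i * im[k];
            x[2 * k + 1] = alpha_i * re[k];  /* vilvl.w / vilvh.w + vst: re-interleave */
        }
    }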
@@ -246,11 +316,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
add.d X, X, INCX
addi.d I, I, -1
- blt $r0, I, .L221
- b .L997
+ blt $r0, I, .L27
+ b .L29
.align 3

- .L223 : //alpha_r != 0.0 && alpha_i == 0.0
+ .L26 : //alpha_r != 0.0 && alpha_i == 0.0
#ifdef DOUBLE
ld.d t1, X, 0 * SIZE
ld.d t2, X, 1 * SIZE
@@ -327,11 +397,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vstelm.w x4, XX, 1 * SIZE, 3
#endif
add.d XX, XX, INCX
- blt $r0, I, .L223
- b .L997
+ blt $r0, I, .L26
+ b .L29
.align 3

- .L224 : //alpha_r != 0.0 && alpha_i != 0.0
+ .L25 : //alpha_r != 0.0 && alpha_i != 0.0
#ifdef DOUBLE
ld.d t1, X, 0 * SIZE
ld.d t2, X, 1 * SIZE
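When INCX != 1 the kernel cannot use contiguous vld/vst, so the renamed .L25/.L26 blocks here, and the new .L28 block in the next hunk, gather elements lane by lane with ld.d/ld.w plus vinsgr2vr, run the same vector arithmetic, and scatter the results with vstelm while stepping X and XX by INCX. A rough C picture of that gather-compute-scatter pattern, shown for two double-precision complex elements of the alpha_r == 0 case (incx here is counted in complex elements; the helper name is illustrative):

    /* Strided gather/compute/scatter sketch matching the .L28 structure:
     * process two complex doubles per call, stepping by incx complex elements. */
    static void zscal_strided_pair_sketch(double *x, long incx, double alpha_i)
    {
        double re[2], im[2];
        for (int k = 0; k < 2; k++) {          /* ld.d + vinsgr2vr.d: gather lanes */
            re[k] = x[2 * incx * k];
            im[k] = x[2 * incx * k + 1];
        }
        for (int k = 0; k < 2; k++) {          /* vfmul.d / vfsub.d on whole vectors */
            x[2 * incx * k]     = 0.0 - alpha_i * im[k];   /* vstelm.d lane k, then */
            x[2 * incx * k + 1] = alpha_i * re[k];         /* add.d XX, XX, INCX    */
        }
    }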
@@ -414,16 +484,143 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
vstelm.w x4, XX, 1 * SIZE, 3
#endif
add.d XX, XX, INCX
- blt $r0, I, .L224
- b .L997
+ blt $r0, I, .L25
+ b .L29
.align 3

- .L997:
- andi I, N, 3
- bge $r0, I, .L999
+ .L28: //alpha_r == 0.0 && alpha_i != 0.0
+ #ifdef DOUBLE
+ ld.d t1, X, 0 * SIZE
+ ld.d t2, X, 1 * SIZE
+ add.d X, X, INCX
+ ld.d t3, X, 0 * SIZE
+ ld.d t4, X, 1 * SIZE
+ add.d X, X, INCX
+ vinsgr2vr.d x1, t1, 0
+ vinsgr2vr.d x2, t2, 0
+ vinsgr2vr.d x1, t3, 1
+ vinsgr2vr.d x2, t4, 1
+ vfmul.d x3, VXAI, x2
+ vfsub.d x3, VXZ, x3
+ vfmul.d x4, VXAI, x1
+ vstelm.d x3, XX, 0 * SIZE, 0
+ vstelm.d x4, XX, 1 * SIZE, 0
+ add.d XX, XX, INCX
+ vstelm.d x3, XX, 0 * SIZE, 1
+ vstelm.d x4, XX, 1 * SIZE, 1
+ add.d XX, XX, INCX
+
+ ld.d t1, X, 0 * SIZE
+ ld.d t2, X, 1 * SIZE
+ add.d X, X, INCX
+ ld.d t3, X, 0 * SIZE
+ ld.d t4, X, 1 * SIZE
+ vinsgr2vr.d x1, t1, 0
+ vinsgr2vr.d x2, t2, 0
+ vinsgr2vr.d x1, t3, 1
+ vinsgr2vr.d x2, t4, 1
+ add.d X, X, INCX
+ vfmul.d x3, VXAI, x2
+ vfsub.d x3, VXZ, x3
+ vfmul.d x4, VXAI, x1
+ addi.d I, I, -1
+ vstelm.d x3, XX, 0 * SIZE, 0
+ vstelm.d x4, XX, 1 * SIZE, 0
+ add.d XX, XX, INCX
+ vstelm.d x3, XX, 0 * SIZE, 1
+ vstelm.d x4, XX, 1 * SIZE, 1
+ #else
+ ld.w t1, X, 0 * SIZE
+ ld.w t2, X, 1 * SIZE
+ add.d X, X, INCX
+ ld.w t3, X, 0 * SIZE
+ ld.w t4, X, 1 * SIZE
+ add.d X, X, INCX
+ vinsgr2vr.w x1, t1, 0
+ vinsgr2vr.w x2, t2, 0
+ vinsgr2vr.w x1, t3, 1
+ vinsgr2vr.w x2, t4, 1
+ ld.w t1, X, 0 * SIZE
+ ld.w t2, X, 1 * SIZE
+ add.d X, X, INCX
+ ld.w t3, X, 0 * SIZE
+ ld.w t4, X, 1 * SIZE
+ vinsgr2vr.w x1, t1, 2
+ vinsgr2vr.w x2, t2, 2
+ vinsgr2vr.w x1, t3, 3
+ vinsgr2vr.w x2, t4, 3
+ add.d X, X, INCX
+
+ vfmul.s x3, VXAI, x2
+ vfsub.s x3, VXZ, x3
+ vfmul.s x4, VXAI, x1
+ addi.d I, I, -1
+ vstelm.w x3, XX, 0 * SIZE, 0
+ vstelm.w x4, XX, 1 * SIZE, 0
+ add.d XX, XX, INCX
+ vstelm.w x3, XX, 0 * SIZE, 1
+ vstelm.w x4, XX, 1 * SIZE, 1
+ add.d XX, XX, INCX
+ vstelm.w x3, XX, 0 * SIZE, 2
+ vstelm.w x4, XX, 1 * SIZE, 2
+ add.d XX, XX, INCX
+ vstelm.w x3, XX, 0 * SIZE, 3
+ vstelm.w x4, XX, 1 * SIZE, 3
+ #endif
+ add.d XX, XX, INCX
+ blt $r0, I, .L28
+ b .L29
+ .align 3
+
+ /////// INCX != 1 && N < 8 ///////
+ .L29:
+ andi I, N, 3
+ beqz I, .L999
+ bnez DUMMY2, .L998 // if DUMMY2 == 1, called from c/zscal.
+
+ bceqz $fcc0, .L23_1
+ b .L24_1
+
+ .L23_1:
+ bceqz $fcc1, .L998 // alpha_r != 0.0 && alpha_i != 0.0
+ b .L997 // alpha_r != 0.0 && alpha_i == 0.0
+
+ .L24_1:
+ bceqz $fcc1, .L996 // alpha_r == 0.0 && alpha_i != 0.0
+ b .L995 // alpha_r == 0.0 && alpha_i == 0.0
.align 3

- .L998:
+ .L995: // alpha_r == 0.0 && alpha_i == 0.0
+ ST a1, X, 0 * SIZE
+ ST a1, X, 1 * SIZE
+ addi.d I, I, -1
+ add.d X, X, INCX
+ blt $r0, I, .L995
+ b .L999
+ .L996: // alpha_r == 0.0 && alpha_i != 0.0
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ addi.d I, I, -1
+ MUL s1, ALPHAI, a2
+ MUL s2, ALPHAI, a1
+ SUB s1, $f12, s1
+ ST s1, X, 0 * SIZE
+ ST s2, X, 1 * SIZE
+ add.d X, X, INCX
+ blt $r0, I, .L996
+ b .L999
+ .L997: // alpha_r != 0.0 && alpha_i == 0.0
+ LD a1, X, 0 * SIZE
+ LD a2, X, 1 * SIZE
+ addi.d I, I, -1
+ MUL s1, ALPHAR, a1
+ MUL s2, ALPHAR, a2
+ ST s1, X, 0 * SIZE
+ ST s2, X, 1 * SIZE
+ add.d X, X, INCX
+ blt $r0, I, .L997
+ b .L999
+ .L998: // alpha_r != 0.0 && alpha_i != 0.0, one by one
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
addi.d I, I, -1
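The new .L995-.L998 loops give the N & 3 leftover elements a dedicated scalar tail per alpha case instead of funnelling everything through one generic loop, and .L998 is also where the DUMMY2 checks branch for the remainder. As a reference point, the full-multiply tail corresponds to a scalar loop like this (a sketch with illustrative names; incx is counted in complex elements):

    /* Sketch of the .L998 tail: one full complex multiply per leftover element. */
    static void cscal_full_tail_sketch(long tail, float alpha_r, float alpha_i,
                                       float *x, long incx)
    {
        for (; tail > 0; tail--, x += 2 * incx) {
            float a1 = x[0], a2 = x[1];            /* LD a1 / LD a2 */
            x[0] = alpha_r * a1 - alpha_i * a2;    /* -> ST s1 */
            x[1] = alpha_r * a2 + alpha_i * a1;    /* -> ST s2 */
        }
    }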
@@ -435,7 +632,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ST s2, X, 1 * SIZE
add.d X, X, INCX
blt $r0, I, .L998
- .align 3
+ b .L999

.L999:
move $r4, $r12