@@ -150,6 +150,28 @@ static inline __m128i packNibbles( __m256i bytes )
 #endif
 }
 #elif defined(__AVX__)
+static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
+{
+    // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
+    const __m128i lowByte = _mm_set1_epi16( 0xFF );
+    __m128i high = _mm_andnot_si128( lowByte, bytes1 );
+    __m128i low = _mm_and_si128( lowByte, bytes1 );
+    high = _mm_srli_epi16( high, 4 );
+    bytes1 = _mm_or_si128( low, high );
+    high = _mm_andnot_si128( lowByte, bytes2 );
+    low = _mm_and_si128( lowByte, bytes2 );
+    high = _mm_srli_epi16( high, 4 );
+    bytes2 = _mm_or_si128( low, high );
+
+    return _mm_packus_epi16( bytes1, bytes2 );
+}
+
+static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) {
+    const __m128i ax = _mm_sign_epi8(x, x);
+    const __m128i sy = _mm_sign_epi8(y, x);
+    return _mm_maddubs_epi16(ax, sy);
+}
+
 // spread 32 bits to 32 bytes { 0x00, 0xFF }
 static inline __m256i bytes_from_bits_32(const uint8_t * x) {
     uint32_t x32;
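A note on the new mul_add_epi8_sse helper in the hunk above: _mm_maddubs_epi16 treats its first operand as unsigned bytes, so the sign of x is first folded into y (ax = |x|, sy = y with x's sign applied via _mm_sign_epi8), and the unsigned-times-signed multiply then reproduces the signed products x[i]*y[i]. The scalar sketch below is illustrative only; mul_add_epi8_ref is a hypothetical name, not part of the patch.

    #include <stdint.h>

    // Hypothetical scalar reference for mul_add_epi8_sse (illustration only):
    // each 16-bit output lane holds the sum of two adjacent signed i8 products.
    // For the quantized value ranges used in this file the saturating add in
    // _mm_maddubs_epi16 should never saturate, so this matches the intrinsic path.
    static inline void mul_add_epi8_ref(const int8_t x[16], const int8_t y[16], int16_t out[8]) {
        for (int k = 0; k < 8; ++k) {
            out[k] = (int16_t)(x[2*k + 0]*y[2*k + 0] + x[2*k + 1]*y[2*k + 1]);
        }
    }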
@@ -217,26 +239,29 @@ static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
     return sum_i16_pairs_float(doth, dotl);
 }
 
-static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
-{
-    // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
-    const __m128i lowByte = _mm_set1_epi16( 0xFF );
-    __m128i high = _mm_andnot_si128( lowByte, bytes1 );
-    __m128i low = _mm_and_si128( lowByte, bytes1 );
-    high = _mm_srli_epi16( high, 4 );
-    bytes1 = _mm_or_si128( low, high );
-    high = _mm_andnot_si128( lowByte, bytes2 );
-    low = _mm_and_si128( lowByte, bytes2 );
-    high = _mm_srli_epi16( high, 4 );
-    bytes2 = _mm_or_si128( low, high );
+// larger version of mul_sum_i8_pairs_float where x and y are each represented by four 128-bit vectors
+static inline __m256 mul_sum_i8_quad_float(const __m128i x_1_0, const __m128i x_1_1, const __m128i x_2_0, const __m128i x_2_1,
+                                           const __m128i y_1_0, const __m128i y_1_1, const __m128i y_2_0, const __m128i y_2_1) {
+    const __m128i mone = _mm_set1_epi16(1);
 
-    return _mm_packus_epi16( bytes1, bytes2 );
+    const __m128i p16_1_0 = mul_add_epi8_sse(x_1_0, y_1_0);
+    const __m128i p16_1_1 = mul_add_epi8_sse(x_1_1, y_1_1);
+    const __m128i p16_2_0 = mul_add_epi8_sse(x_2_0, y_2_0);
+    const __m128i p16_2_1 = mul_add_epi8_sse(x_2_1, y_2_1);
+    const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone);
+    const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
+    const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
+    const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
+    const __m128i p_1 = _mm_add_epi32(p_1_0, p_1_1);
+    const __m128i p_2 = _mm_add_epi32(p_2_0, p_2_1);
+    return _mm256_cvtepi32_ps(MM256_SET_M128I(p_2, p_1));
 }
 
-static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) {
-    const __m128i ax = _mm_sign_epi8(x, x);
-    const __m128i sy = _mm_sign_epi8(y, x);
-    return _mm_maddubs_epi16(ax, sy);
+// quad fp16 delta calculation
+static inline __m256 quad_fp16_delta_float(const float x0, const float y0, const float x1, const float y1) {
+    // GGML_FP16_TO_FP32 is faster than Intel F16C
+    return _mm256_set_m128(_mm_set1_ps(GGML_FP16_TO_FP32(x1) * GGML_FP16_TO_FP32(y1)),
+                           _mm_set1_ps(GGML_FP16_TO_FP32(x0) * GGML_FP16_TO_FP32(y0)));
 }
 #endif
 #elif defined(__SSSE3__)
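For reference, each 128-bit half of the __m256 returned by mul_sum_i8_quad_float holds four float lanes whose sum is the plain 32-element signed i8 dot product of one block, and quad_fp16_delta_float broadcasts the matching per-block scale d_x*d_y into the same halves, so one _mm256_mul_ps/_mm256_add_ps pair accumulates two blocks at once. A hypothetical scalar reference for one block (dot_i8_block32_ref is an illustrative name, not in the patch):

    #include <stdint.h>

    // Hypothetical scalar reference (illustration only): summing the four lanes
    // of one half of mul_sum_i8_quad_float's result gives this block dot product,
    // before scaling by the fp16 deltas.
    static inline int32_t dot_i8_block32_ref(const int8_t x[32], const int8_t y[32]) {
        int32_t sum = 0;
        for (int i = 0; i < 32; ++i) {
            sum += (int32_t)x[i] * (int32_t)y[i];
        }
        return sum;
    }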
@@ -2004,10 +2029,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
 
     sumf = hsum_float_8(acc);
 #elif defined(__AVX__)
-    const __m128i mone = _mm_set1_epi16(1);
-
-    __m256 accum1 = _mm256_setzero_ps();
-    __m256 accum2 = _mm256_setzero_ps();
+    __m256 accum = _mm256_setzero_ps();
     for (; ib + 1 < nb; ib += 2) {
         const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
         const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
@@ -2020,21 +2042,20 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
         const __m128i q4b_1_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_1, 4)), _mm_set1_epi8(8));
         const __m128i q4b_2_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_2), _mm_set1_epi8(8));
         const __m128i q4b_2_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_2, 4)), _mm_set1_epi8(8));
+
         const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
         const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
         const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
         const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
-        const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone);
-        const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
-        const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
-        const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
-        accum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)),
-                _mm256_cvtepi32_ps(MM256_SET_M128I(p_1_1, p_1_0))), accum1);
-        accum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)),
-                _mm256_cvtepi32_ps(MM256_SET_M128I(p_2_1, p_2_0))), accum2);
+        const __m128i p_1 = _mm_add_epi16(p16_1_0, p16_1_1);
+        const __m128i p_2 = _mm_add_epi16(p16_2_0, p16_2_1);
+        const __m256 p = sum_i16_pairs_float(p_2, p_1);
+
+        const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d);
+        accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
     }
 
-    sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));
+    sumf = hsum_float_8(accum);
 #elif defined(__SSSE3__)
     // set constants
     const __m128i lowMask = _mm_set1_epi8(0xF);
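The q4_0 loop above can add the 16-bit maddubs results directly with _mm_add_epi16 and leave the widening to sum_i16_pairs_float because, after the -8 offset, the 4-bit quants lie in [-8, 7] while q8_0 quants stay within about [-127, 127], so the partial sums cannot overflow int16_t. A hedged compile-time sketch of that bound under those assumptions (not code from the patch):

    #include <assert.h>

    // Illustrative bound (hypothetical check, not in the patch), assuming
    // |q4| <= 8 after the offset and |q8| <= 127: one _mm_maddubs_epi16 lane is
    // at most 2*8*127 = 2032, and _mm_add_epi16 of two such lanes is at most
    // 4064, far below INT16_MAX, so the widening madd can be deferred.
    static_assert(2 * (2 * 8 * 127) <= 32767, "q4_0 16-bit partial sums cannot overflow");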
@@ -3535,7 +3556,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     }
 
     sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
-#elif defined(__AVX2__) || defined(__AVX__)
+#elif defined(__AVX2__)
     // Initialize accumulator with zeros
     __m256 acc = _mm256_setzero_ps();
 
@@ -3549,14 +3570,29 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
         const __m256 q = mul_sum_i8_pairs_float(qx, qy);
 
         // Multiply q with scale and accumulate
-#if defined(__AVX2__)
         acc = _mm256_fmadd_ps( d, q, acc );
-#else
-        acc = _mm256_add_ps( _mm256_mul_ps( d, q ), acc );
-#endif
     }
 
     sumf = hsum_float_8(acc);
+#elif defined(__AVX__)
+    __m256 accum = _mm256_setzero_ps();
+
+    for (; ib + 1 < nb; ib += 2) {
+        const __m128i qx_1_0 = _mm_loadu_si128((const __m128i *)x[ib].qs);
+        const __m128i qx_1_1 = _mm_loadu_si128((const __m128i *)x[ib].qs + 1);
+        const __m128i qx_2_0 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
+        const __m128i qx_2_1 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs + 1);
+        const __m128i qy_1_0 = _mm_loadu_si128((const __m128i *)y[ib].qs);
+        const __m128i qy_1_1 = _mm_loadu_si128((const __m128i *)y[ib].qs + 1);
+        const __m128i qy_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
+        const __m128i qy_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1);
+
+        const __m256 p = mul_sum_i8_quad_float(qx_1_0, qx_1_1, qx_2_0, qx_2_1, qy_1_0, qy_1_1, qy_2_0, qy_2_1);
+        const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d);
+        accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
+    }
+
+    sumf = hsum_float_8(accum);
 #elif defined(__riscv_v_intrinsic)
     size_t vl = __riscv_vsetvl_e8m1(qk);
 
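Unlike the q4_0 loop, this new q8_0 AVX path routes through mul_sum_i8_quad_float, which widens each maddubs result to 32 bits with _mm_madd_epi16 before adding. That appears necessary because with both operands up to |127| a single 16-bit lane is already near the int16_t limit. A hypothetical compile-time sketch of the bound, assuming q8_0 quants stay within [-127, 127]:

    #include <assert.h>

    // Hypothetical check (not in the patch): one maddubs lane (2*127*127 = 32258)
    // still fits in int16_t, but adding two lanes the way the q4_0 path does
    // would overflow, hence the widening _mm_madd_epi16 in mul_sum_i8_quad_float.
    static_assert(    2 * 127 * 127 <= 32767, "one maddubs lane fits in int16_t");
    static_assert(2 * 2 * 127 * 127 >  32767, "a pairwise 16-bit add would not");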
@@ -10322,10 +10358,8 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
 #elif defined __AVX__
     const __m128i values128 = _mm_loadu_si128((const __m128i *)kvalues_iq4nl);
     const __m128i m4b  = _mm_set1_epi8(0x0f);
-    const __m128i mone = _mm_set1_epi16(1);
 
-    __m256 accum1 = _mm256_setzero_ps();
-    __m256 accum2 = _mm256_setzero_ps();
+    __m256 accum = _mm256_setzero_ps();
     for (; ib + 1 < nb; ib += 2) {
         const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
         const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
@@ -10338,21 +10372,13 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
         const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b));
         const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b));
         const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b));
-        const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
-        const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
-        const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
-        const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
-        const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone);
-        const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
-        const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
-        const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
-        accum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)),
-                _mm256_cvtepi32_ps(MM256_SET_M128I(p_1_1, p_1_0))), accum1);
-        accum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)),
-                _mm256_cvtepi32_ps(MM256_SET_M128I(p_2_1, p_2_0))), accum2);
+
+        const __m256 p = mul_sum_i8_quad_float(q4b_1_0, q4b_1_1, q4b_2_0, q4b_2_1, q8b_1_0, q8b_1_1, q8b_2_0, q8b_2_1);
+        const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d);
+        accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
     }
 
-    sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));
+    sumf = hsum_float_8(accum);
 
 #elif defined(__POWER9_VECTOR__)
     const vector signed char lowMask = vec_splats((signed char)0xF);
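In this iq4_nl loop the nibbles are not offset-decoded as in q4_0; each 4-bit index selects an entry of the 16-entry kvalues_iq4nl codebook, with _mm_shuffle_epi8 acting as a 16-way byte table lookup using values128 as the table. A hypothetical scalar sketch of that decode (iq4nl_decode_ref is an illustrative name, not in the patch):

    #include <stdint.h>

    // Hypothetical scalar equivalent of the _mm_shuffle_epi8 lookup above
    // (illustration only): low nibbles give the first 16 decoded bytes,
    // high nibbles the second 16, all taken from the iq4_nl codebook.
    static inline void iq4nl_decode_ref(const uint8_t qs[16], const int8_t kvalues[16],
                                        int8_t lo[16], int8_t hi[16]) {
        for (int i = 0; i < 16; ++i) {
            lo[i] = kvalues[qs[i] & 0x0f];
            hi[i] = kvalues[qs[i] >> 4];
        }
    }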