
Commit 1842922

AVX BF16 and single scale quant optimizations (#10212)
* use 128 bit loads (i've tried 256->128 to death and it's slower)
* double accumulator
* avx bf16 vec dot
* +3% q4_0 inference
* +7% tg +5% pp compared to master
* slower f16c version, kept for reference
* 256b version, also slow. i tried :)
* revert f16
* faster with madd
* split to functions
* Q8_0 and IQ4_NL, 5-7% faster
* fix potential overflow (performance reduced)
* 16 bit add for q4_0 only
* merge
1 parent f0204a0 commit 1842922

File tree: 2 files changed, +82 -52 lines


ggml/src/ggml-cpu/ggml-cpu-quants.c (+77 -51)
@@ -150,6 +150,28 @@ static inline __m128i packNibbles( __m256i bytes )
 #endif
 }
 #elif defined(__AVX__)
+static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
+{
+    // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
+    const __m128i lowByte = _mm_set1_epi16( 0xFF );
+    __m128i high = _mm_andnot_si128( lowByte, bytes1 );
+    __m128i low = _mm_and_si128( lowByte, bytes1 );
+    high = _mm_srli_epi16( high, 4 );
+    bytes1 = _mm_or_si128( low, high );
+    high = _mm_andnot_si128( lowByte, bytes2 );
+    low = _mm_and_si128( lowByte, bytes2 );
+    high = _mm_srli_epi16( high, 4 );
+    bytes2 = _mm_or_si128( low, high );
+
+    return _mm_packus_epi16( bytes1, bytes2);
+}
+
+static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) {
+    const __m128i ax = _mm_sign_epi8(x, x);
+    const __m128i sy = _mm_sign_epi8(y, x);
+    return _mm_maddubs_epi16(ax, sy);
+}
+
 // spread 32 bits to 32 bytes { 0x00, 0xFF }
 static inline __m256i bytes_from_bits_32(const uint8_t * x) {
     uint32_t x32;
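
Note: mul_add_epi8_sse works around _mm_maddubs_epi16 taking an unsigned-by-signed operand pair. It feeds |x| as the unsigned operand and moves the sign of x onto y, so every lane product still equals x*y. A minimal scalar sketch of that identity (illustration only, not part of the commit):

    #include <stdint.h>
    #include <stdio.h>

    // maddubs computes u0*s0 + u1*s1 per 16-bit lane, u unsigned, s signed.
    // Passing u = |x| and s = sign(x)*y keeps each term equal to x*y, which
    // is the trick mul_add_epi8_sse relies on.
    static int16_t madd_pair_signed(int8_t x0, int8_t y0, int8_t x1, int8_t y1) {
        uint8_t ax0 = (uint8_t)(x0 < 0 ? -x0 : x0);                 // _mm_sign_epi8(x, x)
        uint8_t ax1 = (uint8_t)(x1 < 0 ? -x1 : x1);
        int8_t  sy0 = (int8_t)(x0 < 0 ? -y0 : (x0 == 0 ? 0 : y0));  // _mm_sign_epi8(y, x)
        int8_t  sy1 = (int8_t)(x1 < 0 ? -y1 : (x1 == 0 ? 0 : y1));
        return (int16_t)(ax0 * sy0 + ax1 * sy1);                    // == x0*y0 + x1*y1
    }

    int main(void) {
        printf("%d\n", madd_pair_signed(-7, 3, 5, -2));             // prints -31
        return 0;
    }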
@@ -217,26 +239,29 @@ static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
     return sum_i16_pairs_float(doth, dotl);
 }
 
-static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
-{
-    // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
-    const __m128i lowByte = _mm_set1_epi16( 0xFF );
-    __m128i high = _mm_andnot_si128( lowByte, bytes1 );
-    __m128i low = _mm_and_si128( lowByte, bytes1 );
-    high = _mm_srli_epi16( high, 4 );
-    bytes1 = _mm_or_si128( low, high );
-    high = _mm_andnot_si128( lowByte, bytes2 );
-    low = _mm_and_si128( lowByte, bytes2 );
-    high = _mm_srli_epi16( high, 4 );
-    bytes2 = _mm_or_si128( low, high );
+// larger version of mul_sum_i8_pairs_float where x and y are each represented by four 128-bit vectors
+static inline __m256 mul_sum_i8_quad_float(const __m128i x_1_0, const __m128i x_1_1, const __m128i x_2_0, const __m128i x_2_1,
+                                           const __m128i y_1_0, const __m128i y_1_1, const __m128i y_2_0, const __m128i y_2_1) {
+    const __m128i mone = _mm_set1_epi16(1);
 
-    return _mm_packus_epi16( bytes1, bytes2);
+    const __m128i p16_1_0 = mul_add_epi8_sse(x_1_0, y_1_0);
+    const __m128i p16_1_1 = mul_add_epi8_sse(x_1_1, y_1_1);
+    const __m128i p16_2_0 = mul_add_epi8_sse(x_2_0, y_2_0);
+    const __m128i p16_2_1 = mul_add_epi8_sse(x_2_1, y_2_1);
+    const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone);
+    const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
+    const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
+    const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
+    const __m128i p_1 = _mm_add_epi32(p_1_0, p_1_1);
+    const __m128i p_2 = _mm_add_epi32(p_2_0, p_2_1);
+    return _mm256_cvtepi32_ps(MM256_SET_M128I(p_2, p_1));
 }
 
-static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) {
-    const __m128i ax = _mm_sign_epi8(x, x);
-    const __m128i sy = _mm_sign_epi8(y, x);
-    return _mm_maddubs_epi16(ax, sy);
+// quad fp16 delta calculation
+static inline __m256 quad_fp16_delta_float(const float x0, const float y0, const float x1, const float y1) {
+    // GGML_FP16_TO_FP32 is faster than Intel F16C
+    return _mm256_set_m128(_mm_set1_ps(GGML_FP16_TO_FP32(x1) * GGML_FP16_TO_FP32(y1)),
+                           _mm_set1_ps(GGML_FP16_TO_FP32(x0) * GGML_FP16_TO_FP32(y0)));
 }
 #endif
 #elif defined(__SSSE3__)
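
Note: these two helpers let the AVX paths process two quant blocks per iteration with a single accumulator. mul_sum_i8_quad_float leaves the partial dot product of block 1 in the low 128-bit half and that of block 2 in the high half, and quad_fp16_delta_float broadcasts the matching d_x*d_y scale into each half. A scalar sketch of the value one loop iteration contributes (an illustration assuming 32-element blocks, not the commit's code):

    #include <stdint.h>

    // Per pair of blocks the vector loop accumulates:
    //   dx0*dy0 * dot(x0, y0)  +  dx1*dy1 * dot(x1, y1)
    // hsum_float_8() later reduces the eight float lanes to this scalar.
    static float block_pair_contribution(const int8_t x0[32], const int8_t y0[32], float dx0, float dy0,
                                         const int8_t x1[32], const int8_t y1[32], float dx1, float dy1) {
        int32_t dot0 = 0, dot1 = 0;
        for (int i = 0; i < 32; ++i) {
            dot0 += (int32_t)x0[i] * y0[i];
            dot1 += (int32_t)x1[i] * y1[i];
        }
        return dx0 * dy0 * (float)dot0 + dx1 * dy1 * (float)dot1;
    }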
@@ -2004,10 +2029,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
 
     sumf = hsum_float_8(acc);
 #elif defined(__AVX__)
-    const __m128i mone = _mm_set1_epi16(1);
-
-    __m256 accum1 = _mm256_setzero_ps();
-    __m256 accum2 = _mm256_setzero_ps();
+    __m256 accum = _mm256_setzero_ps();
     for (; ib + 1 < nb; ib += 2) {
         const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
         const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
@@ -2020,21 +2042,20 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
         const __m128i q4b_1_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_1, 4)), _mm_set1_epi8(8));
         const __m128i q4b_2_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_2), _mm_set1_epi8(8));
         const __m128i q4b_2_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_2, 4)), _mm_set1_epi8(8));
+
         const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
         const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
         const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
         const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
-        const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone);
-        const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
-        const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
-        const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
-        accum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)),
-                _mm256_cvtepi32_ps(MM256_SET_M128I(p_1_1, p_1_0))), accum1);
-        accum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)),
-                _mm256_cvtepi32_ps(MM256_SET_M128I(p_2_1, p_2_0))), accum2);
+        const __m128i p_1 = _mm_add_epi16(p16_1_0, p16_1_1);
+        const __m128i p_2 = _mm_add_epi16(p16_2_0, p16_2_1);
+        const __m256 p = sum_i16_pairs_float(p_2, p_1);
+
+        const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d);
+        accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
     }
 
-    sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));
+    sumf = hsum_float_8(accum);
 #elif defined(__SSSE3__)
     // set constants
     const __m128i lowMask = _mm_set1_epi8(0xF);
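
Note: this is the "16 bit add for q4_0 only" item from the commit message. The q4_0 path can combine the two maddubs results with a plain 16-bit add before widening, while the generic helper above must widen first. A back-of-the-envelope bound (my own derivation, not text from the commit):

    #include <assert.h>

    // q4_0 values lie in [-8, 7] after the -8 bias; q8_0 values in [-128, 127].
    int main(void) {
        const int q4_pair_max  = 2 * 8 * 128;        // one maddubs lane: 2048
        const int q4_add16_max = 2 * q4_pair_max;    // 16-bit add of two lanes: 4096
        assert(q4_add16_max <= 32767);               // safely inside int16_t
        const int q8_pair_max  = 2 * 127 * 128;      // 32512: a q8_0*q8_0 lane is already
        assert(q8_pair_max <= 32767);                // at the edge, hence the 32-bit
        return 0;                                    // widening in mul_sum_i8_quad_float
    }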
@@ -3535,7 +3556,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     }
 
     sumf = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
-#elif defined(__AVX2__) || defined(__AVX__)
+#elif defined(__AVX2__)
     // Initialize accumulator with zeros
     __m256 acc = _mm256_setzero_ps();
 
@@ -3549,14 +3570,29 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void * r
         const __m256 q = mul_sum_i8_pairs_float(qx, qy);
 
         // Multiply q with scale and accumulate
-#if defined(__AVX2__)
         acc = _mm256_fmadd_ps( d, q, acc );
-#else
-        acc = _mm256_add_ps( _mm256_mul_ps( d, q ), acc );
-#endif
     }
 
     sumf = hsum_float_8(acc);
+#elif defined(__AVX__)
+    __m256 accum = _mm256_setzero_ps();
+
+    for (; ib + 1 < nb; ib += 2) {
+        const __m128i qx_1_0 = _mm_loadu_si128((const __m128i *)x[ib].qs);
+        const __m128i qx_1_1 = _mm_loadu_si128((const __m128i *)x[ib].qs + 1);
+        const __m128i qx_2_0 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
+        const __m128i qx_2_1 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs + 1);
+        const __m128i qy_1_0 = _mm_loadu_si128((const __m128i *)y[ib].qs);
+        const __m128i qy_1_1 = _mm_loadu_si128((const __m128i *)y[ib].qs + 1);
+        const __m128i qy_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
+        const __m128i qy_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1);
+
+        const __m256 p = mul_sum_i8_quad_float(qx_1_0, qx_1_1, qx_2_0, qx_2_1, qy_1_0, qy_1_1, qy_2_0, qy_2_1);
+        const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d);
+        accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
+    }
+
+    sumf = hsum_float_8(accum);
 #elif defined(__riscv_v_intrinsic)
     size_t vl = __riscv_vsetvl_e8m1(qk);
 
@@ -10322,10 +10358,8 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
 #elif defined __AVX__
     const __m128i values128 = _mm_loadu_si128((const __m128i*)kvalues_iq4nl);
     const __m128i m4b = _mm_set1_epi8(0x0f);
-    const __m128i mone = _mm_set1_epi16(1);
 
-    __m256 accum1 = _mm256_setzero_ps();
-    __m256 accum2 = _mm256_setzero_ps();
+    __m256 accum = _mm256_setzero_ps();
     for (; ib + 1 < nb; ib += 2) {
         const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
         const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
@@ -10338,21 +10372,13 @@ void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void *
         const __m128i q4b_1_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_1, 4), m4b));
         const __m128i q4b_2_0 = _mm_shuffle_epi8(values128, _mm_and_si128(q4bits_2, m4b));
         const __m128i q4b_2_1 = _mm_shuffle_epi8(values128, _mm_and_si128(_mm_srli_epi16(q4bits_2, 4), m4b));
-        const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
-        const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
-        const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
-        const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
-        const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone);
-        const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
-        const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
-        const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
-        accum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 0].d)*GGML_FP16_TO_FP32(x[ib + 0].d)),
-                _mm256_cvtepi32_ps(MM256_SET_M128I(p_1_1, p_1_0))), accum1);
-        accum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(y[ib + 1].d)*GGML_FP16_TO_FP32(x[ib + 1].d)),
-                _mm256_cvtepi32_ps(MM256_SET_M128I(p_2_1, p_2_0))), accum2);
+
+        const __m256 p = mul_sum_i8_quad_float(q4b_1_0, q4b_1_1, q4b_2_0, q4b_2_1, q8b_1_0, q8b_1_1, q8b_2_0, q8b_2_1);
+        const __m256 deltas = quad_fp16_delta_float(x[ib].d, y[ib].d, x[ib + 1].d, y[ib + 1].d);
+        accum = _mm256_add_ps(_mm256_mul_ps(deltas, p), accum);
     }
 
-    sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));
+    sumf = hsum_float_8(accum);
 
 #elif defined(__POWER9_VECTOR__)
     const vector signed char lowMask = vec_splats((signed char)0xF);
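
Note: the IQ4_NL path differs from q4_0 only in how a nibble becomes an int8 value. Instead of subtracting 8, each 4-bit index selects an entry of the non-linear kvalues_iq4nl codebook, which is what the _mm_shuffle_epi8 against values128 does 16 lanes at a time. A scalar sketch of that lookup (illustrative only, not the commit's code):

    #include <stdint.h>

    // One nibble -> one signed codebook value; _mm_shuffle_epi8(values128, idx)
    // performs 16 of these lookups in parallel.
    static inline int8_t iq4nl_lookup(uint8_t packed, int high_nibble, const int8_t codebook[16]) {
        uint8_t idx = high_nibble ? (uint8_t)(packed >> 4) : (uint8_t)(packed & 0x0f);
        return codebook[idx];
    }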

ggml/src/ggml-cpu/ggml-cpu.c (+5 -1)
@@ -1469,8 +1469,12 @@ static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t
     sumf += (ggml_float)_mm512_reduce_add_ps(c2);
 
 #undef LOAD
-#elif defined(__AVX2__)
+#elif defined(__AVX2__) || defined(__AVX__)
+#if defined(__AVX2__)
 #define LOAD(p) _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16))
+#else
+#define LOAD(p) _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16)), (_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_bsrli_si128(_mm_loadu_si128((const __m128i *)(p)), 8)), 16)), 1))
+#endif
     __m256 c1 = _mm256_setzero_ps();
     __m256 c2 = _mm256_setzero_ps();
     __m256 c3 = _mm256_setzero_ps();