How to sum all 32-bit or 64-bit sub-registers in an SSE XMM, or AVX YMM, and ZMM register?

Question

Say your task results in a subtotal in each floating-point subregister. I'm not seeing an instruction that would sum the subtotals down to one floating-point total. Do I need to store the MM register in plain old memory then do the sum with simple instructions?

(It's unresolved whether these will be double or single-precision, and I plan on coding for every CPU variation up to the forthcoming (?) 512-bit AVX version if I can find the opcodes.)

See [`_mm_hadd_ps`](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_ps%2520&expand=2757) *et al*. Also [this question](http://stackoverflow.com/questions/8536032/adding-the-components-of-an-sse-register/8536234#8536234). — Paul R, Apr 01 '17 at 17:01
...and [this question](http://stackoverflow.com/a/6996992/253056). — Paul R, Apr 01 '17 at 17:06
...and [this question for an AVX solution](http://stackoverflow.com/a/23189942/253056). — Paul R, Apr 01 '17 at 17:08

Z boson · Accepted Answer · 2017-04-05T09:02:09.950

wget http://www.agner.org/optimize/vectorclass.zip
unzip vectorclass.zip -d vectorclass
cd vectorclass/

This code is GPLv3.

SSE

grep -A11 horizontal_add vectorf128.h

// Horizontal add: returns the sum of the four float elements of a.
// Note: the two code paths pair the elements differently
// ((a0+a1)+(a2+a3) vs. (a0+a2)+(a1+a3)), so rounding may differ slightly.
static inline float horizontal_add (Vec4f const & a) {
#if  INSTRSET >= 3  // SSE3
    // t1 = (a0+a1, a2+a3, a0+a1, a2+a3)
    __m128 t1 = _mm_hadd_ps(a,a);
    // t2 holds the full sum in every element
    __m128 t2 = _mm_hadd_ps(t1,t1);
    // return element 0
    return _mm_cvtss_f32(t2);        
#else
    // t1 = (a2, a3, a2, a3): upper half of a copied into the lower half
    __m128 t1 = _mm_movehl_ps(a,a);
    // t2 elements 0 and 1 = a0+a2 and a1+a3
    __m128 t2 = _mm_add_ps(a,t1);
    // immediate 1 moves element 1 of t2 (a1+a3) into element 0 of t3
    __m128 t3 = _mm_shuffle_ps(t2,t2,1);
    // scalar add of the lowest elements: (a0+a2) + (a1+a3)
    __m128 t4 = _mm_add_ss(t2,t3);
    return _mm_cvtss_f32(t4);
#endif
--
// Horizontal add: returns the sum of the two double elements of a.
static inline double horizontal_add (Vec2d const & a) {
#if  INSTRSET >= 3  // SSE3
    // haddpd places a0+a1 in element 0; extract it as a scalar.
    return _mm_cvtsd_f64(_mm_hadd_pd(a, a));
#else
    // Without SSE3: move the upper double into the lower position using the
    // float-typed movehl, then do a scalar add of the two low elements.
    __m128  as_floats  = _mm_castpd_ps(a);
    __m128d upper_elem = _mm_castps_pd(_mm_movehl_ps(as_floats, as_floats));
    return _mm_cvtsd_f64(_mm_add_sd(a, upper_elem));
#endif
}

AVX

grep -A6 horizontal_add vectorf256.h

// Horizontal add: returns the sum of the eight float elements of a.
static inline float horizontal_add (Vec8f const & a) {
    // hadd works inside each 128-bit half, so after two rounds every element
    // of a half holds the sum of that half's four floats.
    __m256 pair_sums = _mm256_hadd_ps(a, a);
    __m256 half_sums = _mm256_hadd_ps(pair_sums, pair_sums);
    // Combine the two halves with a scalar add on element 0.
    __m128 upper_half = _mm256_extractf128_ps(half_sums, 1);
    __m128 lower_half = _mm256_castps256_ps128(half_sums);
    return _mm_cvtss_f32(_mm_add_ss(lower_half, upper_half));
}
--
// Horizontal add: returns the sum of the four double elements of a.
static inline double horizontal_add (Vec4d const & a) {
    // One in-lane hadd leaves a0+a1 in the lower half and a2+a3 in the upper half.
    __m256d half_sums = _mm256_hadd_pd(a, a);
    // Combine the two halves with a scalar add on element 0.
    __m128d upper_half = _mm256_extractf128_pd(half_sums, 1);
    __m128d lower_half = _mm256_castpd256_pd128(half_sums);
    return _mm_cvtsd_f64(_mm_add_sd(lower_half, upper_half));
}

AVX512

grep -A3 horizontal_add vectorf512.h

// Horizontal add: returns the sum of the sixteen float elements of a.
static inline float horizontal_add (Vec16f const & a) {
#if !defined(__INTEL_COMPILER)
    // Generic path: add the low and high halves element-wise, then reduce the
    // half-width result (presumably via the Vec8f overload).
    return horizontal_add(a.get_low() + a.get_high());
#else
    // Intel compiler: use its built-in reduction.
    return _mm512_reduce_add_ps(a);
#endif
}

--
// Horizontal add: returns the sum of the eight double elements of a.
static inline double horizontal_add (Vec8d const & a) {
#if !defined(__INTEL_COMPILER)
    // Generic path: add the low and high halves element-wise, then reduce the
    // half-width result (presumably via the Vec4d overload).
    return horizontal_add(a.get_low() + a.get_high());
#else
    // Intel compiler: use its built-in reduction.
    return _mm512_reduce_add_pd(a);
#endif
}

get_high() and get_low()

// Returns the upper 256 bits of zmm as eight floats.
Vec8f get_high() const {
    // The extraction is done with the 64x4 (double-typed) instruction, so the
    // register is cast to double and the result cast back to float; the casts
    // only reinterpret the bits.
    __m512d as_doubles = _mm512_castps_pd(zmm);
    __m256d upper_half = _mm512_extractf64x4_pd(as_doubles, 1);
    return _mm256_castpd_ps(upper_half);
}
// Returns the lower 256 bits of zmm as eight floats.
Vec8f get_low() const {
    // The 512 -> 256 cast keeps the lower half of the register.
    __m256 lower_half = _mm512_castps512_ps256(zmm);
    return lower_half;
}

// Returns the lower 256 bits of zmm as four doubles.
Vec4d get_low() const {
    // The 512 -> 256 cast keeps the lower half of the register.
    __m256d lower_half = _mm512_castpd512_pd256(zmm);
    return lower_half;
}

// Returns the upper 256 bits of zmm as four doubles.
Vec4d get_high() const {
    // Index 1 selects the upper 256-bit half.
    __m256d upper_half = _mm512_extractf64x4_pd(zmm, 1);
    return upper_half;
}

For integers look for horizontal_add in vectori128.h, vectori256.h, and vectori512.h.

You can also use the Vector Class Library (VCL) directly

#include <stdio.h>
#define MAX_VECTOR_SIZE 512
#include "vectorclass.h"

int main(void) {
  // x = 1, 2, ..., 16, so the first k entries sum to k*(k+1)/2.
  float x[16];
  for (int i = 0; i < 16; ++i) {
    x[i] = i + 1;
  }

  // Load x into vectors of 4, 8 and 16 floats.
  Vec4f  v4  =  Vec4f().load(x);
  Vec8f  v8  =  Vec8f().load(x);
  Vec16f v16 = Vec16f().load(x);

  // Closed-form sums to compare against.
  const int expect4  = 4*5/2;
  const int expect8  = 8*9/2;
  const int expect16 = 16*17/2;

  // Print each horizontal sum next to its expected value.
  printf("%f %d\n", horizontal_add(v4), expect4);
  printf("%f %d\n", horizontal_add(v8), expect8);
  printf("%f %d\n", horizontal_add(v16), expect16);
}

Compile like this (GCC only my KNL is too old for AVX512)

SSE2:     g++  -O3 test.cpp
AVX:      g++  -O3 -mavx test.cpp
AVX512ER: icpc -O3 -xMIC-AVX512 test.cpp

output

10.000000 10
36.000000 36
136.000000 136

One nice thing about the VCL is that if you use e.g. Vec8f on a system that only has SSE2, it will emulate the AVX vector by using SSE twice.

See the section "Instruction sets and CPU dispatching" in the vectorclass.pdf manual for how to compile for different instruction sets with MSVC, ICC, Clang, and GCC.

`Rant = ON;`Good answer, but would be nice if less people used intrinsics and more used assembly. `Rant = OFF;` — annoying_squid, Jul 26 '18 at 18:36

score 2 · Answer 2 · answered Apr 01 '17 at 17:51

I have implemented the following inline function for AVX2. It sums all elements and returns the result. You can treat it as a suggested starting point for developing your own function for this purpose.

Note: _mm256_extract_epi32 is not available for AVX; you can instead use your own method based on vmovss, such as float _mm256_cvtss_f32 (__m256 a), and develop your own horizontal addition functions.

// Horizontal addition of epi32: returns the sum of the eight 32-bit integers in a.
inline int _mm256_hadd2_epi32(__m256i a)
{
    // With both sources equal to a, imm8 = 1 swaps the two 128-bit halves:
    // the result's low half is a's high half and vice versa.
    const __m256i swapped = _mm256_permute2x128_si256(a, a, 1);

    // hadd works inside each 128-bit half. Pairing a with its swapped copy
    // puts all four pair sums (a0+a1, a2+a3, a4+a5, a6+a7) into the low half.
    const __m256i pair_sums = _mm256_hadd_epi32(a, swapped);
    // Low half now holds (a0..a3, a4..a7, a0..a3, a4..a7) partial sums.
    const __m256i quad_sums = _mm256_hadd_epi32(pair_sums, pair_sums);
    // Every element of the low half now holds the full sum.
    const __m256i total     = _mm256_hadd_epi32(quad_sums, quad_sums);

    return _mm256_extract_epi32(total, 0);
}

How to sum all 32-bit or 64-bit sub-registers in an SSE XMM, or AVX YMM, and ZMM register?

2 Answers2