I'm trying to take the contents of ymm0, ymm1 and ymm2 (96 bytes in total), break them into 12-byte chunks, XOR the chunks together, and store the 12-byte result in a buffer. The following code works, but it is really cumbersome; it would be great if someone could point me to a more elegant way to do it.
# XOR-fold the 96 bytes held in ymm0:ymm1:ymm2 into one 12-byte value, i.e. the
# XOR of the eight consecutive 12-byte chunks, and store it at [rdi].
# Byte order: ymm0 = bytes 0..31, ymm1 = bytes 32..63, ymm2 = bytes 64..95
# (low lane first within each register).
#
# In:       ymm0, ymm1, ymm2 = input data
#           rdi              = result buffer, at least 12 writable bytes
# Out:      [rdi .. rdi+11]  = XOR of all eight 12-byte chunks
# Clobbers: xmm0-xmm5 (the VEX encodings also zero the upper ymm halves)
#
# Every instruction is VEX-encoded on purpose: mixing legacy-SSE encodings with
# dirty upper YMM state costs transition penalties / false dependencies.

# Step 1: fold 96 -> 48 bytes. 48 is a multiple of 12, so byte i and byte i+48
# sit at the same offset inside their 12-byte chunks and can be XORed directly.
# The low lanes need no extraction: they already are xmm0 / xmm1 / xmm2.
    vextracti128 xmm3, ymm0, 1          # xmm3 = bytes 16..31
    vextracti128 xmm4, ymm1, 1          # xmm4 = bytes 48..63
    vextracti128 xmm5, ymm2, 1          # xmm5 = bytes 80..95
    vpxor        xmm0, xmm0, xmm4       # xmm0 = folded bytes  0..15  ( 0..15 ^ 48..63)
    vpxor        xmm3, xmm3, xmm2       # xmm3 = folded bytes 16..31  (16..31 ^ 64..79)
    vpxor        xmm1, xmm1, xmm5       # xmm1 = folded bytes 32..47  (32..47 ^ 80..95)

# Step 2: bring chunks 1..3 of the 48 folded bytes down to byte offset 0.
# vpalignr dst, hi, lo, n  =>  dst = low 16 bytes of ((hi:lo) >> n bytes).
# Only the low 12 bytes of each register matter; the top 4 are never stored.
    vpalignr     xmm2, xmm3, xmm0, 12   # chunk 1 = folded bytes 12..23
    vpalignr     xmm3, xmm1, xmm3, 8    # chunk 2 = folded bytes 24..35
    vpsrldq      xmm1, xmm1, 4          # chunk 3 = folded bytes 36..47

# Step 3: combine. Chunk 0 is already the low 12 bytes of xmm0. The first two
# XORs are independent of each other, which keeps the dependency chain short.
    vpxor        xmm0, xmm0, xmm2       # chunk 0 ^ chunk 1
    vpxor        xmm1, xmm1, xmm3       # chunk 3 ^ chunk 2
    vpxor        xmm0, xmm0, xmm1       # low 12 bytes = final result

# Step 4: store exactly 12 bytes straight from the vector register
# (no round trip through rax, no write past rdi+11).
    vmovq        [rdi], xmm0            # result bytes 0..7
    vpextrd      [rdi+8], xmm0, 2       # result bytes 8..11 (dword element 2)