Unless your CPU has XOP, there is no efficient way to compare unsigned 64-bit integers.
I ripped the following from Agner Fog's Vector Class Library. This shows how to compare unsigned 64-bit integers.
// Unsigned 64-bit "greater than" on two lanes: a lane of the result is all ones
// where a > b and all zeros otherwise.
static inline Vec2qb operator > (Vec2uq const & a, Vec2uq const & b) {
#ifdef __XOP__  // AMD XOP has a native unsigned 64-bit compare
    return Vec2q(_mm_comgt_epu64(a,b));
#else  // SSE2 only: assemble the 64-bit verdict from signed 32-bit compares
    // Flipping the top bit of every dword makes the signed compare order the
    // values as if they were unsigned.
    const __m128i msb_mask = _mm_set1_epi32(0x80000000);
    const __m128i a_biased = _mm_xor_si128(a, msb_mask);
    const __m128i b_biased = _mm_xor_si128(b, msb_mask);

    const __m128i dword_gt = _mm_cmpgt_epi32(a_biased, b_biased);  // a > b, per dword
    const __m128i dword_eq = _mm_cmpeq_epi32(a, b);                // a == b, per dword

    // Duplicate each low-dword "greater" result into the neighbouring high dword.
    const __m128i low_gt_dup = _mm_shuffle_epi32(dword_gt, 0xA0);

    // Seen from each high dword: high part greater, or high parts equal and
    // low part greater.
    const __m128i high_verdict =
        _mm_or_si128(dword_gt, _mm_and_si128(dword_eq, low_gt_dup));

    // Broadcast the high dword's verdict across the whole 64-bit lane.
    __m128i lane_mask = _mm_shuffle_epi32(high_verdict, 0xF5);
    return Vec2qb(Vec2q(lane_mask));
#endif
}
So if your CPU supports XOP, then you should try compiling with -mxop and see if the loop is vectorized.
Edit: If GCC does not vectorize this the way you want and your CPU has XOP, you can do it by hand:
// Decrement every non-zero element of WorkerDataTime[0 .. WorkersON), two
// 64-bit lanes per iteration. Requires AMD XOP for _mm_comgt_epu64.
// NOTE(review): assumes WorkerDataTime holds unsigned 64-bit elements - confirm.
// The bound is written as WorkerID+1 < WorkersON (rather than WorkersON-1) so
// it cannot wrap around if WorkersON is an unsigned zero.
for (WorkerID=0; WorkerID+1<WorkersON; WorkerID+=2){
    __m128i v = _mm_loadu_si128((__m128i*)&WorkerDataTime[WorkerID]);
    // Unsigned compare against zero: a lane becomes all ones (-1) where v > 0.
    __m128i cmp = _mm_comgt_epu64(v, _mm_setzero_si128());
    // Adding the mask subtracts 1 from exactly the non-zero lanes.
    v = _mm_add_epi64(v,cmp);
    _mm_storeu_si128((__m128i*)&WorkerDataTime[WorkerID], v);
}
// Scalar tail: picks up the last element when WorkersON is odd.
for (;WorkerID<WorkersON;++WorkerID){
    if(WorkerDataTime[WorkerID] > 0) WorkerDataTime[WorkerID]-=1;
}
Compile with -mxop and add #include <x86intrin.h>.
Edit: As Nils Pipenbrinck pointed out, if you don't have XOP you can do this with one more instruction using _mm_xor_si128:
// Decrement every non-zero element of WorkerDataTime[0 .. WorkersON), two
// 64-bit lanes per iteration, without XOP. _mm_cmpeq_epi64 needs SSE4.1.
// NOTE(review): assumes WorkerDataTime holds unsigned 64-bit elements - confirm.
// The bound is written as WorkerID+1 < WorkersON (rather than WorkersON-1) so
// it cannot wrap around if WorkersON is an unsigned zero.
for (WorkerID=0; WorkerID+1<WorkersON; WorkerID+=2){
    __m128i v = _mm_loadu_si128((__m128i*)&WorkerDataTime[WorkerID]);
    // All ones in the lanes where v == 0 ...
    __m128i mask = _mm_cmpeq_epi64(v,_mm_setzero_si128());
    // ... inverted, so the mask is all ones (-1) where v != 0.
    mask = _mm_xor_si128(mask, _mm_set1_epi32(~0));
    // Adding the mask subtracts 1 from exactly the non-zero lanes.
    v= _mm_add_epi64(v,mask);
    _mm_storeu_si128((__m128i*)&WorkerDataTime[WorkerID], v);
}
// Scalar tail: picks up the last element when WorkersON is odd.
for (;WorkerID<WorkersON;++WorkerID){
    if(WorkerDataTime[WorkerID] > 0) WorkerDataTime[WorkerID]-=1;
}
Edit:
Based on a comment by Stephen Canon, I learned that there is a more efficient way to compare general 64-bit unsigned integers using the pcmpgtq instruction from SSE4.2:
// Unsigned 64-bit a > b built on the signed SSE4.2 compare (_mm_cmpgt_epi64).
// a and b are placeholders here: load the operands into them before use.
__m128i a,b;
// Bit 63 of each lane. Spelled as an unsigned 64-bit literal and converted
// explicitly, so the value does not depend on the width of long.
__m128i sign64 = _mm_set1_epi64x(static_cast<long long>(0x8000000000000000ULL));
// Flipping the sign bit maps unsigned order onto signed order.
__m128i aflip = _mm_xor_si128(a, sign64);
__m128i bflip = _mm_xor_si128(b, sign64);
// Each lane is all ones where a > b (unsigned), otherwise zero.
__m128i cmp = _mm_cmpgt_epi64(aflip,bflip);