Thursday, February 26, 2015
Introducing MOVDQU, PADDB and MOVNTDQ instructions
//-------------------------------------------
//1. use movdqu to:
//load array values in xmm0 and xmm1
//2. use paddb to:
//add each byte element in the two arrays.
//3. use movntdq to:
//store the result held in xmm1 into the sum array.
//Notes:
//If your CPU supports SSE4.1, the movdqu loads can
//be replaced with movntdqa. The 128-bit movntdqa
//requires its memory operand to be 16-byte aligned; hence the
//array declarations are accompanied by aligned(16).
//With movdqu, aligned(16) is not required.
//The movntdq store likewise requires a 16-byte-aligned destination.
//--------------------------------------------
#include <stdio.h>
#include <stdlib.h>
/*
 * Demonstrates byte-wise SIMD addition with SSE2:
 *   - movdqu  loads 16 bytes from each input array into xmm0 / xmm1,
 *   - paddb   adds the 16 byte lanes (xmm1 += xmm0, wrapping per byte),
 *   - movntdq stores xmm1 to the sum array with a non-temporal hint.
 *
 * Both inputs are filled with 0..15, so sum[i] ends up as 2 * i.
 * Returns 0.
 */
int main(void) {
    enum { LANES = 16 };  /* bytes in one 128-bit XMM register */

    unsigned char __attribute__((aligned(16))) carray1[LANES];
    unsigned char __attribute__((aligned(16))) carray2[LANES];
    /* movntdq faults on a memory operand that is not 16-byte aligned,
     * so the destination needs the alignment attribute as well. */
    unsigned char __attribute__((aligned(16))) sum[LANES];

    for (int i = 0; i < LANES; i++) {
        carray1[i] = (unsigned char)i;
        carray2[i] = carray1[i];
    }

    /*
     * Keep the whole sequence in ONE asm statement: the values live in
     * xmm0/xmm1 between instructions, and the compiler is free to reuse
     * those registers between separate asm statements. The arrays are
     * passed as whole-array memory operands so the compiler knows all
     * 16 bytes are read/written, and xmm0/xmm1 are declared as clobbers.
     */
    __asm__ volatile (
        "movdqu  %1, %%xmm0\n\t"
        "movdqu  %2, %%xmm1\n\t"
        "paddb   %%xmm0, %%xmm1\n\t"
        "movntdq %%xmm1, %0\n\t"
        /* Non-temporal stores are weakly ordered; fence before the
         * result is consumed. */
        "sfence"
        : "=m"(sum)
        : "m"(carray1), "m"(carray2)
        : "xmm0", "xmm1");

    printf("The second element of the sum array is %d\n", sum[1]);
    printf("The middle element of the sum array is %d\n", sum[8]);
    printf("The last element of the sum array is %d\n", sum[15]);
    return 0;
}
$ gcc -g -Fstabs paddb.c
$ ./a.out
The second element of the sum array is 2
The middle element of the sum array is 16
The last element of the sum array is 30
Subscribe to Posts [Atom]
Post a Comment