Thursday, February 26, 2015
Introducing MOVDQU, PADDB and MOVNTDQ instructions
//-------------------------------------------
//1. use movdqu to:
//load array values in xmm0 and xmm1
//2. use paddb to:
//add each byte element in the two arrays.
//3. use movntdq to:
//save the result in xmm1 to the sum array.
//Notes:
//If your CPU supports SSE4.1, movdqu can
//be replaced with movntdqa. The 128-bit movntdqa
//requires its operand to be 16-byte aligned, hence the
//array declarations are accompanied by aligned(16).
//movdqu itself does not require aligned(16).
//The movntdq store likewise needs a 16-byte aligned destination.
//--------------------------------------------
#include <stdio.h>
#include <stdlib.h>
// Adds two 16-byte arrays element-wise with SSE2 and prints three of the
// resulting bytes. Returns 0.
int main() {
    unsigned char __attribute__((aligned(16))) carray1[16];
    unsigned char __attribute__((aligned(16))) carray2[16];
    // movntdq raises a general-protection fault when its memory operand is
    // not 16-byte aligned, so the destination must be aligned as well.
    unsigned char __attribute__((aligned(16))) sum[16];
    int i = 0;

    for (i = 0; i < 16; i++) {
        carray1[i] = i;
        carray2[i] = carray1[i];
    }

    // The whole load/add/store sequence lives in ONE asm statement: the
    // compiler gives no guarantee that xmm0/xmm1 keep their contents between
    // separate asm statements. The operands are the complete arrays so the
    // compiler knows all 16 bytes are read/written, and both scratch
    // registers are declared as clobbered.
    __asm__ __volatile__ (
        "movdqu %1, %%xmm0\n\t"        // xmm0 = carray1[0..15]
        "movdqu %2, %%xmm1\n\t"        // xmm1 = carray2[0..15]
        "paddb %%xmm0, %%xmm1\n\t"     // xmm1 += xmm0, byte by byte
        "movntdq %%xmm1, %0"           // non-temporal store of xmm1 to sum
        : "=m"(sum)
        : "m"(carray1), "m"(carray2)
        : "xmm0", "xmm1");

    printf("The second element of the sum array is %d\n", sum[1]);
    printf("The middle element of the sum array is %d\n", sum[8]);
    printf("The last element of the sum array is %d\n", sum[15]);
    return 0;
}
$ gcc -g -Fstabs paddb.c
$ ./a.out
The second element of the sum array is 2
The middle element of the sum array is 16
The last element of the sum array is 30
Friday, February 20, 2015
CVTPS2PD and CVTSD2SI instructions
//--------------------------------------
//summary:
//Take the floating-point numbers in f[].
//Convert the float values in f[] to
//double and store them in d[]. (cvtps2pd)
//Convert the double values in d[] to
//int and store them in op_arr[]. (cvtsd2si)
//--------------------------------------
#include <stdio.h>
#include <stdlib.h>
// Converts four floats to doubles with cvtps2pd, then each double to an int
// with cvtsd2si, and prints every stage. Returns 0.
int main() {
    float f[] = {3.22779, 5.25, 6.8, 44.65};
    double d[] = {0.0,0.0,0.0,0.0};
    int op_arr[] = {0,0,0,0};
    int index=0;

    // cvtps2pd widens two packed floats (8 bytes) into two packed doubles
    // (16 bytes). The conversion and the store share one asm statement so the
    // value in xmm0 cannot be lost between statements. The operands are typed
    // as 2-element arrays so the compiler sees the full extent of memory that
    // is read and written.
    for (index = 0; index < 4; index += 2) {
        __asm__ __volatile__ (
            "cvtps2pd %1, %%xmm0\n\t"
            "movupd %%xmm0, %0"
            : "=m"(*(double (*)[2])&d[index])
            : "m"(*(const float (*)[2])&f[index])
            : "xmm0");
    }

    // cvtsd2si converts one double to a 32-bit int using the current MXCSR
    // rounding mode. The result is a real output operand in a register the
    // compiler picks, rather than hard-coded eax/ebx/ecx/edx that were
    // overwritten without being declared (ebx is callee-saved) and then
    // stored through an operand wrongly marked as an input.
    for (index = 0; index < 4; index++) {
        __asm__ __volatile__ (
            "cvtsd2si %1, %0"
            : "=r"(op_arr[index])
            : "m"(d[index]));
    }

    for (index=0; index < 4 ; index++) {
        printf("float value %f: double value %e\n", f[index] , d[index]);
        printf("float value %f: int value %d\n", f[index] , op_arr[index]);
    }
    return 0;
}
$ gcc -g -Fstabs cvtps2pd_cvtsd2si.c
$ ./a.out
float value 3.227790: double value 3.227790e+00
float value 3.227790: int value 3
float value 5.250000: double value 5.250000e+00
float value 5.250000: int value 5
float value 6.800000: double value 6.800000e+00
float value 6.800000: int value 7
float value 44.650002: double value 4.465000e+01
float value 44.650002: int value 45
Subscribe to Posts [Atom]