Sunday, December 25, 2016
Compute the area of a circle using the fldpi, fmul and fld instructions
//******************************************
// Compute the area of a circle on the x87 FPU.
// Note: st0 refers to the top of the floating point stack.
// 1. fldpi pushes PI, so st0 = PI.
// 2. fldl pushes the radius provided by the user, so st0 = r, st1 = PI.
//    flds is used for a float operand and fldl is used if the radius
//    is declared as double. This is required by the gnu assembler.
// 3. fmul st0,st0 squares the radius: st0 = r * r.
// 4. fmulp multiplies st1 by st0 and pops the stack, so st0 = PI * r * r
//    and nothing else pushed by this asm statement is left behind.
// In inline assembly, "=t" means the top of the floating point stack;
// in this program it is mapped to the variable area.
//******************************************
#include <stdio.h>

int main(int argc, char *argv[])
{
    double radius = 0.0;
    double area = 0.0;

    (void)argc;
    (void)argv;

    printf("Enter the radius\n");
    if (scanf("%lf", &radius) != 1) {
        // Without this check an unparsable input would feed an
        // indeterminate radius into the FPU.
        fprintf(stderr, "Invalid radius\n");
        return 1;
    }

    // __asm__ is used instead of asm so the listing also builds in
    // strict ISO modes (-std=c99 / -std=c11).
    __asm__ __volatile__(
        "fldpi\n\t"
        "fldl %1\n\t"
        "fmul %%st(0), %%st(0)\n\t"   // st0 = r^2
        "fmulp %%st(0), %%st(1)\n\t"  // st1 = pi * r^2, then pop -> st0
        : "=t"(area)
        : "m"(radius));

    printf("The area is %lf\n", area);
    return 0;
}
$ gcc -g area.c
$ ./a.out
Enter the radius
7.0
The area is 153.938040
Thursday, February 26, 2015
Introducing MOVDQU, PADDB and MOVNTDQ instructions
//-------------------------------------------
// Byte-wise addition of two 16-byte arrays with SSE2.
// 1. use movdqu to:
//    load the array values into xmm0 and xmm1.
// 2. use paddb to:
//    add each byte element of the two arrays (xmm1 = xmm1 + xmm0).
// 3. use movntdq to:
//    save the result in xmm1 to the sum array (non-temporal store).
// Notes:
// movntdq requires its memory operand to be 16-byte aligned, so
// sum is declared aligned(16).
// If your cpu supports sse4.1 the movdqu can be replaced with
// movntdqa. 128-bit movntdqa also requires the operand to be
// 16-byte aligned, hence the source arrays are declared
// aligned(16) too. For movdqu, aligned(16) is not required.
//--------------------------------------------
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    unsigned char __attribute__((aligned(16))) carray1[16];
    unsigned char __attribute__((aligned(16))) carray2[16];
    unsigned char __attribute__((aligned(16))) sum[16];

    for (int i = 0; i < 16; i++) {
        carray1[i] = (unsigned char)i;
        carray2[i] = carray1[i];
    }

    // One asm statement: the compiler gives no guarantee that xmm0/xmm1
    // keep their contents between separate asm statements. The operands
    // are the whole arrays so all 16 bytes read/written are visible to
    // the compiler, and both scratch registers are listed as clobbered.
    __asm__ __volatile__(
        "movdqu %1, %%xmm0\n\t"
        "movdqu %2, %%xmm1\n\t"
        "paddb %%xmm0, %%xmm1\n\t"
        "movntdq %%xmm1, %0\n\t"
        : "=m"(sum)
        : "m"(carray1), "m"(carray2)
        : "xmm0", "xmm1");

    printf("The second element of the sum array is %d\n", sum[1]);
    printf("The middle element of the sum array is %d\n", sum[8]);
    printf("The last element of the sum array is %d\n", sum[15]);
    return 0;
}
$ gcc -g paddb.c
$ ./a.out
The second element of the sum array is 2
The middle element of the sum array is 16
The last element of the sum array is 30
Friday, February 20, 2015
CVTPS2PD and CVTSD2SI instructions
//--------------------------------------
// summary:
// Take floating point numbers from f[].
// Convert the float numbers from f[] into
// double and put them in d[]. (cvtps2pd)
// Convert the double numbers from d[] into
// int and put them in op_arr[]. (cvtsd2si)
//--------------------------------------
#include <stdio.h>
#include <stdlib.h>

// Convert the two floats at src[0..1] into two doubles at dst[0..1]
// using cvtps2pd. The operands are typed as two-element arrays so the
// compiler knows exactly which bytes are read and written, and xmm0 is
// declared as clobbered.
static void floats_to_doubles(const float *src, double *dst)
{
    __asm__ __volatile__(
        "cvtps2pd %1, %%xmm0\n\t"
        "movupd %%xmm0, %0\n\t"
        : "=m"(*(double (*)[2])dst)
        : "m"(*(const float (*)[2])src)
        : "xmm0");
}

// Convert one double to a 32-bit int using cvtsd2si. The destination
// register is chosen by the compiler ("=r") instead of hard-coding
// eax/ebx/ecx/edx, which were neither declared as clobbered nor
// guaranteed to survive until a later asm statement.
static int double_to_int(double value)
{
    int result;

    __asm__ __volatile__(
        "cvtsd2si %1, %0\n\t"
        : "=r"(result)
        : "m"(value));
    return result;
}

int main(void)
{
    float f[] = {3.22779, 5.25, 6.8, 44.65};
    double d[] = {0.0, 0.0, 0.0, 0.0};
    int op_arr[] = {0, 0, 0, 0};

    // convert f[0], f[1] and then f[2], f[3] to double precision
    floats_to_doubles(&f[0], &d[0]);
    floats_to_doubles(&f[2], &d[2]);

    // now convert d[0] to d[3] into int
    for (int index = 0; index < 4; index++) {
        op_arr[index] = double_to_int(d[index]);
    }

    for (int index = 0; index < 4; index++) {
        printf("float value %f: double value %e\n", f[index], d[index]);
        printf("float value %f: int value %d\n", f[index], op_arr[index]);
    }
    return 0;
}
$ gcc -g cvtps2pd_cvtsd2si.c
$ ./a.out
float value 3.227790: double value 3.227790e+00
float value 3.227790: int value 3
float value 5.250000: double value 5.250000e+00
float value 5.250000: int value 5
float value 6.800000: double value 6.800000e+00
float value 6.800000: int value 7
float value 44.650002: double value 4.465000e+01
float value 44.650002: int value 45
Wednesday, January 28, 2015
CVTPS2DQ using inline assembly
//-------------------------------------------
// Use inline assembly to demonstrate the cvtps2dq
// instruction. The instruction converts single
// precision floating point values to dword integers.
// The program will convert the four floats into integers
// and put the values in op_arr.
//--------------------------------------------
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    float f[] = {3.22779, 5.25, 6.8, 44.65};
    int op_arr[] = {0, 0, 0, 0};

    // The floats are loaded with movups first because cvtps2dq with a
    // memory source requires a 16-byte aligned operand, which a plain
    // float array does not guarantee. The operands are the whole arrays
    // so the compiler sees all 16 bytes read and written, and xmm0 is
    // declared as clobbered.
    __asm__ __volatile__(
        "movups %1, %%xmm0\n\t"
        "cvtps2dq %%xmm0, %%xmm0\n\t"
        "movups %%xmm0, %0\n\t"
        : "=m"(op_arr)
        : "m"(f)
        : "xmm0");

    for (int index = 0; index < 4; index++) {
        printf("float value %f: int value %d\n", f[index], op_arr[index]);
    }
    return 0;
}
To build:
$ gcc -g cvtps2dq.c
$ ./a.out
float value 3.227790: int value 3
float value 5.250000: int value 5
float value 6.800000: int value 7
float value 44.650002: int value 45
Monday, November 3, 2014
CVTDQ2PD instruction
In this example, there are two integers data1(=35) and data2 (=67).
Using the CVTDQ2PD instruction, the two integers above will
be converted to Double Precision floating point values. The CVTDQ2PS
instruction is used if the integers are to be converted to Single Precision
floating point values.
Syntax:
CVTDQ2PD xmm1, m64 --> m64 is a memory location holding 64 bits (2 dword integers) of data
[or]
CVTDQ2PD xmm1, xmm2 --> xmm2 contains two dword integers.
Program:
; Demonstrates both forms of cvtdq2pd: memory source and register source.
section .data
data1 dd 35 ; first dword integer
data2 dd 67 ; second dword integer, placed directly after data1
section .text
global main
main:
nop
cvtdq2pd xmm7, [data1] ; memory form: convert the two dwords at data1 (data1, data2) to two doubles
movq xmm3, [data1] ; move quadword into xmm3
cvtdq2pd xmm4, xmm3 ; register form: convert the two dwords held in xmm3 to two doubles
mov eax, 1 ; syscall number 1 for the int 0x80 interface (exit)
mov ebx, 0 ; exit status 0
int 0x80
Notes:
There are two forms of cvtdq2pd used above - The first one converts two dword integers from memory into two double precision floating point values. The second form performs the same operation with the source residing in xmm3 register. The value is moved into xmm3 from memory using the movq instruction.
To assemble and link:
nasm -felf64 cvt.asm
gcc -o cvt cvt.o
Using gdb:
3. gdb cvt
(gdb) break main
(gdb) run
(gdb) set disassembly-flavor intel
(gdb) disassemble main
;Dump of assembler code for function main:
0x00000000004004c0 <+0>: nop
0x00000000004004c1 <+1>: cvtdq2pd xmm7,QWORD PTR ds:0x601018
0x00000000004004ca <+10>: movq xmm3,QWORD PTR ds:0x601018
0x00000000004004d3 <+19>: cvtdq2pd xmm4,xmm3
=> 0x00000000004004d7 <+23>: mov eax,0x1
0x00000000004004dc <+28>: mov ebx,0x0
0x00000000004004e1 <+33>: int 0x80
4. Set a breakpoint past the cvtdq2pd at IP 0x4004c1 (for example on 0x4004ca) so that the conversion has executed, then check the value of xmm7:
(gdb) p /x $xmm7
$1 = {v4_float = {0x0, 0x3, 0x0, 0x3}, v2_double = {0x23, 0x43}, v16_int8 = {0x0, 0x0, 0x0, 0x0, 0x0, 0x80, 0x41, 0x40, 0x0, 0x0, 0x0, 0x0, 0x0, 0xc0, 0x50, 0x40},
v8_int16 = {0x0, 0x0, 0x8000, 0x4041, 0x0, 0x0, 0xc000, 0x4050}, v4_int32 = {0x0, 0x40418000, 0x0, 0x4050c000}, v2_int64 = {0x4041800000000000, 0x4050c00000000000},
uint128 = 0x4050c000000000004041800000000000}
xmm7[63:0] has 0x4041800000000000 which is 35.0 represented in double precision format.
xmm7[127:64] has 0x4050c00000000000 which is 67.0 represented in double precision format.
Thursday, July 23, 2009
SSE2 Data Transfer/Packed Arithmetic Instruction - Example
SIMD: Single Instruction Multiple Data
This example shows the operation of 3 SSE2 instructions:
a) MOVLPD - SSE2 Data Transfer Instruction
b) MOVHPD - SSE2 Data Transfer Instruction
c) ADDPD - SSE2 Packed Arithmetic Instruction
The registers used in the example are the extended MMX registers (hence the abbreviation XMM). The x86 architecture provides for 16 XMM registers in 64-bit mode and 8 registers in 32-bit mode.
The XMM registers are 128-bit registers. These registers can be imagined as having 2 parts: a lower and an upper part of 64 bits each.
MOVLPD - Moves Data to the lower part of the XMM register. (bits 63:0)
MOVHPD - Moves Data to the upper part of the XMM register. (bits 127:64)
ADDPD - Adds the packed values in the two registers and saves the result in the destination register.
The instruction addpd xmm1, xmm0 works as follows:
xmm1[63:0] <- xmm0[63:0] + xmm1[63:0]
xmm1[127:64] <- xmm0[127:64] + xmm1[127:64]
Here is a simple example that utilizes all these instructions:
1. The goal of this example is to add mm0_data_low (1.5) to mm1_data_low (2.5) and mm0_data_high (2.5) to mm1_data_high (2.0).
2. By using the SIMD instructions adding 2 different pairs of floating point numbers is done in a single instruction. Hence the name SIMD - Single Instruction Multiple Data.
//////////////////////////////////
section .data ; four doubles: the low/high halves to load into xmm0 and xmm1
mm0_data_low dq 1.5
mm0_data_high dq 2.5
mm1_data_high dq 2.0
mm1_data_low dq 2.5
section .text
global _start
_start:
nop
; xmm0[63:0] <- 1.5
movlpd xmm0, [mm0_data_low]
; xmm0[127:64] <- 2.5
movhpd xmm0, [mm0_data_high]
; xmm1[63:0] <- 2.5 (mm1_data_low)
movlpd xmm1, [mm1_data_low]
; xmm1[127:64] <- 2.0 (mm1_data_high)
movhpd xmm1, [mm1_data_high]
; xmm1[127:64] <- xmm0[127:64] + xmm1[127:64] = 2.5 + 2.0 = 4.5
; xmm1[63:0] <- xmm0[63:0] + xmm1[63:0] = 1.5 + 2.5 = 4.0
addpd xmm1,xmm0
mov eax, 1 ; syscall number 1 for the int 0x80 interface (exit)
mov ebx, 0 ; exit status 0
int 0x80
//////////////////////////////////////////////
Let's run this program through gdb and see what the values are.
We expect the following values in XMM1:
xmm1[127:64] = 4.5
xmm1[63:0] = 4.0
After loading the low-part of xmm0:
(gdb) p $xmm0
$2 = v2_double = {1.5, 0}
xmm0 low-part is 1.5
Now load the upper-part of xmm0:
(gdb) next
14 movhpd xmm0, [mm0_data_high]
(gdb) p $xmm0
$3 = v2_double = {1.5, 2.5}
xmm0 upper-part is 2.5 and xmm0 low-part is 1.5
Now load the low-part of xmm1:
(gdb) next
15 movlpd xmm1, [mm1_data_low]
(gdb) p $xmm1
$4 = v2_double = {2.5, 0}
xmm1 low-part is 2.5
(gdb) next
16 movhpd xmm1, [mm1_data_high]
(gdb) p $xmm1
$5 = v2_double = {2.5, 2}
xmm1 upper-part is 2.0 and low-part is 2.5
Finally, the addpd:
(gdb) next
17 addpd xmm1,xmm0
(gdb) p $xmm1
$6 = v2_double = {4, 4.5}
This agrees with our expected result of xmm1[127:64] = 4.5 and xmm1[63:0] = 4.0.
Tuesday, June 16, 2009
String Instructions - scasb,scasw,scasd,scasq
Labels: scasb, scasd, scasw, x86 string instructions
Subscribe to Posts [Atom]