Sunday, December 25, 2016

 

Compute area of a circle using fldpi,fmul and fld instructions

1:  //******************************************  
2:  //Note st0 refers to the top of  
3:  //the floating point stack.  
4:  //1. fldpi loads st0 with PI.  
5:  //2. flds or fldl loads st0 with the radius provided by the user  
6:  //flds is used for float and fldl is used if the radius is  
7:  //declared as double. This is required by the gnu assembler.  
8:  //3. (first fmul) multiply st0 with st0  this computes r*r  
9:  //4. (second fmul) multiply st0 with st1 this computes pi * r * r  
10:  //Then save the top of floating point stack(st0) into the variable area.  
11:  //In inline assembly, the "=t" means top of floating point stack and in   
12:  //this program it is mapped to area.  
13:  //******************************************  
14:  #include <stdio.h>  
15:  int main(int argc, char* argv[]){  
16:   double radius, area;  
17:   printf("Enter the radius\n");  
18:   scanf("%lf", &radius);  
19:   asm volatile(  
20:    "fldpi\n"   
21:    "fldl %1\n"   
22:    "fmul %%st(0),%%st(0)\n" //compute r ^ 2  
23:    "fmul %%st(1), %%st(0)\n" //compute pi * r^2  
24:    :"=t"(area)  
25:    :"m"(radius)  
26:    :  
27:   );  
28:   printf("The area is %lf\n",area);  
29:   return 0;  
30:  }  
 
 $ gcc -g -Fstabs area.c
 
 $ ./a.out 
Enter the radius
7.0
The area is 153.938040
 

Thursday, February 26, 2015

 

Introducing MOVDQU, PADDB and MOVNTDQ instructions

 //-------------------------------------------  
 //1. use movdqu to:  
 //load array values in xmm0 and xmm1  
 //2. use paddb to:  
 //add each byte element in the two arrays.  
 //3. use movntpd to:  
 //save the result in xmm1 to the sum array.  
 //Notes:  
 //If your cpu supports sse4.1 the movdqu can   
 //be replaced with movntdqa. 128bit movntdqa   
 //requires the operand to be 16byte aligned. Hence the  
 //array declarations are accompanied by aligned(16).  
 //for using movdqu aligned(16) is not required.  
 //--------------------------------------------  
 #include <stdio.h>  
 #include <stdlib.h>  
 int main() {  
  unsigned char __attribute__((aligned(16))) carray1[16];   
  unsigned char __attribute__((aligned(16))) carray2[16];   
  unsigned char sum[16];   
  int i = 0;  
  for(i=0;i<16;i++){  
    carray1[i] = i;  
    carray2[i] = carray1[i] ;  
  }  
  asm volatile ("movdqu %0, %%xmm0;" : :"m"(carray1[0]) :);  
  asm volatile ("movdqu %0, %%xmm1;" : :"m"(carray2[0]) :);  
  asm volatile ("paddb %%xmm0, %%xmm1;" : : :);  
  asm volatile ("movntdq %%xmm1, %0;" :"=m"(sum[0]) : :);   
  printf("The second element of the sum array is %d\n", sum[1]);  
  printf("The middle element of the sum array is %d\n", sum[8]);  
  printf("The last element of the sum array is %d\n", sum[15]);  
  return 0;  
 }  


$ gcc -g -Fstabs paddb.c

$ ./a.out
The second element of the sum array is 2
The middle element of the sum array is 16
The last element of the sum array is 30

Friday, February 20, 2015

 

CVTPS2PD and CVTSD2SI instructions

 //--------------------------------------  
 //summary:   
 //Take floating point numbers from f[].  
 //convert float numbers from f[] into   
 //double and put it in d[]. (cvtps2pd)  
 //convert double numbers from d[] into   
 //int and put it in op_arr[]. (cvtsd2si)  
 //--------------------------------------  
 #include <stdio.h>  
 #include <stdlib.h>  
 int main() {  
  float f[] = {3.22779, 5.25, 6.8, 44.65};  
  double d[] = {0.0,0.0,0.0,0.0};  
  int op_arr[] = {0,0,0,0};  
  int index=0;  
  //convert f[0] and f[1] to double precision  
  asm  volatile ( "cvtps2pd %0, %%xmm0;\n"   
          : :"m"(f[0]) : );   
  //convert f[2] and f[3] to double precision  
  asm  volatile ( "cvtps2pd %0, %%xmm1;\n"   
            : :"m"(f[2]) : );   
  //mov xmm0, xmm1 to memory  
  asm volatile ("movupd %%xmm0, %0;\n"   
           :"=m"(d[0]) : :);  
  asm volatile ("movupd %%xmm1, %0;\n"   
           :"=m"(d[2]) : :);  
  //now convert d[0] to d[3] into int   
  //result in eax, ebx, ecx, edx  
  asm volatile ("cvtsd2si %0, %%eax\n"   
           : :"m"(d[0]) :);   
  asm volatile ("cvtsd2si %0, %%ebx\n"   
           : :"m"(d[1]) :);   
  asm volatile ("cvtsd2si %0, %%ecx\n"   
           : :"m"(d[2]) :);   
  asm volatile ("cvtsd2si %0, %%edx\n"   
           : :"m"(d[3]) :);   
  //mov eax, ebx, ecx, edx   
  //into the int array  
  asm volatile ("movl %%eax, %0\n"   
          : :"m"(op_arr[0]) :);   
  asm volatile ("movl %%ebx, %0\n"   
         : :"m"(op_arr[1]) :);   
  asm volatile ("movl %%ecx, %0\n"   
          : :"m"(op_arr[2]) :);   
  asm volatile ("movl %%edx, %0\n"   
           : :"m"(op_arr[3]) :);   
  for (index=0; index < 4 ; index++) {  
    printf("float value %f: double value %e\n", f[index] , d[index]);  
    printf("float value %f: int value %d\n",  f[index] , op_arr[index]);  
    }  
  return 0;  
 }  
 $ gcc -g -Fstabs cvtps2pd_cvtsd2si.c
 
 $ ./a.out
float value 3.227790: double value 3.227790e+00
float value 3.227790: int value 3
float value 5.250000: double value 5.250000e+00
float value 5.250000: int value 5
float value 6.800000: double value 6.800000e+00
float value 6.800000: int value 7
float value 44.650002: double value 4.465000e+01
float value 44.650002: int value 45

Wednesday, January 28, 2015

 

CVTPS2DQ using inline assembly

 //-------------------------------------------
//Use inline assembly to demonstrate the cvtps2dq
//instruction. The instruction converts single 
//precision floating point values to dword integers.
//The program will convert the four floats into integers
//and put the values in op_arr.
//--------------------------------------------
#include <stdio.h>  
 #include <stdlib.h>  
 int main() {  
  float f[] = {3.22779, 5.25, 6.8, 44.65};  
  int op_arr[] = {0,0,0,0};  
  int index=0;  
  asm  volatile (  
            "cvtps2dq %1, %%xmm0;\n"   
            "movups %%xmm0, %0;\n"  
          :"=m"(op_arr[0])  
            :"m"(f[0])  
            :  
        );   
 for (index=0; index < 4 ; index++) {  
   printf("float value %f: int value %d\n", f[index] , op_arr[index]);  
 }  
  return 0;  
 }   
 
 To build: 
 $ gcc -g -Fstabs cvtps2dq.c
 
 $ ./a.out 
float value 3.227790: int value 3
float value 5.250000: int value 5
float value 6.800000: int value 7
float value 44.650002: int value 45

Monday, November 3, 2014

 

CVTDQ2PD instruction

Convert Packed Dword Integers to Packed Double-Precision FP Values.

In this example, there are two integers data1(=35) and data2 (=67).
Using the CVTDQ2PD instruction, the two integers above will
be converted to Double Precision floating point values. The CVTDQ2PS
instruction is used if the integers are to be converted to Single Precision
floating point values.

Syntax:

CVTDQ2PD xmm1, m64 --> m64 is the memory location to 64 bits(2 dword integers) of data

[or]

CVTDQ2PD xmm1, xmm2 --> xmm2 contains two dword integers.


Program:
 section .data
data1   dd 35
data2   dd 67

section .text

global main

main:
nop
cvtdq2pd xmm7, [data1]

movq xmm3, [data1] ; move quadword into xmm3
cvtdq2pd xmm4, xmm3

mov eax, 1
mov ebx, 0
int 0x80

Notes:
 There are two forms of cvtdq2pd used above - The first one converts two dword integers from memory into two double precision floating point values. The second form performs the same operation with the source residing in xmm3 register. The value is moved into xmm3 from memory using the movq instruction.

To assemble and link:

 nasm -felf64 cvt.asm
  gcc -o cvt cvt.o

Using gdb:

3. gdb  cvt
     (gdb) break main
     (gdb) run
     (gdb) set disassembly-flavor intel
     (gdb) disassemble main

;Dump of assembler code for function main:
  0x00000000004004c0 <+0>:    nop
   0x00000000004004c1 <+1>:    cvtdq2pd xmm7,QWORD PTR ds:0x601018
   0x00000000004004ca <+10>:    movq   xmm3,QWORD PTR ds:0x601018
   0x00000000004004d3 <+19>:    cvtdq2pd xmm4,xmm3
=> 0x00000000004004d7 <+23>:    mov    eax,0x1
   0x00000000004004dc <+28>:    mov    ebx,0x0
   0x00000000004004e1 <+33>:    int    0x80

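4. Set a breakpoint after the first cvtdq2pd has executed (for example on IP 0x4004ca, or stop at 0x4004d7 as shown by the => marker above), then check the value of xmm7: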

(gdb) p /x $xmm7
$1 = {v4_float = {0x0, 0x3, 0x0, 0x3}, v2_double = {0x23, 0x43}, v16_int8 = {0x0, 0x0, 0x0, 0x0, 0x0, 0x80, 0x41, 0x40, 0x0, 0x0, 0x0, 0x0, 0x0, 0xc0, 0x50, 0x40},
  v8_int16 = {0x0, 0x0, 0x8000, 0x4041, 0x0, 0x0, 0xc000, 0x4050}, v4_int32 = {0x0,   0x40418000, 0x0, 0x4050c000}, v2_int64 = {0x4041800000000000, 0x4050c00000000000},
  uint128 = 0x4050c000000000004041800000000000}


xmm7[63:0] has 0x4041800000000000 which is 35.0 represented in double precision format.

 xmm7[127:64] has 0x4050c00000000000 which is 67.0 represented in double precision format.




Thursday, July 23, 2009

 

SSE2 Data Transfer/Packed Arithmetic Instruction - Example

SSE2: Streaming SIMD Extensions 2
SIMD: Single Instruction Multiple Data

This example shows the operation of 3 SSE2 instructions:

a) MOVLPD - SSE2 Data Transfer Instruction
b) MOVHPD - SSE2 Data Transfer Instruction
c) ADDPD - SSE2 Packed Arithmetic Instruction

The registers used in the example are the extended MMX registers (hence the abbreviation XMM). The x86 architecture provides for 16 XMM registers in 64-bit mode and 8 registers in 32-bit mode.

The XMM registers are 128 bit registers. These registers can be imagined as having 2 parts: a lower and a upper part of 64 bits each.

MOVLPD - Moves Data to the lower part of the XMM register. (bits 63:0)
MOVHPD - Moves Data to the upper part of the XMM register. (bits 127:64)
ADDPD - Adds the packed values in the two registers and saves the result in the destination register.

The instruction addpd xmm1, xmm0 works as explained under:

xmm1[63:0] <- xmm0[63:0] + xmm1[63:0]
xmm1[127:64] <- xmm0[127:64] + xmm1[127:64]

Here is a simple example that utilizes all these instructions:

1. The goal of this example is to add mm0_data_low (1.5) to mm1_data_low (2.5) and mm0_data_high(2.5) to mm1_data_high(2.0).

2. By using the SIMD instructions adding 2 different pairs of floating point numbers is done in a single instruction. Hence the name SIMD - Single Instruction Multiple Data.


//////////////////////////////////
; movlpd / movhpd / addpd demo (NASM, Intel syntax).
; Builds two packed-double registers half by half, then adds them with
; one instruction:
;   low  lanes: 1.5 + 2.5 = 4.0
;   high lanes: 2.5 + 2.0 = 4.5

SYS_EXIT        equ     1               ; exit call number for int 0x80
EXIT_SUCCESS    equ     0               ; status handed to exit

section .data
mm0_data_low:   dq      1.5
mm0_data_high:  dq      2.5
mm1_data_high:  dq      2.0
mm1_data_low:   dq      2.5

section .text

global _start

_start:
        nop

        ; Fill xmm0 = { low: 1.5, high: 2.5 }
        movlpd  xmm0, [mm0_data_low]    ; xmm0[63:0]   <- 1.5
        movhpd  xmm0, [mm0_data_high]   ; xmm0[127:64] <- 2.5

        ; Fill xmm1 = { low: 2.5, high: 2.0 }
        movlpd  xmm1, [mm1_data_low]    ; xmm1[63:0]   <- 2.5
        movhpd  xmm1, [mm1_data_high]   ; xmm1[127:64] <- 2.0

        ; Both lanes are added by the single addpd:
        ;   xmm1[63:0]   <- xmm0[63:0]   + xmm1[63:0]
        ;   xmm1[127:64] <- xmm0[127:64] + xmm1[127:64]
        addpd   xmm1, xmm0

        ; Terminate the process with status 0.
        mov     eax, SYS_EXIT
        mov     ebx, EXIT_SUCCESS
        int     0x80
//////////////////////////////////////////////


Let's run this program through gdb and see what the values are:
We expect the following values in XMM1:
xmm1[127:64] = 4.5
xmm1[63:0] = 4.0

After loading the low-part of xmm0:

(gdb) p $xmm0
$2 = v2_double = {1.5, 0}
xmm0 low-part is 1.5

Now load the upper-part of xmm0:

(gdb) next
14 movhpd xmm0, [mm0_data_high]
(gdb) p $xmm0
$3 = v2_double = {1.5, 2.5}
xmm0 upper-part is 2.5 and xmm0 low-part is 1.5

Now load the low-part of xmm1:

(gdb) next
15 movlpd xmm1, [mm1_data_low]
(gdb) p $xmm1
$4 = v2_double = {2.5, 0}
xmm1 low-part is 2.5

(gdb) next
16 movhpd xmm1, [mm1_data_high]
(gdb) p $xmm1
$5 = v2_double = {2.5, 2}
xmm1 upper-part is 2.0 and low-part is 2.5

Finally, the addpd:
(gdb) next
17 addpd xmm1,xmm0
(gdb) p $xmm1
$6 = v2_double = {4, 4.5}

This agrees with our expected result of xmm1[127:64] = 4.5 and xmm1[63:0] = 4.0.


Tuesday, June 16, 2009

 

String Instructions - scasb,scasw,scasd,scasq

The x86 architecture offers different types of instructions to perform various string operations . Scan string instruction is one of them. There are different flavors of the scan string instruction: scasb (byte form), scasw(word), scasd(double word) and scasq(quad word).

scasb: Compares the byte in AL with the byte at ES:EDI and sets the flags accordingly.
scasw: Compares the word in AX with the word at ES:EDI and sets the flags accordingly.
scasd: Compares the dword in EAX with the dword at ES:EDI and sets the flags accordingly.
scasq: Compares the qword in RAX with the qword at ES:(E/R)DI and sets the flags accordingly.

When the scas* instructions are used with the repeat prefix they become very powerful. For eg: The scasb instruction can be used with the repne(repeat not equal) prefix to compute the string length.
Here is an algorithm of how the scasb instruction works when used with the repne prefix:

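1. If ECX == 0, go to 5.
2. cmp AL with the byte at ES:EDI (this sets the flags).
3. if(DF==0) EDI = EDI+1 else EDI=EDI-1; then ECX = ECX-1. (EDI is updated even when the bytes matched, so it ends up one past the matching byte.)
4. If the bytes were not equal, jmp to 1.
5. DONE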

The DF above is the direction flag which controls the direction in which the string operation proceeds. If DF is 0, then after every iteration the value in EDI is incremented. If DF is 1, then after every iteration the value in EDI is decremented. The value by which EDI is incremented depends upon what version (byte/word/dword/qword) of scas is used. For the string length, use of scasb keeps it simple.

To control the direction flag, use the std/cld (set/clear direction flag) instructions. Assume AL has 0 (which is the NUL character that terminates the string). Because EDI is advanced past the byte that matched, at the end of the scan you subtract the initial value of edi from the final value of edi and then subtract one from the result to get the string length.

string length = final edi - initial edi - 1;

Here is an example program:


-----------------------------------------------
; String length via "repne scasb" (NASM, Intel syntax).
; Scans mystring for the terminating zero byte, stores the length in
; mystrlen and returns it as the process exit code.
;
; Register roles:
;   al  = byte searched for (0)
;   ecx = maximum number of bytes to scan
;   edi = scan pointer (advanced by scasb)
;   ebp = copy of the start address

SYS_EXIT        equ     1               ; exit call number for int 80h
MAX_SCAN        equ     255             ; upper bound on bytes scanned
NUL             equ     0               ; string terminator

section .data
mystring:       db      "Siddharth", NUL
mystrlen:       dd      0


section .text
global _start
_start:
        nop

        ; scasb addresses the string through ES:EDI, so make ES match DS.
        mov     ax, ds
        mov     es, ax

        mov     edi, mystring           ; scan pointer = start of string
        mov     ebp, mystring           ; remember the start for the subtraction
        cld                             ; DF = 0: scasb increments EDI
        mov     ecx, MAX_SCAN
        mov     al, NUL
        repne scasb                     ; stop at the first byte equal to AL

        ; EDI now points one byte past the terminator: step back, then
        ; length = (address of terminator) - (start address).
        dec     edi
        sub     edi, ebp
        mov     dword [mystrlen], edi   ; store string length in memory

        ; Use the string length as the exit code.
        mov     ebx, [mystrlen]
        mov     eax, SYS_EXIT
        int     80h
---------------------------------------------


After assembling and running the program the exit code will contain the string length.
Typically executing 'echo $?' gives the exit code of the last command the shell executed. In this case, you will have a value of 9 which is the string length.



Labels: , , ,


This page is powered by Blogger. Isn't yours?

Subscribe to Posts [Atom]