Showing posts with label Inline Assembler. Show all posts
Showing posts with label Inline Assembler. Show all posts

Saturday, April 3, 2021

How to use inline assembly language for M1 Mac for x86_64 and arm64

(1) Demo the inline assembly code for x86_64 and arm64
add-inline.c  Select all
#include <stdio.h>
// compile with command line
// clang add-inline.c -o add-inline_x86_64 -arch x86_64
// clang add-inline.c -o add-inline_arm64 -arch arm64
// disassemble with
// otool -otV add-inline_x86_64
// otool -otV add-inline_arm64

/*
 * Adds two integers with one inline-assembly sequence per architecture
 * and prints the sum. Returns 0.
 */
int main(int argc, const char * argv[]) {
    int a = 10;
    int b = 25;
    int ans = 0;    // stays 0 when built for neither architecture below

#ifdef __x86_64__
    // The sum is accumulated in the output register so that the input
    // operands are only read, never written.
    __asm__(
        "mov %1,%0;\n"      // %0 = %1   (AT&T syntax: source comes first)
        "add %2,%0;\n"      // %0 += %2
        : "=&r"(ans)        // early-clobber: %0 is written before %2 is read
        : "r"(a), "r"(b)
        : "cc"              // add updates the flags
    );
#endif

#if defined(__arm64__) || defined(__aarch64__)
    __asm__(
        "add %w0,%w1,%w2;\n"    // %w0 = %w1 + %w2 (destination comes first)
        : "=r"(ans)
        : "r"(a), "r"(b)
    );
#endif

    printf("The answer is %d\n", ans);
    return 0;
}


(2) Demo the assembly source code as a separate function for x86_64 and arm64
add-main.c  Select all
#include <stdio.h>
// compile with command line
// clang add-main.c add.s -o add-main_x86_64 -arch x86_64
// clang add-main.c add.s -o add-main_arm64 -arch arm64

// Assembly routine from add.s (linked in by the commands above).
int add(int x, int y);

// Calls add(15, 40) and prints the value it returns.
int main(int argc, const char * argv[]) {
    int ans = add(15, 40);
    printf("The answer is %d\n", ans);
    return 0;
}


add.s  Select all
// add.s -- int add(int x, int y) for x86_64 and arm64.
// The file relies on the C preprocessor (#ifdef) to pick one body.
    .text
    .globl _add
    .align 2
_add:
#ifdef __x86_64__
    add %esi,%edi   // %edi += %esi, source is the first
    mov %edi,%eax   // move data from %edi to %eax
    // x86_64 calling convention
    // rdi, rsi, rdx, rcx, r8, r9
    // The 32-bit general purpose registers are edi, esi, edx, ecx, r8d, r9d instead.
    // The 16-bit general purpose registers are di, si, dx, cx, r8w, r9w instead.
    // The syscall number is placed in rax
    // see https://sigsegv.pl/osx-bsd-syscalls/
    // Return value is in rax
#endif
#ifdef __arm64__
    add w0,w0,w1    // load w0 with w0+w1, destination is the first
    // calling convention for arm64
    // x0,x1,x2,x3,x4,x5,x6,x7 or r0 to r7
    // The 32-bit general purpose registers are w0-w7 instead.
    // The syscall number is placed in r8
    // NOTE(review): the page linked below appears to describe Linux; macOS
    // arm64 is believed to take the syscall number in x16 -- confirm.
    // Return value is in x0
    // see https://wiki.cdot.senecacollege.ca/wiki/Syscalls
#endif
    ret             // same mnemonic on both architectures




entitlements  Select all
<?xml version="1.0" encoding="UTF-8"?> <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd"> <plist version="1.0"> <dict> <key>com.apple.security.get-task-allow</key> <true/> </dict> </plist>


(3) Compile with
clang add-inline.c -o add-inline_x86_64 -arch x86_64
clang add-inline.c -o add-inline_arm64 -arch arm64
clang add-main.c add.s -o add-main_x86_64 -arch x86_64
clang add-main.c add.s -o add-main_arm64 -arch arm64


(4) Disassemble with (e.g.)
otool -otV add-inline_arm64

(5) Install Rosetta 2 on M1 to run the x86_64 version (e.g.)
./add-main_x86_64

(6) codesign and debug with lldb (e.g.)
clang -g -o add-inline_x86_64 add-inline.c -arch x86_64
codesign --entitlement entitlements --force -s - add-inline_x86_64
lldb add-inline_x86_64
(lldb) breakpoint set --file add-inline.c --line 7


(7) floating point example for x86_64 and arm64
compile with
clang sum.s callsum.c -o callsum_x86_64 -arch x86_64
clang sum.s callsum.c -o callsum_arm64 -arch arm64
Debug and codesign in the same way as in the example above.
callsum.c  Select all
/*
 * callsum.c
 *
 * Illustrates how to call the sum function written in assembly language (sum.s).
 */
// clang sum.s callsum.c -o callsum_x86_64 -arch x86_64
// clang sum.s callsum.c -o callsum_arm64 -arch arm64

#include <stdio.h>

// Assembly routine from sum.s: adds up the first `length` elements of `array`.
double sum(double array[], unsigned length);

// Prints the sum of several prefixes of a fixed test array, including the
// empty prefix (length 0).
int main(void) {
    double test[] = {
        40.5, 26.7, 21.9,
        1.5, -40.5, -23.4
    };

    printf("%20.7f\n", sum(test, 6));
    printf("%20.7f\n", sum(test, 2));
    printf("%20.7f\n", sum(test, 0));
    printf("%20.7f\n", sum(test, 3));
    return 0;
}


sum.s  Select all
// -----------------------------------------------------------------------
// A 64-bit function that returns the sum of the elements in a
// floating-point array for x86_64 and arm64. The function has prototype:
//
//   double sum(double[] array, unsigned length)
//
// `length` is a 32-bit unsigned value, so only the 32-bit view of its
// register (%esi / w1) is tested and decremented; the upper half of the
// 64-bit register is not guaranteed to be zero.
// -----------------------------------------------------------------------
    .global _sum
    .text
    .align 2
_sum:
#ifdef __x86_64__
    xorpd   %xmm0, %xmm0        // initialize the sum to 0
                                // floats are passed in xmm0
    cmp     $0, %esi            // special case for length = 0
    je      done
#endif
#ifdef __arm64__
    movi    d0, #0              // initialize the sum to 0
                                // floats are passed in s0-7 and doubles in the d0-7 registers.
    cmp     w1, #0              // special case for length = 0
    b.eq    done
#endif
next:
#ifdef __x86_64__
    addsd   (%rdi), %xmm0       // add in the current array element, return floating point value in xmm0
    add     $8, %rdi            // move to next array element
    dec     %esi                // count down
    jnz     next                // if not done counting, continue
#endif
#ifdef __arm64__
    ldr     d16, [x0]           // load the double into d16
    fadd    d0, d0, d16         // add in the current array element, return floating point value in d0
    add     x0, x0, #8          // move to next array element
    subs    w1, w1, #1          // count down (sets the flags)
    b.ne    next                // if not done counting, continue
#endif
done:
    ret


Monday, January 12, 2009

How-to do ARM GCC Inline Assembler for iPhone

The following code shows how to write inline assembler with llvm-gcc for the iPhone (ARM).

arithmetic_shift_right.c Select all

#include <stdio.h>
#include <stdlib.h>

/*
 * Reads an integer from the first command-line argument, shifts it right
 * by one bit with an ARM "mov ..., ASR #1" instruction and prints the result.
 * Exits with a failure status and a usage line when no argument is given.
 */
int main(int argc, char *argv[]) {
    if (argc < 2) {
        /* Without this check argv[1] is NULL and atoi() would dereference it. */
        fprintf(stderr, "usage: %s <integer>\n",
                argc > 0 ? argv[0] : "arithmetic_shift_right");
        return EXIT_FAILURE;
    }

    int i = atoi(argv[1]);
    /* i is both the input (%1) and the output (%0) of the instruction. */
    asm volatile ("mov %0, %1, ASR #1" : "=r"(i) : "r"(i));
    printf("arithmetic_shift_right is %d\n", i);
    return 0;
}


The general format (ref) is
asm volatile (assembler instructions : output operands (optional) : input operands (optional) : clobbered registers (optional) );

e.g.
asm volatile ("mul %0, %1, %2" : "=r" (result) : "r" (number1) , "r" (number2));


This version is for Assembler Macro

arithmetic_shift_right.c Select all

#include <stdio.h>
#include <stdlib.h>

/*
 * Returns `a` shifted right by one bit with the sign preserved.
 *
 * The plain declaration below (without `inline`) makes the definition that
 * follows an external definition under C99 and later, so the program still
 * links when the compiler decides not to inline the call.
 */
int arithmetic_shift_right(int a);

inline int arithmetic_shift_right(int a) {
    int y;
#if defined(__arm__)
    /* %0 (y) receives %1 (a) shifted right by 1 bit, with the sign maintained. */
    __asm__("mov %0, %1, ASR #1" : "=r" (y) : "r" (a));
#else
    /* Portable equivalent for non-ARM targets: only non-negative values are
     * shifted, so the result does not depend on how the compiler shifts
     * negative numbers. */
    y = (a >= 0) ? (a >> 1) : ~(~a >> 1);
#endif
    return y;
}

/*
 * Parses the first command-line argument as an integer and prints it next to
 * the value returned by arithmetic_shift_right(). Exits with a failure status
 * and a usage line when no argument is given.
 */
int main(int argc, char *argv[]) {
    if (argc < 2) {
        /* Without this check argv[1] is NULL and atoi() would dereference it. */
        fprintf(stderr, "usage: %s <integer>\n",
                argc > 0 ? argv[0] : "arithmetic_shift_right");
        return EXIT_FAILURE;
    }

    int i = atoi(argv[1]);
    printf("arithmetic_shift_right %d is %d\n", i, arithmetic_shift_right(i));
    return 0;
}



Below is the otool output for Assembler Macro
otool -tV Select all

_arithmetic_shift_right:
00001eb4 e92d4080 stmdb sp!, {r7, lr}
00001eb8 e28d7000 add r7, sp, #0 ; 0x0
00001ebc e24dd008 sub sp, sp, #8 ; 0x8
00001ec0 e58d0000 str r0, [sp]
00001ec4 e59d3000 ldr r3, [sp]
00001ec8 e1a030c3 mov r3, r3, asr #1       @@ arithmetic shift right
00001ecc e58d3004 str r3, [sp, #4]
00001ed0 e59d3004 ldr r3, [sp, #4]
00001ed4 e1a00003 mov r0, r3
00001ed8 e247d000 sub sp, r7, #0 ; 0x0
00001edc e8bd8080 ldmia sp!, {r7, pc}


Below is the otool output for Assembler Macro (after full optimization -O2)
otool -tV Select all

_arithmetic_shift_right:
00001f04 e1a000c0 mov r0, r0, asr #1
00001f08 e12fff1e bx lr




Assembler Macro for more than one assembler instruction
arithmetic shift right then perform 16 bit binary multiplication

Assembler Macros Select all

/* y = (a >> 1) * a.
 * "=&r" marks y as early-clobber: it is written by the first instruction
 * while a is still needed by the second, so the two must not share a
 * register. The mul operands are ordered so that the destination differs
 * from the first source register (believed to be required on older ARM
 * cores -- confirm against the architecture manual). */
__asm__("mov %0, %1, ASR #1\n\t"
"mul %0, %1, %0"
: "=&r" (y) : "r" (a));