Showing posts with label arm64. Show all posts
Showing posts with label arm64. Show all posts

Thursday, April 8, 2021

How to write a subroutine call in macOS assembly code for x86_64 and arm64

(1) Demo of a subroutine call in macOS assembly code for x86_64 and arm64. On an M1 Mac with Xcode and Rosetta 2 installed, both the x86_64 and the arm64 binaries can be compiled and run.
callfactorial.c  Select all
/*
 * An application that illustrates calling the factorial function
 * defined elsewhere (in assembly).
 */
#include <stdio.h>
#include <inttypes.h>

/* Defined outside this file; takes n and returns a 64-bit unsigned result. */
uint64_t factorial(unsigned n);

int main(void)
{
    /* Print factorial(i) for i = 0..19, one value per line. */
    for (unsigned i = 0; i < 20; i++) {
        /* PRIu64 is the format macro that matches uint64_t on every platform. */
        printf("factorial(%2u) = %" PRIu64 "\n", i, factorial(i));
    }
    return 0;
}


factorial.s  Select all
// ----------------------------------------------------------------------
// A 64-bit recursive implementation of the function
//
//     uint64_t factorial(unsigned n)
//
// implemented recursively with x86_64 and arm64 assembly code.
// The #ifdef sections select the code for the architecture being built.
//
// Argument:  n arrives in the first integer argument register
//            (low 32 bits of %rdi on x86_64, w0 on arm64).
// Result:    n! is returned in %rax (x86_64) or x0 (arm64).
// -----------------------------------------------------------------------
        .globl  _factorial
        .text
#ifdef __arm64__
        .align  4
#endif
_factorial:
#ifdef __x86_64__
        mov     %edi, %edi              // zero-extend n: only the low 32 bits carry the argument
        cmp     $1, %rdi                // n <= 1?
        jnbe    L1                      // if not, go do a recursive call
        mov     $1, %rax                // otherwise return 1
        ret
#endif
#ifdef __arm64__
        mov     w0, w0                  // zero-extend n: writing w0 clears the upper half of x0
        cmp     x0, #1                  // n > 1?
        b.hi    L1                      // unsigned compare; if yes, go do a recursive call
        mov     x0, #1                  // otherwise return 1
        ret
#endif
L1:
#ifdef __x86_64__
        push    %rdi                    // save n on stack (also aligns %rsp!)
        dec     %rdi                    // n-1
        call    _factorial              // factorial(n-1), result goes in %rax
        pop     %rdi                    // restore n
        imul    %rdi, %rax              // n * factorial(n-1), stored in %rax
        ret
#endif
#ifdef __arm64__
        stp     x0, lr, [sp, #-16]!     // push n and LR(x30); LR is needed to return from this call
        sub     x0, x0, #1              // n-1 becomes the argument of the recursive call
        bl      _factorial              // factorial(n-1), result goes in x0
        ldp     x8, lr, [sp], #16       // pop the saved n into x8 and restore LR(x30)
        mul     x0, x0, x8              // n * factorial(n-1), stored in x0
        ret
#endif


(2) To compile with -g and codesign the program so as to debug in lldb under macOS.
shell script  Select all
# To compile and codesign the x86_64 version.
# The -o name must match the file that codesign signs afterwards.
clang factorial.s callfactorial.c -g -o callfactorial_x86_64 -arch x86_64 && codesign --entitlements entitlements --force -s - callfactorial_x86_64

# To compile and codesign the arm64 version.
clang factorial.s callfactorial.c -g -o callfactorial_arm64 -arch arm64 && codesign --entitlements entitlements --force -s - callfactorial_arm64


(3) To debug using lldb
shell script  Select all
lldb callfactorial_x86_64 # or lldb callfactorial_arm64 # lldb debug session for arm64 - useful commands (lldb) breakpoint set --name main --name factorial (lldb) breakpoint list (lldb) run (lldb) step (lldb) po i (lldb) reg read x0 x8 lr pc (lldb) reg read -f t cpsr # lldb debug session for x86_64 - useful commands (lldb) reg read -f d rax rdi rflags (lldb) reg read -f t rflags # print the address value in the stackpointer for x86_64 (lldb) p *(int **)$sp # hint: to search lldb command history use ctrl-r


Saturday, April 3, 2021

How to use inline assembly language for M1 Mac for x86_64 and arm64

(1) Demo the inline assembly code for x86_64 and arm64
add-inline.c  Select all
#include <stdio.h>

// compile with command line
//   clang add-inline.c -o add-inline_x86_64 -arch x86_64
//   clang add-inline.c -o add-inline_arm64 -arch arm64
// disassemble with
//   otool -otV add-inline_x86_64
//   otool -otV add-inline_arm64

/*
 * Adds two integers with inline assembly (x86_64 or arm64) and prints the
 * result.
 */
int main(int argc, const char * argv[]) {
    int a = 10;
    int b = 25;
    int ans = 0;

#ifdef __x86_64__
    __asm__(
        "mov %1, %0\n\t"    // %0 = %1       (copy a into the output register)
        "add %2, %0\n\t"    // %0 += %2      (source first, destination last)
        : "=&r"(ans)        // early-clobber: %0 is written before %2 is read
        : "r"(a), "r"(b)    // inputs are only read, never modified
        : "cc"              // add updates the flags
    );
#endif
#ifdef __arm64__
    __asm__(
        "add %w0, %w1, %w2\n\t"   // %w0 = %w1 + %w2 (destination first)
        : "=r"(ans)
        : "r"(a), "r"(b)
    );
#endif

    printf("The answer is %d\n", ans);
    return 0;
}


(2) Demo the assembly source code as a separate function for x86_64 and arm64
add-main.c  Select all
// compile with command line
//   clang add-main.c add.s -o add-main_x86_64 -arch x86_64
//   clang add-main.c add.s -o add-main_arm64 -arch arm64
#include <stdio.h>

/* Defined outside this file (the compile commands above link add.s). */
int add(int x, int y);

/* Calls add(15, 40) and prints the returned value. */
int main(int argc, const char * argv[]) {
    int ans = add(15, 40);
    printf("The answer is %d\n", ans);
    return 0;
}


add.s  Select all
// int add(int x, int y) for x86_64 and arm64: returns x + y.
        .text
        .globl  _add
        .align  2
_add:
#ifdef __x86_64__
        // x86_64 calling convention:
        //   integer arguments: rdi, rsi, rdx, rcx, r8, r9
        //   32-bit views:      edi, esi, edx, ecx, r8d, r9d
        //   16-bit views:      di, si, dx, cx, r8w, r9w
        //   the syscall number is placed in rax
        //   see https://sigsegv.pl/osx-bsd-syscalls/
        //   the return value is in rax
        mov     %edi, %eax              // start from the first argument
        add     %esi, %eax              // %eax += %esi, source is the first operand
#endif
#ifdef __arm64__
        // arm64 calling convention:
        //   integer arguments: x0, x1, x2, x3, x4, x5, x6, x7
        //   32-bit views:      w0-w7
        //   the syscall number is placed in x16 on macOS (x8 on Linux)
        //   see https://wiki.cdot.senecacollege.ca/wiki/Syscalls
        //   the return value is in x0
        add     w0, w1, w0              // w0 = w1 + w0, destination is the first operand
#endif
        ret




entitlements  Select all
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<!-- Entitlements file handed to codesign in the steps of this post. -->
<dict>
    <!-- Sets get-task-allow to true; presumably this is what lets lldb attach to the signed binary (confirm against Apple documentation). -->
    <key>com.apple.security.get-task-allow</key>
    <true/>
</dict>
</plist>


(3) Compile with
clang add-inline.c -o add-inline_x86_64 -arch x86_64
clang add-inline.c -o add-inline_arm64 -arch arm64
clang add-main.c add.s -o add-main_x86_64 -arch x86_64
clang add-main.c add.s -o add-main_arm64 -arch arm64


(4) Disassemble with (e.g.)
otool -otV add-inline_arm64

(5) Install Rosetta 2 on M1 to run the x86_64 version (e.g.)
./add-main_x86_64

(6) codesign and debug with lldb (e.g.)
clang -g -o add-inline_x86_64 add-inline.c -arch x86_64
codesign --entitlement entitlements --force -s - add-inline_x86_64
lldb add-inline_x86_64
(lldb) breakpoint set --file add-inline.c --line 7


(7) floating point example for x86_64 and arm64
compile with
clang sum.s callsum.c -o callsum_x86_64 -arch x86_64
clang sum.s callsum.c -o callsum_arm64 -arch arm64
Debug and codesign as in the example above.
callsum.c  Select all
/*
 * callsum.c
 *
 * Shows how to call the sum function that is written in assembly
 * language (sum.s).
 */

// clang sum.s callsum.c -o callsum_x86_64 -arch x86_64
// clang sum.s callsum.c -o callsum_arm64 -arch arm64

#include <stdio.h>

/* Defined in sum.s: takes an array of doubles and an element count. */
double sum(double[], unsigned);

int main() {
    double test[] = { 40.5, 26.7, 21.9, 1.5, -40.5, -23.4 };
    /* Element counts to try, in the order the results are printed. */
    const unsigned counts[] = { 6, 2, 0, 3 };
    const unsigned how_many = sizeof counts / sizeof counts[0];

    for (unsigned k = 0; k < how_many; k++) {
        double total = sum(test, counts[k]);
        printf("%20.7f\n", total);
    }
    return 0;
}


sum.s  Select all
// -----------------------------------------------------------------------
// A 64-bit function that returns the sum of the elements in a
// floating-point array for x86_64 and arm64. The function has prototype:
//
//     double sum(double[] array, unsigned length)
//
// Arguments: array pointer in %rdi / x0, length in %esi / w1.
//            length is a 32-bit unsigned value, so only the 32-bit
//            register view is tested and decremented.
// Result:    the sum is returned in %xmm0 / d0 (0.0 when length is 0).
// -----------------------------------------------------------------------
        .global _sum
        .text
        .align  2
_sum:
#ifdef __x86_64__
        xorpd   %xmm0, %xmm0            // initialize the sum to 0; the result is returned in xmm0
        test    %esi, %esi              // special case for length = 0
        je      done
#endif
#ifdef __arm64__
        movi    d0, #0                  // initialize the sum to 0; doubles use the d registers
        cbz     w1, done                // special case for length = 0
#endif
next:
#ifdef __x86_64__
        addsd   (%rdi), %xmm0           // add in the current array element
        add     $8, %rdi                // move to next array element
        dec     %esi                    // count down
        jnz     next                    // if not done counting, continue
#endif
#ifdef __arm64__
        ldr     d16, [x0]               // load the current element into d16
        fadd    d0, d0, d16             // add it to the running sum in d0
        add     x0, x0, #8              // move to next array element
        sub     w1, w1, #1              // count down
        cbnz    w1, next                // if not done counting, continue
#endif
done:
        ret