Showing posts with label ARM. Show all posts
Showing posts with label ARM. Show all posts

Saturday, November 21, 2020

How to install tensorflow for the new Mac M1 hardware

Prerequisite: Xcode 12.2 and Command Line Tools for Xcode 12.2

(1) # Download the archive for this repo from https://github.com/apple/tensorflow_macos/releases
cd $HOME/Downloads/
curl -fsSLO https://github.com/apple/tensorflow_macos/releases/download/v0.1alpha0/tensorflow_macos-0.1alpha0.tar.gz
tar xzvf tensorflow_macos-0.1alpha0.tar.gz
/bin/bash ./tensorflow_macos/install_venv.sh --help

(2) # Download Miniconda from https://conda-forge.org/blog/posts/2020-10-29-macos-arm64/

(3) # Install Miniforge (the conda-forge, Miniconda-style installer); after installation, exit the shell and log in again
# The installer is a self-extracting script that must be run from a file whose name ends in .sh,
# so download it first instead of piping its contents into "bash -c".
curl -fsSLO https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh
/bin/bash ./Miniforge3-MacOSX-arm64.sh

(4) # Once it's installed, create a Python 3.8 env by running
conda create --name python38 python=3.8

(5) # Put a path to where the arm64 libraries are. For example...
libs="$HOME/Downloads/tensorflow_macos/arm64/"

(6) # Replace this with the path of your Conda environment
env="$HOME/miniforge3/envs/python38"

(7) # upgrade
conda upgrade -c conda-forge pip setuptools cached-property six

(8) # activate env
conda activate python38
# conda deactivate

(9) pip install --upgrade -t "$env/lib/python3.8/site-packages/" --no-dependencies --force "$libs/grpcio-1.33.2-cp38-cp38-macosx_11_0_arm64.whl"

(10) pip install --upgrade -t "$env/lib/python3.8/site-packages/" --no-dependencies --force "$libs/h5py-2.10.0-cp38-cp38-macosx_11_0_arm64.whl"

(11) pip install --upgrade -t "$env/lib/python3.8/site-packages/" --no-dependencies --force "$libs/numpy-1.18.5-cp38-cp38-macosx_11_0_arm64.whl"

(12) pip install --upgrade -t "$env/lib/python3.8/site-packages/" --no-dependencies --force "$libs/tensorflow_addons-0.11.2+mlcompute-cp38-cp38-macosx_11_0_arm64.whl"

(13) # install these
conda install -c conda-forge -y absl-py
conda install -c conda-forge -y astunparse
conda install -c conda-forge -y gast
conda install -c conda-forge -y opt_einsum
conda install -c conda-forge -y termcolor
conda install -c conda-forge -y typing_extensions
conda install -c conda-forge -y wheel
conda install -c conda-forge -y typeguard

pip install tensorboard

pip install wrapt flatbuffers tensorflow_estimator google_pasta keras_preprocessing protobuf



(14) pip install --upgrade -t "$env/lib/python3.8/site-packages/" --no-dependencies --force "$libs/tensorflow_macos-0.1a0-cp38-cp38-macosx_11_0_arm64.whl"

(15) # Run this to test
time python tftest.py


tftest.py    Select all
"""Smoke test for tensorflow_macos: fit y = a * X + b with plain SGD.

Prints the TensorFlow version, start and end timestamps, and the fitted
variables so the run can be timed with ``time python tftest.py``.
"""
from datetime import datetime

import numpy as np
import tensorflow as tf
from tensorflow.python.compiler.mlcompute import mlcompute

# Eager mode on the GPU is extremely slow, so run ML Compute on the CPU instead.
mlcompute.set_mlc_device(device_name="cpu")

print("Hello, Tensorflow! ", end='')
print(tf.__version__)
print("start" , datetime.now())

# Raw training data: six (x, y) samples.
X_raw = np.array([2013, 2014, 2015, 2016, 2017, 2018], dtype=np.float32)
y_raw = np.array([12000, 14000, 15000, 16500, 17500, 19000], dtype=np.float32)

# Min-max normalise both axes into [0, 1] before fitting.
X = (X_raw - X_raw.min()) / (X_raw.max() - X_raw.min())
y = (y_raw - y_raw.min()) / (y_raw.max() - y_raw.min())

X = tf.constant(X)
y = tf.constant(y)

# Trainable slope and intercept, both starting at zero.
a = tf.Variable(initial_value=0.)
b = tf.Variable(initial_value=0.)
variables = [a, b]

num_epoch = 10000
optimizer = tf.keras.optimizers.SGD(learning_rate=1e-3)
for e in range(num_epoch):
    # Record the forward pass so the tape can differentiate the loss.
    with tf.GradientTape() as tape:
        y_pred = a * X + b
        loss = 0.5 * tf.reduce_sum(tf.square(y_pred - y))
    # Gradients are taken after leaving the tape context, then applied.
    grads = tape.gradient(loss, variables)
    optimizer.apply_gradients(grads_and_vars=zip(grads, variables))

print(a, b)
print("end" , datetime.now())




(16) Test run this cnn.py https://github.com/apple/tensorflow_macos/issues/25

Monday, January 12, 2009

How-to do ARM GCC Inline Assembler for iPhone

The following code is an example of how to write inline assembler with llvm-gcc for the iPhone's ARM processor

arithmetic_shift_right.c Select all

#include <stdio.h>
#include <stdlib.h>

/*
 * Parse the first command-line argument as an integer, shift it right by one
 * bit with an ARM "mov ..., ASR #1" instruction, and print the result.
 *
 * Exits with EXIT_FAILURE and a usage message when no argument is given,
 * instead of handing a NULL argv[1] to atoi().
 */
int main(int argc, char *argv[]) {
    int i;

    if (argc < 2) {
        fprintf(stderr, "usage: %s <integer>\n",
                argc > 0 ? argv[0] : "arithmetic_shift_right");
        exit(EXIT_FAILURE);
    }

    i = atoi(argv[1]);
    /* %0 is the output operand and %1 the input operand; both are bound to i. */
    asm volatile ("mov %0, %1, ASR #1" : "=r"(i) : "r"(i));
    printf("arithmetic_shift_right is %d\n", i);
    exit(0);
}


The general format (ref) is
asm volatile (assembler instructions : output operands (optional) : input operands (optional) : clobbered registers (optional) );

e.g.
asm volatile ("mul %0, %1, %2" : "=r" (result) : "r" (number1) , "r" (number2));


This version is for Assembler Macro

arithmetic_shift_right.c Select all

#include <stdio.h>
#include <stdlib.h>

inline int arithmetic_shift_right(int a) {
int y;
__asm__("mov %0, %1, ASR #1" : "=r" (y) : "r" (a));
// Register R0 will become the value of register R1 shifted to the right by 1 bit, with the sign maintained.
return y;
}

/*
 * Parse the first command-line argument as an integer and print it together
 * with the value returned by arithmetic_shift_right().
 *
 * Exits with EXIT_FAILURE and a usage message when no argument is given,
 * instead of handing a NULL argv[1] to atoi().
 */
int main(int argc, char *argv[]) {
    int i;

    if (argc < 2) {
        fprintf(stderr, "usage: %s <integer>\n",
                argc > 0 ? argv[0] : "arithmetic_shift_right");
        exit(EXIT_FAILURE);
    }

    i = atoi(argv[1]);
    printf("arithmetic_shift_right %d is %d\n", i, arithmetic_shift_right(i));
    exit(0);
}



Below is the otool output for Assembler Macro
otool -tV Select all

_arithmetic_shift_right:
00001eb4 e92d4080 stmdb sp!, {r7, lr}
00001eb8 e28d7000 add r7, sp, #0 ; 0x0
00001ebc e24dd008 sub sp, sp, #8 ; 0x8
00001ec0 e58d0000 str r0, [sp]
00001ec4 e59d3000 ldr r3, [sp]
00001ec8 e1a030c3 mov r3, r3, asr #1       @@ arithmetic shift right
00001ecc e58d3004 str r3, [sp, #4]
00001ed0 e59d3004 ldr r3, [sp, #4]
00001ed4 e1a00003 mov r0, r3
00001ed8 e247d000 sub sp, r7, #0 ; 0x0
00001edc e8bd8080 ldmia sp!, {r7, pc}


Below is the otool output for Assembler Macro (after full optimization -O2)
otool -tV Select all

_arithmetic_shift_right:
00001f04 e1a000c0 mov r0, r0, asr #1
00001f08 e12fff1e bx lr




Assembler Macro for more than one assembler instruction
Arithmetic shift right, then multiply the shifted value by the original input (ARM mul is a 32-bit multiplication)

Assembler Macros Select all

/*
 * Two-instruction sequence: y = (a >> 1) * a.
 *
 * The output is marked early-clobber ("=&r") because the first instruction
 * writes %0 before the second instruction reads the input %1. Without '&'
 * the compiler may place both operands in the same register, and the
 * multiply would then read the shifted value instead of a.
 *
 * The multiply is written as "mul %0, %1, %0" so that the destination and
 * the first source register differ, which ARM cores before ARMv6 require;
 * the product is unchanged because multiplication is commutative.
 */
__asm__("mov %0, %1, ASR #1\n\t"
        "mul %0, %1, %0"
        : "=&r" (y) : "r" (a));