Tuesday, January 28, 2014

OpenCL HelloWorld Example

This sample code is taken from https://developer.apple.com/library/mac/samplecode/OpenCL_Hello_World_Example/Introduction/Intro.html

and enhanced with
i) use all supported cpu and gpu devices
ii) print gpu/cpu vendor and device name and supported OpenCL version
iii) compute execution time -> gpu_time
iv) create a FAT binary for arm and x86

My testing results are
"Time taken by NVIDIA GeForce GT 650M is 0.883 ms"
"Time taken by Intel HD Graphics 4000 is 1.302 ms"
"Time taken by Intel Intel(R) Core(TM) i7-3820QM CPU @ 2.70GHz is 10.188 ms"
"Time taken by Intel Intel(R) Core(TM) i7-2635QM CPU @ 2.00GHz is 12.802 ms"
"Time taken by Apple ARM CPU Compute Device is 1.637 ms" (DATA_SIZE 102400 only)

This source code can be compiled using command line
clang -arch x86_64 -isysroot /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.9.sdk -framework OpenCL hello.c -o hello
or for MacOSX10.8 Mountain Lion
clang -arch x86_64 -isysroot /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.8.sdk -framework OpenCL hello.c -o hello
or for iOS (OpenCL iOS header files are here)
clang -arch armv7 -arch arm64 -isysroot /Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS7.0.sdk -F/Applications/Xcode.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS7.0.sdk/System/Library/PrivateFrameworks -I./include -framework OpenCL hello.c -o hello.arm

Then create a FAT binary and codesign
xcrun lipo hello.arm -arch x86_64 hello -create -output opencl_hello
CODESIGN_ALLOCATE="/Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/codesign_allocate" ldid -S opencl_hello

hello.c    Select all
//////////////////////////////////////////////////////////////////////////////// #include <fcntl.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <math.h> #include <unistd.h> #include <sys/types.h> #include <sys/stat.h> #include <OpenCL/opencl.h> //////////////////////////////////////////////////////////////////////////////// // Use a static data size for simplicity // #if defined(__APPLE__) && defined(__MACH__) /* Apple OSX and iOS (Darwin). ------------------------------ */ #include <TargetConditionals.h> #if TARGET_IPHONE_SIMULATOR == 1 /* iOS in Xcode simulator */ #elif TARGET_OS_IPHONE == 1 #define DATA_SIZE (102400) #elif TARGET_OS_MAC == 1 #define DATA_SIZE (1024000) #endif #endif //////////////////////////////////////////////////////////////////////////////// // Simple compute kernel which computes the square of an input array // const char *KernelSource = "\n" \ "__kernel void square( \n" \ " __global float* input, \n" \ " __global float* output, \n" \ " const unsigned int count) \n" \ "{ \n" \ " int i = get_global_id(0); \n" \ " if(i < count) \n" \ " output[i] = input[i] * input[i]; \n" \ "} \n" \ "\n"; //////////////////////////////////////////////////////////////////////////////// int testdevice(cl_device_id device_id) { int err; // error code returned from api calls float data[DATA_SIZE]; // original data set given to device float results[DATA_SIZE]; // results returned from device unsigned int correct; // number of correct results returned size_t global; // global domain size for our calculation size_t local; // local domain size for our calculation cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel cl_mem input; // device memory used for the input array cl_mem output; // device memory used for the output array // Fill our data set with random float values // unsigned int i = 0; unsigned int count = DATA_SIZE; for(i = 0; i < count; i++) data[i] = rand() / (float)RAND_MAX; if (device_id == NULL) { printf("Failed to create a device group"); return EXIT_FAILURE; } // Get some information about the returned device cl_char vendor_name[1024] = {0}; cl_char device_name[1024] = {0}; size_t returned_size = 0; err = clGetDeviceInfo(device_id, CL_DEVICE_VENDOR, sizeof(vendor_name), vendor_name, &returned_size); err |= clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(device_name), device_name, &returned_size); if (err != CL_SUCCESS) return EXIT_FAILURE; printf("Connecting to %s %s...\n", vendor_name, device_name); // Create a compute context // context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n"); return EXIT_FAILURE; } // Create a command commands // Create Queue with Profiling enabled commands = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err); if (!commands) { printf("Error: Failed to create a command commands!\n"); return EXIT_FAILURE; } // Create the compute program from the source buffer // program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err); if (!program) { printf("Error: Failed to create compute program!\n"); return EXIT_FAILURE; } // Build the program executable // err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); exit(1); } // Create the compute kernel in the program we wish to run // kernel = clCreateKernel(program, "square", &err); if (!kernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); exit(1); } // Create the input and output arrays in device memory for our calculation // input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * count, NULL, NULL); output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL); if (!input || !output) { printf("Error: Failed to allocate device memory!\n"); exit(1); } // Write our data set into the input array in device memory // err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array!\n"); exit(1); } // Set the arguments to our compute kernel // err = 0; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output); err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments! %d\n", err); exit(1); } // Get the maximum work group size for executing the kernel on the device // err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to retrieve kernel work group info! %d\n", err); exit(1); } // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device // global = count; cl_event myEvent = NULL; err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, &myEvent); if (err) { printf("Error: Failed to execute kernel!\n"); return EXIT_FAILURE; } // Wait for the command commands to get serviced before reading back results // clFinish(commands); // Ensure kernel execution is finished clWaitForEvents(1 , &myEvent); // compute gpu_time // cl_ulong endTime, startTime; clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_START,sizeof(startTime), &startTime, NULL); clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END,sizeof(endTime), &endTime, NULL); double gpu_time = endTime-startTime; // Read back the results from the device to verify the output // err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL ); if (err != CL_SUCCESS) { printf("Error: Failed to read output array! %d\n", err); exit(1); } // Validate our results // correct = 0; for(i = 0; i < count; i++) { if(results[i] == data[i] * data[i]) correct++; } // Print a brief summary detailing the results // printf("Computed '%d/%d' correct values!\n", correct, count); printf("\nTime taken by %s %s is %0.3f ms\n\n", vendor_name, device_name, gpu_time/1000000.0); // Shutdown and cleanup // clReleaseMemObject(input); clReleaseMemObject(output); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); return 0; } int main(int argc, char* const argv[]) { cl_uint num_devices, i; clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices); cl_device_id* devices = calloc(sizeof(cl_device_id), num_devices); clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL); char buf[128]; for (i = 0; i < num_devices; i++) { clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, 128, buf, NULL); fprintf(stdout, "Vendor \"%s\" ", buf); clGetDeviceInfo(devices[i], CL_DEVICE_NAME, 128, buf, NULL); fprintf(stdout, "Device \"%s\" supports ", buf); clGetDeviceInfo(devices[i], CL_DEVICE_VERSION, 128, buf, NULL); fprintf(stdout, "\"%s\"\n\n", buf); testdevice(devices[i]); } free(devices); }

The complete Project folder with Makefile (with other demo programs) is here

Thursday, January 9, 2014

How to install perl, theos and llvm-clang in iOS for iOS SDK6.1

(1) Install the following packages in Cydia
    APT 0.6 Transitional (and all its dependencies)

(2) If you have previous installation of odcctools, iPhone-gcc, theos and perl, you should first use SSH login shell and remove the packages by using commands
    apt-get remove iphone-gcc odcctools
    apt-get remove perl net.howett.theos

(3) Use SSH login shell commands to install perl & theos & llvm-clang
    apt-get install org.coolstar.cctools org.coolstar.ld64 org.coolstar.llvm-clang
    apt-get install coreutils wget make ldid zip unzip git subversion com.ericasadun.utilities
    cd /var
    git clone git://github.com/coolstar/theos
    wget --no-check-certificate -O org.coolstar.perl.deb 'http://d.pr/f/R0nx+'
    dpkg -i org.coolstar.perl.deb

(4) Download iPhoneOS6.1.sdk.tgz from here

(5) copy iPhoneOS6.1.sdk.tgz to iPhone

(6) Install SDK and additional libraries to sdks under theos

    tar xzvf iPhoneOS6.1.sdk.tgz
    mkdir -p /var/theos/sdks
    mv iPhoneOS6.1.sdk /var/theos/sdks/

# if your device is arm64 (that is iPad Mini 2, iPad Air or iPhone 5s)
    cd /var/theos/makefiles/targets
    ln -s Darwin-arm Darwin-arm64
    cd /var/theos/makefiles/platform
    ln -s Darwin-arm Darwin-arm64

# clone iphoneheaders.git
    cd /var/theos/
    mv include include.bak
    git clone git://github.com/rpetrich/iphoneheaders.git include
    for FILE in include.bak/*.h; do mv $FILE include/; done
    rmdir -fr include.bak/

(7) Create a command line tool project
    cd ~
    /var/theos/bin/nic.pl blocktest

(8) Choose [4.] iphone/tool

(9) Edit blocktest/main.mm like this to test block
main.mm Select all
#include <stdio.h> void EvalFuncOnGrid( double(^block)(float) ) { int i; for ( i = 0; i < 5 ; ++i ) { float x = i * 0.1; printf("%f %f\n", x, block(x)); } } void Caller(void) { float forceConst = 3.445; EvalFuncOnGrid(^(float x){ return 0.5 * forceConst * x * x; }); } int main(void) { Caller(); }

(10) Modify blocktest/Makefile like this
Makefile (Tool) Select all
TARGET := iphone:clang
TARGET_SDK_VERSION := 6.1 TARGET_IPHONEOS_DEPLOYMENT_VERSION := 6.1 ARCHS := armv7 include theos/makefiles/common.mk TOOL_NAME = blocktest blocktest_FILES = main.mm include $(THEOS_MAKE_PATH)/tool.mk

(11) Make and test run

    cd ~/blocktest
    make clean

(12) get ilogit for test build package
    cd ~
    wget --no-check-certificate https://dl.dropboxusercontent.com/u/15373/Other/iPhone/ilogit-tweak-ios7-example.tar
    tar -xf ilogit-tweak-ios7-example.tar

    #make symlink
    cd ~/ilogit
    ln -s /var/theos theos

(13) Modify Makefile, like this

Makefile    Select all
TARGET := iphone:clang TARGET_SDK_VERSION := 6.1 TARGET_IPHONEOS_DEPLOYMENT_VERSION = 6.1 ARCHS = armv6 armv7 # test build multiple archs include theos/makefiles/common.mk TWEAK_NAME = iLogIt iLogIt_FILES = Tweak.xm iLogIt_LIBRARIES = substrate
include $(THEOS_MAKE_PATH)/tweak.mk

(14) Test make package

    make clean
    make package

If you need gdb and debugserver for iOS see here