There are several ways to access the Metal API from Python. I chose to access it via Swift (the standard way).
Here are the instructions for building the Swift package and loading the resulting dylib from Python.
import platform
import numpy as np
import ctypes
import time
import math
import time
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
!python --version
# Open the Swift bridge library; ctypes.CDLL plays the role of dlopen here.
swift_fun = ctypes.CDLL("./PyMetalBridge/.build/release/libPyMetalBridge.dylib")
# Declare the C signature ctypes should use for the exported function:
# two float pointers followed by an int.
swift_fun.swift_sigmoid_on_gpu.argtypes = (
    [ctypes.POINTER(ctypes.c_float)] * 2 + [ctypes.c_int]
)
def swift_sigmoid_on_gpu(input_array):
    """Call the bridge library's ``swift_sigmoid_on_gpu`` and return its output.

    The native function receives a pointer to the input floats, a pointer to
    an output buffer of the same length, and the element count. Judging by
    its name it evaluates the sigmoid on the GPU; the Swift side is not
    visible from this file.

    Args:
        input_array: Array-like of numbers. It is converted to a
            C-contiguous float32 array first, because the native side is
            handed a raw ``float*`` and would otherwise misread the bytes of
            e.g. a float64 or strided array.

    Returns:
        A float32 NumPy array with the same shape as the converted input.
    """
    float_ptr = ctypes.POINTER(ctypes.c_float)
    values = np.ascontiguousarray(input_array, dtype=np.float32)
    # Zero-initialised so an untouched buffer reads as zeros, not garbage.
    output = np.zeros_like(values)
    if values.size == 0:
        # Nothing to compute; skip the native call for empty input.
        return output
    swift_fun.swift_sigmoid_on_gpu(
        values.ctypes.data_as(float_ptr),
        output.ctypes.data_as(float_ptr),
        values.size,  # total element count, correct for any input shape
    )
    return output
# Draw 100 uniform samples from [-1, 1); the GPU path needs float32 data.
input_array = np.random.uniform(-1, 1, 100).astype(np.float32)
swift_result = swift_sigmoid_on_gpu(input_array)
swift_result
It seems to work fine, but you may be wondering whether the calculation result is correct. So let's implement a sigmoid function in Python to verify and compare the results.
def sigmoid(x):
    """Return the logistic function 1 / (1 + e**-x), element-wise via NumPy."""
    decay = np.exp(-x)
    return 1 / (1 + decay)
# Reference result computed with NumPy, cast to float32 for the comparison.
python_result = sigmoid(input_array).astype(np.float32)
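Let's compare swift_result and python_result using cosine_similarity to validate the calculation. (Note that cosine similarity only compares direction, so it would not reveal a constant scale factor between the two results.)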
# Sanity check: comparing a vector with itself gives a matrix whose entries are all 1.
cosine_similarity([python_result, python_result])
# Entries close to 1 here mean the GPU and NumPy results point in the same direction.
cosine_similarity([swift_result, python_result])
Wow! Almost the same!
Even if we can access the Metal API from Python, there is no point unless the processing performance
improves. So let's run a complex calculation in both pure Python and Metal to compare their performance.
This time, we expand the Fourier series (as shown below) for 100,000 sample values
created with np.arange(-50, 50, 0.001), and then compute its first-order numerical derivative.
def f(x):
    """Return the partial Fourier sum of sin(k*x)/k over odd k from 1 to 9999.

    Works on scalars and NumPy arrays alike (element-wise for arrays).
    """
    odd_harmonics = range(1, 10000, 2)
    # sum() starts from 0 and adds the terms in ascending k order.
    return sum((1 / k) * np.sin(k * x) for k in odd_harmonics)
def differential(f, x, delta=1e-4):
    """Approximate the first derivative of ``f`` at ``x`` by a central difference.

    Computes ``(f(x + delta) - f(x - delta)) / (2 * delta)``.

    Args:
        f: Callable accepting ``x + delta`` and ``x - delta``; scalars and
            NumPy arrays both work as long as ``f`` supports them.
        x: Point (or array of points) at which to differentiate.
        delta: Half-width of the difference stencil (default 1e-4).

    Returns:
        The central-difference estimate, with the same shape as ``f(x)``.
    """
    # The denominator must be parenthesised: `/ 2*delta` parses as
    # `(... / 2) * delta`, which scales the result by delta**2.
    return (f(x + delta) - f(x - delta)) / (2 * delta)
# Sample points from -50 towards 50 in steps of 0.001, as float32 for the GPU.
input_array = np.arange(-50, 50, 0.001).astype(np.float32)
input_array.shape
%%time
# NumPy baseline being timed. input_array is 1-D, so apply_along_axis hands the
# whole array to the lambda in a single call.
python_result = np.apply_along_axis(lambda x: differential(f, x), 0, input_array)
// Partial Fourier sum for one sample: adds sin(k*x)/k for the odd
// harmonics k = 1, 3, ..., 9999.
inline float f(const float x) {
    float sum = 0;
    int k = 1;
    while (k < 10000) {
        sum += (1.0f / k) * sin(k * x);
        k += 2;
    }
    return sum;
}
// Half-width of the central-difference stencil.
constant float delta = 1e-4;

// Central-difference derivative of f, one input element per thread:
// outVector[id] = (f(x + delta) - f(x - delta)) / (2 * delta), x = inVector[id].
// NOTE(review): there is no bounds check on `id`; this assumes the host
// dispatches exactly one thread per element. Confirm against the Swift side.
kernel void differential(const device float *inVector [[ buffer(0) ]],
                         device float *outVector [[ buffer(1) ]],
                         uint id [[ thread_position_in_grid ]]) {
    const float x = inVector[id];
    // Parenthesise the denominator: `/ 2.0f*delta` parses as
    // `(... / 2.0f) * delta`, which multiplies by delta instead of dividing.
    outVector[id] = (f(x + delta) - f(x - delta)) / (2.0f * delta);
}
# Open the Swift bridge library for the differentiation example.
swift_fun = ctypes.CDLL("./PyMetalBridge/.build/release/libPyMetalBridge.dylib")
# Declare the C signature ctypes should use for the exported function:
# two float pointers followed by an int.
swift_fun.swift_differential_on_gpu.argtypes = (
    [ctypes.POINTER(ctypes.c_float)] * 2 + [ctypes.c_int]
)
def swift_differential_on_gpu(input_array):
    """Call the bridge library's ``swift_differential_on_gpu`` and return its output.

    The native function receives a pointer to the input floats, a pointer to
    an output buffer of the same length, and the element count. Judging by
    its name it runs the numerical differentiation on the GPU; the Swift
    side is not visible from this file.

    Args:
        input_array: Array-like of numbers. It is converted to a
            C-contiguous float32 array first, because the native side is
            handed a raw ``float*`` and would otherwise misread the bytes of
            e.g. a float64 or strided array.

    Returns:
        A float32 NumPy array with the same shape as the converted input.
    """
    float_ptr = ctypes.POINTER(ctypes.c_float)
    values = np.ascontiguousarray(input_array, dtype=np.float32)
    # Zero-initialised so an untouched buffer reads as zeros, not garbage.
    output = np.zeros_like(values)
    if values.size == 0:
        # Nothing to compute; skip the native call for empty input.
        return output
    swift_fun.swift_differential_on_gpu(
        values.ctypes.data_as(float_ptr),
        output.ctypes.data_as(float_ptr),
        values.size,  # total element count, correct for any input shape
    )
    return output
%%time
# First timed run of the Metal implementation on the full input array.
swift_result = swift_differential_on_gpu(input_array)
%%time
# Same call timed again; the text below quotes the timing of this second run.
swift_result = swift_differential_on_gpu(input_array)
As a result, Python + NumPy took 3.52 s, but Python + Metal took 8.1 ms on the second run. So Metal is dramatically faster!
# Both results should have the same shape before they are compared.
swift_result.shape, python_result.shape
# Sanity check: comparing a vector with itself gives a matrix whose entries are all 1.
cosine_similarity([python_result, python_result])
# Cast the NumPy result to float32 before comparing it with the GPU result.
cosine_similarity([swift_result, python_result.astype("float32")])
Wow! Almost the same.
def measure(f, repeats=10, size=100000):
    """Time repeated calls of ``f`` on freshly generated random input.

    Args:
        f: Callable taking one 1-D float32 NumPy array; its return value is
            ignored.
        repeats: Number of timed calls (default 10).
        size: Number of samples in each random input array (default 100000).

    Returns:
        A list with the elapsed time of each call in seconds, in call order.
    """
    elapsed_logs = []
    for _ in range(repeats):
        # Input generation stays outside the timed region.
        input_array = np.random.uniform(-50, 50, size).astype("float32")
        # perf_counter is monotonic and high-resolution, unlike time.time().
        start = time.perf_counter()
        f(input_array)
        elapsed_logs.append(time.perf_counter() - start)
    return elapsed_logs
# np.apply_along_axis
# Benchmark the NumPy implementation; measure() returns one elapsed time per run.
python_results = measure(lambda input_array: np.apply_along_axis(lambda x: differential(f, x), 0, input_array))
print("Python time elapsed (second)")
python_results
# Metal
# Benchmark the Metal implementation called through the Swift bridge.
swift_results = measure(swift_differential_on_gpu)
print("Metal time elapsed (second)")
swift_results
%matplotlib inline
# Plot the per-run timings of both implementations on one chart.
fig, ax = plt.subplots(figsize=(10, 4))
# Take the x positions from the data so the plot stays valid if the number
# of timed runs changes.
ax.plot(range(len(swift_results)), swift_results, '-o', label='Python+Metal')
ax.plot(range(len(python_results)), python_results, '-o', label='Python + Numpy')
ax.set_title('Numpy and Metal performance comparison')
ax.set_xlabel('iteration')
ax.set_ylabel('time elapsed (second)')
ax.grid(True)
ax.legend()