How to Access Metal API from Python?

There are several ways to access the Metal API from Python. I chose to access it via Swift, which is the standard way to use Metal.

Here are the steps for building the Swift code and loading the resulting dylib from Python.

  1. Write a Swift program that uses Metal
  2. Build and export the Swift program as a dynamic library
  3. Load the dylib (created in step 2) in Python with ctypes

Example

In [1]:
import platform
import numpy as np
import ctypes
import time
import math
import time
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
In [2]:
!python --version
Python 3.8.0
In [3]:
# Open the compiled Swift bridge library; ctypes.CDLL is the Python
# counterpart of dlopen().
swift_fun = ctypes.CDLL("./PyMetalBridge/.build/release/libPyMetalBridge.dylib")

# Declare the parameter types of the exported function so that ctypes
# converts the arguments correctly: two float pointers followed by an int.
swift_fun.swift_sigmoid_on_gpu.argtypes = (
    [ctypes.POINTER(ctypes.c_float)] * 2 + [ctypes.c_int]
)

def swift_sigmoid_on_gpu(input_array):
    """Call the native ``swift_sigmoid_on_gpu`` entry point on a 1-D array.

    Args:
        input_array: 1-D array-like of numbers. It is converted to a
            C-contiguous float32 array before being handed to the native
            code, so float64 data, strided views and plain lists are safe.

    Returns:
        A new 1-D float32 ``numpy.ndarray`` of ``len(input_array)`` values
        filled in by the native function.
    """
    float_ptr = ctypes.POINTER(ctypes.c_float)

    # The native side reads the buffer as densely packed 32-bit floats.
    # Passing the raw pointer of a float64 or non-contiguous array would be
    # silently misinterpreted, so normalise the layout first (this is a
    # no-op when the input is already contiguous float32).
    data = np.ascontiguousarray(input_array, dtype=np.float32)
    count = len(data)

    # Let the native code write straight into a NumPy buffer instead of a
    # temporary ctypes array that would have to be copied afterwards.
    result = np.empty(count, dtype=np.float32)
    if count == 0:
        # Nothing to compute; skip the native call entirely.
        return result

    swift_fun.swift_sigmoid_on_gpu(
        data.ctypes.data_as(float_ptr),
        result.ctypes.data_as(float_ptr),
        count,
    )
    return result
In [4]:
# Build 100 random samples in [-1, 1) and run them through the Swift/Metal bridge.
input_array = np.random.uniform(-1, 1, 100).astype("float32") # the data type has to be float32 for the GPU
swift_result = swift_sigmoid_on_gpu(input_array)
swift_result
Out[4]:
array([0.47861016, 0.27524903, 0.71253306, 0.5064557 , 0.64430207,
       0.389694  , 0.35691467, 0.60283643, 0.44320086, 0.31876338,
       0.53592247, 0.67109865, 0.7117729 , 0.5919577 , 0.59793675,
       0.6869189 , 0.61848646, 0.30827925, 0.2912722 , 0.44238877,
       0.33993402, 0.42418775, 0.3466718 , 0.27549532, 0.3260893 ,
       0.28546685, 0.30542856, 0.539691  , 0.48059276, 0.4627214 ,
       0.393174  , 0.37286943, 0.31088033, 0.2696172 , 0.3883789 ,
       0.35709983, 0.6672036 , 0.4951041 , 0.3742879 , 0.6878597 ,
       0.5487302 , 0.49910074, 0.3318065 , 0.64128214, 0.72707283,
       0.2965912 , 0.65328604, 0.6028999 , 0.4742002 , 0.63668376,
       0.62638754, 0.34848648, 0.34663606, 0.30757183, 0.6053475 ,
       0.728238  , 0.718438  , 0.37307972, 0.27234185, 0.43797404,
       0.6288411 , 0.71517515, 0.3995847 , 0.49197   , 0.2955922 ,
       0.6222965 , 0.7299687 , 0.39750096, 0.6785631 , 0.3320321 ,
       0.4686196 , 0.36608127, 0.6808817 , 0.5210217 , 0.34256107,
       0.65955216, 0.48564228, 0.50396264, 0.69157594, 0.5036202 ,
       0.5562569 , 0.3011833 , 0.7227413 , 0.4616875 , 0.3634031 ,
       0.35782716, 0.6736457 , 0.6272397 , 0.6421854 , 0.6845587 ,
       0.6200225 , 0.53924775, 0.6347062 , 0.27120253, 0.7114265 ,
       0.5717736 , 0.5890778 , 0.59483397, 0.60945904, 0.6121304 ],
      dtype=float32)

It seems to work fine, but you may be wondering whether the calculation result is correct. So, let's implement a sigmoid function in Python to verify and compare the results.

In [5]:
def sigmoid(x):
    """Return the logistic sigmoid ``1 / (1 + exp(-x))`` of ``x``.

    Works element-wise on NumPy arrays as well as on plain scalars, because
    the exponential is evaluated with ``np.exp``.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

python_result = sigmoid(input_array).astype("float32")

Validate Results with cosine_similarity

Let's compare swift_result and python_result by cosine similarity to validate the calculation.

In [6]:
# Sanity check: comparing a vector with itself returns a matrix whose elements are all (approximately) 1
cosine_similarity([python_result, python_result])
Out[6]:
array([[1.0000002, 1.0000002],
       [1.0000002, 1.0000002]], dtype=float32)
In [7]:
cosine_similarity([swift_result, python_result])
Out[7]:
array([[1.0000002, 1.0000002],
       [1.0000002, 1.0000002]], dtype=float32)

Wow! Almost the same!

Benchmarking

Test Machine Spec: Apple MacBook Pro (16-inch, 2019)

  • Processor: 2.4 GHz 8-Core Intel Core i9
  • RAM: 64 GB 2667 MHz DDR
  • GPU: AMD Radeon Pro 5500M(4GB)

Even if we can access the Metal API from Python, there is no point unless the processing performance improves. So let's run a heavy calculation in both pure Python and Metal to compare their performance. This time, we evaluate the Fourier series (shown below) for 100,000 sample values created with np.arange(-50, 50, 0.001), and then compute its first-order numerical derivative.

fourier series

$$ f(x) = \sin(x) + \frac{1}{3}\sin(3x) + \frac{1}{5}\sin(5x) + \frac{1}{7}\sin(7x) + \cdots + \frac{1}{9999}\sin(9999x) $$
In [8]:
def f(x):
    """Evaluate the partial Fourier sum of odd harmonics at ``x``.

    Computes ``sin(x) + sin(3x)/3 + sin(5x)/5 + ... + sin(9999x)/9999``.
    ``x`` may be a scalar or a NumPy array; arrays are processed
    element-wise by ``np.sin``.
    """
    total = 0
    # Odd harmonics only: 1, 3, 5, ..., 9999.
    for harmonic in range(1, 10000, 2):
        weight = 1/harmonic
        total += weight*np.sin(harmonic*x)

    return total

numerical differential

$$ \frac{df}{dx} = \frac{f(x + \Delta) - f(x - \Delta)}{2 \Delta} $$
In [9]:
def differential(f, x, delta=1e-4):
    """Approximate the derivative of ``f`` at ``x`` by a central difference.

    Implements ``(f(x + delta) - f(x - delta)) / (2 * delta)``.

    Args:
        f: Callable to differentiate; it is called with ``x + delta`` and
            ``x - delta``.
        x: Point (scalar or NumPy array) at which to differentiate.
        delta: Half-width of the difference stencil. Defaults to ``1e-4``.

    Returns:
        The central-difference estimate of ``df/dx`` at ``x``.
    """
    # The denominator must be parenthesised: ``/ 2*delta`` would divide by 2
    # and then *multiply* by delta because of operator precedence.
    return (f(x + delta) - f(x - delta)) / (2 * delta)

Benchmarking of Python + Numpy

In [10]:
input_array = np.arange(-50, 50, 0.001).astype("float32") # have to be float32 for GPU
input_array.shape
Out[10]:
(100000,)

Optimized computation with numpy.apply_along_axis instead of a for loop

In [11]:
%%time
python_result = np.apply_along_axis(lambda x: differential(f, x), 0, input_array)
CPU times: user 3.51 s, sys: 3.18 ms, total: 3.52 s
Wall time: 3.52 s

Benchmarking of Python + Swift and Metal

Fourier series and numerical differentiation in Metal shaders

// Partial Fourier sum of the odd harmonics 1, 3, ..., 9999 evaluated at x:
// sin(x) + sin(3x)/3 + sin(5x)/5 + ... + sin(9999x)/9999.
inline float f(const float x) {
    float sum = 0;
    int harmonic = 1;
    while (harmonic < 10000) {
        sum += (1.0f/harmonic)*sin(harmonic*x);
        harmonic += 2;
    }
    return sum;
}

// Half-width of the central-difference stencil.
constant float delta = 1e-4;

// Writes the central-difference estimate of df/dx at inVector[id] to
// outVector[id]; each thread handles the element at its grid position.
kernel void differential(const device float *inVector [[ buffer(0) ]],
                    device float *outVector [[ buffer(1) ]],
                    uint id [[ thread_position_in_grid ]]) {

    float x = inVector[id];
    // Divide by the full step 2*delta. Without the parentheses the
    // expression divides by 2 and then multiplies by delta.
    outVector[id] = (f(x+delta) - f(x-delta)) / (2.0f*delta);
}

Loads dylib and declares swift function

In [12]:
# Open the compiled Swift bridge library.
swift_fun = ctypes.CDLL("./PyMetalBridge/.build/release/libPyMetalBridge.dylib")

# Declare the parameter types of the exported function so that ctypes
# converts the arguments correctly: two float pointers followed by an int.
swift_fun.swift_differential_on_gpu.argtypes = (
    [ctypes.POINTER(ctypes.c_float)] * 2 + [ctypes.c_int]
)

def swift_differential_on_gpu(input_array):
    """Call the native ``swift_differential_on_gpu`` entry point on a 1-D array.

    Args:
        input_array: 1-D array-like of numbers. It is converted to a
            C-contiguous float32 array before being handed to the native
            code, so float64 data, strided views and plain lists are safe.

    Returns:
        A new 1-D float32 ``numpy.ndarray`` of ``len(input_array)`` values
        filled in by the native function.
    """
    float_ptr = ctypes.POINTER(ctypes.c_float)

    # The native side reads the buffer as densely packed 32-bit floats.
    # Passing the raw pointer of a float64 or non-contiguous array would be
    # silently misinterpreted, so normalise the layout first (this is a
    # no-op when the input is already contiguous float32).
    data = np.ascontiguousarray(input_array, dtype=np.float32)
    count = len(data)

    # Let the native code write straight into a NumPy buffer instead of a
    # temporary ctypes array that would have to be copied afterwards.
    result = np.empty(count, dtype=np.float32)
    if count == 0:
        # Nothing to compute; skip the native call entirely.
        return result

    swift_fun.swift_differential_on_gpu(
        data.ctypes.data_as(float_ptr),
        result.ctypes.data_as(float_ptr),
        count,
    )
    return result

First Time(Cold)

In [13]:
%%time
swift_result = swift_differential_on_gpu(input_array)
CPU times: user 982 µs, sys: 1.31 ms, total: 2.29 ms
Wall time: 10.3 ms

Second Time(Hot)

In [14]:
%%time
swift_result = swift_differential_on_gpu(input_array)
CPU times: user 1.11 ms, sys: 1.37 ms, total: 2.47 ms
Wall time: 8.1 ms

As a result, Python + Numpy took 3.52 s, while Python + Metal took 8.1 ms on the second run. So Metal is incredibly fast!

Validate Results

In [15]:
swift_result.shape, python_result.shape
Out[15]:
((100000,), (100000,))
In [16]:
cosine_similarity([python_result, python_result])
Out[16]:
array([[1.0000523, 1.0000523],
       [1.0000523, 1.0000523]], dtype=float32)
In [17]:
cosine_similarity([swift_result, python_result.astype("float32")])
Out[17]:
array([[1.0000539, 1.0000181],
       [1.0000181, 1.0000523]], dtype=float32)

Wow! Almost the same.

Performance comparison between Numpy and Metal in 10 iterations

In [18]:
def measure(f, iterations=10, size=100000):
    """Time repeated calls of ``f`` on freshly generated random input.

    Args:
        f: Callable taking one float32 NumPy array; its return value is
            ignored.
        iterations: Number of timed calls. Defaults to 10.
        size: Number of samples in each generated input array. Defaults
            to 100000.

    Returns:
        A list with one elapsed time in seconds per call, in call order.
    """
    elapsed_logs = []
    for _ in range(iterations):
        # Generate the input outside the timed region so that only the call
        # to ``f`` is measured.
        input_array = np.random.uniform(-50, 50, size).astype("float32")
        # perf_counter is monotonic and high-resolution; time.time() can
        # jump when the system clock is adjusted.
        start = time.perf_counter()
        f(input_array)
        elapsed_logs.append(time.perf_counter() - start)

    return elapsed_logs
In [19]:
# np.apply_along_axis
python_results = measure(lambda input_array: np.apply_along_axis(lambda x: differential(f, x), 0, input_array))
In [20]:
print("Python time elapsed (second)")
python_results
Python time elapsed (second)
Out[20]:
[7.2743401527404785,
 7.240334987640381,
 7.485050678253174,
 7.156182765960693,
 7.047566175460815,
 7.03191614151001,
 7.006180763244629,
 6.995512008666992,
 7.056458950042725,
 7.020463228225708]
In [21]:
# Metal
swift_results = measure(swift_differential_on_gpu)
In [22]:
print("Metal time elapsed (second)")
swift_results
Metal time elapsed (second)
Out[22]:
[0.9524149894714355,
 0.008167028427124023,
 0.008337736129760742,
 0.008405685424804688,
 0.008406639099121094,
 0.008293867111206055,
 0.007641792297363281,
 0.008014678955078125,
 0.008147001266479492,
 0.008392810821533203]
In [23]:
%matplotlib inline

# Plot the per-iteration wall-clock times of both implementations.
fig, ax = plt.subplots(figsize=(10, 4))

# Take the x positions from the data so the plot stays correct if the
# number of measured iterations changes.
ax.plot(range(len(swift_results)), swift_results, '-o', label='Python+Metal')
ax.plot(range(len(python_results)), python_results, '-o', label='Python + Numpy')
ax.set_title('Numpy and Metal performance comparison')
ax.set_xlabel('iteration')
ax.set_ylabel('time elapsed (second)')
ax.grid(True)
ax.legend()
Out[23]:
<matplotlib.legend.Legend at 0x13b96ba60>