Short examplesΒΆ
First, a kernel that implements a 3x3 median filter on a 2-d array:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 | inline float median9(float *a, int n) {
float l,h;
bool lt;
% for i in range(9//2+1):
% for j in range(i+1, 9):
lt = a[${j}] < a[${i}];
l = lt ? a[${j}] : a[${i}];
h = lt ? a[${i}] : a[${j}];
a[${i}] = l;
a[${j}] = h;
%endfor
%endfor
return a[n/2];
}
__kernel
void median3x3(int awidth, int height, __global float* a, __global float* ret )
{
float aa[9];
size_t i;
float m;
size_t left = height + 1;
size_t right = awidth - left;
int h = height;
i = get_global_id(0);
m = a[i];
if ((i >= left) && (i < right)) {
aa[0] = a[(i-h)-1];
aa[1] = a[i-h];
aa[2] = a[(i-h)+1];
aa[3] = a[i-1];
aa[4] = a[i];
aa[5] = a[i+1];
aa[6] = a[(i+h)-1];
aa[7] = a[i+h];
aa[8] = a[(i+h)+1];
m = median9(&aa[0], 9);
}
ret[i] = m;
return;
}
|
and the interface specification that lets us call it:
1 2 3 4 5 6 | interface median3x3 {
kernel median3x3(sizeof int a, heightof int a, in float* a, outlike a );
alias first as median3x3(sizeof int a, heightof int a, in float* a, in float* ret );
alias step as median3x3(sizeof int a, heightof int a, resident float* a, resident float* ret );
alias last as median3x3(sizeof int a, heightof int a, resident float* a, out float* ret );
};
|
The primary kernel entry point has only one visible argument to Python (the input array a). The other values are introspected at run time.
Note the first/step/last aliases to the kernel function which hint buffers to be downloaded (first), used locally (step), and then returned (last).
Now, the wrapper code for loading that in an even prettier way:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 | '''
Provides 2d median filter for 2d arrays
'''
# Copyright Sean D. True & SwapWizard dba (c) 2011-2021.
import yapocis.rpc.programs
from yapocis.utils.typing import *
# Provision this program with just one median filter, fixed length of 9
first = last = median3x3_cl = step = None
progran, kernels = yapocis.rpc.programs.import_program("median3x3", import_kernels="*", median3x3="median3x3_cl", width=9, steps=[9])
def median3x3slow(image:Array, iterations:int=1) -> Array:
"""
3x3 median filter reading and writing the data on each pass
Args:
image:
iterations:
Returns: A copy of the array.
"""
while iterations > 0:
image = median3x3_cl(image)
iterations -= 1
return image.copy()
def median3x3fast(image:Array, iterations:int=1) -> Array:
"""
3x3 median filter iterating use ping/pong buffers on the device.
Args:
image:
iterations:
Returns: A copy of the array.
"""
if iterations == 1:
# One pass through
return median3x3_cl(image)
input = image
output = np.zeros_like(input)
if iterations == 2:
# Send in data, don't retrieve
first(input, output)
input,output = output,input
# Don't send in, retrieve data
last(input, output)
return output
# Send in data, no retrieve
first(input, output)
input,output = output,input
# Allow for first and last calls
iterations -= 2
while iterations > 1:
# Iterate with resident data
step(input, output)
input,output = output,input
iterations -= 1
# And retrieve the data
input,output = output,input
last(input, output)
return output.copy()
median3x3=median3x3fast
|
and a demo that compares running the median filter 20 times on all available engines on each of an increasing series of image sizes.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 | # Copyright Sean D. True & SwapWizard dba (c) 2011-2021.
import sys
import time
import numpy as np
try:
from scipy.ndimage.filters import median_filter
from matplotlib import pyplot as plot
except ImportError:
print("Please install scipy and matplotlib to compare median filter performance")
sys.exit(1)
from yapocis.lib import median
from yapocis.utils.config import get_available
def median3x3_scipy(input, iterations=1):
for i in range(iterations):
input = median_filter(input, 3)
return input
ITERATIONS = 20
SAMPLES = 20
SMALLEST=100
LARGEST=4000
# Get just the official device name
engines = list(get_available(aliases=False).keys())
# Range of images widths spaced logarithmically
widths = np.logspace(np.log2(SMALLEST), np.log2(LARGEST), num=SAMPLES, base=2).astype(int)
images = []
for width in widths:
images.append(np.float32(np.random.random_sample((width, width))))
# Get the baseline scipy performance
scipy_deltas = []
print("scipy base line")
for width, img in zip(widths, images):
t = time.time()
output1 = median3x3_scipy(img, ITERATIONS)
scipy_delta = time.time() - t
print(width, scipy_delta)
scipy_deltas.append(scipy_delta)
# Slow (fetch result for each iteration)
# and fast (resident ping pong buffers)
by_engine = {}
for engine in engines:
print(engine)
areas = []
fast_deltas = []
slow_deltas = []
errors = []
for scipy_delta, img, width in zip(scipy_deltas, images, widths):
t = time.time()
output2 = median.median3x3slow(img, ITERATIONS)
slow_delta = time.time() - t
t = time.time()
output3 = median.median3x3fast(img, ITERATIONS)
fast_delta = time.time() - t
print(width, scipy_delta, slow_delta, fast_delta)
areas.append(width * width)
slow_deltas.append(slow_delta)
fast_deltas.append(fast_delta)
print()
by_engine[engine] = (areas, slow_deltas, fast_deltas)
# Graphic is a set of verticallu stacked plots, with a common X axis
fig, axs = plot.subplots(len(engines), 1, sharex=True)
fig.suptitle(f"Yapocis Performance Comparison:\nmedian 3x3 filter, {ITERATIONS} iterations", fontsize="large")
for iengine, engine in enumerate(engines):
# Fetch the stats, and compute the performance ratios
areas, slow_deltas, fast_deltas = by_engine[engine]
ax1 = axs[iengine]
areas = np.array(areas) / 1000.
scipy_deltas = np.array(scipy_deltas)
slow_deltas = np.array(slow_deltas)
fast_deltas = np.array(fast_deltas)
ratio = 1.0 / (slow_deltas / scipy_deltas)
ratio2 = 1.0 / (fast_deltas / scipy_deltas)
# Lines show performance improvement over scipy
# Filled areas show total seconds per engine
ax1.set_title(engine)
ax1.set_xlabel("Pixels(k)", fontsize="medium")
# ax1.plot(areas, scipy_deltas, color="darkgreen", label="numpy")
ax1.plot(areas, slow_deltas, color="magenta", label="slow")
ax1.plot(areas, fast_deltas, color="orange", label="fast")
# ax1.fill_between(areas, scipy_deltas, slow_deltas, color="lightgreen")
ax1.fill_between(areas, fast_deltas, slow_deltas, color="pink")
ax1.fill_between(areas, np.zeros_like(slow_deltas), fast_deltas, color="yellow")
ax1.set_ylabel("Seconds", fontsize="medium")
ax2 = ax1.twinx()
ax2.plot(areas, ratio, color="red", label="slow*")
ax2.plot(areas, ratio2, color="black", label="fast*")
ax2.set_ylabel("Speed-up")
if iengine == 0:
ax1.legend(bbox_to_anchor=(1.2, 1.3), loc='upper left')
ax2.legend(bbox_to_anchor=(1.2, .5), loc='upper left')
plot.tight_layout()
plot.savefig("docs/images/median3x3-performance.png")
plot.show()
|
A lot of the code in this example is included to create the following picture, which shows on my machine that the OpenCL median 3x3 is about 8 times faster than Numpy over a fairly wide range of array sizes, and that the same kernel using the resident buffer hinting show above runs about 44 times faster than Numpy.