Short examplesΒΆ

First, a kernel that implements a 3x3 median filter on a 2-d array:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
inline float median9(float *a, int n) {
    float l,h;
    bool lt;
    % for i in range(9//2+1):
        % for j in range(i+1, 9):
            lt = a[${j}] < a[${i}];
            l = lt ? a[${j}] : a[${i}];
            h = lt ? a[${i}] : a[${j}];
            a[${i}] = l;
            a[${j}] = h;
        %endfor
    %endfor
    return a[n/2];
}


__kernel
void median3x3(int awidth, int height, __global float* a, __global float* ret )
{
    float aa[9];
    size_t i;
    float m;
    size_t left = height + 1;
    size_t right = awidth - left;
    int h = height;
    i = get_global_id(0);
    m = a[i];
    if ((i >= left) &&  (i < right)) {
		aa[0] =  a[(i-h)-1];
		aa[1] =  a[i-h];
		aa[2] =  a[(i-h)+1];
		aa[3] =  a[i-1];
		aa[4] =  a[i];
		aa[5] =  a[i+1];
		aa[6] =  a[(i+h)-1];
		aa[7] =  a[i+h];
		aa[8] =  a[(i+h)+1];
	    m = median9(&aa[0], 9);
    }
    ret[i] = m;
    return;
}

and the interface specification that lets us call it:

1
2
3
4
5
6
interface median3x3 {
    kernel median3x3(sizeof int a, heightof int a, in float* a, outlike a );
    alias first as median3x3(sizeof int a, heightof int a, in float* a, in float* ret );
    alias step as median3x3(sizeof int a, heightof int a, resident float* a, resident float* ret );
    alias last as median3x3(sizeof int a, heightof int a, resident float* a, out float* ret );
};

The primary kernel entry point has only one visible argument to Python (the input array a). The other values are introspected at run time.

Note the first/step/last aliases to the kernel function which hint buffers to be downloaded (first), used locally (step), and then returned (last).

Now, the wrapper code for loading that in an even prettier way:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
'''
Provides 2d median filter for 2d arrays
'''
#  Copyright Sean D. True & SwapWizard dba (c) 2011-2021.

import yapocis.rpc.programs
from yapocis.utils.typing import *

# Provision this program with just one median filter, fixed length of 9
first = last = median3x3_cl = step = None
progran, kernels = yapocis.rpc.programs.import_program("median3x3", import_kernels="*", median3x3="median3x3_cl", width=9, steps=[9])

def median3x3slow(image:Array, iterations:int=1) -> Array:
    """
    3x3 median filter reading and writing the data on each pass

    Args:
        image:
        iterations:

    Returns: A copy of the array.

    """
    while iterations > 0:
        image = median3x3_cl(image)
        iterations -= 1
    return image.copy()

def median3x3fast(image:Array, iterations:int=1) -> Array:
    """
    3x3 median filter iterating use ping/pong buffers on the device.

    Args:
        image:
        iterations:

    Returns: A copy of the array.

    """
    if iterations == 1:
        # One pass through
        return median3x3_cl(image)
    input = image
    output = np.zeros_like(input)
    if iterations == 2:
        # Send in data, don't retrieve
        first(input, output)
        input,output = output,input
        # Don't send in, retrieve data
        last(input, output)
        return output
    # Send in data, no retrieve
    first(input, output)
    input,output = output,input
    # Allow for first and last calls
    iterations -= 2
    while iterations > 1:
        # Iterate with resident data
        step(input, output)
        input,output = output,input
        iterations -= 1
    # And retrieve the data
    input,output = output,input
    last(input, output)
    return output.copy()

median3x3=median3x3fast

and a demo that compares running the median filter 20 times on all available engines on each of an increasing series of image sizes.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
#  Copyright Sean D. True & SwapWizard dba (c) 2011-2021.

import sys
import time
import numpy as np


try:
    from scipy.ndimage.filters import median_filter
    from matplotlib import pyplot as plot
except ImportError:
    print("Please install scipy and matplotlib to compare median filter performance")
    sys.exit(1)

from yapocis.lib import median
from yapocis.utils.config import get_available


def median3x3_scipy(input, iterations=1):
    for i in range(iterations):
        input = median_filter(input, 3)
    return input


ITERATIONS = 20
SAMPLES = 20
SMALLEST=100
LARGEST=4000

# Get just the official device name
engines = list(get_available(aliases=False).keys())

# Range of images widths spaced logarithmically
widths = np.logspace(np.log2(SMALLEST), np.log2(LARGEST), num=SAMPLES, base=2).astype(int)
images = []
for width in widths:
    images.append(np.float32(np.random.random_sample((width, width))))

# Get the baseline scipy performance
scipy_deltas = []
print("scipy base line")
for width, img in zip(widths, images):
    t = time.time()
    output1 = median3x3_scipy(img, ITERATIONS)
    scipy_delta = time.time() - t
    print(width, scipy_delta)
    scipy_deltas.append(scipy_delta)

# Slow (fetch result for each iteration)
# and fast (resident ping pong buffers)
by_engine = {}
for engine in engines:
    print(engine)
    areas = []
    fast_deltas = []
    slow_deltas = []
    errors = []
    for scipy_delta, img, width in zip(scipy_deltas, images, widths):
        t = time.time()
        output2 = median.median3x3slow(img, ITERATIONS)
        slow_delta = time.time() - t

        t = time.time()
        output3 = median.median3x3fast(img, ITERATIONS)
        fast_delta = time.time() - t

        print(width, scipy_delta, slow_delta, fast_delta)
        areas.append(width * width)
        slow_deltas.append(slow_delta)
        fast_deltas.append(fast_delta)
    print()
    by_engine[engine] = (areas, slow_deltas, fast_deltas)

# Graphic is a set of verticallu stacked plots, with a common X axis
fig, axs = plot.subplots(len(engines), 1, sharex=True)
fig.suptitle(f"Yapocis Performance Comparison:\nmedian 3x3 filter, {ITERATIONS} iterations", fontsize="large")

for iengine, engine in enumerate(engines):
    # Fetch the stats, and compute the performance ratios
    areas, slow_deltas, fast_deltas = by_engine[engine]
    ax1 = axs[iengine]
    areas = np.array(areas) / 1000.
    scipy_deltas = np.array(scipy_deltas)
    slow_deltas = np.array(slow_deltas)
    fast_deltas = np.array(fast_deltas)
    ratio = 1.0 / (slow_deltas / scipy_deltas)
    ratio2 = 1.0 / (fast_deltas / scipy_deltas)

    # Lines show performance improvement over scipy
    # Filled areas show total seconds per engine
    ax1.set_title(engine)
    ax1.set_xlabel("Pixels(k)", fontsize="medium")
    # ax1.plot(areas, scipy_deltas, color="darkgreen", label="numpy")
    ax1.plot(areas, slow_deltas, color="magenta", label="slow")
    ax1.plot(areas, fast_deltas, color="orange", label="fast")
    # ax1.fill_between(areas, scipy_deltas, slow_deltas, color="lightgreen")
    ax1.fill_between(areas, fast_deltas, slow_deltas, color="pink")
    ax1.fill_between(areas, np.zeros_like(slow_deltas), fast_deltas, color="yellow")
    ax1.set_ylabel("Seconds", fontsize="medium")
    ax2 = ax1.twinx()
    ax2.plot(areas, ratio, color="red", label="slow*")
    ax2.plot(areas, ratio2, color="black", label="fast*")
    ax2.set_ylabel("Speed-up")
    if iengine == 0:
        ax1.legend(bbox_to_anchor=(1.2, 1.3), loc='upper left')
        ax2.legend(bbox_to_anchor=(1.2, .5), loc='upper left')
plot.tight_layout()
plot.savefig("docs/images/median3x3-performance.png")
plot.show()

A lot of the code in this example is included to create the following picture, which shows on my machine that the OpenCL median 3x3 is about 8 times faster than Numpy over a fairly wide range of array sizes, and that the same kernel using the resident buffer hinting show above runs about 44 times faster than Numpy.

_images/median3x3-performance.png