Wednesday, 20 November 2019

Convert PyCuda code to PyOpenCL code : How to perform a high number of 3x3 matrix inversion?

I try to convert a PyCuda code to PyOpenCL. This is the following of a previous version of working NVIDIA GPU code. This code aims to invert a high number of 3x3 matrixes.

Here's the PyCuda working version :

$ cat t14.py
import numpy as np
# import matplotlib.pyplot as plt 
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import pycuda.autoinit
# kernel
kernel = SourceModule("""

__device__ unsigned getoff(unsigned &off){
  unsigned ret = off & 0x0F;
  off >>= 4;
  return ret;
}   

// in-place is acceptable i.e. out == in) 
// T = float or double only
const int block_size = 288;
typedef double T; // *** can set to float or double
__global__ void inv3x3(const T * __restrict__ in, T * __restrict__ out, const size_t n, const unsigned * __restrict__ pat){

  __shared__ T si[block_size];
  size_t idx = threadIdx.x+blockDim.x*blockIdx.x;
  T det = 1;
  if (idx < n*9)
    det = in[idx];
  unsigned sibase = (threadIdx.x / 9)*9;
  unsigned lane = threadIdx.x - sibase; // cheaper modulo
  si[threadIdx.x] = det;
  __syncthreads();
  unsigned off = pat[lane];
  T a  = si[sibase + getoff(off)];
  a   *= si[sibase + getoff(off)];
  T b  = si[sibase + getoff(off)];
  b   *= si[sibase + getoff(off)];
  a -= b;
  __syncthreads();
  if (lane == 0) si[sibase+3] = a;
  if (lane == 3) si[sibase+4] = a;
  if (lane == 6) si[sibase+5] = a;
  __syncthreads();
  det =  si[sibase]*si[sibase+3]+si[sibase+1]*si[sibase+4]+si[sibase+2]*si[sibase+5];
  if (idx < n*9)
    out[idx] = a / det;
}   

""")
# host code
def gpuinv3x3(inp, n):
    # internal constants not to be modified
    hpat = (0x07584, 0x08172, 0x04251, 0x08365, 0x06280, 0x05032, 0x06473, 0x07061, 0x03140)
    # Convert parameters into numpy array
    # *** change next line between float32 and float64 to match float or double
    inpd = np.array(inp, dtype=np.float64)
    hpatd = np.array(hpat, dtype=np.uint32)
    # *** change next line between float32 and float64 to match float or double
    output = np.empty((n*9), dtype= np.float64)
    # Get kernel function
    matinv3x3 = kernel.get_function("inv3x3")
    # Define block, grid and compute
    blockDim = (288,1,1) # do not change
    gridDim = ((n/32)+1,1,1)
    # Kernel function
    matinv3x3 (
        cuda.In(inpd), cuda.Out(output), np.uint64(n), cuda.In(hpatd),
        block=blockDim, grid=gridDim)
    return output
inp = (1.0, 1.0, 1.0, 0.0, 0.0, 3.0, 1.0, 2.0, 2.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)
n = 2
result = gpuinv3x3(inp, n)
print(result.reshape(2,3,3))
$ python t14.py

And here below my attempt to get the equivalent PyOpenCL version :

$ cat gpu_opencl.py
from __future__ import absolute_import, print_function
import numpy as np
import pyopencl as cl

# Initialization
n = 2
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
a_in = np.array(n*9).astype(np.float64)
res_np = np.zeros(n*9).astype(np.float64)
mf = cl.mem_flags
a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_in)
res_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=res_np)
# kernel
prg = cl.Program(ctx, """

unsigned getoff(unsigned &off){
  unsigned ret = off & 0x0F;
  off >>= 4;
  return ret;
}   

// OpenCL
// T = float or double only
typedef double T; 

const int block_size = 288;

__kernel 
void 
inv3x3(
__global const T* restrict in,
__global T* restrict out,
const size_t n,
const unsigned* restrict pat
)
{
  unsigned int tid = get_local_id(0);
  unsigned int gid = get_global_id(0);
  unsigned int groupid = get_group_id(0);
  unsigned int localSize = get_local_size(0);

  __local T si[block_size];
  size_t idx = tid + groupid*localSize;
  T det = 1;
  if (idx < n*9)
    det = in[idx];
  unsigned sibase = (tid / 9)*9;
  unsigned lane = tid - sibase; // cheaper modulo
  si[tid] = det;
  // Synchronize to make sure data is available for processing
  barrier(CLK_LOCAL_MEM_FENCE);
  unsigned off = pat[lane];
  T a  = si[sibase + getoff(off)];
  a   *= si[sibase + getoff(off)];
  T b  = si[sibase + getoff(off)];
  b   *= si[sibase + getoff(off)];
  a -= b;
  barrier(CLK_LOCAL_MEM_FENCE);
  if (lane == 0) si[sibase+3] = a;
  if (lane == 3) si[sibase+4] = a;
  if (lane == 6) si[sibase+5] = a;
  barrier(CLK_LOCAL_MEM_FENCE);
  det =  si[sibase]*si[sibase+3]+si[sibase+1]*si[sibase+4]+si[sibase+2]*si[sibase+5];
  if (idx < n*9)
    out[idx] = a / det;
}   
""").build()

def gpuinv3x3(inp, n):
    # internal constants not to be modified
    hpat = (0x07584, 0x08172, 0x04251, 0x08365, 0x06280, 0x05032, 0x06473, 0x07061, 0x03140)
    hpatd = np.array(hpat, dtype=np.uint32)
    # Convert parameters into numpy array
    # *** change next line between float32 and float64 to match float or double
    inpd = np.array(inp, dtype=np.float64)
    hpatd = np.array(hpat, dtype=np.uint32)
    # *** change next line between float32 and float64 to match float or double
    output = np.empty((n*9), dtype= np.float64)
    # Get kernel function
    res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_in.nbytes)
    prg.inv3x3(queue, a_g, res_g, np.uint64(n), htpatd)
    res_np = np.empty_like(a_in)
    cl.enqueue_copy(queue, res_np, res_g)
    return res_np

inp = (1.0, 1.0, 1.0, 0.0, 0.0, 3.0, 1.0, 2.0, 2.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)
n = 2
result = gpuinv3x3(inp, n)
print(result.reshape(2,3,3))

$ python gpu_opencl.py

But when I execute this latter with PyOpenCL, I get the following errors :

Traceback (most recent call last):
  File "gpu_opencl_inv_3x3.py", line 69, in <module>
    """).build()
  File "/Users/fab/Library/Python/2.7/lib/python/site-packages/pyopencl/__init__.py", line 510, in build
    options_bytes=options_bytes, source=self._source)
  File "/Users/fab/Library/Python/2.7/lib/python/site-packages/pyopencl/__init__.py", line 554, in _build_and_catch_errors
    raise err
pyopencl._cl.RuntimeError: clBuildProgram failed: BUILD_PROGRAM_FAILURE - clBuildProgram failed: BUILD_PROGRAM_FAILURE - clBuildProgram failed: BUILD_PROGRAM_FAILURE

Build on <pyopencl.Device 'AMD Radeon Pro Vega 20 Compute Engine' on 'Apple' at 0x1021d00>:

<program source>:5:26: error: expected ')'
unsigned getoff(unsigned &off){
                         ^
<program source>:5:16: note: to match this '('
unsigned getoff(unsigned &off){
               ^
<program source>:5:10: warning: no previous prototype for function 'getoff'
unsigned getoff(unsigned &off){
         ^
<program source>:5:26: error: parameter name omitted
unsigned getoff(unsigned &off){
                         ^
<program source>:6:18: error: use of undeclared identifier 'off'
  unsigned ret = off & 0x0F;
                 ^
<program source>:7:3: error: use of undeclared identifier 'off'
  off >>= 4;
  ^
<program source>:15:11: error: global variables must have a constant address space qualifier
const int block_size = 288;
          ^
<program source>:23:26: error: kernel pointer arguments must have a global, local, or constant address space qualifier
const unsigned* restrict pat
                         ^

I don't know how to modify exactly the equivalent for PyOpenCL. Someone could see by chance an error in my PyOpenCL code. I would really appreciaate if I could reveive some clues/tracks to make this PyOpenCL code working like with PyCuda version :

EDIT 1: I have made corrections (on keywords and syntax) but some errors of compilation remain. The main problem seems to be about the getoff(unsigned &off) function which may not be necessary with OpenCL ?



from Convert PyCuda code to PyOpenCL code : How to perform a high number of 3x3 matrix inversion?

No comments:

Post a Comment