# -*- coding: utf-8 -*-

import pyopencl
from pyopencl import mem_flags
import numpy
import time

# Multiply two square int32 matrices on an OpenCL device, time the kernel, and
# verify the device result against numpy.dot computed on the host.
size = 1024
# Entries lie in [0, 256), so every dot product is at most 1024 * 255 * 255
# and fits in a signed 32-bit integer without overflow.
a = numpy.random.randint(0, 256, (size,size)).astype(numpy.int32)
b = numpy.random.randint(0, 256, (size,size)).astype(numpy.int32)
dest = numpy.empty_like(a)

context = pyopencl.create_some_context(interactive=False)
queue = pyopencl.CommandQueue(context)
# Input buffers are initialised from the host arrays; the output buffer is
# only sized here and is filled by the kernel.
a_buf = pyopencl.Buffer(context, mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR, hostbuf=a)
b_buf = pyopencl.Buffer(context, mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR, hostbuf=b)
dest_buf = pyopencl.Buffer(context, mem_flags.WRITE_ONLY, dest.nbytes)

# One work-item computes one element of the product: i is the column index
# (global id 0) and j is the row index (global id 1).
program = pyopencl.Program(context, '''
__kernel void matrix_mul(
    __global const int* a,
    __global const int* b,
    __global int* dest,
    const int n
)
{
    const int i = get_global_id(0);
    const int j = get_global_id(1);

    // Accumulate in a private variable and store once: dest is allocated
    // write-only on the host side, so the kernel must never read it back.
    int sum = 0;
    for(int k = 0; k < n; k++){
        sum += a[j * n + k] * b[k * n + i];
    }
    dest[j * n + i] = sum;
}
''').build()

n = numpy.int32(size) # scalar kernel arguments must be passed as numpy types
start = time.time()
e = program.matrix_mul(queue, a.shape, None, a_buf, b_buf, dest_buf, n)
e.wait()  # block until the kernel finishes so the timing covers its execution
stop = time.time()

# Copy the result from the device buffer back into the host array.
pyopencl.enqueue_copy(queue, dest, dest_buf)

# Single-argument print(...) behaves identically on Python 2 and Python 3.
print(numpy.all(numpy.dot(a, b) == dest))
print(stop - start)

