10.1
Main Application
#define NX 2048
extern void scal_cpu_func(void *buffers[], void *_args);
extern void scal_sse_func(void *buffers[], void *_args);
extern void scal_cuda_func(void *buffers[], void *_args);
extern void scal_opencl_func(void *buffers[], void *_args);
{
.cpu_funcs = { scal_cpu_func, scal_sse_func },
.cpu_funcs_name = { "scal_cpu_func", "scal_sse_func" },
#ifdef STARPU_USE_CUDA
.cuda_funcs = { scal_cuda_func },
#endif
#ifdef STARPU_USE_OPENCL
.opencl_funcs = { scal_opencl_func },
#endif
.nbuffers = 1,
};
#ifdef STARPU_USE_OPENCL
#endif
int main(int argc, char **argv)
{
float vector[NX];
unsigned i;
for (i = 0; i < NX; i++)
vector[i] = 1.0f;
fprintf(stderr, "BEFORE: First element was %f\n", vector[0]);
#ifdef STARPU_USE_OPENCL
#endif
float factor = 3.14;
#ifdef STARPU_USE_OPENCL
#endif
fprintf(stderr, "AFTER First element is %f\n", vector[0]);
return 0;
}
unsigned synchronous
Definition starpu_task.h:1125
void * cl_arg
Definition starpu_task.h:856
uint32_t where
Definition starpu_task.h:353
struct starpu_codelet * cl
Definition starpu_task.h:717
size_t cl_arg_size
Definition starpu_task.h:873
starpu_data_handle_t handles[STARPU_NMAXBUFS]
Definition starpu_task.h:798
#define STARPU_CUDA
Definition starpu_task.h:65
struct starpu_task * starpu_task_create(void) STARPU_ATTRIBUTE_MALLOC
#define STARPU_MAIN_RAM
Definition starpu_task.h:144
int starpu_task_submit(struct starpu_task *task)
#define STARPU_OPENCL
Definition starpu_task.h:79
#define STARPU_CPU
Definition starpu_task.h:58
Definition starpu_task.h:338
Definition starpu_task.h:688
void starpu_vector_data_register(starpu_data_handle_t *handle, int home_node, uintptr_t ptr, uint32_t nx, size_t elemsize)
void starpu_data_unregister(starpu_data_handle_t handle)
struct _starpu_data_state * starpu_data_handle_t
Definition starpu_data.h:45
@ STARPU_RW
Definition starpu_data.h:60
void starpu_shutdown(void)
int starpu_init(struct starpu_conf *conf)
cl_program programs[STARPU_MAXOPENCLDEVS]
Definition starpu_opencl.h:48
int starpu_opencl_load_opencl_from_file(const char *source_file_name, struct starpu_opencl_program *opencl_programs, const char *build_options)
int starpu_opencl_unload_opencl(struct starpu_opencl_program *opencl_programs)
Definition starpu_opencl.h:46
10.2
CPU Kernel
#include <starpu.h>
#include <xmmintrin.h>
/*
 * Plain C implementation of the scaling kernel.
 *
 * buffers: data interfaces handed to the kernel; entry 0 is the vector
 *          interface of the single buffer the kernel works on.
 * cl_arg:  pointer to the float scaling factor (read-only).
 *
 * Every element of the vector is multiplied in place by the factor.
 */
void scal_cpu_func(void *buffers[], void *cl_arg)
{
	struct starpu_vector_interface *vector = (struct starpu_vector_interface *) buffers[0];

	/* Number of elements in the vector. */
	unsigned n = STARPU_VECTOR_GET_NX(vector);

	/* Pointer to the copy of the vector this worker is allowed to touch. */
	float *val = (float *) STARPU_VECTOR_GET_PTR(vector);

	const float *factor = (const float *) cl_arg;
	unsigned i;

	for (i = 0; i < n; i++)
		val[i] *= *factor;
}
/*
 * SSE implementation of the scaling kernel.
 *
 * buffers: data interfaces handed to the kernel; entry 0 is the vector.
 * cl_arg:  pointer to the float scaling factor (read-only).
 *
 * Elements are processed four at a time with SSE; the up-to-three elements
 * left over when the length is not a multiple of four are scaled one by one.
 */
void scal_sse_func(void *buffers[], void *cl_arg)
{
	float *vector = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);
	unsigned int n = STARPU_VECTOR_GET_NX(buffers[0]);
	unsigned int n_iterations = n/4;

	float factor = *(float *) cl_arg;
	/* The factor replicated into all four lanes. */
	__m128 FACTOR = _mm_set1_ps(factor);

	unsigned int i;
	for (i = 0; i < n_iterations; i++)
	{
		/* Unaligned load/store: nothing here guarantees that the vector
		 * starts on a 16-byte boundary (e.g. a stack array registered by
		 * the application), so it must not be accessed as __m128 objects. */
		float *chunk = vector + 4 * i;
		_mm_storeu_ps(chunk, _mm_mul_ps(FACTOR, _mm_loadu_ps(chunk)));
	}

	/* Scalar tail for the n % 4 remaining elements. */
	for (i = 4 * n_iterations; i < n; ++i)
		vector[i] = factor * vector[i];
}
#define STARPU_VECTOR_GET_NX(interface)
Definition starpu_data_interfaces.h:2101
#define STARPU_VECTOR_GET_PTR(interface)
Definition starpu_data_interfaces.h:2085
Definition starpu_data_interfaces.h:1982
10.3
CUDA Kernel
/*
 * Device kernel: multiplies val[0..n) in place by factor.
 *
 * Expects a 1D launch where each thread handles at most one element; threads
 * whose global index falls beyond n do nothing, so the grid may be larger
 * than the vector.
 */
static __global__ void vector_mult_cuda(unsigned n, float *val, float factor)
{
	const unsigned idx = blockIdx.x*blockDim.x + threadIdx.x;

	/* Surplus threads of the last block have no element to work on. */
	if (idx >= n)
		return;

	val[idx] = val[idx] * factor;
}
/*
 * CUDA implementation of the codelet: launches vector_mult_cuda on the
 * calling worker's stream and waits for it to finish.
 *
 * buffers: data interfaces handed to the kernel; entry 0 is the vector, whose
 *          pointer is passed to the device kernel.
 * _args:   host pointer to the float scaling factor (read-only).
 *
 * Any CUDA error is reported through STARPU_CUDA_REPORT_ERROR.
 */
extern "C" void scal_cuda_func(void *buffers[], void *_args)
{
	float *factor = (float *)_args;

	/* Number of elements in the vector. */
	unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
	/* Pointer to this worker's copy of the vector. */
	float *val = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);

	/* An empty grid is an invalid launch configuration; nothing to scale. */
	if (n == 0)
		return;

	unsigned threads_per_block = 64;
	/* Ceiling division written so that it cannot wrap for very large n. */
	unsigned nblocks = n / threads_per_block + (n % threads_per_block != 0);

	cudaStream_t stream = starpu_cuda_get_local_stream();

	/* The factor is dereferenced on the host and passed by value. */
	vector_mult_cuda<<<nblocks, threads_per_block, 0, stream>>>(n, val, *factor);

	/* Launch-configuration errors only show up through cudaGetLastError(). */
	cudaError_t status = cudaGetLastError();
	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);

	/* The launch is asynchronous: wait for completion before returning, which
	 * also surfaces errors raised while the kernel was executing. */
	status = cudaStreamSynchronize(stream);
	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
}
cudaStream_t starpu_cuda_get_local_stream(void)
#define STARPU_CUDA_REPORT_ERROR(status)
Definition starpu_cuda.h:60
10.4
OpenCL Kernel
10.4.1
Invoking the Kernel
void scal_opencl_func(void *buffers[], void *_args)
{
float *factor = _args;
int id, devid, err;
cl_kernel kernel;
cl_command_queue queue;
cl_event event;
{
"vector_mult_opencl",
devid);
err = clSetKernelArg(kernel, 0, sizeof(n), &n);
err |= clSetKernelArg(kernel, 1, sizeof(val), &val);
err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor);
}
{
size_t global=n;
size_t local;
size_t s;
cl_device_id device;
err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s);
if (local > global) local=global;
else global = (global + local-1) / local * local;
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event);
}
{
clFinish(queue);
clReleaseEvent(event);
}
}
#define STARPU_VECTOR_GET_DEV_HANDLE(interface)
Definition starpu_data_interfaces.h:2091
int starpu_opencl_load_kernel(cl_kernel *kernel, cl_command_queue *queue, struct starpu_opencl_program *opencl_programs, const char *kernel_name, int devid)
#define STARPU_OPENCL_REPORT_ERROR(status)
Definition starpu_opencl.h:276
int starpu_opencl_release_kernel(cl_kernel kernel)
void starpu_opencl_get_device(int devid, cl_device_id *device)
int starpu_opencl_collect_stats(cl_event event)
int starpu_worker_get_devid(int id)
int starpu_worker_get_id(void)
10.4.2
Source of the Kernel
/*
 * OpenCL kernel: multiplies val[0..nx) in place by factor.
 *
 * One work-item handles one element; work-items whose global id falls beyond
 * nx do nothing, so the global size may be larger than the vector.
 */
__kernel void vector_mult_opencl(int nx, __global float* val, float factor)
{
	const int gid = get_global_id(0);

	/* Surplus work-items have no element to work on. */
	if (gid >= nx)
		return;

	val[gid] = val[gid] * factor;
}