Simple OpenCL program compiles and runs but output is incorrect


I wrote a simple OpenCL program based on the SDK. It compiles and runs, but the output is wrong. Is there something I'm doing wrong?

Any suggestions for learning to debug C and OpenCL are much appreciated. I'm quite new to the platform.

Code is below.

The output in array c is all zeros.



// Declarations for the OpenCL array-addition test program.
// The guard name avoids the leading underscore + capital form, which is
// reserved for the implementation.
#ifndef TEST_OPENCL_H
#define TEST_OPENCL_H

// Program entry point.
int main( int argc, const char** argv);
// Runs the OpenCL test; returns an int status that main checks against 0.
int runTest( int argc, const char** argv);

#endif // TEST_OPENCL_H


// simple test of adding a[i] to b[i] to get c[i]
//
// Each work-item handles the single element at its global ID in dimension 0.
// There is no bounds check, so the global work size must not exceed the
// number of float elements in a, b and c.
__kernel void add_array(__global float *a, __global float *b, __global float *c)
{
    int xid = get_global_id(0);
    c[xid] = a[xid] + b[xid];
}


// standard utility and system includes
#include <oclUtils.h>
#include "test_opencl.h"

// OpenCL error catcher: file-scope status code. runTest stores the result of
// its OpenCL calls here (directly or through an &err out-parameter) and then
// passes it to shrCheckError.
cl_int err = 0;

// Main Program
// *********************************************************************
int main( int argc, const char** argv) 
    // set logfile name and start logs
    shrSetLogFileName ("test_opencl.txt");
    shrLog(LOGBOTH, 0, "%s Starting...\n\n", argv[0]); 

    // run the main test
    int result = runTest(argc, argv);
    shrCheckError(result, 0);

    // finish
    shrEXIT(argc, argv);

//! Run a simple test for OPENCL
// *********************************************************************
int runTest( int argc, const char** argv) 
    cl_context gpu_context;
    cl_command_queue cmd_queue;
    cl_program program;
    cl_kernel test_kernel;

    const size_t szGlobalWorkSize = 10;
    const size_t szLocalWorkSize = 10;

    // size of memory required to store the array
    const unsigned int mem_size = sizeof(int) * 10;

    // create the OpenCL context on a GPU device
    gpu_context = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &err);
    shrCheckError(err, CL_SUCCESS);

    // get devices
    cl_device_id device;
    if( shrCheckCmdLineFlag(argc, argv, "device") ) {
      int device_nr = 0;
      shrGetCmdLineArgumenti(argc, argv, "device", &device_nr);
      device = oclGetDev(gpu_context, device_nr);
    } else {
      device = oclGetMaxFlopsDev(gpu_context);

    // create a command-queue
    cmd_queue = clCreateCommandQueue(gpu_context, device, 0, &err);
    shrCheckError(err, CL_SUCCESS);

    // allocate and initalize host memory
    int a[10], b[10], c[10];
    for (int i = 0; i < 10; i++) {
        a[i] = i;
        b[i] = i * i;

    // create buffers on device
    cl_mem vol_a = clCreateBuffer(gpu_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size, a, &err);
    shrCheckError(err, CL_SUCCESS);

    cl_mem vol_b = clCreateBuffer(gpu_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size, b, &err);
    shrCheckError(err, CL_SUCCESS);

    cl_mem vol_c = clCreateBuffer(gpu_context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, mem_size, c, &err);
    shrCheckError(err, CL_SUCCESS);

    // copy data from host to device
    err = clEnqueueWriteBuffer(cmd_queue, vol_a, CL_TRUE, 0, mem_size, a, 0, NULL, NULL);
    err |= clEnqueueWriteBuffer(cmd_queue, vol_b, CL_TRUE, 0, mem_size, b, 0, NULL, NULL);
    shrCheckError(err, CL_SUCCESS);

    // Program Setup
    size_t program_length;
    char* source_path = shrFindFilePath("", argv[0]);
    shrCheckError(source_path != NULL, shrTRUE);
    char *source = oclLoadProgSource(source_path, "", &program_length);
    shrCheckError(source != NULL, shrTRUE);

    // create the program
    program = clCreateProgramWithSource(gpu_context, 1, (const char **)&source, &program_length, &err);
    shrCheckError(err, CL_SUCCESS);

    // build the program
    err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (err != CL_SUCCESS)
        // write out standard error, Build Log and PTX, then return error
        shrLog(LOGBOTH | ERRORMSG, err, STDERROR);


    shrLog(LOGBOTH, 0, "%s Starting kernel operation...\n\n", argv[0]); 

    // create the test kernel
    test_kernel = clCreateKernel(program, "add_array", &err);
    shrCheckError(err, CL_SUCCESS);

    // set the args values for the kernel
    err  = clSetKernelArg(test_kernel, 0, sizeof(cl_mem), (void *) &vol_a);
    err |= clSetKernelArg(test_kernel, 1, sizeof(cl_mem), (void *) &vol_b);
    err |= clSetKernelArg(test_kernel, 2, sizeof(cl_mem), (void *) &vol_c);
    shrCheckError(err, CL_SUCCESS);

    err = clEnqueueNDRangeKernel(cmd_queue, test_kernel, 1, NULL, &szGlobalWorkSize, NULL, 0, NULL, NULL);
    shrCheckError(err, CL_SUCCESS);


    // copy result from device to host
    err = clEnqueueReadBuffer(cmd_queue, vol_c, CL_TRUE, 0, mem_size, c, 0, NULL, NULL);
    shrCheckError(err, CL_SUCCESS);

    int d[10];
    err = clEnqueueReadBuffer(cmd_queue, vol_a, CL_TRUE, 0, mem_size, d, 0, NULL, NULL);
    shrCheckError(err, CL_SUCCESS);


    shrLog(LOGBOTH, 0, "%s Finished kernel operation...\n\n", argv[0]); 

    bool passed = true;

    for (int i = 0; i < 10; i++) {
        if (c[i] != i + i * i)
            passed = false;
            shrLog(LOGBOTH, 0, "c = %d    d = %d\n", c[i], d[i]); 

    if (passed)
        shrLog(LOGBOTH, 0, "%s Test Passed\n\n", argv[0]); 
        shrLog(LOGBOTH, 0, "%s Test Failed\n\n", argv[0]); 

    // cleanup OpenCL


    return 0;

Best Solution

The problems in the code and the solution can be found here.

Related Question