#include #include #include #include /* Constants */ #define KERNEL_FILE "memory_test.cl" #define KERNEL_FUNCTION "read_linear" #define BUFFER_SIZE (16 << 20) #define NUM_READS 32 #define TRIES 10 #define LOCAL_SIZE 128 #define MEM_FLAG (CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR) /* Struct representing basic device info */ typedef struct CLDevice { cl_device_id id; char* name; cl_device_type type; } CLDevice; /* Struct representing the test execution environment for one device */ typedef struct CLEnvironment { cl_context context; cl_platform_id platform; cl_device_id device; cl_command_queue queue; cl_program program; cl_kernel* kernels; unsigned int kernels_count; } CLEnvironment; /* Function used to get the list of devices in a platform */ void clListDevices( cl_platform_id platform, unsigned int* count, CLDevice** output); /* Function used to garbage the content of a device struct */ void clFreeDevice( CLDevice* device); /* Transform the integer representing a device type into a string */ void clDeviceTypeToString( cl_device_type type, char** output); /* Utility functions */ void clLoadFile( const char* file_name, unsigned int* size, const char* * output); /* Create the environment for the execution of a set of kernels on a specific device */ void clCreateEnvironment( CLDevice* device, const char* kernel_path, const char** kernel_functions, unsigned int kernel_functions_count, const char* build_options, unsigned int enable_gpu_profiling, CLEnvironment* output); /* Function used to garbage the content of an environment struct */ void clFreeEnvironment( CLEnvironment* environment); /* Set up the GPU computation */ double runGpuComputation( CLEnvironment* env, unsigned int kernel_index, float* input_data, float* correct_output, unsigned int input_buffer_size, unsigned int output_buffer_size, unsigned int global_size, bool* success); /* Main */ int main(int argc, char* argv[]) { /* Get platform id */ cl_platform_id platform; clGetPlatformIDs(1, &platform, NULL); /* List devices */ /* Discover devices */ printf("- Tested devices listed below\n"); unsigned int device_count; CLDevice* devices; clListDevices(platform, &device_count, &devices); for(unsigned int i = 0; i < device_count; i++) { char* device_type; clDeviceTypeToString(devices[i].type, &device_type); printf(" %s[%s]\n", devices[i].name, device_type); free(device_type); } /* Create opencl environment for each device */ printf("\n- Creating opencl environment for each tested device..."); char* kernel_functions[1] = { KERNEL_FUNCTION }; char build_options[32]; memset(build_options, 0, 32); sprintf(build_options, "-D NUM_READS=%d", NUM_READS); CLEnvironment* environments = (CLEnvironment*)malloc(device_count * sizeof(CLEnvironment)); for(unsigned int device_index = 0; device_index < device_count; device_index++) { clCreateEnvironment( &devices[device_index], KERNEL_FILE, (const char**)kernel_functions, 1, build_options, true, &environments[device_index]); } /* Test all the opencl supported device */ /* * We create a thread for each element of the buffer. * Each thread reads NUM_READS sequential elements starting from the element * corresponding to its global position, sums all of them and set the * output buffer element correponding to its global posizion to the result of the sum. * The numer of threads (and of elements) for an input buffer of BUFFER_SIZE bytes is * num_elements = BUFFER_SIZE / sizeof(float). * If we want to create such a number of thread we need to allocate * (num_elements + NUM_READS) * sizeof(float) bytes, or, equivalently * BUFFER_SIZE + (NUM_READS * sizeof(float)) */ unsigned int global_size = BUFFER_SIZE / sizeof(float); unsigned int input_buffer_size = BUFFER_SIZE + (NUM_READS * sizeof(float)); unsigned int output_buffer_size = BUFFER_SIZE; /* Allocate data and compute correct output */ float* input_data = (float*)malloc(input_buffer_size); float* output_data = (float*)malloc(output_buffer_size); /* Simple initialization of the input */ for(unsigned int i = 0; i < global_size + NUM_READS; i++) input_data[i] = (float)i; /* Init output data with the correct output */ memset(output_data, 0, output_buffer_size); for(unsigned int i = 0; i < global_size; i++) { for(unsigned int j = 0; j < NUM_READS; j++) output_data[i] += input_data[i + j]; } /* Run test for every supported device */ for(unsigned int device_index = 0; device_index < device_count; device_index++) { char* type; clDeviceTypeToString(devices[device_index].type, &type); printf("\n- Testing %s [%s] (%d bytes buffer, %d reads per thread)\n", devices[device_index].name, type, BUFFER_SIZE, NUM_READS); free(type); /* Test bandwidth TRIES time and get the average */ bool success = true; double samples[TRIES]; for(unsigned int sample_index = 0; sample_index < TRIES; sample_index++) { bool sample_success; samples[sample_index] = runGpuComputation( &environments[device_index], 0, input_data, output_data, input_buffer_size, output_buffer_size, global_size, &sample_success); success &= sample_success; } double average = 0; for(unsigned int sample_index = 0; sample_index < TRIES; sample_index++) average += samples[sample_index]; printf("Estimated bandwidth: %*.*f MB/s (success = %d)\n", 8, 2, average/TRIES, success); } /* Free resource */ for(unsigned int i = 0; i < device_count; i++) { clFreeEnvironment(&environments[i]); clFreeDevice(&devices[i]); } free(environments); free(devices); free(input_data); free(output_data); printf("\n- Test ended. Press a key to exit..."); getchar(); } /* -------------------------------------------------------------------------------------------*/ /* Computation */ double runGpuComputation( CLEnvironment* env, unsigned int kernel_index, float* input_data, float* correct_output, unsigned int input_buffer_size, unsigned int output_buffer_size, unsigned int global_size, bool* success) { cl_int err; size_t local_size = LOCAL_SIZE; cl_mem input, output; /* Create buffers */ input = clCreateBuffer(env->context, MEM_FLAG, input_buffer_size, 0, &err); /* Writes are performed on device memory (we can assume writes are not a bootleneck for read bandwidth) */ output = clCreateBuffer(env->context, CL_MEM_WRITE_ONLY, output_buffer_size, 0, &err); /* Instantiate input buffer */ err |= clEnqueueWriteBuffer(env->queue, input, CL_TRUE, 0, input_buffer_size, input_data, 0, NULL, NULL); /* Set kernel args */ err |= clSetKernelArg(env->kernels[kernel_index], 0, sizeof(cl_mem), (void*)&input); err |= clSetKernelArg(env->kernels[kernel_index], 1, sizeof(cl_mem), (void*)&output); /* Run kernel */ cl_event exec_event; err |= clEnqueueNDRangeKernel(env->queue, env->kernels[kernel_index], 1, NULL, &global_size, &local_size, 0, NULL, &exec_event); clFinish(env->queue); /* Read and validate output */ float* output_data = (float*)malloc(output_buffer_size); err |= clEnqueueReadBuffer(env->queue, output, CL_TRUE, 0, output_buffer_size, output_data, 0, 0, 0); /* Summing floating point data with no decimal digits should not give approximation problems, so memcmp */ *success = (memcmp(correct_output, output_data, output_buffer_size) == 0); /* Check exec_time in seconds */ cl_ulong start, end; clWaitForEvents(1, &exec_event); clGetEventProfilingInfo(exec_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, 0); clGetEventProfilingInfo(exec_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, 0); double exec_time = (double)(end - start) / (double)1.0e9; /* Compute bandwidth */ double bytes_readed = (double)(global_size * NUM_READS) * sizeof(float); double bytes_per_second = bytes_readed / exec_time; double megabytes_per_second = bytes_per_second / (1 << 20); /* Free resources */ clReleaseMemObject(input); clReleaseMemObject(output); free(output_data); return megabytes_per_second; } /* Environment related function */ void clCreateEnvironment( CLDevice* device, const char* kernel_path, const char** kernel_functions, unsigned int kernel_functions_count, const char* build_options, unsigned int enable_gpu_profiling, CLEnvironment* output) { output->device = device->id; printf("\n %-35s", "Getting platform id..."); clGetPlatformIDs(1, &output->platform, NULL); printf("DONE!\n"); char print[256]; sprintf(print, "Searching device (%s)...", device->name); printf(" %-35s", print); clGetDeviceIDs(output->platform, device->type, 1, &device->id, NULL); printf("DONE!\n"); printf(" %-35s", "Creating context..."); output->context = clCreateContext(0, 1, &output->device, NULL, NULL, NULL); printf("DONE!\n"); printf(" %-35s", "Creating command queue..."); if(enable_gpu_profiling) output->queue = clCreateCommandQueue(output->context, output->device, CL_QUEUE_PROFILING_ENABLE, NULL); else output->queue = clCreateCommandQueue(output->context, output->device, 0, NULL); printf("DONE!\n"); printf(" %-35s", "Loading kernel file..."); const char* kernel_src; unsigned int kernel_size; clLoadFile(kernel_path, &kernel_size, &kernel_src); printf("DONE!\n"); printf(" %-35s", "Creating program with source..."); output->program = clCreateProgramWithSource(output->context, 1, &kernel_src, &kernel_size, NULL); printf("DONE!\n"); printf(" %-35s", "Building program..."); cl_int status = clBuildProgram(output->program, 0, NULL, build_options, NULL, NULL); if(status != CL_SUCCESS) { char build_log[2048]; memset(build_log, 0, 2048); clGetProgramBuildInfo(output->program, device->id, CL_PROGRAM_BUILD_LOG, 2048, build_log, NULL); printf("\n\nBUILD LOG: \n %s\n\n", build_log); } printf("DONE!\n"); output->kernels_count = kernel_functions_count; output->kernels = (cl_kernel*)malloc(kernel_functions_count * sizeof(cl_kernel)); for(unsigned int i = 0; i < kernel_functions_count; i++) { cl_int err; char title[2048]; memset(title, 0, 2048); sprintf(title, " Creating kernel %s", kernel_functions[i]); printf(" %-35s", title, i); output->kernels[i] = clCreateKernel(output->program, kernel_functions[i], &err); if(err == 0) printf("DONE!\n"); else printf("FAIL! (%d)\n", err); } free((char*)kernel_src); } void clFreeEnvironment( CLEnvironment* environment) { for(unsigned int i = 0; i < environment->kernels_count; i++) { if(environment->kernels[i]) clReleaseKernel(environment->kernels[i]); } if(environment->program) clReleaseProgram(environment->program); if(environment->queue) clReleaseCommandQueue(environment->queue); if(environment->context) clReleaseContext(environment->context); if(environment->kernels != NULL) free(environment->kernels); } /* Device related functions */ void clListDevices( cl_platform_id platform, unsigned int* count, CLDevice** output) { cl_device_id device_ids[100]; CLDevice* devices; clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 100, device_ids, count); devices = (CLDevice*)malloc(*count * sizeof(CLDevice)); for(unsigned int i = 0; i < *count; i++) { devices[i].id = device_ids[i]; devices[i].name = (char*)malloc(256 * sizeof(char)); cl_device_type type; clGetDeviceInfo(devices[i].id, CL_DEVICE_NAME, 256, devices[i].name, NULL); clGetDeviceInfo(devices[i].id, CL_DEVICE_TYPE, sizeof(type), &type, NULL); devices[i].type = type; } *output = devices; } void clFreeDevice( CLDevice* device) { free(device->name); } void clDeviceTypeToString( cl_device_type type, char** output) { char* temp = "DEFAULT"; switch(type) { case CL_DEVICE_TYPE_GPU: { temp = "GPU"; } break; case CL_DEVICE_TYPE_CPU: { temp = "CPU"; } break; case CL_DEVICE_TYPE_ACCELERATOR: { temp = "ACCELERATOR"; } break; default: { temp = "DEFAULT"; } } *output = (char*)malloc(strlen(temp) + 1); memset(*output, 0, strlen(temp) + 1); strncpy(*output, temp, strlen(temp)); } /* Utility functions */ void clLoadFile( const char* file_name, unsigned int* size, const char* * output) { // locals FILE* pFileStream = NULL; size_t szSourceLength; if(fopen_s(&pFileStream, file_name, "rb") != 0) *output = NULL; // get the length of the source code fseek(pFileStream, 0, SEEK_END); szSourceLength = ftell(pFileStream); fseek(pFileStream, 0, SEEK_SET); // allocate a buffer for the source code string and read it in char* cSourceString = (char*)malloc(szSourceLength + 1); if (fread(cSourceString, szSourceLength, 1, pFileStream) != 1) { fclose(pFileStream); free(cSourceString); } // close the file and return the total length of the combined (preamble + source) string fclose(pFileStream); cSourceString[szSourceLength] = '\0'; *size = szSourceLength; *output = cSourceString; }