%%writefile sgemm.cu
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include "cublas_v2.h"
#include "cuda_stuff.cuh"

#define THREADS_PER_BLOCK 1024
#define TILE_WIDTH 32
#define SIZE 20

/** Error checking,
 *  taken from https://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
 */
#define gpuErrCheck(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

void matmult_basic_host(float *A, float *B, float *C,
                        int nb_ColA, int nb_ColB, int nb_LineA, int nb_LineB)
{
    for (int i = 0; i < nb_LineA; i++) {
        for (int j = 0; j < nb_ColB; j++) {
            C[i * nb_ColB + j] = 0.;
            for (int k = 0; k < nb_ColA; k++) {
                C[i * nb_ColB + j] += A[i * nb_ColA + k] * B[k * nb_ColB + j];
            }
        }
    }
}

/********************** Kernels **************************/

__global__ void matmult_basic_kernel(float *A, float *B, float *C,
                                     int nb_ColA, int nb_ColB, int nb_LineA, int nb_LineB)
{
    /* TODO (one possible implementation is sketched after this file) */
}

__global__ void matmult_tiled_kernel(float *A, float *B, float *C,
                                     int nb_ColA, int nb_ColB, int nb_LineA, int nb_LineB)
{
    /* TODO (one possible implementation is sketched after this file) */
}

/********************** main **************************/

int main(void)
{
    float *A, *B, *C, *gpu_A, *gpu_B, *gpu_C;
    int nb_LineA, nb_LineB, nb_ColA, nb_ColB;
    clock_t start, end;
    float cpu_time_used;

    nb_LineA = TILE_WIDTH * SIZE;
    nb_LineB = TILE_WIDTH * SIZE;
    nb_ColA  = TILE_WIDTH * SIZE;
    nb_ColB  = TILE_WIDTH * SIZE;

    A = (float*) malloc(nb_LineA * nb_ColA * sizeof(float));
    B = (float*) malloc(nb_LineB * nb_ColB * sizeof(float));
    C = (float*) malloc(nb_LineA * nb_ColB * sizeof(float));

    /* Initialization of A and B */
    for (int i = 0; i < nb_LineA * nb_ColA; i++) { A[i] = 1.0; }
    for (int i = 0; i < nb_LineB * nb_ColB; i++) { B[i] = 2.0; }

    /* Allocation in GPU memory */
    /* TODO (see the sketch after this file) */

    /* Copies of matrices A and B to the GPU */
    /* TODO (see the sketch after this file) */

    gpuErrCheck(cudaMemset(gpu_C, 0, nb_LineA * nb_ColB * sizeof(float)));

    /******* Matrix multiplication on the host ************************/
    start = clock();
    matmult_basic_host(A, B, C, nb_ColA, nb_ColB, nb_LineA, nb_LineB);
    end = clock();
    cpu_time_used = ((double) (end - start)) * 1000 / CLOCKS_PER_SEC;
    printf("Time taken by CPU in milliseconds: %.2f\n", cpu_time_used);

    { /* Result correctness */
        float maxError = 0.0f;
        for (int i = 0; i < nb_LineA * nb_ColB; i++) {
            maxError = fmaxf(maxError, fabsf(C[i] - 2 * nb_LineB));
        }
        printf("Max error: %f\n", maxError);
    }

    /* Grid and block distributions */
    /* TODO (see the sketch after this file) */

    /******* Matrix multiplication on the GPU ************************/
    start = clock();
    matmult_basic_kernel <<< dimGrid, dimBlock >>> (gpu_A, gpu_B, gpu_C,
                                                    nb_ColA, nb_ColB, nb_LineA, nb_LineB);
    gpuErrCheck(cudaDeviceSynchronize());
    end = clock();
    cpu_time_used = ((double) (end - start)) * 1000 / CLOCKS_PER_SEC;
    printf("Time taken on GPU in global memory in milliseconds: %.2f\n", cpu_time_used);

    gpuErrCheck(cudaMemcpy(C, gpu_C, nb_LineA * nb_ColB * sizeof(float), cudaMemcpyDeviceToHost));

    { /* Result correctness */
        float maxError = 0.0f;
        for (int i = 0; i < nb_LineA * nb_ColB; i++) {
            maxError = fmaxf(maxError, fabsf(C[i] - 2 * nb_LineB));
        }
        printf("Max error: %f\n", maxError);
    }

    /******* Matrix multiplication on the GPU in shared memory ************/
    gpuErrCheck(cudaMemset(gpu_C, 0, nb_LineA * nb_ColB * sizeof(float)));

    start = clock();
    matmult_tiled_kernel <<< dimGrid, dimBlock >>> (gpu_A, gpu_B, gpu_C,
                                                    nb_ColA, nb_ColB, nb_LineA, nb_LineB);
    gpuErrCheck(cudaDeviceSynchronize());
    end = clock();
    cpu_time_used = ((double) (end - start)) * 1000 / CLOCKS_PER_SEC;
    printf("Time taken on GPU in shared memory in milliseconds: %.2f\n", cpu_time_used);

    gpuErrCheck(cudaMemcpy(C, gpu_C, nb_LineA * nb_ColB * sizeof(float), cudaMemcpyDeviceToHost));

    { /* Result correctness */
        float maxError = 0.0f;
        for (int i = 0; i < nb_LineA * nb_ColB; i++) {
            maxError = fmaxf(maxError, fabsf(C[i] - 2 * nb_LineB));
        }
        printf("Max error: %f\n", maxError);
    }

    /********* cuBLAS ******************************/
    cublasHandle_t handle;
    cublasCreate(&handle);
    float alpha = 1.;
    float beta  = 0.;

    /* Warm-up calls so the timed call does not include library start-up costs */
    for (int i = 0; i < 3; i++)
        cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, nb_LineA, nb_ColB, nb_ColA,
                    &alpha, gpu_A, nb_LineA, gpu_B, nb_LineB, &beta, gpu_C, nb_LineA);

    gpuErrCheck(cudaMemset(gpu_C, 0, nb_LineA * nb_ColB * sizeof(float)));

    start = clock();
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, nb_LineA, nb_ColB, nb_ColA,
                &alpha, gpu_A, nb_LineA, gpu_B, nb_LineB, &beta, gpu_C, nb_LineA);
    gpuErrCheck(cudaDeviceSynchronize());
    end = clock();
    cpu_time_used = ((double) (end - start)) * 1000 / CLOCKS_PER_SEC;
    printf("Time taken with cuBLAS in milliseconds: %.2f\n", cpu_time_used);

    gpuErrCheck(cudaMemcpy(C, gpu_C, nb_LineA * nb_ColB * sizeof(float), cudaMemcpyDeviceToHost));

    { /* Result correctness */
        float maxError = 0.0f;
        for (int i = 0; i < nb_LineA * nb_ColB; i++) {
            maxError = fmaxf(maxError, fabsf(C[i] - 2 * nb_LineB));
        }
        printf("Max error: %f\n", maxError);
    }

    cublasDestroy(handle);

    free(A); free(B); free(C);
    gpuErrCheck(cudaFree(gpu_A));
    gpuErrCheck(cudaFree(gpu_B));
    gpuErrCheck(cudaFree(gpu_C));
}
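The TODOs in the file are left for the exercise. For reference, one possible way to fill in the host-side TODOs (device allocation, host-to-device copies, and the grid/block decomposition) is sketched below. This is only a sketch under the assumptions already made by the file: row-major storage as in matmult_basic_host, square TILE_WIDTH x TILE_WIDTH thread blocks, and one thread per element of C.

    /* Allocation in GPU memory */
    gpuErrCheck(cudaMalloc(&gpu_A, nb_LineA * nb_ColA * sizeof(float)));
    gpuErrCheck(cudaMalloc(&gpu_B, nb_LineB * nb_ColB * sizeof(float)));
    gpuErrCheck(cudaMalloc(&gpu_C, nb_LineA * nb_ColB * sizeof(float)));

    /* Copies of matrices A and B to the GPU */
    gpuErrCheck(cudaMemcpy(gpu_A, A, nb_LineA * nb_ColA * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrCheck(cudaMemcpy(gpu_B, B, nb_LineB * nb_ColB * sizeof(float), cudaMemcpyHostToDevice));

    /* Grid and block distributions: TILE_WIDTH x TILE_WIDTH threads per block,
       enough blocks to cover the nb_LineA x nb_ColB output matrix C.
       The ceiling division also handles dimensions that are not multiples of TILE_WIDTH. */
    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);
    dim3 dimGrid((nb_ColB  + TILE_WIDTH - 1) / TILE_WIDTH,
                 (nb_LineA + TILE_WIDTH - 1) / TILE_WIDTH);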
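For the basic kernel, a straightforward global-memory version that mirrors matmult_basic_host is one option: each thread computes a single element of C, reading its row of A and column of B directly from global memory. The bounds check keeps it correct even when the grid is slightly larger than C.

    /* Basic kernel: one thread per element of C, all accesses in global memory. */
    __global__ void matmult_basic_kernel(float *A, float *B, float *C,
                                         int nb_ColA, int nb_ColB,
                                         int nb_LineA, int nb_LineB)
    {
        int row = blockIdx.y * blockDim.y + threadIdx.y;   /* line index in C   */
        int col = blockIdx.x * blockDim.x + threadIdx.x;   /* column index in C */

        if (row < nb_LineA && col < nb_ColB) {
            float sum = 0.0f;
            for (int k = 0; k < nb_ColA; k++) {
                sum += A[row * nb_ColA + k] * B[k * nb_ColB + col];
            }
            C[row * nb_ColB + col] = sum;
        }
    }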
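The shared-memory version below is one possible tiling scheme, not the only one. Each block computes one TILE_WIDTH x TILE_WIDTH tile of C: it cooperatively loads the matching tiles of A and B into shared memory, accumulates their partial product, then moves on to the next pair of tiles. It assumes, as in this file, that the matrix dimensions are multiples of TILE_WIDTH; otherwise the loads and the final store would need bounds checks.

    /* Tiled kernel: tiles of A and B are staged in shared memory before use. */
    __global__ void matmult_tiled_kernel(float *A, float *B, float *C,
                                         int nb_ColA, int nb_ColB,
                                         int nb_LineA, int nb_LineB)
    {
        __shared__ float tileA[TILE_WIDTH][TILE_WIDTH];
        __shared__ float tileB[TILE_WIDTH][TILE_WIDTH];

        int row = blockIdx.y * TILE_WIDTH + threadIdx.y;
        int col = blockIdx.x * TILE_WIDTH + threadIdx.x;

        float sum = 0.0f;
        for (int t = 0; t < nb_ColA / TILE_WIDTH; t++) {
            /* Each thread loads one element of the current tile of A and of B. */
            tileA[threadIdx.y][threadIdx.x] = A[row * nb_ColA + t * TILE_WIDTH + threadIdx.x];
            tileB[threadIdx.y][threadIdx.x] = B[(t * TILE_WIDTH + threadIdx.y) * nb_ColB + col];
            __syncthreads();   /* the whole tile must be loaded before it is used */

            for (int k = 0; k < TILE_WIDTH; k++) {
                sum += tileA[threadIdx.y][k] * tileB[k][threadIdx.x];
            }
            __syncthreads();   /* the tile must not be overwritten while still in use */
        }
        C[row * nb_ColB + col] = sum;
    }

Compared with the basic kernel, each element of A and B is read from global memory once per tile instead of once per thread of the block, i.e. roughly TILE_WIDTH times fewer global reads, which is where the speed-up of the shared-memory version typically comes from.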