#include <stdio.h>
#include <stdlib.h>

#include <helper_functions.h>  // helper for shared functions common to CUDA SDK samples

#include <cuda_runtime.h>
#include <cublas_v2.h>

#include "timer.h"             // assumed helper providing StartTimer()/GetTimer() used below

#define TILE_WIDTH 32
#define SIZE 20

/********************** kernel **************************/
__global__ void matmul(float *A, float *B, float *C,
                       int nb_ColA, int nb_ColB, int nb_LigneA, int nb_LigneB)
{
    float tmp_c = 0.0f;
    int col   = blockIdx.x * blockDim.x + threadIdx.x;
    int ligne = blockIdx.y * blockDim.y + threadIdx.y;

    /* Dot product of row `ligne` of A with column `col` of B;
       the inner dimension is nb_ColA (== nb_LigneB). */
    for (int k = 0; k < nb_ColA; k++) {
        tmp_c += A[ligne * nb_ColA + k] * B[k * nb_ColB + col];
    }
    C[ligne * nb_ColB + col] = tmp_c;
}

/********************** main **************************/
int main(void)
{
    float *A, *B, *C, *gpu_A, *gpu_B, *gpu_C;
    int nbLigneA, nbLigneB, nbColA, nbColB;

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    nbLigneA = TILE_WIDTH * SIZE;
    nbLigneB = TILE_WIDTH * SIZE;
    nbColA   = TILE_WIDTH * SIZE;
    nbColB   = TILE_WIDTH * SIZE;

    A = (float*) malloc(nbLigneA * nbColA * sizeof(float));
    B = (float*) malloc(nbLigneB * nbColB * sizeof(float));
    C = (float*) malloc(nbLigneA * nbColB * sizeof(float));

    /* Allocate device buffers gpu_A, gpu_B and gpu_C that will receive A, B and C on the GPU */
    cudaMalloc(&gpu_A, nbLigneA * nbColA * sizeof(float));
    cudaMalloc(&gpu_B, nbLigneB * nbColB * sizeof(float));
    cudaMalloc(&gpu_C, nbLigneA * nbColB * sizeof(float));

    /* Initialize A and B */
    for (int i = 0; i < nbLigneA * nbColA; i++) {
        A[i] = 1.0f;
    }
    for (int i = 0; i < nbLigneB * nbColB; i++) {
        B[i] = 2.0f;
    }

    /* Copy A and B to the GPU, into gpu_A and gpu_B respectively */
    cudaMemcpy(gpu_A, A, nbLigneA * nbColA * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(gpu_B, B, nbLigneB * nbColB * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(gpu_C, C, nbLigneA * nbColB * sizeof(float), cudaMemcpyHostToDevice);

    /* Call the saxpy kernel on the N elements */
    //saxpy<<<(N+255)/256, 256>>>(N, 2.0f, gpu_x, gpu_y);
    // dim3 grid(N,1);

    dim3 dimGrid((nbColB - 1) / TILE_WIDTH + 1, (nbLigneA - 1) / TILE_WIDTH + 1, 1);
    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);

    cudaEventRecord(start);

    // execute the kernel
    int nIter = 1;

    // CUBLAS version 2.0 API interface
    {
        cublasHandle_t handle;
        cublasCreate(&handle);

        const float alpha = 1.0f;
        const float beta  = 0.0f;

        // Perform warmup operation with cublas
        cublasStatus_t ret = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                                         nbColB, nbLigneA, nbColA,
                                         &alpha, gpu_B, nbColB, gpu_A, nbColA,
                                         &beta, gpu_C, nbColB);
        // checkError(ret, "cublas Sgemm returned an error!\n");

        // Start Timing (CUBLAS)
        StartTimer();

        for (int j = 0; j < nIter; j++)
        {
            // Note: cublas is column-major while the host arrays are row-major,
            // so the operand order is swapped: computing C^T = B^T * A^T in
            // column-major storage yields the row-major product C = A * B.
            cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                        nbColB, nbLigneA, nbColA,
                        &alpha, gpu_B, nbColB, gpu_A, nbColA,
                        &beta, gpu_C, nbColB);
        }

        double dSeconds = GetTimer() / ((double)nIter * 1000.0);
        double dNumOps  = 2.0 * (double)nbColA * (double)nbLigneA * (double)nbColB;
        double gflops   = 1.0e-9 * dNumOps / dSeconds;

        printf("done.\n");

        // Log throughput, etc.
        printf("CUBLAS= %.4f GFlop/s, Time= %.2f(ms), Size = %.0f Ops\n",
               gflops, dSeconds * 1000.0, dNumOps);

        // copy result from device to host
        cudaMemcpy(C, gpu_C, nbLigneA * nbColB * sizeof(float), cudaMemcpyDeviceToHost);

        cublasDestroy(handle);
    }

    cudaFree(gpu_A);
    cudaFree(gpu_B);
    cudaFree(gpu_C);
    free(A);
    free(B);
    free(C);

    return 0;
}
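
/*
 * Note: the matmul kernel defined above is never actually launched in main();
 * only the cuBLAS path runs. With the dimGrid/dimBlock configuration already
 * computed there, a minimal launch (a sketch, assuming the same device pointers
 * and the square dimensions used above) could look like:
 *
 *     matmul<<<dimGrid, dimBlock>>>(gpu_A, gpu_B, gpu_C,
 *                                   nbColA, nbColB, nbLigneA, nbLigneB);
 *     cudaDeviceSynchronize();   // wait for the kernel before copying C back
 *
 * Since A is filled with 1.0f and B with 2.0f, every element of C should then
 * equal nbColA * 2.0f, which gives a quick host-side check for either path.
 */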