%%writefile sgemm.cu
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include "cublas_v2.h"
#include "cuda_stuff.cuh"

#define THREADS_PER_BLOCK 1024
#define TILE_WIDTH 32
#define SIZE 20

/** Error checking,
 *  taken from https://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api
 */
#define gpuErrCheck(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

void matmult_basic_host(float *A, float *B, float *C,
                        int nb_ColA, int nb_ColB, int nb_LineA, int nb_LineB)
{
    for (int i = 0; i < nb_LineA; i++) {
        for (int j = 0; j < nb_ColB; j++) {
            C[i * nb_ColB + j] = 0.;
            for (int k = 0; k < nb_ColA; k++) {
                C[i * nb_ColB + j] += A[i * nb_ColA + k] * B[k * nb_ColB + j];
            }
        }
    }
}

/********************** Kernels **************************/

__global__ void matmult_basic_kernel(float *A, float *B, float *C,
                                     int nb_ColA, int nb_ColB, int nb_LineA, int nb_LineB)
{
    /* TODO (one possible implementation is sketched after this file) */
}

__global__ void matmult_tiled_kernel(float *A, float *B, float *C,
                                     int nb_ColA, int nb_ColB, int nb_LineA, int nb_LineB)
{
    /* TODO (one possible implementation is sketched after this file) */
}

/********************** main **************************/

int main(void)
{
    float *A, *B, *C, *gpu_A, *gpu_B, *gpu_C;
    int nb_LineA, nb_LineB, nb_ColA, nb_ColB;
    clock_t start, end;
    float cpu_time_used;

    nb_LineA = TILE_WIDTH * SIZE;
    nb_LineB = TILE_WIDTH * SIZE;
    nb_ColA  = TILE_WIDTH * SIZE;
    nb_ColB  = TILE_WIDTH * SIZE;

    A = (float*) malloc(nb_LineA * nb_ColA * sizeof(float));
    B = (float*) malloc(nb_LineB * nb_ColB * sizeof(float));
    C = (float*) malloc(nb_LineA * nb_ColB * sizeof(float));

    /* Initialization of A and B */
    for (int i = 0; i < nb_LineA * nb_ColA; i++) { A[i] = 1.0; }
    for (int i = 0; i < nb_LineB * nb_ColB; i++) { B[i] = 2.0; }

    /* Allocation in GPU memory */
    /* TODO (see the sketch after this file) */

    /* Copies of matrices A and B to the GPU */
    /* TODO (see the sketch after this file) */

    gpuErrCheck(cudaMemset(gpu_C, 0, nb_LineA * nb_ColB * sizeof(float)));

    /******* Matrix multiplication on the host ************************/
    start = clock();
    matmult_basic_host(A, B, C, nb_ColA, nb_ColB, nb_LineA, nb_LineB);
    end = clock();
    cpu_time_used = ((double) (end - start)) * 1000 / CLOCKS_PER_SEC;
    printf("Time taken by CPU in milliseconds: %.2f\n", cpu_time_used);

    { /* Result correctness */
        float maxError = 0.0f;
        for (int i = 0; i < nb_LineA * nb_ColB; i++) {
            maxError = fmaxf(maxError, fabsf(C[i] - 2 * nb_LineB));
        }
        printf("Max error: %f\n", maxError);
    }

    /* Grid and block distributions */
    /* TODO (see the sketch after this file) */

    /******* Matrix multiplication on the GPU ************************/
    start = clock();
    matmult_basic_kernel <<< dimGrid, dimBlock >>> (gpu_A, gpu_B, gpu_C,
                                                    nb_ColA, nb_ColB, nb_LineA, nb_LineB);
    gpuErrCheck(cudaDeviceSynchronize());
    end = clock();
    cpu_time_used = ((double) (end - start)) * 1000 / CLOCKS_PER_SEC;
    printf("Time taken on GPU in global memory in milliseconds: %.2f\n", cpu_time_used);

    gpuErrCheck(cudaMemcpy(C, gpu_C, nb_LineA * nb_ColB * sizeof(float), cudaMemcpyDeviceToHost));

    { /* Result correctness */
        float maxError = 0.0f;
        for (int i = 0; i < nb_LineA * nb_ColB; i++) {
            maxError = fmaxf(maxError, fabsf(C[i] - 2 * nb_LineB));
        }
        printf("Max error: %f\n", maxError);
    }

    /******* Matrix multiplication on the GPU in shared memory ************/
    gpuErrCheck(cudaMemset(gpu_C, 0, nb_LineA * nb_ColB * sizeof(float)));

    start = clock();
    matmult_tiled_kernel <<< dimGrid, dimBlock >>> (gpu_A, gpu_B, gpu_C,
                                                    nb_ColA, nb_ColB, nb_LineA, nb_LineB);
    gpuErrCheck(cudaDeviceSynchronize());
    end = clock();
    cpu_time_used = ((double) (end - start)) * 1000 / CLOCKS_PER_SEC;
    printf("Time taken on GPU in shared memory in milliseconds: %.2f\n", cpu_time_used);

    gpuErrCheck(cudaMemcpy(C, gpu_C, nb_LineA * nb_ColB * sizeof(float), cudaMemcpyDeviceToHost));

    { /* Result correctness */
        float maxError = 0.0f;
        for (int i = 0; i < nb_LineA * nb_ColB; i++) {
            maxError = fmaxf(maxError, fabsf(C[i] - 2 * nb_LineB));
        }
        printf("Max error: %f\n", maxError);
    }

    /********* cuBLAS ******************************/
    cublasHandle_t handle;
    cublasCreate(&handle);
    float alpha = 1.;
    float beta  = 0.;

    /* Warm-up calls so the timed call does not include library start-up costs */
    for (int i = 0; i < 3; i++)
        cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, nb_LineA, nb_ColB, nb_ColA,
                    &alpha, gpu_A, nb_LineA, gpu_B, nb_LineB, &beta, gpu_C, nb_LineA);

    gpuErrCheck(cudaMemset(gpu_C, 0, nb_LineA * nb_ColB * sizeof(float)));

    start = clock();
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, nb_LineA, nb_ColB, nb_ColA,
                &alpha, gpu_A, nb_LineA, gpu_B, nb_LineB, &beta, gpu_C, nb_LineA);
    gpuErrCheck(cudaDeviceSynchronize());
    end = clock();
    cpu_time_used = ((double) (end - start)) * 1000 / CLOCKS_PER_SEC;
    printf("Time taken with cuBLAS in milliseconds: %.2f\n", cpu_time_used);

    gpuErrCheck(cudaMemcpy(C, gpu_C, nb_LineA * nb_ColB * sizeof(float), cudaMemcpyDeviceToHost));

    { /* Result correctness */
        float maxError = 0.0f;
        for (int i = 0; i < nb_LineA * nb_ColB; i++) {
            maxError = fmaxf(maxError, fabsf(C[i] - 2 * nb_LineB));
        }
        printf("Max error: %f\n", maxError);
    }

    cublasDestroy(handle);

    free(A); free(B); free(C);
    gpuErrCheck(cudaFree(gpu_A));
    gpuErrCheck(cudaFree(gpu_B));
    gpuErrCheck(cudaFree(gpu_C));
}
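The TODOs in the file are left for the exercise. For reference, one possible way to fill in the host-side TODOs (device allocation, host-to-device copies, and the grid/block decomposition) is sketched below. This is only a sketch under the assumptions already made by the file: row-major storage as in matmult_basic_host, square TILE_WIDTH x TILE_WIDTH thread blocks, and one thread per element of C.

    /* Allocation in GPU memory */
    gpuErrCheck(cudaMalloc(&gpu_A, nb_LineA * nb_ColA * sizeof(float)));
    gpuErrCheck(cudaMalloc(&gpu_B, nb_LineB * nb_ColB * sizeof(float)));
    gpuErrCheck(cudaMalloc(&gpu_C, nb_LineA * nb_ColB * sizeof(float)));

    /* Copies of matrices A and B to the GPU */
    gpuErrCheck(cudaMemcpy(gpu_A, A, nb_LineA * nb_ColA * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrCheck(cudaMemcpy(gpu_B, B, nb_LineB * nb_ColB * sizeof(float), cudaMemcpyHostToDevice));

    /* Grid and block distributions: TILE_WIDTH x TILE_WIDTH threads per block,
       enough blocks to cover the nb_LineA x nb_ColB output matrix C.
       The ceiling division also handles dimensions that are not multiples of TILE_WIDTH. */
    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH);
    dim3 dimGrid((nb_ColB  + TILE_WIDTH - 1) / TILE_WIDTH,
                 (nb_LineA + TILE_WIDTH - 1) / TILE_WIDTH);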
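For the basic kernel, a straightforward global-memory version that mirrors matmult_basic_host is one option: each thread computes a single element of C, reading its row of A and column of B directly from global memory. The bounds check keeps it correct even when the grid is slightly larger than C.

    /* Basic kernel: one thread per element of C, all accesses in global memory. */
    __global__ void matmult_basic_kernel(float *A, float *B, float *C,
                                         int nb_ColA, int nb_ColB,
                                         int nb_LineA, int nb_LineB)
    {
        int row = blockIdx.y * blockDim.y + threadIdx.y;   /* line index in C   */
        int col = blockIdx.x * blockDim.x + threadIdx.x;   /* column index in C */

        if (row < nb_LineA && col < nb_ColB) {
            float sum = 0.0f;
            for (int k = 0; k < nb_ColA; k++) {
                sum += A[row * nb_ColA + k] * B[k * nb_ColB + col];
            }
            C[row * nb_ColB + col] = sum;
        }
    }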
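The shared-memory version below is one possible tiling scheme, not the only one. Each block computes one TILE_WIDTH x TILE_WIDTH tile of C: it cooperatively loads the matching tiles of A and B into shared memory, accumulates their partial product, then moves on to the next pair of tiles. It assumes, as in this file, that the matrix dimensions are multiples of TILE_WIDTH; otherwise the loads and the final store would need bounds checks.

    /* Tiled kernel: tiles of A and B are staged in shared memory before use. */
    __global__ void matmult_tiled_kernel(float *A, float *B, float *C,
                                         int nb_ColA, int nb_ColB,
                                         int nb_LineA, int nb_LineB)
    {
        __shared__ float tileA[TILE_WIDTH][TILE_WIDTH];
        __shared__ float tileB[TILE_WIDTH][TILE_WIDTH];

        int row = blockIdx.y * TILE_WIDTH + threadIdx.y;
        int col = blockIdx.x * TILE_WIDTH + threadIdx.x;

        float sum = 0.0f;
        for (int t = 0; t < nb_ColA / TILE_WIDTH; t++) {
            /* Each thread loads one element of the current tile of A and of B. */
            tileA[threadIdx.y][threadIdx.x] = A[row * nb_ColA + t * TILE_WIDTH + threadIdx.x];
            tileB[threadIdx.y][threadIdx.x] = B[(t * TILE_WIDTH + threadIdx.y) * nb_ColB + col];
            __syncthreads();   /* the whole tile must be loaded before it is used */

            for (int k = 0; k < TILE_WIDTH; k++) {
                sum += tileA[threadIdx.y][k] * tileB[k][threadIdx.x];
            }
            __syncthreads();   /* the tile must not be overwritten while still in use */
        }
        C[row * nb_ColB + col] = sum;
    }

Compared with the basic kernel, each element of A and B is read from global memory once per tile instead of once per thread of the block, i.e. roughly TILE_WIDTH times fewer global reads, which is where the speed-up of the shared-memory version typically comes from.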