#include #include #include #include static inline uint64_t read_tscp(void) { uint64_t rax, rcx, rdx; asm volatile("rdtscp" : "=a" (rax), "=d" (rdx), "=c" (rcx)); return rax | (rdx << 32); } /* 1024 float can remain in the L1 cache even if we have other data */ #define L1_FIT 1024 #define FLOAT_NBBITS (sizeof(float)*8) #define VEC_NBBITS 128 #define VEC_NBBYTES (VEC_NBBITS/8) #define VEC_SIZE (VEC_NBBYTES/sizeof(float)) #define AS_FLOAT_N (8*1024*1024) #define AS_VSF_N (AS_FLOAT_N/VEC_SIZE) #define POW 16 typedef float vsf_t __attribute__((vector_size VEC_NBBYTES)); /* vector of single floats */ typedef union vector { vsf_t as_vsf[AS_VSF_N]; float as_float[AS_FLOAT_N]; } vector_t; void arrayPow_v0(vector_t* tab) { for(uint64_t i=0; ias_float[j] *= tab->as_float[j]; } } } void arrayPow_v1(vector_t* tab) { for(uint64_t k=0; kas_float[j] *= tab->as_float[j]; } } } } void arrayPow_v2(vector_t* tab) { for(uint64_t k=0; kas_vsf[j] *= tab->as_vsf[j]; } } } } void test(char* msg, void (*arrayPow)(vector_t*), vector_t* tab) { uint64_t start = read_tscp(); /* start of experiment */ arrayPow(tab); uint64_t end = read_tscp(); /* end of experiment */ printf("[%20s] Elasped time: %ld millions cycles\n", msg, (end - start)/1000000); } int main(int argc, char** argv) { union vector* tab = malloc(sizeof(union vector)); srand(time(0)); for(uint64_t i=0; ias_float[i] = (float)(1 + rand() % 10); } printf("Compute the power of %u for each element of an array of %u MiB elements\n", POW, AS_FLOAT_N/(1024*1024)); test("Direct", arrayPow_v0, tab); test("Tilling", arrayPow_v1, tab); test("Vectorization", arrayPow_v2, tab); return 0; }