// example codes showing instruction latency and throughput #include #include #include #define CPU_clockrate 3.3 // GHz template void test_add() { Type x[K], one = 1.0; for (long k = 0; k < K; k++) x[k] = 3.14 + k; double T = -omp_get_wtime(); for (long i = 0; i < 1000000000L; i++) for (long k = 0; k < K; k++) x[k] = one + x[k]; T += omp_get_wtime(); std::cout<<"T = "<< T <<'\n'; std::cout<<"cycles/iter = "<< CPU_clockrate*T <<'\n'; // print the result otherwise the // compiler optimize out everything Type sum = 0.; for (long k = 0; k < K; k++) sum += x[k]; std::cout<<"Result = "< void test_division() { Type x[K], one = 1.0; for (long k = 0; k < K; k++) x[k] = 3.14 + k; double T = -omp_get_wtime(); for (long i = 0; i < 1000000000L; i++) for (long k = 0; k < K; k++) x[k] = one / x[k]; T += omp_get_wtime(); std::cout<<"T = "<< T <<'\n'; std::cout<<"cycles/iter = "<< CPU_clockrate*T <<'\n'; // print the result otherwise the // compiler optimize out everything Type sum = 0.; for (long k = 0; k < K; k++) sum += x[k]; std::cout<<"Result = "<(); std::cout<<"\n\nAdding 32 doubles at a time:\n"; test_add(); std::cout<<"\n\nAdding 8 Vec at a time:\n"; test_add, 8>(); std::cout<<"\n\nDividing 8 Vec at a time:\n"; test_division, 8>(); return 0; }