2 rokov pred · dc0fdf708d
--- a/code/.gitmodules
+++ b/code/.gitmodules
@@ -0,0 +1,3 @@
 
				+[submodule "SCTL"]
			
 
				+	path = SCTL
			
 
				+	url = git@github.com:dmalhotra/SCTL.git
			
--- a/code/Makefile
+++ b/code/Makefile
@@ -1,8 +1,6 @@
 
				 CXX=g++ # requires g++-8 or newer / icpc (with gcc compatibility 7.5 or newer) / clang++ with llvm-10 or newer
			
 
				 CXXFLAGS = -O3 -march=native -std=c++11 -fopenmp # need C++11 and OpenMP
			
 
				 
			
 
				-#CXXFLAGS += -DSCTL_HAVE_MPI #use MPI
			
 
				-
			
 
				 RM = rm -f
			
 
				 MKDIRS = mkdir -p
			
 
				 
			
@@ -12,19 +10,18 @@ OBJDIR = ./obj
 
				 INCDIR = ./SCTL/include
			
 
				 
			
 
				 TARGET_BIN = \
			
 
				-       $(BINDIR)/instruction \
			
 
				+       $(BINDIR)/instruction-cost \
			
 
				        $(BINDIR)/poly-eval \
			
 
				        $(BINDIR)/gemm-ker \
			
 
				-       $(BINDIR)/bandwidth \
			
 
				-       $(BINDIR)/gemm
			
 
				+       $(BINDIR)/gemm-blocking \
			
 
				+       $(BINDIR)/bandwidth-l1 \
			
 
				+       $(BINDIR)/bandwidth-main-memory
			
 
				 
			
 
				 all : $(TARGET_BIN)
			
 
				 
			
 
				 $(BINDIR)/%: $(OBJDIR)/%.o
			
 
				 	-@$(MKDIRS) $(dir $@)
			
 
				 	$(CXX) $^ $(CXXFLAGS) $(LDLIBS) -o $@
			
 
				-#perf stat -e L1-dcache-load-misses -e L1-dcache-loads -e l2_rqsts.miss -e l2_rqsts.references -e LLC-load-misses -e LLC-loads mpiexec -n 1 --map-by slot:pe=16 ./$@
			
 
				-
			
 
				 
			
 
				 $(OBJDIR)/%.o: $(SRCDIR)/%.cpp
			
 
				 	-@$(MKDIRS) $(dir $@)
			
--- a/code/README.md
+++ b/code/README.md
@@ -0,0 +1,15 @@
 
				+# Cloning repository
			
 
				+
			
 
				+git clone git@github.com:dmalhotra/fwam2022.git
			
 
				+
			
 
				+git submodule init
			
 
				+
			
 
				+git submodule update
			
 
				+
			
 
				+# Requirements
			
 
				+
			
 
				+g++-8 or newer
			
 
				+
			
 
				+# Compiling
			
 
				+
			
 
				+make -j
			
--- a/code/src/bandwidth-l1.cpp
+++ b/code/src/bandwidth-l1.cpp
@@ -0,0 +1,93 @@
 
				+// example code showing bandwidth of L1 cache and effect of memory alignment
			
 
				+
			
 
				+#include <iostream>
			
 
				+#include <omp.h>
			
 
				+#include <sctl.hpp>
			
 
				+using Vec = sctl::Vec<double>;
			
 
				+constexpr int VecLen = Vec::Size();
			
 
				+
			
 
				+// Time Niter passes that store a vector built from `val` over X[0..N), one Vec per step.
			
 
				+// Prints the write bandwidth in GB/s and a cycles-per-store estimate that uses a
			
 
				+// hard-coded 3.3 GHz clock. N is expected to be a multiple of VecLen: the inner loop
			
 
				+// always stores a full Vec, so otherwise the last store would extend past X+N.
			
 
				+void profile_write(double* X, long N, long Niter, double val = 3.14) {
			
 
				+  double T = -omp_get_wtime(); // becomes the elapsed time once the end time is added
			
 
				+  for (long j = 0; j < Niter; j++) {
			
 
				+    Vec v = val; // honor the caller-supplied fill value (default 3.14)
			
 
				+    #pragma GCC unroll (4)
			
 
				+    for (long i = 0; i < N; i+=VecLen) {
			
 
				+      v.Store(X+i);
			
 
				+    }
			
 
				+  }
			
 
				+  T += omp_get_wtime();
			
 
				+  std::cout<<"Bandwidth = "<< N*Niter*sizeof(double)/T/1e9 <<" GB/s";
			
 
				+  std::cout<<"    cycles/iter = "<< 3.3e9*T/(Niter*N/VecLen) <<"\n";
			
 
				+}
			
 
				+
			
 
				+void profile_read(double* X, long N, long Niter) { // time Niter passes that load X[0..N) and accumulate it; prints bandwidth (GB/s) and cycles/iter
			
 
				+  Vec sum[8]; // eight separate accumulators, one per load in the loop body below
			
 
				+  for (long i = 0; i < 8; i++) sum[i] = 0.0;
			
 
				+
			
 
				+  double T = -omp_get_wtime(); // becomes the elapsed time once the end time is added
			
 
				+  for (long j = 0; j < Niter; j++) {
			
 
				+    for (long i = 0; i < N; i+=VecLen*8) { // each step consumes 8 vectors, so N should be a multiple of 8*VecLen
			
 
				+      sum[0] = sum[0] + Vec::Load(X+VecLen*0+i);
			
 
				+      sum[1] = sum[1] + Vec::Load(X+VecLen*1+i);
			
 
				+      sum[2] = sum[2] + Vec::Load(X+VecLen*2+i);
			
 
				+      sum[3] = sum[3] + Vec::Load(X+VecLen*3+i);
			
 
				+      sum[4] = sum[4] + Vec::Load(X+VecLen*4+i);
			
 
				+      sum[5] = sum[5] + Vec::Load(X+VecLen*5+i);
			
 
				+      sum[6] = sum[6] + Vec::Load(X+VecLen*6+i);
			
 
				+      sum[7] = sum[7] + Vec::Load(X+VecLen*7+i);
			
 
				+    }
			
 
				+  }
			
 
				+  T += omp_get_wtime();
			
 
				+  std::cout<<"Bandwidth = "<< N*Niter*sizeof(double)/T/1e9 <<" GB/s";
			
 
				+  std::cout<<"    cycles/iter = "<< 3.3e9*T/(Niter*N/VecLen) <<"\n"; // 3.3e9 is a hard-coded clock rate in Hz; one "iter" is one vector load
			
 
				+
			
 
				+  for (long i = 1; i < 8; i++) sum[0] += sum[i]; // fold the accumulators into sum[0]
			
 
				+  if (sum[0][0] < 0) std::cout<<sum[0]<<'\n'; // uses the result; presumably so the compiler cannot discard the loads - TODO confirm
			
 
				+}
			
 
				+
			
 
				+void profile_vector_add(double* Y, const double* X, long N, long Niter) { // Y = X + Y, repeated Niter times and timed; prints bandwidth (GB/s) and cycles/iter
			
 
				+  double T = -omp_get_wtime(); // becomes the elapsed time once the end time is added
			
 
				+  for (long j = 0; j < Niter; j++) {
			
 
				+    for (long i = 0; i < N; i+=VecLen*2) { // two vectors per step, so N should be a multiple of 2*VecLen
			
 
				+      (Vec::Load(X+VecLen*0+i) + Vec::Load(Y+VecLen*0+i)).Store(Y+VecLen*0+i);
			
 
				+      (Vec::Load(X+VecLen*1+i) + Vec::Load(Y+VecLen*1+i)).Store(Y+VecLen*1+i);
			
 
				+    }
			
 
				+  }
			
 
				+  T += omp_get_wtime();
			
 
				+  std::cout<<"Bandwidth = "<< 3*N*Niter*sizeof(double)/T/1e9 <<" GB/s"; // factor 3: each element is loaded from X, loaded from Y and stored to Y
			
 
				+  std::cout<<"    cycles/iter = "<< 3.3e9*T/(Niter*N/VecLen) <<"\n"; // 3.3e9 is a hard-coded clock rate in Hz
			
 
				+}
			
 
				+
			
 
				+// Entry point: runs the write, read and vector-add bandwidth tests on arrays of the requested size.
			
 
				+//   argv[1] : array size in bytes (required); must be a positive multiple of 4096
			
 
				+//   argv[2] : number of passes over the array (optional); defaults to about 1e9/N, at least 1
			
 
				+// Returns 0 on success or when only the usage text is printed, 1 on invalid arguments.
			
 
				+int main(int argc, char** argv) {
			
 
				+  if (argc <= 1) {
			
 
				+    std::cout<<"Usage: ./bandwidth-l1 <size-in-bytes> <#-of-iter>\n";
			
 
				+    return 0;
			
 
				+  }
			
 
				+
			
 
				+  // Validate before dividing: atol() yields 0 for non-numeric text, and N == 0 would make
			
 
				+  // the default iteration count (1e9/N) undefined; a negative size would wrap when divided
			
 
				+  // by the unsigned sizeof().
			
 
				+  const long bytes = atol(argv[1]);
			
 
				+  if (bytes < static_cast<long>(sizeof(double))) {
			
 
				+    std::cout<<"Error: <size-in-bytes> must be a positive multiple of 4096\n";
			
 
				+    return 1;
			
 
				+  }
			
 
				+  long N = bytes / static_cast<long>(sizeof(double)); // number of doubles per array
			
 
				+  long Niter = (argc <= 2 ? std::max<long>(1,1e9/N) : atol(argv[2]));
			
 
				+  if (Niter < 1) { // zero passes would make the cycles/iter figures divide by zero
			
 
				+    std::cout<<"Error: <#-of-iter> must be at least 1\n";
			
 
				+    return 1;
			
 
				+  }
			
 
				+  std::cout<<"\nSize = "<< N*sizeof(double)<<", Iterations = "<< Niter<<"\n";
			
 
				+  SCTL_ASSERT_MSG(N % 512 == 0, "size-in-bytes must be a multiple of 4096 (N must be a multiple of 512)"); // because of vectorizing and loop unrolling
			
 
				+
			
 
				+  // Alternative malloc/free allocation kept for comparison with sctl::aligned_new
			
 
				+  // (presumably to show the effect of alignment - TODO confirm).
			
 
				+  //double* X = (double*)malloc(N*sizeof(double));
			
 
				+  //double* Y = (double*)malloc(N*sizeof(double));
			
 
				+  double* X = sctl::aligned_new<double>(N);
			
 
				+  double* Y = sctl::aligned_new<double>(N);
			
 
				+  for (long i = 0; i < N; i++) X[i] = Y[i] = i;
			
 
				+
			
 
				+  std::cout<<"\n\nWriting to array:\n";
			
 
				+  profile_write(X, N, Niter);
			
 
				+
			
 
				+  std::cout<<"\n\nReading from array:\n";
			
 
				+  profile_read(X, N, Niter);
			
 
				+
			
 
				+  std::cout<<"\n\nAdding arrays:\n";
			
 
				+  profile_vector_add(Y, X, N, Niter);
			
 
				+
			
 
				+  //free(X);
			
 
				+  //free(Y);
			
 
				+  sctl::aligned_delete(X);
			
 
				+  sctl::aligned_delete(Y);
			
 
				+  return 0;
			
 
				+}
			
 
				+
			
--- a/code/src/bandwidth-main-memory.cpp
+++ b/code/src/bandwidth-main-memory.cpp
@@ -0,0 +1,96 @@
 
				+// example code showing cost of memory initialization (first-touch) and NUMA
			
 
				+//
			
 
				+// OpenMP thread pinning for NUMA
			
 
				+// export OMP_PLACES=cores
			
 
				+// export OMP_PROC_BIND=spread
			
 
				+
			
 
				+#include <iostream>
			
 
				+#include <omp.h>
			
 
				+#include <sctl.hpp>
			
 
				+using Vec = sctl::Vec<double>;
			
 
				+constexpr int VecLen = Vec::Size();
			
 
				+
			
 
				+// Benchmark to show cost of memory allocations
			
 
				+// Times four steps on an array of 1e9 doubles and prints each elapsed time:
			
 
				+// malloc, the first write to every element, a second write to every element, and free.
			
 
				+// Returns early (after printing an error) if the allocation fails.
			
 
				+void benchmark_memory_init() {
			
 
				+  long N = 1e9; // 8 GB
			
 
				+  double T;
			
 
				+
			
 
				+  // Allocate memory
			
 
				+  T = -omp_get_wtime();
			
 
				+  double* X = (double*)malloc(N*sizeof(double));
			
 
				+  std::cout<<"Array alloc time = "<<T+omp_get_wtime()<<'\n';
			
 
				+  if (X == nullptr) { // the loops below would otherwise write through a null pointer
			
 
				+    std::cout<<"Error: could not allocate "<<N*sizeof(double)<<" bytes\n";
			
 
				+    return;
			
 
				+  }
			
 
				+
			
 
				+  // Initialize array (first time each element is written)
			
 
				+  T = -omp_get_wtime();
			
 
				+  for (long i = 0; i < N; i++) X[i] = i;
			
 
				+  std::cout<<"Array init time  = "<<T+omp_get_wtime()<<'\n';
			
 
				+
			
 
				+  // Write to array (second pass over the same elements, for comparison with the first)
			
 
				+  T = -omp_get_wtime();
			
 
				+  for (long i = 0; i < N; i++) X[i] = 2*i;
			
 
				+  std::cout<<"Array write time = "<<T+omp_get_wtime()<<'\n';
			
 
				+
			
 
				+  // Free memory
			
 
				+  T = -omp_get_wtime();
			
 
				+  free(X);
			
 
				+  std::cout<<"Array free time  = "<<T+omp_get_wtime()<<'\n';
			
 
				+}
			
 
				+
			
 
				+// Benchmark to show effect of NUMA
			
 
				+void benchmark_numa(bool numa_aware) { // numa_aware selects parallel (true) or serial (false) initialization; the timed loops are parallel either way
			
 
				+  long N = 1e9; // 8 GB per array
			
 
				+  double T;
			
 
				+
			
 
				+  // Allocate memory (two arrays of N doubles)
			
 
				+  double* X = sctl::aligned_new<double>(N);
			
 
				+  double* Y = sctl::aligned_new<double>(N);
			
 
				+
			
 
				+  // Initialize X, Y : this is when memory pages are assigned to each NUMA node
			
 
				+  if (numa_aware) {
			
 
				+    #pragma omp parallel for schedule(static)
			
 
				+    for (long i = 0; i < N; i++) X[i] = Y[i] = i; // same static schedule as the timed loops below
			
 
				+  } else {
			
 
				+    for (long i = 0; i < N; i++) X[i] = Y[i] = i; // the calling thread initializes every element
			
 
				+  }
			
 
				+
			
 
				+  // Write to array
			
 
				+  T = -omp_get_wtime();
			
 
				+  #pragma omp parallel for schedule(static)
			
 
				+  for (long i = 0; i < N; i++) X[i] = 3.14;
			
 
				+  T += omp_get_wtime();
			
 
				+  std::cout<<"Write Bandwidth   = "<< N*sizeof(double)/T/1e9 <<" GB/s\n";
			
 
				+
			
 
				+  // Read from array
			
 
				+  double sum = 0;
			
 
				+  T = -omp_get_wtime();
			
 
				+  #pragma omp parallel for schedule(static) reduction(+:sum)
			
 
				+  for (long i = 0; i < N; i++) sum += X[i];
			
 
				+  T += omp_get_wtime();
			
 
				+  std::cout<<"Read Bandwidth    = "<< N*sizeof(double)/T/1e9 <<" GB/s\n";
			
 
				+  if (sum < 0) std::cout<<sum<<'\n'; // uses the sum; presumably so the compiler cannot discard the read loop - TODO confirm
			
 
				+
			
 
				+  // Adding arrays: 2-reads, 1-write
			
 
				+  T = -omp_get_wtime();
			
 
				+  #pragma omp parallel for schedule(static)
			
 
				+  for (long i = 0; i < N; i++) Y[i] += X[i];
			
 
				+  T += omp_get_wtime();
			
 
				+  std::cout<<"Vec-Add Bandwidth = "<< 3*N*sizeof(double)/T/1e9 <<" GB/s\n"; // factor 3 counts the two reads and one write per element
			
 
				+
			
 
				+  sctl::aligned_delete(X);
			
 
				+  sctl::aligned_delete(Y);
			
 
				+}
			
 
				+
			
 
				+// Runs the memory-initialization benchmark, then the main-memory bandwidth
			
 
				+// benchmark twice: first with serial, then with parallel array initialization.
			
 
				+int main(int argc, char** argv) {
			
 
				+  std::cout<<"\nBenchmarking memory initialization cost:\n";
			
 
				+  benchmark_memory_init();
			
 
				+
			
 
				+  const bool init_modes[] = {false, true}; // serial initialization first, then parallel
			
 
				+  for (const bool parallel_init : init_modes) {
			
 
				+    if (parallel_init) {
			
 
				+      std::cout<<"\n\nBenchmarking main memory with parallel initialization (NUMA aware):\n";
			
 
				+    } else {
			
 
				+      std::cout<<"\n\nBenchmarking main memory without parallel initialization (NUMA unaware):\n";
			
 
				+    }
			
 
				+    benchmark_numa(parallel_init);
			
 
				+  }
			
 
				+
			
 
				+  return 0;
			
 
				+}
			
 
				+
			
--- a/code/src/gemm-blocking.cpp
+++ b/code/src/gemm-blocking.cpp
@@ -1,9 +1,14 @@
 
				 // example code showing blocking of GEMM to optimize memory access
			
 
				+//
			
 
				+// Cache profiling using perf:
			
 
				+// perf stat -e L1-dcache-load-misses -e L1-dcache-loads -e l2_rqsts.miss -e l2_rqsts.references -e LLC-load-misses -e LLC-loads <path-to>/gemm-blocking
			
 
				 
			
 
				 #include <iostream>
			
 
				 #include <omp.h>
			
 
				 #include <sctl.hpp>
			
 
				 
			
 
				+constexpr int VecLen = sctl::DefaultVecLen<double>();
			
 
				+
			
 
				 void GEMM_naive(int M, int N, int K, double* A, int LDA, double* B, int LDB, double* C, int LDC) {
			
 
				   for (int j = 0; j < N; j++)
			
 
				     for (int k = 0; k < K; k++)
			
@@ -75,11 +80,12 @@ int main(int argc, char** argv) {
 
				   T = -omp_get_wtime();
			
 
				   for (long i = 0; i < iter; i++)
			
 
				     //GEMM_naive(M,N,K, A,M, B,K, C,M);
			
 
				-    GEMM_blocked<M,N,K, 200,200,200, 40,40,40, 8,10,40>(A,M, B,K, C,M);
			
 
				+    GEMM_blocked<M,N,K, 200,200,200, 40,40,40, VecLen,10,40>(A,M, B,K, C,M);
			
 
				   T += omp_get_wtime();
			
 
				+  std::cout<<"M = "<<M<<"  N = "<<N<<"  K = "<<K<<'\n';
			
 
				   std::cout<<"T = "<<T<<"    GFLOPS = "<<2*M*N*K*iter/T/1e9<<'\n';
			
 
				 
			
 
				-  if (0) { // check
			
 
				+  if (0) { // verify result
			
 
				     T = -omp_get_wtime();
			
 
				     for (long i = 0; i < iter; i++)
			
 
				       GEMM_naive(M,N,K, A,M, B,K, C_ref,M);
			
--- a/code/src/gemm-ker.cpp
+++ b/code/src/gemm-ker.cpp
@@ -4,6 +4,8 @@
 
				 #include <omp.h>
			
 
				 #include <sctl.hpp>
			
 
				 
			
 
				+constexpr int VecLen = sctl::DefaultVecLen<double>();
			
 
				+
			
 
				 template <int M, int N, int K>
			
 
				 void GEMM_ker_naive(double* C, double* A, double* B) {
			
 
				   for (int k = 0; k < K; k++)
			
@@ -58,7 +60,7 @@ void GEMM_ker_vec_unrolled(double* C, double* A, double* B) {
 
				 
			
 
				 int main(int argc, char** argv) {
			
 
				   long L = 1e6;
			
 
				-  constexpr int M = 8, N = 10, K = 40;
			
 
				+  constexpr int M = VecLen, N = 10, K = 40;
			
 
				   double* C = new double[M*N];
			
 
				   double* A = new double[M*K];
			
 
				   double* B = new double[K*N];
			
--- a/code/src/instruction-cost.cpp
+++ b/code/src/instruction-cost.cpp
@@ -5,11 +5,12 @@
 
				 #include <omp.h>
			
 
				 
			
 
				 #define CPU_clockrate 3.3 // GHz
			
 
				+constexpr int VecLen = sctl::DefaultVecLen<double>();
			
 
				 
			
 
				-template <class Type, int K> void test_add() {
			
 
				+template <class Type, int K> void test_add() { // add K elements of Type
			
 
				   Type x[K], one = 1.0;
			
 
				   for (long k = 0; k < K; k++)
			
 
				-    x[k] = 3.14 + k;
			
 
				+    x[k] = 3.14 + k; // initialize x[k]
			
 
				 
			
 
				   double T = -omp_get_wtime();
			
 
				   for (long i = 0; i < 1000000000L; i++)
			
@@ -20,17 +21,16 @@ template <class Type, int K> void test_add() {
 
				   std::cout<<"cycles/iter = "<< CPU_clockrate*T <<'\n';
			
 
				 
			
 
				   // print the result otherwise the
			
 
				-  // compiler optimize out everything
			
 
				+  // compiler skips everything
			
 
				   Type sum = 0.;
			
 
				-  for (long k = 0; k < K; k++)
			
 
				-    sum += x[k];
			
 
				+  for (long k = 0; k < K; k++) sum += x[k];
			
 
				   std::cout<<"Result = "<<sum<<'\n';
			
 
				 }
			
 
				 
			
 
				-template <class Type, int K> void test_division() {
			
 
				+template <class Type, int K> void test_division() { // divide K elements of Type
			
 
				   Type x[K], one = 1.0;
			
 
				   for (long k = 0; k < K; k++)
			
 
				-    x[k] = 3.14 + k;
			
 
				+    x[k] = 3.14 + k; // initialize x[k]
			
 
				 
			
 
				   double T = -omp_get_wtime();
			
 
				   for (long i = 0; i < 1000000000L; i++)
			
@@ -41,10 +41,9 @@ template <class Type, int K> void test_division() {
 
				   std::cout<<"cycles/iter = "<< CPU_clockrate*T <<'\n';
			
 
				 
			
 
				   // print the result otherwise the
			
 
				-  // compiler optimize out everything
			
 
				+  // compiler skips everything
			
 
				   Type sum = 0.;
			
 
				-  for (long k = 0; k < K; k++)
			
 
				-    sum += x[k];
			
 
				+  for (long k = 0; k < K; k++) sum += x[k];
			
 
				   std::cout<<"Result = "<<sum<<'\n';
			
 
				 }
			
 
				 
			
@@ -58,11 +57,11 @@ int main(int argc, char** argv) {
 
				   std::cout<<"\n\nAdding 32 doubles at a time:\n";
			
 
				   test_add<double, 32>();
			
 
				 
			
 
				-  std::cout<<"\n\nAdding 8 Vec<doubles,8> at a time:\n";
			
 
				-  test_add<sctl::Vec<double,8>, 8>();
			
 
				+  std::cout<<"\n\nAdding 8 Vec<doubles,"<<VecLen<<"> at a time:\n";
			
 
				+  test_add<sctl::Vec<double,VecLen>, 8>();
			
 
				 
			
 
				-  std::cout<<"\n\nDividing 8 Vec<doubles,8> at a time:\n";
			
 
				-  test_division<sctl::Vec<double,8>, 8>();
			
 
				+  std::cout<<"\n\nDividing 8 Vec<doubles,"<<VecLen<<"> at a time:\n";
			
 
				+  test_division<sctl::Vec<double,VecLen>, 8>(); // 8 vectors of VecLen doubles each, as the message printed just before states
			
 
				 
			
 
				   return 0;
			
 
				 }
			
--- a/code/src/poly-eval.cpp
+++ b/code/src/poly-eval.cpp
@@ -5,6 +5,7 @@
 
				 #include <omp.h>
			
 
				 
			
 
				 #define CPU_clockrate 3.3 // GHz
			
 
				+constexpr int VecLen = sctl::DefaultVecLen<double>();
			
 
				 
			
 
				 template <class Type> void test_polynomial() {
			
 
				   Type a,b,c,d,e,f,g,h; // coefficients
			
@@ -64,7 +65,7 @@ int main(int argc, char** argv) {
 
				 
			
 
				   test_polynomial<double>(); // scalar
			
 
				 
			
 
				-  //test_polynomial<sctl::Vec<double,8>>(); // vectorized
			
 
				+  //test_polynomial<sctl::Vec<double,VecLen>>(); // vectorized
			
 
				 
			
 
				   return 0;
			
 
				 }