Dhairya Malhotra, 2 years ago
parent
commit dc0fdf708d

+ 3 - 0
code/.gitmodules

@@ -0,0 +1,3 @@
+[submodule "SCTL"]
+	path = SCTL
+	url = git@github.com:dmalhotra/SCTL.git

+ 4 - 7
code/Makefile

@@ -1,8 +1,6 @@
 CXX=g++ # requires g++-8 or newer / icpc (with gcc compatibility 7.5 or newer) / clang++ with llvm-10 or newer
 CXXFLAGS = -O3 -march=native -std=c++11 -fopenmp # need C++11 and OpenMP
 
-#CXXFLAGS += -DSCTL_HAVE_MPI #use MPI
-
 RM = rm -f
 MKDIRS = mkdir -p
 
@@ -12,19 +10,18 @@ OBJDIR = ./obj
 INCDIR = ./SCTL/include
 
 TARGET_BIN = \
-       $(BINDIR)/instruction \
+       $(BINDIR)/instruction-cost \
        $(BINDIR)/poly-eval \
        $(BINDIR)/gemm-ker \
-       $(BINDIR)/bandwidth \
-       $(BINDIR)/gemm
+       $(BINDIR)/gemm-blocking \
+       $(BINDIR)/bandwidth-l1 \
+       $(BINDIR)/bandwidth-main-memory
 
 all : $(TARGET_BIN)
 
 $(BINDIR)/%: $(OBJDIR)/%.o
 	-@$(MKDIRS) $(dir $@)
 	$(CXX) $^ $(CXXFLAGS) $(LDLIBS) -o $@
-#perf stat -e L1-dcache-load-misses -e L1-dcache-loads -e l2_rqsts.miss -e l2_rqsts.references -e LLC-load-misses -e LLC-loads mpiexec -n 1 --map-by slot:pe=16 ./$@
-
 
 $(OBJDIR)/%.o: $(SRCDIR)/%.cpp
 	-@$(MKDIRS) $(dir $@)

+ 15 - 0
code/README.md

@@ -0,0 +1,15 @@
+# Cloning repository
+
+git clone git@github.com:dmalhotra/fwam2022.git
+
+git submodule init
+
+git submodule update
+
+# Requirements
+
+g++-8 or newer
+
+# Compiling
+
+make -j
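
As an aside (not part of this commit), git can perform the clone and both submodule steps above in a single command:

git clone --recursive git@github.com:dmalhotra/fwam2022.git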

+ 93 - 0
code/src/bandwidth-l1.cpp

@@ -0,0 +1,93 @@
+// example code showing bandwidth of L1 cache and effect of memory alignment
+
+#include <iostream>
+#include <omp.h>
+#include <sctl.hpp>
+using Vec = sctl::Vec<double>;
+constexpr int VecLen = Vec::Size();
+
+void profile_write(double* X, long N, long Niter, double val = 3.14) {
+  double T = -omp_get_wtime();
+  for (long j = 0; j < Niter; j++) {
+    Vec v = val;
+    #pragma GCC unroll (4)
+    for (long i = 0; i < N; i+=VecLen) {
+      v.Store(X+i);
+    }
+  }
+  T += omp_get_wtime();
+  std::cout<<"Bandwidth = "<< N*Niter*sizeof(double)/T/1e9 <<" GB/s";
+  std::cout<<"    cycles/iter = "<< 3.3e9*T/(Niter*N/VecLen) <<"\n";
+}
+
+void profile_read(double* X, long N, long Niter) {
+  Vec sum[8];
+  for (long i = 0; i < 8; i++) sum[i] = 0.0;
+
+  double T = -omp_get_wtime();
+  for (long j = 0; j < Niter; j++) {
+    for (long i = 0; i < N; i+=VecLen*8) {
+      sum[0] = sum[0] + Vec::Load(X+VecLen*0+i);
+      sum[1] = sum[1] + Vec::Load(X+VecLen*1+i);
+      sum[2] = sum[2] + Vec::Load(X+VecLen*2+i);
+      sum[3] = sum[3] + Vec::Load(X+VecLen*3+i);
+      sum[4] = sum[4] + Vec::Load(X+VecLen*4+i);
+      sum[5] = sum[5] + Vec::Load(X+VecLen*5+i);
+      sum[6] = sum[6] + Vec::Load(X+VecLen*6+i);
+      sum[7] = sum[7] + Vec::Load(X+VecLen*7+i);
+    }
+  }
+  T += omp_get_wtime();
+  std::cout<<"Bandwidth = "<< N*Niter*sizeof(double)/T/1e9 <<" GB/s";
+  std::cout<<"    cycles/iter = "<< 3.3e9*T/(Niter*N/VecLen) <<"\n";
+
+  for (long i = 1; i < 8; i++) sum[0] += sum[i];
+  if (sum[0][0] < 0) std::cout<<sum[0]<<'\n';
+}
+
+void profile_vector_add(double* Y, const double* X, long N, long Niter) { // Y = X + Y
+  double T = -omp_get_wtime();
+  for (long j = 0; j < Niter; j++) {
+    for (long i = 0; i < N; i+=VecLen*2) {
+      (Vec::Load(X+VecLen*0+i) + Vec::Load(Y+VecLen*0+i)).Store(Y+VecLen*0+i);
+      (Vec::Load(X+VecLen*1+i) + Vec::Load(Y+VecLen*1+i)).Store(Y+VecLen*1+i);
+    }
+  }
+  T += omp_get_wtime();
+  std::cout<<"Bandwidth = "<< 3*N*Niter*sizeof(double)/T/1e9 <<" GB/s";
+  std::cout<<"    cycles/iter = "<< 3.3e9*T/(Niter*N/VecLen) <<"\n";
+}
+
+int main(int argc, char** argv) {
+  if (argc <= 1) {
+    std::cout<<"Usage: ./bandwidth <size-in-bytes> <#-of-iter>\n";
+    return 0;
+  }
+
+  long N = atol(argv[1]) / sizeof(double);
+  long Niter = (argc <= 2 ? std::max<long>(1,1e9/N) : atol(argv[2]));
+  std::cout<<"\nSize = "<< N*sizeof(double)<<", Iterations = "<< Niter<<"\n";
+  SCTL_ASSERT_MSG(N % 512 == 0, "size must be a multiple of 4096 bytes"); // because of vectorization and loop unrolling
+
+  //double* X = (double*)malloc(N*sizeof(double));
+  //double* Y = (double*)malloc(N*sizeof(double));
+  double* X = sctl::aligned_new<double>(N);
+  double* Y = sctl::aligned_new<double>(N);
+  for (long i = 0; i < N; i++) X[i] = Y[i] = i;
+
+  std::cout<<"\n\nWriting to array:\n";
+  profile_write(X, N, Niter);
+
+  std::cout<<"\n\nReading from array:\n";
+  profile_read(X, N, Niter);
+
+  std::cout<<"\n\nAdding arrays:\n";
+  profile_vector_add(Y, X, N, Niter);
+
+  //free(X);
+  //free(Y);
+  sctl::aligned_delete(X);
+  sctl::aligned_delete(Y);
+  return 0;
+}
+
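
The commented-out malloc calls versus sctl::aligned_new are the point of the alignment experiment above: with 64-byte-aligned data, vector loads and stores never straddle a cache-line boundary. For reference (not part of the commit), the same aligned allocation is available in the C++17 standard library; a minimal sketch, illustration only since this Makefile builds with -std=c++11:

// sketch: 64-byte-aligned allocation without SCTL (requires C++17)
#include <cstdint>
#include <cstdlib>
#include <iostream>

int main() {
  const long N = 1 << 20;                       // number of doubles (8 MiB)
  const std::size_t bytes = N * sizeof(double); // aligned_alloc needs a multiple of the alignment
  double* X = static_cast<double*>(std::aligned_alloc(64, bytes)); // 64-byte aligned, like sctl::aligned_new
  double* Y = static_cast<double*>(std::malloc(bytes));            // only aligned to alignof(max_align_t)
  std::cout << "X mod 64 = " << reinterpret_cast<std::uintptr_t>(X) % 64 << '\n'; // always 0
  std::cout << "Y mod 64 = " << reinterpret_cast<std::uintptr_t>(Y) % 64 << '\n'; // often nonzero
  std::free(X); // memory from aligned_alloc is released with free
  std::free(Y);
  return 0;
}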

+ 96 - 0
code/src/bandwidth-main-memory.cpp

@@ -0,0 +1,96 @@
+// example code showing cost of memory initialization (first-touch) and NUMA
+//
+// OpenMP thread pinning for NUMA
+// export OMP_PLACES=cores
+// export OMP_PROC_BIND=spread
+
+#include <iostream>
+#include <omp.h>
+#include <sctl.hpp>
+using Vec = sctl::Vec<double>;
+constexpr int VecLen = Vec::Size();
+
+// Benchmark to show cost of memory allocations
+void benchmark_memory_init() {
+  long N = 1e9; // 8 GB
+  double T;
+
+  // Allocate memory
+  T = -omp_get_wtime();
+  double* X = (double*)malloc(N*sizeof(double));
+  std::cout<<"Array alloc time = "<<T+omp_get_wtime()<<'\n';
+
+  // Initialize array
+  T = -omp_get_wtime();
+  for (long i = 0; i < N; i++) X[i] = i;
+  std::cout<<"Array init time  = "<<T+omp_get_wtime()<<'\n';
+
+  // Write to array
+  T = -omp_get_wtime();
+  for (long i = 0; i < N; i++) X[i] = 2*i;
+  std::cout<<"Array write time = "<<T+omp_get_wtime()<<'\n';
+
+  // Free memory
+  T = -omp_get_wtime();
+  free(X);
+  std::cout<<"Array free time  = "<<T+omp_get_wtime()<<'\n';
+}
+
+// Benchmark to show effect of NUMA
+void benchmark_numa(bool numa_aware) {
+  long N = 1e9; // 8 GB
+  double T;
+
+  // Allocate memory
+  double* X = sctl::aligned_new<double>(N);
+  double* Y = sctl::aligned_new<double>(N);
+
+  // Initialize X, Y : this is when memory pages are assigned to each NUMA node
+  if (numa_aware) {
+    #pragma omp parallel for schedule(static)
+    for (long i = 0; i < N; i++) X[i] = Y[i] = i;
+  } else {
+    for (long i = 0; i < N; i++) X[i] = Y[i] = i;
+  }
+
+  // Write to array
+  T = -omp_get_wtime();
+  #pragma omp parallel for schedule(static)
+  for (long i = 0; i < N; i++) X[i] = 3.14;
+  T += omp_get_wtime();
+  std::cout<<"Write Bandwidth   = "<< N*sizeof(double)/T/1e9 <<" GB/s\n";
+
+  // Read from array
+  double sum = 0;
+  T = -omp_get_wtime();
+  #pragma omp parallel for schedule(static) reduction(+:sum)
+  for (long i = 0; i < N; i++) sum += X[i];
+  T += omp_get_wtime();
+  std::cout<<"Read Bandwidth    = "<< N*sizeof(double)/T/1e9 <<" GB/s\n";
+  if (sum < 0) std::cout<<sum<<'\n';
+
+  // Adding arrays: 2-reads, 1-write
+  T = -omp_get_wtime();
+  #pragma omp parallel for schedule(static)
+  for (long i = 0; i < N; i++) Y[i] += X[i];
+  T += omp_get_wtime();
+  std::cout<<"Vec-Add Bandwidth = "<< 3*N*sizeof(double)/T/1e9 <<" GB/s\n";
+
+  sctl::aligned_delete(X);
+  sctl::aligned_delete(Y);
+}
+
+int main(int argc, char** argv) {
+
+  std::cout<<"\nBenchmarking memory initialization cost:\n";
+  benchmark_memory_init();
+
+  std::cout<<"\n\nBenchmarking main memory without parallel initialization (NUMA unaware):\n";
+  benchmark_numa(false);
+
+  std::cout<<"\n\nBenchmarking main memory with parallel initialization (NUMA aware):\n";
+  benchmark_numa(true);
+
+  return 0;
+}
+
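
The OMP_PLACES / OMP_PROC_BIND settings in the header comment only matter if they actually take effect. A small standalone check (not part of the commit), using the OpenMP 4.5 affinity query routines, prints which place each thread is bound to:

// sketch: verify OpenMP thread pinning (compile with -fopenmp)
#include <cstdio>
#include <omp.h>

int main() {
  std::printf("number of places = %d\n", omp_get_num_places());
  #pragma omp parallel
  {
    // with OMP_PLACES=cores and OMP_PROC_BIND=spread, threads should land on distinct places
    std::printf("thread %2d -> place %2d\n", omp_get_thread_num(), omp_get_place_num());
  }
  return 0;
}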

+ 8 - 2
code/src/gemm-blocking.cpp

@@ -1,9 +1,14 @@
 // example code showing blocking of GEMM to optimize memory access
+//
+// Cache profiling using perf:
+// perf stat -e L1-dcache-load-misses -e L1-dcache-loads -e l2_rqsts.miss -e l2_rqsts.references -e LLC-load-misses -e LLC-loads <path-to-binary>
 
 #include <iostream>
 #include <omp.h>
 #include <sctl.hpp>
 
+constexpr int VecLen = sctl::DefaultVecLen<double>();
+
 void GEMM_naive(int M, int N, int K, double* A, int LDA, double* B, int LDB, double* C, int LDC) {
   for (int j = 0; j < N; j++)
     for (int k = 0; k < K; k++)
@@ -75,11 +80,12 @@ int main(int argc, char** argv) {
   T = -omp_get_wtime();
   for (long i = 0; i < iter; i++)
     //GEMM_naive(M,N,K, A,M, B,K, C,M);
-    GEMM_blocked<M,N,K, 200,200,200, 40,40,40, 8,10,40>(A,M, B,K, C,M);
+    GEMM_blocked<M,N,K, 200,200,200, 40,40,40, VecLen,10,40>(A,M, B,K, C,M);
   T += omp_get_wtime();
+  std::cout<<"M = "<<M<<"  N = "<<N<<"  K = "<<K<<'\n';
   std::cout<<"T = "<<T<<"    GFLOPS = "<<2*M*N*K*iter/T/1e9<<'\n';
 
-  if (0) { // check
+  if (0) { // verify result
     T = -omp_get_wtime();
     for (long i = 0; i < iter; i++)
       GEMM_naive(M,N,K, A,M, B,K, C_ref,M);
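
GEMM_blocked itself is not shown in this hunk; judging by the call site, the three triples after M,N,K are presumably tile sizes for successive levels of the memory hierarchy, ending in a VecLen-wide register kernel. A single level of blocking can be sketched in plain C++ (block sizes BM, BN, BK are hypothetical, and the update follows the usual column-major convention of GEMM_naive):

// sketch: one level of cache blocking for C += A*B (column-major, leading dimensions LDA/LDB/LDC)
#include <algorithm>

void GEMM_blocked_1level(int M, int N, int K, const double* A, int LDA,
                         const double* B, int LDB, double* C, int LDC) {
  constexpr int BM = 40, BN = 40, BK = 40; // hypothetical tile sizes; tiles of A, B, C should fit in cache together
  for (int j0 = 0; j0 < N; j0 += BN)
  for (int k0 = 0; k0 < K; k0 += BK)
  for (int i0 = 0; i0 < M; i0 += BM) {
    const int jb = std::min(N, j0+BN), kb = std::min(K, k0+BK), ib = std::min(M, i0+BM);
    // same triple loop as the naive version, restricted to one tile so the data stays cache-resident
    for (int j = j0; j < jb; j++)
      for (int k = k0; k < kb; k++)
        for (int i = i0; i < ib; i++)
          C[i + j*LDC] += A[i + k*LDA] * B[k + j*LDB];
  }
}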

+ 3 - 1
code/src/gemm-ker.cpp

@@ -4,6 +4,8 @@
 #include <omp.h>
 #include <sctl.hpp>
 
+constexpr int VecLen = sctl::DefaultVecLen<double>();
+
 template <int M, int N, int K>
 void GEMM_ker_naive(double* C, double* A, double* B) {
   for (int k = 0; k < K; k++)
@@ -58,7 +60,7 @@ void GEMM_ker_vec_unrolled(double* C, double* A, double* B) {
 
 int main(int argc, char** argv) {
   long L = 1e6;
-  constexpr int M = 8, N = 10, K = 40;
+  constexpr int M = VecLen, N = 10, K = 40;
   double* C = new double[M*N];
   double* A = new double[M*K];
   double* B = new double[K*N];
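
The vectorized kernel GEMM_ker_vec_unrolled is not shown in this hunk, but the reason M is now tied to VecLen is register blocking: an M x N tile of C is kept in accumulators while A and B are streamed through it. A plain C++ sketch of that idea (layout assumed column-major, with leading dimension M for A and C and K for B, matching the sizes allocated above):

// sketch: register-blocked micro-kernel computing C += A*B for small fixed M, N, K
template <int M, int N, int K>
void GEMM_ker_sketch(double* C, const double* A, const double* B) {
  double c[M*N] = {0};              // accumulator tile, intended to live in vector registers
  for (int k = 0; k < K; k++)
    for (int j = 0; j < N; j++)
      for (int i = 0; i < M; i++)   // contiguous length-M loop; vectorizes when M == SIMD width
        c[i + j*M] += A[i + k*M] * B[k + j*K];
  for (int i = 0; i < M*N; i++) C[i] += c[i];
}

With M = VecLen and N = 10, the c tile occupies 10 vector registers and fits in the register file, which is the point of fixing M to the SIMD width.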

+ 13 - 14
code/src/instruction.cpp → code/src/instruction-cost.cpp

@@ -5,11 +5,12 @@
 #include <omp.h>
 
 #define CPU_clockrate 3.3 // GHz
+constexpr int VecLen = sctl::DefaultVecLen<double>();
 
-template <class Type, int K> void test_add() {
+template <class Type, int K> void test_add() { // add K elements of Type
   Type x[K], one = 1.0;
   for (long k = 0; k < K; k++)
-    x[k] = 3.14 + k;
+    x[k] = 3.14 + k; // initialize x[k]
 
   double T = -omp_get_wtime();
   for (long i = 0; i < 1000000000L; i++)
@@ -20,17 +21,16 @@ template <class Type, int K> void test_add() {
   std::cout<<"cycles/iter = "<< CPU_clockrate*T <<'\n';
   std::cout<<"cycles/iter = "<< CPU_clockrate*T <<'\n';
 
 
   // print the result otherwise the
   // print the result otherwise the
-  // compiler optimize out everything
+  // compiler optimizes everything away
   Type sum = 0.;
-  for (long k = 0; k < K; k++)
-    sum += x[k];
+  for (long k = 0; k < K; k++) sum += x[k];
   std::cout<<"Result = "<<sum<<'\n';
 }
 
-template <class Type, int K> void test_division() {
+template <class Type, int K> void test_division() { // divide K elements of Type
   Type x[K], one = 1.0;
   for (long k = 0; k < K; k++)
-    x[k] = 3.14 + k;
+    x[k] = 3.14 + k; // initialize x[k]
 
   double T = -omp_get_wtime();
   for (long i = 0; i < 1000000000L; i++)
@@ -41,10 +41,9 @@ template <class Type, int K> void test_division() {
   std::cout<<"cycles/iter = "<< CPU_clockrate*T <<'\n';
   std::cout<<"cycles/iter = "<< CPU_clockrate*T <<'\n';
 
 
   // print the result otherwise the
   // print the result otherwise the
-  // compiler optimize out everything
+  // compiler optimizes everything away
   Type sum = 0.;
-  for (long k = 0; k < K; k++)
-    sum += x[k];
+  for (long k = 0; k < K; k++) sum += x[k];
   std::cout<<"Result = "<<sum<<'\n';
 }
 
@@ -58,11 +57,11 @@ int main(int argc, char** argv) {
   std::cout<<"\n\nAdding 32 doubles at a time:\n";
   std::cout<<"\n\nAdding 32 doubles at a time:\n";
   test_add<double, 32>();
   test_add<double, 32>();
 
 
-  std::cout<<"\n\nAdding 8 Vec<doubles,8> at a time:\n";
-  test_add<sctl::Vec<double,8>, 8>();
+  std::cout<<"\n\nAdding 8 Vec<doubles,"<<VecLen<<"> at a time:\n";
+  test_add<sctl::Vec<double,VecLen>, 8>();
 
 
-  std::cout<<"\n\nDividing 8 Vec<doubles,8> at a time:\n";
-  test_division<sctl::Vec<double,8>, 8>();
+  std::cout<<"\n\nDividing 8 Vec<doubles,"<<VecLen<<"> at a time:\n";
+  test_division<sctl::Vec<double,8>,VecLen>();
 
 
   return 0;
   return 0;
 }
 }
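
The cycles/iter numbers hinge on the hard-coded CPU_clockrate of 3.3 GHz: the benchmark measures only wall-clock time and converts it. A worked example of that conversion (values are illustrative, not measured):

// sketch: converting measured wall-clock time to cycles per loop iteration
#include <iostream>

int main() {
  const double clockrate_GHz = 3.3;  // must match the actual core frequency (including turbo) to be meaningful
  const long   iters = 1000000000L;  // same iteration count as the benchmark loops above
  const double seconds = 0.91;       // illustrative measured run time
  std::cout << "cycles/iter = " << clockrate_GHz * 1e9 * seconds / iters << '\n'; // ~3.0
  return 0;
}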

+ 2 - 1
code/src/poly-eval.cpp

@@ -5,6 +5,7 @@
 #include <omp.h>
 
 #define CPU_clockrate 3.3 // GHz
+constexpr int VecLen = sctl::DefaultVecLen<double>();
 
 template <class Type> void test_polynomial() {
   Type a,b,c,d,e,f,g,h; // coefficients
@@ -64,7 +65,7 @@ int main(int argc, char** argv) {
 
   test_polynomial<double>(); // scalar
 
-  //test_polynomial<sctl::Vec<double,8>>(); // vectorized
+  //test_polynomial<sctl::Vec<double,VecLen>>(); // vectorized
 
   return 0;
 }
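
The body of test_polynomial is not part of this hunk; as a reference point (one possible coefficient convention, not necessarily the file's), Horner's rule evaluates a degree-7 polynomial in 7 multiplies and 7 adds, and the same template works for the scalar and the Vec<double,VecLen> instantiation provided the vector type defines * and +:

// sketch: Horner's rule for p(x) = a*x^7 + b*x^6 + c*x^5 + d*x^4 + e*x^3 + f*x^2 + g*x + h
template <class Type>
Type poly_eval_horner(Type x, Type a, Type b, Type c, Type d,
                      Type e, Type f, Type g, Type h) {
  // each step is one multiply and one add (a single FMA), but the steps form one long dependency chain
  return ((((((a*x + b)*x + c)*x + d)*x + e)*x + f)*x + g)*x + h;
}

The dependency chain means such a loop runs at FMA latency rather than throughput; splitting the polynomial into independent sub-expressions (Estrin-style) or evaluating several x values per iteration are the usual ways to recover throughput.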