Dhairya Malhotra, 2 years ago
parent
commit dc0fdf708d

+ 3 - 0
code/.gitmodules

@@ -0,0 +1,3 @@
+[submodule "SCTL"]
+	path = SCTL
+	url = git@github.com:dmalhotra/SCTL.git

+ 4 - 7
code/Makefile

@@ -1,8 +1,6 @@
 CXX=g++ # requires g++-8 or newer / icpc (with gcc compatibility 7.5 or newer) / clang++ with llvm-10 or newer
 CXXFLAGS = -O3 -march=native -std=c++11 -fopenmp # need C++11 and OpenMP
 
-#CXXFLAGS += -DSCTL_HAVE_MPI #use MPI
-
 RM = rm -f
 MKDIRS = mkdir -p
 
@@ -12,19 +10,18 @@ OBJDIR = ./obj
 INCDIR = ./SCTL/include
 
 TARGET_BIN = \
-       $(BINDIR)/instruction \
+       $(BINDIR)/instruction-cost \
        $(BINDIR)/poly-eval \
        $(BINDIR)/gemm-ker \
-       $(BINDIR)/bandwidth \
-       $(BINDIR)/gemm
+       $(BINDIR)/gemm-blocking \
+       $(BINDIR)/bandwidth-l1 \
+       $(BINDIR)/bandwidth-main-memory
 
 all : $(TARGET_BIN)
 
 $(BINDIR)/%: $(OBJDIR)/%.o
 	-@$(MKDIRS) $(dir $@)
 	$(CXX) $^ $(CXXFLAGS) $(LDLIBS) -o $@
-#perf stat -e L1-dcache-load-misses -e L1-dcache-loads -e l2_rqsts.miss -e l2_rqsts.references -e LLC-load-misses -e LLC-loads mpiexec -n 1 --map-by slot:pe=16 ./$@
-
 
 $(OBJDIR)/%.o: $(SRCDIR)/%.cpp
 	-@$(MKDIRS) $(dir $@)

+ 15 - 0
code/README.md

@@ -0,0 +1,15 @@
+# Cloning repository
+
+git clone git@github.com:dmalhotra/fwam2022.git
+
+git submodule init
+
+git submodule update
+
+# Requirements
+
+g++-8 or newer
+
+# Compiling
+
+make -j
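
As an aside (not part of this commit), git can perform the clone and both submodule steps above in a single command:

git clone --recursive git@github.com:dmalhotra/fwam2022.git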

+ 93 - 0
code/src/bandwidth-l1.cpp

@@ -0,0 +1,93 @@
+// example code showing bandwidth of L1 cache and effect of memory alignment
+
+#include <iostream>
+#include <omp.h>
+#include <sctl.hpp>
+using Vec = sctl::Vec<double>;
+constexpr int VecLen = Vec::Size();
+
+void profile_write(double* X, long N, long Niter, double val = 3.14) {
+  double T = -omp_get_wtime();
+  for (long j = 0; j < Niter; j++) {
+    Vec v = val;
+    #pragma GCC unroll (4)
+    for (long i = 0; i < N; i+=VecLen) {
+      v.Store(X+i);
+    }
+  }
+  T += omp_get_wtime();
+  std::cout<<"Bandwidth = "<< N*Niter*sizeof(double)/T/1e9 <<" GB/s";
+  std::cout<<"    cycles/iter = "<< 3.3e9*T/(Niter*N/VecLen) <<"\n";
+}
+
+void profile_read(double* X, long N, long Niter) {
+  Vec sum[8];
+  for (long i = 0; i < 8; i++) sum[i] = 0.0;
+
+  double T = -omp_get_wtime();
+  for (long j = 0; j < Niter; j++) {
+    for (long i = 0; i < N; i+=VecLen*8) {
+      sum[0] = sum[0] + Vec::Load(X+VecLen*0+i);
+      sum[1] = sum[1] + Vec::Load(X+VecLen*1+i);
+      sum[2] = sum[2] + Vec::Load(X+VecLen*2+i);
+      sum[3] = sum[3] + Vec::Load(X+VecLen*3+i);
+      sum[4] = sum[4] + Vec::Load(X+VecLen*4+i);
+      sum[5] = sum[5] + Vec::Load(X+VecLen*5+i);
+      sum[6] = sum[6] + Vec::Load(X+VecLen*6+i);
+      sum[7] = sum[7] + Vec::Load(X+VecLen*7+i);
+    }
+  }
+  T += omp_get_wtime();
+  std::cout<<"Bandwidth = "<< N*Niter*sizeof(double)/T/1e9 <<" GB/s";
+  std::cout<<"    cycles/iter = "<< 3.3e9*T/(Niter*N/VecLen) <<"\n";
+
+  for (long i = 1; i < 8; i++) sum[0] += sum[i];
+  if (sum[0][0] < 0) std::cout<<sum[0]<<'\n';
+}
+
+void profile_vector_add(double* Y, const double* X, long N, long Niter) { // Y = X + Y
+  double T = -omp_get_wtime();
+  for (long j = 0; j < Niter; j++) {
+    for (long i = 0; i < N; i+=VecLen*2) {
+      (Vec::Load(X+VecLen*0+i) + Vec::Load(Y+VecLen*0+i)).Store(Y+VecLen*0+i);
+      (Vec::Load(X+VecLen*1+i) + Vec::Load(Y+VecLen*1+i)).Store(Y+VecLen*1+i);
+    }
+  }
+  T += omp_get_wtime();
+  std::cout<<"Bandwidth = "<< 3*N*Niter*sizeof(double)/T/1e9 <<" GB/s";
+  std::cout<<"    cycles/iter = "<< 3.3e9*T/(Niter*N/VecLen) <<"\n";
+}
+
+int main(int argc, char** argv) {
+  if (argc <= 1) {
+    std::cout<<"Usage: ./bandwidth <size-in-bytes> <#-of-iter>\n";
+    return 0;
+  }
+
+  long N = atol(argv[1]) / sizeof(double);
+  long Niter = (argc <= 2 ? std::max<long>(1,1e9/N) : atol(argv[2]));
+  std::cout<<"\nSize = "<< N*sizeof(double)<<", Iterations = "<< Niter<<"\n";
+  SCTL_ASSERT_MSG(N % 512 == 0, "size must be a multiple of 4096 bytes"); // because of vectorization and loop unrolling
+
+  //double* X = (double*)malloc(N*sizeof(double));
+  //double* Y = (double*)malloc(N*sizeof(double));
+  double* X = sctl::aligned_new<double>(N);
+  double* Y = sctl::aligned_new<double>(N);
+  for (long i = 0; i < N; i++) X[i] = Y[i] = i;
+
+  std::cout<<"\n\nWriting to array:\n";
+  profile_write(X, N, Niter);
+
+  std::cout<<"\n\nReading from array:\n";
+  profile_read(X, N, Niter);
+
+  std::cout<<"\n\nAdding arrays:\n";
+  profile_vector_add(Y, X, N, Niter);
+
+  //free(X);
+  //free(Y);
+  sctl::aligned_delete(X);
+  sctl::aligned_delete(Y);
+  return 0;
+}
+
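
The commented-out malloc calls versus sctl::aligned_new are the point of the alignment experiment above: with 64-byte-aligned data, vector loads and stores never straddle a cache-line boundary. For reference (not part of the commit), the same aligned allocation is available in the C++17 standard library; a minimal sketch, illustration only since this Makefile builds with -std=c++11:

// sketch: 64-byte-aligned allocation without SCTL (requires C++17)
#include <cstdint>
#include <cstdlib>
#include <iostream>

int main() {
  const long N = 1 << 20;                       // number of doubles (8 MiB)
  const std::size_t bytes = N * sizeof(double); // aligned_alloc needs a multiple of the alignment
  double* X = static_cast<double*>(std::aligned_alloc(64, bytes)); // 64-byte aligned, like sctl::aligned_new
  double* Y = static_cast<double*>(std::malloc(bytes));            // only aligned to alignof(max_align_t)
  std::cout << "X mod 64 = " << reinterpret_cast<std::uintptr_t>(X) % 64 << '\n'; // always 0
  std::cout << "Y mod 64 = " << reinterpret_cast<std::uintptr_t>(Y) % 64 << '\n'; // often nonzero
  std::free(X); // memory from aligned_alloc is released with free
  std::free(Y);
  return 0;
}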

+ 96 - 0
code/src/bandwidth-main-memory.cpp

@@ -0,0 +1,96 @@
+// example code showing cost of memory initialization (first-touch) and NUMA
+//
+// OpenMP thread pinning for NUMA
+// export OMP_PLACES=cores
+// export OMP_PROC_BIND=spread
+
+#include <iostream>
+#include <omp.h>
+#include <sctl.hpp>
+using Vec = sctl::Vec<double>;
+constexpr int VecLen = Vec::Size();
+
+// Benchmark to show cost of memory allocations
+void benchmark_memory_init() {
+  long N = 1e9; // 8 GB
+  double T;
+
+  // Allocate memory
+  T = -omp_get_wtime();
+  double* X = (double*)malloc(N*sizeof(double));
+  std::cout<<"Array alloc time = "<<T+omp_get_wtime()<<'\n';
+
+  // Initialize array
+  T = -omp_get_wtime();
+  for (long i = 0; i < N; i++) X[i] = i;
+  std::cout<<"Array init time  = "<<T+omp_get_wtime()<<'\n';
+
+  // Write to array
+  T = -omp_get_wtime();
+  for (long i = 0; i < N; i++) X[i] = 2*i;
+  std::cout<<"Array write time = "<<T+omp_get_wtime()<<'\n';
+
+  // Free memory
+  T = -omp_get_wtime();
+  free(X);
+  std::cout<<"Array free time  = "<<T+omp_get_wtime()<<'\n';
+}
+
+// Benchmark to show effect of NUMA
+void benchmark_numa(bool numa_aware) {
+  long N = 1e9; // 8 GB
+  double T;
+
+  // Allocate memory
+  double* X = sctl::aligned_new<double>(N);
+  double* Y = sctl::aligned_new<double>(N);
+
+  // Initialize X, Y : this is when memory pages are assigned to each NUMA node
+  if (numa_aware) {
+    #pragma omp parallel for schedule(static)
+    for (long i = 0; i < N; i++) X[i] = Y[i] = i;
+  } else {
+    for (long i = 0; i < N; i++) X[i] = Y[i] = i;
+  }
+
+  // Write to array
+  T = -omp_get_wtime();
+  #pragma omp parallel for schedule(static)
+  for (long i = 0; i < N; i++) X[i] = 3.14;
+  T += omp_get_wtime();
+  std::cout<<"Write Bandwidth   = "<< N*sizeof(double)/T/1e9 <<" GB/s\n";
+
+  // Read from array
+  double sum = 0;
+  T = -omp_get_wtime();
+  #pragma omp parallel for schedule(static) reduction(+:sum)
+  for (long i = 0; i < N; i++) sum += X[i];
+  T += omp_get_wtime();
+  std::cout<<"Read Bandwidth    = "<< N*sizeof(double)/T/1e9 <<" GB/s\n";
+  if (sum < 0) std::cout<<sum<<'\n';
+
+  // Adding arrays: 2-reads, 1-write
+  T = -omp_get_wtime();
+  #pragma omp parallel for schedule(static)
+  for (long i = 0; i < N; i++) Y[i] += X[i];
+  T += omp_get_wtime();
+  std::cout<<"Vec-Add Bandwidth = "<< 3*N*sizeof(double)/T/1e9 <<" GB/s\n";
+
+  sctl::aligned_delete(X);
+  sctl::aligned_delete(Y);
+}
+
+int main(int argc, char** argv) {
+
+  std::cout<<"\nBenchmarking memory initialization cost:\n";
+  benchmark_memory_init();
+
+  std::cout<<"\n\nBenchmarking main memory without parallel initialization (NUMA unaware):\n";
+  benchmark_numa(false);
+
+  std::cout<<"\n\nBenchmarking main memory with parallel initialization (NUMA aware):\n";
+  benchmark_numa(true);
+
+  return 0;
+}
+
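
The OMP_PLACES / OMP_PROC_BIND settings in the header comment only matter if they actually take effect. A small standalone check (not part of the commit), using the OpenMP 4.5 affinity query routines, prints which place each thread is bound to:

// sketch: verify OpenMP thread pinning (compile with -fopenmp)
#include <cstdio>
#include <omp.h>

int main() {
  std::printf("number of places = %d\n", omp_get_num_places());
  #pragma omp parallel
  {
    // with OMP_PLACES=cores and OMP_PROC_BIND=spread, threads should land on distinct places
    std::printf("thread %2d -> place %2d\n", omp_get_thread_num(), omp_get_place_num());
  }
  return 0;
}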

+ 8 - 2
code/src/gemm-blocking.cpp

@@ -1,9 +1,14 @@
 // example code showing blocking of GEMM to optimize memory access
+//
+// Cache profiling using perf:
+// perf stat -e L1-dcache-load-misses -e L1-dcache-loads -e l2_rqsts.miss -e l2_rqsts.references -e LLC-load-misses -e LLC-loads <path-to-binary>
 
 #include <iostream>
 #include <omp.h>
 #include <sctl.hpp>
 
+constexpr int VecLen = sctl::DefaultVecLen<double>();
+
 void GEMM_naive(int M, int N, int K, double* A, int LDA, double* B, int LDB, double* C, int LDC) {
   for (int j = 0; j < N; j++)
     for (int k = 0; k < K; k++)
@@ -75,11 +80,12 @@ int main(int argc, char** argv) {
   T = -omp_get_wtime();
   for (long i = 0; i < iter; i++)
     //GEMM_naive(M,N,K, A,M, B,K, C,M);
-    GEMM_blocked<M,N,K, 200,200,200, 40,40,40, 8,10,40>(A,M, B,K, C,M);
+    GEMM_blocked<M,N,K, 200,200,200, 40,40,40, VecLen,10,40>(A,M, B,K, C,M);
   T += omp_get_wtime();
+  std::cout<<"M = "<<M<<"  N = "<<N<<"  K = "<<K<<'\n';
   std::cout<<"T = "<<T<<"    GFLOPS = "<<2*M*N*K*iter/T/1e9<<'\n';
 
-  if (0) { // check
+  if (0) { // verify result
     T = -omp_get_wtime();
     for (long i = 0; i < iter; i++)
       GEMM_naive(M,N,K, A,M, B,K, C_ref,M);
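
GEMM_blocked itself is not shown in this hunk; judging by the call site, the three triples after M,N,K are presumably tile sizes for successive levels of the memory hierarchy, ending in a VecLen-wide register kernel. A single level of blocking can be sketched in plain C++ (block sizes BM, BN, BK are hypothetical, and the update follows the usual column-major convention of GEMM_naive):

// sketch: one level of cache blocking for C += A*B (column-major, leading dimensions LDA/LDB/LDC)
#include <algorithm>

void GEMM_blocked_1level(int M, int N, int K, const double* A, int LDA,
                         const double* B, int LDB, double* C, int LDC) {
  constexpr int BM = 40, BN = 40, BK = 40; // hypothetical tile sizes; tiles of A, B, C should fit in cache together
  for (int j0 = 0; j0 < N; j0 += BN)
  for (int k0 = 0; k0 < K; k0 += BK)
  for (int i0 = 0; i0 < M; i0 += BM) {
    const int jb = std::min(N, j0+BN), kb = std::min(K, k0+BK), ib = std::min(M, i0+BM);
    // same triple loop as the naive version, restricted to one tile so the data stays cache-resident
    for (int j = j0; j < jb; j++)
      for (int k = k0; k < kb; k++)
        for (int i = i0; i < ib; i++)
          C[i + j*LDC] += A[i + k*LDA] * B[k + j*LDB];
  }
}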

+ 3 - 1
code/src/gemm-ker.cpp

@@ -4,6 +4,8 @@
 #include <omp.h>
 #include <sctl.hpp>
 
+constexpr int VecLen = sctl::DefaultVecLen<double>();
+
 template <int M, int N, int K>
 void GEMM_ker_naive(double* C, double* A, double* B) {
   for (int k = 0; k < K; k++)
@@ -58,7 +60,7 @@ void GEMM_ker_vec_unrolled(double* C, double* A, double* B) {
 
 int main(int argc, char** argv) {
   long L = 1e6;
-  constexpr int M = 8, N = 10, K = 40;
+  constexpr int M = VecLen, N = 10, K = 40;
   double* C = new double[M*N];
   double* A = new double[M*K];
   double* B = new double[K*N];
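
The vectorized kernel GEMM_ker_vec_unrolled is not shown in this hunk, but the reason M is now tied to VecLen is register blocking: an M x N tile of C is kept in accumulators while A and B are streamed through it. A plain C++ sketch of that idea (layout assumed column-major, with leading dimension M for A and C and K for B, matching the sizes allocated above):

// sketch: register-blocked micro-kernel computing C += A*B for small fixed M, N, K
template <int M, int N, int K>
void GEMM_ker_sketch(double* C, const double* A, const double* B) {
  double c[M*N] = {0};              // accumulator tile, intended to live in vector registers
  for (int k = 0; k < K; k++)
    for (int j = 0; j < N; j++)
      for (int i = 0; i < M; i++)   // contiguous length-M loop; vectorizes when M == SIMD width
        c[i + j*M] += A[i + k*M] * B[k + j*K];
  for (int i = 0; i < M*N; i++) C[i] += c[i];
}

With M = VecLen and N = 10, the c tile occupies 10 vector registers and fits in the register file, which is the point of fixing M to the SIMD width.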

+ 13 - 14
code/src/instruction.cpp → code/src/instruction-cost.cpp

@@ -5,11 +5,12 @@
 #include <omp.h>
 
 #define CPU_clockrate 3.3 // GHz
+constexpr int VecLen = sctl::DefaultVecLen<double>();
 
-template <class Type, int K> void test_add() {
+template <class Type, int K> void test_add() { // add K elements of Type
   Type x[K], one = 1.0;
   for (long k = 0; k < K; k++)
-    x[k] = 3.14 + k;
+    x[k] = 3.14 + k; // initialize x[k]
 
   double T = -omp_get_wtime();
   for (long i = 0; i < 1000000000L; i++)
@@ -20,17 +21,16 @@ template <class Type, int K> void test_add() {
   std::cout<<"cycles/iter = "<< CPU_clockrate*T <<'\n';
   std::cout<<"cycles/iter = "<< CPU_clockrate*T <<'\n';
 
 
   // print the result otherwise the
   // print the result otherwise the
-  // compiler optimize out everything
+  // compiler optimizes everything away
   Type sum = 0.;
-  for (long k = 0; k < K; k++)
-    sum += x[k];
+  for (long k = 0; k < K; k++) sum += x[k];
   std::cout<<"Result = "<<sum<<'\n';
 }
 
-template <class Type, int K> void test_division() {
+template <class Type, int K> void test_division() { // divide K elements of Type
   Type x[K], one = 1.0;
   for (long k = 0; k < K; k++)
-    x[k] = 3.14 + k;
+    x[k] = 3.14 + k; // initialize x[k]
 
   double T = -omp_get_wtime();
   for (long i = 0; i < 1000000000L; i++)
@@ -41,10 +41,9 @@ template <class Type, int K> void test_division() {
   std::cout<<"cycles/iter = "<< CPU_clockrate*T <<'\n';
   std::cout<<"cycles/iter = "<< CPU_clockrate*T <<'\n';
 
 
   // print the result otherwise the
   // print the result otherwise the
-  // compiler optimize out everything
+  // compiler optimizes everything away
   Type sum = 0.;
-  for (long k = 0; k < K; k++)
-    sum += x[k];
+  for (long k = 0; k < K; k++) sum += x[k];
   std::cout<<"Result = "<<sum<<'\n';
 }
 
@@ -58,11 +57,11 @@ int main(int argc, char** argv) {
   std::cout<<"\n\nAdding 32 doubles at a time:\n";
   std::cout<<"\n\nAdding 32 doubles at a time:\n";
   test_add<double, 32>();
   test_add<double, 32>();
 
 
-  std::cout<<"\n\nAdding 8 Vec<doubles,8> at a time:\n";
-  test_add<sctl::Vec<double,8>, 8>();
+  std::cout<<"\n\nAdding 8 Vec<doubles,"<<VecLen<<"> at a time:\n";
+  test_add<sctl::Vec<double,VecLen>, 8>();
 
 
-  std::cout<<"\n\nDividing 8 Vec<doubles,8> at a time:\n";
-  test_division<sctl::Vec<double,8>, 8>();
+  std::cout<<"\n\nDividing 8 Vec<doubles,"<<VecLen<<"> at a time:\n";
+  test_division<sctl::Vec<double,8>,VecLen>();
 
 
   return 0;
   return 0;
 }
 }
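
The cycles/iter numbers hinge on the hard-coded CPU_clockrate of 3.3 GHz: the benchmark measures only wall-clock time and converts it. A worked example of that conversion (values are illustrative, not measured):

// sketch: converting measured wall-clock time to cycles per loop iteration
#include <iostream>

int main() {
  const double clockrate_GHz = 3.3;  // must match the actual core frequency (including turbo) to be meaningful
  const long   iters = 1000000000L;  // same iteration count as the benchmark loops above
  const double seconds = 0.91;       // illustrative measured run time
  std::cout << "cycles/iter = " << clockrate_GHz * 1e9 * seconds / iters << '\n'; // ~3.0
  return 0;
}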

+ 2 - 1
code/src/poly-eval.cpp

@@ -5,6 +5,7 @@
 #include <omp.h>
 
 #define CPU_clockrate 3.3 // GHz
+constexpr int VecLen = sctl::DefaultVecLen<double>();
 
 template <class Type> void test_polynomial() {
   Type a,b,c,d,e,f,g,h; // coefficients
@@ -64,7 +65,7 @@ int main(int argc, char** argv) {
 
   test_polynomial<double>(); // scalar
 
-  //test_polynomial<sctl::Vec<double,8>>(); // vectorized
+  //test_polynomial<sctl::Vec<double,VecLen>>(); // vectorized
 
   return 0;
 }
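
The body of test_polynomial is not part of this hunk; as a reference point (one possible coefficient convention, not necessarily the file's), Horner's rule evaluates a degree-7 polynomial in 7 multiplies and 7 adds, and the same template works for the scalar and the Vec<double,VecLen> instantiation provided the vector type defines * and +:

// sketch: Horner's rule for p(x) = a*x^7 + b*x^6 + c*x^5 + d*x^4 + e*x^3 + f*x^2 + g*x + h
template <class Type>
Type poly_eval_horner(Type x, Type a, Type b, Type c, Type d,
                      Type e, Type f, Type g, Type h) {
  // each step is one multiply and one add (a single FMA), but the steps form one long dependency chain
  return ((((((a*x + b)*x + c)*x + d)*x + e)*x + f)*x + g)*x + h;
}

The dependency chain means such a loop runs at FMA latency rather than throughput; splitting the polynomial into independent sub-expressions (Estrin-style) or evaluating several x values per iteration are the usual ways to recover throughput.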