Parcourir la source

Fix multiply defined symbols.

- Fix multiply defined symbols by moving global variable definitions
  from device_wrapper.txx to device_wrapper.cpp.
- Change the configure script to use ARFLAGS="cru -qoffload-build" and
  AR="xiar" when compiling the library for Phi.
- Add a template parameter to specify device synchronization. This fixes
  an issue that caused linking errors when using a different option than
  the one that was used to compile the library.
- Reorganize includes, making sure no circular dependencies exist.
Dhairya Malhotra il y a 11 ans
Parent
commit
8c2b816f95
59 fichiers modifiés avec 422 ajouts et 263 suppressions
  1. 6 0
      .gitignore
  2. 2 0
      Makefile.am
  3. 8 2
      configure.ac
  4. 6 6
      examples/Makefile
  5. 4 1
      examples/src/example1.cpp
  6. 5 4
      examples/src/fmm_cheb.cpp
  7. 7 6
      include/cheb_node.hpp
  8. 6 4
      include/cheb_node.txx
  9. 5 4
      include/cheb_utils.hpp
  10. 10 3
      include/cheb_utils.txx
  11. 8 6
      include/device_wrapper.hpp
  12. 14 13
      include/device_wrapper.txx
  13. 3 3
      include/dtypes.h
  14. 9 4
      include/fft_wrapper.hpp
  15. 9 6
      include/fmm_cheb.hpp
  16. 11 3
      include/fmm_cheb.txx
  17. 5 2
      include/fmm_gll.hpp
  18. 7 5
      include/fmm_node.hpp
  19. 5 0
      include/fmm_node.txx
  20. 15 6
      include/fmm_pts.hpp
  21. 22 12
      include/fmm_pts.txx
  22. 8 5
      include/fmm_tree.hpp
  23. 9 1
      include/fmm_tree.txx
  24. 5 3
      include/interac_list.hpp
  25. 4 4
      include/interac_list.txx
  26. 28 38
      include/kernel.hpp
  27. 24 9
      include/kernel.txx
  28. 3 2
      include/legendre_rule.hpp
  29. 2 2
      include/mat_utils.hpp
  30. 6 4
      include/mat_utils.txx
  31. 5 5
      include/matrix.hpp
  32. 8 2
      include/matrix.txx
  33. 12 10
      include/mem_mgr.hpp
  34. 2 7
      include/mem_utils.hpp
  35. 1 2
      include/mem_utils.txx
  36. 5 5
      include/mortonid.hpp
  37. 2 0
      include/mortonid.txx
  38. 6 3
      include/mpi_node.hpp
  39. 5 3
      include/mpi_node.txx
  40. 5 4
      include/mpi_tree.hpp
  41. 15 4
      include/mpi_tree.txx
  42. 3 3
      include/ompUtils.txx
  43. 4 4
      include/parUtils.h
  44. 6 4
      include/parUtils.txx
  45. 5 3
      include/precomp_mat.hpp
  46. 9 1
      include/precomp_mat.txx
  47. 4 4
      include/profile.hpp
  48. 12 7
      include/pvfmm.hpp
  49. 6 2
      include/pvfmm_common.hpp
  50. 2 4
      include/quad_utils.hpp
  51. 4 2
      include/quad_utils.txx
  52. 6 3
      include/tree.hpp
  53. 1 4
      include/tree.txx
  54. 2 4
      include/tree_node.hpp
  55. 5 4
      include/vector.hpp
  56. 4 4
      include/vector.txx
  57. 6 6
      m4/ac_check_intel_offload.m4
  58. 19 0
      src/device_wrapper.cpp
  59. 2 1
      src/fmm_gll.cpp

+ 6 - 0
.gitignore

@@ -22,3 +22,9 @@
 /src/*.o
 /stamp-h1
 
+m4/libtool.m4
+m4/ltoptions.m4
+m4/ltsugar.m4
+m4/ltversion.m4
+m4/lt~obsolete.m4
+libtool

+ 2 - 0
Makefile.am

@@ -100,6 +100,7 @@ lib_libfmm_a_HEADERS = \
 # the sources to add to the library and to add to the source distribution
 lib_libpvfmm_a_SOURCES = \
 									$(lib_libpvfmm_a_HEADERS) \
+									src/device_wrapper.cpp \
 									src/fmm_gll.cpp \
 									src/legendre_rule.cpp \
 									src/mortonid.cpp \
@@ -216,6 +217,7 @@ clean-local: clean-doxygen
 	cd $(EX_DIR) && $(MAKE) clean;
 	$(RM) -r $(RESULT_DIR)/*
 	$(RM) *~ */*~ */*/*~
+	$(RM) ./lib/*
 
 #------------------------------------------------------------------------------
 

+ 8 - 2
configure.ac

@@ -25,8 +25,14 @@ AC_PROG_INSTALL
 AC_PROG_MKDIR_P
 AC_PROG_LN_S
 AC_PROG_MAKE_SET
-AC_PROG_RANLIB
-#AM_PROG_AR
+#AC_PROG_RANLIB
+##AM_PROG_AR
+
+# automake 1.12 seems to require AM_PROG_AR, but automake 1.11 doesn't
+# recognize it
+m4_ifdef([AM_PROG_AR], [AM_PROG_AR])
+LT_INIT
+AC_PROG_LIBTOOL
 
 # Check for CUDA
 AX_CHECK_CUDA

+ 6 - 6
examples/Makefile

@@ -27,15 +27,15 @@ ifeq ($(INTEL_OFFLOAD_OK),yes)
 
 $(BINDIR)/%: $(OBJDIR)/%.o
 	-@$(MKDIRS) $(dir $@)
-	$(CXX_PVFMM) $(CXXFLAGS_PVFMM) -no-offload      $^       $(LDFLAGS_PVFMM) -o $@
-	$(CXX_PVFMM) $(CXXFLAGS_PVFMM)                  $^_mic   $(LDFLAGS_PVFMM) -o $@_mic
-	$(CXX_PVFMM) $(CXXFLAGS_PVFMM) -D__MIC_ASYNCH__ $^_async $(LDFLAGS_PVFMM) -o $@_async
+	$(CXX_PVFMM) $(CXXFLAGS_PVFMM) -no-offload         $^       $(LDFLAGS_PVFMM) -o $@
+	$(CXX_PVFMM) $(CXXFLAGS_PVFMM)                     $^_async $(LDFLAGS_PVFMM) -o $@_async
+	$(CXX_PVFMM) $(CXXFLAGS_PVFMM) -D__DEVICE_SYNC__=1 $^_mic   $(LDFLAGS_PVFMM) -o $@_mic
 
 $(OBJDIR)/%.o: $(SRCDIR)/%.cpp
 	-@$(MKDIRS) $(dir $@)
-	$(CXX_PVFMM) $(CXXFLAGS_PVFMM) -no-offload      -I$(INCDIR) -c $^ -o $@
-	$(CXX_PVFMM) $(CXXFLAGS_PVFMM)                  -I$(INCDIR) -c $^ -o $@_mic
-	$(CXX_PVFMM) $(CXXFLAGS_PVFMM) -D__MIC_ASYNCH__ -I$(INCDIR) -c $^ -o $@_async
+	$(CXX_PVFMM) $(CXXFLAGS_PVFMM) -no-offload         -I$(INCDIR) -c $^ -o $@
+	$(CXX_PVFMM) $(CXXFLAGS_PVFMM)                     -I$(INCDIR) -c $^ -o $@_async
+	$(CXX_PVFMM) $(CXXFLAGS_PVFMM) -D__DEVICE_SYNC__=1 -I$(INCDIR) -c $^ -o $@_mic
 
 else
 

+ 4 - 1
examples/src/example1.cpp

@@ -76,12 +76,15 @@ void fmm_test(size_t N, int mult_order, MPI_Comm comm){
   for(size_t i=0;i< src_value.size();i++)  src_value[i]=drand48();
   for(size_t i=0;i<surf_value.size();i++) surf_value[i]=drand48();
 
+  // Create memory-manager (optional)
+  pvfmm::mem::MemoryManager mem_mgr(10000000);
+
   // Construct tree.
   size_t max_pts=300;
   pvfmm::PtFMM_Tree* tree=PtFMM_CreateTree(src_coord, src_value, surf_coord, surf_value, trg_coord, comm, max_pts, pvfmm::FreeSpace);
 
   // Load matrices.
-  pvfmm::PtFMM matrices;
+  pvfmm::PtFMM matrices(&mem_mgr);
   matrices.Initialize(mult_order, comm, &kernel_fn, &kernel_fn_aux);
 
   // FMM Setup

+ 5 - 4
examples/src/fmm_cheb.cpp

@@ -9,6 +9,7 @@
 #include <fmm_cheb.hpp>
 #include <fmm_node.hpp>
 #include <fmm_tree.hpp>
+#include <cheb_node.hpp>
 #include <utils.hpp>
 
 //////////////////////////////////////////////////////////////////////////////
@@ -229,16 +230,16 @@ void fmm_test(int test_case, size_t N, size_t M, bool unif, int mult_order, int
       fn_input_=fn_input_t1<Real_t>;
       fn_poten_=fn_poten_t1<Real_t>;
       fn_grad_ =fn_grad_t1<Real_t>;
-      mykernel     =pvfmm::LaplaceKernel<Real_t>::potn_ker;
-      //mykernel_grad=pvfmm::LaplaceKernel<Real_t>::grad_ker;
+      mykernel     =&pvfmm::LaplaceKernel<Real_t>::potn_ker();
+      //mykernel_grad=&pvfmm::LaplaceKernel<Real_t>::grad_ker();
       bndry=pvfmm::Periodic;
       break;
     case 2:
       fn_input_=fn_input_t2<Real_t>;
       fn_poten_=fn_poten_t2<Real_t>;
       fn_grad_ =fn_grad_t2<Real_t>;
-      mykernel     =pvfmm::LaplaceKernel<Real_t>::potn_ker;
-      //mykernel_grad=pvfmm::LaplaceKernel<Real_t>::grad_ker;
+      mykernel     =&pvfmm::LaplaceKernel<Real_t>::potn_ker();
+      //mykernel_grad=&pvfmm::LaplaceKernel<Real_t>::grad_ker();
       bndry=pvfmm::FreeSpace;
       break;
     case 3:

+ 7 - 6
include/cheb_node.hpp

@@ -5,15 +5,16 @@
  * \brief This is a derived cheb class of MPI_Node.
  */
 
-#ifndef _PVFMM_CHEB_NODE_HPP_
-#define _PVFMM_CHEB_NODE_HPP_
+#include <vector>
+#include <stdint.h>
 
 #include <pvfmm_common.hpp>
-#include <assert.h>
-#include <vector.hpp>
+#include <tree_node.hpp>
 #include <mpi_node.hpp>
-#include <mortonid.hpp>
-#include <cheb_utils.hpp>
+#include <vector.hpp>
+
+#ifndef _PVFMM_CHEB_NODE_HPP_
+#define _PVFMM_CHEB_NODE_HPP_
 
 namespace pvfmm{
 

+ 6 - 4
include/cheb_node.txx

@@ -1,14 +1,16 @@
 /**
- * \file cheb_node.cpp
+ * \file cheb_node.txx
  * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
  * \date 1-22-2010
  * \brief This file contains the implementation of the class Cheb_Node.
  */
 
-#include <iostream>
-#include <matrix.hpp>
-#include <omp.h>
+#include <cmath>
+#include <cassert>
+#include <algorithm>
+
 #include <cheb_utils.hpp>
+#include <matrix.hpp>
 
 namespace pvfmm{
 

+ 5 - 4
include/cheb_utils.hpp

@@ -5,13 +5,14 @@
  * \brief This file contains chebyshev related functions.
  */
 
-#ifndef _PVFMM_CHEB_UTILS_HPP_
-#define _PVFMM_CHEB_UTILS_HPP_
+#include <vector>
 
 #include <pvfmm_common.hpp>
-#include <vector>
-#include <kernel.hpp>
 #include <vector.hpp>
+#include <kernel.hpp>
+
+#ifndef _PVFMM_CHEB_UTILS_HPP_
+#define _PVFMM_CHEB_UTILS_HPP_
 
 namespace pvfmm{
 

+ 10 - 3
include/cheb_utils.txx

@@ -5,11 +5,18 @@
  * \brief This file contains chebyshev related functions.
  */
 
-#include <assert.h>
+#include <omp.h>
+#include <cmath>
+#include <cassert>
+#include <iostream>
 #include <algorithm>
-#include <matrix.hpp>
-#include <mem_mgr.hpp>
+
 #include <legendre_rule.hpp>
+#include <mem_utils.hpp>
+#include <mat_utils.hpp>
+#include <mem_mgr.hpp>
+#include <matrix.hpp>
+#include <profile.hpp>
 
 namespace pvfmm{
 

+ 8 - 6
include/device_wrapper.hpp

@@ -5,13 +5,13 @@
  * \brief This file contains definition of DeviceWrapper.
  */
 
-#ifndef _PVFMM_DEVICE_WRAPPER_HPP_
-#define _PVFMM_DEVICE_WRAPPER_HPP_
-
-#include <cstdlib>
-#include <cassert>
 #include <stdint.h>
+
 #include <pvfmm_common.hpp>
+#include <vector.hpp>
+
+#ifndef _PVFMM_DEVICE_WRAPPER_HPP_
+#define _PVFMM_DEVICE_WRAPPER_HPP_
 
 #ifdef __INTEL_OFFLOAD
 #pragma offload_attribute(push,target(mic))
@@ -24,8 +24,10 @@ namespace DeviceWrapper{
 
   void free_device(char* dev_handle, uintptr_t dev_ptr);
 
+  template <int SYNC=__DEVICE_SYNC__>
   int host2device(char* host_ptr, char* dev_handle, uintptr_t dev_ptr, size_t len);
 
+  template <int SYNC=__DEVICE_SYNC__>
   int device2host(char* dev_handle, uintptr_t dev_ptr, char* host_ptr, size_t len);
 
   void wait(int lock_idx);
@@ -51,7 +53,7 @@ Note: Any MIC offload section should look like this:
       MIC_Lock::release_lock(lock_idx);
     }
 
-    #ifdef __MIC_ASYNCH__
+    #ifdef __DEVICE_SYNC__
     MIC_Lock::wait_lock(lock_idx);
     #endif
 

+ 14 - 13
include/device_wrapper.txx

@@ -5,8 +5,9 @@
  * \brief This file contains implementation of DeviceWrapper.
  */
 
-#include <vector.hpp>
-#include <device_wrapper.hpp>
+#include <omp.h>
+#include <cassert>
+#include <cstdlib>
 
 namespace pvfmm{
 
@@ -14,6 +15,10 @@ namespace DeviceWrapper{
 
   // MIC functions
 
+  #define ALLOC alloc_if(1) free_if(0)
+  #define FREE alloc_if(0) free_if(1)
+  #define REUSE alloc_if(0) free_if(0)
+
   inline uintptr_t alloc_device_mic(char* dev_handle, size_t len){
     assert(dev_handle!=NULL);
     uintptr_t dev_ptr=(uintptr_t)NULL;
@@ -52,10 +57,6 @@ namespace DeviceWrapper{
         MIC_Lock::release_lock(lock_idx);
       }
     }
-    #ifndef __MIC_ASYNCH__ // Wait
-    #pragma offload target(mic:0)
-    {MIC_Lock::wait_lock(lock_idx);}
-    #endif
     return lock_idx;
     #endif
     return -1;
@@ -80,9 +81,6 @@ namespace DeviceWrapper{
         MIC_Lock::release_lock(lock_idx);
       }
     }
-    #ifndef __MIC_ASYNCH__ // Wait
-    MIC_Lock::wait_lock(lock_idx);
-    #endif
     return lock_idx;
     #endif
     return -1;
@@ -116,20 +114,27 @@ namespace DeviceWrapper{
     #endif
   }
 
+  template <int SYNC=__DEVICE_SYNC__>
   inline int host2device(char* host_ptr, char* dev_handle, uintptr_t dev_ptr, size_t len){
     int lock_idx=-1;
     #ifdef __INTEL_OFFLOAD
     lock_idx=host2device_mic(host_ptr,dev_handle,dev_ptr,len);
+    if(SYNC){
+      #pragma offload target(mic:0)
+      {MIC_Lock::wait_lock(lock_idx);}
+    }
     #else
     ;
     #endif
     return lock_idx;
   }
 
+  template <int SYNC=__DEVICE_SYNC__>
   inline int device2host(char* dev_handle, uintptr_t dev_ptr, char* host_ptr, size_t len){
     int lock_idx=-1;
     #ifdef __INTEL_OFFLOAD
     lock_idx=device2host_mic(dev_handle,dev_ptr, host_ptr, len);
+    if(SYNC) MIC_Lock::wait_lock(lock_idx);
     #else
     ;
     #endif
@@ -230,8 +235,4 @@ namespace DeviceWrapper{
     #endif
   }
 
-  Vector<char> MIC_Lock::lock_vec;
-  Vector<char>::Device MIC_Lock::lock_vec_;
-  int MIC_Lock::lock_idx;
-
 }//end namespace

+ 3 - 3
include/dtypes.h

@@ -1,9 +1,9 @@
-#ifndef __PVFMM_DTYPES_H_
-#define __PVFMM_DTYPES_H_
-
 #include <mpi.h>
 #include <complex>
 
+#ifndef __PVFMM_DTYPES_H_
+#define __PVFMM_DTYPES_H_
+
 /**
  * \file	dtypes.h
  * \brief	Traits to determine MPI_DATATYPE from a C++ datatype

+ 9 - 4
include/fft_wrapper.hpp

@@ -1,20 +1,25 @@
 /**
- * \file mat_utils.hpp
+ * \file fft_wrapper.hpp
  * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
  * \date 2-11-2011
  * \brief This file contains FFTW3 wrapper functions.
  */
 
-#ifndef _PVFMM_FFT_WRAPPER_
-#define _PVFMM_FFT_WRAPPER_
-
+#include <cmath>
+#include <cassert>
+#include <vector>
 #include <fftw3.h>
 #ifdef FFTW3_MKL
 #include <fftw3_mkl.h>
 #endif
+
 #include <pvfmm_common.hpp>
+#include <mem_utils.hpp>
 #include <matrix.hpp>
 
+#ifndef _PVFMM_FFT_WRAPPER_
+#define _PVFMM_FFT_WRAPPER_
+
 namespace pvfmm{
 
 template<class T>

+ 9 - 6
include/fmm_cheb.hpp

@@ -6,16 +6,19 @@
  * This handles all the translations through matrix multiplications.
  */
 
-#ifndef _PVFMM_FMM_CHEB_HPP_
-#define _PVFMM_FMM_CHEB_HPP_
+#include <mpi.h>
+#include <vector>
 
 #include <pvfmm_common.hpp>
-#include <mpi.h>
-#include <matrix.hpp>
 #include <precomp_mat.hpp>
-#include <cheb_utils.hpp>
-#include <cheb_node.hpp>
+#include <mem_mgr.hpp>
 #include <fmm_pts.hpp>
+#include <vector.hpp>
+#include <matrix.hpp>
+#include <kernel.hpp>
+
+#ifndef _PVFMM_FMM_CHEB_HPP_
+#define _PVFMM_FMM_CHEB_HPP_
 
 namespace pvfmm{
 

+ 11 - 3
include/fmm_cheb.txx

@@ -5,13 +5,21 @@
  * \brief This file contains the implementation of the FMM_Cheb class.
  */
 
-#include <mpi.h>
-#include <fftw3.h>
-#include <parUtils.h>
+#include <omp.h>
+#include <sstream>
+#include <iostream>
+#include <cstdlib>
+#include <cmath>
 #ifdef PVFMM_HAVE_SYS_STAT_H
 #include <sys/stat.h>
 #endif
 
+#include <dtypes.h>
+#include <parUtils.h>
+#include <cheb_utils.hpp>
+#include <mem_utils.hpp>
+#include <profile.hpp>
+
 namespace pvfmm{
 
 template <class FMMNode>

+ 5 - 2
include/fmm_gll.hpp

@@ -1,8 +1,11 @@
-#ifndef _PVFMM_FMM_GLL_HPP_
-#define _PVFMM_FMM_GLL_HPP_
 
 #include <mpi.h>
 
+#include <pvfmm_common.hpp>
+
+#ifndef _PVFMM_FMM_GLL_HPP_
+#define _PVFMM_FMM_GLL_HPP_
+
 #ifdef __cplusplus
 extern "C" {
 #endif

+ 7 - 5
include/fmm_node.hpp

@@ -5,19 +5,21 @@
  * \brief This file contains the definition of the FMM_Node class.
  */
 
-#ifndef _PVFMM_FMM_NODE_HPP_
-#define _PVFMM_FMM_NODE_HPP_
+#include <vector>
 
-#include <mpi.h>
-#include <iostream>
 #include <pvfmm_common.hpp>
+#include <tree_node.hpp>
 #include <mpi_node.hpp>
 #include <fmm_pts.hpp>
+#include <vector.hpp>
+
+#ifndef _PVFMM_FMM_NODE_HPP_
+#define _PVFMM_FMM_NODE_HPP_
 
 namespace pvfmm{
 
 /**
- * \brief Base class for node of FMM_Tree.
+ * \brief Base class for node of FMM_Node.
  */
 template <class Node>
 class FMM_Node: public Node{

+ 5 - 0
include/fmm_node.txx

@@ -5,6 +5,11 @@
  * \brief This file contains the implementation of the FMM_Node class.
  */
 
+#include <cassert>
+
+#include <mem_utils.hpp>
+#include <mpi_node.hpp>
+
 namespace pvfmm{
 
 template <class Node>

+ 15 - 6
include/fmm_pts.hpp

@@ -6,16 +6,23 @@
  * This handles all the translations for point sources and targets.
  */
 
-#ifndef _PVFMM_FMM_PTS_HPP_
-#define _PVFMM_FMM_PTS_HPP_
+#include <mpi.h>
+#include <string>
+#include <vector>
 
 #include <pvfmm_common.hpp>
-#include <mpi.h>
-#include <matrix.hpp>
-#include <precomp_mat.hpp>
 #include <interac_list.hpp>
-#include <kernel.hpp>
+#include <precomp_mat.hpp>
+#include <fft_wrapper.hpp>
+#include <mem_utils.hpp>
 #include <mpi_node.hpp>
+#include <mem_mgr.hpp>
+#include <vector.hpp>
+#include <matrix.hpp>
+#include <kernel.hpp>
+
+#ifndef _PVFMM_FMM_PTS_HPP_
+#define _PVFMM_FMM_PTS_HPP_
 
 namespace pvfmm{
 
@@ -130,9 +137,11 @@ class FMM_Pts{
 
   void SetupPrecomp(SetupData<Real_t>& setup_data, bool device=false);
   void SetupInterac(SetupData<Real_t>& setup_data, bool device=false);
+  template <int SYNC=__DEVICE_SYNC__>
   void EvalList    (SetupData<Real_t>& setup_data, bool device=false); // Run on CPU by default.
 
   void SetupInteracPts(SetupData<Real_t>& setup_data, bool shift_src, bool shift_trg, Matrix<Real_t>* M, bool device);
+  template <int SYNC=__DEVICE_SYNC__>
   void EvalListPts    (SetupData<Real_t>& setup_data, bool device=false); // Run on CPU by default.
 
   /**

+ 22 - 12
include/fmm_pts.txx

@@ -5,11 +5,14 @@
  * \brief This file contains the implementation of the FMM_Pts class.
  */
 
-#include <mpi.h>
-#include <set>
+#include <omp.h>
+#include <cmath>
+#include <cstdlib>
+#include <cassert>
 #include <sstream>
-#include <fft_wrapper.hpp>
-#include <mat_utils.hpp>
+#include <iostream>
+#include <stdint.h>
+#include <set>
 #ifdef PVFMM_HAVE_SYS_STAT_H
 #include <sys/stat.h>
 #endif
@@ -17,6 +20,9 @@
 #ifdef __SSE__
 #include <xmmintrin.h>
 #endif
+#ifdef __SSE2__
+#include <emmintrin.h>
+#endif
 #ifdef __SSE3__
 #include <pmmintrin.h>
 #endif
@@ -27,6 +33,8 @@
 #include <immintrin.h>
 #endif
 
+#include <profile.hpp>
+
 namespace pvfmm{
 
 /**
@@ -1485,6 +1493,7 @@ void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
 }
 
 template <class FMMNode>
+template <int SYNC=__DEVICE_SYNC__>
 void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
   if(setup_data.interac_data.Dim(0)==0 || setup_data.interac_data.Dim(1)==0){
     Profile::Tic("Host2Device",&this->comm,false,25);
@@ -1722,11 +1731,11 @@ void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
     if(device) MIC_Lock::release_lock(lock_idx);
   }
 
-  #ifndef __MIC_ASYNCH__
   #ifdef __INTEL_OFFLOAD
-  #pragma offload if(device) target(mic:0)
-  {if(device) MIC_Lock::wait_lock(lock_idx);}
-  #endif
+  if(SYNC){
+    #pragma offload if(device) target(mic:0)
+    {if(device) MIC_Lock::wait_lock(lock_idx);}
+  }
   #endif
 
   Profile::Toc();
@@ -3089,6 +3098,7 @@ void FMM_Pts<FMMNode>::SetupInteracPts(SetupData<Real_t>& setup_data, bool shift
 }
 
 template <class FMMNode>
+template <int SYNC=__DEVICE_SYNC__>
 void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
   if(setup_data.interac_data.Dim(0)==0 || setup_data.interac_data.Dim(1)==0){
     Profile::Tic("Host2Device",&this->comm,false,25);
@@ -3268,11 +3278,11 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
     if(device) MIC_Lock::release_lock(lock_idx);
   }
 
-  #ifndef __MIC_ASYNCH__
   #ifdef __INTEL_OFFLOAD
-  #pragma offload if(device) target(mic:0)
-  {if(device) MIC_Lock::wait_lock(lock_idx);}
-  #endif
+  if(SYNC){
+    #pragma offload if(device) target(mic:0)
+    {if(device) MIC_Lock::wait_lock(lock_idx);}
+  }
   #endif
 
   Profile::Toc();

+ 8 - 5
include/fmm_tree.hpp

@@ -5,14 +5,17 @@
  * \brief This file contains the definition of the FMM_Tree class.
  */
 
-#ifndef _PVFMM_FMM_TREE_HPP_
-#define _PVFMM_FMM_TREE_HPP_
-
-#include <iostream>
 #include <mpi.h>
+#include <vector>
+
 #include <pvfmm_common.hpp>
-#include <mpi_tree.hpp>
+#include <interac_list.hpp>
 #include <fmm_node.hpp>
+#include <mpi_tree.hpp>
+#include <matrix.hpp>
+
+#ifndef _PVFMM_FMM_TREE_HPP_
+#define _PVFMM_FMM_TREE_HPP_
 
 namespace pvfmm{
 

+ 9 - 1
include/fmm_tree.txx

@@ -5,9 +5,17 @@
  * \brief This file contains the implementation of the class FMM_Tree.
  */
 
-#include <assert.h>
+#include <omp.h>
+#include <sstream>
+#include <iomanip>
+#include <cassert>
+
+#include <mpi_node.hpp>
 #include <fmm_node.hpp>
+#include <mem_utils.hpp>
+#include <mortonid.hpp>
 #include <profile.hpp>
+#include <vector.hpp>
 
 namespace pvfmm{
 

+ 5 - 3
include/interac_list.hpp

@@ -7,12 +7,14 @@
  * symmetry class for each interaction.
  */
 
-#ifndef _PVFMM_INTERAC_LIST_HPP_
-#define _PVFMM_INTERAC_LIST_HPP_
+#include <vector>
 
 #include <pvfmm_common.hpp>
-#include <tree_node.hpp>
 #include <precomp_mat.hpp>
+#include <matrix.hpp>
+
+#ifndef _PVFMM_INTERAC_LIST_HPP_
+#define _PVFMM_INTERAC_LIST_HPP_
 
 namespace pvfmm{
 

+ 4 - 4
include/interac_list.txx

@@ -7,10 +7,10 @@
  * symmetry class for each interaction.
  */
 
-#include <math.h>
-#include <algorithm>
-#include <tree_node.hpp>
-#include <precomp_mat.hpp>
+#include <cmath>
+#include <cassert>
+
+#include <parUtils.h>
 #include <ompUtils.h>
 
 namespace pvfmm{

+ 28 - 38
include/kernel.hpp

@@ -6,13 +6,13 @@
  * implementation of various kernels for FMM.
  */
 
-#ifndef _PVFMM_FMM_KERNEL_HPP_
-#define _PVFMM_FMM_KERNEL_HPP_
+#include <string>
 
 #include <pvfmm_common.hpp>
-#include <quad_utils.hpp>
 #include <mem_mgr.hpp>
-#include <string>
+
+#ifndef _PVFMM_FMM_KERNEL_HPP_
+#define _PVFMM_FMM_KERNEL_HPP_
 
 namespace pvfmm{
 
@@ -41,7 +41,7 @@ struct Kernel{
    * \brief Constructor.
    */
   Kernel(Ker_t poten, Ker_t dbl_poten, const char* name, int dim_,
-         const int (&k_dim)[2], bool homogen_=false, T ker_scale=0,
+         std::pair<int,int> k_dim, bool homogen_=false, T ker_scale=0,
          size_t dev_poten=(size_t)NULL, size_t dev_dbl_poten=(size_t)NULL);
 
   /**
@@ -73,7 +73,7 @@ struct Kernel{
 template<typename T, void (*A)(T*, int, T*, int, T*, int, T*, mem::MemoryManager* mem_mgr),
                      void (*B)(T*, int, T*, int, T*, int, T*, mem::MemoryManager* mem_mgr)>
 Kernel<T> BuildKernel(const char* name, int dim,
-         const int (&k_dim)[2], bool homogen=false, T ker_scale=0){
+         std::pair<int,int> k_dim, bool homogen=false, T ker_scale=0){
   size_t dev_ker_poten      ;
   size_t dev_dbl_layer_poten;
   #ifdef __INTEL_OFFLOAD
@@ -91,7 +91,7 @@ Kernel<T> BuildKernel(const char* name, int dim,
 
 template<typename T, void (*A)(T*, int, T*, int, T*, int, T*, mem::MemoryManager* mem_mgr)>
 Kernel<T> BuildKernel(const char* name, int dim,
-         const int (&k_dim)[2], bool homogen=false, T ker_scale=0){
+         std::pair<int,int> k_dim, bool homogen=false, T ker_scale=0){
   size_t dev_ker_poten      ;
   #ifdef __INTEL_OFFLOAD
   #pragma offload target(mic:0)
@@ -133,36 +133,33 @@ void laplace_grad(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cn
 
 
 
-int dim_laplace_poten[2]={1,1};
-int dim_laplace_grad [2]={1,3};
-
 #ifdef QuadReal_t
-const Kernel<QuadReal_t> laplace_potn_q=BuildKernel<QuadReal_t, laplace_poten, laplace_dbl_poten>("laplace"     , 3, dim_laplace_poten, true, 1.0);
-const Kernel<QuadReal_t> laplace_grad_q=BuildKernel<QuadReal_t, laplace_grad                    >("laplace_grad", 3, dim_laplace_grad , true, 2.0);
+const Kernel<QuadReal_t> laplace_potn_q=BuildKernel<QuadReal_t, laplace_poten, laplace_dbl_poten>("laplace"     , 3, std::pair<int,int>(1,1), true, 1.0);
+const Kernel<QuadReal_t> laplace_grad_q=BuildKernel<QuadReal_t, laplace_grad                    >("laplace_grad", 3, std::pair<int,int>(1,3), true, 2.0);
 #endif
 
-const Kernel<double    > laplace_potn_d=BuildKernel<double    , laplace_poten, laplace_dbl_poten>("laplace"     , 3, dim_laplace_poten, true, 1.0);
-const Kernel<double    > laplace_grad_d=BuildKernel<double    , laplace_grad                    >("laplace_grad", 3, dim_laplace_grad , true, 2.0);
+const Kernel<double    > laplace_potn_d=BuildKernel<double    , laplace_poten, laplace_dbl_poten>("laplace"     , 3, std::pair<int,int>(1,1), true, 1.0);
+const Kernel<double    > laplace_grad_d=BuildKernel<double    , laplace_grad                    >("laplace_grad", 3, std::pair<int,int>(1,3), true, 2.0);
 
-const Kernel<float     > laplace_potn_f=BuildKernel<float     , laplace_poten, laplace_dbl_poten>("laplace"     , 3, dim_laplace_poten, true, 1.0);
-const Kernel<float     > laplace_grad_f=BuildKernel<float     , laplace_grad                    >("laplace_grad", 3, dim_laplace_grad , true, 2.0);
+const Kernel<float     > laplace_potn_f=BuildKernel<float     , laplace_poten, laplace_dbl_poten>("laplace"     , 3, std::pair<int,int>(1,1), true, 1.0);
+const Kernel<float     > laplace_grad_f=BuildKernel<float     , laplace_grad                    >("laplace_grad", 3, std::pair<int,int>(1,3), true, 2.0);
 
 template<class T>
 struct LaplaceKernel{
-  static Kernel<T>* potn_ker;
-  static Kernel<T>* grad_ker;
+  inline static const Kernel<T>& potn_ker();
+  inline static const Kernel<T>& grad_ker();
 };
 
 #ifdef QuadReal_t
-template<> Kernel<QuadReal_t>* LaplaceKernel<QuadReal_t>::potn_ker=(Kernel<QuadReal_t>*)&laplace_potn_q;
-template<> Kernel<QuadReal_t>* LaplaceKernel<QuadReal_t>::grad_ker=(Kernel<QuadReal_t>*)&laplace_grad_q;
+template<> const Kernel<QuadReal_t>& LaplaceKernel<QuadReal_t>::potn_ker(){ return laplace_potn_q; };
+template<> const Kernel<QuadReal_t>& LaplaceKernel<QuadReal_t>::grad_ker(){ return laplace_grad_q; };
 #endif
 
-template<> Kernel<double>* LaplaceKernel<double>::potn_ker=(Kernel<double>*)&laplace_potn_d;
-template<> Kernel<double>* LaplaceKernel<double>::grad_ker=(Kernel<double>*)&laplace_grad_d;
+template<> const Kernel<double>& LaplaceKernel<double>::potn_ker(){ return laplace_potn_d; };
+template<> const Kernel<double>& LaplaceKernel<double>::grad_ker(){ return laplace_grad_d; };
 
-template<> Kernel<float>* LaplaceKernel<float>::potn_ker=(Kernel<float>*)&laplace_potn_f;
-template<> Kernel<float>* LaplaceKernel<float>::grad_ker=(Kernel<float>*)&laplace_grad_f;
+template<> const Kernel<float>& LaplaceKernel<float>::potn_ker(){ return laplace_potn_f; };
+template<> const Kernel<float>& LaplaceKernel<float>::grad_ker(){ return laplace_grad_f; };
 
 ////////////////////////////////////////////////////////////////////////////////
 ////////                   STOKES KERNEL                             ////////
@@ -189,17 +186,13 @@ void stokes_grad(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cn
 
 
 
-int dim_stokes_vel   [2]={3,3};
-const Kernel<double> ker_stokes_vel   =BuildKernel<double, stokes_vel, stokes_sym_dip>("stokes_vel"   , 3, dim_stokes_vel   ,true,1.0);
+const Kernel<double> ker_stokes_vel   =BuildKernel<double, stokes_vel, stokes_sym_dip>("stokes_vel"   , 3, std::pair<int,int>(3,3),true,1.0);
 
-int dim_stokes_press [2]={3,1};
-const Kernel<double> ker_stokes_press =BuildKernel<double, stokes_press              >("stokes_press" , 3, dim_stokes_press ,true,2.0);
+const Kernel<double> ker_stokes_press =BuildKernel<double, stokes_press              >("stokes_press" , 3, std::pair<int,int>(3,1),true,2.0);
 
-int dim_stokes_stress[2]={3,9};
-const Kernel<double> ker_stokes_stress=BuildKernel<double, stokes_stress             >("stokes_stress", 3, dim_stokes_stress,true,2.0);
+const Kernel<double> ker_stokes_stress=BuildKernel<double, stokes_stress             >("stokes_stress", 3, std::pair<int,int>(3,9),true,2.0);
 
-int dim_stokes_grad  [2]={3,9};
-const Kernel<double> ker_stokes_grad  =BuildKernel<double, stokes_grad               >("stokes_grad"  , 3, dim_stokes_grad  ,true,2.0);
+const Kernel<double> ker_stokes_grad  =BuildKernel<double, stokes_grad               >("stokes_grad"  , 3, std::pair<int,int>(3,9),true,2.0);
 
 ////////////////////////////////////////////////////////////////////////////////
 ////////                  BIOT-SAVART KERNEL                            ////////
@@ -208,8 +201,7 @@ const Kernel<double> ker_stokes_grad  =BuildKernel<double, stokes_grad
 template <class T>
 void biot_savart(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr);
 
-int dim_biot_savart[2]={3,3};
-const Kernel<double> ker_biot_savart=BuildKernel<double, biot_savart>("biot_savart", 3, dim_biot_savart,true,2.0);
+const Kernel<double> ker_biot_savart=BuildKernel<double, biot_savart>("biot_savart", 3, std::pair<int,int>(3,3),true,2.0);
 
 ////////////////////////////////////////////////////////////////////////////////
 ////////                   HELMHOLTZ KERNEL                             ////////
@@ -227,11 +219,9 @@ void helmholtz_grad(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_
 
 
 
-int dim_helmholtz     [2]={2,2};
-const Kernel<double> ker_helmholtz     =BuildKernel<double, helmholtz_poten>("helmholtz"     , 3, dim_helmholtz     );
+const Kernel<double> ker_helmholtz     =BuildKernel<double, helmholtz_poten>("helmholtz"     , 3, std::pair<int,int>(2,2));
 
-int dim_helmholtz_grad[2]={2,6};
-const Kernel<double> ker_helmholtz_grad=BuildKernel<double, helmholtz_grad >("helmholtz_grad", 3, dim_helmholtz_grad);
+const Kernel<double> ker_helmholtz_grad=BuildKernel<double, helmholtz_grad >("helmholtz_grad", 3, std::pair<int,int>(2,6));
 
 }//end namespace
 #ifdef __INTEL_OFFLOAD

+ 24 - 9
include/kernel.txx

@@ -6,14 +6,29 @@
  * implementation of various kernels for FMM.
  */
 
-#ifdef USE_SSE
-#include <emmintrin.h>
-#endif
-
-#include <math.h>
-#include <assert.h>
+#include <cmath>
+#include <cstdlib>
 #include <vector>
+
+#include <mem_utils.hpp>
 #include <profile.hpp>
+#include <vector.hpp>
+
+#ifdef __SSE__
+#include <xmmintrin.h>
+#endif
+#ifdef __SSE2__
+#include <emmintrin.h>
+#endif
+#ifdef __SSE3__
+#include <pmmintrin.h>
+#endif
+#ifdef __AVX__
+#include <immintrin.h>
+#endif
+#if defined(__MIC__)
+#include <immintrin.h>
+#endif
 
 namespace pvfmm{
 
@@ -31,11 +46,11 @@ Kernel<T>::Kernel(): dim(0){
  */
 template <class T>
 Kernel<T>::Kernel(Ker_t poten, Ker_t dbl_poten, const char* name, int dim_,
-                  const int (&k_dim)[2], bool homogen_, T ker_scale,
+                  std::pair<int,int> k_dim, bool homogen_, T ker_scale,
                   size_t dev_poten, size_t dev_dbl_poten){
   dim=dim_;
-  ker_dim[0]=k_dim[0];
-  ker_dim[1]=k_dim[1];
+  ker_dim[0]=k_dim.first;
+  ker_dim[1]=k_dim.second;
   ker_poten=poten;
   dbl_layer_poten=dbl_poten;
   homogen=homogen_;

+ 3 - 2
include/legendre_rule.hpp

@@ -1,8 +1,9 @@
-#ifndef _LEGENDRE_RULE_HPP_
-#define _LEGENDRE_RULE_HPP_
 
 # include <cstring>
 
+#ifndef _LEGENDRE_RULE_HPP_
+#define _LEGENDRE_RULE_HPP_
+
 void cdgqf ( int nt, int kind, double alpha, double beta, double t[],
   double wts[] );
 void cgqf ( int nt, int kind, double alpha, double beta, double a, double b,

+ 2 - 2
include/mat_utils.hpp

@@ -5,11 +5,11 @@
  * \brief This file contains BLAS and LAPACK wrapper functions.
  */
 
+#include <pvfmm_common.hpp>
+
 #ifndef _PVFMM_MAT_UTILS_
 #define _PVFMM_MAT_UTILS_
 
-#include <cstdlib>
-
 #ifdef __INTEL_OFFLOAD
 #pragma offload_attribute(push,target(mic))
 #endif

+ 6 - 4
include/mat_utils.txx

@@ -5,14 +5,16 @@
  * \brief This file contains BLAS and LAPACK wrapper functions.
  */
 
+#include <omp.h>
+#include <cmath>
 #include <cassert>
-#include <vector>
+#include <algorithm>
 #include <iostream>
-#include <stdint.h>
-#include <math.h>
+#include <vector>
+
 #include <blas.h>
 #include <lapack.h>
-#include <fft_wrapper.hpp>
+#include <matrix.hpp>
 
 namespace pvfmm{
 namespace mat{

+ 5 - 5
include/matrix.hpp

@@ -5,14 +5,14 @@
  * \brief This file contains definition of the class Matrix.
  */
 
-#ifndef _PVFMM_MATRIX_HPP_
-#define _PVFMM_MATRIX_HPP_
+#include <stdint.h>
 
-#include <cstdlib>
-#include <vector>
-#include <iostream>
+#include <pvfmm_common.hpp>
 #include <vector.hpp>
 
+#ifndef _PVFMM_MATRIX_HPP_
+#define _PVFMM_MATRIX_HPP_
+
 #ifdef __INTEL_OFFLOAD
 #pragma offload_attribute(push,target(mic))
 #endif

+ 8 - 2
include/matrix.txx

@@ -5,11 +5,17 @@
  * \brief This file contains inplementation of the class Matrix.
  */
 
-#include <cstring>
+#include <omp.h>
+#include <cmath>
+#include <cstdlib>
 #include <cassert>
+#include <iostream>
 #include <iomanip>
-#include <profile.hpp>
+
+#include <device_wrapper.hpp>
 #include <mat_utils.hpp>
+#include <mem_utils.hpp>
+#include <profile.hpp>
 
 namespace pvfmm{
 

+ 12 - 10
include/mem_mgr.hpp

@@ -6,18 +6,20 @@
  * uses a pre-allocated buffer of size defined in call to the constructor.
  */
 
-#ifndef _PVFMM_MEM_MGR_HPP_
-#define _PVFMM_MEM_MGR_HPP_
-
-#include <map>
-#include <stack>
-#include <vector>
-#include <cassert>
-#include <iostream>
-#include <cmath>
 #include <omp.h>
+#include <cstdlib>
+#include <stdint.h>
+#include <algorithm>
+#include <iostream>
+#include <cassert>
+#include <vector>
+#include <stack>
+#include <map>
+
 #include <pvfmm_common.hpp>
-#include <mem_utils.hpp>
+
+#ifndef _PVFMM_MEM_MGR_HPP_
+#define _PVFMM_MEM_MGR_HPP_
 
 #ifdef __INTEL_OFFLOAD
 #pragma offload_attribute(push,target(mic))

+ 2 - 7
include/mem_utils.hpp

@@ -5,15 +5,10 @@
  * \brief This file contains memory management utilities.
  */
 
-#ifndef _PVFMM_MEM_UTILS_
-#define _PVFMM_MEM_UTILS_
-
-#include <cstdlib>
 #include <pvfmm_common.hpp>
 
-#define ALLOC alloc_if(1) free_if(0)
-#define FREE alloc_if(0) free_if(1)
-#define REUSE alloc_if(0) free_if(0)
+#ifndef _PVFMM_MEM_UTILS_
+#define _PVFMM_MEM_UTILS_
 
 #ifdef __INTEL_OFFLOAD
 #pragma offload_attribute(push,target(mic))

+ 1 - 2
include/mem_utils.txx

@@ -5,11 +5,10 @@
  * \brief This file contains implementation of mem_utils.hpp.
  */
 
-#include <omp.h>
 #include <cassert>
 #include <cstring>
+#include <cstdlib>
 #include <stdint.h>
-#include <profile.hpp>
 
 #ifdef __INTEL_OFFLOAD
 #pragma offload_attribute(push,target(mic))

+ 5 - 5
include/mortonid.hpp

@@ -5,13 +5,13 @@
  * \brief This file contains definition of the class MortonId.
  */
 
-#ifndef _PVFMM_MORTONID_HPP_
-#define _PVFMM_MORTONID_HPP_
+#include <vector>
+#include <stdint.h>
 
 #include <pvfmm_common.hpp>
-#include <iostream>
-#include <stdint.h>
-#include <vector>
+
+#ifndef _PVFMM_MORTONID_HPP_
+#define _PVFMM_MORTONID_HPP_
 
 namespace pvfmm{
 

+ 2 - 0
include/mortonid.txx

@@ -5,6 +5,8 @@
  * \brief This file contains implementation of the class MortonId.
  */
 
+#include <cmath>
+
 namespace pvfmm{
 
 inline MortonId::MortonId():x(0), y(0), z(0), depth(0){}

+ 6 - 3
include/mpi_node.hpp

@@ -6,15 +6,18 @@
  * locally essential tree node.
  */
 
-#ifndef _PVFMM_MPI_NODE_HPP_
-#define _PVFMM_MPI_NODE_HPP_
+#include <vector>
+#include <cassert>
+#include <stdint.h>
 
 #include <pvfmm_common.hpp>
-#include <assert.h>
 #include <tree_node.hpp>
 #include <mortonid.hpp>
 #include <vector.hpp>
 
+#ifndef _PVFMM_MPI_NODE_HPP_
+#define _PVFMM_MPI_NODE_HPP_
+
 namespace pvfmm{
 
 /**

+ 5 - 3
include/mpi_node.txx

@@ -1,12 +1,14 @@
 /**
- * \file mpi_node.cpp
+ * \file mpi_node.txx
  * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
  * \date 12-11-2010
  * \brief This file contains the implementation of the class MPI_Node.
  */
 
-#include <assert.h>
-#include <iostream>
+#include <cmath>
+
+#include <matrix.hpp>
+#include <mem_utils.hpp>
 
 namespace pvfmm{
 

+ 5 - 4
include/mpi_tree.hpp

@@ -6,15 +6,16 @@
  * MPI tree.
  */
 
-#ifndef _PVFMM_MPI_TREE_HPP_
-#define _PVFMM_MPI_TREE_HPP_
+#include <mpi.h>
+#include <vector>
 
 #include <pvfmm_common.hpp>
-#include <mpi.h>
-#include <mpi_node.hpp>
 #include <mortonid.hpp>
 #include <tree.hpp>
 
+#ifndef _PVFMM_MPI_TREE_HPP_
+#define _PVFMM_MPI_TREE_HPP_
+
 namespace pvfmm{
 
 enum BoundaryType{

+ 15 - 4
include/mpi_tree.txx

@@ -5,13 +5,24 @@
  * \brief This file contains the implementation of the class MPI_Tree.
  */
 
-#include <assert.h>
-#include <cstring>
+#include <omp.h>
+#include <cmath>
+#include <cstdlib>
+#include <cassert>
+#include <string>
+#include <sstream>
+#include <iostream>
+#include <iomanip>
 #include <fstream>
-#include <list>
+#include <algorithm>
+#include <stdint.h>
 #include <set>
-#include <parUtils.h>
+
+#include <dtypes.h>
 #include <ompUtils.h>
+#include <parUtils.h>
+#include <mem_utils.hpp>
+#include <mpi_node.hpp>
 #include <profile.hpp>
 
 namespace pvfmm{

+ 3 - 3
include/ompUtils.txx

@@ -1,7 +1,7 @@
-#include <cstdlib>
+
 #include <omp.h>
-#include <iterator>
-#include <vector>
+#include <cstring>
+#include <algorithm>
 
 namespace pvfmm{
 

+ 4 - 4
include/parUtils.h

@@ -8,13 +8,13 @@
   @author Dhairya Malhotra, dhairya.malhotra@gmail.com
   */
 
-#ifndef __PVFMM_PAR_UTILS_H_
-#define __PVFMM_PAR_UTILS_H_
-
-#include "mpi.h"
+#include <mpi.h>
 #include <vector>
 #include <vector.hpp>
 
+#ifndef __PVFMM_PAR_UTILS_H_
+#define __PVFMM_PAR_UTILS_H_
+
 /**
   @namespace par
   @author Rahul Sampath

+ 6 - 4
include/parUtils.txx

@@ -8,13 +8,15 @@
   @author Santi Swaroop Adavani, santis@gmail.com
   */
 
-#include "dtypes.h"
+#include <cmath>
 #include <cassert>
+#include <cstring>
+#include <cstdlib>
 #include <iostream>
 #include <algorithm>
-#include <cstring>
-#include "ompUtils.h"
-#include <mpi.h>
+
+#include <dtypes.h>
+#include <ompUtils.h>
 
 namespace pvfmm{
 namespace par{

+ 5 - 3
include/precomp_mat.hpp

@@ -6,13 +6,15 @@
  * Handles storage of precomputed translation matrices.
  */
 
-#ifndef _PVFMM_PrecompMAT_HPP_
-#define _PVFMM_PrecompMAT_HPP_
-
 #include <mpi.h>
+#include <vector>
+
 #include <pvfmm_common.hpp>
 #include <matrix.hpp>
 
+#ifndef _PVFMM_PrecompMAT_HPP_
+#define _PVFMM_PrecompMAT_HPP_
+
 namespace pvfmm{
 
 typedef enum{

+ 9 - 1
include/precomp_mat.txx

@@ -6,8 +6,16 @@
  * Handles storage of precomputed translation matrices.
  */
 
-#include <sys/stat.h>
+#include <omp.h>
+#include <cassert>
 #include <stdint.h>
+#ifdef PVFMM_HAVE_SYS_STAT_H
+#include <sys/stat.h>
+#endif
+
+#include <mem_utils.hpp>
+#include <profile.hpp>
+#include <vector.hpp>
 
 namespace pvfmm{
 

+ 4 - 4
include/profile.hpp

@@ -5,16 +5,16 @@
  * \brief This file contains definition of the class Profile.
  */
 
-#ifndef _PVFMM_PROFILE_HPP_
-#define _PVFMM_PROFILE_HPP_
-
 #include <mpi.h>
-#include <iostream>
 #include <string>
 #include <vector>
 #include <stack>
+
 #include <pvfmm_common.hpp>
 
+#ifndef _PVFMM_PROFILE_HPP_
+#define _PVFMM_PROFILE_HPP_
+
 #ifndef __PROFILE__
 #define __PROFILE__ -1
 #endif

+ 12 - 7
include/pvfmm.hpp

@@ -5,17 +5,22 @@
  * \brief This file contains wrapper functions for PvFMM.
  */
 
-#ifndef _PVFMM_HPP_
-#define _PVFMM_HPP_
-
 #include <mpi.h>
-#include <cstdlib>
-#include <iostream>
+#include <vector>
+#include <cmath>
 
 #include <pvfmm_common.hpp>
-#include <fmm_cheb.hpp>
-#include <fmm_node.hpp>
+#include <cheb_node.hpp>
+#include <mpi_node.hpp>
 #include <fmm_tree.hpp>
+#include <fmm_node.hpp>
+#include <fmm_cheb.hpp>
+#include <fmm_pts.hpp>
+#include <vector.hpp>
+#include <parUtils.h>
+
+#ifndef _PVFMM_HPP_
+#define _PVFMM_HPP_
 
 namespace pvfmm{
 

+ 6 - 2
include/pvfmm_common.hpp

@@ -38,11 +38,15 @@
 #define DEVICE_BUFFER_SIZE 1024 //in MB
 #define V_BLK_CACHE 25 //in KB
 
+#ifndef __DEVICE_SYNC__
+#define __DEVICE_SYNC__ 0 // No device synchronization by default.
+#endif
+
 #define UNUSED(x) (void)(x) // to ignore unused variable warning.
 
-#include <cstdlib>
-#include <cassert>
 #ifndef NDEBUG
+#include <cassert>
+#include <iostream>
 #define ASSERT_WITH_MSG(cond, msg) do \
 { if (!(cond)) { std::cerr<<"Error: "<<msg<<'\n'; assert(cond); } \
 } while(0)

+ 2 - 4
include/quad_utils.hpp

@@ -5,13 +5,11 @@
  * \brief This file contains definition of QuadReal_t.
  */
 
+#include <cmath>
+
 #ifndef _QUAD_UTILS_
 #define _QUAD_UTILS_
 
-#include <pvfmm_common.hpp>
-#include <iostream>
-#include <vector>
-
 #ifdef PVFMM_QUAD_T
 
 typedef PVFMM_QUAD_T QuadReal_t;

+ 4 - 2
include/quad_utils.txx

@@ -5,9 +5,11 @@
  * \brief This file contains quadruple-precision related functions.
  */
 
-#include <iomanip>
-#include <cstdlib>
+#include <omp.h>
 #include <cmath>
+#include <iostream>
+#include <iomanip>
+#include <vector>
 
 QuadReal_t atoquad(const char* str){
   size_t i=0;

+ 6 - 3
include/tree.hpp

@@ -5,13 +5,16 @@
  * \brief This file contains the definition of the base class for a tree.
  */
 
-#ifndef _PVFMM_TREE_HPP_
-#define _PVFMM_TREE_HPP_
+#include <cassert>
+#include <vector>
 
 #include <pvfmm_common.hpp>
-#include <iostream>
+#include <tree_node.hpp>
 #include <mem_mgr.hpp>
 
+#ifndef _PVFMM_TREE_HPP_
+#define _PVFMM_TREE_HPP_
+
 namespace pvfmm{
 
 /**

+ 1 - 4
include/tree.txx

@@ -1,13 +1,10 @@
 /**
- * \file tree.cpp
+ * \file tree.txx
  * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
  * \date 12-11-2010
  * \brief This file contains the implementation of the class Tree.
  */
 
-#include <tree.hpp>
-#include <assert.h>
-
 namespace pvfmm{
 
 template <class TreeNode>

+ 2 - 4
include/tree_node.hpp

@@ -6,13 +6,11 @@
  * node.
  */
 
+#include <pvfmm_common.hpp>
+
 #ifndef _PVFMM_TREE_NODE_HPP_
 #define _PVFMM_TREE_NODE_HPP_
 
-#include <pvfmm_common.hpp>
-#include <assert.h>
-#include <cstring>
-
 namespace pvfmm{
 
 /**

+ 5 - 4
include/vector.hpp

@@ -5,13 +5,14 @@
  * \brief This file contains definition of the class Vector.
  */
 
-#ifndef _PVFMM_VECTOR_HPP_
-#define _PVFMM_VECTOR_HPP_
-
 #include <vector>
-#include <iostream>
 #include <stdint.h>
 
+#include <pvfmm_common.hpp>
+
+#ifndef _PVFMM_VECTOR_HPP_
+#define _PVFMM_VECTOR_HPP_
+
 #ifdef __INTEL_OFFLOAD
 #pragma offload_attribute(push,target(mic))
 #endif

+ 4 - 4
include/vector.txx

@@ -5,13 +5,13 @@
  * \brief This file contains implementation of the class Vector.
  */
 
-#include <cstdlib>
-#include <cstring>
 #include <cassert>
+#include <iostream>
 #include <iomanip>
-#include <profile.hpp>
-#include <mem_utils.hpp>
+
 #include <device_wrapper.hpp>
+#include <mem_utils.hpp>
+#include <profile.hpp>
 
 namespace pvfmm{
 

+ 6 - 6
m4/ac_check_intel_offload.m4

@@ -1,7 +1,3 @@
-# SYNOPSIS
-#
-#   CHECK_INTEL_OFFLOAD([ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]])
-#
 
 AC_DEFUN([CHECK_INTEL_OFFLOAD], [
     ## Check for support of offload pragma and -no-offload flag. If
@@ -38,8 +34,8 @@ AC_DEFUN([CHECK_INTEL_OFFLOAD], [
     AC_LANG_WERROR([off])
     CFLAGS="$XCFLAGS"
     CXXFLAGS="$XCXXFLAGS"
+    ARFLAGS="$AR_FLAGS"
 
-    # Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND:
     if test x"$intel_offload_pragma_ok" = xyes; then
         AC_DEFINE(HAVE_INTEL_OFFLOAD_PRAGMA,1,[Define if you have INTEL_OFFLOAD_PRAGMA.])
     fi
@@ -48,6 +44,9 @@ AC_DEFUN([CHECK_INTEL_OFFLOAD], [
         if test x"$intel_offload_pragma_ok" = xyes; then
             AC_DEFINE(HAVE_INTEL_OFFLOAD,1,[Define if you have INTEL_OFFLOAD.])
             intel_offload_ok=yes
+
+            AR="xiar"
+            ARFLAGS="cru -qoffload-build"
         else
             CFLAGS="$CFLAGS -no-offload"
             CXXFLAGS="$CXXFLAGS -no-offload"
@@ -56,6 +55,7 @@ AC_DEFUN([CHECK_INTEL_OFFLOAD], [
     AC_SUBST(intel_offload_pragma_ok)
     AC_SUBST(intel_noffload_flag_ok)
     AC_SUBST(intel_offload_ok)
-
+    AC_SUBST(ARFLAGS)
+    AC_SUBST(AR)
 ])
 

+ 19 - 0
src/device_wrapper.cpp

@@ -0,0 +1,19 @@
+/**
+ * \file device_wrapper.cpp
+ * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
+ * \date 7-30-2014
+ * \brief This file contains implementation of DeviceWrapper.
+ */
+
+#include <mpi.h>
+
+#include <device_wrapper.hpp>
+#include <vector.hpp>
+
+namespace pvfmm{
+
+  Vector<char> MIC_Lock::lock_vec;
+  Vector<char>::Device MIC_Lock::lock_vec_;
+  int MIC_Lock::lock_idx;
+
+}//end namespace

+ 2 - 1
src/fmm_gll.cpp

@@ -10,6 +10,7 @@
 #include <fmm_tree.hpp>
 #include <cheb_utils.hpp>
 #include <vector.hpp>
+#include <cheb_node.hpp>
 
 typedef pvfmm::FMM_Node<pvfmm::Cheb_Node<double> > FMMNode_t;
 typedef pvfmm::FMM_Cheb<FMMNode_t> FMM_Mat_t;
@@ -58,7 +59,7 @@ extern "C" {
         fmm_data->fmm_mat_laplace_grad=new FMM_Mat_t;
         fmm_mat=((FMM_Mat_t*)fmm_data->fmm_mat_laplace_grad);
 
-        fmm_data->kernel_laplace_grad=pvfmm::LaplaceKernel<double>::grad_ker;
+        fmm_data->kernel_laplace_grad=&pvfmm::LaplaceKernel<double>::grad_ker();
         mykernel=(pvfmm::Kernel<double>*)fmm_data->kernel_laplace_grad;
 
         fmm_data->tree_laplace_grad=new FMM_Tree_t(fmm_data->comm);