il y a 11 ans · 8c2b816f95
--- a/.gitignore
+++ b/.gitignore
@@ -22,3 +22,9 @@
 
				 /src/*.o
			
 
				 /stamp-h1
			
 
				 
			
 
				+m4/libtool.m4
			
 
				+m4/ltoptions.m4
			
 
				+m4/ltsugar.m4
			
 
				+m4/ltversion.m4
			
 
				+m4/lt~obsolete.m4
			
 
				+libtool
			
--- a/Makefile.am
+++ b/Makefile.am
@@ -100,6 +100,7 @@ lib_libfmm_a_HEADERS = \
 
				 # the sources to add to the library and to add to the source distribution
			
 
				 lib_libpvfmm_a_SOURCES = \
			
 
				 									$(lib_libpvfmm_a_HEADERS) \
			
 
				+									src/device_wrapper.cpp \
			
 
				 									src/fmm_gll.cpp \
			
 
				 									src/legendre_rule.cpp \
			
 
				 									src/mortonid.cpp \
			
@@ -216,6 +217,7 @@ clean-local: clean-doxygen
 
				 	cd $(EX_DIR) && $(MAKE) clean;
			
 
				 	$(RM) -r $(RESULT_DIR)/*
			
 
				 	$(RM) *~ */*~ */*/*~
			
 
				+	$(RM) ./lib/*
			
 
				 
			
 
				 #------------------------------------------------------------------------------
			
 
				 
			
--- a/configure.ac
+++ b/configure.ac
@@ -25,8 +25,14 @@ AC_PROG_INSTALL
 
				 AC_PROG_MKDIR_P
			
 
				 AC_PROG_LN_S
			
 
				 AC_PROG_MAKE_SET
			
 
				-AC_PROG_RANLIB
			
 
				-#AM_PROG_AR
			
 
				+#AC_PROG_RANLIB
			
 
				+##AM_PROG_AR
			
 
				+
			
 
				+# automake 1.12 seems to require AM_PROG_AR, but automake 1.11 doesn't
			
 
				+# recognize it
			
 
				+m4_ifdef([AM_PROG_AR], [AM_PROG_AR])
			
 
				+LT_INIT
			
 
				+AC_PROG_LIBTOOL
			
 
				 
			
 
				 # Check for CUDA
			
 
				 AX_CHECK_CUDA
			
--- a/examples/Makefile
+++ b/examples/Makefile
@@ -27,15 +27,15 @@ ifeq ($(INTEL_OFFLOAD_OK),yes)
 
				 
			
 
				 $(BINDIR)/%: $(OBJDIR)/%.o
			
 
				 	-@$(MKDIRS) $(dir $@)
			
 
				-	$(CXX_PVFMM) $(CXXFLAGS_PVFMM) -no-offload      $^       $(LDFLAGS_PVFMM) -o $@
			
 
				-	$(CXX_PVFMM) $(CXXFLAGS_PVFMM)                  $^_mic   $(LDFLAGS_PVFMM) -o $@_mic
			
 
				-	$(CXX_PVFMM) $(CXXFLAGS_PVFMM) -D__MIC_ASYNCH__ $^_async $(LDFLAGS_PVFMM) -o $@_async
			
 
				+	$(CXX_PVFMM) $(CXXFLAGS_PVFMM) -no-offload         $^       $(LDFLAGS_PVFMM) -o $@
			
 
				+	$(CXX_PVFMM) $(CXXFLAGS_PVFMM)                     $^_async $(LDFLAGS_PVFMM) -o $@_async
			
 
				+	$(CXX_PVFMM) $(CXXFLAGS_PVFMM) -D__DEVICE_SYNC__=1 $^_mic   $(LDFLAGS_PVFMM) -o $@_mic
			
 
				 
			
 
				 $(OBJDIR)/%.o: $(SRCDIR)/%.cpp
			
 
				 	-@$(MKDIRS) $(dir $@)
			
 
				-	$(CXX_PVFMM) $(CXXFLAGS_PVFMM) -no-offload      -I$(INCDIR) -c $^ -o $@
			
 
				-	$(CXX_PVFMM) $(CXXFLAGS_PVFMM)                  -I$(INCDIR) -c $^ -o $@_mic
			
 
				-	$(CXX_PVFMM) $(CXXFLAGS_PVFMM) -D__MIC_ASYNCH__ -I$(INCDIR) -c $^ -o $@_async
			
 
				+	$(CXX_PVFMM) $(CXXFLAGS_PVFMM) -no-offload         -I$(INCDIR) -c $^ -o $@
			
 
				+	$(CXX_PVFMM) $(CXXFLAGS_PVFMM)                     -I$(INCDIR) -c $^ -o $@_async
			
 
				+	$(CXX_PVFMM) $(CXXFLAGS_PVFMM) -D__DEVICE_SYNC__=1 -I$(INCDIR) -c $^ -o $@_mic
			
 
				 
			
 
				 else
			
 
				 
			
--- a/examples/src/example1.cpp
+++ b/examples/src/example1.cpp
@@ -76,12 +76,15 @@ void fmm_test(size_t N, int mult_order, MPI_Comm comm){
 
				   for(size_t i=0;i< src_value.size();i++)  src_value[i]=drand48();
			
 
				   for(size_t i=0;i<surf_value.size();i++) surf_value[i]=drand48();
			
 
				 
			
 
				+  // Create memory-manager (optional)
			
 
				+  pvfmm::mem::MemoryManager mem_mgr(10000000);
			
 
				+
			
 
				   // Construct tree.
			
 
				   size_t max_pts=300;
			
 
				   pvfmm::PtFMM_Tree* tree=PtFMM_CreateTree(src_coord, src_value, surf_coord, surf_value, trg_coord, comm, max_pts, pvfmm::FreeSpace);
			
 
				 
			
 
				   // Load matrices.
			
 
				-  pvfmm::PtFMM matrices;
			
 
				+  pvfmm::PtFMM matrices(&mem_mgr);
			
 
				   matrices.Initialize(mult_order, comm, &kernel_fn, &kernel_fn_aux);
			
 
				 
			
 
				   // FMM Setup
			
--- a/examples/src/fmm_cheb.cpp
+++ b/examples/src/fmm_cheb.cpp
@@ -9,6 +9,7 @@
 
				 #include <fmm_cheb.hpp>
			
 
				 #include <fmm_node.hpp>
			
 
				 #include <fmm_tree.hpp>
			
 
				+#include <cheb_node.hpp>
			
 
				 #include <utils.hpp>
			
 
				 
			
 
				 //////////////////////////////////////////////////////////////////////////////
			
@@ -229,16 +230,16 @@ void fmm_test(int test_case, size_t N, size_t M, bool unif, int mult_order, int
 
				       fn_input_=fn_input_t1<Real_t>;
			
 
				       fn_poten_=fn_poten_t1<Real_t>;
			
 
				       fn_grad_ =fn_grad_t1<Real_t>;
			
 
				-      mykernel     =pvfmm::LaplaceKernel<Real_t>::potn_ker;
			
 
				-      //mykernel_grad=pvfmm::LaplaceKernel<Real_t>::grad_ker;
			
 
				+      mykernel     =&pvfmm::LaplaceKernel<Real_t>::potn_ker();
			
 
				+      //mykernel_grad=&pvfmm::LaplaceKernel<Real_t>::grad_ker();
			
 
				       bndry=pvfmm::Periodic;
			
 
				       break;
			
 
				     case 2:
			
 
				       fn_input_=fn_input_t2<Real_t>;
			
 
				       fn_poten_=fn_poten_t2<Real_t>;
			
 
				       fn_grad_ =fn_grad_t2<Real_t>;
			
 
				-      mykernel     =pvfmm::LaplaceKernel<Real_t>::potn_ker;
			
 
				-      //mykernel_grad=pvfmm::LaplaceKernel<Real_t>::grad_ker;
			
 
				+      mykernel     =&pvfmm::LaplaceKernel<Real_t>::potn_ker();
			
 
				+      //mykernel_grad=&pvfmm::LaplaceKernel<Real_t>::grad_ker();
			
 
				       bndry=pvfmm::FreeSpace;
			
 
				       break;
			
 
				     case 3:
			
--- a/include/cheb_node.hpp
+++ b/include/cheb_node.hpp
@@ -5,15 +5,16 @@
 
				  * \brief This is a derived cheb class of MPI_Node.
			
 
				  */
			
 
				 
			
 
				-#ifndef _PVFMM_CHEB_NODE_HPP_
			
 
				-#define _PVFMM_CHEB_NODE_HPP_
			
 
				+#include <vector>
			
 
				+#include <stdint.h>
			
 
				 
			
 
				 #include <pvfmm_common.hpp>
			
 
				-#include <assert.h>
			
 
				-#include <vector.hpp>
			
 
				+#include <tree_node.hpp>
			
 
				 #include <mpi_node.hpp>
			
 
				-#include <mortonid.hpp>
			
 
				-#include <cheb_utils.hpp>
			
 
				+#include <vector.hpp>
			
 
				+
			
 
				+#ifndef _PVFMM_CHEB_NODE_HPP_
			
 
				+#define _PVFMM_CHEB_NODE_HPP_
			
 
				 
			
 
				 namespace pvfmm{
			
 
				 
			
--- a/include/cheb_node.txx
+++ b/include/cheb_node.txx
@@ -1,14 +1,16 @@
 
				 /**
			
 
				- * \file cheb_node.cpp
			
 
				+ * \file cheb_node.txx
			
 
				  * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
			
 
				  * \date 1-22-2010
			
 
				  * \brief This file contains the implementation of the class Cheb_Node.
			
 
				  */
			
 
				 
			
 
				-#include <iostream>
			
 
				-#include <matrix.hpp>
			
 
				-#include <omp.h>
			
 
				+#include <cmath>
			
 
				+#include <cassert>
			
 
				+#include <algorithm>
			
 
				+
			
 
				 #include <cheb_utils.hpp>
			
 
				+#include <matrix.hpp>
			
 
				 
			
 
				 namespace pvfmm{
			
 
				 
			
--- a/include/cheb_utils.hpp
+++ b/include/cheb_utils.hpp
@@ -5,13 +5,14 @@
 
				  * \brief This file contains chebyshev related functions.
			
 
				  */
			
 
				 
			
 
				-#ifndef _PVFMM_CHEB_UTILS_HPP_
			
 
				-#define _PVFMM_CHEB_UTILS_HPP_
			
 
				+#include <vector>
			
 
				 
			
 
				 #include <pvfmm_common.hpp>
			
 
				-#include <vector>
			
 
				-#include <kernel.hpp>
			
 
				 #include <vector.hpp>
			
 
				+#include <kernel.hpp>
			
 
				+
			
 
				+#ifndef _PVFMM_CHEB_UTILS_HPP_
			
 
				+#define _PVFMM_CHEB_UTILS_HPP_
			
 
				 
			
 
				 namespace pvfmm{
			
 
				 
			
--- a/include/cheb_utils.txx
+++ b/include/cheb_utils.txx
@@ -5,11 +5,18 @@
 
				  * \brief This file contains chebyshev related functions.
			
 
				  */
			
 
				 
			
 
				-#include <assert.h>
			
 
				+#include <omp.h>
			
 
				+#include <cmath>
			
 
				+#include <cassert>
			
 
				+#include <iostream>
			
 
				 #include <algorithm>
			
 
				-#include <matrix.hpp>
			
 
				-#include <mem_mgr.hpp>
			
 
				+
			
 
				 #include <legendre_rule.hpp>
			
 
				+#include <mem_utils.hpp>
			
 
				+#include <mat_utils.hpp>
			
 
				+#include <mem_mgr.hpp>
			
 
				+#include <matrix.hpp>
			
 
				+#include <profile.hpp>
			
 
				 
			
 
				 namespace pvfmm{
			
 
				 
			
--- a/include/device_wrapper.hpp
+++ b/include/device_wrapper.hpp
@@ -5,13 +5,13 @@
 
				  * \brief This file contains definition of DeviceWrapper.
			
 
				  */
			
 
				 
			
 
				-#ifndef _PVFMM_DEVICE_WRAPPER_HPP_
			
 
				-#define _PVFMM_DEVICE_WRAPPER_HPP_
			
 
				-
			
 
				-#include <cstdlib>
			
 
				-#include <cassert>
			
 
				 #include <stdint.h>
			
 
				+
			
 
				 #include <pvfmm_common.hpp>
			
 
				+#include <vector.hpp>
			
 
				+
			
 
				+#ifndef _PVFMM_DEVICE_WRAPPER_HPP_
			
 
				+#define _PVFMM_DEVICE_WRAPPER_HPP_
			
 
				 
			
 
				 #ifdef __INTEL_OFFLOAD
			
 
				 #pragma offload_attribute(push,target(mic))
			
@@ -24,8 +24,10 @@ namespace DeviceWrapper{
 
				 
			
 
				   void free_device(char* dev_handle, uintptr_t dev_ptr);
			
 
				 
			
 
				+  template <int SYNC=__DEVICE_SYNC__>
			
 
				   int host2device(char* host_ptr, char* dev_handle, uintptr_t dev_ptr, size_t len);
			
 
				 
			
 
				+  template <int SYNC=__DEVICE_SYNC__>
			
 
				   int device2host(char* dev_handle, uintptr_t dev_ptr, char* host_ptr, size_t len);
			
 
				 
			
 
				   void wait(int lock_idx);
			
@@ -51,7 +53,7 @@ Note: Any MIC offload section should look like this:
 
				       MIC_Lock::release_lock(lock_idx);
			
 
				     }
			
 
				 
			
 
				-    #ifdef __MIC_ASYNCH__
			
 
				+    #ifdef __DEVICE_SYNC__
			
 
				     MIC_Lock::wait_lock(lock_idx);
			
 
				     #endif
			
 
				 
			
--- a/include/device_wrapper.txx
+++ b/include/device_wrapper.txx
@@ -5,8 +5,9 @@
 
				  * \brief This file contains implementation of DeviceWrapper.
			
 
				  */
			
 
				 
			
 
				-#include <vector.hpp>
			
 
				-#include <device_wrapper.hpp>
			
 
				+#include <omp.h>
			
 
				+#include <cassert>
			
 
				+#include <cstdlib>
			
 
				 
			
 
				 namespace pvfmm{
			
 
				 
			
@@ -14,6 +15,10 @@ namespace DeviceWrapper{
 
				 
			
 
				   // MIC functions
			
 
				 
			
 
				+  #define ALLOC alloc_if(1) free_if(0)
			
 
				+  #define FREE alloc_if(0) free_if(1)
			
 
				+  #define REUSE alloc_if(0) free_if(0)
			
 
				+
			
 
				   inline uintptr_t alloc_device_mic(char* dev_handle, size_t len){
			
 
				     assert(dev_handle!=NULL);
			
 
				     uintptr_t dev_ptr=(uintptr_t)NULL;
			
@@ -52,10 +57,6 @@ namespace DeviceWrapper{
 
				         MIC_Lock::release_lock(lock_idx);
			
 
				       }
			
 
				     }
			
 
				-    #ifndef __MIC_ASYNCH__ // Wait
			
 
				-    #pragma offload target(mic:0)
			
 
				-    {MIC_Lock::wait_lock(lock_idx);}
			
 
				-    #endif
			
 
				     return lock_idx;
			
 
				     #endif
			
 
				     return -1;
			
@@ -80,9 +81,6 @@ namespace DeviceWrapper{
 
				         MIC_Lock::release_lock(lock_idx);
			
 
				       }
			
 
				     }
			
 
				-    #ifndef __MIC_ASYNCH__ // Wait
			
 
				-    MIC_Lock::wait_lock(lock_idx);
			
 
				-    #endif
			
 
				     return lock_idx;
			
 
				     #endif
			
 
				     return -1;
			
@@ -116,20 +114,27 @@ namespace DeviceWrapper{
 
				     #endif
			
 
				   }
			
 
				 
			
 
				+  template <int SYNC=__DEVICE_SYNC__>
			
 
				   inline int host2device(char* host_ptr, char* dev_handle, uintptr_t dev_ptr, size_t len){
			
 
				     int lock_idx=-1;
			
 
				     #ifdef __INTEL_OFFLOAD
			
 
				     lock_idx=host2device_mic(host_ptr,dev_handle,dev_ptr,len);
			
 
				+    if(SYNC){
			
 
				+      #pragma offload target(mic:0)
			
 
				+      {MIC_Lock::wait_lock(lock_idx);}
			
 
				+    }
			
 
				     #else
			
 
				     ;
			
 
				     #endif
			
 
				     return lock_idx;
			
 
				   }
			
 
				 
			
 
				+  template <int SYNC=__DEVICE_SYNC__>
			
 
				   inline int device2host(char* dev_handle, uintptr_t dev_ptr, char* host_ptr, size_t len){
			
 
				     int lock_idx=-1;
			
 
				     #ifdef __INTEL_OFFLOAD
			
 
				     lock_idx=device2host_mic(dev_handle,dev_ptr, host_ptr, len);
			
 
				+    if(SYNC) MIC_Lock::wait_lock(lock_idx);
			
 
				     #else
			
 
				     ;
			
 
				     #endif
			
@@ -230,8 +235,4 @@ namespace DeviceWrapper{
 
				     #endif
			
 
				   }
			
 
				 
			
 
				-  Vector<char> MIC_Lock::lock_vec;
			
 
				-  Vector<char>::Device MIC_Lock::lock_vec_;
			
 
				-  int MIC_Lock::lock_idx;
			
 
				-
			
 
				 }//end namespace
			
--- a/include/dtypes.h
+++ b/include/dtypes.h
@@ -1,9 +1,9 @@
 
				-#ifndef __PVFMM_DTYPES_H_
			
 
				-#define __PVFMM_DTYPES_H_
			
 
				-
			
 
				 #include <mpi.h>
			
 
				 #include <complex>
			
 
				 
			
 
				+#ifndef __PVFMM_DTYPES_H_
			
 
				+#define __PVFMM_DTYPES_H_
			
 
				+
			
 
				 /**
			
 
				  * \file	dtypes.h
			
 
				  * \brief	Traits to determine MPI_DATATYPE from a C++ datatype
			
--- a/include/fft_wrapper.hpp
+++ b/include/fft_wrapper.hpp
@@ -1,20 +1,25 @@
 
				 /**
			
 
				- * \file mat_utils.hpp
			
 
				+ * \file fft_wrapper.hpp
			
 
				  * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
			
 
				  * \date 2-11-2011
			
 
				  * \brief This file contains FFTW3 wrapper functions.
			
 
				  */
			
 
				 
			
 
				-#ifndef _PVFMM_FFT_WRAPPER_
			
 
				-#define _PVFMM_FFT_WRAPPER_
			
 
				-
			
 
				+#include <cmath>
			
 
				+#include <cassert>
			
 
				+#include <vector>
			
 
				 #include <fftw3.h>
			
 
				 #ifdef FFTW3_MKL
			
 
				 #include <fftw3_mkl.h>
			
 
				 #endif
			
 
				+
			
 
				 #include <pvfmm_common.hpp>
			
 
				+#include <mem_utils.hpp>
			
 
				 #include <matrix.hpp>
			
 
				 
			
 
				+#ifndef _PVFMM_FFT_WRAPPER_
			
 
				+#define _PVFMM_FFT_WRAPPER_
			
 
				+
			
 
				 namespace pvfmm{
			
 
				 
			
 
				 template<class T>
			
--- a/include/fmm_cheb.hpp
+++ b/include/fmm_cheb.hpp
@@ -6,16 +6,19 @@
 
				  * This handles all the translations through matrix multiplications.
			
 
				  */
			
 
				 
			
 
				-#ifndef _PVFMM_FMM_CHEB_HPP_
			
 
				-#define _PVFMM_FMM_CHEB_HPP_
			
 
				+#include <mpi.h>
			
 
				+#include <vector>
			
 
				 
			
 
				 #include <pvfmm_common.hpp>
			
 
				-#include <mpi.h>
			
 
				-#include <matrix.hpp>
			
 
				 #include <precomp_mat.hpp>
			
 
				-#include <cheb_utils.hpp>
			
 
				-#include <cheb_node.hpp>
			
 
				+#include <mem_mgr.hpp>
			
 
				 #include <fmm_pts.hpp>
			
 
				+#include <vector.hpp>
			
 
				+#include <matrix.hpp>
			
 
				+#include <kernel.hpp>
			
 
				+
			
 
				+#ifndef _PVFMM_FMM_CHEB_HPP_
			
 
				+#define _PVFMM_FMM_CHEB_HPP_
			
 
				 
			
 
				 namespace pvfmm{
			
 
				 
			
--- a/include/fmm_cheb.txx
+++ b/include/fmm_cheb.txx
@@ -5,13 +5,21 @@
 
				  * \brief This file contains the implementation of the FMM_Cheb class.
			
 
				  */
			
 
				 
			
 
				-#include <mpi.h>
			
 
				-#include <fftw3.h>
			
 
				-#include <parUtils.h>
			
 
				+#include <omp.h>
			
 
				+#include <sstream>
			
 
				+#include <iostream>
			
 
				+#include <cstdlib>
			
 
				+#include <cmath>
			
 
				 #ifdef PVFMM_HAVE_SYS_STAT_H
			
 
				 #include <sys/stat.h>
			
 
				 #endif
			
 
				 
			
 
				+#include <dtypes.h>
			
 
				+#include <parUtils.h>
			
 
				+#include <cheb_utils.hpp>
			
 
				+#include <mem_utils.hpp>
			
 
				+#include <profile.hpp>
			
 
				+
			
 
				 namespace pvfmm{
			
 
				 
			
 
				 template <class FMMNode>
			
--- a/include/fmm_gll.hpp
+++ b/include/fmm_gll.hpp
@@ -1,8 +1,11 @@
 
				-#ifndef _PVFMM_FMM_GLL_HPP_
			
 
				-#define _PVFMM_FMM_GLL_HPP_
			
 
				 
			
 
				 #include <mpi.h>
			
 
				 
			
 
				+#include <pvfmm_common.hpp>
			
 
				+
			
 
				+#ifndef _PVFMM_FMM_GLL_HPP_
			
 
				+#define _PVFMM_FMM_GLL_HPP_
			
 
				+
			
 
				 #ifdef __cplusplus
			
 
				 extern "C" {
			
 
				 #endif
			
--- a/include/fmm_node.hpp
+++ b/include/fmm_node.hpp
@@ -5,19 +5,21 @@
 
				  * \brief This file contains the definition of the FMM_Node class.
			
 
				  */
			
 
				 
			
 
				-#ifndef _PVFMM_FMM_NODE_HPP_
			
 
				-#define _PVFMM_FMM_NODE_HPP_
			
 
				+#include <vector>
			
 
				 
			
 
				-#include <mpi.h>
			
 
				-#include <iostream>
			
 
				 #include <pvfmm_common.hpp>
			
 
				+#include <tree_node.hpp>
			
 
				 #include <mpi_node.hpp>
			
 
				 #include <fmm_pts.hpp>
			
 
				+#include <vector.hpp>
			
 
				+
			
 
				+#ifndef _PVFMM_FMM_NODE_HPP_
			
 
				+#define _PVFMM_FMM_NODE_HPP_
			
 
				 
			
 
				 namespace pvfmm{
			
 
				 
			
 
				 /**
			
 
				- * \brief Base class for node of FMM_Tree.
			
 
				+ * \brief Base class for a node of FMM_Tree.
			
 
				  */
			
 
				 template <class Node>
			
 
				 class FMM_Node: public Node{
			
--- a/include/fmm_node.txx
+++ b/include/fmm_node.txx
@@ -5,6 +5,11 @@
 
				  * \brief This file contains the implementation of the FMM_Node class.
			
 
				  */
			
 
				 
			
 
				+#include <cassert>
			
 
				+
			
 
				+#include <mem_utils.hpp>
			
 
				+#include <mpi_node.hpp>
			
 
				+
			
 
				 namespace pvfmm{
			
 
				 
			
 
				 template <class Node>
			
--- a/include/fmm_pts.hpp
+++ b/include/fmm_pts.hpp
@@ -6,16 +6,23 @@
 
				  * This handles all the translations for point sources and targets.
			
 
				  */
			
 
				 
			
 
				-#ifndef _PVFMM_FMM_PTS_HPP_
			
 
				-#define _PVFMM_FMM_PTS_HPP_
			
 
				+#include <mpi.h>
			
 
				+#include <string>
			
 
				+#include <vector>
			
 
				 
			
 
				 #include <pvfmm_common.hpp>
			
 
				-#include <mpi.h>
			
 
				-#include <matrix.hpp>
			
 
				-#include <precomp_mat.hpp>
			
 
				 #include <interac_list.hpp>
			
 
				-#include <kernel.hpp>
			
 
				+#include <precomp_mat.hpp>
			
 
				+#include <fft_wrapper.hpp>
			
 
				+#include <mem_utils.hpp>
			
 
				 #include <mpi_node.hpp>
			
 
				+#include <mem_mgr.hpp>
			
 
				+#include <vector.hpp>
			
 
				+#include <matrix.hpp>
			
 
				+#include <kernel.hpp>
			
 
				+
			
 
				+#ifndef _PVFMM_FMM_PTS_HPP_
			
 
				+#define _PVFMM_FMM_PTS_HPP_
			
 
				 
			
 
				 namespace pvfmm{
			
 
				 
			
@@ -130,9 +137,11 @@ class FMM_Pts{
 
				 
			
 
				   void SetupPrecomp(SetupData<Real_t>& setup_data, bool device=false);
			
 
				   void SetupInterac(SetupData<Real_t>& setup_data, bool device=false);
			
 
				+  template <int SYNC=__DEVICE_SYNC__>
			
 
				   void EvalList    (SetupData<Real_t>& setup_data, bool device=false); // Run on CPU by default.
			
 
				 
			
 
				   void SetupInteracPts(SetupData<Real_t>& setup_data, bool shift_src, bool shift_trg, Matrix<Real_t>* M, bool device);
			
 
				+  template <int SYNC=__DEVICE_SYNC__>
			
 
				   void EvalListPts    (SetupData<Real_t>& setup_data, bool device=false); // Run on CPU by default.
			
 
				 
			
 
				   /**
			
--- a/include/fmm_pts.txx
+++ b/include/fmm_pts.txx
@@ -5,11 +5,14 @@
 
				  * \brief This file contains the implementation of the FMM_Pts class.
			
 
				  */
			
 
				 
			
 
				-#include <mpi.h>
			
 
				-#include <set>
			
 
				+#include <omp.h>
			
 
				+#include <cmath>
			
 
				+#include <cstdlib>
			
 
				+#include <cassert>
			
 
				 #include <sstream>
			
 
				-#include <fft_wrapper.hpp>
			
 
				-#include <mat_utils.hpp>
			
 
				+#include <iostream>
			
 
				+#include <stdint.h>
			
 
				+#include <set>
			
 
				 #ifdef PVFMM_HAVE_SYS_STAT_H
			
 
				 #include <sys/stat.h>
			
 
				 #endif
			
@@ -17,6 +20,9 @@
 
				 #ifdef __SSE__
			
 
				 #include <xmmintrin.h>
			
 
				 #endif
			
 
				+#ifdef __SSE2__
			
 
				+#include <emmintrin.h>
			
 
				+#endif
			
 
				 #ifdef __SSE3__
			
 
				 #include <pmmintrin.h>
			
 
				 #endif
			
@@ -27,6 +33,8 @@
 
				 #include <immintrin.h>
			
 
				 #endif
			
 
				 
			
 
				+#include <profile.hpp>
			
 
				+
			
 
				 namespace pvfmm{
			
 
				 
			
 
				 /**
			
@@ -1485,6 +1493,7 @@ void FMM_Pts<FMMNode>::SetupInterac(SetupData<Real_t>& setup_data, bool device){
 
				 }
			
 
				 
			
 
				 template <class FMMNode>
			
 
				+template <int SYNC=__DEVICE_SYNC__>
			
 
				 void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
			
 
				   if(setup_data.interac_data.Dim(0)==0 || setup_data.interac_data.Dim(1)==0){
			
 
				     Profile::Tic("Host2Device",&this->comm,false,25);
			
@@ -1722,11 +1731,11 @@ void FMM_Pts<FMMNode>::EvalList(SetupData<Real_t>& setup_data, bool device){
 
				     if(device) MIC_Lock::release_lock(lock_idx);
			
 
				   }
			
 
				 
			
 
				-  #ifndef __MIC_ASYNCH__
			
 
				   #ifdef __INTEL_OFFLOAD
			
 
				-  #pragma offload if(device) target(mic:0)
			
 
				-  {if(device) MIC_Lock::wait_lock(lock_idx);}
			
 
				-  #endif
			
 
				+  if(SYNC){
			
 
				+    #pragma offload if(device) target(mic:0)
			
 
				+    {if(device) MIC_Lock::wait_lock(lock_idx);}
			
 
				+  }
			
 
				   #endif
			
 
				 
			
 
				   Profile::Toc();
			
@@ -3089,6 +3098,7 @@ void FMM_Pts<FMMNode>::SetupInteracPts(SetupData<Real_t>& setup_data, bool shift
 
				 }
			
 
				 
			
 
				 template <class FMMNode>
			
 
				+template <int SYNC=__DEVICE_SYNC__>
			
 
				 void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
			
 
				   if(setup_data.interac_data.Dim(0)==0 || setup_data.interac_data.Dim(1)==0){
			
 
				     Profile::Tic("Host2Device",&this->comm,false,25);
			
@@ -3268,11 +3278,11 @@ void FMM_Pts<FMMNode>::EvalListPts(SetupData<Real_t>& setup_data, bool device){
 
				     if(device) MIC_Lock::release_lock(lock_idx);
			
 
				   }
			
 
				 
			
 
				-  #ifndef __MIC_ASYNCH__
			
 
				   #ifdef __INTEL_OFFLOAD
			
 
				-  #pragma offload if(device) target(mic:0)
			
 
				-  {if(device) MIC_Lock::wait_lock(lock_idx);}
			
 
				-  #endif
			
 
				+  if(SYNC){
			
 
				+    #pragma offload if(device) target(mic:0)
			
 
				+    {if(device) MIC_Lock::wait_lock(lock_idx);}
			
 
				+  }
			
 
				   #endif
			
 
				 
			
 
				   Profile::Toc();
			
--- a/include/fmm_tree.hpp
+++ b/include/fmm_tree.hpp
@@ -5,14 +5,17 @@
 
				  * \brief This file contains the definition of the FMM_Tree class.
			
 
				  */
			
 
				 
			
 
				-#ifndef _PVFMM_FMM_TREE_HPP_
			
 
				-#define _PVFMM_FMM_TREE_HPP_
			
 
				-
			
 
				-#include <iostream>
			
 
				 #include <mpi.h>
			
 
				+#include <vector>
			
 
				+
			
 
				 #include <pvfmm_common.hpp>
			
 
				-#include <mpi_tree.hpp>
			
 
				+#include <interac_list.hpp>
			
 
				 #include <fmm_node.hpp>
			
 
				+#include <mpi_tree.hpp>
			
 
				+#include <matrix.hpp>
			
 
				+
			
 
				+#ifndef _PVFMM_FMM_TREE_HPP_
			
 
				+#define _PVFMM_FMM_TREE_HPP_
			
 
				 
			
 
				 namespace pvfmm{
			
 
				 
			
--- a/include/fmm_tree.txx
+++ b/include/fmm_tree.txx
@@ -5,9 +5,17 @@
 
				  * \brief This file contains the implementation of the class FMM_Tree.
			
 
				  */
			
 
				 
			
 
				-#include <assert.h>
			
 
				+#include <omp.h>
			
 
				+#include <sstream>
			
 
				+#include <iomanip>
			
 
				+#include <cassert>
			
 
				+
			
 
				+#include <mpi_node.hpp>
			
 
				 #include <fmm_node.hpp>
			
 
				+#include <mem_utils.hpp>
			
 
				+#include <mortonid.hpp>
			
 
				 #include <profile.hpp>
			
 
				+#include <vector.hpp>
			
 
				 
			
 
				 namespace pvfmm{
			
 
				 
			
--- a/include/interac_list.hpp
+++ b/include/interac_list.hpp
@@ -7,12 +7,14 @@
 
				  * symmetry class for each interaction.
			
 
				  */
			
 
				 
			
 
				-#ifndef _PVFMM_INTERAC_LIST_HPP_
			
 
				-#define _PVFMM_INTERAC_LIST_HPP_
			
 
				+#include <vector>
			
 
				 
			
 
				 #include <pvfmm_common.hpp>
			
 
				-#include <tree_node.hpp>
			
 
				 #include <precomp_mat.hpp>
			
 
				+#include <matrix.hpp>
			
 
				+
			
 
				+#ifndef _PVFMM_INTERAC_LIST_HPP_
			
 
				+#define _PVFMM_INTERAC_LIST_HPP_
			
 
				 
			
 
				 namespace pvfmm{
			
 
				 
			
--- a/include/interac_list.txx
+++ b/include/interac_list.txx
@@ -7,10 +7,10 @@
 
				  * symmetry class for each interaction.
			
 
				  */
			
 
				 
			
 
				-#include <math.h>
			
 
				-#include <algorithm>
			
 
				-#include <tree_node.hpp>
			
 
				-#include <precomp_mat.hpp>
			
 
				+#include <cmath>
			
 
				+#include <cassert>
			
 
				+
			
 
				+#include <parUtils.h>
			
 
				 #include <ompUtils.h>
			
 
				 
			
 
				 namespace pvfmm{
			
--- a/include/kernel.hpp
+++ b/include/kernel.hpp
@@ -6,13 +6,13 @@
 
				  * implementation of various kernels for FMM.
			
 
				  */
			
 
				 
			
 
				-#ifndef _PVFMM_FMM_KERNEL_HPP_
			
 
				-#define _PVFMM_FMM_KERNEL_HPP_
			
 
				+#include <string>
			
 
				 
			
 
				 #include <pvfmm_common.hpp>
			
 
				-#include <quad_utils.hpp>
			
 
				 #include <mem_mgr.hpp>
			
 
				-#include <string>
			
 
				+
			
 
				+#ifndef _PVFMM_FMM_KERNEL_HPP_
			
 
				+#define _PVFMM_FMM_KERNEL_HPP_
			
 
				 
			
 
				 namespace pvfmm{
			
 
				 
			
@@ -41,7 +41,7 @@ struct Kernel{
 
				    * \brief Constructor.
			
 
				    */
			
 
				   Kernel(Ker_t poten, Ker_t dbl_poten, const char* name, int dim_,
			
 
				-         const int (&k_dim)[2], bool homogen_=false, T ker_scale=0,
			
 
				+         std::pair<int,int> k_dim, bool homogen_=false, T ker_scale=0,
			
 
				          size_t dev_poten=(size_t)NULL, size_t dev_dbl_poten=(size_t)NULL);
			
 
				 
			
 
				   /**
			
@@ -73,7 +73,7 @@ struct Kernel{
 
				 template<typename T, void (*A)(T*, int, T*, int, T*, int, T*, mem::MemoryManager* mem_mgr),
			
 
				                      void (*B)(T*, int, T*, int, T*, int, T*, mem::MemoryManager* mem_mgr)>
			
 
				 Kernel<T> BuildKernel(const char* name, int dim,
			
 
				-         const int (&k_dim)[2], bool homogen=false, T ker_scale=0){
			
 
				+         std::pair<int,int> k_dim, bool homogen=false, T ker_scale=0){
			
 
				   size_t dev_ker_poten      ;
			
 
				   size_t dev_dbl_layer_poten;
			
 
				   #ifdef __INTEL_OFFLOAD
			
@@ -91,7 +91,7 @@ Kernel<T> BuildKernel(const char* name, int dim,
 
				 
			
 
				 template<typename T, void (*A)(T*, int, T*, int, T*, int, T*, mem::MemoryManager* mem_mgr)>
			
 
				 Kernel<T> BuildKernel(const char* name, int dim,
			
 
				-         const int (&k_dim)[2], bool homogen=false, T ker_scale=0){
			
 
				+         std::pair<int,int> k_dim, bool homogen=false, T ker_scale=0){
			
 
				   size_t dev_ker_poten      ;
			
 
				   #ifdef __INTEL_OFFLOAD
			
 
				   #pragma offload target(mic:0)
			
@@ -133,36 +133,33 @@ void laplace_grad(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_cn
 
				 
			
 
				 
			
 
				 
			
 
				-int dim_laplace_poten[2]={1,1};
			
 
				-int dim_laplace_grad [2]={1,3};
			
 
				-
			
 
				 #ifdef QuadReal_t
			
 
				-const Kernel<QuadReal_t> laplace_potn_q=BuildKernel<QuadReal_t, laplace_poten, laplace_dbl_poten>("laplace"     , 3, dim_laplace_poten, true, 1.0);
			
 
				-const Kernel<QuadReal_t> laplace_grad_q=BuildKernel<QuadReal_t, laplace_grad                    >("laplace_grad", 3, dim_laplace_grad , true, 2.0);
			
 
				+const Kernel<QuadReal_t> laplace_potn_q=BuildKernel<QuadReal_t, laplace_poten, laplace_dbl_poten>("laplace"     , 3, std::pair<int,int>(1,1), true, 1.0);
			
 
				+const Kernel<QuadReal_t> laplace_grad_q=BuildKernel<QuadReal_t, laplace_grad                    >("laplace_grad", 3, std::pair<int,int>(1,3), true, 2.0);
			
 
				 #endif
			
 
				 
			
 
				-const Kernel<double    > laplace_potn_d=BuildKernel<double    , laplace_poten, laplace_dbl_poten>("laplace"     , 3, dim_laplace_poten, true, 1.0);
			
 
				-const Kernel<double    > laplace_grad_d=BuildKernel<double    , laplace_grad                    >("laplace_grad", 3, dim_laplace_grad , true, 2.0);
			
 
				+const Kernel<double    > laplace_potn_d=BuildKernel<double    , laplace_poten, laplace_dbl_poten>("laplace"     , 3, std::pair<int,int>(1,1), true, 1.0);
			
 
				+const Kernel<double    > laplace_grad_d=BuildKernel<double    , laplace_grad                    >("laplace_grad", 3, std::pair<int,int>(1,3), true, 2.0);
			
 
				 
			
 
				-const Kernel<float     > laplace_potn_f=BuildKernel<float     , laplace_poten, laplace_dbl_poten>("laplace"     , 3, dim_laplace_poten, true, 1.0);
			
 
				-const Kernel<float     > laplace_grad_f=BuildKernel<float     , laplace_grad                    >("laplace_grad", 3, dim_laplace_grad , true, 2.0);
			
 
				+const Kernel<float     > laplace_potn_f=BuildKernel<float     , laplace_poten, laplace_dbl_poten>("laplace"     , 3, std::pair<int,int>(1,1), true, 1.0);
			
 
				+const Kernel<float     > laplace_grad_f=BuildKernel<float     , laplace_grad                    >("laplace_grad", 3, std::pair<int,int>(1,3), true, 2.0);
			
 
				 
			
 
				 template<class T>
			
 
				 struct LaplaceKernel{
			
 
				-  static Kernel<T>* potn_ker;
			
 
				-  static Kernel<T>* grad_ker;
			
 
				+  inline static const Kernel<T>& potn_ker();
			
 
				+  inline static const Kernel<T>& grad_ker();
			
 
				 };
			
 
				 
			
 
				 #ifdef QuadReal_t
			
 
				-template<> Kernel<QuadReal_t>* LaplaceKernel<QuadReal_t>::potn_ker=(Kernel<QuadReal_t>*)&laplace_potn_q;
			
 
				-template<> Kernel<QuadReal_t>* LaplaceKernel<QuadReal_t>::grad_ker=(Kernel<QuadReal_t>*)&laplace_grad_q;
			
 
				+template<> const Kernel<QuadReal_t>& LaplaceKernel<QuadReal_t>::potn_ker(){ return laplace_potn_q; };
			
 
				+template<> const Kernel<QuadReal_t>& LaplaceKernel<QuadReal_t>::grad_ker(){ return laplace_grad_q; };
			
 
				 #endif
			
 
				 
			
 
				-template<> Kernel<double>* LaplaceKernel<double>::potn_ker=(Kernel<double>*)&laplace_potn_d;
			
 
				-template<> Kernel<double>* LaplaceKernel<double>::grad_ker=(Kernel<double>*)&laplace_grad_d;
			
 
				+template<> const Kernel<double>& LaplaceKernel<double>::potn_ker(){ return laplace_potn_d; };
			
 
				+template<> const Kernel<double>& LaplaceKernel<double>::grad_ker(){ return laplace_grad_d; };
			
 
				 
			
 
				-template<> Kernel<float>* LaplaceKernel<float>::potn_ker=(Kernel<float>*)&laplace_potn_f;
			
 
				-template<> Kernel<float>* LaplaceKernel<float>::grad_ker=(Kernel<float>*)&laplace_grad_f;
			
 
				+template<> const Kernel<float>& LaplaceKernel<float>::potn_ker(){ return laplace_potn_f; };
			
 
				+template<> const Kernel<float>& LaplaceKernel<float>::grad_ker(){ return laplace_grad_f; };
			
 
				 
			
 
				 ////////////////////////////////////////////////////////////////////////////////
			
 
				 ////////                   STOKES KERNEL                             ////////
			
@@ -189,17 +186,13 @@ void stokes_grad(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cn
 
				 
			
 
				 
			
 
				 
			
 
				-int dim_stokes_vel   [2]={3,3};
			
 
				-const Kernel<double> ker_stokes_vel   =BuildKernel<double, stokes_vel, stokes_sym_dip>("stokes_vel"   , 3, dim_stokes_vel   ,true,1.0);
			
 
				+const Kernel<double> ker_stokes_vel   =BuildKernel<double, stokes_vel, stokes_sym_dip>("stokes_vel"   , 3, std::pair<int,int>(3,3),true,1.0);
			
 
				 
			
 
				-int dim_stokes_press [2]={3,1};
			
 
				-const Kernel<double> ker_stokes_press =BuildKernel<double, stokes_press              >("stokes_press" , 3, dim_stokes_press ,true,2.0);
			
 
				+const Kernel<double> ker_stokes_press =BuildKernel<double, stokes_press              >("stokes_press" , 3, std::pair<int,int>(3,1),true,2.0);
			
 
				 
			
 
				-int dim_stokes_stress[2]={3,9};
			
 
				-const Kernel<double> ker_stokes_stress=BuildKernel<double, stokes_stress             >("stokes_stress", 3, dim_stokes_stress,true,2.0);
			
 
				+const Kernel<double> ker_stokes_stress=BuildKernel<double, stokes_stress             >("stokes_stress", 3, std::pair<int,int>(3,9),true,2.0);
			
 
				 
			
 
				-int dim_stokes_grad  [2]={3,9};
			
 
				-const Kernel<double> ker_stokes_grad  =BuildKernel<double, stokes_grad               >("stokes_grad"  , 3, dim_stokes_grad  ,true,2.0);
			
 
				+const Kernel<double> ker_stokes_grad  =BuildKernel<double, stokes_grad               >("stokes_grad"  , 3, std::pair<int,int>(3,9),true,2.0);
			
 
				 
			
 
				 ////////////////////////////////////////////////////////////////////////////////
			
 
				 ////////                  BIOT-SAVART KERNEL                            ////////
			
@@ -208,8 +201,7 @@ const Kernel<double> ker_stokes_grad  =BuildKernel<double, stokes_grad
 
				 template <class T>
			
 
				 void biot_savart(T* r_src, int src_cnt, T* v_src_, int dof, T* r_trg, int trg_cnt, T* k_out, mem::MemoryManager* mem_mgr);
			
 
				 
			
 
				-int dim_biot_savart[2]={3,3};
			
 
				-const Kernel<double> ker_biot_savart=BuildKernel<double, biot_savart>("biot_savart", 3, dim_biot_savart,true,2.0);
			
 
				+const Kernel<double> ker_biot_savart=BuildKernel<double, biot_savart>("biot_savart", 3, std::pair<int,int>(3,3),true,2.0);
			
 
				 
			
 
				 ////////////////////////////////////////////////////////////////////////////////
			
 
				 ////////                   HELMHOLTZ KERNEL                             ////////
			
@@ -227,11 +219,9 @@ void helmholtz_grad(T* r_src, int src_cnt, T* v_src, int dof, T* r_trg, int trg_
 
				 
			
 
				 
			
 
				 
			
 
				-int dim_helmholtz     [2]={2,2};
			
 
				-const Kernel<double> ker_helmholtz     =BuildKernel<double, helmholtz_poten>("helmholtz"     , 3, dim_helmholtz     );
			
 
				+const Kernel<double> ker_helmholtz     =BuildKernel<double, helmholtz_poten>("helmholtz"     , 3, std::pair<int,int>(2,2));
			
 
				 
			
 
				-int dim_helmholtz_grad[2]={2,6};
			
 
				-const Kernel<double> ker_helmholtz_grad=BuildKernel<double, helmholtz_grad >("helmholtz_grad", 3, dim_helmholtz_grad);
			
 
				+const Kernel<double> ker_helmholtz_grad=BuildKernel<double, helmholtz_grad >("helmholtz_grad", 3, std::pair<int,int>(2,6));
			
 
				 
			
 
				 }//end namespace
			
 
				 #ifdef __INTEL_OFFLOAD
			
--- a/include/kernel.txx
+++ b/include/kernel.txx
@@ -6,14 +6,29 @@
 
				  * implementation of various kernels for FMM.
			
 
				  */
			
 
				 
			
 
				-#ifdef USE_SSE
			
 
				-#include <emmintrin.h>
			
 
				-#endif
			
 
				-
			
 
				-#include <math.h>
			
 
				-#include <assert.h>
			
 
				+#include <cmath>
			
 
				+#include <cstdlib>
			
 
				 #include <vector>
			
 
				+
			
 
				+#include <mem_utils.hpp>
			
 
				 #include <profile.hpp>
			
 
				+#include <vector.hpp>
			
 
				+
			
 
				+#ifdef __SSE__
			
 
				+#include <xmmintrin.h>
			
 
				+#endif
			
 
				+#ifdef __SSE2__
			
 
				+#include <emmintrin.h>
			
 
				+#endif
			
 
				+#ifdef __SSE3__
			
 
				+#include <pmmintrin.h>
			
 
				+#endif
			
 
				+#ifdef __AVX__
			
 
				+#include <immintrin.h>
			
 
				+#endif
			
 
				+#if defined(__MIC__)
			
 
				+#include <immintrin.h>
			
 
				+#endif
			
 
				 
			
 
				 namespace pvfmm{
			
 
				 
			
@@ -31,11 +46,11 @@ Kernel<T>::Kernel(): dim(0){
 
				  */
			
 
				 template <class T>
			
 
				 Kernel<T>::Kernel(Ker_t poten, Ker_t dbl_poten, const char* name, int dim_,
			
 
				-                  const int (&k_dim)[2], bool homogen_, T ker_scale,
			
 
				+                  std::pair<int,int> k_dim, bool homogen_, T ker_scale,
			
 
				                   size_t dev_poten, size_t dev_dbl_poten){
			
 
				   dim=dim_;
			
 
				-  ker_dim[0]=k_dim[0];
			
 
				-  ker_dim[1]=k_dim[1];
			
 
				+  ker_dim[0]=k_dim.first;
			
 
				+  ker_dim[1]=k_dim.second;
			
 
				   ker_poten=poten;
			
 
				   dbl_layer_poten=dbl_poten;
			
 
				   homogen=homogen_;
			
--- a/include/legendre_rule.hpp
+++ b/include/legendre_rule.hpp
@@ -1,8 +1,9 @@
 
				-#ifndef _LEGENDRE_RULE_HPP_
			
 
				-#define _LEGENDRE_RULE_HPP_
			
 
				 
			
 
				 # include <cstring>
			
 
				 
			
 
				+#ifndef _LEGENDRE_RULE_HPP_
			
 
				+#define _LEGENDRE_RULE_HPP_
			
 
				+
			
 
				 void cdgqf ( int nt, int kind, double alpha, double beta, double t[],
			
 
				   double wts[] );
			
 
				 void cgqf ( int nt, int kind, double alpha, double beta, double a, double b,
			
--- a/include/mat_utils.hpp
+++ b/include/mat_utils.hpp
@@ -5,11 +5,11 @@
 
				  * \brief This file contains BLAS and LAPACK wrapper functions.
			
 
				  */
			
 
				 
			
 
				+#include <pvfmm_common.hpp>
			
 
				+
			
 
				 #ifndef _PVFMM_MAT_UTILS_
			
 
				 #define _PVFMM_MAT_UTILS_
			
 
				 
			
 
				-#include <cstdlib>
			
 
				-
			
 
				 #ifdef __INTEL_OFFLOAD
			
 
				 #pragma offload_attribute(push,target(mic))
			
 
				 #endif
			
--- a/include/mat_utils.txx
+++ b/include/mat_utils.txx
@@ -5,14 +5,16 @@
 
				  * \brief This file contains BLAS and LAPACK wrapper functions.
			
 
				  */
			
 
				 
			
 
				+#include <omp.h>
			
 
				+#include <cmath>
			
 
				 #include <cassert>
			
 
				-#include <vector>
			
 
				+#include <algorithm>
			
 
				 #include <iostream>
			
 
				-#include <stdint.h>
			
 
				-#include <math.h>
			
 
				+#include <vector>
			
 
				+
			
 
				 #include <blas.h>
			
 
				 #include <lapack.h>
			
 
				-#include <fft_wrapper.hpp>
			
 
				+#include <matrix.hpp>
			
 
				 
			
 
				 namespace pvfmm{
			
 
				 namespace mat{
			
--- a/include/matrix.hpp
+++ b/include/matrix.hpp
@@ -5,14 +5,14 @@
 
				  * \brief This file contains definition of the class Matrix.
			
 
				  */
			
 
				 
			
 
				-#ifndef _PVFMM_MATRIX_HPP_
			
 
				-#define _PVFMM_MATRIX_HPP_
			
 
				+#include <stdint.h>
			
 
				 
			
 
				-#include <cstdlib>
			
 
				-#include <vector>
			
 
				-#include <iostream>
			
 
				+#include <pvfmm_common.hpp>
			
 
				 #include <vector.hpp>
			
 
				 
			
 
				+#ifndef _PVFMM_MATRIX_HPP_
			
 
				+#define _PVFMM_MATRIX_HPP_
			
 
				+
			
 
				 #ifdef __INTEL_OFFLOAD
			
 
				 #pragma offload_attribute(push,target(mic))
			
 
				 #endif
			
--- a/include/matrix.txx
+++ b/include/matrix.txx
@@ -5,11 +5,17 @@
 
				  * \brief This file contains implementation of the class Matrix.
			
 
				  */
			
 
				 
			
 
				-#include <cstring>
			
 
				+#include <omp.h>
			
 
				+#include <cmath>
			
 
				+#include <cstdlib>
			
 
				 #include <cassert>
			
 
				+#include <iostream>
			
 
				 #include <iomanip>
			
 
				-#include <profile.hpp>
			
 
				+
			
 
				+#include <device_wrapper.hpp>
			
 
				 #include <mat_utils.hpp>
			
 
				+#include <mem_utils.hpp>
			
 
				+#include <profile.hpp>
			
 
				 
			
 
				 namespace pvfmm{
			
 
				 
			
--- a/include/mem_mgr.hpp
+++ b/include/mem_mgr.hpp
@@ -6,18 +6,20 @@
 
				  * uses a pre-allocated buffer of size defined in call to the constructor.
			
 
				  */
			
 
				 
			
 
				-#ifndef _PVFMM_MEM_MGR_HPP_
			
 
				-#define _PVFMM_MEM_MGR_HPP_
			
 
				-
			
 
				-#include <map>
			
 
				-#include <stack>
			
 
				-#include <vector>
			
 
				-#include <cassert>
			
 
				-#include <iostream>
			
 
				-#include <cmath>
			
 
				 #include <omp.h>
			
 
				+#include <cstdlib>
			
 
				+#include <stdint.h>
			
 
				+#include <algorithm>
			
 
				+#include <iostream>
			
 
				+#include <cassert>
			
 
				+#include <vector>
			
 
				+#include <stack>
			
 
				+#include <map>
			
 
				+
			
 
				 #include <pvfmm_common.hpp>
			
 
				-#include <mem_utils.hpp>
			
 
				+
			
 
				+#ifndef _PVFMM_MEM_MGR_HPP_
			
 
				+#define _PVFMM_MEM_MGR_HPP_
			
 
				 
			
 
				 #ifdef __INTEL_OFFLOAD
			
 
				 #pragma offload_attribute(push,target(mic))
			
--- a/include/mem_utils.hpp
+++ b/include/mem_utils.hpp
@@ -5,15 +5,10 @@
 
				  * \brief This file contains memory management utilities.
			
 
				  */
			
 
				 
			
 
				-#ifndef _PVFMM_MEM_UTILS_
			
 
				-#define _PVFMM_MEM_UTILS_
			
 
				-
			
 
				-#include <cstdlib>
			
 
				 #include <pvfmm_common.hpp>
			
 
				 
			
 
				-#define ALLOC alloc_if(1) free_if(0)
			
 
				-#define FREE alloc_if(0) free_if(1)
			
 
				-#define REUSE alloc_if(0) free_if(0)
			
 
				+#ifndef _PVFMM_MEM_UTILS_
			
 
				+#define _PVFMM_MEM_UTILS_
			
 
				 
			
 
				 #ifdef __INTEL_OFFLOAD
			
 
				 #pragma offload_attribute(push,target(mic))
			
--- a/include/mem_utils.txx
+++ b/include/mem_utils.txx
@@ -5,11 +5,10 @@
 
				  * \brief This file contains implementation of mem_utils.hpp.
			
 
				  */
			
 
				 
			
 
				-#include <omp.h>
			
 
				 #include <cassert>
			
 
				 #include <cstring>
			
 
				+#include <cstdlib>
			
 
				 #include <stdint.h>
			
 
				-#include <profile.hpp>
			
 
				 
			
 
				 #ifdef __INTEL_OFFLOAD
			
 
				 #pragma offload_attribute(push,target(mic))
			
--- a/include/mortonid.hpp
+++ b/include/mortonid.hpp
@@ -5,13 +5,13 @@
 
				  * \brief This file contains definition of the class MortonId.
			
 
				  */
			
 
				 
			
 
				-#ifndef _PVFMM_MORTONID_HPP_
			
 
				-#define _PVFMM_MORTONID_HPP_
			
 
				+#include <vector>
			
 
				+#include <stdint.h>
			
 
				 
			
 
				 #include <pvfmm_common.hpp>
			
 
				-#include <iostream>
			
 
				-#include <stdint.h>
			
 
				-#include <vector>
			
 
				+
			
 
				+#ifndef _PVFMM_MORTONID_HPP_
			
 
				+#define _PVFMM_MORTONID_HPP_
			
 
				 
			
 
				 namespace pvfmm{
			
 
				 
			
--- a/include/mortonid.txx
+++ b/include/mortonid.txx
@@ -5,6 +5,8 @@
 
				  * \brief This file contains implementation of the class MortonId.
			
 
				  */
			
 
				 
			
 
				+#include <cmath>
			
 
				+
			
 
				 namespace pvfmm{
			
 
				 
			
 
				 inline MortonId::MortonId():x(0), y(0), z(0), depth(0){}
			
--- a/include/mpi_node.hpp
+++ b/include/mpi_node.hpp
@@ -6,15 +6,18 @@
 
				  * locally essential tree node.
			
 
				  */
			
 
				 
			
 
				-#ifndef _PVFMM_MPI_NODE_HPP_
			
 
				-#define _PVFMM_MPI_NODE_HPP_
			
 
				+#include <vector>
			
 
				+#include <cassert>
			
 
				+#include <stdint.h>
			
 
				 
			
 
				 #include <pvfmm_common.hpp>
			
 
				-#include <assert.h>
			
 
				 #include <tree_node.hpp>
			
 
				 #include <mortonid.hpp>
			
 
				 #include <vector.hpp>
			
 
				 
			
 
				+#ifndef _PVFMM_MPI_NODE_HPP_
			
 
				+#define _PVFMM_MPI_NODE_HPP_
			
 
				+
			
 
				 namespace pvfmm{
			
 
				 
			
 
				 /**
			
--- a/include/mpi_node.txx
+++ b/include/mpi_node.txx
@@ -1,12 +1,14 @@
 
				 /**
			
 
				- * \file mpi_node.cpp
			
 
				+ * \file mpi_node.txx
			
 
				  * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
			
 
				  * \date 12-11-2010
			
 
				  * \brief This file contains the implementation of the class MPI_Node.
			
 
				  */
			
 
				 
			
 
				-#include <assert.h>
			
 
				-#include <iostream>
			
 
				+#include <cmath>
			
 
				+
			
 
				+#include <matrix.hpp>
			
 
				+#include <mem_utils.hpp>
			
 
				 
			
 
				 namespace pvfmm{
			
 
				 
			
--- a/include/mpi_tree.hpp
+++ b/include/mpi_tree.hpp
@@ -6,15 +6,16 @@
 
				  * MPI tree.
			
 
				  */
			
 
				 
			
 
				-#ifndef _PVFMM_MPI_TREE_HPP_
			
 
				-#define _PVFMM_MPI_TREE_HPP_
			
 
				+#include <mpi.h>
			
 
				+#include <vector>
			
 
				 
			
 
				 #include <pvfmm_common.hpp>
			
 
				-#include <mpi.h>
			
 
				-#include <mpi_node.hpp>
			
 
				 #include <mortonid.hpp>
			
 
				 #include <tree.hpp>
			
 
				 
			
 
				+#ifndef _PVFMM_MPI_TREE_HPP_
			
 
				+#define _PVFMM_MPI_TREE_HPP_
			
 
				+
			
 
				 namespace pvfmm{
			
 
				 
			
 
				 enum BoundaryType{
			
--- a/include/mpi_tree.txx
+++ b/include/mpi_tree.txx
@@ -5,13 +5,24 @@
 
				  * \brief This file contains the implementation of the class MPI_Tree.
			
 
				  */
			
 
				 
			
 
				-#include <assert.h>
			
 
				-#include <cstring>
			
 
				+#include <omp.h>
			
 
				+#include <cmath>
			
 
				+#include <cstdlib>
			
 
				+#include <cassert>
			
 
				+#include <string>
			
 
				+#include <sstream>
			
 
				+#include <iostream>
			
 
				+#include <iomanip>
			
 
				 #include <fstream>
			
 
				-#include <list>
			
 
				+#include <algorithm>
			
 
				+#include <stdint.h>
			
 
				 #include <set>
			
 
				-#include <parUtils.h>
			
 
				+
			
 
				+#include <dtypes.h>
			
 
				 #include <ompUtils.h>
			
 
				+#include <parUtils.h>
			
 
				+#include <mem_utils.hpp>
			
 
				+#include <mpi_node.hpp>
			
 
				 #include <profile.hpp>
			
 
				 
			
 
				 namespace pvfmm{
			
--- a/include/ompUtils.txx
+++ b/include/ompUtils.txx
@@ -1,7 +1,7 @@
 
				-#include <cstdlib>
			
 
				+
			
 
				 #include <omp.h>
			
 
				-#include <iterator>
			
 
				-#include <vector>
			
 
				+#include <cstring>
			
 
				+#include <algorithm>
			
 
				 
			
 
				 namespace pvfmm{
			
 
				 
			
--- a/include/parUtils.h
+++ b/include/parUtils.h
@@ -8,13 +8,13 @@
 
				   @author Dhairya Malhotra, dhairya.malhotra@gmail.com
			
 
				   */
			
 
				 
			
 
				-#ifndef __PVFMM_PAR_UTILS_H_
			
 
				-#define __PVFMM_PAR_UTILS_H_
			
 
				-
			
 
				-#include "mpi.h"
			
 
				+#include <mpi.h>
			
 
				 #include <vector>
			
 
				 #include <vector.hpp>
			
 
				 
			
 
				+#ifndef __PVFMM_PAR_UTILS_H_
			
 
				+#define __PVFMM_PAR_UTILS_H_
			
 
				+
			
 
				 /**
			
 
				   @namespace par
			
 
				   @author Rahul Sampath
			
--- a/include/parUtils.txx
+++ b/include/parUtils.txx
@@ -8,13 +8,15 @@
 
				   @author Santi Swaroop Adavani, santis@gmail.com
			
 
				   */
			
 
				 
			
 
				-#include "dtypes.h"
			
 
				+#include <cmath>
			
 
				 #include <cassert>
			
 
				+#include <cstring>
			
 
				+#include <cstdlib>
			
 
				 #include <iostream>
			
 
				 #include <algorithm>
			
 
				-#include <cstring>
			
 
				-#include "ompUtils.h"
			
 
				-#include <mpi.h>
			
 
				+
			
 
				+#include <dtypes.h>
			
 
				+#include <ompUtils.h>
			
 
				 
			
 
				 namespace pvfmm{
			
 
				 namespace par{
			
--- a/include/precomp_mat.hpp
+++ b/include/precomp_mat.hpp
@@ -6,13 +6,15 @@
 
				  * Handles storage of precomputed translation matrices.
			
 
				  */
			
 
				 
			
 
				-#ifndef _PVFMM_PrecompMAT_HPP_
			
 
				-#define _PVFMM_PrecompMAT_HPP_
			
 
				-
			
 
				 #include <mpi.h>
			
 
				+#include <vector>
			
 
				+
			
 
				 #include <pvfmm_common.hpp>
			
 
				 #include <matrix.hpp>
			
 
				 
			
 
				+#ifndef _PVFMM_PrecompMAT_HPP_
			
 
				+#define _PVFMM_PrecompMAT_HPP_
			
 
				+
			
 
				 namespace pvfmm{
			
 
				 
			
 
				 typedef enum{
			
--- a/include/precomp_mat.txx
+++ b/include/precomp_mat.txx
@@ -6,8 +6,16 @@
 
				  * Handles storage of precomputed translation matrices.
			
 
				  */
			
 
				 
			
 
				-#include <sys/stat.h>
			
 
				+#include <omp.h>
			
 
				+#include <cassert>
			
 
				 #include <stdint.h>
			
 
				+#ifdef PVFMM_HAVE_SYS_STAT_H
			
 
				+#include <sys/stat.h>
			
 
				+#endif
			
 
				+
			
 
				+#include <mem_utils.hpp>
			
 
				+#include <profile.hpp>
			
 
				+#include <vector.hpp>
			
 
				 
			
 
				 namespace pvfmm{
			
 
				 
			
--- a/include/profile.hpp
+++ b/include/profile.hpp
@@ -5,16 +5,16 @@
 
				  * \brief This file contains definition of the class Profile.
			
 
				  */
			
 
				 
			
 
				-#ifndef _PVFMM_PROFILE_HPP_
			
 
				-#define _PVFMM_PROFILE_HPP_
			
 
				-
			
 
				 #include <mpi.h>
			
 
				-#include <iostream>
			
 
				 #include <string>
			
 
				 #include <vector>
			
 
				 #include <stack>
			
 
				+
			
 
				 #include <pvfmm_common.hpp>
			
 
				 
			
 
				+#ifndef _PVFMM_PROFILE_HPP_
			
 
				+#define _PVFMM_PROFILE_HPP_
			
 
				+
			
 
				 #ifndef __PROFILE__
			
 
				 #define __PROFILE__ -1
			
 
				 #endif
			
--- a/include/pvfmm.hpp
+++ b/include/pvfmm.hpp
@@ -5,17 +5,22 @@
 
				  * \brief This file contains wrapper functions for PvFMM.
			
 
				  */
			
 
				 
			
 
				-#ifndef _PVFMM_HPP_
			
 
				-#define _PVFMM_HPP_
			
 
				-
			
 
				 #include <mpi.h>
			
 
				-#include <cstdlib>
			
 
				-#include <iostream>
			
 
				+#include <vector>
			
 
				+#include <cmath>
			
 
				 
			
 
				 #include <pvfmm_common.hpp>
			
 
				-#include <fmm_cheb.hpp>
			
 
				-#include <fmm_node.hpp>
			
 
				+#include <cheb_node.hpp>
			
 
				+#include <mpi_node.hpp>
			
 
				 #include <fmm_tree.hpp>
			
 
				+#include <fmm_node.hpp>
			
 
				+#include <fmm_cheb.hpp>
			
 
				+#include <fmm_pts.hpp>
			
 
				+#include <vector.hpp>
			
 
				+#include <parUtils.h>
			
 
				+
			
 
				+#ifndef _PVFMM_HPP_
			
 
				+#define _PVFMM_HPP_
			
 
				 
			
 
				 namespace pvfmm{
			
 
				 
			
--- a/include/pvfmm_common.hpp
+++ b/include/pvfmm_common.hpp
@@ -38,11 +38,15 @@
 
				 #define DEVICE_BUFFER_SIZE 1024 //in MB
			
 
				 #define V_BLK_CACHE 25 //in KB
			
 
				 
			
 
				+#ifndef __DEVICE_SYNC__
			
 
				+#define __DEVICE_SYNC__ 0 // No device synchronization by default.
			
 
				+#endif
			
 
				+
			
 
				 #define UNUSED(x) (void)(x) // to ignore unused variable warning.
			
 
				 
			
 
				-#include <cstdlib>
			
 
				-#include <cassert>
			
 
				 #ifndef NDEBUG
			
 
				+#include <cassert>
			
 
				+#include <iostream>
			
 
				 #define ASSERT_WITH_MSG(cond, msg) do \
			
 
				 { if (!(cond)) { std::cerr<<"Error: "<<msg<<'\n'; assert(cond); } \
			
 
				 } while(0)
			
--- a/include/quad_utils.hpp
+++ b/include/quad_utils.hpp
@@ -5,13 +5,11 @@
 
				  * \brief This file contains definition of QuadReal_t.
			
 
				  */
			
 
				 
			
 
				+#include <cmath>
			
 
				+
			
 
				 #ifndef _QUAD_UTILS_
			
 
				 #define _QUAD_UTILS_
			
 
				 
			
 
				-#include <pvfmm_common.hpp>
			
 
				-#include <iostream>
			
 
				-#include <vector>
			
 
				-
			
 
				 #ifdef PVFMM_QUAD_T
			
 
				 
			
 
				 typedef PVFMM_QUAD_T QuadReal_t;
			
--- a/include/quad_utils.txx
+++ b/include/quad_utils.txx
@@ -5,9 +5,11 @@
 
				  * \brief This file contains quadruple-precision related functions.
			
 
				  */
			
 
				 
			
 
				-#include <iomanip>
			
 
				-#include <cstdlib>
			
 
				+#include <omp.h>
			
 
				 #include <cmath>
			
 
				+#include <iostream>
			
 
				+#include <iomanip>
			
 
				+#include <vector>
			
 
				 
			
 
				 QuadReal_t atoquad(const char* str){
			
 
				   size_t i=0;
			
--- a/include/tree.hpp
+++ b/include/tree.hpp
@@ -5,13 +5,16 @@
 
				  * \brief This file contains the definition of the base class for a tree.
			
 
				  */
			
 
				 
			
 
				-#ifndef _PVFMM_TREE_HPP_
			
 
				-#define _PVFMM_TREE_HPP_
			
 
				+#include <cassert>
			
 
				+#include <vector>
			
 
				 
			
 
				 #include <pvfmm_common.hpp>
			
 
				-#include <iostream>
			
 
				+#include <tree_node.hpp>
			
 
				 #include <mem_mgr.hpp>
			
 
				 
			
 
				+#ifndef _PVFMM_TREE_HPP_
			
 
				+#define _PVFMM_TREE_HPP_
			
 
				+
			
 
				 namespace pvfmm{
			
 
				 
			
 
				 /**
			
--- a/include/tree.txx
+++ b/include/tree.txx
@@ -1,13 +1,10 @@
 
				 /**
			
 
				- * \file tree.cpp
			
 
				+ * \file tree.txx
			
 
				  * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
			
 
				  * \date 12-11-2010
			
 
				  * \brief This file contains the implementation of the class Tree.
			
 
				  */
			
 
				 
			
 
				-#include <tree.hpp>
			
 
				-#include <assert.h>
			
 
				-
			
 
				 namespace pvfmm{
			
 
				 
			
 
				 template <class TreeNode>
			
--- a/include/tree_node.hpp
+++ b/include/tree_node.hpp
@@ -6,13 +6,11 @@
 
				  * node.
			
 
				  */
			
 
				 
			
 
				+#include <pvfmm_common.hpp>
			
 
				+
			
 
				 #ifndef _PVFMM_TREE_NODE_HPP_
			
 
				 #define _PVFMM_TREE_NODE_HPP_
			
 
				 
			
 
				-#include <pvfmm_common.hpp>
			
 
				-#include <assert.h>
			
 
				-#include <cstring>
			
 
				-
			
 
				 namespace pvfmm{
			
 
				 
			
 
				 /**
			
--- a/include/vector.hpp
+++ b/include/vector.hpp
@@ -5,13 +5,14 @@
 
				  * \brief This file contains definition of the class Vector.
			
 
				  */
			
 
				 
			
 
				-#ifndef _PVFMM_VECTOR_HPP_
			
 
				-#define _PVFMM_VECTOR_HPP_
			
 
				-
			
 
				 #include <vector>
			
 
				-#include <iostream>
			
 
				 #include <stdint.h>
			
 
				 
			
 
				+#include <pvfmm_common.hpp>
			
 
				+
			
 
				+#ifndef _PVFMM_VECTOR_HPP_
			
 
				+#define _PVFMM_VECTOR_HPP_
			
 
				+
			
 
				 #ifdef __INTEL_OFFLOAD
			
 
				 #pragma offload_attribute(push,target(mic))
			
 
				 #endif
			
--- a/include/vector.txx
+++ b/include/vector.txx
@@ -5,13 +5,13 @@
 
				  * \brief This file contains implementation of the class Vector.
			
 
				  */
			
 
				 
			
 
				-#include <cstdlib>
			
 
				-#include <cstring>
			
 
				 #include <cassert>
			
 
				+#include <iostream>
			
 
				 #include <iomanip>
			
 
				-#include <profile.hpp>
			
 
				-#include <mem_utils.hpp>
			
 
				+
			
 
				 #include <device_wrapper.hpp>
			
 
				+#include <mem_utils.hpp>
			
 
				+#include <profile.hpp>
			
 
				 
			
 
				 namespace pvfmm{
			
 
				 
			
--- a/m4/ac_check_intel_offload.m4
+++ b/m4/ac_check_intel_offload.m4
@@ -1,7 +1,3 @@
 
				-# SYNOPSIS
			
 
				-#
			
 
				-#   CHECK_INTEL_OFFLOAD([ACTION-IF-FOUND[, ACTION-IF-NOT-FOUND]])
			
 
				-#
			
 
				 
			
 
				 AC_DEFUN([CHECK_INTEL_OFFLOAD], [
			
 
				     ## Check for support of offload pragma and -no-offload flag. If
			
@@ -38,8 +34,8 @@ AC_DEFUN([CHECK_INTEL_OFFLOAD], [
 
				     AC_LANG_WERROR([off])
			
 
				     CFLAGS="$XCFLAGS"
			
 
				     CXXFLAGS="$XCXXFLAGS"
			
 
				+    ARFLAGS="$AR_FLAGS"
			
 
				 
			
 
				-    # Finally, execute ACTION-IF-FOUND/ACTION-IF-NOT-FOUND:
			
 
				     if test x"$intel_offload_pragma_ok" = xyes; then
			
 
				         AC_DEFINE(HAVE_INTEL_OFFLOAD_PRAGMA,1,[Define if you have INTEL_OFFLOAD_PRAGMA.])
			
 
				     fi
			
@@ -48,6 +44,9 @@ AC_DEFUN([CHECK_INTEL_OFFLOAD], [
 
				         if test x"$intel_offload_pragma_ok" = xyes; then
			
 
				             AC_DEFINE(HAVE_INTEL_OFFLOAD,1,[Define if you have INTEL_OFFLOAD.])
			
 
				             intel_offload_ok=yes
			
 
				+
			
 
				+            AR="xiar"
			
 
				+            ARFLAGS="cru -qoffload-build"
			
 
				         else
			
 
				             CFLAGS="$CFLAGS -no-offload"
			
 
				             CXXFLAGS="$CXXFLAGS -no-offload"
			
@@ -56,6 +55,7 @@ AC_DEFUN([CHECK_INTEL_OFFLOAD], [
 
				     AC_SUBST(intel_offload_pragma_ok)
			
 
				     AC_SUBST(intel_noffload_flag_ok)
			
 
				     AC_SUBST(intel_offload_ok)
			
 
				-
			
 
				+    AC_SUBST(ARFLAGS)
			
 
				+    AC_SUBST(AR)
			
 
				 ])
			
 
				 
			
--- a/src/device_wrapper.cpp
+++ b/src/device_wrapper.cpp
@@ -0,0 +1,19 @@
 
				+/**
			
 
				+ * \file device_wrapper.cpp
			
 
				+ * \author Dhairya Malhotra, dhairya.malhotra@gmail.com
			
 
				+ * \date 7-30-2014
			
 
				+ * \brief This file contains implementation of DeviceWrapper.
			
 
				+ */
			
 
				+
			
 
				+#include <mpi.h>
			
 
				+
			
 
				+#include <device_wrapper.hpp>
			
 
				+#include <vector.hpp>
			
 
				+
			
 
				+namespace pvfmm{
			
 
				+
			
 
				+  Vector<char> MIC_Lock::lock_vec;
			
 
				+  Vector<char>::Device MIC_Lock::lock_vec_;
			
 
				+  int MIC_Lock::lock_idx;
			
 
				+
			
 
				+}//end namespace
			
--- a/src/fmm_gll.cpp
+++ b/src/fmm_gll.cpp
@@ -10,6 +10,7 @@
 
				 #include <fmm_tree.hpp>
			
 
				 #include <cheb_utils.hpp>
			
 
				 #include <vector.hpp>
			
 
				+#include <cheb_node.hpp>
			
 
				 
			
 
				 typedef pvfmm::FMM_Node<pvfmm::Cheb_Node<double> > FMMNode_t;
			
 
				 typedef pvfmm::FMM_Cheb<FMMNode_t> FMM_Mat_t;
			
@@ -58,7 +59,7 @@ extern "C" {
 
				         fmm_data->fmm_mat_laplace_grad=new FMM_Mat_t;
			
 
				         fmm_mat=((FMM_Mat_t*)fmm_data->fmm_mat_laplace_grad);
			
 
				 
			
 
				-        fmm_data->kernel_laplace_grad=pvfmm::LaplaceKernel<double>::grad_ker;
			
 
				+        fmm_data->kernel_laplace_grad=&pvfmm::LaplaceKernel<double>::grad_ker();
			
 
				         mykernel=(pvfmm::Kernel<double>*)fmm_data->kernel_laplace_grad;
			
 
				 
			
 
				         fmm_data->tree_laplace_grad=new FMM_Tree_t(fmm_data->comm);