CUDA/C++ Host/Device Polymorphic Class Implementation

.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty margin-bottom:0;

up vote
7
down vote

favorite

I have an abstract class which acts as in interface for a variety of physical models providing Electric/Magnetic Fields as the result of a number of phenomena. I'm wondering if how I've done it is a good way to do it, or if there's a better way to achieve my goals.

My goals are:

A common interface for returning the field strength (and a few related things through different functions) according to a physical field model chosen by the user at runtime

The ability to call functions of this interface from host AND device with the same underlying implementation

As performant as possible

The way I've achieved this is to: specify a base BField class with __host__ __device__ specified pure virtual interface functions, and overwrite these with a number of derived classes (here DipoleB). On host, when an instance of the derived class is created, a mirror image of the instance is also created on device and a pointer to the on-device instance is stored on host. This on-device instance is also destroyed on host-instance destruction. The interface functions (here it's getBFieldAtS(double, double) and getGradBAtS(double, double)) are called on device by a __global__ kernel which is run over ~3.5mil particles.

BField.h (Base class):

#ifndef BFIELD_H
#define BFIELD_H

#include <string>

//CUDA includes
#include "host_defines.h"

class BField

protected:
 BField** this_d nullptr ;

 #ifndef __CUDA_ARCH__ //host code
 std::string modelName_m;
 #else //device code
 const char* modelName_m; //placeholder, not used
 #endif /* !__CUDA_ARCH__ */

 __host__ virtual void setupEnvironment() = 0; //define this function in derived classes to assign a pointer to that function's B Field code to the location indicated by BFieldFcnPtr_d and gradBFcnPtr_d
 __host__ virtual void deleteEnvironment() = 0;

 __host__ __device__ BField() 

public:
 __host__ __device__ virtual ~BField() ;

 __host__ __device__ BField(const BField&) = delete;
 __host__ __device__ BField& operator=(const BField&) = delete;

 __host__ __device__ virtual double getBFieldAtS(const double s, const double t) const = 0;
 __host__ __device__ virtual double getGradBAtS (const double s, const double t) const = 0;

 __host__ virtual std::string name() const return modelName_m; 
 __host__ virtual BField** getPtrGPU() const return this_d; //once returned, have to cast it to the appropriate type
;

#endif

DipoleB.h (Derived):

#ifndef DIPOLEB_BFIELD_H
#define DIPOLEB_BFIELD_H

#include "BFieldBField.h"
#include "physicalconstants.h"

constexpr double B0 3.12e-5 ; //won't change from sim to sim

class DipoleB : public BField

protected:
 //Field simulation constants
 double L_m 0.0 ;
 double L_norm_m 0.0 ;
 double s_max_m 0.0 ;

 //specified variables
 double ILATDegrees_m 0.0 ;
 double ds_m 0.0 ;
 double errorTolerance_m 0.0 ;

 //protected functions
 __host__ virtual void setupEnvironment() override;
 __host__ virtual void deleteEnvironment() override;
 __host__ __device__ double getSAtLambda(const double lambdaDegrees) const;
 __host__ __device__ double getLambdaAtS(const double s) const;

public:
 __host__ __device__ DipoleB(double ILATDegrees, double errorTolerance = 1e-4, double ds = RADIUS_EARTH / 1000.0);
 __host__ __device__ ~DipoleB();
 __host__ __device__ DipoleB(const DipoleB&) = delete;
 __host__ __device__ DipoleB& operator=(const DipoleB&) = delete;

 //for testing
 double ILAT() const return ILATDegrees_m; 
 double ds() const return ds_m; 
 double L() const return L_m; 
 double s_max() const return s_max_m; 

 __host__ virtual void setds(double ds) ds_m = ds; 

 __host__ __device__ double getBFieldAtS(const double s, const double t) const override;
 __host__ __device__ double getGradBAtS (const double s, const double t) const override;

 __host__ double getErrTol() const return errorTolerance_m; 
 __host__ double getds() const return ds_m; 
;

#endif

DipoleB.cu (Derived member function definition and some CUDA kernels):

#include "BFieldDipoleB.h"

#include "device_launch_parameters.h"
#include "ErrorHandlingcudaErrorCheck.h"
#include "ErrorHandlingcudaDeviceMacros.h"

//setup CUDA kernels
__global__ void setupEnvironmentGPU_DipoleB(BField** this_d, double ILATDeg, double errTol, double ds)

 ZEROTH_THREAD_ONLY("setupEnvironmentGPU_DipoleB", (*this_d) = new DipoleB(ILATDeg, errTol, ds));


__global__ void deleteEnvironmentGPU_DipoleB(BField** dipoleb)

 ZEROTH_THREAD_ONLY("deleteEnvironmentGPU_DipoleB", delete ((DipoleB*)(*dipoleb)));


__host__ __device__ DipoleB::DipoleB(double ILATDegrees, double errorTolerance, double ds) :
 BField(), ILATDegrees_m ILATDegrees , ds_m ds , errorTolerance_m errorTolerance 

 L_m = RADIUS_EARTH / pow(cos(ILATDegrees * RADS_PER_DEG), 2);
 L_norm_m = L_m / RADIUS_EARTH;
 s_max_m = getSAtLambda(ILATDegrees_m);

 #ifndef __CUDA_ARCH__ //host code
 modelName_m = "DipoleB";
 setupEnvironment();
 #endif /* !__CUDA_ARCH__ */


__host__ __device__ DipoleB::~DipoleB()

 #ifndef __CUDA_ARCH__ //host code
 deleteEnvironment();
 #endif /* !__CUDA_ARCH__ */


//B Field related kernels
__host__ __device__ double DipoleB::getSAtLambda(const double lambdaDegrees) const

 //double x asinh(sqrt(3.0) * sinpi(lambdaDegrees / 180.0)) ;
 double sinh_x sqrt(3.0) * sinpi(lambdaDegrees / 180.0) ;
 double x log(sinh_x + sqrt(sinh_x * sinh_x + 1)) ; //trig identity for asinh - a bit faster - asinh(x) == ln(x + sqrt(x*x + 1))

 return (0.5 * L_m / sqrt(3.0)) * (x + 0.25 * (exp(2.0*x)-exp(-2.0*x))); /* L */ //0.25 * (exp(2*x)-exp(-2*x)) == sinh(x) * cosh(x) and is faster


__host__ __device__ double DipoleB::getLambdaAtS(const double s) const
// consts: [ ILATDeg, L, L_norm, s_max, ds, errorTolerance ]
 double lambda_tmp (-ILATDegrees_m / s_max_m) * s + ILATDegrees_m ; //-ILAT / s_max * s + ILAT
 double s_tmp s_max_m - getSAtLambda(lambda_tmp) ;
 double dlambda 1.0 ;
 bool over 0 ;

 while (abs((s_tmp - s) / s) > errorTolerance_m) //errorTolerance
 
 while (1)
 
 over = (s_tmp >= s);
 if (over)
 
 lambda_tmp += dlambda;
 s_tmp = s_max_m - getSAtLambda(lambda_tmp);
 if (s_tmp < s)
 break;
 
 else
 
 lambda_tmp -= dlambda;
 s_tmp = s_max_m - getSAtLambda(lambda_tmp);
 if (s_tmp >= s)
 break;
 
 
 if (dlambda < errorTolerance_m / 100.0) //errorTolerance
 break;
 dlambda /= 5.0; //through trial and error, this reduces the number of calculations usually (compared with 2, 2.5, 3, 4, 10)
 

 return lambda_tmp;


__host__ __device__ double DipoleB::getBFieldAtS(const double s, const double simtime) const
// consts: [ ILATDeg, L, L_norm, s_max, ds, errorTolerance ]
 double lambda_deg getLambdaAtS(s) ;
 double rnorm L_norm_m * cospi(lambda_deg / 180.0) * cospi(lambda_deg / 180.0) ;

 return -B0 / (rnorm * rnorm * rnorm) * sqrt(1.0 + 3 * sinpi(lambda_deg / 180.0) * sinpi(lambda_deg / 180.0));


__host__ __device__ double DipoleB::getGradBAtS(const double s, const double simtime) const

 return (getBFieldAtS(s + ds_m, simtime) - getBFieldAtS(s - ds_m, simtime)) / (2 * ds_m);



//DipoleB class member functions
void DipoleB::setupEnvironment()
// consts: [ ILATDeg, L, L_norm, s_max, ds, errorTolerance ]
 CUDA_API_ERRCHK(cudaMalloc((void **)&this_d, sizeof(BField**)));
 setupEnvironmentGPU_DipoleB <<< 1, 1 >>> (this_d, ILATDegrees_m, errorTolerance_m, ds_m);
 CUDA_KERNEL_ERRCHK_WSYNC();


void DipoleB::deleteEnvironment()

 deleteEnvironmentGPU_DipoleB <<< 1, 1 >>> (this_d);
 CUDA_KERNEL_ERRCHK_WSYNC();

 CUDA_API_ERRCHK(cudaFree(this_d));

Calling Functions:

__device__ double accel1dCUDA(const double vs_RK, const double t_RK, const double* args, BField** bfield, EField** efield) //made to pass into 1D Fourth Order Runge Kutta code
//args array: [s_0, mu, q, m, simtime]
 double F_lor, F_mir, stmp;
 stmp = args[0] + vs_RK * t_RK; //ps_0 + vs_RK * t_RK

 //Mirror force
 F_mir = -args[1] * (*bfield)->getGradBAtS(stmp, t_RK + args[4]); //-mu * gradB(pos, runge-kutta time + simtime)

 //Lorentz force - simply qE - v x B is taken care of by mu - results in kg.m/s^2 - to convert to Re equivalent - divide by Re
 F_lor = args[2] * (*efield)->getEFieldAtS(stmp, t_RK + args[4]); //q * EFieldatS

 return (F_lor + F_mir) / args[3];
//returns an acceleration in the parallel direction to the B Field

__device__ double foRungeKuttaCUDA(const double y_0, const double h, const double* funcArg, BField** bfield, EField** efield)

 // dy / dt = f(t, y), y(t_0) = y_0
 // funcArgs are whatever you need to pass to the equation
 // args array: [s_0, mu, q, m, simtime]
 double k1, k2, k3, k4; double y y_0 ; double t_RK 0.0 ;

 k1 = accel1dCUDA(y, t_RK, funcArg, bfield, efield); //k1 = f(t_n, y_n), returns units of dy / dt

 t_RK = h / 2;
 y = y_0 + k1 * t_RK;
 k2 = accel1dCUDA(y, t_RK, funcArg, bfield, efield); //k2 = f(t_n + h/2, y_n + h/2 * k1)

 y = y_0 + k2 * t_RK;
 k3 = accel1dCUDA(y, t_RK, funcArg, bfield, efield); //k3 = f(t_n + h/2, y_n + h/2 * k2)

 t_RK = h;
 y = y_0 + k3 * t_RK;
 k4 = accel1dCUDA(y, t_RK, funcArg, bfield, efield); //k4 = f(t_n + h, y_n + h k3)

 return (k1 + 2 * k2 + 2 * k3 + k4) * h / 6; //returns delta y, not dy / dt, not total y


__global__ void computeKernel(double** currData_d, BField** bfield, EField** efield,
 const double simtime, const double dt, const double mass, const double charge, const double simmin, const double simmax)

 unsigned int thdInd blockIdx.x * blockDim.x + threadIdx.x ;

 double* v_d currData_d[0] ; const double* mu_d currData_d[1] ; double* s_d currData_d[2] ; const double* t_incident_d currData_d[3] ; double* t_escape_d currData_d[4] ;

 if (t_escape_d[thdInd] >= 0.0) //particle has escaped, t_escape is >= 0 iff it has both entered and is outside the sim boundaries
 return;
 else if (t_incident_d[thdInd] > simtime) //particle hasn't "entered the sim" yet
 return;
 else if (s_d[thdInd] < simmin * 0.999) //particle is out of sim to the bottom and t_escape not set yet
 
 t_escape_d[thdInd] = simtime;
 return;
 
 else if (s_d[thdInd] > simmax * 1.001) //particle is out of sim to the top and t_escape not set yet
 
 t_escape_d[thdInd] = simtime;
 return;
 

 //args array: [ps_0, mu, q, m, simtime]
 const double args s_d[thdInd], mu_d[thdInd], charge, mass, simtime ;

 v_d[thdInd] += foRungeKuttaCUDA(v_d[thdInd], dt, args, bfield, efield) / 2;
 s_d[thdInd] += v_d[thdInd] * dt;

A few questions:

Am I achieving my goals in the most efficient way possible?

Are there any performance issues incurred by the fact that I'm creating one instance of a derived class on GPU and calling the interface function ~3.5 million * number of iterations times? That is, what are the implications of this many calls to a single member function?

This produces expected physical results (that is, calls to interface functions are producing the correct values because the particles behave appropriately), however when running through cuda-memcheck, I get a whole host of issues. I'm thinking this is because of how BField is set up and the fact that calling the (virtual) interface functions accesses something that would be outside the memory footprint of a Base instance:
```
[BField instance memory footprint][-------(x impl of virt fcn here)----DipoleB Instance footprint-------]
```
and cuda-memcheck doesn't think this should be valid. Does this sound feasible? Do I understand what is going on right?

Any non-optimal performance issues incurred by device-side dynamic allocation? Is there even another way to do this?

edited May 2 at 0:36

Jamalâ™¦

30.1k11114225

asked May 1 at 17:08

TomAdo

385

add a commentÂ |Â

up vote
7
down vote

favorite

My goals are:

A common interface for returning the field strength (and a few related things through different functions) according to a physical field model chosen by the user at runtime

The ability to call functions of this interface from host AND device with the same underlying implementation

As performant as possible

BField.h (Base class):

#ifndef BFIELD_H
#define BFIELD_H

#include <string>

//CUDA includes
#include "host_defines.h"

class BField

protected:
 BField** this_d nullptr ;

 #ifndef __CUDA_ARCH__ //host code
 std::string modelName_m;
 #else //device code
 const char* modelName_m; //placeholder, not used
 #endif /* !__CUDA_ARCH__ */

 __host__ virtual void setupEnvironment() = 0; //define this function in derived classes to assign a pointer to that function's B Field code to the location indicated by BFieldFcnPtr_d and gradBFcnPtr_d
 __host__ virtual void deleteEnvironment() = 0;

 __host__ __device__ BField() 

public:
 __host__ __device__ virtual ~BField() ;

 __host__ __device__ BField(const BField&) = delete;
 __host__ __device__ BField& operator=(const BField&) = delete;

 __host__ __device__ virtual double getBFieldAtS(const double s, const double t) const = 0;
 __host__ __device__ virtual double getGradBAtS (const double s, const double t) const = 0;

 __host__ virtual std::string name() const return modelName_m; 
 __host__ virtual BField** getPtrGPU() const return this_d; //once returned, have to cast it to the appropriate type
;

#endif

DipoleB.h (Derived):

#ifndef DIPOLEB_BFIELD_H
#define DIPOLEB_BFIELD_H

#include "BFieldBField.h"
#include "physicalconstants.h"

constexpr double B0 3.12e-5 ; //won't change from sim to sim

class DipoleB : public BField

protected:
 //Field simulation constants
 double L_m 0.0 ;
 double L_norm_m 0.0 ;
 double s_max_m 0.0 ;

 //specified variables
 double ILATDegrees_m 0.0 ;
 double ds_m 0.0 ;
 double errorTolerance_m 0.0 ;

 //protected functions
 __host__ virtual void setupEnvironment() override;
 __host__ virtual void deleteEnvironment() override;
 __host__ __device__ double getSAtLambda(const double lambdaDegrees) const;
 __host__ __device__ double getLambdaAtS(const double s) const;

public:
 __host__ __device__ DipoleB(double ILATDegrees, double errorTolerance = 1e-4, double ds = RADIUS_EARTH / 1000.0);
 __host__ __device__ ~DipoleB();
 __host__ __device__ DipoleB(const DipoleB&) = delete;
 __host__ __device__ DipoleB& operator=(const DipoleB&) = delete;

 //for testing
 double ILAT() const return ILATDegrees_m; 
 double ds() const return ds_m; 
 double L() const return L_m; 
 double s_max() const return s_max_m; 

 __host__ virtual void setds(double ds) ds_m = ds; 

 __host__ __device__ double getBFieldAtS(const double s, const double t) const override;
 __host__ __device__ double getGradBAtS (const double s, const double t) const override;

 __host__ double getErrTol() const return errorTolerance_m; 
 __host__ double getds() const return ds_m; 
;

#endif

DipoleB.cu (Derived member function definition and some CUDA kernels):

#include "BFieldDipoleB.h"

#include "device_launch_parameters.h"
#include "ErrorHandlingcudaErrorCheck.h"
#include "ErrorHandlingcudaDeviceMacros.h"

//setup CUDA kernels
__global__ void setupEnvironmentGPU_DipoleB(BField** this_d, double ILATDeg, double errTol, double ds)

 ZEROTH_THREAD_ONLY("setupEnvironmentGPU_DipoleB", (*this_d) = new DipoleB(ILATDeg, errTol, ds));


__global__ void deleteEnvironmentGPU_DipoleB(BField** dipoleb)

 ZEROTH_THREAD_ONLY("deleteEnvironmentGPU_DipoleB", delete ((DipoleB*)(*dipoleb)));


__host__ __device__ DipoleB::DipoleB(double ILATDegrees, double errorTolerance, double ds) :
 BField(), ILATDegrees_m ILATDegrees , ds_m ds , errorTolerance_m errorTolerance 

 L_m = RADIUS_EARTH / pow(cos(ILATDegrees * RADS_PER_DEG), 2);
 L_norm_m = L_m / RADIUS_EARTH;
 s_max_m = getSAtLambda(ILATDegrees_m);

 #ifndef __CUDA_ARCH__ //host code
 modelName_m = "DipoleB";
 setupEnvironment();
 #endif /* !__CUDA_ARCH__ */


__host__ __device__ DipoleB::~DipoleB()

 #ifndef __CUDA_ARCH__ //host code
 deleteEnvironment();
 #endif /* !__CUDA_ARCH__ */


//B Field related kernels
__host__ __device__ double DipoleB::getSAtLambda(const double lambdaDegrees) const

 //double x asinh(sqrt(3.0) * sinpi(lambdaDegrees / 180.0)) ;
 double sinh_x sqrt(3.0) * sinpi(lambdaDegrees / 180.0) ;
 double x log(sinh_x + sqrt(sinh_x * sinh_x + 1)) ; //trig identity for asinh - a bit faster - asinh(x) == ln(x + sqrt(x*x + 1))

 return (0.5 * L_m / sqrt(3.0)) * (x + 0.25 * (exp(2.0*x)-exp(-2.0*x))); /* L */ //0.25 * (exp(2*x)-exp(-2*x)) == sinh(x) * cosh(x) and is faster


__host__ __device__ double DipoleB::getLambdaAtS(const double s) const
// consts: [ ILATDeg, L, L_norm, s_max, ds, errorTolerance ]
 double lambda_tmp (-ILATDegrees_m / s_max_m) * s + ILATDegrees_m ; //-ILAT / s_max * s + ILAT
 double s_tmp s_max_m - getSAtLambda(lambda_tmp) ;
 double dlambda 1.0 ;
 bool over 0 ;

 while (abs((s_tmp - s) / s) > errorTolerance_m) //errorTolerance
 
 while (1)
 
 over = (s_tmp >= s);
 if (over)
 
 lambda_tmp += dlambda;
 s_tmp = s_max_m - getSAtLambda(lambda_tmp);
 if (s_tmp < s)
 break;
 
 else
 
 lambda_tmp -= dlambda;
 s_tmp = s_max_m - getSAtLambda(lambda_tmp);
 if (s_tmp >= s)
 break;
 
 
 if (dlambda < errorTolerance_m / 100.0) //errorTolerance
 break;
 dlambda /= 5.0; //through trial and error, this reduces the number of calculations usually (compared with 2, 2.5, 3, 4, 10)
 

 return lambda_tmp;


__host__ __device__ double DipoleB::getBFieldAtS(const double s, const double simtime) const
// consts: [ ILATDeg, L, L_norm, s_max, ds, errorTolerance ]
 double lambda_deg getLambdaAtS(s) ;
 double rnorm L_norm_m * cospi(lambda_deg / 180.0) * cospi(lambda_deg / 180.0) ;

 return -B0 / (rnorm * rnorm * rnorm) * sqrt(1.0 + 3 * sinpi(lambda_deg / 180.0) * sinpi(lambda_deg / 180.0));


__host__ __device__ double DipoleB::getGradBAtS(const double s, const double simtime) const

 return (getBFieldAtS(s + ds_m, simtime) - getBFieldAtS(s - ds_m, simtime)) / (2 * ds_m);



//DipoleB class member functions
void DipoleB::setupEnvironment()
// consts: [ ILATDeg, L, L_norm, s_max, ds, errorTolerance ]
 CUDA_API_ERRCHK(cudaMalloc((void **)&this_d, sizeof(BField**)));
 setupEnvironmentGPU_DipoleB <<< 1, 1 >>> (this_d, ILATDegrees_m, errorTolerance_m, ds_m);
 CUDA_KERNEL_ERRCHK_WSYNC();


void DipoleB::deleteEnvironment()

 deleteEnvironmentGPU_DipoleB <<< 1, 1 >>> (this_d);
 CUDA_KERNEL_ERRCHK_WSYNC();

 CUDA_API_ERRCHK(cudaFree(this_d));

Calling Functions:

__device__ double accel1dCUDA(const double vs_RK, const double t_RK, const double* args, BField** bfield, EField** efield) //made to pass into 1D Fourth Order Runge Kutta code
//args array: [s_0, mu, q, m, simtime]
 double F_lor, F_mir, stmp;
 stmp = args[0] + vs_RK * t_RK; //ps_0 + vs_RK * t_RK

 //Mirror force
 F_mir = -args[1] * (*bfield)->getGradBAtS(stmp, t_RK + args[4]); //-mu * gradB(pos, runge-kutta time + simtime)

 //Lorentz force - simply qE - v x B is taken care of by mu - results in kg.m/s^2 - to convert to Re equivalent - divide by Re
 F_lor = args[2] * (*efield)->getEFieldAtS(stmp, t_RK + args[4]); //q * EFieldatS

 return (F_lor + F_mir) / args[3];
//returns an acceleration in the parallel direction to the B Field

__device__ double foRungeKuttaCUDA(const double y_0, const double h, const double* funcArg, BField** bfield, EField** efield)

 // dy / dt = f(t, y), y(t_0) = y_0
 // funcArgs are whatever you need to pass to the equation
 // args array: [s_0, mu, q, m, simtime]
 double k1, k2, k3, k4; double y y_0 ; double t_RK 0.0 ;

 k1 = accel1dCUDA(y, t_RK, funcArg, bfield, efield); //k1 = f(t_n, y_n), returns units of dy / dt

 t_RK = h / 2;
 y = y_0 + k1 * t_RK;
 k2 = accel1dCUDA(y, t_RK, funcArg, bfield, efield); //k2 = f(t_n + h/2, y_n + h/2 * k1)

 y = y_0 + k2 * t_RK;
 k3 = accel1dCUDA(y, t_RK, funcArg, bfield, efield); //k3 = f(t_n + h/2, y_n + h/2 * k2)

 t_RK = h;
 y = y_0 + k3 * t_RK;
 k4 = accel1dCUDA(y, t_RK, funcArg, bfield, efield); //k4 = f(t_n + h, y_n + h k3)

 return (k1 + 2 * k2 + 2 * k3 + k4) * h / 6; //returns delta y, not dy / dt, not total y


__global__ void computeKernel(double** currData_d, BField** bfield, EField** efield,
 const double simtime, const double dt, const double mass, const double charge, const double simmin, const double simmax)

 unsigned int thdInd blockIdx.x * blockDim.x + threadIdx.x ;

 double* v_d currData_d[0] ; const double* mu_d currData_d[1] ; double* s_d currData_d[2] ; const double* t_incident_d currData_d[3] ; double* t_escape_d currData_d[4] ;

 if (t_escape_d[thdInd] >= 0.0) //particle has escaped, t_escape is >= 0 iff it has both entered and is outside the sim boundaries
 return;
 else if (t_incident_d[thdInd] > simtime) //particle hasn't "entered the sim" yet
 return;
 else if (s_d[thdInd] < simmin * 0.999) //particle is out of sim to the bottom and t_escape not set yet
 
 t_escape_d[thdInd] = simtime;
 return;
 
 else if (s_d[thdInd] > simmax * 1.001) //particle is out of sim to the top and t_escape not set yet
 
 t_escape_d[thdInd] = simtime;
 return;
 

 //args array: [ps_0, mu, q, m, simtime]
 const double args s_d[thdInd], mu_d[thdInd], charge, mass, simtime ;

 v_d[thdInd] += foRungeKuttaCUDA(v_d[thdInd], dt, args, bfield, efield) / 2;
 s_d[thdInd] += v_d[thdInd] * dt;

A few questions:

Am I achieving my goals in the most efficient way possible?

Are there any performance issues incurred by the fact that I'm creating one instance of a derived class on GPU and calling the interface function ~3.5 million * number of iterations times? That is, what are the implications of this many calls to a single member function?

This produces expected physical results (that is, calls to interface functions are producing the correct values because the particles behave appropriately), however when running through cuda-memcheck, I get a whole host of issues. I'm thinking this is because of how BField is set up and the fact that calling the (virtual) interface functions accesses something that would be outside the memory footprint of a Base instance:
```
[BField instance memory footprint][-------(x impl of virt fcn here)----DipoleB Instance footprint-------]
```
and cuda-memcheck doesn't think this should be valid. Does this sound feasible? Do I understand what is going on right?

Any non-optimal performance issues incurred by device-side dynamic allocation? Is there even another way to do this?

edited May 2 at 0:36

Jamalâ™¦

30.1k11114225

asked May 1 at 17:08

TomAdo

385

add a commentÂ |Â

up vote
7
down vote

favorite

My goals are:

A common interface for returning the field strength (and a few related things through different functions) according to a physical field model chosen by the user at runtime

The ability to call functions of this interface from host AND device with the same underlying implementation

As performant as possible

BField.h (Base class):

#ifndef BFIELD_H
#define BFIELD_H

#include <string>

//CUDA includes
#include "host_defines.h"

class BField

protected:
 BField** this_d nullptr ;

 #ifndef __CUDA_ARCH__ //host code
 std::string modelName_m;
 #else //device code
 const char* modelName_m; //placeholder, not used
 #endif /* !__CUDA_ARCH__ */

 __host__ virtual void setupEnvironment() = 0; //define this function in derived classes to assign a pointer to that function's B Field code to the location indicated by BFieldFcnPtr_d and gradBFcnPtr_d
 __host__ virtual void deleteEnvironment() = 0;

 __host__ __device__ BField() 

public:
 __host__ __device__ virtual ~BField() ;

 __host__ __device__ BField(const BField&) = delete;
 __host__ __device__ BField& operator=(const BField&) = delete;

 __host__ __device__ virtual double getBFieldAtS(const double s, const double t) const = 0;
 __host__ __device__ virtual double getGradBAtS (const double s, const double t) const = 0;

 __host__ virtual std::string name() const return modelName_m; 
 __host__ virtual BField** getPtrGPU() const return this_d; //once returned, have to cast it to the appropriate type
;

#endif

DipoleB.h (Derived):

#ifndef DIPOLEB_BFIELD_H
#define DIPOLEB_BFIELD_H

#include "BFieldBField.h"
#include "physicalconstants.h"

constexpr double B0 3.12e-5 ; //won't change from sim to sim

class DipoleB : public BField

protected:
 //Field simulation constants
 double L_m 0.0 ;
 double L_norm_m 0.0 ;
 double s_max_m 0.0 ;

 //specified variables
 double ILATDegrees_m 0.0 ;
 double ds_m 0.0 ;
 double errorTolerance_m 0.0 ;

 //protected functions
 __host__ virtual void setupEnvironment() override;
 __host__ virtual void deleteEnvironment() override;
 __host__ __device__ double getSAtLambda(const double lambdaDegrees) const;
 __host__ __device__ double getLambdaAtS(const double s) const;

public:
 __host__ __device__ DipoleB(double ILATDegrees, double errorTolerance = 1e-4, double ds = RADIUS_EARTH / 1000.0);
 __host__ __device__ ~DipoleB();
 __host__ __device__ DipoleB(const DipoleB&) = delete;
 __host__ __device__ DipoleB& operator=(const DipoleB&) = delete;

 //for testing
 double ILAT() const return ILATDegrees_m; 
 double ds() const return ds_m; 
 double L() const return L_m; 
 double s_max() const return s_max_m; 

 __host__ virtual void setds(double ds) ds_m = ds; 

 __host__ __device__ double getBFieldAtS(const double s, const double t) const override;
 __host__ __device__ double getGradBAtS (const double s, const double t) const override;

 __host__ double getErrTol() const return errorTolerance_m; 
 __host__ double getds() const return ds_m; 
;

#endif

DipoleB.cu (Derived member function definition and some CUDA kernels):

#include "BFieldDipoleB.h"

#include "device_launch_parameters.h"
#include "ErrorHandlingcudaErrorCheck.h"
#include "ErrorHandlingcudaDeviceMacros.h"

//setup CUDA kernels
__global__ void setupEnvironmentGPU_DipoleB(BField** this_d, double ILATDeg, double errTol, double ds)

 ZEROTH_THREAD_ONLY("setupEnvironmentGPU_DipoleB", (*this_d) = new DipoleB(ILATDeg, errTol, ds));


__global__ void deleteEnvironmentGPU_DipoleB(BField** dipoleb)

 ZEROTH_THREAD_ONLY("deleteEnvironmentGPU_DipoleB", delete ((DipoleB*)(*dipoleb)));


__host__ __device__ DipoleB::DipoleB(double ILATDegrees, double errorTolerance, double ds) :
 BField(), ILATDegrees_m ILATDegrees , ds_m ds , errorTolerance_m errorTolerance 

 L_m = RADIUS_EARTH / pow(cos(ILATDegrees * RADS_PER_DEG), 2);
 L_norm_m = L_m / RADIUS_EARTH;
 s_max_m = getSAtLambda(ILATDegrees_m);

 #ifndef __CUDA_ARCH__ //host code
 modelName_m = "DipoleB";
 setupEnvironment();
 #endif /* !__CUDA_ARCH__ */


__host__ __device__ DipoleB::~DipoleB()

 #ifndef __CUDA_ARCH__ //host code
 deleteEnvironment();
 #endif /* !__CUDA_ARCH__ */


//B Field related kernels
__host__ __device__ double DipoleB::getSAtLambda(const double lambdaDegrees) const

 //double x asinh(sqrt(3.0) * sinpi(lambdaDegrees / 180.0)) ;
 double sinh_x sqrt(3.0) * sinpi(lambdaDegrees / 180.0) ;
 double x log(sinh_x + sqrt(sinh_x * sinh_x + 1)) ; //trig identity for asinh - a bit faster - asinh(x) == ln(x + sqrt(x*x + 1))

 return (0.5 * L_m / sqrt(3.0)) * (x + 0.25 * (exp(2.0*x)-exp(-2.0*x))); /* L */ //0.25 * (exp(2*x)-exp(-2*x)) == sinh(x) * cosh(x) and is faster


__host__ __device__ double DipoleB::getLambdaAtS(const double s) const
// consts: [ ILATDeg, L, L_norm, s_max, ds, errorTolerance ]
 double lambda_tmp (-ILATDegrees_m / s_max_m) * s + ILATDegrees_m ; //-ILAT / s_max * s + ILAT
 double s_tmp s_max_m - getSAtLambda(lambda_tmp) ;
 double dlambda 1.0 ;
 bool over 0 ;

 while (abs((s_tmp - s) / s) > errorTolerance_m) //errorTolerance
 
 while (1)
 
 over = (s_tmp >= s);
 if (over)
 
 lambda_tmp += dlambda;
 s_tmp = s_max_m - getSAtLambda(lambda_tmp);
 if (s_tmp < s)
 break;
 
 else
 
 lambda_tmp -= dlambda;
 s_tmp = s_max_m - getSAtLambda(lambda_tmp);
 if (s_tmp >= s)
 break;
 
 
 if (dlambda < errorTolerance_m / 100.0) //errorTolerance
 break;
 dlambda /= 5.0; //through trial and error, this reduces the number of calculations usually (compared with 2, 2.5, 3, 4, 10)
 

 return lambda_tmp;


__host__ __device__ double DipoleB::getBFieldAtS(const double s, const double simtime) const
// consts: [ ILATDeg, L, L_norm, s_max, ds, errorTolerance ]
 double lambda_deg getLambdaAtS(s) ;
 double rnorm L_norm_m * cospi(lambda_deg / 180.0) * cospi(lambda_deg / 180.0) ;

 return -B0 / (rnorm * rnorm * rnorm) * sqrt(1.0 + 3 * sinpi(lambda_deg / 180.0) * sinpi(lambda_deg / 180.0));


__host__ __device__ double DipoleB::getGradBAtS(const double s, const double simtime) const

 return (getBFieldAtS(s + ds_m, simtime) - getBFieldAtS(s - ds_m, simtime)) / (2 * ds_m);



//DipoleB class member functions
void DipoleB::setupEnvironment()
// consts: [ ILATDeg, L, L_norm, s_max, ds, errorTolerance ]
 CUDA_API_ERRCHK(cudaMalloc((void **)&this_d, sizeof(BField**)));
 setupEnvironmentGPU_DipoleB <<< 1, 1 >>> (this_d, ILATDegrees_m, errorTolerance_m, ds_m);
 CUDA_KERNEL_ERRCHK_WSYNC();


void DipoleB::deleteEnvironment()

 deleteEnvironmentGPU_DipoleB <<< 1, 1 >>> (this_d);
 CUDA_KERNEL_ERRCHK_WSYNC();

 CUDA_API_ERRCHK(cudaFree(this_d));

Calling Functions:

__device__ double accel1dCUDA(const double vs_RK, const double t_RK, const double* args, BField** bfield, EField** efield) //made to pass into 1D Fourth Order Runge Kutta code
//args array: [s_0, mu, q, m, simtime]
 double F_lor, F_mir, stmp;
 stmp = args[0] + vs_RK * t_RK; //ps_0 + vs_RK * t_RK

 //Mirror force
 F_mir = -args[1] * (*bfield)->getGradBAtS(stmp, t_RK + args[4]); //-mu * gradB(pos, runge-kutta time + simtime)

 //Lorentz force - simply qE - v x B is taken care of by mu - results in kg.m/s^2 - to convert to Re equivalent - divide by Re
 F_lor = args[2] * (*efield)->getEFieldAtS(stmp, t_RK + args[4]); //q * EFieldatS

 return (F_lor + F_mir) / args[3];
//returns an acceleration in the parallel direction to the B Field

__device__ double foRungeKuttaCUDA(const double y_0, const double h, const double* funcArg, BField** bfield, EField** efield)

 // dy / dt = f(t, y), y(t_0) = y_0
 // funcArgs are whatever you need to pass to the equation
 // args array: [s_0, mu, q, m, simtime]
 double k1, k2, k3, k4; double y y_0 ; double t_RK 0.0 ;

 k1 = accel1dCUDA(y, t_RK, funcArg, bfield, efield); //k1 = f(t_n, y_n), returns units of dy / dt

 t_RK = h / 2;
 y = y_0 + k1 * t_RK;
 k2 = accel1dCUDA(y, t_RK, funcArg, bfield, efield); //k2 = f(t_n + h/2, y_n + h/2 * k1)

 y = y_0 + k2 * t_RK;
 k3 = accel1dCUDA(y, t_RK, funcArg, bfield, efield); //k3 = f(t_n + h/2, y_n + h/2 * k2)

 t_RK = h;
 y = y_0 + k3 * t_RK;
 k4 = accel1dCUDA(y, t_RK, funcArg, bfield, efield); //k4 = f(t_n + h, y_n + h k3)

 return (k1 + 2 * k2 + 2 * k3 + k4) * h / 6; //returns delta y, not dy / dt, not total y


__global__ void computeKernel(double** currData_d, BField** bfield, EField** efield,
 const double simtime, const double dt, const double mass, const double charge, const double simmin, const double simmax)

 unsigned int thdInd blockIdx.x * blockDim.x + threadIdx.x ;

 double* v_d currData_d[0] ; const double* mu_d currData_d[1] ; double* s_d currData_d[2] ; const double* t_incident_d currData_d[3] ; double* t_escape_d currData_d[4] ;

 if (t_escape_d[thdInd] >= 0.0) //particle has escaped, t_escape is >= 0 iff it has both entered and is outside the sim boundaries
 return;
 else if (t_incident_d[thdInd] > simtime) //particle hasn't "entered the sim" yet
 return;
 else if (s_d[thdInd] < simmin * 0.999) //particle is out of sim to the bottom and t_escape not set yet
 
 t_escape_d[thdInd] = simtime;
 return;
 
 else if (s_d[thdInd] > simmax * 1.001) //particle is out of sim to the top and t_escape not set yet
 
 t_escape_d[thdInd] = simtime;
 return;
 

 //args array: [ps_0, mu, q, m, simtime]
 const double args s_d[thdInd], mu_d[thdInd], charge, mass, simtime ;

 v_d[thdInd] += foRungeKuttaCUDA(v_d[thdInd], dt, args, bfield, efield) / 2;
 s_d[thdInd] += v_d[thdInd] * dt;

A few questions:

Am I achieving my goals in the most efficient way possible?

Are there any performance issues incurred by the fact that I'm creating one instance of a derived class on GPU and calling the interface function ~3.5 million * number of iterations times? That is, what are the implications of this many calls to a single member function?

This produces expected physical results (that is, calls to interface functions are producing the correct values because the particles behave appropriately), however when running through cuda-memcheck, I get a whole host of issues. I'm thinking this is because of how BField is set up and the fact that calling the (virtual) interface functions accesses something that would be outside the memory footprint of a Base instance:
```
[BField instance memory footprint][-------(x impl of virt fcn here)----DipoleB Instance footprint-------]
```
and cuda-memcheck doesn't think this should be valid. Does this sound feasible? Do I understand what is going on right?

Any non-optimal performance issues incurred by device-side dynamic allocation? Is there even another way to do this?

edited May 2 at 0:36

Jamalâ™¦

30.1k11114225

asked May 1 at 17:08

TomAdo

385

My goals are:

A common interface for returning the field strength (and a few related things through different functions) according to a physical field model chosen by the user at runtime

The ability to call functions of this interface from host AND device with the same underlying implementation

As performant as possible

BField.h (Base class):

#ifndef BFIELD_H
#define BFIELD_H

#include <string>

//CUDA includes
#include "host_defines.h"

class BField

protected:
 BField** this_d nullptr ;

 #ifndef __CUDA_ARCH__ //host code
 std::string modelName_m;
 #else //device code
 const char* modelName_m; //placeholder, not used
 #endif /* !__CUDA_ARCH__ */

 __host__ virtual void setupEnvironment() = 0; //define this function in derived classes to assign a pointer to that function's B Field code to the location indicated by BFieldFcnPtr_d and gradBFcnPtr_d
 __host__ virtual void deleteEnvironment() = 0;

 __host__ __device__ BField() 

public:
 __host__ __device__ virtual ~BField() ;

 __host__ __device__ BField(const BField&) = delete;
 __host__ __device__ BField& operator=(const BField&) = delete;

 __host__ __device__ virtual double getBFieldAtS(const double s, const double t) const = 0;
 __host__ __device__ virtual double getGradBAtS (const double s, const double t) const = 0;

 __host__ virtual std::string name() const return modelName_m; 
 __host__ virtual BField** getPtrGPU() const return this_d; //once returned, have to cast it to the appropriate type
;

#endif

DipoleB.h (Derived):

#ifndef DIPOLEB_BFIELD_H
#define DIPOLEB_BFIELD_H

#include "BFieldBField.h"
#include "physicalconstants.h"

constexpr double B0 3.12e-5 ; //won't change from sim to sim

class DipoleB : public BField

protected:
 //Field simulation constants
 double L_m 0.0 ;
 double L_norm_m 0.0 ;
 double s_max_m 0.0 ;

 //specified variables
 double ILATDegrees_m 0.0 ;
 double ds_m 0.0 ;
 double errorTolerance_m 0.0 ;

 //protected functions
 __host__ virtual void setupEnvironment() override;
 __host__ virtual void deleteEnvironment() override;
 __host__ __device__ double getSAtLambda(const double lambdaDegrees) const;
 __host__ __device__ double getLambdaAtS(const double s) const;

public:
 __host__ __device__ DipoleB(double ILATDegrees, double errorTolerance = 1e-4, double ds = RADIUS_EARTH / 1000.0);
 __host__ __device__ ~DipoleB();
 __host__ __device__ DipoleB(const DipoleB&) = delete;
 __host__ __device__ DipoleB& operator=(const DipoleB&) = delete;

 //for testing
 double ILAT() const return ILATDegrees_m; 
 double ds() const return ds_m; 
 double L() const return L_m; 
 double s_max() const return s_max_m; 

 __host__ virtual void setds(double ds) ds_m = ds; 

 __host__ __device__ double getBFieldAtS(const double s, const double t) const override;
 __host__ __device__ double getGradBAtS (const double s, const double t) const override;

 __host__ double getErrTol() const return errorTolerance_m; 
 __host__ double getds() const return ds_m; 
;

#endif

DipoleB.cu (Derived member function definition and some CUDA kernels):

#include "BFieldDipoleB.h"

#include "device_launch_parameters.h"
#include "ErrorHandlingcudaErrorCheck.h"
#include "ErrorHandlingcudaDeviceMacros.h"

//setup CUDA kernels
__global__ void setupEnvironmentGPU_DipoleB(BField** this_d, double ILATDeg, double errTol, double ds)

 ZEROTH_THREAD_ONLY("setupEnvironmentGPU_DipoleB", (*this_d) = new DipoleB(ILATDeg, errTol, ds));


__global__ void deleteEnvironmentGPU_DipoleB(BField** dipoleb)

 ZEROTH_THREAD_ONLY("deleteEnvironmentGPU_DipoleB", delete ((DipoleB*)(*dipoleb)));


__host__ __device__ DipoleB::DipoleB(double ILATDegrees, double errorTolerance, double ds) :
 BField(), ILATDegrees_m ILATDegrees , ds_m ds , errorTolerance_m errorTolerance 

 L_m = RADIUS_EARTH / pow(cos(ILATDegrees * RADS_PER_DEG), 2);
 L_norm_m = L_m / RADIUS_EARTH;
 s_max_m = getSAtLambda(ILATDegrees_m);

 #ifndef __CUDA_ARCH__ //host code
 modelName_m = "DipoleB";
 setupEnvironment();
 #endif /* !__CUDA_ARCH__ */


__host__ __device__ DipoleB::~DipoleB()

 #ifndef __CUDA_ARCH__ //host code
 deleteEnvironment();
 #endif /* !__CUDA_ARCH__ */


//B Field related kernels
__host__ __device__ double DipoleB::getSAtLambda(const double lambdaDegrees) const

 //double x asinh(sqrt(3.0) * sinpi(lambdaDegrees / 180.0)) ;
 double sinh_x sqrt(3.0) * sinpi(lambdaDegrees / 180.0) ;
 double x log(sinh_x + sqrt(sinh_x * sinh_x + 1)) ; //trig identity for asinh - a bit faster - asinh(x) == ln(x + sqrt(x*x + 1))

 return (0.5 * L_m / sqrt(3.0)) * (x + 0.25 * (exp(2.0*x)-exp(-2.0*x))); /* L */ //0.25 * (exp(2*x)-exp(-2*x)) == sinh(x) * cosh(x) and is faster


__host__ __device__ double DipoleB::getLambdaAtS(const double s) const
// consts: [ ILATDeg, L, L_norm, s_max, ds, errorTolerance ]
 double lambda_tmp (-ILATDegrees_m / s_max_m) * s + ILATDegrees_m ; //-ILAT / s_max * s + ILAT
 double s_tmp s_max_m - getSAtLambda(lambda_tmp) ;
 double dlambda 1.0 ;
 bool over 0 ;

 while (abs((s_tmp - s) / s) > errorTolerance_m) //errorTolerance
 
 while (1)
 
 over = (s_tmp >= s);
 if (over)
 
 lambda_tmp += dlambda;
 s_tmp = s_max_m - getSAtLambda(lambda_tmp);
 if (s_tmp < s)
 break;
 
 else
 
 lambda_tmp -= dlambda;
 s_tmp = s_max_m - getSAtLambda(lambda_tmp);
 if (s_tmp >= s)
 break;
 
 
 if (dlambda < errorTolerance_m / 100.0) //errorTolerance
 break;
 dlambda /= 5.0; //through trial and error, this reduces the number of calculations usually (compared with 2, 2.5, 3, 4, 10)
 

 return lambda_tmp;


__host__ __device__ double DipoleB::getBFieldAtS(const double s, const double simtime) const
// consts: [ ILATDeg, L, L_norm, s_max, ds, errorTolerance ]
 double lambda_deg getLambdaAtS(s) ;
 double rnorm L_norm_m * cospi(lambda_deg / 180.0) * cospi(lambda_deg / 180.0) ;

 return -B0 / (rnorm * rnorm * rnorm) * sqrt(1.0 + 3 * sinpi(lambda_deg / 180.0) * sinpi(lambda_deg / 180.0));


__host__ __device__ double DipoleB::getGradBAtS(const double s, const double simtime) const

 return (getBFieldAtS(s + ds_m, simtime) - getBFieldAtS(s - ds_m, simtime)) / (2 * ds_m);



//DipoleB class member functions
void DipoleB::setupEnvironment()
// consts: [ ILATDeg, L, L_norm, s_max, ds, errorTolerance ]
 CUDA_API_ERRCHK(cudaMalloc((void **)&this_d, sizeof(BField**)));
 setupEnvironmentGPU_DipoleB <<< 1, 1 >>> (this_d, ILATDegrees_m, errorTolerance_m, ds_m);
 CUDA_KERNEL_ERRCHK_WSYNC();


void DipoleB::deleteEnvironment()

 deleteEnvironmentGPU_DipoleB <<< 1, 1 >>> (this_d);
 CUDA_KERNEL_ERRCHK_WSYNC();

 CUDA_API_ERRCHK(cudaFree(this_d));

Calling Functions:

__device__ double accel1dCUDA(const double vs_RK, const double t_RK, const double* args, BField** bfield, EField** efield) //made to pass into 1D Fourth Order Runge Kutta code
//args array: [s_0, mu, q, m, simtime]
 double F_lor, F_mir, stmp;
 stmp = args[0] + vs_RK * t_RK; //ps_0 + vs_RK * t_RK

 //Mirror force
 F_mir = -args[1] * (*bfield)->getGradBAtS(stmp, t_RK + args[4]); //-mu * gradB(pos, runge-kutta time + simtime)

 //Lorentz force - simply qE - v x B is taken care of by mu - results in kg.m/s^2 - to convert to Re equivalent - divide by Re
 F_lor = args[2] * (*efield)->getEFieldAtS(stmp, t_RK + args[4]); //q * EFieldatS

 return (F_lor + F_mir) / args[3];
//returns an acceleration in the parallel direction to the B Field

__device__ double foRungeKuttaCUDA(const double y_0, const double h, const double* funcArg, BField** bfield, EField** efield)

 // dy / dt = f(t, y), y(t_0) = y_0
 // funcArgs are whatever you need to pass to the equation
 // args array: [s_0, mu, q, m, simtime]
 double k1, k2, k3, k4; double y y_0 ; double t_RK 0.0 ;

 k1 = accel1dCUDA(y, t_RK, funcArg, bfield, efield); //k1 = f(t_n, y_n), returns units of dy / dt

 t_RK = h / 2;
 y = y_0 + k1 * t_RK;
 k2 = accel1dCUDA(y, t_RK, funcArg, bfield, efield); //k2 = f(t_n + h/2, y_n + h/2 * k1)

 y = y_0 + k2 * t_RK;
 k3 = accel1dCUDA(y, t_RK, funcArg, bfield, efield); //k3 = f(t_n + h/2, y_n + h/2 * k2)

 t_RK = h;
 y = y_0 + k3 * t_RK;
 k4 = accel1dCUDA(y, t_RK, funcArg, bfield, efield); //k4 = f(t_n + h, y_n + h k3)

 return (k1 + 2 * k2 + 2 * k3 + k4) * h / 6; //returns delta y, not dy / dt, not total y


__global__ void computeKernel(double** currData_d, BField** bfield, EField** efield,
 const double simtime, const double dt, const double mass, const double charge, const double simmin, const double simmax)

 unsigned int thdInd blockIdx.x * blockDim.x + threadIdx.x ;

 double* v_d currData_d[0] ; const double* mu_d currData_d[1] ; double* s_d currData_d[2] ; const double* t_incident_d currData_d[3] ; double* t_escape_d currData_d[4] ;

 if (t_escape_d[thdInd] >= 0.0) //particle has escaped, t_escape is >= 0 iff it has both entered and is outside the sim boundaries
 return;
 else if (t_incident_d[thdInd] > simtime) //particle hasn't "entered the sim" yet
 return;
 else if (s_d[thdInd] < simmin * 0.999) //particle is out of sim to the bottom and t_escape not set yet
 
 t_escape_d[thdInd] = simtime;
 return;
 
 else if (s_d[thdInd] > simmax * 1.001) //particle is out of sim to the top and t_escape not set yet
 
 t_escape_d[thdInd] = simtime;
 return;
 

 //args array: [ps_0, mu, q, m, simtime]
 const double args s_d[thdInd], mu_d[thdInd], charge, mass, simtime ;

 v_d[thdInd] += foRungeKuttaCUDA(v_d[thdInd], dt, args, bfield, efield) / 2;
 s_d[thdInd] += v_d[thdInd] * dt;

A few questions:

Am I achieving my goals in the most efficient way possible?

Are there any performance issues incurred by the fact that I'm creating one instance of a derived class on GPU and calling the interface function ~3.5 million * number of iterations times? That is, what are the implications of this many calls to a single member function?

This produces expected physical results (that is, calls to interface functions are producing the correct values because the particles behave appropriately), however when running through cuda-memcheck, I get a whole host of issues. I'm thinking this is because of how BField is set up and the fact that calling the (virtual) interface functions accesses something that would be outside the memory footprint of a Base instance:
```
[BField instance memory footprint][-------(x impl of virt fcn here)----DipoleB Instance footprint-------]
```
and cuda-memcheck doesn't think this should be valid. Does this sound feasible? Do I understand what is going on right?

Any non-optimal performance issues incurred by device-side dynamic allocation? Is there even another way to do this?

edited May 2 at 0:36

Jamalâ™¦

30.1k11114225

asked May 1 at 17:08

TomAdo

385

edited May 2 at 0:36

Jamalâ™¦

30.1k11114225

edited May 2 at 0:36

Jamalâ™¦

30.1k11114225

edited May 2 at 0:36

Jamalâ™¦

30.1k11114225

asked May 1 at 17:08

TomAdo

385

asked May 1 at 17:08

TomAdo

385

asked May 1 at 17:08

TomAdo

385

add a commentÂ |Â

1 Answer
1

active

oldest

votes

up vote
1
down vote

accepted

nice!

The code is generally rather good. I donÃ¢Â€Â™t have a lot to say about the language usage in the individual lines.

The idea of using an abstract base to do different implementations selected at runtime is classic, easy to follow, and costs one indirection at calling time. Any run-time system to choose between different impls would have the same cost at least.

I donÃ¢Â€Â™t understand the part about making a matching copy on the device.

Why do you have modelName_m on dummied on the controller? CanÃ¢Â€Â™t you just leave it out? Since its only use is to set from a lexical string literal, why not keep it as a const char*? The base class should take this as a constructor argument and the derived class supply it in the init line.

one thing stuck outÃ¢Â€Â¦

setupEnvironment and deleteEnvironment are called from the constructor and destructor respectively. But they are virtual functions.

The object "is" of the type being constructed here, even if it is really of a derived class: it runs the base class ctors and then updates the vtable for the derived class and then runs the derived ctor body.

So, donÃ¢Â€Â™t call virtual functions from the ctor/dtor. To do what this actually means, make it non-virtual.

other

Passing double everywhere when values have different meanings is a good way to crash your spacecraft on Mars. Consider a zero-overhead unit labeling template, or at the very least typedefs for different meanings even though they are not checked by the compiler.

angle_in_degrees should be the type name, not the variable name. But even better, just angle_t and the choice of degrees or whatever is made by the caller and it converts (perhaps at compile time!) to the unit that the type really holds.

edited May 1 at 20:40

answered May 1 at 20:33

JDÃ…Â‚ugosz

5,047731

Thanks for the feedback!! A few responses. "I don't understand the part about making a matching copy on the device." This is done to allow getBFieldAtS to be called on both device (where most computing is done) and on host (where it's needed on occasion). Instead of passing L_m, L_norm_m, s_max_m, ILATDegrees_m, ds_m, errorTolerance_m to the device function every call, I figured it was easier to create a copy of the class on device. This is even more necessary in other implementations (which have more complexity). Is there a better way?
â€“Â TomAdo
May 1 at 20:57

Good point on modelName_m...I suppose I just like smart containers (string). I could just make one const char*. Think I'll do that. setupEnvironment and deleteEnvironment - good point - I think that's an oversight on my part, as they don't need to be virtual. Great point.
â€“Â TomAdo
May 1 at 21:01

Regarding passing double everywhere...so are you saying typenames for each unit (mass in kg, accel in m/s2, etc)? I like the typedef idea...something like typedef double mass_kg_t. Are you saying more here, like some sort of struct containing a const char * that labels the units? That's an interesting idea...my one reservation with it is: will it impact GPU performance...(hence why you said "zero-overhead"). Is what I said an example of zero-overhead? If not, could you provide an example? Thanks again for the great advice.
â€“Â TomAdo
May 1 at 21:05

The typedef (now, you would use a using) makes the function signature more readable. foo (double, double) which is which? foo (angle_t, length_t) is clearer, but doesnÃ¢Â€Â™t help the compiler. But, obviously zero performance change since there is zero meaning change. Ã¢Â‹Â¯
â€“Â JDÃ…Â‚ugosz
May 1 at 21:19

No, a type-safe system would not contain a string. It would still contain the same bits as it does now. But given angle_t x = 25_degrees; or x = 0.5_radians; both are correct, and converted at compile time to the internal format. x = 32; would be a compile-time error. A classic zero-overhead template system is Boost.Units (boost.org/doc/libs/1_67_0/doc/html/boost_units.html). A narrower example that is more up-to-date is found in the standard library now for std::chrono::duration. Ã¢Â‹Â¯
â€“Â JDÃ…Â‚ugosz
May 1 at 21:27

Â |Â
show 2 more comments

Your Answer

StackExchange.ifUsing("editor", function ()
return StackExchange.using("mathjaxEditing", function ()
StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix)
StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
);
);
, "mathjax-editing");

StackExchange.ifUsing("editor", function ()
StackExchange.using("externalEditor", function ()
StackExchange.using("snippets", function ()
StackExchange.snippets.init();
);
);
, "code-snippets");

StackExchange.ready(function()
var channelOptions =
tags: "".split(" "),
id: "196"
;
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function()
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled)
StackExchange.using("snippets", function()
createEditor();
);

else
createEditor();

);

function createEditor()
StackExchange.prepareEditor(
heartbeatType: 'answer',
convertImagesToLinks: false,
noModals: false,
showLowRepImageUploadWarning: true,
reputationToPostImages: null,
bindNavPrevention: true,
postfix: "",
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
);

);

draft saved

draft discarded

StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f193367%2fcuda-c-host-device-polymorphic-class-implementation%23new-answer', 'question_page');

);

Post as a guest

Name

1 Answer
1

active

oldest

votes

1 Answer
1

active

oldest

votes

up vote
1
down vote

accepted

nice!

The code is generally rather good. I donÃ¢Â€Â™t have a lot to say about the language usage in the individual lines.

I donÃ¢Â€Â™t understand the part about making a matching copy on the device.

one thing stuck outÃ¢Â€Â¦

setupEnvironment and deleteEnvironment are called from the constructor and destructor respectively. But they are virtual functions.

So, donÃ¢Â€Â™t call virtual functions from the ctor/dtor. To do what this actually means, make it non-virtual.

other

edited May 1 at 20:40

answered May 1 at 20:33

JDÃ…Â‚ugosz

5,047731

Thanks for the feedback!! A few responses. "I don't understand the part about making a matching copy on the device." This is done to allow getBFieldAtS to be called on both device (where most computing is done) and on host (where it's needed on occasion). Instead of passing L_m, L_norm_m, s_max_m, ILATDegrees_m, ds_m, errorTolerance_m to the device function every call, I figured it was easier to create a copy of the class on device. This is even more necessary in other implementations (which have more complexity). Is there a better way?
â€“Â TomAdo
May 1 at 20:57

Good point on modelName_m...I suppose I just like smart containers (string). I could just make one const char*. Think I'll do that. setupEnvironment and deleteEnvironment - good point - I think that's an oversight on my part, as they don't need to be virtual. Great point.
â€“Â TomAdo
May 1 at 21:01

Regarding passing double everywhere...so are you saying typenames for each unit (mass in kg, accel in m/s2, etc)? I like the typedef idea...something like typedef double mass_kg_t. Are you saying more here, like some sort of struct containing a const char * that labels the units? That's an interesting idea...my one reservation with it is: will it impact GPU performance...(hence why you said "zero-overhead"). Is what I said an example of zero-overhead? If not, could you provide an example? Thanks again for the great advice.
â€“Â TomAdo
May 1 at 21:05

The typedef (now, you would use a using) makes the function signature more readable. foo (double, double) which is which? foo (angle_t, length_t) is clearer, but doesnÃ¢Â€Â™t help the compiler. But, obviously zero performance change since there is zero meaning change. Ã¢Â‹Â¯
â€“Â JDÃ…Â‚ugosz
May 1 at 21:19

No, a type-safe system would not contain a string. It would still contain the same bits as it does now. But given angle_t x = 25_degrees; or x = 0.5_radians; both are correct, and converted at compile time to the internal format. x = 32; would be a compile-time error. A classic zero-overhead template system is Boost.Units (boost.org/doc/libs/1_67_0/doc/html/boost_units.html). A narrower example that is more up-to-date is found in the standard library now for std::chrono::duration. Ã¢Â‹Â¯
â€“Â JDÃ…Â‚ugosz
May 1 at 21:27

Â |Â
show 2 more comments

up vote
1
down vote

accepted

nice!

The code is generally rather good. I donÃ¢Â€Â™t have a lot to say about the language usage in the individual lines.

I donÃ¢Â€Â™t understand the part about making a matching copy on the device.

one thing stuck outÃ¢Â€Â¦

setupEnvironment and deleteEnvironment are called from the constructor and destructor respectively. But they are virtual functions.

So, donÃ¢Â€Â™t call virtual functions from the ctor/dtor. To do what this actually means, make it non-virtual.

other

edited May 1 at 20:40

answered May 1 at 20:33

JDÃ…Â‚ugosz

5,047731

Thanks for the feedback!! A few responses. "I don't understand the part about making a matching copy on the device." This is done to allow getBFieldAtS to be called on both device (where most computing is done) and on host (where it's needed on occasion). Instead of passing L_m, L_norm_m, s_max_m, ILATDegrees_m, ds_m, errorTolerance_m to the device function every call, I figured it was easier to create a copy of the class on device. This is even more necessary in other implementations (which have more complexity). Is there a better way?
â€“Â TomAdo
May 1 at 20:57

Good point on modelName_m...I suppose I just like smart containers (string). I could just make one const char*. Think I'll do that. setupEnvironment and deleteEnvironment - good point - I think that's an oversight on my part, as they don't need to be virtual. Great point.
â€“Â TomAdo
May 1 at 21:01

Regarding passing double everywhere...so are you saying typenames for each unit (mass in kg, accel in m/s2, etc)? I like the typedef idea...something like typedef double mass_kg_t. Are you saying more here, like some sort of struct containing a const char * that labels the units? That's an interesting idea...my one reservation with it is: will it impact GPU performance...(hence why you said "zero-overhead"). Is what I said an example of zero-overhead? If not, could you provide an example? Thanks again for the great advice.
â€“Â TomAdo
May 1 at 21:05

The typedef (now, you would use a using) makes the function signature more readable. foo (double, double) which is which? foo (angle_t, length_t) is clearer, but doesnÃ¢Â€Â™t help the compiler. But, obviously zero performance change since there is zero meaning change. Ã¢Â‹Â¯
â€“Â JDÃ…Â‚ugosz
May 1 at 21:19

No, a type-safe system would not contain a string. It would still contain the same bits as it does now. But given angle_t x = 25_degrees; or x = 0.5_radians; both are correct, and converted at compile time to the internal format. x = 32; would be a compile-time error. A classic zero-overhead template system is Boost.Units (boost.org/doc/libs/1_67_0/doc/html/boost_units.html). A narrower example that is more up-to-date is found in the standard library now for std::chrono::duration. Ã¢Â‹Â¯
â€“Â JDÃ…Â‚ugosz
May 1 at 21:27

Â |Â
show 2 more comments

up vote
1
down vote

accepted

nice!

The code is generally rather good. I donÃ¢Â€Â™t have a lot to say about the language usage in the individual lines.

I donÃ¢Â€Â™t understand the part about making a matching copy on the device.

one thing stuck outÃ¢Â€Â¦

setupEnvironment and deleteEnvironment are called from the constructor and destructor respectively. But they are virtual functions.

So, donÃ¢Â€Â™t call virtual functions from the ctor/dtor. To do what this actually means, make it non-virtual.

other

edited May 1 at 20:40

answered May 1 at 20:33

JDÃ…Â‚ugosz

5,047731

nice!

The code is generally rather good. I donÃ¢Â€Â™t have a lot to say about the language usage in the individual lines.

I donÃ¢Â€Â™t understand the part about making a matching copy on the device.

one thing stuck outÃ¢Â€Â¦

setupEnvironment and deleteEnvironment are called from the constructor and destructor respectively. But they are virtual functions.

So, donÃ¢Â€Â™t call virtual functions from the ctor/dtor. To do what this actually means, make it non-virtual.

other

edited May 1 at 20:40

answered May 1 at 20:33

JDÃ…Â‚ugosz

5,047731

edited May 1 at 20:40

answered May 1 at 20:33

JDÃ…Â‚ugosz

5,047731

answered May 1 at 20:33

JDÃ…Â‚ugosz

5,047731

answered May 1 at 20:33

JDÃ…Â‚ugosz

5,047731

Thanks for the feedback!! A few responses. "I don't understand the part about making a matching copy on the device." This is done to allow getBFieldAtS to be called on both device (where most computing is done) and on host (where it's needed on occasion). Instead of passing L_m, L_norm_m, s_max_m, ILATDegrees_m, ds_m, errorTolerance_m to the device function every call, I figured it was easier to create a copy of the class on device. This is even more necessary in other implementations (which have more complexity). Is there a better way?
â€“Â TomAdo
May 1 at 20:57

Good point on modelName_m...I suppose I just like smart containers (string). I could just make one const char*. Think I'll do that. setupEnvironment and deleteEnvironment - good point - I think that's an oversight on my part, as they don't need to be virtual. Great point.
â€“Â TomAdo
May 1 at 21:01

Regarding passing double everywhere...so are you saying typenames for each unit (mass in kg, accel in m/s2, etc)? I like the typedef idea...something like typedef double mass_kg_t. Are you saying more here, like some sort of struct containing a const char * that labels the units? That's an interesting idea...my one reservation with it is: will it impact GPU performance...(hence why you said "zero-overhead"). Is what I said an example of zero-overhead? If not, could you provide an example? Thanks again for the great advice.
â€“Â TomAdo
May 1 at 21:05

The typedef (now, you would use a using) makes the function signature more readable. foo (double, double) which is which? foo (angle_t, length_t) is clearer, but doesnÃ¢Â€Â™t help the compiler. But, obviously zero performance change since there is zero meaning change. Ã¢Â‹Â¯
â€“Â JDÃ…Â‚ugosz
May 1 at 21:19

No, a type-safe system would not contain a string. It would still contain the same bits as it does now. But given angle_t x = 25_degrees; or x = 0.5_radians; both are correct, and converted at compile time to the internal format. x = 32; would be a compile-time error. A classic zero-overhead template system is Boost.Units (boost.org/doc/libs/1_67_0/doc/html/boost_units.html). A narrower example that is more up-to-date is found in the standard library now for std::chrono::duration. Ã¢Â‹Â¯
â€“Â JDÃ…Â‚ugosz
May 1 at 21:27

Â |Â
show 2 more comments

Thanks for the feedback!! A few responses. "I don't understand the part about making a matching copy on the device." This is done to allow getBFieldAtS to be called on both device (where most computing is done) and on host (where it's needed on occasion). Instead of passing L_m, L_norm_m, s_max_m, ILATDegrees_m, ds_m, errorTolerance_m to the device function every call, I figured it was easier to create a copy of the class on device. This is even more necessary in other implementations (which have more complexity). Is there a better way?
â€“Â TomAdo
May 1 at 20:57

Good point on modelName_m...I suppose I just like smart containers (string). I could just make one const char*. Think I'll do that. setupEnvironment and deleteEnvironment - good point - I think that's an oversight on my part, as they don't need to be virtual. Great point.
â€“Â TomAdo
May 1 at 21:01

Regarding passing double everywhere...so are you saying typenames for each unit (mass in kg, accel in m/s2, etc)? I like the typedef idea...something like typedef double mass_kg_t. Are you saying more here, like some sort of struct containing a const char * that labels the units? That's an interesting idea...my one reservation with it is: will it impact GPU performance...(hence why you said "zero-overhead"). Is what I said an example of zero-overhead? If not, could you provide an example? Thanks again for the great advice.
â€“Â TomAdo
May 1 at 21:05

The typedef (now, you would use a using) makes the function signature more readable. foo (double, double) which is which? foo (angle_t, length_t) is clearer, but doesnÃ¢Â€Â™t help the compiler. But, obviously zero performance change since there is zero meaning change. Ã¢Â‹Â¯
â€“Â JDÃ…Â‚ugosz
May 1 at 21:19

No, a type-safe system would not contain a string. It would still contain the same bits as it does now. But given angle_t x = 25_degrees; or x = 0.5_radians; both are correct, and converted at compile time to the internal format. x = 32; would be a compile-time error. A classic zero-overhead template system is Boost.Units (boost.org/doc/libs/1_67_0/doc/html/boost_units.html). A narrower example that is more up-to-date is found in the standard library now for std::chrono::duration. Ã¢Â‹Â¯
â€“Â JDÃ…Â‚ugosz
May 1 at 21:27

Thanks for the feedback!! A few responses. "I don't understand the part about making a matching copy on the device." This is done to allow getBFieldAtS to be called on both device (where most computing is done) and on host (where it's needed on occasion). Instead of passing L_m, L_norm_m, s_max_m, ILATDegrees_m, ds_m, errorTolerance_m to the device function every call, I figured it was easier to create a copy of the class on device. This is even more necessary in other implementations (which have more complexity). Is there a better way?
â€“Â TomAdo
May 1 at 20:57

Good point on modelName_m...I suppose I just like smart containers (string). I could just make one const char*. Think I'll do that. setupEnvironment and deleteEnvironment - good point - I think that's an oversight on my part, as they don't need to be virtual. Great point.
â€“Â TomAdo
May 1 at 21:01

Regarding passing double everywhere...so are you saying typenames for each unit (mass in kg, accel in m/s2, etc)? I like the typedef idea...something like typedef double mass_kg_t. Are you saying more here, like some sort of struct containing a const char * that labels the units? That's an interesting idea...my one reservation with it is: will it impact GPU performance...(hence why you said "zero-overhead"). Is what I said an example of zero-overhead? If not, could you provide an example? Thanks again for the great advice.
â€“Â TomAdo
May 1 at 21:05

The typedef (now, you would use a using) makes the function signature more readable. foo (double, double) which is which? foo (angle_t, length_t) is clearer, but doesnÃ¢Â€Â™t help the compiler. But, obviously zero performance change since there is zero meaning change. Ã¢Â‹Â¯
â€“Â JDÃ…Â‚ugosz
May 1 at 21:19

No, a type-safe system would not contain a string. It would still contain the same bits as it does now. But given angle_t x = 25_degrees; or x = 0.5_radians; both are correct, and converted at compile time to the internal format. x = 32; would be a compile-time error. A classic zero-overhead template system is Boost.Units (boost.org/doc/libs/1_67_0/doc/html/boost_units.html). A narrower example that is more up-to-date is found in the standard library now for std::chrono::duration. Ã¢Â‹Â¯
â€“Â JDÃ…Â‚ugosz
May 1 at 21:27

Â |Â
show 2 more comments

draft saved

draft discarded

draft saved

draft discarded

Post as a guest

Name

搜尋此網誌

trjhtr

CUDA/C++ Host/Device Polymorphic Class Implementation

1 Answer
1

nice!

one thing stuck outÃ¢Â€Â¦

other

Your Answer

Post as a guest

1 Answer
1

1 Answer
1

nice!

one thing stuck outÃ¢Â€Â¦

other

nice!

one thing stuck outÃ¢Â€Â¦

other

nice!

one thing stuck outÃ¢Â€Â¦

other

nice!

one thing stuck outÃ¢Â€Â¦

other

Post as a guest

Popular posts from this blog

Read an image with ADNS2610 optical sensor and Arduino Uno

Read files from a directory using Promises

Chat program with C++ and SFML

CUDA/C++ Host/Device Polymorphic Class Implementation

1 Answer 1

nice!

one thing stuck outÃ¢Â€Â¦

other

Your Answer

Sign up or log in

Post as a guest

Post as a guest

1 Answer 1

1 Answer 1

nice!

one thing stuck outÃ¢Â€Â¦

other

nice!

one thing stuck outÃ¢Â€Â¦

other

nice!

one thing stuck outÃ¢Â€Â¦

other

nice!

one thing stuck outÃ¢Â€Â¦

other

Sign up or log in

Post as a guest

Post as a guest

Sign up or log in

Post as a guest

Sign up or log in

Post as a guest

Sign up or log in

Post as a guest

Popular posts from this blog

Read an image with ADNS2610 optical sensor and Arduino Uno

Read files from a directory using Promises

Chat program with C++ and SFML

1 Answer
1

1 Answer
1

1 Answer
1