CUDA/C++ Host/Device Polymorphic Class Implementation
Clash Royale CLAN TAG#URR8PPP
.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty margin-bottom:0;
up vote
7
down vote
favorite
I have an abstract class which acts as in interface for a variety of physical models providing Electric/Magnetic Fields as the result of a number of phenomena. I'm wondering if how I've done it is a good way to do it, or if there's a better way to achieve my goals.
My goals are:
- A common interface for returning the field strength (and a few related things through different functions) according to a physical field model chosen by the user at runtime
- The ability to call functions of this interface from host AND device with the same underlying implementation
- As performant as possible
The way I've achieved this is to: specify a base BField
class with __host__ __device__
specified pure virtual interface functions, and overwrite these with a number of derived classes (here DipoleB
). On host, when an instance of the derived class is created, a mirror image of the instance is also created on device and a pointer to the on-device instance is stored on host. This on-device instance is also destroyed on host-instance destruction. The interface functions (here it's getBFieldAtS(double, double)
and getGradBAtS(double, double)
) are called on device by a __global__
kernel which is run over ~3.5mil particles.
BField.h (Base class):
#ifndef BFIELD_H
#define BFIELD_H
#include <string>
//CUDA includes
#include "host_defines.h"
class BField
protected:
BField** this_d nullptr ;
#ifndef __CUDA_ARCH__ //host code
std::string modelName_m;
#else //device code
const char* modelName_m; //placeholder, not used
#endif /* !__CUDA_ARCH__ */
__host__ virtual void setupEnvironment() = 0; //define this function in derived classes to assign a pointer to that function's B Field code to the location indicated by BFieldFcnPtr_d and gradBFcnPtr_d
__host__ virtual void deleteEnvironment() = 0;
__host__ __device__ BField()
public:
__host__ __device__ virtual ~BField() ;
__host__ __device__ BField(const BField&) = delete;
__host__ __device__ BField& operator=(const BField&) = delete;
__host__ __device__ virtual double getBFieldAtS(const double s, const double t) const = 0;
__host__ __device__ virtual double getGradBAtS (const double s, const double t) const = 0;
__host__ virtual std::string name() const return modelName_m;
__host__ virtual BField** getPtrGPU() const return this_d; //once returned, have to cast it to the appropriate type
;
#endif
DipoleB.h (Derived):
#ifndef DIPOLEB_BFIELD_H
#define DIPOLEB_BFIELD_H
#include "BFieldBField.h"
#include "physicalconstants.h"
constexpr double B0 3.12e-5 ; //won't change from sim to sim
class DipoleB : public BField
protected:
//Field simulation constants
double L_m 0.0 ;
double L_norm_m 0.0 ;
double s_max_m 0.0 ;
//specified variables
double ILATDegrees_m 0.0 ;
double ds_m 0.0 ;
double errorTolerance_m 0.0 ;
//protected functions
__host__ virtual void setupEnvironment() override;
__host__ virtual void deleteEnvironment() override;
__host__ __device__ double getSAtLambda(const double lambdaDegrees) const;
__host__ __device__ double getLambdaAtS(const double s) const;
public:
__host__ __device__ DipoleB(double ILATDegrees, double errorTolerance = 1e-4, double ds = RADIUS_EARTH / 1000.0);
__host__ __device__ ~DipoleB();
__host__ __device__ DipoleB(const DipoleB&) = delete;
__host__ __device__ DipoleB& operator=(const DipoleB&) = delete;
//for testing
double ILAT() const return ILATDegrees_m;
double ds() const return ds_m;
double L() const return L_m;
double s_max() const return s_max_m;
__host__ virtual void setds(double ds) ds_m = ds;
__host__ __device__ double getBFieldAtS(const double s, const double t) const override;
__host__ __device__ double getGradBAtS (const double s, const double t) const override;
__host__ double getErrTol() const return errorTolerance_m;
__host__ double getds() const return ds_m;
;
#endif
DipoleB.cu (Derived member function definition and some CUDA kernels):
#include "BFieldDipoleB.h"
#include "device_launch_parameters.h"
#include "ErrorHandlingcudaErrorCheck.h"
#include "ErrorHandlingcudaDeviceMacros.h"
//setup CUDA kernels
__global__ void setupEnvironmentGPU_DipoleB(BField** this_d, double ILATDeg, double errTol, double ds)
ZEROTH_THREAD_ONLY("setupEnvironmentGPU_DipoleB", (*this_d) = new DipoleB(ILATDeg, errTol, ds));
__global__ void deleteEnvironmentGPU_DipoleB(BField** dipoleb)
ZEROTH_THREAD_ONLY("deleteEnvironmentGPU_DipoleB", delete ((DipoleB*)(*dipoleb)));
__host__ __device__ DipoleB::DipoleB(double ILATDegrees, double errorTolerance, double ds) :
BField(), ILATDegrees_m ILATDegrees , ds_m ds , errorTolerance_m errorTolerance
L_m = RADIUS_EARTH / pow(cos(ILATDegrees * RADS_PER_DEG), 2);
L_norm_m = L_m / RADIUS_EARTH;
s_max_m = getSAtLambda(ILATDegrees_m);
#ifndef __CUDA_ARCH__ //host code
modelName_m = "DipoleB";
setupEnvironment();
#endif /* !__CUDA_ARCH__ */
__host__ __device__ DipoleB::~DipoleB()
#ifndef __CUDA_ARCH__ //host code
deleteEnvironment();
#endif /* !__CUDA_ARCH__ */
//B Field related kernels
__host__ __device__ double DipoleB::getSAtLambda(const double lambdaDegrees) const
//double x asinh(sqrt(3.0) * sinpi(lambdaDegrees / 180.0)) ;
double sinh_x sqrt(3.0) * sinpi(lambdaDegrees / 180.0) ;
double x log(sinh_x + sqrt(sinh_x * sinh_x + 1)) ; //trig identity for asinh - a bit faster - asinh(x) == ln(x + sqrt(x*x + 1))
return (0.5 * L_m / sqrt(3.0)) * (x + 0.25 * (exp(2.0*x)-exp(-2.0*x))); /* L */ //0.25 * (exp(2*x)-exp(-2*x)) == sinh(x) * cosh(x) and is faster
__host__ __device__ double DipoleB::getLambdaAtS(const double s) const
// consts: [ ILATDeg, L, L_norm, s_max, ds, errorTolerance ]
double lambda_tmp (-ILATDegrees_m / s_max_m) * s + ILATDegrees_m ; //-ILAT / s_max * s + ILAT
double s_tmp s_max_m - getSAtLambda(lambda_tmp) ;
double dlambda 1.0 ;
bool over 0 ;
while (abs((s_tmp - s) / s) > errorTolerance_m) //errorTolerance
while (1)
over = (s_tmp >= s);
if (over)
lambda_tmp += dlambda;
s_tmp = s_max_m - getSAtLambda(lambda_tmp);
if (s_tmp < s)
break;
else
lambda_tmp -= dlambda;
s_tmp = s_max_m - getSAtLambda(lambda_tmp);
if (s_tmp >= s)
break;
if (dlambda < errorTolerance_m / 100.0) //errorTolerance
break;
dlambda /= 5.0; //through trial and error, this reduces the number of calculations usually (compared with 2, 2.5, 3, 4, 10)
return lambda_tmp;
__host__ __device__ double DipoleB::getBFieldAtS(const double s, const double simtime) const
// consts: [ ILATDeg, L, L_norm, s_max, ds, errorTolerance ]
double lambda_deg getLambdaAtS(s) ;
double rnorm L_norm_m * cospi(lambda_deg / 180.0) * cospi(lambda_deg / 180.0) ;
return -B0 / (rnorm * rnorm * rnorm) * sqrt(1.0 + 3 * sinpi(lambda_deg / 180.0) * sinpi(lambda_deg / 180.0));
__host__ __device__ double DipoleB::getGradBAtS(const double s, const double simtime) const
return (getBFieldAtS(s + ds_m, simtime) - getBFieldAtS(s - ds_m, simtime)) / (2 * ds_m);
//DipoleB class member functions
void DipoleB::setupEnvironment()
// consts: [ ILATDeg, L, L_norm, s_max, ds, errorTolerance ]
CUDA_API_ERRCHK(cudaMalloc((void **)&this_d, sizeof(BField**)));
setupEnvironmentGPU_DipoleB <<< 1, 1 >>> (this_d, ILATDegrees_m, errorTolerance_m, ds_m);
CUDA_KERNEL_ERRCHK_WSYNC();
void DipoleB::deleteEnvironment()
deleteEnvironmentGPU_DipoleB <<< 1, 1 >>> (this_d);
CUDA_KERNEL_ERRCHK_WSYNC();
CUDA_API_ERRCHK(cudaFree(this_d));
Calling Functions:
__device__ double accel1dCUDA(const double vs_RK, const double t_RK, const double* args, BField** bfield, EField** efield) //made to pass into 1D Fourth Order Runge Kutta code
//args array: [s_0, mu, q, m, simtime]
double F_lor, F_mir, stmp;
stmp = args[0] + vs_RK * t_RK; //ps_0 + vs_RK * t_RK
//Mirror force
F_mir = -args[1] * (*bfield)->getGradBAtS(stmp, t_RK + args[4]); //-mu * gradB(pos, runge-kutta time + simtime)
//Lorentz force - simply qE - v x B is taken care of by mu - results in kg.m/s^2 - to convert to Re equivalent - divide by Re
F_lor = args[2] * (*efield)->getEFieldAtS(stmp, t_RK + args[4]); //q * EFieldatS
return (F_lor + F_mir) / args[3];
//returns an acceleration in the parallel direction to the B Field
__device__ double foRungeKuttaCUDA(const double y_0, const double h, const double* funcArg, BField** bfield, EField** efield)
// dy / dt = f(t, y), y(t_0) = y_0
// funcArgs are whatever you need to pass to the equation
// args array: [s_0, mu, q, m, simtime]
double k1, k2, k3, k4; double y y_0 ; double t_RK 0.0 ;
k1 = accel1dCUDA(y, t_RK, funcArg, bfield, efield); //k1 = f(t_n, y_n), returns units of dy / dt
t_RK = h / 2;
y = y_0 + k1 * t_RK;
k2 = accel1dCUDA(y, t_RK, funcArg, bfield, efield); //k2 = f(t_n + h/2, y_n + h/2 * k1)
y = y_0 + k2 * t_RK;
k3 = accel1dCUDA(y, t_RK, funcArg, bfield, efield); //k3 = f(t_n + h/2, y_n + h/2 * k2)
t_RK = h;
y = y_0 + k3 * t_RK;
k4 = accel1dCUDA(y, t_RK, funcArg, bfield, efield); //k4 = f(t_n + h, y_n + h k3)
return (k1 + 2 * k2 + 2 * k3 + k4) * h / 6; //returns delta y, not dy / dt, not total y
__global__ void computeKernel(double** currData_d, BField** bfield, EField** efield,
const double simtime, const double dt, const double mass, const double charge, const double simmin, const double simmax)
unsigned int thdInd blockIdx.x * blockDim.x + threadIdx.x ;
double* v_d currData_d[0] ; const double* mu_d currData_d[1] ; double* s_d currData_d[2] ; const double* t_incident_d currData_d[3] ; double* t_escape_d currData_d[4] ;
if (t_escape_d[thdInd] >= 0.0) //particle has escaped, t_escape is >= 0 iff it has both entered and is outside the sim boundaries
return;
else if (t_incident_d[thdInd] > simtime) //particle hasn't "entered the sim" yet
return;
else if (s_d[thdInd] < simmin * 0.999) //particle is out of sim to the bottom and t_escape not set yet
t_escape_d[thdInd] = simtime;
return;
else if (s_d[thdInd] > simmax * 1.001) //particle is out of sim to the top and t_escape not set yet
t_escape_d[thdInd] = simtime;
return;
//args array: [ps_0, mu, q, m, simtime]
const double args s_d[thdInd], mu_d[thdInd], charge, mass, simtime ;
v_d[thdInd] += foRungeKuttaCUDA(v_d[thdInd], dt, args, bfield, efield) / 2;
s_d[thdInd] += v_d[thdInd] * dt;
A few questions:
- Am I achieving my goals in the most efficient way possible?
- Are there any performance issues incurred by the fact that I'm creating one instance of a derived class on GPU and calling the interface function ~3.5 million * number of iterations times? That is, what are the implications of this many calls to a single member function?
This produces expected physical results (that is, calls to interface functions are producing the correct values because the particles behave appropriately), however when running through cuda-memcheck, I get a whole host of issues. I'm thinking this is because of how
BField
is set up and the fact that calling the (virtual) interface functions accesses something that would be outside the memory footprint of aBase
instance:[BField instance memory footprint][-------(x impl of virt fcn here)----DipoleB Instance footprint-------]
and cuda-memcheck doesn't think this should be valid. Does this sound feasible? Do I understand what is going on right?
- Any non-optimal performance issues incurred by device-side dynamic allocation? Is there even another way to do this?
c++ polymorphism cuda
add a comment |Â
up vote
7
down vote
favorite
I have an abstract class which acts as in interface for a variety of physical models providing Electric/Magnetic Fields as the result of a number of phenomena. I'm wondering if how I've done it is a good way to do it, or if there's a better way to achieve my goals.
My goals are:
- A common interface for returning the field strength (and a few related things through different functions) according to a physical field model chosen by the user at runtime
- The ability to call functions of this interface from host AND device with the same underlying implementation
- As performant as possible
The way I've achieved this is to: specify a base BField
class with __host__ __device__
specified pure virtual interface functions, and overwrite these with a number of derived classes (here DipoleB
). On host, when an instance of the derived class is created, a mirror image of the instance is also created on device and a pointer to the on-device instance is stored on host. This on-device instance is also destroyed on host-instance destruction. The interface functions (here it's getBFieldAtS(double, double)
and getGradBAtS(double, double)
) are called on device by a __global__
kernel which is run over ~3.5mil particles.
BField.h (Base class):
#ifndef BFIELD_H
#define BFIELD_H
#include <string>
//CUDA includes
#include "host_defines.h"
class BField
protected:
BField** this_d nullptr ;
#ifndef __CUDA_ARCH__ //host code
std::string modelName_m;
#else //device code
const char* modelName_m; //placeholder, not used
#endif /* !__CUDA_ARCH__ */
__host__ virtual void setupEnvironment() = 0; //define this function in derived classes to assign a pointer to that function's B Field code to the location indicated by BFieldFcnPtr_d and gradBFcnPtr_d
__host__ virtual void deleteEnvironment() = 0;
__host__ __device__ BField()
public:
__host__ __device__ virtual ~BField() ;
__host__ __device__ BField(const BField&) = delete;
__host__ __device__ BField& operator=(const BField&) = delete;
__host__ __device__ virtual double getBFieldAtS(const double s, const double t) const = 0;
__host__ __device__ virtual double getGradBAtS (const double s, const double t) const = 0;
__host__ virtual std::string name() const return modelName_m;
__host__ virtual BField** getPtrGPU() const return this_d; //once returned, have to cast it to the appropriate type
;
#endif
DipoleB.h (Derived):
#ifndef DIPOLEB_BFIELD_H
#define DIPOLEB_BFIELD_H
#include "BFieldBField.h"
#include "physicalconstants.h"
constexpr double B0 3.12e-5 ; //won't change from sim to sim
class DipoleB : public BField
protected:
//Field simulation constants
double L_m 0.0 ;
double L_norm_m 0.0 ;
double s_max_m 0.0 ;
//specified variables
double ILATDegrees_m 0.0 ;
double ds_m 0.0 ;
double errorTolerance_m 0.0 ;
//protected functions
__host__ virtual void setupEnvironment() override;
__host__ virtual void deleteEnvironment() override;
__host__ __device__ double getSAtLambda(const double lambdaDegrees) const;
__host__ __device__ double getLambdaAtS(const double s) const;
public:
__host__ __device__ DipoleB(double ILATDegrees, double errorTolerance = 1e-4, double ds = RADIUS_EARTH / 1000.0);
__host__ __device__ ~DipoleB();
__host__ __device__ DipoleB(const DipoleB&) = delete;
__host__ __device__ DipoleB& operator=(const DipoleB&) = delete;
//for testing
double ILAT() const return ILATDegrees_m;
double ds() const return ds_m;
double L() const return L_m;
double s_max() const return s_max_m;
__host__ virtual void setds(double ds) ds_m = ds;
__host__ __device__ double getBFieldAtS(const double s, const double t) const override;
__host__ __device__ double getGradBAtS (const double s, const double t) const override;
__host__ double getErrTol() const return errorTolerance_m;
__host__ double getds() const return ds_m;
;
#endif
DipoleB.cu (Derived member function definition and some CUDA kernels):
#include "BFieldDipoleB.h"
#include "device_launch_parameters.h"
#include "ErrorHandlingcudaErrorCheck.h"
#include "ErrorHandlingcudaDeviceMacros.h"
//setup CUDA kernels
__global__ void setupEnvironmentGPU_DipoleB(BField** this_d, double ILATDeg, double errTol, double ds)
ZEROTH_THREAD_ONLY("setupEnvironmentGPU_DipoleB", (*this_d) = new DipoleB(ILATDeg, errTol, ds));
__global__ void deleteEnvironmentGPU_DipoleB(BField** dipoleb)
ZEROTH_THREAD_ONLY("deleteEnvironmentGPU_DipoleB", delete ((DipoleB*)(*dipoleb)));
__host__ __device__ DipoleB::DipoleB(double ILATDegrees, double errorTolerance, double ds) :
BField(), ILATDegrees_m ILATDegrees , ds_m ds , errorTolerance_m errorTolerance
L_m = RADIUS_EARTH / pow(cos(ILATDegrees * RADS_PER_DEG), 2);
L_norm_m = L_m / RADIUS_EARTH;
s_max_m = getSAtLambda(ILATDegrees_m);
#ifndef __CUDA_ARCH__ //host code
modelName_m = "DipoleB";
setupEnvironment();
#endif /* !__CUDA_ARCH__ */
__host__ __device__ DipoleB::~DipoleB()
#ifndef __CUDA_ARCH__ //host code
deleteEnvironment();
#endif /* !__CUDA_ARCH__ */
//B Field related kernels
__host__ __device__ double DipoleB::getSAtLambda(const double lambdaDegrees) const
//double x asinh(sqrt(3.0) * sinpi(lambdaDegrees / 180.0)) ;
double sinh_x sqrt(3.0) * sinpi(lambdaDegrees / 180.0) ;
double x log(sinh_x + sqrt(sinh_x * sinh_x + 1)) ; //trig identity for asinh - a bit faster - asinh(x) == ln(x + sqrt(x*x + 1))
return (0.5 * L_m / sqrt(3.0)) * (x + 0.25 * (exp(2.0*x)-exp(-2.0*x))); /* L */ //0.25 * (exp(2*x)-exp(-2*x)) == sinh(x) * cosh(x) and is faster
__host__ __device__ double DipoleB::getLambdaAtS(const double s) const
// consts: [ ILATDeg, L, L_norm, s_max, ds, errorTolerance ]
double lambda_tmp (-ILATDegrees_m / s_max_m) * s + ILATDegrees_m ; //-ILAT / s_max * s + ILAT
double s_tmp s_max_m - getSAtLambda(lambda_tmp) ;
double dlambda 1.0 ;
bool over 0 ;
while (abs((s_tmp - s) / s) > errorTolerance_m) //errorTolerance
while (1)
over = (s_tmp >= s);
if (over)
lambda_tmp += dlambda;
s_tmp = s_max_m - getSAtLambda(lambda_tmp);
if (s_tmp < s)
break;
else
lambda_tmp -= dlambda;
s_tmp = s_max_m - getSAtLambda(lambda_tmp);
if (s_tmp >= s)
break;
if (dlambda < errorTolerance_m / 100.0) //errorTolerance
break;
dlambda /= 5.0; //through trial and error, this reduces the number of calculations usually (compared with 2, 2.5, 3, 4, 10)
return lambda_tmp;
__host__ __device__ double DipoleB::getBFieldAtS(const double s, const double simtime) const
// consts: [ ILATDeg, L, L_norm, s_max, ds, errorTolerance ]
double lambda_deg getLambdaAtS(s) ;
double rnorm L_norm_m * cospi(lambda_deg / 180.0) * cospi(lambda_deg / 180.0) ;
return -B0 / (rnorm * rnorm * rnorm) * sqrt(1.0 + 3 * sinpi(lambda_deg / 180.0) * sinpi(lambda_deg / 180.0));
__host__ __device__ double DipoleB::getGradBAtS(const double s, const double simtime) const
return (getBFieldAtS(s + ds_m, simtime) - getBFieldAtS(s - ds_m, simtime)) / (2 * ds_m);
//DipoleB class member functions
void DipoleB::setupEnvironment()
// consts: [ ILATDeg, L, L_norm, s_max, ds, errorTolerance ]
CUDA_API_ERRCHK(cudaMalloc((void **)&this_d, sizeof(BField**)));
setupEnvironmentGPU_DipoleB <<< 1, 1 >>> (this_d, ILATDegrees_m, errorTolerance_m, ds_m);
CUDA_KERNEL_ERRCHK_WSYNC();
void DipoleB::deleteEnvironment()
deleteEnvironmentGPU_DipoleB <<< 1, 1 >>> (this_d);
CUDA_KERNEL_ERRCHK_WSYNC();
CUDA_API_ERRCHK(cudaFree(this_d));
Calling Functions:
__device__ double accel1dCUDA(const double vs_RK, const double t_RK, const double* args, BField** bfield, EField** efield) //made to pass into 1D Fourth Order Runge Kutta code
//args array: [s_0, mu, q, m, simtime]
double F_lor, F_mir, stmp;
stmp = args[0] + vs_RK * t_RK; //ps_0 + vs_RK * t_RK
//Mirror force
F_mir = -args[1] * (*bfield)->getGradBAtS(stmp, t_RK + args[4]); //-mu * gradB(pos, runge-kutta time + simtime)
//Lorentz force - simply qE - v x B is taken care of by mu - results in kg.m/s^2 - to convert to Re equivalent - divide by Re
F_lor = args[2] * (*efield)->getEFieldAtS(stmp, t_RK + args[4]); //q * EFieldatS
return (F_lor + F_mir) / args[3];
//returns an acceleration in the parallel direction to the B Field
__device__ double foRungeKuttaCUDA(const double y_0, const double h, const double* funcArg, BField** bfield, EField** efield)
// dy / dt = f(t, y), y(t_0) = y_0
// funcArgs are whatever you need to pass to the equation
// args array: [s_0, mu, q, m, simtime]
double k1, k2, k3, k4; double y y_0 ; double t_RK 0.0 ;
k1 = accel1dCUDA(y, t_RK, funcArg, bfield, efield); //k1 = f(t_n, y_n), returns units of dy / dt
t_RK = h / 2;
y = y_0 + k1 * t_RK;
k2 = accel1dCUDA(y, t_RK, funcArg, bfield, efield); //k2 = f(t_n + h/2, y_n + h/2 * k1)
y = y_0 + k2 * t_RK;
k3 = accel1dCUDA(y, t_RK, funcArg, bfield, efield); //k3 = f(t_n + h/2, y_n + h/2 * k2)
t_RK = h;
y = y_0 + k3 * t_RK;
k4 = accel1dCUDA(y, t_RK, funcArg, bfield, efield); //k4 = f(t_n + h, y_n + h k3)
return (k1 + 2 * k2 + 2 * k3 + k4) * h / 6; //returns delta y, not dy / dt, not total y
__global__ void computeKernel(double** currData_d, BField** bfield, EField** efield,
const double simtime, const double dt, const double mass, const double charge, const double simmin, const double simmax)
unsigned int thdInd blockIdx.x * blockDim.x + threadIdx.x ;
double* v_d currData_d[0] ; const double* mu_d currData_d[1] ; double* s_d currData_d[2] ; const double* t_incident_d currData_d[3] ; double* t_escape_d currData_d[4] ;
if (t_escape_d[thdInd] >= 0.0) //particle has escaped, t_escape is >= 0 iff it has both entered and is outside the sim boundaries
return;
else if (t_incident_d[thdInd] > simtime) //particle hasn't "entered the sim" yet
return;
else if (s_d[thdInd] < simmin * 0.999) //particle is out of sim to the bottom and t_escape not set yet
t_escape_d[thdInd] = simtime;
return;
else if (s_d[thdInd] > simmax * 1.001) //particle is out of sim to the top and t_escape not set yet
t_escape_d[thdInd] = simtime;
return;
//args array: [ps_0, mu, q, m, simtime]
const double args s_d[thdInd], mu_d[thdInd], charge, mass, simtime ;
v_d[thdInd] += foRungeKuttaCUDA(v_d[thdInd], dt, args, bfield, efield) / 2;
s_d[thdInd] += v_d[thdInd] * dt;
A few questions:
- Am I achieving my goals in the most efficient way possible?
- Are there any performance issues incurred by the fact that I'm creating one instance of a derived class on GPU and calling the interface function ~3.5 million * number of iterations times? That is, what are the implications of this many calls to a single member function?
This produces expected physical results (that is, calls to interface functions are producing the correct values because the particles behave appropriately), however when running through cuda-memcheck, I get a whole host of issues. I'm thinking this is because of how
BField
is set up and the fact that calling the (virtual) interface functions accesses something that would be outside the memory footprint of aBase
instance:[BField instance memory footprint][-------(x impl of virt fcn here)----DipoleB Instance footprint-------]
and cuda-memcheck doesn't think this should be valid. Does this sound feasible? Do I understand what is going on right?
- Any non-optimal performance issues incurred by device-side dynamic allocation? Is there even another way to do this?
c++ polymorphism cuda
add a comment |Â
up vote
7
down vote
favorite
up vote
7
down vote
favorite
I have an abstract class which acts as in interface for a variety of physical models providing Electric/Magnetic Fields as the result of a number of phenomena. I'm wondering if how I've done it is a good way to do it, or if there's a better way to achieve my goals.
My goals are:
- A common interface for returning the field strength (and a few related things through different functions) according to a physical field model chosen by the user at runtime
- The ability to call functions of this interface from host AND device with the same underlying implementation
- As performant as possible
The way I've achieved this is to: specify a base BField
class with __host__ __device__
specified pure virtual interface functions, and overwrite these with a number of derived classes (here DipoleB
). On host, when an instance of the derived class is created, a mirror image of the instance is also created on device and a pointer to the on-device instance is stored on host. This on-device instance is also destroyed on host-instance destruction. The interface functions (here it's getBFieldAtS(double, double)
and getGradBAtS(double, double)
) are called on device by a __global__
kernel which is run over ~3.5mil particles.
BField.h (Base class):
#ifndef BFIELD_H
#define BFIELD_H
#include <string>
//CUDA includes
#include "host_defines.h"
class BField
protected:
BField** this_d nullptr ;
#ifndef __CUDA_ARCH__ //host code
std::string modelName_m;
#else //device code
const char* modelName_m; //placeholder, not used
#endif /* !__CUDA_ARCH__ */
__host__ virtual void setupEnvironment() = 0; //define this function in derived classes to assign a pointer to that function's B Field code to the location indicated by BFieldFcnPtr_d and gradBFcnPtr_d
__host__ virtual void deleteEnvironment() = 0;
__host__ __device__ BField()
public:
__host__ __device__ virtual ~BField() ;
__host__ __device__ BField(const BField&) = delete;
__host__ __device__ BField& operator=(const BField&) = delete;
__host__ __device__ virtual double getBFieldAtS(const double s, const double t) const = 0;
__host__ __device__ virtual double getGradBAtS (const double s, const double t) const = 0;
__host__ virtual std::string name() const return modelName_m;
__host__ virtual BField** getPtrGPU() const return this_d; //once returned, have to cast it to the appropriate type
;
#endif
DipoleB.h (Derived):
#ifndef DIPOLEB_BFIELD_H
#define DIPOLEB_BFIELD_H
#include "BFieldBField.h"
#include "physicalconstants.h"
constexpr double B0 3.12e-5 ; //won't change from sim to sim
class DipoleB : public BField
protected:
//Field simulation constants
double L_m 0.0 ;
double L_norm_m 0.0 ;
double s_max_m 0.0 ;
//specified variables
double ILATDegrees_m 0.0 ;
double ds_m 0.0 ;
double errorTolerance_m 0.0 ;
//protected functions
__host__ virtual void setupEnvironment() override;
__host__ virtual void deleteEnvironment() override;
__host__ __device__ double getSAtLambda(const double lambdaDegrees) const;
__host__ __device__ double getLambdaAtS(const double s) const;
public:
__host__ __device__ DipoleB(double ILATDegrees, double errorTolerance = 1e-4, double ds = RADIUS_EARTH / 1000.0);
__host__ __device__ ~DipoleB();
__host__ __device__ DipoleB(const DipoleB&) = delete;
__host__ __device__ DipoleB& operator=(const DipoleB&) = delete;
//for testing
double ILAT() const return ILATDegrees_m;
double ds() const return ds_m;
double L() const return L_m;
double s_max() const return s_max_m;
__host__ virtual void setds(double ds) ds_m = ds;
__host__ __device__ double getBFieldAtS(const double s, const double t) const override;
__host__ __device__ double getGradBAtS (const double s, const double t) const override;
__host__ double getErrTol() const return errorTolerance_m;
__host__ double getds() const return ds_m;
;
#endif
DipoleB.cu (Derived member function definition and some CUDA kernels):
#include "BFieldDipoleB.h"
#include "device_launch_parameters.h"
#include "ErrorHandlingcudaErrorCheck.h"
#include "ErrorHandlingcudaDeviceMacros.h"
//setup CUDA kernels
__global__ void setupEnvironmentGPU_DipoleB(BField** this_d, double ILATDeg, double errTol, double ds)
ZEROTH_THREAD_ONLY("setupEnvironmentGPU_DipoleB", (*this_d) = new DipoleB(ILATDeg, errTol, ds));
__global__ void deleteEnvironmentGPU_DipoleB(BField** dipoleb)
ZEROTH_THREAD_ONLY("deleteEnvironmentGPU_DipoleB", delete ((DipoleB*)(*dipoleb)));
__host__ __device__ DipoleB::DipoleB(double ILATDegrees, double errorTolerance, double ds) :
BField(), ILATDegrees_m ILATDegrees , ds_m ds , errorTolerance_m errorTolerance
L_m = RADIUS_EARTH / pow(cos(ILATDegrees * RADS_PER_DEG), 2);
L_norm_m = L_m / RADIUS_EARTH;
s_max_m = getSAtLambda(ILATDegrees_m);
#ifndef __CUDA_ARCH__ //host code
modelName_m = "DipoleB";
setupEnvironment();
#endif /* !__CUDA_ARCH__ */
__host__ __device__ DipoleB::~DipoleB()
#ifndef __CUDA_ARCH__ //host code
deleteEnvironment();
#endif /* !__CUDA_ARCH__ */
//B Field related kernels
__host__ __device__ double DipoleB::getSAtLambda(const double lambdaDegrees) const
//double x asinh(sqrt(3.0) * sinpi(lambdaDegrees / 180.0)) ;
double sinh_x sqrt(3.0) * sinpi(lambdaDegrees / 180.0) ;
double x log(sinh_x + sqrt(sinh_x * sinh_x + 1)) ; //trig identity for asinh - a bit faster - asinh(x) == ln(x + sqrt(x*x + 1))
return (0.5 * L_m / sqrt(3.0)) * (x + 0.25 * (exp(2.0*x)-exp(-2.0*x))); /* L */ //0.25 * (exp(2*x)-exp(-2*x)) == sinh(x) * cosh(x) and is faster
__host__ __device__ double DipoleB::getLambdaAtS(const double s) const
// consts: [ ILATDeg, L, L_norm, s_max, ds, errorTolerance ]
double lambda_tmp (-ILATDegrees_m / s_max_m) * s + ILATDegrees_m ; //-ILAT / s_max * s + ILAT
double s_tmp s_max_m - getSAtLambda(lambda_tmp) ;
double dlambda 1.0 ;
bool over 0 ;
while (abs((s_tmp - s) / s) > errorTolerance_m) //errorTolerance
while (1)
over = (s_tmp >= s);
if (over)
lambda_tmp += dlambda;
s_tmp = s_max_m - getSAtLambda(lambda_tmp);
if (s_tmp < s)
break;
else
lambda_tmp -= dlambda;
s_tmp = s_max_m - getSAtLambda(lambda_tmp);
if (s_tmp >= s)
break;
if (dlambda < errorTolerance_m / 100.0) //errorTolerance
break;
dlambda /= 5.0; //through trial and error, this reduces the number of calculations usually (compared with 2, 2.5, 3, 4, 10)
return lambda_tmp;
__host__ __device__ double DipoleB::getBFieldAtS(const double s, const double simtime) const
// consts: [ ILATDeg, L, L_norm, s_max, ds, errorTolerance ]
double lambda_deg getLambdaAtS(s) ;
double rnorm L_norm_m * cospi(lambda_deg / 180.0) * cospi(lambda_deg / 180.0) ;
return -B0 / (rnorm * rnorm * rnorm) * sqrt(1.0 + 3 * sinpi(lambda_deg / 180.0) * sinpi(lambda_deg / 180.0));
__host__ __device__ double DipoleB::getGradBAtS(const double s, const double simtime) const
return (getBFieldAtS(s + ds_m, simtime) - getBFieldAtS(s - ds_m, simtime)) / (2 * ds_m);
//DipoleB class member functions
void DipoleB::setupEnvironment()
// consts: [ ILATDeg, L, L_norm, s_max, ds, errorTolerance ]
CUDA_API_ERRCHK(cudaMalloc((void **)&this_d, sizeof(BField**)));
setupEnvironmentGPU_DipoleB <<< 1, 1 >>> (this_d, ILATDegrees_m, errorTolerance_m, ds_m);
CUDA_KERNEL_ERRCHK_WSYNC();
void DipoleB::deleteEnvironment()
deleteEnvironmentGPU_DipoleB <<< 1, 1 >>> (this_d);
CUDA_KERNEL_ERRCHK_WSYNC();
CUDA_API_ERRCHK(cudaFree(this_d));
Calling Functions:
__device__ double accel1dCUDA(const double vs_RK, const double t_RK, const double* args, BField** bfield, EField** efield) //made to pass into 1D Fourth Order Runge Kutta code
//args array: [s_0, mu, q, m, simtime]
double F_lor, F_mir, stmp;
stmp = args[0] + vs_RK * t_RK; //ps_0 + vs_RK * t_RK
//Mirror force
F_mir = -args[1] * (*bfield)->getGradBAtS(stmp, t_RK + args[4]); //-mu * gradB(pos, runge-kutta time + simtime)
//Lorentz force - simply qE - v x B is taken care of by mu - results in kg.m/s^2 - to convert to Re equivalent - divide by Re
F_lor = args[2] * (*efield)->getEFieldAtS(stmp, t_RK + args[4]); //q * EFieldatS
return (F_lor + F_mir) / args[3];
//returns an acceleration in the parallel direction to the B Field
__device__ double foRungeKuttaCUDA(const double y_0, const double h, const double* funcArg, BField** bfield, EField** efield)
// dy / dt = f(t, y), y(t_0) = y_0
// funcArgs are whatever you need to pass to the equation
// args array: [s_0, mu, q, m, simtime]
double k1, k2, k3, k4; double y y_0 ; double t_RK 0.0 ;
k1 = accel1dCUDA(y, t_RK, funcArg, bfield, efield); //k1 = f(t_n, y_n), returns units of dy / dt
t_RK = h / 2;
y = y_0 + k1 * t_RK;
k2 = accel1dCUDA(y, t_RK, funcArg, bfield, efield); //k2 = f(t_n + h/2, y_n + h/2 * k1)
y = y_0 + k2 * t_RK;
k3 = accel1dCUDA(y, t_RK, funcArg, bfield, efield); //k3 = f(t_n + h/2, y_n + h/2 * k2)
t_RK = h;
y = y_0 + k3 * t_RK;
k4 = accel1dCUDA(y, t_RK, funcArg, bfield, efield); //k4 = f(t_n + h, y_n + h k3)
return (k1 + 2 * k2 + 2 * k3 + k4) * h / 6; //returns delta y, not dy / dt, not total y
__global__ void computeKernel(double** currData_d, BField** bfield, EField** efield,
const double simtime, const double dt, const double mass, const double charge, const double simmin, const double simmax)
unsigned int thdInd blockIdx.x * blockDim.x + threadIdx.x ;
double* v_d currData_d[0] ; const double* mu_d currData_d[1] ; double* s_d currData_d[2] ; const double* t_incident_d currData_d[3] ; double* t_escape_d currData_d[4] ;
if (t_escape_d[thdInd] >= 0.0) //particle has escaped, t_escape is >= 0 iff it has both entered and is outside the sim boundaries
return;
else if (t_incident_d[thdInd] > simtime) //particle hasn't "entered the sim" yet
return;
else if (s_d[thdInd] < simmin * 0.999) //particle is out of sim to the bottom and t_escape not set yet
t_escape_d[thdInd] = simtime;
return;
else if (s_d[thdInd] > simmax * 1.001) //particle is out of sim to the top and t_escape not set yet
t_escape_d[thdInd] = simtime;
return;
//args array: [ps_0, mu, q, m, simtime]
const double args s_d[thdInd], mu_d[thdInd], charge, mass, simtime ;
v_d[thdInd] += foRungeKuttaCUDA(v_d[thdInd], dt, args, bfield, efield) / 2;
s_d[thdInd] += v_d[thdInd] * dt;
A few questions:
- Am I achieving my goals in the most efficient way possible?
- Are there any performance issues incurred by the fact that I'm creating one instance of a derived class on GPU and calling the interface function ~3.5 million * number of iterations times? That is, what are the implications of this many calls to a single member function?
This produces expected physical results (that is, calls to interface functions are producing the correct values because the particles behave appropriately), however when running through cuda-memcheck, I get a whole host of issues. I'm thinking this is because of how
BField
is set up and the fact that calling the (virtual) interface functions accesses something that would be outside the memory footprint of aBase
instance:[BField instance memory footprint][-------(x impl of virt fcn here)----DipoleB Instance footprint-------]
and cuda-memcheck doesn't think this should be valid. Does this sound feasible? Do I understand what is going on right?
- Any non-optimal performance issues incurred by device-side dynamic allocation? Is there even another way to do this?
c++ polymorphism cuda
I have an abstract class which acts as in interface for a variety of physical models providing Electric/Magnetic Fields as the result of a number of phenomena. I'm wondering if how I've done it is a good way to do it, or if there's a better way to achieve my goals.
My goals are:
- A common interface for returning the field strength (and a few related things through different functions) according to a physical field model chosen by the user at runtime
- The ability to call functions of this interface from host AND device with the same underlying implementation
- As performant as possible
The way I've achieved this is to: specify a base BField
class with __host__ __device__
specified pure virtual interface functions, and overwrite these with a number of derived classes (here DipoleB
). On host, when an instance of the derived class is created, a mirror image of the instance is also created on device and a pointer to the on-device instance is stored on host. This on-device instance is also destroyed on host-instance destruction. The interface functions (here it's getBFieldAtS(double, double)
and getGradBAtS(double, double)
) are called on device by a __global__
kernel which is run over ~3.5mil particles.
BField.h (Base class):
#ifndef BFIELD_H
#define BFIELD_H
#include <string>
//CUDA includes
#include "host_defines.h"
class BField
protected:
BField** this_d nullptr ;
#ifndef __CUDA_ARCH__ //host code
std::string modelName_m;
#else //device code
const char* modelName_m; //placeholder, not used
#endif /* !__CUDA_ARCH__ */
__host__ virtual void setupEnvironment() = 0; //define this function in derived classes to assign a pointer to that function's B Field code to the location indicated by BFieldFcnPtr_d and gradBFcnPtr_d
__host__ virtual void deleteEnvironment() = 0;
__host__ __device__ BField()
public:
__host__ __device__ virtual ~BField() ;
__host__ __device__ BField(const BField&) = delete;
__host__ __device__ BField& operator=(const BField&) = delete;
__host__ __device__ virtual double getBFieldAtS(const double s, const double t) const = 0;
__host__ __device__ virtual double getGradBAtS (const double s, const double t) const = 0;
__host__ virtual std::string name() const return modelName_m;
__host__ virtual BField** getPtrGPU() const return this_d; //once returned, have to cast it to the appropriate type
;
#endif
DipoleB.h (Derived):
#ifndef DIPOLEB_BFIELD_H
#define DIPOLEB_BFIELD_H
#include "BFieldBField.h"
#include "physicalconstants.h"
constexpr double B0 3.12e-5 ; //won't change from sim to sim
class DipoleB : public BField
protected:
//Field simulation constants
double L_m 0.0 ;
double L_norm_m 0.0 ;
double s_max_m 0.0 ;
//specified variables
double ILATDegrees_m 0.0 ;
double ds_m 0.0 ;
double errorTolerance_m 0.0 ;
//protected functions
__host__ virtual void setupEnvironment() override;
__host__ virtual void deleteEnvironment() override;
__host__ __device__ double getSAtLambda(const double lambdaDegrees) const;
__host__ __device__ double getLambdaAtS(const double s) const;
public:
__host__ __device__ DipoleB(double ILATDegrees, double errorTolerance = 1e-4, double ds = RADIUS_EARTH / 1000.0);
__host__ __device__ ~DipoleB();
__host__ __device__ DipoleB(const DipoleB&) = delete;
__host__ __device__ DipoleB& operator=(const DipoleB&) = delete;
//for testing
double ILAT() const return ILATDegrees_m;
double ds() const return ds_m;
double L() const return L_m;
double s_max() const return s_max_m;
__host__ virtual void setds(double ds) ds_m = ds;
__host__ __device__ double getBFieldAtS(const double s, const double t) const override;
__host__ __device__ double getGradBAtS (const double s, const double t) const override;
__host__ double getErrTol() const return errorTolerance_m;
__host__ double getds() const return ds_m;
;
#endif
DipoleB.cu (Derived member function definition and some CUDA kernels):
#include "BFieldDipoleB.h"
#include "device_launch_parameters.h"
#include "ErrorHandlingcudaErrorCheck.h"
#include "ErrorHandlingcudaDeviceMacros.h"
//setup CUDA kernels
__global__ void setupEnvironmentGPU_DipoleB(BField** this_d, double ILATDeg, double errTol, double ds)
ZEROTH_THREAD_ONLY("setupEnvironmentGPU_DipoleB", (*this_d) = new DipoleB(ILATDeg, errTol, ds));
__global__ void deleteEnvironmentGPU_DipoleB(BField** dipoleb)
ZEROTH_THREAD_ONLY("deleteEnvironmentGPU_DipoleB", delete ((DipoleB*)(*dipoleb)));
__host__ __device__ DipoleB::DipoleB(double ILATDegrees, double errorTolerance, double ds) :
BField(), ILATDegrees_m ILATDegrees , ds_m ds , errorTolerance_m errorTolerance
L_m = RADIUS_EARTH / pow(cos(ILATDegrees * RADS_PER_DEG), 2);
L_norm_m = L_m / RADIUS_EARTH;
s_max_m = getSAtLambda(ILATDegrees_m);
#ifndef __CUDA_ARCH__ //host code
modelName_m = "DipoleB";
setupEnvironment();
#endif /* !__CUDA_ARCH__ */
__host__ __device__ DipoleB::~DipoleB()
#ifndef __CUDA_ARCH__ //host code
deleteEnvironment();
#endif /* !__CUDA_ARCH__ */
//B Field related kernels
__host__ __device__ double DipoleB::getSAtLambda(const double lambdaDegrees) const
//double x asinh(sqrt(3.0) * sinpi(lambdaDegrees / 180.0)) ;
double sinh_x sqrt(3.0) * sinpi(lambdaDegrees / 180.0) ;
double x log(sinh_x + sqrt(sinh_x * sinh_x + 1)) ; //trig identity for asinh - a bit faster - asinh(x) == ln(x + sqrt(x*x + 1))
return (0.5 * L_m / sqrt(3.0)) * (x + 0.25 * (exp(2.0*x)-exp(-2.0*x))); /* L */ //0.25 * (exp(2*x)-exp(-2*x)) == sinh(x) * cosh(x) and is faster
__host__ __device__ double DipoleB::getLambdaAtS(const double s) const
// consts: [ ILATDeg, L, L_norm, s_max, ds, errorTolerance ]
double lambda_tmp (-ILATDegrees_m / s_max_m) * s + ILATDegrees_m ; //-ILAT / s_max * s + ILAT
double s_tmp s_max_m - getSAtLambda(lambda_tmp) ;
double dlambda 1.0 ;
bool over 0 ;
while (abs((s_tmp - s) / s) > errorTolerance_m) //errorTolerance
while (1)
over = (s_tmp >= s);
if (over)
lambda_tmp += dlambda;
s_tmp = s_max_m - getSAtLambda(lambda_tmp);
if (s_tmp < s)
break;
else
lambda_tmp -= dlambda;
s_tmp = s_max_m - getSAtLambda(lambda_tmp);
if (s_tmp >= s)
break;
if (dlambda < errorTolerance_m / 100.0) //errorTolerance
break;
dlambda /= 5.0; //through trial and error, this reduces the number of calculations usually (compared with 2, 2.5, 3, 4, 10)
return lambda_tmp;
__host__ __device__ double DipoleB::getBFieldAtS(const double s, const double simtime) const
// consts: [ ILATDeg, L, L_norm, s_max, ds, errorTolerance ]
double lambda_deg getLambdaAtS(s) ;
double rnorm L_norm_m * cospi(lambda_deg / 180.0) * cospi(lambda_deg / 180.0) ;
return -B0 / (rnorm * rnorm * rnorm) * sqrt(1.0 + 3 * sinpi(lambda_deg / 180.0) * sinpi(lambda_deg / 180.0));
__host__ __device__ double DipoleB::getGradBAtS(const double s, const double simtime) const
return (getBFieldAtS(s + ds_m, simtime) - getBFieldAtS(s - ds_m, simtime)) / (2 * ds_m);
//DipoleB class member functions
void DipoleB::setupEnvironment()
// consts: [ ILATDeg, L, L_norm, s_max, ds, errorTolerance ]
CUDA_API_ERRCHK(cudaMalloc((void **)&this_d, sizeof(BField**)));
setupEnvironmentGPU_DipoleB <<< 1, 1 >>> (this_d, ILATDegrees_m, errorTolerance_m, ds_m);
CUDA_KERNEL_ERRCHK_WSYNC();
void DipoleB::deleteEnvironment()
deleteEnvironmentGPU_DipoleB <<< 1, 1 >>> (this_d);
CUDA_KERNEL_ERRCHK_WSYNC();
CUDA_API_ERRCHK(cudaFree(this_d));
Calling Functions:
__device__ double accel1dCUDA(const double vs_RK, const double t_RK, const double* args, BField** bfield, EField** efield) //made to pass into 1D Fourth Order Runge Kutta code
//args array: [s_0, mu, q, m, simtime]
double F_lor, F_mir, stmp;
stmp = args[0] + vs_RK * t_RK; //ps_0 + vs_RK * t_RK
//Mirror force
F_mir = -args[1] * (*bfield)->getGradBAtS(stmp, t_RK + args[4]); //-mu * gradB(pos, runge-kutta time + simtime)
//Lorentz force - simply qE - v x B is taken care of by mu - results in kg.m/s^2 - to convert to Re equivalent - divide by Re
F_lor = args[2] * (*efield)->getEFieldAtS(stmp, t_RK + args[4]); //q * EFieldatS
return (F_lor + F_mir) / args[3];
//returns an acceleration in the parallel direction to the B Field
__device__ double foRungeKuttaCUDA(const double y_0, const double h, const double* funcArg, BField** bfield, EField** efield)
// dy / dt = f(t, y), y(t_0) = y_0
// funcArgs are whatever you need to pass to the equation
// args array: [s_0, mu, q, m, simtime]
double k1, k2, k3, k4; double y y_0 ; double t_RK 0.0 ;
k1 = accel1dCUDA(y, t_RK, funcArg, bfield, efield); //k1 = f(t_n, y_n), returns units of dy / dt
t_RK = h / 2;
y = y_0 + k1 * t_RK;
k2 = accel1dCUDA(y, t_RK, funcArg, bfield, efield); //k2 = f(t_n + h/2, y_n + h/2 * k1)
y = y_0 + k2 * t_RK;
k3 = accel1dCUDA(y, t_RK, funcArg, bfield, efield); //k3 = f(t_n + h/2, y_n + h/2 * k2)
t_RK = h;
y = y_0 + k3 * t_RK;
k4 = accel1dCUDA(y, t_RK, funcArg, bfield, efield); //k4 = f(t_n + h, y_n + h k3)
return (k1 + 2 * k2 + 2 * k3 + k4) * h / 6; //returns delta y, not dy / dt, not total y
__global__ void computeKernel(double** currData_d, BField** bfield, EField** efield,
const double simtime, const double dt, const double mass, const double charge, const double simmin, const double simmax)
unsigned int thdInd blockIdx.x * blockDim.x + threadIdx.x ;
double* v_d currData_d[0] ; const double* mu_d currData_d[1] ; double* s_d currData_d[2] ; const double* t_incident_d currData_d[3] ; double* t_escape_d currData_d[4] ;
if (t_escape_d[thdInd] >= 0.0) //particle has escaped, t_escape is >= 0 iff it has both entered and is outside the sim boundaries
return;
else if (t_incident_d[thdInd] > simtime) //particle hasn't "entered the sim" yet
return;
else if (s_d[thdInd] < simmin * 0.999) //particle is out of sim to the bottom and t_escape not set yet
t_escape_d[thdInd] = simtime;
return;
else if (s_d[thdInd] > simmax * 1.001) //particle is out of sim to the top and t_escape not set yet
t_escape_d[thdInd] = simtime;
return;
//args array: [ps_0, mu, q, m, simtime]
const double args s_d[thdInd], mu_d[thdInd], charge, mass, simtime ;
v_d[thdInd] += foRungeKuttaCUDA(v_d[thdInd], dt, args, bfield, efield) / 2;
s_d[thdInd] += v_d[thdInd] * dt;
A few questions:
- Am I achieving my goals in the most efficient way possible?
- Are there any performance issues incurred by the fact that I'm creating one instance of a derived class on GPU and calling the interface function ~3.5 million * number of iterations times? That is, what are the implications of this many calls to a single member function?
This produces expected physical results (that is, calls to interface functions are producing the correct values because the particles behave appropriately), however when running through cuda-memcheck, I get a whole host of issues. I'm thinking this is because of how
BField
is set up and the fact that calling the (virtual) interface functions accesses something that would be outside the memory footprint of aBase
instance:[BField instance memory footprint][-------(x impl of virt fcn here)----DipoleB Instance footprint-------]
and cuda-memcheck doesn't think this should be valid. Does this sound feasible? Do I understand what is going on right?
- Any non-optimal performance issues incurred by device-side dynamic allocation? Is there even another way to do this?
c++ polymorphism cuda
edited May 2 at 0:36
Jamalâ¦
30.1k11114225
30.1k11114225
asked May 1 at 17:08
TomAdo
385
385
add a comment |Â
add a comment |Â
1 Answer
1
active
oldest
votes
up vote
1
down vote
accepted
nice!
The code is generally rather good. I donâÂÂt have a lot to say about the language usage in the individual lines.
The idea of using an abstract base to do different implementations selected at runtime is classic, easy to follow, and costs one indirection at calling time. Any run-time system to choose between different impls would have the same cost at least.
I donâÂÂt understand the part about making a matching copy on the device.
Why do you have modelName_m
on dummied on the controller? CanâÂÂt you just leave it out? Since its only use is to set from a lexical string literal, why not keep it as a const char*
? The base class should take this as a constructor argument and the derived class supply it in the init line.
one thing stuck outâ¦
setupEnvironment
and deleteEnvironment
are called from the constructor and destructor respectively. But they are virtual functions.
The object "is" of the type being constructed here, even if it is really of a derived class: it runs the base class ctors and then updates the vtable for the derived class and then runs the derived ctor body.
So, donâÂÂt call virtual functions from the ctor/dtor. To do what this actually means, make it non-virtual.
other
Passing double
everywhere when values have different meanings is a good way to crash your spacecraft on Mars. Consider a zero-overhead unit labeling template, or at the very least typedefs for different meanings even though they are not checked by the compiler.
angle_in_degrees
should be the type name, not the variable name. But even better, just angle_t
and the choice of degrees or whatever is made by the caller and it converts (perhaps at compile time!) to the unit that the type really holds.
Thanks for the feedback!! A few responses. "I don't understand the part about making a matching copy on the device." This is done to allowgetBFieldAtS
to be called on both device (where most computing is done) and on host (where it's needed on occasion). Instead of passingL_m, L_norm_m, s_max_m, ILATDegrees_m, ds_m, errorTolerance_m
to the device function every call, I figured it was easier to create a copy of the class on device. This is even more necessary in other implementations (which have more complexity). Is there a better way?
â TomAdo
May 1 at 20:57
Good point onmodelName_m
...I suppose I just like smart containers (string). I could just make oneconst char*
. Think I'll do that.setupEnvironment
anddeleteEnvironment
- good point - I think that's an oversight on my part, as they don't need to be virtual. Great point.
â TomAdo
May 1 at 21:01
Regarding passingdouble
everywhere...so are you saying typenames for each unit (mass in kg, accel in m/s2, etc)? I like thetypedef
idea...something liketypedef double mass_kg_t
. Are you saying more here, like some sort of struct containing aconst char *
that labels the units? That's an interesting idea...my one reservation with it is: will it impact GPU performance...(hence why you said "zero-overhead"). Is what I said an example of zero-overhead? If not, could you provide an example? Thanks again for the great advice.
â TomAdo
May 1 at 21:05
Thetypedef
(now, you would use ausing
) makes the function signature more readable.foo (double, double)
which is which?foo (angle_t, length_t)
is clearer, but doesnâÂÂt help the compiler. But, obviously zero performance change since there is zero meaning change. â¯
â JDà Âugosz
May 1 at 21:19
No, a type-safe system would not contain a string. It would still contain the same bits as it does now. But givenangle_t x = 25_degrees;
orx = 0.5_radians;
both are correct, and converted at compile time to the internal format.x = 32;
would be a compile-time error. A classic zero-overhead template system is Boost.Units (boost.org/doc/libs/1_67_0/doc/html/boost_units.html). A narrower example that is more up-to-date is found in the standard library now forstd::chrono::duration
. â¯
â JDà Âugosz
May 1 at 21:27
 |Â
show 2 more comments
1 Answer
1
active
oldest
votes
1 Answer
1
active
oldest
votes
active
oldest
votes
active
oldest
votes
up vote
1
down vote
accepted
nice!
The code is generally rather good. I donâÂÂt have a lot to say about the language usage in the individual lines.
The idea of using an abstract base to do different implementations selected at runtime is classic, easy to follow, and costs one indirection at calling time. Any run-time system to choose between different impls would have the same cost at least.
I donâÂÂt understand the part about making a matching copy on the device.
Why do you have modelName_m
on dummied on the controller? CanâÂÂt you just leave it out? Since its only use is to set from a lexical string literal, why not keep it as a const char*
? The base class should take this as a constructor argument and the derived class supply it in the init line.
one thing stuck outâ¦
setupEnvironment
and deleteEnvironment
are called from the constructor and destructor respectively. But they are virtual functions.
The object "is" of the type being constructed here, even if it is really of a derived class: it runs the base class ctors and then updates the vtable for the derived class and then runs the derived ctor body.
So, donâÂÂt call virtual functions from the ctor/dtor. To do what this actually means, make it non-virtual.
other
Passing double
everywhere when values have different meanings is a good way to crash your spacecraft on Mars. Consider a zero-overhead unit labeling template, or at the very least typedefs for different meanings even though they are not checked by the compiler.
angle_in_degrees
should be the type name, not the variable name. But even better, just angle_t
and the choice of degrees or whatever is made by the caller and it converts (perhaps at compile time!) to the unit that the type really holds.
Thanks for the feedback!! A few responses. "I don't understand the part about making a matching copy on the device." This is done to allowgetBFieldAtS
to be called on both device (where most computing is done) and on host (where it's needed on occasion). Instead of passingL_m, L_norm_m, s_max_m, ILATDegrees_m, ds_m, errorTolerance_m
to the device function every call, I figured it was easier to create a copy of the class on device. This is even more necessary in other implementations (which have more complexity). Is there a better way?
â TomAdo
May 1 at 20:57
Good point onmodelName_m
...I suppose I just like smart containers (string). I could just make oneconst char*
. Think I'll do that.setupEnvironment
anddeleteEnvironment
- good point - I think that's an oversight on my part, as they don't need to be virtual. Great point.
â TomAdo
May 1 at 21:01
Regarding passingdouble
everywhere...so are you saying typenames for each unit (mass in kg, accel in m/s2, etc)? I like thetypedef
idea...something liketypedef double mass_kg_t
. Are you saying more here, like some sort of struct containing aconst char *
that labels the units? That's an interesting idea...my one reservation with it is: will it impact GPU performance...(hence why you said "zero-overhead"). Is what I said an example of zero-overhead? If not, could you provide an example? Thanks again for the great advice.
â TomAdo
May 1 at 21:05
Thetypedef
(now, you would use ausing
) makes the function signature more readable.foo (double, double)
which is which?foo (angle_t, length_t)
is clearer, but doesnâÂÂt help the compiler. But, obviously zero performance change since there is zero meaning change. â¯
â JDà Âugosz
May 1 at 21:19
No, a type-safe system would not contain a string. It would still contain the same bits as it does now. But givenangle_t x = 25_degrees;
orx = 0.5_radians;
both are correct, and converted at compile time to the internal format.x = 32;
would be a compile-time error. A classic zero-overhead template system is Boost.Units (boost.org/doc/libs/1_67_0/doc/html/boost_units.html). A narrower example that is more up-to-date is found in the standard library now forstd::chrono::duration
. â¯
â JDà Âugosz
May 1 at 21:27
 |Â
show 2 more comments
up vote
1
down vote
accepted
nice!
The code is generally rather good. I donâÂÂt have a lot to say about the language usage in the individual lines.
The idea of using an abstract base to do different implementations selected at runtime is classic, easy to follow, and costs one indirection at calling time. Any run-time system to choose between different impls would have the same cost at least.
I donâÂÂt understand the part about making a matching copy on the device.
Why do you have modelName_m
on dummied on the controller? CanâÂÂt you just leave it out? Since its only use is to set from a lexical string literal, why not keep it as a const char*
? The base class should take this as a constructor argument and the derived class supply it in the init line.
one thing stuck outâ¦
setupEnvironment
and deleteEnvironment
are called from the constructor and destructor respectively. But they are virtual functions.
The object "is" of the type being constructed here, even if it is really of a derived class: it runs the base class ctors and then updates the vtable for the derived class and then runs the derived ctor body.
So, donâÂÂt call virtual functions from the ctor/dtor. To do what this actually means, make it non-virtual.
other
Passing double
everywhere when values have different meanings is a good way to crash your spacecraft on Mars. Consider a zero-overhead unit labeling template, or at the very least typedefs for different meanings even though they are not checked by the compiler.
angle_in_degrees
should be the type name, not the variable name. But even better, just angle_t
and the choice of degrees or whatever is made by the caller and it converts (perhaps at compile time!) to the unit that the type really holds.
Thanks for the feedback!! A few responses. "I don't understand the part about making a matching copy on the device." This is done to allowgetBFieldAtS
to be called on both device (where most computing is done) and on host (where it's needed on occasion). Instead of passingL_m, L_norm_m, s_max_m, ILATDegrees_m, ds_m, errorTolerance_m
to the device function every call, I figured it was easier to create a copy of the class on device. This is even more necessary in other implementations (which have more complexity). Is there a better way?
â TomAdo
May 1 at 20:57
Good point onmodelName_m
...I suppose I just like smart containers (string). I could just make oneconst char*
. Think I'll do that.setupEnvironment
anddeleteEnvironment
- good point - I think that's an oversight on my part, as they don't need to be virtual. Great point.
â TomAdo
May 1 at 21:01
Regarding passingdouble
everywhere...so are you saying typenames for each unit (mass in kg, accel in m/s2, etc)? I like thetypedef
idea...something liketypedef double mass_kg_t
. Are you saying more here, like some sort of struct containing aconst char *
that labels the units? That's an interesting idea...my one reservation with it is: will it impact GPU performance...(hence why you said "zero-overhead"). Is what I said an example of zero-overhead? If not, could you provide an example? Thanks again for the great advice.
â TomAdo
May 1 at 21:05
Thetypedef
(now, you would use ausing
) makes the function signature more readable.foo (double, double)
which is which?foo (angle_t, length_t)
is clearer, but doesnâÂÂt help the compiler. But, obviously zero performance change since there is zero meaning change. â¯
â JDà Âugosz
May 1 at 21:19
No, a type-safe system would not contain a string. It would still contain the same bits as it does now. But givenangle_t x = 25_degrees;
orx = 0.5_radians;
both are correct, and converted at compile time to the internal format.x = 32;
would be a compile-time error. A classic zero-overhead template system is Boost.Units (boost.org/doc/libs/1_67_0/doc/html/boost_units.html). A narrower example that is more up-to-date is found in the standard library now forstd::chrono::duration
. â¯
â JDà Âugosz
May 1 at 21:27
 |Â
show 2 more comments
up vote
1
down vote
accepted
up vote
1
down vote
accepted
nice!
The code is generally rather good. I donâÂÂt have a lot to say about the language usage in the individual lines.
The idea of using an abstract base to do different implementations selected at runtime is classic, easy to follow, and costs one indirection at calling time. Any run-time system to choose between different impls would have the same cost at least.
I donâÂÂt understand the part about making a matching copy on the device.
Why do you have modelName_m
on dummied on the controller? CanâÂÂt you just leave it out? Since its only use is to set from a lexical string literal, why not keep it as a const char*
? The base class should take this as a constructor argument and the derived class supply it in the init line.
one thing stuck outâ¦
setupEnvironment
and deleteEnvironment
are called from the constructor and destructor respectively. But they are virtual functions.
The object "is" of the type being constructed here, even if it is really of a derived class: it runs the base class ctors and then updates the vtable for the derived class and then runs the derived ctor body.
So, donâÂÂt call virtual functions from the ctor/dtor. To do what this actually means, make it non-virtual.
other
Passing double
everywhere when values have different meanings is a good way to crash your spacecraft on Mars. Consider a zero-overhead unit labeling template, or at the very least typedefs for different meanings even though they are not checked by the compiler.
angle_in_degrees
should be the type name, not the variable name. But even better, just angle_t
and the choice of degrees or whatever is made by the caller and it converts (perhaps at compile time!) to the unit that the type really holds.
nice!
The code is generally rather good. I donâÂÂt have a lot to say about the language usage in the individual lines.
The idea of using an abstract base to do different implementations selected at runtime is classic, easy to follow, and costs one indirection at calling time. Any run-time system to choose between different impls would have the same cost at least.
I donâÂÂt understand the part about making a matching copy on the device.
Why do you have modelName_m
on dummied on the controller? CanâÂÂt you just leave it out? Since its only use is to set from a lexical string literal, why not keep it as a const char*
? The base class should take this as a constructor argument and the derived class supply it in the init line.
one thing stuck outâ¦
setupEnvironment
and deleteEnvironment
are called from the constructor and destructor respectively. But they are virtual functions.
The object "is" of the type being constructed here, even if it is really of a derived class: it runs the base class ctors and then updates the vtable for the derived class and then runs the derived ctor body.
So, donâÂÂt call virtual functions from the ctor/dtor. To do what this actually means, make it non-virtual.
other
Passing double
everywhere when values have different meanings is a good way to crash your spacecraft on Mars. Consider a zero-overhead unit labeling template, or at the very least typedefs for different meanings even though they are not checked by the compiler.
angle_in_degrees
should be the type name, not the variable name. But even better, just angle_t
and the choice of degrees or whatever is made by the caller and it converts (perhaps at compile time!) to the unit that the type really holds.
edited May 1 at 20:40
answered May 1 at 20:33
JDÃ Âugosz
5,047731
5,047731
Thanks for the feedback!! A few responses. "I don't understand the part about making a matching copy on the device." This is done to allowgetBFieldAtS
to be called on both device (where most computing is done) and on host (where it's needed on occasion). Instead of passingL_m, L_norm_m, s_max_m, ILATDegrees_m, ds_m, errorTolerance_m
to the device function every call, I figured it was easier to create a copy of the class on device. This is even more necessary in other implementations (which have more complexity). Is there a better way?
â TomAdo
May 1 at 20:57
Good point onmodelName_m
...I suppose I just like smart containers (string). I could just make oneconst char*
. Think I'll do that.setupEnvironment
anddeleteEnvironment
- good point - I think that's an oversight on my part, as they don't need to be virtual. Great point.
â TomAdo
May 1 at 21:01
Regarding passingdouble
everywhere...so are you saying typenames for each unit (mass in kg, accel in m/s2, etc)? I like thetypedef
idea...something liketypedef double mass_kg_t
. Are you saying more here, like some sort of struct containing aconst char *
that labels the units? That's an interesting idea...my one reservation with it is: will it impact GPU performance...(hence why you said "zero-overhead"). Is what I said an example of zero-overhead? If not, could you provide an example? Thanks again for the great advice.
â TomAdo
May 1 at 21:05
Thetypedef
(now, you would use ausing
) makes the function signature more readable.foo (double, double)
which is which?foo (angle_t, length_t)
is clearer, but doesnâÂÂt help the compiler. But, obviously zero performance change since there is zero meaning change. â¯
â JDà Âugosz
May 1 at 21:19
No, a type-safe system would not contain a string. It would still contain the same bits as it does now. But givenangle_t x = 25_degrees;
orx = 0.5_radians;
both are correct, and converted at compile time to the internal format.x = 32;
would be a compile-time error. A classic zero-overhead template system is Boost.Units (boost.org/doc/libs/1_67_0/doc/html/boost_units.html). A narrower example that is more up-to-date is found in the standard library now forstd::chrono::duration
. â¯
â JDà Âugosz
May 1 at 21:27
 |Â
show 2 more comments
Thanks for the feedback!! A few responses. "I don't understand the part about making a matching copy on the device." This is done to allowgetBFieldAtS
to be called on both device (where most computing is done) and on host (where it's needed on occasion). Instead of passingL_m, L_norm_m, s_max_m, ILATDegrees_m, ds_m, errorTolerance_m
to the device function every call, I figured it was easier to create a copy of the class on device. This is even more necessary in other implementations (which have more complexity). Is there a better way?
â TomAdo
May 1 at 20:57
Good point onmodelName_m
...I suppose I just like smart containers (string). I could just make oneconst char*
. Think I'll do that.setupEnvironment
anddeleteEnvironment
- good point - I think that's an oversight on my part, as they don't need to be virtual. Great point.
â TomAdo
May 1 at 21:01
Regarding passingdouble
everywhere...so are you saying typenames for each unit (mass in kg, accel in m/s2, etc)? I like thetypedef
idea...something liketypedef double mass_kg_t
. Are you saying more here, like some sort of struct containing aconst char *
that labels the units? That's an interesting idea...my one reservation with it is: will it impact GPU performance...(hence why you said "zero-overhead"). Is what I said an example of zero-overhead? If not, could you provide an example? Thanks again for the great advice.
â TomAdo
May 1 at 21:05
Thetypedef
(now, you would use ausing
) makes the function signature more readable.foo (double, double)
which is which?foo (angle_t, length_t)
is clearer, but doesnâÂÂt help the compiler. But, obviously zero performance change since there is zero meaning change. â¯
â JDà Âugosz
May 1 at 21:19
No, a type-safe system would not contain a string. It would still contain the same bits as it does now. But givenangle_t x = 25_degrees;
orx = 0.5_radians;
both are correct, and converted at compile time to the internal format.x = 32;
would be a compile-time error. A classic zero-overhead template system is Boost.Units (boost.org/doc/libs/1_67_0/doc/html/boost_units.html). A narrower example that is more up-to-date is found in the standard library now forstd::chrono::duration
. â¯
â JDà Âugosz
May 1 at 21:27
Thanks for the feedback!! A few responses. "I don't understand the part about making a matching copy on the device." This is done to allow
getBFieldAtS
to be called on both device (where most computing is done) and on host (where it's needed on occasion). Instead of passing L_m, L_norm_m, s_max_m, ILATDegrees_m, ds_m, errorTolerance_m
to the device function every call, I figured it was easier to create a copy of the class on device. This is even more necessary in other implementations (which have more complexity). Is there a better way?â TomAdo
May 1 at 20:57
Thanks for the feedback!! A few responses. "I don't understand the part about making a matching copy on the device." This is done to allow
getBFieldAtS
to be called on both device (where most computing is done) and on host (where it's needed on occasion). Instead of passing L_m, L_norm_m, s_max_m, ILATDegrees_m, ds_m, errorTolerance_m
to the device function every call, I figured it was easier to create a copy of the class on device. This is even more necessary in other implementations (which have more complexity). Is there a better way?â TomAdo
May 1 at 20:57
Good point on
modelName_m
...I suppose I just like smart containers (string). I could just make one const char*
. Think I'll do that. setupEnvironment
and deleteEnvironment
- good point - I think that's an oversight on my part, as they don't need to be virtual. Great point.â TomAdo
May 1 at 21:01
Good point on
modelName_m
...I suppose I just like smart containers (string). I could just make one const char*
. Think I'll do that. setupEnvironment
and deleteEnvironment
- good point - I think that's an oversight on my part, as they don't need to be virtual. Great point.â TomAdo
May 1 at 21:01
Regarding passing
double
everywhere...so are you saying typenames for each unit (mass in kg, accel in m/s2, etc)? I like the typedef
idea...something like typedef double mass_kg_t
. Are you saying more here, like some sort of struct containing a const char *
that labels the units? That's an interesting idea...my one reservation with it is: will it impact GPU performance...(hence why you said "zero-overhead"). Is what I said an example of zero-overhead? If not, could you provide an example? Thanks again for the great advice.â TomAdo
May 1 at 21:05
Regarding passing
double
everywhere...so are you saying typenames for each unit (mass in kg, accel in m/s2, etc)? I like the typedef
idea...something like typedef double mass_kg_t
. Are you saying more here, like some sort of struct containing a const char *
that labels the units? That's an interesting idea...my one reservation with it is: will it impact GPU performance...(hence why you said "zero-overhead"). Is what I said an example of zero-overhead? If not, could you provide an example? Thanks again for the great advice.â TomAdo
May 1 at 21:05
The
typedef
(now, you would use a using
) makes the function signature more readable. foo (double, double)
which is which? foo (angle_t, length_t)
is clearer, but doesnâÂÂt help the compiler. But, obviously zero performance change since there is zero meaning change. â¯â JDà Âugosz
May 1 at 21:19
The
typedef
(now, you would use a using
) makes the function signature more readable. foo (double, double)
which is which? foo (angle_t, length_t)
is clearer, but doesnâÂÂt help the compiler. But, obviously zero performance change since there is zero meaning change. â¯â JDà Âugosz
May 1 at 21:19
No, a type-safe system would not contain a string. It would still contain the same bits as it does now. But given
angle_t x = 25_degrees;
or x = 0.5_radians;
both are correct, and converted at compile time to the internal format. x = 32;
would be a compile-time error. A classic zero-overhead template system is Boost.Units (boost.org/doc/libs/1_67_0/doc/html/boost_units.html). A narrower example that is more up-to-date is found in the standard library now for std::chrono::duration
. â¯â JDà Âugosz
May 1 at 21:27
No, a type-safe system would not contain a string. It would still contain the same bits as it does now. But given
angle_t x = 25_degrees;
or x = 0.5_radians;
both are correct, and converted at compile time to the internal format. x = 32;
would be a compile-time error. A classic zero-overhead template system is Boost.Units (boost.org/doc/libs/1_67_0/doc/html/boost_units.html). A narrower example that is more up-to-date is found in the standard library now for std::chrono::duration
. â¯â JDà Âugosz
May 1 at 21:27
 |Â
show 2 more comments
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f193367%2fcuda-c-host-device-polymorphic-class-implementation%23new-answer', 'question_page');
);
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password