CUDA/C++ Host/Device Polymorphic Class Implementation

I have an abstract class which acts as an interface for a variety of physical models that provide electric/magnetic fields resulting from a number of phenomena. I'm wondering whether the way I've done it is good, or if there's a better way to achieve my goals.



My goals are:



  • A common interface for returning the field strength (and a few related things through different functions) according to a physical field model chosen by the user at runtime

  • The ability to call functions of this interface from host AND device with the same underlying implementation

  • As performant as possible

The way I've achieved this is to specify a base BField class with __host__ __device__ pure virtual interface functions, and override these in a number of derived classes (here, DipoleB). On the host, when an instance of the derived class is created, a mirror image of the instance is also created on the device, and a pointer to that on-device instance is stored on the host. The on-device instance is destroyed when the host instance is destroyed. The interface functions (here, getBFieldAtS(double, double) and getGradBAtS(double, double)) are called on the device by a __global__ kernel which is run over ~3.5 million particles.
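
In stripped-down form, the pattern is roughly the following (toy classes only, not my actual ones, and with error checking omitted):

//Toy sketch of the pattern described above (NOT the real classes): a derived
//object is new'd on the device by a single-thread kernel so that its vtable is
//valid in device code, and kernels then call the virtual through a Base** handle.
#include <cstdio>

class Base
{
public:
    __host__ __device__ virtual ~Base() {}
    __host__ __device__ virtual double value(double s) const = 0;
};

class Scaled : public Base
{
    double factor_m;
public:
    __host__ __device__ Scaled(double factor) : factor_m{ factor } {}
    __host__ __device__ double value(double s) const override { return factor_m * s; }
};

__global__ void makeOnDevice(Base** out, double factor)
{
    if (blockIdx.x == 0 && threadIdx.x == 0)
        *out = new Scaled(factor);
}

__global__ void freeOnDevice(Base** out)
{
    if (blockIdx.x == 0 && threadIdx.x == 0)
        delete *out;
}

__global__ void useOnDevice(Base** obj, double s)
{
    printf("device value(%f) = %f\n", s, (*obj)->value(s)); //virtual call on device
}

int main()
{
    Scaled hostInstance{ 2.5 };                                  //same implementation, host side
    printf("host   value(4) = %f\n", hostInstance.value(4.0));

    Base** obj_d{ nullptr };
    cudaMalloc(&obj_d, sizeof(Base*));                           //holds the device-side Base*
    makeOnDevice <<< 1, 1 >>> (obj_d, 2.5);
    useOnDevice  <<< 1, 1 >>> (obj_d, 4.0);
    freeOnDevice <<< 1, 1 >>> (obj_d);
    cudaDeviceSynchronize();                                     //default stream: kernels run in order
    cudaFree(obj_d);
    return 0;
}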



BField.h (Base class):



#ifndef BFIELD_H
#define BFIELD_H

#include <string>

//CUDA includes
#include "host_defines.h"

class BField
{
protected:
    BField** this_d{ nullptr };

#ifndef __CUDA_ARCH__ //host code
    std::string modelName_m;
#else //device code
    const char* modelName_m; //placeholder, not used
#endif /* !__CUDA_ARCH__ */

    __host__ virtual void setupEnvironment() = 0; //define this function in derived classes to assign a pointer to that function's B Field code to the location indicated by BFieldFcnPtr_d and gradBFcnPtr_d
    __host__ virtual void deleteEnvironment() = 0;

    __host__ __device__ BField() {}

public:
    __host__ __device__ virtual ~BField() {}

    __host__ __device__ BField(const BField&) = delete;
    __host__ __device__ BField& operator=(const BField&) = delete;

    __host__ __device__ virtual double getBFieldAtS(const double s, const double t) const = 0;
    __host__ __device__ virtual double getGradBAtS (const double s, const double t) const = 0;

    __host__ virtual std::string name() const { return modelName_m; }
    __host__ virtual BField** getPtrGPU() const { return this_d; } //once returned, have to cast it to the appropriate type
};

#endif


DipoleB.h (Derived):



#ifndef DIPOLEB_BFIELD_H
#define DIPOLEB_BFIELD_H

#include "BFieldBField.h"
#include "physicalconstants.h"

constexpr double B0{ 3.12e-5 }; //won't change from sim to sim

class DipoleB : public BField
{
protected:
    //Field simulation constants
    double L_m{ 0.0 };
    double L_norm_m{ 0.0 };
    double s_max_m{ 0.0 };

    //specified variables
    double ILATDegrees_m{ 0.0 };
    double ds_m{ 0.0 };
    double errorTolerance_m{ 0.0 };

    //protected functions
    __host__ virtual void setupEnvironment() override;
    __host__ virtual void deleteEnvironment() override;
    __host__ __device__ double getSAtLambda(const double lambdaDegrees) const;
    __host__ __device__ double getLambdaAtS(const double s) const;

public:
    __host__ __device__ DipoleB(double ILATDegrees, double errorTolerance = 1e-4, double ds = RADIUS_EARTH / 1000.0);
    __host__ __device__ ~DipoleB();
    __host__ __device__ DipoleB(const DipoleB&) = delete;
    __host__ __device__ DipoleB& operator=(const DipoleB&) = delete;

    //for testing
    double ILAT()  const { return ILATDegrees_m; }
    double ds()    const { return ds_m; }
    double L()     const { return L_m; }
    double s_max() const { return s_max_m; }

    __host__ virtual void setds(double ds) { ds_m = ds; }

    __host__ __device__ double getBFieldAtS(const double s, const double t) const override;
    __host__ __device__ double getGradBAtS (const double s, const double t) const override;

    __host__ double getErrTol() const { return errorTolerance_m; }
    __host__ double getds()     const { return ds_m; }
};

#endif


DipoleB.cu (Derived member function definition and some CUDA kernels):



#include "BFieldDipoleB.h"

#include "device_launch_parameters.h"
#include "ErrorHandlingcudaErrorCheck.h"
#include "ErrorHandlingcudaDeviceMacros.h"

//setup CUDA kernels
__global__ void setupEnvironmentGPU_DipoleB(BField** this_d, double ILATDeg, double errTol, double ds)
{
    ZEROTH_THREAD_ONLY("setupEnvironmentGPU_DipoleB", (*this_d) = new DipoleB(ILATDeg, errTol, ds));
}

__global__ void deleteEnvironmentGPU_DipoleB(BField** dipoleb)
{
    ZEROTH_THREAD_ONLY("deleteEnvironmentGPU_DipoleB", delete ((DipoleB*)(*dipoleb)));
}

__host__ __device__ DipoleB::DipoleB(double ILATDegrees, double errorTolerance, double ds) :
    BField(), ILATDegrees_m{ ILATDegrees }, ds_m{ ds }, errorTolerance_m{ errorTolerance }
{
    L_m = RADIUS_EARTH / pow(cos(ILATDegrees * RADS_PER_DEG), 2);
    L_norm_m = L_m / RADIUS_EARTH;
    s_max_m = getSAtLambda(ILATDegrees_m);

#ifndef __CUDA_ARCH__ //host code
    modelName_m = "DipoleB";
    setupEnvironment();
#endif /* !__CUDA_ARCH__ */
}

__host__ __device__ DipoleB::~DipoleB()
{
#ifndef __CUDA_ARCH__ //host code
    deleteEnvironment();
#endif /* !__CUDA_ARCH__ */
}

//B Field related kernels
__host__ __device__ double DipoleB::getSAtLambda(const double lambdaDegrees) const
{
    //double x{ asinh(sqrt(3.0) * sinpi(lambdaDegrees / 180.0)) };
    double sinh_x{ sqrt(3.0) * sinpi(lambdaDegrees / 180.0) };
    double x{ log(sinh_x + sqrt(sinh_x * sinh_x + 1)) }; //trig identity for asinh - a bit faster - asinh(x) == ln(x + sqrt(x*x + 1))

    return (0.5 * L_m / sqrt(3.0)) * (x + 0.25 * (exp(2.0*x)-exp(-2.0*x))); /* L */ //0.25 * (exp(2*x)-exp(-2*x)) == sinh(x) * cosh(x) and is faster
}

__host__ __device__ double DipoleB::getLambdaAtS(const double s) const
{// consts: [ ILATDeg, L, L_norm, s_max, ds, errorTolerance ]
    double lambda_tmp{ (-ILATDegrees_m / s_max_m) * s + ILATDegrees_m }; //-ILAT / s_max * s + ILAT
    double s_tmp{ s_max_m - getSAtLambda(lambda_tmp) };
    double dlambda{ 1.0 };
    bool over{ 0 };

    while (abs((s_tmp - s) / s) > errorTolerance_m) //errorTolerance
    {
        while (1)
        {
            over = (s_tmp >= s);
            if (over)
            {
                lambda_tmp += dlambda;
                s_tmp = s_max_m - getSAtLambda(lambda_tmp);
                if (s_tmp < s)
                    break;
            }
            else
            {
                lambda_tmp -= dlambda;
                s_tmp = s_max_m - getSAtLambda(lambda_tmp);
                if (s_tmp >= s)
                    break;
            }
        }

        if (dlambda < errorTolerance_m / 100.0) //errorTolerance
            break;
        dlambda /= 5.0; //through trial and error, this reduces the number of calculations usually (compared with 2, 2.5, 3, 4, 10)
    }

    return lambda_tmp;
}

__host__ __device__ double DipoleB::getBFieldAtS(const double s, const double simtime) const
{// consts: [ ILATDeg, L, L_norm, s_max, ds, errorTolerance ]
    double lambda_deg{ getLambdaAtS(s) };
    double rnorm{ L_norm_m * cospi(lambda_deg / 180.0) * cospi(lambda_deg / 180.0) };

    return -B0 / (rnorm * rnorm * rnorm) * sqrt(1.0 + 3 * sinpi(lambda_deg / 180.0) * sinpi(lambda_deg / 180.0));
}

__host__ __device__ double DipoleB::getGradBAtS(const double s, const double simtime) const
{
    return (getBFieldAtS(s + ds_m, simtime) - getBFieldAtS(s - ds_m, simtime)) / (2 * ds_m);
}

//DipoleB class member functions
void DipoleB::setupEnvironment()
{// consts: [ ILATDeg, L, L_norm, s_max, ds, errorTolerance ]
    CUDA_API_ERRCHK(cudaMalloc((void **)&this_d, sizeof(BField**)));
    setupEnvironmentGPU_DipoleB <<< 1, 1 >>> (this_d, ILATDegrees_m, errorTolerance_m, ds_m);
    CUDA_KERNEL_ERRCHK_WSYNC();
}

void DipoleB::deleteEnvironment()
{
    deleteEnvironmentGPU_DipoleB <<< 1, 1 >>> (this_d);
    CUDA_KERNEL_ERRCHK_WSYNC();

    CUDA_API_ERRCHK(cudaFree(this_d));
}



Calling Functions:



__device__ double accel1dCUDA(const double vs_RK, const double t_RK, const double* args, BField** bfield, EField** efield) //made to pass into 1D Fourth Order Runge Kutta code
{//args array: [s_0, mu, q, m, simtime]
    double F_lor, F_mir, stmp;
    stmp = args[0] + vs_RK * t_RK; //ps_0 + vs_RK * t_RK

    //Mirror force
    F_mir = -args[1] * (*bfield)->getGradBAtS(stmp, t_RK + args[4]); //-mu * gradB(pos, runge-kutta time + simtime)

    //Lorentz force - simply qE - v x B is taken care of by mu - results in kg.m/s^2 - to convert to Re equivalent - divide by Re
    F_lor = args[2] * (*efield)->getEFieldAtS(stmp, t_RK + args[4]); //q * EFieldAtS

    return (F_lor + F_mir) / args[3];
    //returns an acceleration in the direction parallel to the B field
}

__device__ double foRungeKuttaCUDA(const double y_0, const double h, const double* funcArg, BField** bfield, EField** efield)
{
    // dy / dt = f(t, y), y(t_0) = y_0
    // funcArgs are whatever you need to pass to the equation
    // args array: [s_0, mu, q, m, simtime]
    double k1, k2, k3, k4; double y{ y_0 }; double t_RK{ 0.0 };

    k1 = accel1dCUDA(y, t_RK, funcArg, bfield, efield); //k1 = f(t_n, y_n), returns units of dy / dt

    t_RK = h / 2;
    y = y_0 + k1 * t_RK;
    k2 = accel1dCUDA(y, t_RK, funcArg, bfield, efield); //k2 = f(t_n + h/2, y_n + h/2 * k1)

    y = y_0 + k2 * t_RK;
    k3 = accel1dCUDA(y, t_RK, funcArg, bfield, efield); //k3 = f(t_n + h/2, y_n + h/2 * k2)

    t_RK = h;
    y = y_0 + k3 * t_RK;
    k4 = accel1dCUDA(y, t_RK, funcArg, bfield, efield); //k4 = f(t_n + h, y_n + h * k3)

    return (k1 + 2 * k2 + 2 * k3 + k4) * h / 6; //returns delta y, not dy / dt, not total y
}

__global__ void computeKernel(double** currData_d, BField** bfield, EField** efield,
    const double simtime, const double dt, const double mass, const double charge, const double simmin, const double simmax)
{
    unsigned int thdInd{ blockIdx.x * blockDim.x + threadIdx.x };

    double* v_d{ currData_d[0] }; const double* mu_d{ currData_d[1] }; double* s_d{ currData_d[2] }; const double* t_incident_d{ currData_d[3] }; double* t_escape_d{ currData_d[4] };

    if (t_escape_d[thdInd] >= 0.0) //particle has escaped, t_escape is >= 0 iff it has both entered and is outside the sim boundaries
        return;
    else if (t_incident_d[thdInd] > simtime) //particle hasn't "entered the sim" yet
        return;
    else if (s_d[thdInd] < simmin * 0.999) //particle is out of sim to the bottom and t_escape not set yet
    {
        t_escape_d[thdInd] = simtime;
        return;
    }
    else if (s_d[thdInd] > simmax * 1.001) //particle is out of sim to the top and t_escape not set yet
    {
        t_escape_d[thdInd] = simtime;
        return;
    }

    //args array: [ps_0, mu, q, m, simtime]
    const double args[]{ s_d[thdInd], mu_d[thdInd], charge, mass, simtime };

    v_d[thdInd] += foRungeKuttaCUDA(v_d[thdInd], dt, args, bfield, efield) / 2;
    s_d[thdInd] += v_d[thdInd] * dt;
}



A few questions:



  • Am I achieving my goals in the most efficient way possible?

  • Are there any performance issues incurred by the fact that I'm creating one instance of a derived class on the GPU and calling its interface functions roughly 3.5 million times per iteration? That is, what are the implications of this many calls to a single member function?


  • This produces the expected physical results (that is, the calls to the interface functions produce the correct values, because the particles behave appropriately); however, when running through cuda-memcheck, I get a whole host of issues. I'm thinking this is because of how BField is set up and the fact that calling the (virtual) interface functions accesses something outside the memory footprint of a base BField instance:



    [BField instance memory footprint][-------(x impl of virt fcn here)----DipoleB Instance footprint-------]


    and cuda-memcheck doesn't think this should be valid. Does this sound feasible? Do I understand what is going on right?



  • Are there any performance issues incurred by device-side dynamic allocation? Is there another way to do this?






asked May 1 at 17:08 by TomAdo
edited May 2 at 0:36 by Jamal♦




















          1 Answer

















          accepted answer (score 1)










          nice!



          The code is generally rather good. I don’t have a lot to say about the language usage in the individual lines.



          The idea of using an abstract base to do different implementations selected at runtime is classic, easy to follow, and costs one indirection at calling time. Any run-time system to choose between different impls would have the same cost at least.



          I don’t understand the part about making a matching copy on the device.



          Why do you have modelName_m dummied out on the device? Can’t you just leave it out? Since its only use is to be set from a lexical string literal, why not keep it as a const char*? The base class should take this as a constructor argument, and the derived class should supply it in the initializer list.
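
          Something like this, for instance (a sketch only; the CUDA decorations and the rest of the members are omitted):

          //Sketch: the base stores a const char* and each derived class supplies its string literal.
          class BField
          {
          protected:
              const char* modelName_m;
              BField(const char* name) : modelName_m{ name } {}
          public:
              virtual ~BField() {}
              const char* name() const { return modelName_m; }
          };

          class DipoleB : public BField
          {
          public:
              DipoleB(/*...*/) : BField("DipoleB") { /*...*/ }
          };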



          one thing stuck out…



          setupEnvironment and deleteEnvironment are called from the constructor and destructor respectively. But they are virtual functions.



          The object "is" of the type being constructed here, even if it is really of a derived class: it runs the base class ctors and then updates the vtable for the derived class and then runs the derived ctor body.



          So, don’t call virtual functions from the ctor/dtor. To express what is actually going on, make the function non-virtual.
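
          In other words (a plain C++ sketch of the same point, with hypothetical setup/teardown helpers):

          //Plain C++ sketch: during Base's ctor/dtor the dynamic type is Base, so a virtual
          //call there could never reach a derived override anyway. Non-virtual helpers
          //called from the *derived* ctor/dtor say what is actually meant.
          #include <iostream>

          class Base
          {
          public:
              virtual ~Base() {}
          };

          class Derived : public Base
          {
              void setup()    { std::cout << "Derived::setup\n"; }    //non-virtual
              void teardown() { std::cout << "Derived::teardown\n"; } //non-virtual
          public:
              Derived()  { setup(); }     //safe: the object is fully a Derived here
              ~Derived() { teardown(); }
          };

          int main() { Derived d; }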



          other



          Passing double everywhere when values have different meanings is a good way to crash your spacecraft on Mars. Consider a zero-overhead unit labeling template, or at the very least typedefs for different meanings even though they are not checked by the compiler.



          angle_in_degrees should be the type name, not the variable name. But even better: just angle_t, where the choice of degrees (or whatever) is made by the caller, and it converts (perhaps at compile time!) to the unit that the type really holds.
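
          For example, a minimal angle type with user-defined literals (nowhere near Boost.Units, but the same zero-overhead idea; the suffix names here are just an illustration):

          //Minimal sketch: the wrapper holds the same double as before; the unit conversion
          //happens in constexpr literal operators, so there is no runtime cost.
          struct angle_t
          {
              double rad;   //the unit the type really holds
          };

          constexpr double PI = 3.141592653589793;
          constexpr angle_t operator"" _deg(long double d) { return { static_cast<double>(d) * PI / 180.0 }; }
          constexpr angle_t operator"" _rad(long double r) { return { static_cast<double>(r) }; }

          //getSAtLambda(const double lambdaDegrees) becomes getSAtLambda(angle_t lambda);
          //callers write getSAtLambda(40.0_deg) or getSAtLambda(0.7_rad), and a bare double
          //no longer compiles.
          int main()
          {
              angle_t ilat = 72.0_deg;   //converted at compile time
              return ilat.rad > 0.0 ? 0 : 1;
          }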






          • Thanks for the feedback!! A few responses. "I don't understand the part about making a matching copy on the device." This is done to allow getBFieldAtS to be called on both device (where most computing is done) and on host (where it's needed on occasion). Instead of passing L_m, L_norm_m, s_max_m, ILATDegrees_m, ds_m, errorTolerance_m to the device function every call, I figured it was easier to create a copy of the class on device. This is even more necessary in other implementations (which have more complexity). Is there a better way?
            – TomAdo
            May 1 at 20:57










          • Good point on modelName_m...I suppose I just like smart containers (string). I could just make one const char*. Think I'll do that. setupEnvironment and deleteEnvironment - good point - I think that's an oversight on my part, as they don't need to be virtual. Great point.
            – TomAdo
            May 1 at 21:01










          • Regarding passing double everywhere...so are you saying typenames for each unit (mass in kg, accel in m/s2, etc)? I like the typedef idea...something like typedef double mass_kg_t. Are you saying more here, like some sort of struct containing a const char * that labels the units? That's an interesting idea...my one reservation with it is: will it impact GPU performance...(hence why you said "zero-overhead"). Is what I said an example of zero-overhead? If not, could you provide an example? Thanks again for the great advice.
            – TomAdo
            May 1 at 21:05










          • The typedef (now, you would use a using) makes the function signature more readable. foo (double, double) which is which? foo (angle_t, length_t) is clearer, but doesn’t help the compiler. But, obviously zero performance change since there is zero meaning change. ⋯
            – JDługosz
            May 1 at 21:19










          • No, a type-safe system would not contain a string. It would still contain the same bits as it does now. But given angle_t x = 25_degrees; or x = 0.5_radians; both are correct, and converted at compile time to the internal format. x = 32; would be a compile-time error. A classic zero-overhead template system is Boost.Units (boost.org/doc/libs/1_67_0/doc/html/boost_units.html). A narrower example that is more up-to-date is found in the standard library now for std::chrono::duration. ⋯
            – JDługosz
            May 1 at 21:27










          Your Answer




          StackExchange.ifUsing("editor", function ()
          return StackExchange.using("mathjaxEditing", function ()
          StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix)
          StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
          );
          );
          , "mathjax-editing");

          StackExchange.ifUsing("editor", function ()
          StackExchange.using("externalEditor", function ()
          StackExchange.using("snippets", function ()
          StackExchange.snippets.init();
          );
          );
          , "code-snippets");

          StackExchange.ready(function()
          var channelOptions =
          tags: "".split(" "),
          id: "196"
          ;
          initTagRenderer("".split(" "), "".split(" "), channelOptions);

          StackExchange.using("externalEditor", function()
          // Have to fire editor after snippets, if snippets enabled
          if (StackExchange.settings.snippets.snippetsEnabled)
          StackExchange.using("snippets", function()
          createEditor();
          );

          else
          createEditor();

          );

          function createEditor()
          StackExchange.prepareEditor(
          heartbeatType: 'answer',
          convertImagesToLinks: false,
          noModals: false,
          showLowRepImageUploadWarning: true,
          reputationToPostImages: null,
          bindNavPrevention: true,
          postfix: "",
          onDemand: true,
          discardSelector: ".discard-answer"
          ,immediatelyShowMarkdownHelp:true
          );



          );








           

          draft saved


          draft discarded


















          StackExchange.ready(
          function ()
          StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f193367%2fcuda-c-host-device-polymorphic-class-implementation%23new-answer', 'question_page');

          );

          Post as a guest






























          1 Answer
          1






          active

          oldest

          votes








          1 Answer
          1






          active

          oldest

          votes









          active

          oldest

          votes






          active

          oldest

          votes








          up vote
          1
          down vote



          accepted










          nice!



          The code is generally rather good. I don’t have a lot to say about the language usage in the individual lines.



          The idea of using an abstract base to do different implementations selected at runtime is classic, easy to follow, and costs one indirection at calling time. Any run-time system to choose between different impls would have the same cost at least.



          I don’t understand the part about making a matching copy on the device.



          Why do you have modelName_m on dummied on the controller? Can’t you just leave it out? Since its only use is to set from a lexical string literal, why not keep it as a const char*? The base class should take this as a constructor argument and the derived class supply it in the init line.



          one thing stuck out…



          setupEnvironment and deleteEnvironment are called from the constructor and destructor respectively. But they are virtual functions.



The object "is" of the type currently being constructed, even if it will really be of a derived class: the base class ctors run first, then the vtable is updated for the derived class, and only then does the derived ctor body run.



So, don’t call virtual functions from the ctor/dtor. To make the call do what it actually means here, make the function non-virtual.
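
The classic stand-alone illustration of the rule (Base/Derived are illustrative names, not the posted classes):

#include <cstdio>

struct Base
{
    Base() { setup(); }     // runs while the object is still a Base:
                            // dispatches to Base::setup(), never to an override
    virtual ~Base() = default;
    virtual void setup() { std::printf("Base::setup\n"); }
};

struct Derived : Base
{
    void setup() override { std::printf("Derived::setup\n"); }   // not reached from Base::Base()
};

int main()
{
    Derived d;              // prints "Base::setup"
}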



          other



          Passing double everywhere when values have different meanings is a good way to crash your spacecraft on Mars. Consider a zero-overhead unit labeling template, or at the very least typedefs for different meanings even though they are not checked by the compiler.



angle_in_degrees should be the type name, not the variable name. Better still, use just angle_t: the caller chooses degrees or whatever, and it converts (perhaps at compile time!) to the unit that the type really holds.






answered May 1 at 20:33, edited May 1 at 20:40
– JDługosz

          • Thanks for the feedback!! A few responses. "I don't understand the part about making a matching copy on the device." This is done to allow getBFieldAtS to be called on both device (where most computing is done) and on host (where it's needed on occasion). Instead of passing L_m, L_norm_m, s_max_m, ILATDegrees_m, ds_m, errorTolerance_m to the device function every call, I figured it was easier to create a copy of the class on device. This is even more necessary in other implementations (which have more complexity). Is there a better way?
            – TomAdo
            May 1 at 20:57
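
For reference, the usual shape of the pattern this comment describes is to construct the mirror object on the device, so that its vtable pointer is a device address; a minimal sketch with illustrative class and kernel names (not the exact code from the post):

// Sketch: construct a polymorphic object on the device so its virtual calls work there.
class Field
{
public:
    __device__ virtual ~Field() {}
    __device__ virtual double getBFieldAtS(double s, double t) const = 0;
};

class Dipole : public Field
{
    double ilat_;
public:
    __device__ Dipole(double ilatDegrees) : ilat_{ ilatDegrees } {}
    __device__ double getBFieldAtS(double s, double t) const override { return s * t * ilat_; } // dummy body
};

__global__ void makeDipoleOnDevice(Field** out, double ilatDegrees)
{
    if (threadIdx.x == 0 && blockIdx.x == 0)
        *out = new Dipole(ilatDegrees);   // device-side new => device vtable
}

__global__ void freeFieldOnDevice(Field** out)
{
    if (threadIdx.x == 0 && blockIdx.x == 0)
        delete *out;
}

// Host side (error checking omitted):
//   Field** field_d{ nullptr };
//   cudaMalloc(&field_d, sizeof(Field*));
//   makeDipoleOnDevice<<<1, 1>>>(field_d, 72.0);
//   cudaDeviceSynchronize();
//   ...simulation kernels receive field_d and call (*field_d)->getBFieldAtS(s, t)...
//   freeFieldOnDevice<<<1, 1>>>(field_d);
//   cudaFree(field_d);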










          • Good point on modelName_m...I suppose I just like smart containers (string). I could just make one const char*. Think I'll do that. setupEnvironment and deleteEnvironment - good point - I think that's an oversight on my part, as they don't need to be virtual. Great point.
            – TomAdo
            May 1 at 21:01










          • Regarding passing double everywhere...so are you saying typenames for each unit (mass in kg, accel in m/s2, etc)? I like the typedef idea...something like typedef double mass_kg_t. Are you saying more here, like some sort of struct containing a const char * that labels the units? That's an interesting idea...my one reservation with it is: will it impact GPU performance...(hence why you said "zero-overhead"). Is what I said an example of zero-overhead? If not, could you provide an example? Thanks again for the great advice.
            – TomAdo
            May 1 at 21:05










          • The typedef (now, you would use a using) makes the function signature more readable. foo (double, double) which is which? foo (angle_t, length_t) is clearer, but doesn’t help the compiler. But, obviously zero performance change since there is zero meaning change. ⋯
            – JDługosz
            May 1 at 21:19
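
For comparison, the alias-only version described here costs nothing and checks nothing; a minimal sketch using the same placeholder names as the comment:

// Aliases are documentation only; the compiler still sees double everywhere.
using angle_t  = double;
using length_t = double;

double foo(angle_t a, length_t len);   // reads better than foo(double, double),
                                       // but foo(someLength, someAngle) still compiles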










          • No, a type-safe system would not contain a string. It would still contain the same bits as it does now. But given angle_t x = 25_degrees; or x = 0.5_radians; both are correct, and converted at compile time to the internal format. x = 32; would be a compile-time error. A classic zero-overhead template system is Boost.Units (boost.org/doc/libs/1_67_0/doc/html/boost_units.html). A narrower example that is more up-to-date is found in the standard library now for std::chrono::duration. ⋯
            – JDługosz
            May 1 at 21:27
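
A minimal sketch of that kind of zero-overhead strong type, assuming radians as the internal unit; the class name and literal suffixes are illustrative plain C++11, not Boost.Units (in CUDA code the functions could additionally be marked __host__ __device__):

// Strong angle type: stores one double, converts literals at compile time.
struct angle_t
{
    double radians;   // internal unit: radians
    constexpr explicit angle_t(double r) : radians{ r } {}
};

constexpr angle_t operator""_radians(long double v)
{ return angle_t{ static_cast<double>(v) }; }

constexpr angle_t operator""_degrees(long double v)   // add an unsigned long long overload for 25_degrees
{ return angle_t{ static_cast<double>(v) * 3.14159265358979323846 / 180.0 }; }

// angle_t x = 0.5_radians;    // fine
// angle_t y = 25.0_degrees;   // fine, converted at compile time
// angle_t z = 32.0;           // compile-time error: explicit ctor, no implicit conversion from double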













