Nested cross-validation

The name of the pictureThe name of the pictureThe name of the pictureClash Royale CLAN TAG#URR8PPP





.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty { margin-bottom:0; }







up vote
0
down vote

favorite












This is a homework assignment to implement nested cross validation. It seems to work fine (sometimes).



The imports are inside the class and some methods are static, because this code needs to be in the same source file with a lot of other stuff (Jupyter notebook) and this is my attempt at reducing visibility of names.



Here are some of my concerns, though I am probably completely overlooking the important parts:



  • Architecture - if all this was implemented as two nested loops, it would be several times shorter. Was that the more readable approach?


  • Dataset storage - my class accepts its datapoints and labels as two different arrays X and y. Then any functions in sklearn again expect that format. But I store it internally as a zipped list for easy shuffling and masking.


  • All these static methods seem out of place. I have declared them as such because they access only a minimal part of the class state.


This is the section of the notebook, relevant to this homework problem:



import numpy as np
def create_kfold_mask(num_samples, k):
    """Build boolean masks selecting the test samples of each of ``k`` folds.

    The folds are contiguous, pairwise disjoint and together cover every
    index in ``range(num_samples)``.

    Args:
        num_samples: Total number of samples to partition.
        k: Number of folds; must be at least 1.

    Returns:
        A list of ``k`` boolean arrays of length ``num_samples``; ``True``
        marks the samples that belong to the fold.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")
    # Integer arithmetic keeps the fold boundaries exact; a float fold size
    # multiplied back up can land just below an integer and shift a boundary.
    bounds = [i * num_samples // k for i in range(k + 1)]
    masks = []
    for start, stop in zip(bounds, bounds[1:]):
        mask = np.zeros(num_samples, dtype=bool)
        mask[start:stop] = True
        masks.append(mask)
    return masks


class NCV:
    """Nested cross-validation for an RBF-kernel ``SVC``.

    Samples are kept internally as an ``(n, 2)`` object array of
    ``(features, label)`` pairs so that they can be shuffled and masked
    together.
    """
    # Imported in the class body to keep the name out of the notebook's
    # global namespace.  Class-level names are visible while the ``def``
    # lines below are evaluated (so this can serve as a default argument),
    # but NOT from inside method bodies; methods import what they need
    # locally.
    from sklearn.metrics import mean_squared_error

    def __init__(self, X, y, loss=mean_squared_error, k=10):
        """Store the shuffled samples and the cross-validation settings.

        Args:
            X: Sequence of feature vectors.
            y: Sequence of labels, aligned with ``X``.
            loss: Callable invoked as ``loss(y_pred, y_true)``.
            k: Number of folds in the inner loop.
        """
        self._all_data = self._pair_samples(X, y)
        np.random.shuffle(self._all_data)

        self._loss = loss

        # Number of groups in the inner loop.
        self._k = k

    @staticmethod
    def _pair_samples(X, y):
        """Return an ``(n, 2)`` object array of ``(features, label)`` rows."""
        samples = list(zip(X, y))
        # Filled element by element: ``np.array`` on ragged pairs is
        # rejected by current NumPy releases.
        pairs = np.empty((len(samples), 2), dtype=object)
        for row, (features, label) in enumerate(samples):
            pairs[row, 0] = features
            pairs[row, 1] = label
        return pairs

    def train(self):
        """Select ``C``, report hold-out performance, refit on all data.

        A 20% hold-out split is set aside; ``C`` is chosen by k-fold
        cross-validation on the remaining samples, and the hold-out
        accuracy and classification report are printed.

        Returns:
            The model fitted on all samples with the selected ``C``.
        """
        from sklearn import metrics
        from sklearn.model_selection import train_test_split

        X, y = (list(column) for column in zip(*self._all_data))
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
        tr = self._pair_samples(X_train, y_train)
        c = self.calc_hyperparams(tr)

        y_pred = self.fit_model(tr, c).predict(X_test)
        print('OOB accuracy: ', metrics.accuracy_score(y_test, y_pred))
        # No ``target_names``: the labels are whatever the caller passed in,
        # not necessarily the classes of some global dataset.
        print(metrics.classification_report(y_test, y_pred))

        return self.fit_model(data=self._all_data, c=c)

    @staticmethod
    def fit_model(data, c, g=10):
        """Fit an ``SVC`` on ``(features, label)`` pairs.

        Args:
            data: Iterable of ``(features, label)`` pairs.
            c: Value passed to ``SVC`` as ``C``.
            g: Value passed to ``SVC`` as ``gamma``.

        Returns:
            The fitted model.
        """
        from sklearn.svm import SVC

        X, y = zip(*data)
        model = SVC(gamma=g, C=c)
        model.fit(X, y)
        return model

    @staticmethod
    def calc_risk(y_pred, y_true, loss):
        """Empirical risk on a sample.

        ``loss`` is called as ``loss(y_pred, y_true)`` and may return either
        one aggregate value or one value per sample; the mean of whatever it
        returns is the risk.

        Raises:
            ValueError: If the two sequences differ in length.
        """
        if len(y_pred) != len(y_true):
            raise ValueError("y_pred and y_true must have the same length")
        return float(np.mean(loss(y_pred, y_true)))

    @classmethod
    def calc_OOB_risk(cls, train, test, loss, c=1, g=10):
        """Train a model on ``train``. Return its risk estimate on ``test``."""
        model = cls.fit_model(train, c, g)
        X, y = zip(*test)
        return cls.calc_risk(model.predict(X), y, loss)

    @staticmethod
    def calc_crossval_risk(dataset, body, k):
        """Average ``body`` over the ``k`` train/test splits of ``dataset``.

        Args:
            dataset: Array of samples, indexable with a boolean mask.
            body: Callable invoked as ``body(train=..., test=...)`` that
                returns the risk for one split.
            k: Number of folds.

        Returns:
            The mean of the per-fold risks.
        """
        risk = []
        for mask in create_kfold_mask(len(dataset), k):
            # The masked samples are the held-out fold; the rest train.
            risk.append(body(train=dataset[~mask], test=dataset[mask]))
        return sum(risk) / len(risk)

    def calc_hyperparams(self, dataset,
                         c_grid=np.logspace(start=0, stop=2, num=50)):
        """Perform a grid search in hyperparameter space.

        Args:
            dataset: Array of ``(features, label)`` pairs to cross-validate on.
            c_grid: Candidate values for ``C``.

        Returns:
            The candidate with the lowest cross-validated risk.
        """
        risk = []
        for c in c_grid:
            # ``c=c`` pins the current candidate inside the closure.
            body = lambda train, test, c=c: self.calc_OOB_risk(
                train=train, test=test, loss=self._loss, c=c, g=10)
            risk.append(self.calc_crossval_risk(dataset, body, self._k))

        # Lower risk is better, so the minimum wins.
        return c_grid[np.argmin(risk)]


And this is a sample run:



from sklearn.datasets import load_iris
# Binary task on the iris data: "is this sample of class 1?", using only
# the first two feature columns.
iris = load_iris()
X, y = iris.data, iris.target
X_versi = X[:, :2]
y_versi = (y == 1).astype(float)

ncv = NCV(X_versi, y_versi)
m = ncv.train()
print(m)






share|improve this question

















  • 1




    Hi! "It seems to work fine (sometimes)." Does this mean that most of the time it doesn't work as expected?
    – Mathias Ettinger
    Jul 13 at 7:53










  • Also, what is SVC? It is currently undefined, as far as I can tell.
    – Graipher
    Jul 13 at 9:46

















up vote
0
down vote

favorite












This is a homework assignment to implement nested cross validation. It seems to work fine (sometimes).



The imports are inside the class and some methods are static, because this code needs to be in the same source file with a lot of other stuff (Jupyter notebook) and this is my attempt at reducing visibility of names.



Here are some of my concerns, though I am probably completely overlooking the important parts:



  • Architecture - if all this was implemented as two nested loops, it would be several times shorter. Was that the more readable approach?


  • Dataset storage - my class accepts its datapoints and labels as two different arrays X and y. Then any functions in sklearn again expect that format. But I store it internally as a zipped list for easy shuffling and masking.


  • All these static methods seem out of place. I have declared them as such because they access only a minimal part of the class state.


This is the section of the notebook, relevant to this homework problem:



import numpy as np
def create_kfold_mask(num_samples, k):
    """Build boolean masks selecting the test samples of each of ``k`` folds.

    The folds are contiguous, pairwise disjoint and together cover every
    index in ``range(num_samples)``.

    Args:
        num_samples: Total number of samples to partition.
        k: Number of folds; must be at least 1.

    Returns:
        A list of ``k`` boolean arrays of length ``num_samples``; ``True``
        marks the samples that belong to the fold.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")
    # Integer arithmetic keeps the fold boundaries exact; a float fold size
    # multiplied back up can land just below an integer and shift a boundary.
    bounds = [i * num_samples // k for i in range(k + 1)]
    masks = []
    for start, stop in zip(bounds, bounds[1:]):
        mask = np.zeros(num_samples, dtype=bool)
        mask[start:stop] = True
        masks.append(mask)
    return masks


class NCV:
    """Nested cross-validation for an RBF-kernel ``SVC``.

    Samples are kept internally as an ``(n, 2)`` object array of
    ``(features, label)`` pairs so that they can be shuffled and masked
    together.
    """
    # Imported in the class body to keep the name out of the notebook's
    # global namespace.  Class-level names are visible while the ``def``
    # lines below are evaluated (so this can serve as a default argument),
    # but NOT from inside method bodies; methods import what they need
    # locally.
    from sklearn.metrics import mean_squared_error

    def __init__(self, X, y, loss=mean_squared_error, k=10):
        """Store the shuffled samples and the cross-validation settings.

        Args:
            X: Sequence of feature vectors.
            y: Sequence of labels, aligned with ``X``.
            loss: Callable invoked as ``loss(y_pred, y_true)``.
            k: Number of folds in the inner loop.
        """
        self._all_data = self._pair_samples(X, y)
        np.random.shuffle(self._all_data)

        self._loss = loss

        # Number of groups in the inner loop.
        self._k = k

    @staticmethod
    def _pair_samples(X, y):
        """Return an ``(n, 2)`` object array of ``(features, label)`` rows."""
        samples = list(zip(X, y))
        # Filled element by element: ``np.array`` on ragged pairs is
        # rejected by current NumPy releases.
        pairs = np.empty((len(samples), 2), dtype=object)
        for row, (features, label) in enumerate(samples):
            pairs[row, 0] = features
            pairs[row, 1] = label
        return pairs

    def train(self):
        """Select ``C``, report hold-out performance, refit on all data.

        A 20% hold-out split is set aside; ``C`` is chosen by k-fold
        cross-validation on the remaining samples, and the hold-out
        accuracy and classification report are printed.

        Returns:
            The model fitted on all samples with the selected ``C``.
        """
        from sklearn import metrics
        from sklearn.model_selection import train_test_split

        X, y = (list(column) for column in zip(*self._all_data))
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
        tr = self._pair_samples(X_train, y_train)
        c = self.calc_hyperparams(tr)

        y_pred = self.fit_model(tr, c).predict(X_test)
        print('OOB accuracy: ', metrics.accuracy_score(y_test, y_pred))
        # No ``target_names``: the labels are whatever the caller passed in,
        # not necessarily the classes of some global dataset.
        print(metrics.classification_report(y_test, y_pred))

        return self.fit_model(data=self._all_data, c=c)

    @staticmethod
    def fit_model(data, c, g=10):
        """Fit an ``SVC`` on ``(features, label)`` pairs.

        Args:
            data: Iterable of ``(features, label)`` pairs.
            c: Value passed to ``SVC`` as ``C``.
            g: Value passed to ``SVC`` as ``gamma``.

        Returns:
            The fitted model.
        """
        from sklearn.svm import SVC

        X, y = zip(*data)
        model = SVC(gamma=g, C=c)
        model.fit(X, y)
        return model

    @staticmethod
    def calc_risk(y_pred, y_true, loss):
        """Empirical risk on a sample.

        ``loss`` is called as ``loss(y_pred, y_true)`` and may return either
        one aggregate value or one value per sample; the mean of whatever it
        returns is the risk.

        Raises:
            ValueError: If the two sequences differ in length.
        """
        if len(y_pred) != len(y_true):
            raise ValueError("y_pred and y_true must have the same length")
        return float(np.mean(loss(y_pred, y_true)))

    @classmethod
    def calc_OOB_risk(cls, train, test, loss, c=1, g=10):
        """Train a model on ``train``. Return its risk estimate on ``test``."""
        model = cls.fit_model(train, c, g)
        X, y = zip(*test)
        return cls.calc_risk(model.predict(X), y, loss)

    @staticmethod
    def calc_crossval_risk(dataset, body, k):
        """Average ``body`` over the ``k`` train/test splits of ``dataset``.

        Args:
            dataset: Array of samples, indexable with a boolean mask.
            body: Callable invoked as ``body(train=..., test=...)`` that
                returns the risk for one split.
            k: Number of folds.

        Returns:
            The mean of the per-fold risks.
        """
        risk = []
        for mask in create_kfold_mask(len(dataset), k):
            # The masked samples are the held-out fold; the rest train.
            risk.append(body(train=dataset[~mask], test=dataset[mask]))
        return sum(risk) / len(risk)

    def calc_hyperparams(self, dataset,
                         c_grid=np.logspace(start=0, stop=2, num=50)):
        """Perform a grid search in hyperparameter space.

        Args:
            dataset: Array of ``(features, label)`` pairs to cross-validate on.
            c_grid: Candidate values for ``C``.

        Returns:
            The candidate with the lowest cross-validated risk.
        """
        risk = []
        for c in c_grid:
            # ``c=c`` pins the current candidate inside the closure.
            body = lambda train, test, c=c: self.calc_OOB_risk(
                train=train, test=test, loss=self._loss, c=c, g=10)
            risk.append(self.calc_crossval_risk(dataset, body, self._k))

        # Lower risk is better, so the minimum wins.
        return c_grid[np.argmin(risk)]


And this is a sample run:



from sklearn.datasets import load_iris
# Binary task on the iris data: "is this sample of class 1?", using only
# the first two feature columns.
iris = load_iris()
X, y = iris.data, iris.target
X_versi = X[:, :2]
y_versi = (y == 1).astype(float)

ncv = NCV(X_versi, y_versi)
m = ncv.train()
print(m)






share|improve this question

















  • 1




    Hi! "It seems to work fine (sometimes)." Does this mean that most of the time it doesn't work as expected?
    – Mathias Ettinger
    Jul 13 at 7:53










  • Also, what is SVC? It is currently undefined, as far as I can tell.
    – Graipher
    Jul 13 at 9:46













up vote
0
down vote

favorite









up vote
0
down vote

favorite











This is a homework assignment to implement nested cross validation. It seems to work fine (sometimes).



The imports are inside the class and some methods are static, because this code needs to be in the same source file with a lot of other stuff (Jupyter notebook) and this is my attempt at reducing visibility of names.



Here are some of my concerns, though I am probably completely overlooking the important parts:



  • Architecture - if all this was implemented as two nested loops, it would be several times shorter. Was that the more readable approach?


  • Dataset storage - my class accepts its datapoints and labels as two different arrays X and y. Then any functions in sklearn again expect that format. But I store it internally as a zipped list for easy shuffling and masking.


  • All these static methods seem out of place. I have declared them as such because they access only a minimal part of the class state.


This is the section of the notebook, relevant to this homework problem:



import numpy as np
def create_kfold_mask(num_samples, k):
    """Build boolean masks selecting the test samples of each of ``k`` folds.

    The folds are contiguous, pairwise disjoint and together cover every
    index in ``range(num_samples)``.

    Args:
        num_samples: Total number of samples to partition.
        k: Number of folds; must be at least 1.

    Returns:
        A list of ``k`` boolean arrays of length ``num_samples``; ``True``
        marks the samples that belong to the fold.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")
    # Integer arithmetic keeps the fold boundaries exact; a float fold size
    # multiplied back up can land just below an integer and shift a boundary.
    bounds = [i * num_samples // k for i in range(k + 1)]
    masks = []
    for start, stop in zip(bounds, bounds[1:]):
        mask = np.zeros(num_samples, dtype=bool)
        mask[start:stop] = True
        masks.append(mask)
    return masks


class NCV:
    """Nested cross-validation for an RBF-kernel ``SVC``.

    Samples are kept internally as an ``(n, 2)`` object array of
    ``(features, label)`` pairs so that they can be shuffled and masked
    together.
    """
    # Imported in the class body to keep the name out of the notebook's
    # global namespace.  Class-level names are visible while the ``def``
    # lines below are evaluated (so this can serve as a default argument),
    # but NOT from inside method bodies; methods import what they need
    # locally.
    from sklearn.metrics import mean_squared_error

    def __init__(self, X, y, loss=mean_squared_error, k=10):
        """Store the shuffled samples and the cross-validation settings.

        Args:
            X: Sequence of feature vectors.
            y: Sequence of labels, aligned with ``X``.
            loss: Callable invoked as ``loss(y_pred, y_true)``.
            k: Number of folds in the inner loop.
        """
        self._all_data = self._pair_samples(X, y)
        np.random.shuffle(self._all_data)

        self._loss = loss

        # Number of groups in the inner loop.
        self._k = k

    @staticmethod
    def _pair_samples(X, y):
        """Return an ``(n, 2)`` object array of ``(features, label)`` rows."""
        samples = list(zip(X, y))
        # Filled element by element: ``np.array`` on ragged pairs is
        # rejected by current NumPy releases.
        pairs = np.empty((len(samples), 2), dtype=object)
        for row, (features, label) in enumerate(samples):
            pairs[row, 0] = features
            pairs[row, 1] = label
        return pairs

    def train(self):
        """Select ``C``, report hold-out performance, refit on all data.

        A 20% hold-out split is set aside; ``C`` is chosen by k-fold
        cross-validation on the remaining samples, and the hold-out
        accuracy and classification report are printed.

        Returns:
            The model fitted on all samples with the selected ``C``.
        """
        from sklearn import metrics
        from sklearn.model_selection import train_test_split

        X, y = (list(column) for column in zip(*self._all_data))
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
        tr = self._pair_samples(X_train, y_train)
        c = self.calc_hyperparams(tr)

        y_pred = self.fit_model(tr, c).predict(X_test)
        print('OOB accuracy: ', metrics.accuracy_score(y_test, y_pred))
        # No ``target_names``: the labels are whatever the caller passed in,
        # not necessarily the classes of some global dataset.
        print(metrics.classification_report(y_test, y_pred))

        return self.fit_model(data=self._all_data, c=c)

    @staticmethod
    def fit_model(data, c, g=10):
        """Fit an ``SVC`` on ``(features, label)`` pairs.

        Args:
            data: Iterable of ``(features, label)`` pairs.
            c: Value passed to ``SVC`` as ``C``.
            g: Value passed to ``SVC`` as ``gamma``.

        Returns:
            The fitted model.
        """
        from sklearn.svm import SVC

        X, y = zip(*data)
        model = SVC(gamma=g, C=c)
        model.fit(X, y)
        return model

    @staticmethod
    def calc_risk(y_pred, y_true, loss):
        """Empirical risk on a sample.

        ``loss`` is called as ``loss(y_pred, y_true)`` and may return either
        one aggregate value or one value per sample; the mean of whatever it
        returns is the risk.

        Raises:
            ValueError: If the two sequences differ in length.
        """
        if len(y_pred) != len(y_true):
            raise ValueError("y_pred and y_true must have the same length")
        return float(np.mean(loss(y_pred, y_true)))

    @classmethod
    def calc_OOB_risk(cls, train, test, loss, c=1, g=10):
        """Train a model on ``train``. Return its risk estimate on ``test``."""
        model = cls.fit_model(train, c, g)
        X, y = zip(*test)
        return cls.calc_risk(model.predict(X), y, loss)

    @staticmethod
    def calc_crossval_risk(dataset, body, k):
        """Average ``body`` over the ``k`` train/test splits of ``dataset``.

        Args:
            dataset: Array of samples, indexable with a boolean mask.
            body: Callable invoked as ``body(train=..., test=...)`` that
                returns the risk for one split.
            k: Number of folds.

        Returns:
            The mean of the per-fold risks.
        """
        risk = []
        for mask in create_kfold_mask(len(dataset), k):
            # The masked samples are the held-out fold; the rest train.
            risk.append(body(train=dataset[~mask], test=dataset[mask]))
        return sum(risk) / len(risk)

    def calc_hyperparams(self, dataset,
                         c_grid=np.logspace(start=0, stop=2, num=50)):
        """Perform a grid search in hyperparameter space.

        Args:
            dataset: Array of ``(features, label)`` pairs to cross-validate on.
            c_grid: Candidate values for ``C``.

        Returns:
            The candidate with the lowest cross-validated risk.
        """
        risk = []
        for c in c_grid:
            # ``c=c`` pins the current candidate inside the closure.
            body = lambda train, test, c=c: self.calc_OOB_risk(
                train=train, test=test, loss=self._loss, c=c, g=10)
            risk.append(self.calc_crossval_risk(dataset, body, self._k))

        # Lower risk is better, so the minimum wins.
        return c_grid[np.argmin(risk)]


And this is a sample run:



from sklearn.datasets import load_iris
# Binary task on the iris data: "is this sample of class 1?", using only
# the first two feature columns.
iris = load_iris()
X, y = iris.data, iris.target
X_versi = X[:, :2]
y_versi = (y == 1).astype(float)

ncv = NCV(X_versi, y_versi)
m = ncv.train()
print(m)






share|improve this question













This is a homework assignment to implement nested cross validation. It seems to work fine (sometimes).



The imports are inside the class and some methods are static, because this code needs to be in the same source file with a lot of other stuff (Jupyter notebook) and this is my attempt at reducing visibility of names.



Here are some of my concerns, though I am probably completely overlooking the important parts:



  • Architecture - if all this was implemented as two nested loops, it would be several times shorter. Was that the more readable approach?


  • Dataset storage - my class accepts its datapoints and labels as two different arrays X and y. Then any functions in sklearn again expect that format. But I store it internally as a zipped list for easy shuffling and masking.


  • All these static methods seem out of place. I have declared them as such because they access only a minimal part of the class state.


This is the section of the notebook, relevant to this homework problem:



import numpy as np
def create_kfold_mask(num_samples, k):
    """Build boolean masks selecting the test samples of each of ``k`` folds.

    The folds are contiguous, pairwise disjoint and together cover every
    index in ``range(num_samples)``.

    Args:
        num_samples: Total number of samples to partition.
        k: Number of folds; must be at least 1.

    Returns:
        A list of ``k`` boolean arrays of length ``num_samples``; ``True``
        marks the samples that belong to the fold.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")
    # Integer arithmetic keeps the fold boundaries exact; a float fold size
    # multiplied back up can land just below an integer and shift a boundary.
    bounds = [i * num_samples // k for i in range(k + 1)]
    masks = []
    for start, stop in zip(bounds, bounds[1:]):
        mask = np.zeros(num_samples, dtype=bool)
        mask[start:stop] = True
        masks.append(mask)
    return masks


class NCV:
    """Nested cross-validation for an RBF-kernel ``SVC``.

    Samples are kept internally as an ``(n, 2)`` object array of
    ``(features, label)`` pairs so that they can be shuffled and masked
    together.
    """
    # Imported in the class body to keep the name out of the notebook's
    # global namespace.  Class-level names are visible while the ``def``
    # lines below are evaluated (so this can serve as a default argument),
    # but NOT from inside method bodies; methods import what they need
    # locally.
    from sklearn.metrics import mean_squared_error

    def __init__(self, X, y, loss=mean_squared_error, k=10):
        """Store the shuffled samples and the cross-validation settings.

        Args:
            X: Sequence of feature vectors.
            y: Sequence of labels, aligned with ``X``.
            loss: Callable invoked as ``loss(y_pred, y_true)``.
            k: Number of folds in the inner loop.
        """
        self._all_data = self._pair_samples(X, y)
        np.random.shuffle(self._all_data)

        self._loss = loss

        # Number of groups in the inner loop.
        self._k = k

    @staticmethod
    def _pair_samples(X, y):
        """Return an ``(n, 2)`` object array of ``(features, label)`` rows."""
        samples = list(zip(X, y))
        # Filled element by element: ``np.array`` on ragged pairs is
        # rejected by current NumPy releases.
        pairs = np.empty((len(samples), 2), dtype=object)
        for row, (features, label) in enumerate(samples):
            pairs[row, 0] = features
            pairs[row, 1] = label
        return pairs

    def train(self):
        """Select ``C``, report hold-out performance, refit on all data.

        A 20% hold-out split is set aside; ``C`` is chosen by k-fold
        cross-validation on the remaining samples, and the hold-out
        accuracy and classification report are printed.

        Returns:
            The model fitted on all samples with the selected ``C``.
        """
        from sklearn import metrics
        from sklearn.model_selection import train_test_split

        X, y = (list(column) for column in zip(*self._all_data))
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
        tr = self._pair_samples(X_train, y_train)
        c = self.calc_hyperparams(tr)

        y_pred = self.fit_model(tr, c).predict(X_test)
        print('OOB accuracy: ', metrics.accuracy_score(y_test, y_pred))
        # No ``target_names``: the labels are whatever the caller passed in,
        # not necessarily the classes of some global dataset.
        print(metrics.classification_report(y_test, y_pred))

        return self.fit_model(data=self._all_data, c=c)

    @staticmethod
    def fit_model(data, c, g=10):
        """Fit an ``SVC`` on ``(features, label)`` pairs.

        Args:
            data: Iterable of ``(features, label)`` pairs.
            c: Value passed to ``SVC`` as ``C``.
            g: Value passed to ``SVC`` as ``gamma``.

        Returns:
            The fitted model.
        """
        from sklearn.svm import SVC

        X, y = zip(*data)
        model = SVC(gamma=g, C=c)
        model.fit(X, y)
        return model

    @staticmethod
    def calc_risk(y_pred, y_true, loss):
        """Empirical risk on a sample.

        ``loss`` is called as ``loss(y_pred, y_true)`` and may return either
        one aggregate value or one value per sample; the mean of whatever it
        returns is the risk.

        Raises:
            ValueError: If the two sequences differ in length.
        """
        if len(y_pred) != len(y_true):
            raise ValueError("y_pred and y_true must have the same length")
        return float(np.mean(loss(y_pred, y_true)))

    @classmethod
    def calc_OOB_risk(cls, train, test, loss, c=1, g=10):
        """Train a model on ``train``. Return its risk estimate on ``test``."""
        model = cls.fit_model(train, c, g)
        X, y = zip(*test)
        return cls.calc_risk(model.predict(X), y, loss)

    @staticmethod
    def calc_crossval_risk(dataset, body, k):
        """Average ``body`` over the ``k`` train/test splits of ``dataset``.

        Args:
            dataset: Array of samples, indexable with a boolean mask.
            body: Callable invoked as ``body(train=..., test=...)`` that
                returns the risk for one split.
            k: Number of folds.

        Returns:
            The mean of the per-fold risks.
        """
        risk = []
        for mask in create_kfold_mask(len(dataset), k):
            # The masked samples are the held-out fold; the rest train.
            risk.append(body(train=dataset[~mask], test=dataset[mask]))
        return sum(risk) / len(risk)

    def calc_hyperparams(self, dataset,
                         c_grid=np.logspace(start=0, stop=2, num=50)):
        """Perform a grid search in hyperparameter space.

        Args:
            dataset: Array of ``(features, label)`` pairs to cross-validate on.
            c_grid: Candidate values for ``C``.

        Returns:
            The candidate with the lowest cross-validated risk.
        """
        risk = []
        for c in c_grid:
            # ``c=c`` pins the current candidate inside the closure.
            body = lambda train, test, c=c: self.calc_OOB_risk(
                train=train, test=test, loss=self._loss, c=c, g=10)
            risk.append(self.calc_crossval_risk(dataset, body, self._k))

        # Lower risk is better, so the minimum wins.
        return c_grid[np.argmin(risk)]


And this is a sample run:



from sklearn.datasets import load_iris
# Binary task on the iris data: "is this sample of class 1?", using only
# the first two feature columns.
iris = load_iris()
X, y = iris.data, iris.target
X_versi = X[:, :2]
y_versi = (y == 1).astype(float)

ncv = NCV(X_versi, y_versi)
m = ncv.train()
print(m)








share|improve this question












share|improve this question




share|improve this question








edited Jul 13 at 18:03









200_success

123k14143399




123k14143399









asked Jul 13 at 7:09









Vorac

25517




25517







  • 1




    Hi! "It seems to work fine (sometimes)." Does this mean that most of the time it doesn't work as expected?
    – Mathias Ettinger
    Jul 13 at 7:53










  • Also, what is SVC? It is currently undefined, as far as I can tell.
    – Graipher
    Jul 13 at 9:46













  • 1




    Hi! "It seems to work fine (sometimes)." Does this mean that most of the time it doesn't work as expected?
    – Mathias Ettinger
    Jul 13 at 7:53










  • Also, what is SVC? It is currently undefined, as far as I can tell.
    – Graipher
    Jul 13 at 9:46








1




1




Hi! "It seems to work fine (sometimes)." Does this mean that most of the time it doesn't work as expected?
– Mathias Ettinger
Jul 13 at 7:53




Hi! "It seems to work fine (sometimes)." Does this mean that most of the time it doesn't work as expected?
– Mathias Ettinger
Jul 13 at 7:53












Also, what is SVC? It is currently undefined, as far as I can tell.
– Graipher
Jul 13 at 9:46





Also, what is SVC? It is currently undefined, as far as I can tell.
– Graipher
Jul 13 at 9:46
















active

oldest

votes











Your Answer




// Once the editor module is in use, load MathJax editing support and hook
// it into every Markdown editor that gets created.
StackExchange.ifUsing("editor", function () {
    return StackExchange.using("mathjaxEditing", function () {
        StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix) {
            StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
        });
    });
}, "mathjax-editing");

// Once the editor module is in use, load the external editor and then
// initialise snippets.
StackExchange.ifUsing("editor", function () {
    StackExchange.using("externalEditor", function () {
        StackExchange.using("snippets", function () {
            StackExchange.snippets.init();
        });
    });
}, "code-snippets");

// On page ready: set up the tag renderer, then create the answer editor.
StackExchange.ready(function() {
    var channelOptions = {
        tags: "".split(" "),
        id: "196"
    };
    initTagRenderer("".split(" "), "".split(" "), channelOptions);

    StackExchange.using("externalEditor", function() {
        // Have to fire editor after snippets, if snippets enabled
        if (StackExchange.settings.snippets.snippetsEnabled) {
            StackExchange.using("snippets", function() {
                createEditor();
            });
        }
        else {
            createEditor();
        }
    });

    // Hands the answer-editor options to StackExchange.prepareEditor.
    function createEditor() {
        StackExchange.prepareEditor({
            heartbeatType: 'answer',
            convertImagesToLinks: false,
            noModals: false,
            showLowRepImageUploadWarning: true,
            reputationToPostImages: null,
            bindNavPrevention: true,
            postfix: "",
            onDemand: true,
            discardSelector: ".discard-answer"
            ,immediatelyShowMarkdownHelp:true
        });
    }
});








 

draft saved


draft discarded


















// On page ready: wire up the post-login handler for the new-answer form.
StackExchange.ready(
    function () {
        StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f198406%2fnested-cross-validation%23new-answer', 'question_page');
    }
);

Post as a guest



































active

oldest

votes













active

oldest

votes









active

oldest

votes






active

oldest

votes










 

draft saved


draft discarded


























 


draft saved


draft discarded














// On page ready: wire up the post-login handler for the new-answer form.
StackExchange.ready(
    function () {
        StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f198406%2fnested-cross-validation%23new-answer', 'question_page');
    }
);

Post as a guest













































































Popular posts from this blog

Chat program with C++ and SFML

Function to Return a JSON Like Objects Using VBA Collections and Arrays

Will my employers contract hold up in court?