Nested cross-validation

.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty margin-bottom:0;

up vote
0
down vote

favorite

This is a homework assignment to implement nested cross validation. It seems to work fine (sometimes).

The imports are inside the class and some methods are static, because this code needs to be in the same source file with a lot of other stuff (Jupyter notebook) and this is my attempt at reducing visibility of names.

Here are some of my concerns, though I am probably completely overlooking the important parts:

Architecture - if all this was implemented as two nested loops, it would be several times shorter. Was that the more readable approach?

Dataset storage - my class accepts its datapoints and labels as two different arrays X and y. Then any functions in sklearn again expect that format. But I store it internally as a zipped list for easy shuffling and masking.

All these static methods seem out of place. I have declared them as such because they access only a minimal part of the class state.

This is the section of the notebook, relevant to this homework problem:

import numpy as np
def create_kfold_mask(num_samples, k):
    """Build the boolean test-set masks for k-fold cross-validation.

    The index range ``0 .. num_samples - 1`` is cut into ``k`` contiguous,
    non-overlapping folds whose sizes differ by at most one.

    Args:
        num_samples: Total number of samples in the dataset.
        k: Number of folds; must be at least 1.

    Returns:
        A list of ``k`` boolean arrays of length ``num_samples``. Mask ``i``
        is True exactly on the samples that belong to fold ``i``.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    masks = []
    for i in range(k):
        # Integer arithmetic keeps the fold boundaries exact; a float fold
        # size could truncate a boundary one index short.
        start = i * num_samples // k
        stop = (i + 1) * num_samples // k
        mask = np.zeros(num_samples, dtype=bool)
        mask[start:stop] = True
        masks.append(mask)
    return masks


class NCV:
    """Nested cross-validation around scikit-learn's ``SVC``.

    An inner k-fold cross-validation grid-searches the SVC penalty ``C``;
    an outer hold-out split reports how the selected value performs.
    Samples are kept as a 1-D object array of ``(features, label)`` tuples
    so they can be shuffled and boolean-masked as one unit.

    scikit-learn is imported inside the methods that need it: names bound
    in a class body are not visible as bare names inside method bodies, and
    local imports keep the surrounding notebook namespace clean.
    """

    def __init__(self, X, y, loss=None, k=10):
        """Store the shuffled dataset and the search settings.

        Args:
            X: Sequence of feature vectors.
            y: Sequence of labels, one per feature vector.
            loss: Callable invoked as ``loss(y_pred, y_true)``. ``None``
                (the default) selects ``sklearn.metrics.mean_squared_error``.
            k: Number of folds in the inner cross-validation loop.

        Raises:
            ValueError: If ``X`` and ``y`` differ in length.
        """
        if loss is None:
            from sklearn.metrics import mean_squared_error
            loss = mean_squared_error

        self._all_data = self._pair_up(X, y)
        np.random.shuffle(self._all_data)

        self._loss = loss

        # Number of groups in the inner loop.
        self._k = k

    @staticmethod
    def _pair_up(X, y):
        """Return a 1-D object array of ``(features, label)`` tuples."""
        X, y = list(X), list(y)
        if len(X) != len(y):
            raise ValueError("X and y must contain the same number of samples")
        pairs = np.empty(len(X), dtype=object)
        # Element-wise assignment stops numpy from trying to broadcast the
        # ragged (vector, scalar) pairs into one rectangular array.
        for i, pair in enumerate(zip(X, y)):
            pairs[i] = pair
        return pairs

    @staticmethod
    def _split_pairs(data):
        """Split ``(features, label)`` pairs into an X array and a y array."""
        features, labels = zip(*data)
        return np.asarray(features), np.asarray(labels)

    def train(self):
        """Select ``C``, print hold-out metrics and return the final model.

        20% of the data is held out; ``C`` is chosen by cross-validation on
        the remaining 80%, scored on the held-out part, and the returned
        model is then refitted on all samples with that ``C``.
        """
        from sklearn import metrics
        from sklearn.model_selection import train_test_split

        X, y = self._split_pairs(self._all_data)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
        tr = self._pair_up(X_train, y_train)
        c = self.calc_hyperparams(tr)

        y_pred = self.fit_model(tr, c).predict(X_test)
        print('OOB accuracy: ', metrics.accuracy_score(y_test, y_pred))
        # Class names are derived from the labels themselves rather than
        # from a global dataset object defined outside the class.
        print(metrics.classification_report(y_test, y_pred))

        return self.fit_model(data=self._all_data, c=c)

    @staticmethod
    def fit_model(data, c, g=10):
        """Fit ``SVC(gamma=g, C=c)`` on ``(features, label)`` pairs."""
        from sklearn.svm import SVC

        X, y = NCV._split_pairs(data)
        m = SVC(gamma=g, C=c)
        m.fit(X, y)
        return m

    @staticmethod
    def calc_risk(y_pred, y_true, loss):
        """Empirical risk on a sample: ``loss(y_pred, y_true) / n``.

        ``loss`` is called once on the full vectors, so the result is a
        per-sample mean when ``loss`` returns the summed loss.
        NOTE(review): a loss that already averages (for example sklearn's
        mean_squared_error) is normalised a second time here -- confirm the
        intended convention.

        Raises:
            ValueError: If the inputs differ in length or are empty.
        """
        n = len(y_true)
        if len(y_pred) != n:
            raise ValueError("y_pred and y_true must have the same length")
        if n == 0:
            raise ValueError("cannot compute the risk of an empty sample")
        return loss(y_pred, y_true) / n

    @classmethod
    def calc_OOB_risk(cls, train, test, loss, c=1, g=10):
        """Train a model on ``train``. Return its risk estimate on ``test``."""
        m = cls.fit_model(train, c, g)
        X, y = cls._split_pairs(test)
        return cls.calc_risk(m.predict(X), y, loss)

    @staticmethod
    def calc_crossval_risk(dataset, body, k):
        """Average ``body(train=..., test=...)`` over ``k`` disjoint folds.

        ``dataset`` must support boolean-mask indexing (a numpy array); each
        fold serves once as the test part while the rest is the train part.
        """
        risk = []
        for mask in create_kfold_mask(len(dataset), k):
            risk.append(body(train=dataset[~mask], test=dataset[mask]))
        return sum(risk) / len(risk)

    def calc_hyperparams(self, dataset,
                         c_grid=np.logspace(start=0, stop=2, num=50)):
        """Grid-search ``C`` and return the value with the lowest CV risk."""
        risk = []
        for c in c_grid:
            # Bind the current candidate explicitly so the closure cannot
            # pick up a later value of the loop variable.
            def body(train, test, c=c):
                return self.calc_OOB_risk(train=train, test=test,
                                          loss=self._loss, c=c, g=10)

            risk.append(self.calc_crossval_risk(dataset, body, self._k))

        # The best candidate minimises the risk (argmax picked the worst).
        return c_grid[np.argmin(risk)]

And this is a sample run:

# Sample run: one-vs-rest problem for iris class 1 (versicolor, judging by
# the variable names), using only the first two feature columns.
from sklearn.datasets import load_iris

iris = load_iris()
X, y = iris.data, iris.target
X_versi = X[:, :2]
# Float labels: 1.0 where the target equals 1, 0.0 everywhere else.
y_versi = (y == 1).astype(float)
ncv = NCV(X_versi, y_versi)
m = ncv.train()
print(m)

edited Jul 13 at 18:03

200_success

123k14143399

asked Jul 13 at 7:09

Vorac

25517

1

Hi! "It seems to work fine (sometimes)." Does this mean that most of the time it doesn't work as expected?
– Mathias Ettinger
Jul 13 at 7:53

Also, what is SVC? It is currently undefined, as far as I can tell.
– Graipher
Jul 13 at 9:46

add a comment |

up vote
0
down vote

favorite

This is a homework assignment to implement nested cross validation. It seems to work fine (sometimes).

Here are some of my concerns, though I am probably completely overlooking the important parts:

Architecture - if all this was implemented as two nested loops, it would be several times shorter. Was that the more readable approach?

Dataset storage - my class accepts its datapoints and labels as two different arrays X and y. Then any functions in sklearn again expect that format. But I store it internally as a zipped list for easy shuffling and masking.

All these static methods seem out of place. I have declared them as such because they access only a minimal part of the class state.

This is the section of the notebook, relevant to this homework problem:

import numpy as np
def create_kfold_mask(num_samples, k):
    """Build the boolean test-set masks for k-fold cross-validation.

    The index range ``0 .. num_samples - 1`` is cut into ``k`` contiguous,
    non-overlapping folds whose sizes differ by at most one.

    Args:
        num_samples: Total number of samples in the dataset.
        k: Number of folds; must be at least 1.

    Returns:
        A list of ``k`` boolean arrays of length ``num_samples``. Mask ``i``
        is True exactly on the samples that belong to fold ``i``.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    masks = []
    for i in range(k):
        # Integer arithmetic keeps the fold boundaries exact; a float fold
        # size could truncate a boundary one index short.
        start = i * num_samples // k
        stop = (i + 1) * num_samples // k
        mask = np.zeros(num_samples, dtype=bool)
        mask[start:stop] = True
        masks.append(mask)
    return masks


class NCV:
    """Nested cross-validation around scikit-learn's ``SVC``.

    An inner k-fold cross-validation grid-searches the SVC penalty ``C``;
    an outer hold-out split reports how the selected value performs.
    Samples are kept as a 1-D object array of ``(features, label)`` tuples
    so they can be shuffled and boolean-masked as one unit.

    scikit-learn is imported inside the methods that need it: names bound
    in a class body are not visible as bare names inside method bodies, and
    local imports keep the surrounding notebook namespace clean.
    """

    def __init__(self, X, y, loss=None, k=10):
        """Store the shuffled dataset and the search settings.

        Args:
            X: Sequence of feature vectors.
            y: Sequence of labels, one per feature vector.
            loss: Callable invoked as ``loss(y_pred, y_true)``. ``None``
                (the default) selects ``sklearn.metrics.mean_squared_error``.
            k: Number of folds in the inner cross-validation loop.

        Raises:
            ValueError: If ``X`` and ``y`` differ in length.
        """
        if loss is None:
            from sklearn.metrics import mean_squared_error
            loss = mean_squared_error

        self._all_data = self._pair_up(X, y)
        np.random.shuffle(self._all_data)

        self._loss = loss

        # Number of groups in the inner loop.
        self._k = k

    @staticmethod
    def _pair_up(X, y):
        """Return a 1-D object array of ``(features, label)`` tuples."""
        X, y = list(X), list(y)
        if len(X) != len(y):
            raise ValueError("X and y must contain the same number of samples")
        pairs = np.empty(len(X), dtype=object)
        # Element-wise assignment stops numpy from trying to broadcast the
        # ragged (vector, scalar) pairs into one rectangular array.
        for i, pair in enumerate(zip(X, y)):
            pairs[i] = pair
        return pairs

    @staticmethod
    def _split_pairs(data):
        """Split ``(features, label)`` pairs into an X array and a y array."""
        features, labels = zip(*data)
        return np.asarray(features), np.asarray(labels)

    def train(self):
        """Select ``C``, print hold-out metrics and return the final model.

        20% of the data is held out; ``C`` is chosen by cross-validation on
        the remaining 80%, scored on the held-out part, and the returned
        model is then refitted on all samples with that ``C``.
        """
        from sklearn import metrics
        from sklearn.model_selection import train_test_split

        X, y = self._split_pairs(self._all_data)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
        tr = self._pair_up(X_train, y_train)
        c = self.calc_hyperparams(tr)

        y_pred = self.fit_model(tr, c).predict(X_test)
        print('OOB accuracy: ', metrics.accuracy_score(y_test, y_pred))
        # Class names are derived from the labels themselves rather than
        # from a global dataset object defined outside the class.
        print(metrics.classification_report(y_test, y_pred))

        return self.fit_model(data=self._all_data, c=c)

    @staticmethod
    def fit_model(data, c, g=10):
        """Fit ``SVC(gamma=g, C=c)`` on ``(features, label)`` pairs."""
        from sklearn.svm import SVC

        X, y = NCV._split_pairs(data)
        m = SVC(gamma=g, C=c)
        m.fit(X, y)
        return m

    @staticmethod
    def calc_risk(y_pred, y_true, loss):
        """Empirical risk on a sample: ``loss(y_pred, y_true) / n``.

        ``loss`` is called once on the full vectors, so the result is a
        per-sample mean when ``loss`` returns the summed loss.
        NOTE(review): a loss that already averages (for example sklearn's
        mean_squared_error) is normalised a second time here -- confirm the
        intended convention.

        Raises:
            ValueError: If the inputs differ in length or are empty.
        """
        n = len(y_true)
        if len(y_pred) != n:
            raise ValueError("y_pred and y_true must have the same length")
        if n == 0:
            raise ValueError("cannot compute the risk of an empty sample")
        return loss(y_pred, y_true) / n

    @classmethod
    def calc_OOB_risk(cls, train, test, loss, c=1, g=10):
        """Train a model on ``train``. Return its risk estimate on ``test``."""
        m = cls.fit_model(train, c, g)
        X, y = cls._split_pairs(test)
        return cls.calc_risk(m.predict(X), y, loss)

    @staticmethod
    def calc_crossval_risk(dataset, body, k):
        """Average ``body(train=..., test=...)`` over ``k`` disjoint folds.

        ``dataset`` must support boolean-mask indexing (a numpy array); each
        fold serves once as the test part while the rest is the train part.
        """
        risk = []
        for mask in create_kfold_mask(len(dataset), k):
            risk.append(body(train=dataset[~mask], test=dataset[mask]))
        return sum(risk) / len(risk)

    def calc_hyperparams(self, dataset,
                         c_grid=np.logspace(start=0, stop=2, num=50)):
        """Grid-search ``C`` and return the value with the lowest CV risk."""
        risk = []
        for c in c_grid:
            # Bind the current candidate explicitly so the closure cannot
            # pick up a later value of the loop variable.
            def body(train, test, c=c):
                return self.calc_OOB_risk(train=train, test=test,
                                          loss=self._loss, c=c, g=10)

            risk.append(self.calc_crossval_risk(dataset, body, self._k))

        # The best candidate minimises the risk (argmax picked the worst).
        return c_grid[np.argmin(risk)]

And this is a sample run:

# Sample run: one-vs-rest problem for iris class 1 (versicolor, judging by
# the variable names), using only the first two feature columns.
from sklearn.datasets import load_iris

iris = load_iris()
X, y = iris.data, iris.target
X_versi = X[:, :2]
# Float labels: 1.0 where the target equals 1, 0.0 everywhere else.
y_versi = (y == 1).astype(float)
ncv = NCV(X_versi, y_versi)
m = ncv.train()
print(m)

edited Jul 13 at 18:03

200_success

123k14143399

asked Jul 13 at 7:09

Vorac

25517

1

Hi! "It seems to work fine (sometimes)." Does this mean that most of the time it doesn't work as expected?
– Mathias Ettinger
Jul 13 at 7:53

Also, what is SVC? It is currently undefined, as far as I can tell.
– Graipher
Jul 13 at 9:46

add a comment |

up vote
0
down vote

favorite

This is a homework assignment to implement nested cross validation. It seems to work fine (sometimes).

Here are some of my concerns, though I am probably completely overlooking the important parts:

Architecture - if all this was implemented as two nested loops, it would be several times shorter. Was that the more readable approach?

Dataset storage - my class accepts its datapoints and labels as two different arrays X and y. Then any functions in sklearn again expect that format. But I store it internally as a zipped list for easy shuffling and masking.

All these static methods seem out of place. I have declared them as such because they access only a minimal part of the class state.

This is the section of the notebook, relevant to this homework problem:

import numpy as np
def create_kfold_mask(num_samples, k):
    """Build the boolean test-set masks for k-fold cross-validation.

    The index range ``0 .. num_samples - 1`` is cut into ``k`` contiguous,
    non-overlapping folds whose sizes differ by at most one.

    Args:
        num_samples: Total number of samples in the dataset.
        k: Number of folds; must be at least 1.

    Returns:
        A list of ``k`` boolean arrays of length ``num_samples``. Mask ``i``
        is True exactly on the samples that belong to fold ``i``.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    masks = []
    for i in range(k):
        # Integer arithmetic keeps the fold boundaries exact; a float fold
        # size could truncate a boundary one index short.
        start = i * num_samples // k
        stop = (i + 1) * num_samples // k
        mask = np.zeros(num_samples, dtype=bool)
        mask[start:stop] = True
        masks.append(mask)
    return masks


class NCV:
    """Nested cross-validation around scikit-learn's ``SVC``.

    An inner k-fold cross-validation grid-searches the SVC penalty ``C``;
    an outer hold-out split reports how the selected value performs.
    Samples are kept as a 1-D object array of ``(features, label)`` tuples
    so they can be shuffled and boolean-masked as one unit.

    scikit-learn is imported inside the methods that need it: names bound
    in a class body are not visible as bare names inside method bodies, and
    local imports keep the surrounding notebook namespace clean.
    """

    def __init__(self, X, y, loss=None, k=10):
        """Store the shuffled dataset and the search settings.

        Args:
            X: Sequence of feature vectors.
            y: Sequence of labels, one per feature vector.
            loss: Callable invoked as ``loss(y_pred, y_true)``. ``None``
                (the default) selects ``sklearn.metrics.mean_squared_error``.
            k: Number of folds in the inner cross-validation loop.

        Raises:
            ValueError: If ``X`` and ``y`` differ in length.
        """
        if loss is None:
            from sklearn.metrics import mean_squared_error
            loss = mean_squared_error

        self._all_data = self._pair_up(X, y)
        np.random.shuffle(self._all_data)

        self._loss = loss

        # Number of groups in the inner loop.
        self._k = k

    @staticmethod
    def _pair_up(X, y):
        """Return a 1-D object array of ``(features, label)`` tuples."""
        X, y = list(X), list(y)
        if len(X) != len(y):
            raise ValueError("X and y must contain the same number of samples")
        pairs = np.empty(len(X), dtype=object)
        # Element-wise assignment stops numpy from trying to broadcast the
        # ragged (vector, scalar) pairs into one rectangular array.
        for i, pair in enumerate(zip(X, y)):
            pairs[i] = pair
        return pairs

    @staticmethod
    def _split_pairs(data):
        """Split ``(features, label)`` pairs into an X array and a y array."""
        features, labels = zip(*data)
        return np.asarray(features), np.asarray(labels)

    def train(self):
        """Select ``C``, print hold-out metrics and return the final model.

        20% of the data is held out; ``C`` is chosen by cross-validation on
        the remaining 80%, scored on the held-out part, and the returned
        model is then refitted on all samples with that ``C``.
        """
        from sklearn import metrics
        from sklearn.model_selection import train_test_split

        X, y = self._split_pairs(self._all_data)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
        tr = self._pair_up(X_train, y_train)
        c = self.calc_hyperparams(tr)

        y_pred = self.fit_model(tr, c).predict(X_test)
        print('OOB accuracy: ', metrics.accuracy_score(y_test, y_pred))
        # Class names are derived from the labels themselves rather than
        # from a global dataset object defined outside the class.
        print(metrics.classification_report(y_test, y_pred))

        return self.fit_model(data=self._all_data, c=c)

    @staticmethod
    def fit_model(data, c, g=10):
        """Fit ``SVC(gamma=g, C=c)`` on ``(features, label)`` pairs."""
        from sklearn.svm import SVC

        X, y = NCV._split_pairs(data)
        m = SVC(gamma=g, C=c)
        m.fit(X, y)
        return m

    @staticmethod
    def calc_risk(y_pred, y_true, loss):
        """Empirical risk on a sample: ``loss(y_pred, y_true) / n``.

        ``loss`` is called once on the full vectors, so the result is a
        per-sample mean when ``loss`` returns the summed loss.
        NOTE(review): a loss that already averages (for example sklearn's
        mean_squared_error) is normalised a second time here -- confirm the
        intended convention.

        Raises:
            ValueError: If the inputs differ in length or are empty.
        """
        n = len(y_true)
        if len(y_pred) != n:
            raise ValueError("y_pred and y_true must have the same length")
        if n == 0:
            raise ValueError("cannot compute the risk of an empty sample")
        return loss(y_pred, y_true) / n

    @classmethod
    def calc_OOB_risk(cls, train, test, loss, c=1, g=10):
        """Train a model on ``train``. Return its risk estimate on ``test``."""
        m = cls.fit_model(train, c, g)
        X, y = cls._split_pairs(test)
        return cls.calc_risk(m.predict(X), y, loss)

    @staticmethod
    def calc_crossval_risk(dataset, body, k):
        """Average ``body(train=..., test=...)`` over ``k`` disjoint folds.

        ``dataset`` must support boolean-mask indexing (a numpy array); each
        fold serves once as the test part while the rest is the train part.
        """
        risk = []
        for mask in create_kfold_mask(len(dataset), k):
            risk.append(body(train=dataset[~mask], test=dataset[mask]))
        return sum(risk) / len(risk)

    def calc_hyperparams(self, dataset,
                         c_grid=np.logspace(start=0, stop=2, num=50)):
        """Grid-search ``C`` and return the value with the lowest CV risk."""
        risk = []
        for c in c_grid:
            # Bind the current candidate explicitly so the closure cannot
            # pick up a later value of the loop variable.
            def body(train, test, c=c):
                return self.calc_OOB_risk(train=train, test=test,
                                          loss=self._loss, c=c, g=10)

            risk.append(self.calc_crossval_risk(dataset, body, self._k))

        # The best candidate minimises the risk (argmax picked the worst).
        return c_grid[np.argmin(risk)]

And this is a sample run:

# Sample run: one-vs-rest problem for iris class 1 (versicolor, judging by
# the variable names), using only the first two feature columns.
from sklearn.datasets import load_iris

iris = load_iris()
X, y = iris.data, iris.target
X_versi = X[:, :2]
# Float labels: 1.0 where the target equals 1, 0.0 everywhere else.
y_versi = (y == 1).astype(float)
ncv = NCV(X_versi, y_versi)
m = ncv.train()
print(m)

edited Jul 13 at 18:03

200_success

123k14143399

asked Jul 13 at 7:09

Vorac

25517

This is a homework assignment to implement nested cross validation. It seems to work fine (sometimes).

Here are some of my concerns, though I am probably completely overlooking the important parts:

Architecture - if all this was implemented as two nested loops, it would be several times shorter. Was that the more readable approach?

Dataset storage - my class accepts its datapoints and labels as two different arrays X and y. Then any functions in sklearn again expect that format. But I store it internally as a zipped list for easy shuffling and masking.

All these static methods seem out of place. I have declared them as such because they access only a minimal part of the class state.

This is the section of the notebook, relevant to this homework problem:

import numpy as np
def create_kfold_mask(num_samples, k):
    """Build the boolean test-set masks for k-fold cross-validation.

    The index range ``0 .. num_samples - 1`` is cut into ``k`` contiguous,
    non-overlapping folds whose sizes differ by at most one.

    Args:
        num_samples: Total number of samples in the dataset.
        k: Number of folds; must be at least 1.

    Returns:
        A list of ``k`` boolean arrays of length ``num_samples``. Mask ``i``
        is True exactly on the samples that belong to fold ``i``.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    masks = []
    for i in range(k):
        # Integer arithmetic keeps the fold boundaries exact; a float fold
        # size could truncate a boundary one index short.
        start = i * num_samples // k
        stop = (i + 1) * num_samples // k
        mask = np.zeros(num_samples, dtype=bool)
        mask[start:stop] = True
        masks.append(mask)
    return masks


class NCV:
    """Nested cross-validation around scikit-learn's ``SVC``.

    An inner k-fold cross-validation grid-searches the SVC penalty ``C``;
    an outer hold-out split reports how the selected value performs.
    Samples are kept as a 1-D object array of ``(features, label)`` tuples
    so they can be shuffled and boolean-masked as one unit.

    scikit-learn is imported inside the methods that need it: names bound
    in a class body are not visible as bare names inside method bodies, and
    local imports keep the surrounding notebook namespace clean.
    """

    def __init__(self, X, y, loss=None, k=10):
        """Store the shuffled dataset and the search settings.

        Args:
            X: Sequence of feature vectors.
            y: Sequence of labels, one per feature vector.
            loss: Callable invoked as ``loss(y_pred, y_true)``. ``None``
                (the default) selects ``sklearn.metrics.mean_squared_error``.
            k: Number of folds in the inner cross-validation loop.

        Raises:
            ValueError: If ``X`` and ``y`` differ in length.
        """
        if loss is None:
            from sklearn.metrics import mean_squared_error
            loss = mean_squared_error

        self._all_data = self._pair_up(X, y)
        np.random.shuffle(self._all_data)

        self._loss = loss

        # Number of groups in the inner loop.
        self._k = k

    @staticmethod
    def _pair_up(X, y):
        """Return a 1-D object array of ``(features, label)`` tuples."""
        X, y = list(X), list(y)
        if len(X) != len(y):
            raise ValueError("X and y must contain the same number of samples")
        pairs = np.empty(len(X), dtype=object)
        # Element-wise assignment stops numpy from trying to broadcast the
        # ragged (vector, scalar) pairs into one rectangular array.
        for i, pair in enumerate(zip(X, y)):
            pairs[i] = pair
        return pairs

    @staticmethod
    def _split_pairs(data):
        """Split ``(features, label)`` pairs into an X array and a y array."""
        features, labels = zip(*data)
        return np.asarray(features), np.asarray(labels)

    def train(self):
        """Select ``C``, print hold-out metrics and return the final model.

        20% of the data is held out; ``C`` is chosen by cross-validation on
        the remaining 80%, scored on the held-out part, and the returned
        model is then refitted on all samples with that ``C``.
        """
        from sklearn import metrics
        from sklearn.model_selection import train_test_split

        X, y = self._split_pairs(self._all_data)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
        tr = self._pair_up(X_train, y_train)
        c = self.calc_hyperparams(tr)

        y_pred = self.fit_model(tr, c).predict(X_test)
        print('OOB accuracy: ', metrics.accuracy_score(y_test, y_pred))
        # Class names are derived from the labels themselves rather than
        # from a global dataset object defined outside the class.
        print(metrics.classification_report(y_test, y_pred))

        return self.fit_model(data=self._all_data, c=c)

    @staticmethod
    def fit_model(data, c, g=10):
        """Fit ``SVC(gamma=g, C=c)`` on ``(features, label)`` pairs."""
        from sklearn.svm import SVC

        X, y = NCV._split_pairs(data)
        m = SVC(gamma=g, C=c)
        m.fit(X, y)
        return m

    @staticmethod
    def calc_risk(y_pred, y_true, loss):
        """Empirical risk on a sample: ``loss(y_pred, y_true) / n``.

        ``loss`` is called once on the full vectors, so the result is a
        per-sample mean when ``loss`` returns the summed loss.
        NOTE(review): a loss that already averages (for example sklearn's
        mean_squared_error) is normalised a second time here -- confirm the
        intended convention.

        Raises:
            ValueError: If the inputs differ in length or are empty.
        """
        n = len(y_true)
        if len(y_pred) != n:
            raise ValueError("y_pred and y_true must have the same length")
        if n == 0:
            raise ValueError("cannot compute the risk of an empty sample")
        return loss(y_pred, y_true) / n

    @classmethod
    def calc_OOB_risk(cls, train, test, loss, c=1, g=10):
        """Train a model on ``train``. Return its risk estimate on ``test``."""
        m = cls.fit_model(train, c, g)
        X, y = cls._split_pairs(test)
        return cls.calc_risk(m.predict(X), y, loss)

    @staticmethod
    def calc_crossval_risk(dataset, body, k):
        """Average ``body(train=..., test=...)`` over ``k`` disjoint folds.

        ``dataset`` must support boolean-mask indexing (a numpy array); each
        fold serves once as the test part while the rest is the train part.
        """
        risk = []
        for mask in create_kfold_mask(len(dataset), k):
            risk.append(body(train=dataset[~mask], test=dataset[mask]))
        return sum(risk) / len(risk)

    def calc_hyperparams(self, dataset,
                         c_grid=np.logspace(start=0, stop=2, num=50)):
        """Grid-search ``C`` and return the value with the lowest CV risk."""
        risk = []
        for c in c_grid:
            # Bind the current candidate explicitly so the closure cannot
            # pick up a later value of the loop variable.
            def body(train, test, c=c):
                return self.calc_OOB_risk(train=train, test=test,
                                          loss=self._loss, c=c, g=10)

            risk.append(self.calc_crossval_risk(dataset, body, self._k))

        # The best candidate minimises the risk (argmax picked the worst).
        return c_grid[np.argmin(risk)]

And this is a sample run:

# Sample run: one-vs-rest problem for iris class 1 (versicolor, judging by
# the variable names), using only the first two feature columns.
from sklearn.datasets import load_iris

iris = load_iris()
X, y = iris.data, iris.target
X_versi = X[:, :2]
# Float labels: 1.0 where the target equals 1, 0.0 everywhere else.
y_versi = (y == 1).astype(float)
ncv = NCV(X_versi, y_versi)
m = ncv.train()
print(m)

edited Jul 13 at 18:03

200_success

123k14143399

asked Jul 13 at 7:09

Vorac

25517

edited Jul 13 at 18:03

200_success

123k14143399

edited Jul 13 at 18:03

200_success

123k14143399

edited Jul 13 at 18:03

200_success

123k14143399

asked Jul 13 at 7:09

Vorac

25517

asked Jul 13 at 7:09

Vorac

25517

asked Jul 13 at 7:09

Vorac

25517

1

Hi! "It seems to work fine (sometimes)." Does this mean that most of the time it doesn't work as expected?
– Mathias Ettinger
Jul 13 at 7:53

Also, what is SVC? It is currently undefined, as far as I can tell.
– Graipher
Jul 13 at 9:46

add a comment |

1

Hi! "It seems to work fine (sometimes)." Does this mean that most of the time it doesn't work as expected?
– Mathias Ettinger
Jul 13 at 7:53

Also, what is SVC? It is currently undefined, as far as I can tell.
– Graipher
Jul 13 at 9:46

Hi! "It seems to work fine (sometimes)." Does this mean that most of the time it doesn't work as expected?
– Mathias Ettinger
Jul 13 at 7:53

Also, what is SVC? It is currently undefined, as far as I can tell.
– Graipher
Jul 13 at 9:46

add a comment |

active

oldest

votes

Your Answer

StackExchange.ifUsing("editor", function ()
return StackExchange.using("mathjaxEditing", function ()
StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix)
StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
);
);
, "mathjax-editing");

StackExchange.ifUsing("editor", function ()
StackExchange.using("externalEditor", function ()
StackExchange.using("snippets", function ()
StackExchange.snippets.init();
);
);
, "code-snippets");

StackExchange.ready(function()
var channelOptions =
tags: "".split(" "),
id: "196"
;
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function()
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled)
StackExchange.using("snippets", function()
createEditor();
);

else
createEditor();

);

function createEditor()
StackExchange.prepareEditor(
heartbeatType: 'answer',
convertImagesToLinks: false,
noModals: false,
showLowRepImageUploadWarning: true,
reputationToPostImages: null,
bindNavPrevention: true,
postfix: "",
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
);

);

draft saved

draft discarded

StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f198406%2fnested-cross-validation%23new-answer', 'question_page');

);

Post as a guest

Name

active

oldest

votes

draft saved

draft discarded

draft saved

draft discarded

Post as a guest

Name

搜尋此網誌

trjhtr