Nested cross-validation
Clash Royale CLAN TAG#URR8PPP
.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty margin-bottom:0;
up vote
0
down vote
favorite
This is a homework assignment to implement nested cross validation. It seems to work fine (sometimes).
The imports are inside the class and some methods are static, because this code needs to be in the same source file with a lot of other stuff (Jupiter notebook) and this is my attempt at reducing visibility of names.
Here are some of my concerns, though I am probably completely overlooking the important parts:
Architecture - if all this was implemented as two nested loops, it would be several times shorter. Was that the more readable approach?
Dataset storage - my class accepts its datapoints and labels as two different arrays
X
andy
. Then any functions in sklearn again expect that format. But I store it internally as a zipped list for easy shuffling and masking.All these static methods seem out of place. I have declared them as such because they access only a minimal part of the class state.
This is the section of the notebook, relevant to this homework problem:
import numpy as np
def create_kfold_mask(num_samples, k):
masks =
fold_size = num_samples / k
for i in range(k):
mask = np.zeros(num_samples, dtype=bool)
mask[int(i*fold_size):int((i+1)*fold_size)] = True
masks.append(mask)
return masks
class NCV:
'''Nested Cross-Validation.'''
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
def __init__( self, X, y, loss=mean_squared_error, k=10 ):
self._all_data = np.array( list( zip( X, y ) ) )
np.random.shuffle( self._all_data )
self._loss = loss
# Number of groups in the inner loop.
self._k = k
def train( self ):
X, y = zip( *self._all_data )
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2 )
tr = np.array( list( zip( X_train, y_train ) ) )
c = self.calc_hyperparams( tr )
y_pred = self.fit_model( tr, c).predict( X_test )
print( 'OOB accuracy: ', metrics.accuracy_score( y_test, y_pred ) )
print( metrics.classification_report( y_test, y_pred, target_names=iris.target_names ) )
m = self.fit_model( data=self._all_data, c=c )
return m
@staticmethod
def fit_model( data, c, g=10 ):
m = SVC( gamma=g, C=c )
X, y = zip( *data )
m.fit( X, y )
return m
@staticmethod
def calc_risk( y_pred, y_true, loss ):
'''Empirical risk on a sample.'''
assert len( y_pred ) == len( y_true )
return ( 1 / len(y) ) * sum([ loss( y_pred, y_true ) ])
@classmethod
def calc_OOB_risk( cls, train, test, loss, c=1, g=10 ):
'''Train a model on a dataset. Return a risk estimate.'''
m = cls.fit_model( train, c, g )
X, y = zip( *test )
pred = m.predict( X )
r = cls.calc_risk( pred, y, loss )
return r
@staticmethod
def calc_crossval_risk( dataset, body, k ):
'''Apply `body` to overlapping batches of the dataset.'''
risk =
for mask in create_kfold_mask( len( dataset ), k ):
tr = dataset[ ~ mask ]
te = dataset[ mask ]
r = body( train=tr, test=te )
risk.append( r )
return sum(risk) / len(risk)
def calc_hyperparams( self
, dataset
, c_grid=np.logspace( start=0, stop=2, num=50 ) ):
'''Perform a grid search in hyperparameter space.'''
risk =
for c in c_grid:
body = lambda train, test: self.calc_OOB_risk( train=train, test=test
, loss=self._loss, c=c, g=10 )
r = self.calc_crossval_risk( dataset, body, self._k )
risk.append( r )
best = c_grid[ np.argmax( risk ) ]
return best
And this is a sample run:
from sklearn.datasets import load_iris
iris = load_iris()
X, y = iris.data, iris.target
X_versi = X[:, :2]
y_versi = np.zeros(len(y))
y_versi[y == 1] = 1
ncv = NCV( X_versi, y_versi )
m = ncv.train()
print( m )
python python-3.x homework statistics
add a comment |Â
up vote
0
down vote
favorite
This is a homework assignment to implement nested cross validation. It seems to work fine (sometimes).
The imports are inside the class and some methods are static, because this code needs to be in the same source file with a lot of other stuff (Jupiter notebook) and this is my attempt at reducing visibility of names.
Here are some of my concerns, though I am probably completely overlooking the important parts:
Architecture - if all this was implemented as two nested loops, it would be several times shorter. Was that the more readable approach?
Dataset storage - my class accepts its datapoints and labels as two different arrays
X
andy
. Then any functions in sklearn again expect that format. But I store it internally as a zipped list for easy shuffling and masking.All these static methods seem out of place. I have declared them as such because they access only a minimal part of the class state.
This is the section of the notebook, relevant to this homework problem:
import numpy as np
def create_kfold_mask(num_samples, k):
masks =
fold_size = num_samples / k
for i in range(k):
mask = np.zeros(num_samples, dtype=bool)
mask[int(i*fold_size):int((i+1)*fold_size)] = True
masks.append(mask)
return masks
class NCV:
'''Nested Cross-Validation.'''
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
def __init__( self, X, y, loss=mean_squared_error, k=10 ):
self._all_data = np.array( list( zip( X, y ) ) )
np.random.shuffle( self._all_data )
self._loss = loss
# Number of groups in the inner loop.
self._k = k
def train( self ):
X, y = zip( *self._all_data )
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2 )
tr = np.array( list( zip( X_train, y_train ) ) )
c = self.calc_hyperparams( tr )
y_pred = self.fit_model( tr, c).predict( X_test )
print( 'OOB accuracy: ', metrics.accuracy_score( y_test, y_pred ) )
print( metrics.classification_report( y_test, y_pred, target_names=iris.target_names ) )
m = self.fit_model( data=self._all_data, c=c )
return m
@staticmethod
def fit_model( data, c, g=10 ):
m = SVC( gamma=g, C=c )
X, y = zip( *data )
m.fit( X, y )
return m
@staticmethod
def calc_risk( y_pred, y_true, loss ):
'''Empirical risk on a sample.'''
assert len( y_pred ) == len( y_true )
return ( 1 / len(y) ) * sum([ loss( y_pred, y_true ) ])
@classmethod
def calc_OOB_risk( cls, train, test, loss, c=1, g=10 ):
'''Train a model on a dataset. Return a risk estimate.'''
m = cls.fit_model( train, c, g )
X, y = zip( *test )
pred = m.predict( X )
r = cls.calc_risk( pred, y, loss )
return r
@staticmethod
def calc_crossval_risk( dataset, body, k ):
'''Apply `body` to overlapping batches of the dataset.'''
risk =
for mask in create_kfold_mask( len( dataset ), k ):
tr = dataset[ ~ mask ]
te = dataset[ mask ]
r = body( train=tr, test=te )
risk.append( r )
return sum(risk) / len(risk)
def calc_hyperparams( self
, dataset
, c_grid=np.logspace( start=0, stop=2, num=50 ) ):
'''Perform a grid search in hyperparameter space.'''
risk =
for c in c_grid:
body = lambda train, test: self.calc_OOB_risk( train=train, test=test
, loss=self._loss, c=c, g=10 )
r = self.calc_crossval_risk( dataset, body, self._k )
risk.append( r )
best = c_grid[ np.argmax( risk ) ]
return best
And this is a sample run:
from sklearn.datasets import load_iris
iris = load_iris()
X, y = iris.data, iris.target
X_versi = X[:, :2]
y_versi = np.zeros(len(y))
y_versi[y == 1] = 1
ncv = NCV( X_versi, y_versi )
m = ncv.train()
print( m )
python python-3.x homework statistics
1
Hi! "It seems to work fine (sometimes)." Does this mean that most of the time it doesn't work as expected?
â Mathias Ettinger
Jul 13 at 7:53
Also, what isSVC
? It is currently undefined, as far as I can tell.
â Graipher
Jul 13 at 9:46
add a comment |Â
up vote
0
down vote
favorite
up vote
0
down vote
favorite
This is a homework assignment to implement nested cross validation. It seems to work fine (sometimes).
The imports are inside the class and some methods are static, because this code needs to be in the same source file with a lot of other stuff (Jupiter notebook) and this is my attempt at reducing visibility of names.
Here are some of my concerns, though I am probably completely overlooking the important parts:
Architecture - if all this was implemented as two nested loops, it would be several times shorter. Was that the more readable approach?
Dataset storage - my class accepts its datapoints and labels as two different arrays
X
andy
. Then any functions in sklearn again expect that format. But I store it internally as a zipped list for easy shuffling and masking.All these static methods seem out of place. I have declared them as such because they access only a minimal part of the class state.
This is the section of the notebook, relevant to this homework problem:
import numpy as np
def create_kfold_mask(num_samples, k):
masks =
fold_size = num_samples / k
for i in range(k):
mask = np.zeros(num_samples, dtype=bool)
mask[int(i*fold_size):int((i+1)*fold_size)] = True
masks.append(mask)
return masks
class NCV:
'''Nested Cross-Validation.'''
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
def __init__( self, X, y, loss=mean_squared_error, k=10 ):
self._all_data = np.array( list( zip( X, y ) ) )
np.random.shuffle( self._all_data )
self._loss = loss
# Number of groups in the inner loop.
self._k = k
def train( self ):
X, y = zip( *self._all_data )
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2 )
tr = np.array( list( zip( X_train, y_train ) ) )
c = self.calc_hyperparams( tr )
y_pred = self.fit_model( tr, c).predict( X_test )
print( 'OOB accuracy: ', metrics.accuracy_score( y_test, y_pred ) )
print( metrics.classification_report( y_test, y_pred, target_names=iris.target_names ) )
m = self.fit_model( data=self._all_data, c=c )
return m
@staticmethod
def fit_model( data, c, g=10 ):
m = SVC( gamma=g, C=c )
X, y = zip( *data )
m.fit( X, y )
return m
@staticmethod
def calc_risk( y_pred, y_true, loss ):
'''Empirical risk on a sample.'''
assert len( y_pred ) == len( y_true )
return ( 1 / len(y) ) * sum([ loss( y_pred, y_true ) ])
@classmethod
def calc_OOB_risk( cls, train, test, loss, c=1, g=10 ):
'''Train a model on a dataset. Return a risk estimate.'''
m = cls.fit_model( train, c, g )
X, y = zip( *test )
pred = m.predict( X )
r = cls.calc_risk( pred, y, loss )
return r
@staticmethod
def calc_crossval_risk( dataset, body, k ):
'''Apply `body` to overlapping batches of the dataset.'''
risk =
for mask in create_kfold_mask( len( dataset ), k ):
tr = dataset[ ~ mask ]
te = dataset[ mask ]
r = body( train=tr, test=te )
risk.append( r )
return sum(risk) / len(risk)
def calc_hyperparams( self
, dataset
, c_grid=np.logspace( start=0, stop=2, num=50 ) ):
'''Perform a grid search in hyperparameter space.'''
risk =
for c in c_grid:
body = lambda train, test: self.calc_OOB_risk( train=train, test=test
, loss=self._loss, c=c, g=10 )
r = self.calc_crossval_risk( dataset, body, self._k )
risk.append( r )
best = c_grid[ np.argmax( risk ) ]
return best
And this is a sample run:
from sklearn.datasets import load_iris
iris = load_iris()
X, y = iris.data, iris.target
X_versi = X[:, :2]
y_versi = np.zeros(len(y))
y_versi[y == 1] = 1
ncv = NCV( X_versi, y_versi )
m = ncv.train()
print( m )
python python-3.x homework statistics
This is a homework assignment to implement nested cross validation. It seems to work fine (sometimes).
The imports are inside the class and some methods are static, because this code needs to be in the same source file with a lot of other stuff (Jupiter notebook) and this is my attempt at reducing visibility of names.
Here are some of my concerns, though I am probably completely overlooking the important parts:
Architecture - if all this was implemented as two nested loops, it would be several times shorter. Was that the more readable approach?
Dataset storage - my class accepts its datapoints and labels as two different arrays
X
andy
. Then any functions in sklearn again expect that format. But I store it internally as a zipped list for easy shuffling and masking.All these static methods seem out of place. I have declared them as such because they access only a minimal part of the class state.
This is the section of the notebook, relevant to this homework problem:
import numpy as np
def create_kfold_mask(num_samples, k):
masks =
fold_size = num_samples / k
for i in range(k):
mask = np.zeros(num_samples, dtype=bool)
mask[int(i*fold_size):int((i+1)*fold_size)] = True
masks.append(mask)
return masks
class NCV:
'''Nested Cross-Validation.'''
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
def __init__( self, X, y, loss=mean_squared_error, k=10 ):
self._all_data = np.array( list( zip( X, y ) ) )
np.random.shuffle( self._all_data )
self._loss = loss
# Number of groups in the inner loop.
self._k = k
def train( self ):
X, y = zip( *self._all_data )
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2 )
tr = np.array( list( zip( X_train, y_train ) ) )
c = self.calc_hyperparams( tr )
y_pred = self.fit_model( tr, c).predict( X_test )
print( 'OOB accuracy: ', metrics.accuracy_score( y_test, y_pred ) )
print( metrics.classification_report( y_test, y_pred, target_names=iris.target_names ) )
m = self.fit_model( data=self._all_data, c=c )
return m
@staticmethod
def fit_model( data, c, g=10 ):
m = SVC( gamma=g, C=c )
X, y = zip( *data )
m.fit( X, y )
return m
@staticmethod
def calc_risk( y_pred, y_true, loss ):
'''Empirical risk on a sample.'''
assert len( y_pred ) == len( y_true )
return ( 1 / len(y) ) * sum([ loss( y_pred, y_true ) ])
@classmethod
def calc_OOB_risk( cls, train, test, loss, c=1, g=10 ):
'''Train a model on a dataset. Return a risk estimate.'''
m = cls.fit_model( train, c, g )
X, y = zip( *test )
pred = m.predict( X )
r = cls.calc_risk( pred, y, loss )
return r
@staticmethod
def calc_crossval_risk( dataset, body, k ):
'''Apply `body` to overlapping batches of the dataset.'''
risk =
for mask in create_kfold_mask( len( dataset ), k ):
tr = dataset[ ~ mask ]
te = dataset[ mask ]
r = body( train=tr, test=te )
risk.append( r )
return sum(risk) / len(risk)
def calc_hyperparams( self
, dataset
, c_grid=np.logspace( start=0, stop=2, num=50 ) ):
'''Perform a grid search in hyperparameter space.'''
risk =
for c in c_grid:
body = lambda train, test: self.calc_OOB_risk( train=train, test=test
, loss=self._loss, c=c, g=10 )
r = self.calc_crossval_risk( dataset, body, self._k )
risk.append( r )
best = c_grid[ np.argmax( risk ) ]
return best
And this is a sample run:
from sklearn.datasets import load_iris
iris = load_iris()
X, y = iris.data, iris.target
X_versi = X[:, :2]
y_versi = np.zeros(len(y))
y_versi[y == 1] = 1
ncv = NCV( X_versi, y_versi )
m = ncv.train()
print( m )
python python-3.x homework statistics
edited Jul 13 at 18:03
200_success
123k14143399
123k14143399
asked Jul 13 at 7:09
Vorac
25517
25517
1
Hi! "It seems to work fine (sometimes)." Does this mean that most of the time it doesn't work as expected?
â Mathias Ettinger
Jul 13 at 7:53
Also, what isSVC
? It is currently undefined, as far as I can tell.
â Graipher
Jul 13 at 9:46
add a comment |Â
1
Hi! "It seems to work fine (sometimes)." Does this mean that most of the time it doesn't work as expected?
â Mathias Ettinger
Jul 13 at 7:53
Also, what isSVC
? It is currently undefined, as far as I can tell.
â Graipher
Jul 13 at 9:46
1
1
Hi! "It seems to work fine (sometimes)." Does this mean that most of the time it doesn't work as expected?
â Mathias Ettinger
Jul 13 at 7:53
Hi! "It seems to work fine (sometimes)." Does this mean that most of the time it doesn't work as expected?
â Mathias Ettinger
Jul 13 at 7:53
Also, what is
SVC
? It is currently undefined, as far as I can tell.â Graipher
Jul 13 at 9:46
Also, what is
SVC
? It is currently undefined, as far as I can tell.â Graipher
Jul 13 at 9:46
add a comment |Â
active
oldest
votes
active
oldest
votes
active
oldest
votes
active
oldest
votes
active
oldest
votes
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f198406%2fnested-cross-validation%23new-answer', 'question_page');
);
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
1
Hi! "It seems to work fine (sometimes)." Does this mean that most of the time it doesn't work as expected?
â Mathias Ettinger
Jul 13 at 7:53
Also, what is
SVC
? It is currently undefined, as far as I can tell.â Graipher
Jul 13 at 9:46