Calculation of clustering metric in Python
Clash Royale CLAN TAG#URR8PPP
.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty margin-bottom:0;
up vote
2
down vote
favorite
When I try to run the following code for arrays with more than 10k elements, it takes hours and I don't know how to make it in the most efficient way.
Any ideas?
from scipy.stats import entropy as KL
import numpy as np
def dis(di, dj):
    """Jensen-Shannon divergence between two distributions.

    Computes the midpoint distribution m = (di + dj) / 2 and averages
    the KL divergences of each input against m, which makes the result
    symmetric in its arguments (unlike raw KL).
    """
    p = np.asanyarray(di)
    q = np.asanyarray(dj)
    midpoint = 0.5 * (p + q)
    return 0.5 * (KL(p, midpoint) + KL(q, midpoint))
def Intra_Cluster_dist(C):
    """Mean intra-cluster Jensen-Shannon distance.

    For each cluster, averages dis(di, dj) over all ordered pairs of
    distinct points (i != j, normalised by n*(n-1)), then averages the
    per-cluster values over the K clusters.

    C: iterable of clusters; each cluster is an (n_i, d) array-like of
    distributions, with n_i >= 2.

    Fixes over the original:
    - a stray, un-commented expression referencing `dj` outside any
      loop raised NameError at runtime;
    - self-exclusion was done by VALUE (`cluster == di`), which also
      dropped duplicate rows and undercounted pairs; exclusion is now
      by index, matching the mathematical sum over j != i;
    - clusters are converted individually, so ragged cluster sizes do
      not break np.asanyarray on modern numpy.
    """
    clusters = [np.asanyarray(cluster) for cluster in C]
    K = len(clusters)
    total = 0.0
    for cluster in clusters:
        n = cluster.shape[0]
        # dis() is symmetric, so sum each unordered pair once and
        # double it to get the ordered-pair sum — halves the work.
        pair_sum = 0.0
        for i in range(n):
            for j in range(i + 1, n):
                pair_sum += dis(cluster[i], cluster[j])
        total += (2.0 * 2.0 * pair_sum) / (n * (n - 1))
    return total / K
def Inter_Cluster_dist(C):
    """Mean inter-cluster Jensen-Shannon distance.

    Sums dis(di, dj) / (|A| * |B|) over every ordered pair of distinct
    clusters (A, B) and every point pair drawn from them, normalised by
    the K*(K-1) ordered cluster pairs.

    C: sequence of K >= 2 clusters (array-likes of distributions).

    Fix over the original: `other_clusters = C[:]` followed by
    `other_clusters.remove(cluster)` breaks when C holds numpy arrays —
    list.remove compares elements with `==`, which is elementwise for
    arrays and raises "truth value of an array is ambiguous" for every
    cluster after the first. Distinct clusters are now paired by index.
    """
    K = len(C)
    factor = 1.0 / (K * (K - 1))
    total = 0.0
    for a, cluster in enumerate(C):
        size_a = float(len(cluster))
        for b, other in enumerate(C):
            if a == b:
                continue  # skip the cluster itself, by position not value
            norm = size_a * float(len(other))
            for di in cluster:
                for dj in other:
                    total += dis(di, dj) / norm
    return factor * total
def H_score(C):
    """Clustering quality score: intra- over inter-cluster distance.

    Smaller values indicate tighter, better-separated clusters.
    """
    intra = Intra_Cluster_dist(C)
    inter = Inter_Cluster_dist(C)
    return float(intra) / float(inter)
Link to example data:
The data file is a xlsx. It has three columns: label (cluster label), feature_1 and feature_2.
The process to get C from the file and get the functions working should be something like this:
import pandas as pd
import numpy as np
# Load the example data; expected columns: 'labels' (cluster id),
# 'feature_1', 'feature_2'.
df = pd.read_excel('example_data.xlsx')
# For each cluster label, build an array of [feature_1, feature_2] rows.
# NOTE(review): apply(..., axis=1) yields a Series of lists, so
# asanyarray produces an object array of lists — presumably intended,
# since the distance functions index it row by row; confirm dtype.
c1 = np.asanyarray(df[df['labels'] == 0].apply(lambda row: ([row['feature_1'], row['feature_2']]), axis=1))
c2 = np.asanyarray(df[df['labels'] == 1].apply(lambda row: ([row['feature_1'], row['feature_2']]), axis=1))
C = [c1,c2]
# Ratio of intra- to inter-cluster distance for the two clusters.
H_score(C)
python performance numpy clustering
add a comment |
up vote
2
down vote
favorite
When I try to run the following code for arrays with more than 10k elements, it takes hours and I don't know how to make it in the most efficient way.
Any ideas?
from scipy.stats import entropy as KL
import numpy as np
def dis(di,dj):
di = np.asanyarray(di)
dj = np.asanyarray(dj)
m = 0.5 * (di+dj)
kl1 = KL(di,m)
kl2 = KL(dj,m)
return 0.5*(kl1+kl2)
def Intra_Cluster_dist(C):
C = np.asanyarray(C)
K = float(C.shape[0])
factor1 = 1.0/float(K)
total_sum = 0.0
for cluster in C:
cluster = np.asanyarray(cluster)
below1 = float(cluster.shape[0])
below2 = float(below1 - 1)
sub_sum = 0.0
for di in cluster:
#others = cluster[:]
#others.remove(di)
others = cluster[np.logical_not((cluster == np.array(di)).all(axis=1))]
#for dj in others:
# sub_sum = sub_sum +
(2*float(dis(di,dj)))/(float(below1)*float(below2))
sub_sum = sub_sum + np.fromiter((((2*float(dis(di,dj)))/(float(below1)*float(below2))) for dj in others), dtype=float).sum()
total_sum = total_sum + sub_sum
return float(factor1 * total_sum)
def Inter_Cluster_dist(C):
K = float(len(C))
factor1 = float((1/(K*(K-1))))
total_sum = 0.0
for cluster in C:
sub_sum = 0.0
other_clusters = C[:]
other_clusters.remove(cluster)
below1= float(len(cluster))
for other in other_clusters:
below2= float(len(other))
for di in cluster:
for dj in other:
sub_sum = sub_sum + (float((dis(di, dj)))/float((below1*below2)))
total_sum = total_sum + sub_sum
return float(factor1 * total_sum )
def H_score(C):
return float(Intra_Cluster_dist(C))/float(Inter_Cluster_dist(C))
Link to example data:
The data file is a xlsx. It has three columns: label (cluster label), feature_1 and feature_2.
The process to get C from the file and get the functions working should be something like this:
import pandas as pd
import numpy as np
df = pd.read_excel('example_data.xlsx')
c1 = np.asanyarray(df[df['labels'] == 0].apply(lambda row: ([row['feature_1'], row['feature_2']]), axis=1))
c2 = np.asanyarray(df[df['labels'] == 1].apply(lambda row: ([row['feature_1'], row['feature_2']]), axis=1))
C = [c1,c2]
H_score(C)
python performance numpy clustering
2
Can you give us example data demonstrating the performance problem? (Or link to it, if too large to add to the post?)
– Gareth Rees
Feb 6 at 9:18
It may help speed-wise to incorporate numpy array functionality in place of nested for-loops (or at least comprehensions).
â MPath
Feb 7 at 8:55
Could you post the data in a format other than "pickle", please? Sorry to be a nuisance, but the "pickle" format can run arbitrary Python code when unpickling, and so it is a security risk.
â Gareth Rees
Feb 9 at 11:00
Don't worry, I understand. I've updated the link to an excel file with three columns: Labels (cluster label) , Feature 1, Feature 2.
â kibs
Feb 9 at 16:08
Mikey, that sounds promising. Could you please provide an example?
â kibs
Feb 9 at 16:23
add a comment |Â
up vote
2
down vote
favorite
up vote
2
down vote
favorite
When I try to run the following code for arrays with more than 10k elements, it takes hours and I don't know how to make it in the most efficient way.
Any ideas?
from scipy.stats import entropy as KL
import numpy as np
def dis(di,dj):
di = np.asanyarray(di)
dj = np.asanyarray(dj)
m = 0.5 * (di+dj)
kl1 = KL(di,m)
kl2 = KL(dj,m)
return 0.5*(kl1+kl2)
def Intra_Cluster_dist(C):
C = np.asanyarray(C)
K = float(C.shape[0])
factor1 = 1.0/float(K)
total_sum = 0.0
for cluster in C:
cluster = np.asanyarray(cluster)
below1 = float(cluster.shape[0])
below2 = float(below1 - 1)
sub_sum = 0.0
for di in cluster:
#others = cluster[:]
#others.remove(di)
others = cluster[np.logical_not((cluster == np.array(di)).all(axis=1))]
#for dj in others:
# sub_sum = sub_sum +
(2*float(dis(di,dj)))/(float(below1)*float(below2))
sub_sum = sub_sum + np.fromiter((((2*float(dis(di,dj)))/(float(below1)*float(below2))) for dj in others), dtype=float).sum()
total_sum = total_sum + sub_sum
return float(factor1 * total_sum)
def Inter_Cluster_dist(C):
K = float(len(C))
factor1 = float((1/(K*(K-1))))
total_sum = 0.0
for cluster in C:
sub_sum = 0.0
other_clusters = C[:]
other_clusters.remove(cluster)
below1= float(len(cluster))
for other in other_clusters:
below2= float(len(other))
for di in cluster:
for dj in other:
sub_sum = sub_sum + (float((dis(di, dj)))/float((below1*below2)))
total_sum = total_sum + sub_sum
return float(factor1 * total_sum )
def H_score(C):
return float(Intra_Cluster_dist(C))/float(Inter_Cluster_dist(C))
Link to example data:
The data file is a xlsx. It has three columns: label (cluster label), feature_1 and feature_2.
The process to get C from the file and get the functions working should be something like this:
import pandas as pd
import numpy as np
df = pd.read_excel('example_data.xlsx')
c1 = np.asanyarray(df[df['labels'] == 0].apply(lambda row: ([row['feature_1'], row['feature_2']]), axis=1))
c2 = np.asanyarray(df[df['labels'] == 1].apply(lambda row: ([row['feature_1'], row['feature_2']]), axis=1))
C = [c1,c2]
H_score(C)
python performance numpy clustering
When I try to run the following code for arrays with more than 10k elements, it takes hours and I don't know how to make it in the most efficient way.
Any ideas?
from scipy.stats import entropy as KL
import numpy as np
def dis(di,dj):
di = np.asanyarray(di)
dj = np.asanyarray(dj)
m = 0.5 * (di+dj)
kl1 = KL(di,m)
kl2 = KL(dj,m)
return 0.5*(kl1+kl2)
def Intra_Cluster_dist(C):
C = np.asanyarray(C)
K = float(C.shape[0])
factor1 = 1.0/float(K)
total_sum = 0.0
for cluster in C:
cluster = np.asanyarray(cluster)
below1 = float(cluster.shape[0])
below2 = float(below1 - 1)
sub_sum = 0.0
for di in cluster:
#others = cluster[:]
#others.remove(di)
others = cluster[np.logical_not((cluster == np.array(di)).all(axis=1))]
#for dj in others:
# sub_sum = sub_sum +
(2*float(dis(di,dj)))/(float(below1)*float(below2))
sub_sum = sub_sum + np.fromiter((((2*float(dis(di,dj)))/(float(below1)*float(below2))) for dj in others), dtype=float).sum()
total_sum = total_sum + sub_sum
return float(factor1 * total_sum)
def Inter_Cluster_dist(C):
K = float(len(C))
factor1 = float((1/(K*(K-1))))
total_sum = 0.0
for cluster in C:
sub_sum = 0.0
other_clusters = C[:]
other_clusters.remove(cluster)
below1= float(len(cluster))
for other in other_clusters:
below2= float(len(other))
for di in cluster:
for dj in other:
sub_sum = sub_sum + (float((dis(di, dj)))/float((below1*below2)))
total_sum = total_sum + sub_sum
return float(factor1 * total_sum )
def H_score(C):
return float(Intra_Cluster_dist(C))/float(Inter_Cluster_dist(C))
Link to example data:
The data file is a xlsx. It has three columns: label (cluster label), feature_1 and feature_2.
The process to get C from the file and get the functions working should be something like this:
import pandas as pd
import numpy as np
df = pd.read_excel('example_data.xlsx')
c1 = np.asanyarray(df[df['labels'] == 0].apply(lambda row: ([row['feature_1'], row['feature_2']]), axis=1))
c2 = np.asanyarray(df[df['labels'] == 1].apply(lambda row: ([row['feature_1'], row['feature_2']]), axis=1))
C = [c1,c2]
H_score(C)
python performance numpy clustering
edited Feb 27 at 15:45
Billal BEGUERADJ
1
1
asked Feb 5 at 21:08
kibs
112
112
2
Can you give us example data demonstrating the performance problem? (Or link to it, if too large to add to the post?)
â Gareth Rees
Feb 6 at 9:18
It may help speed-wise to incorporate numpy array functionality in place of nested for-loops (or at least comprehensions).
â MPath
Feb 7 at 8:55
Could you post the data in a format other than "pickle", please? Sorry to be a nuisance, but the "pickle" format can run arbitrary Python code when unpickling, and so it is a security risk.
â Gareth Rees
Feb 9 at 11:00
Don't worry, I understand. I've updated the link to an excel file with three columns: Labels (cluster label) , Feature 1, Feature 2.
â kibs
Feb 9 at 16:08
Mikey, that sounds promising. Could you please provide an example?
â kibs
Feb 9 at 16:23
add a comment |Â
2
Can you give us example data demonstrating the performance problem? (Or link to it, if too large to add to the post?)
â Gareth Rees
Feb 6 at 9:18
It may help speed-wise to incorporate numpy array functionality in place of nested for-loops (or at least comprehensions).
â MPath
Feb 7 at 8:55
Could you post the data in a format other than "pickle", please? Sorry to be a nuisance, but the "pickle" format can run arbitrary Python code when unpickling, and so it is a security risk.
â Gareth Rees
Feb 9 at 11:00
Don't worry, I understand. I've updated the link to an excel file with three columns: Labels (cluster label) , Feature 1, Feature 2.
â kibs
Feb 9 at 16:08
Mikey, that sounds promising. Could you please provide an example?
â kibs
Feb 9 at 16:23
2
2
Can you give us example data demonstrating the performance problem? (Or link to it, if too large to add to the post?)
â Gareth Rees
Feb 6 at 9:18
Can you give us example data demonstrating the performance problem? (Or link to it, if too large to add to the post?)
â Gareth Rees
Feb 6 at 9:18
It may help speed-wise to incorporate numpy array functionality in place of nested for-loops (or at least comprehensions).
â MPath
Feb 7 at 8:55
It may help speed-wise to incorporate numpy array functionality in place of nested for-loops (or at least comprehensions).
â MPath
Feb 7 at 8:55
Could you post the data in a format other than "pickle", please? Sorry to be a nuisance, but the "pickle" format can run arbitrary Python code when unpickling, and so it is a security risk.
â Gareth Rees
Feb 9 at 11:00
Could you post the data in a format other than "pickle", please? Sorry to be a nuisance, but the "pickle" format can run arbitrary Python code when unpickling, and so it is a security risk.
â Gareth Rees
Feb 9 at 11:00
Don't worry, I understand. I've updated the link to an excel file with three columns: Labels (cluster label) , Feature 1, Feature 2.
â kibs
Feb 9 at 16:08
Don't worry, I understand. I've updated the link to an excel file with three columns: Labels (cluster label) , Feature 1, Feature 2.
â kibs
Feb 9 at 16:08
Mikey, that sounds promising. Could you please provide an example?
â kibs
Feb 9 at 16:23
Mikey, that sounds promising. Could you please provide an example?
â kibs
Feb 9 at 16:23
add a comment |Â
active
oldest
votes
active
oldest
votes
active
oldest
votes
active
oldest
votes
active
oldest
votes
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f186870%2fcalculation-of-clustering-metric-in-python%23new-answer', 'question_page');
);
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
2
Can you give us example data demonstrating the performance problem? (Or link to it, if too large to add to the post?)
â Gareth Rees
Feb 6 at 9:18
It may help speed-wise to incorporate numpy array functionality in place of nested for-loops (or at least comprehensions).
â MPath
Feb 7 at 8:55
Could you post the data in a format other than "pickle", please? Sorry to be a nuisance, but the "pickle" format can run arbitrary Python code when unpickling, and so it is a security risk.
â Gareth Rees
Feb 9 at 11:00
Don't worry, I understand. I've updated the link to an excel file with three columns: Labels (cluster label) , Feature 1, Feature 2.
â kibs
Feb 9 at 16:08
Mikey, that sounds promising. Could you please provide an example?
â kibs
Feb 9 at 16:23