Calculation of clustering metric in Python

The name of the pictureThe name of the pictureThe name of the pictureClash Royale CLAN TAG#URR8PPP





.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty margin-bottom:0;







up vote
2
down vote

favorite












When I try to run the following code for arrays with more than 10k elements, it takes hours and I don't know how to make it in the most efficient way.



Any ideas?



from scipy.stats import entropy as KL
import numpy as np


def dis(di, dj):
    """Jensen-Shannon divergence between two distributions.

    scipy's ``entropy`` normalises its inputs, so ``di`` and ``dj`` need
    only be non-negative weight vectors of equal length.  Returns 0.0 for
    identical inputs.
    """
    di = np.asanyarray(di)
    dj = np.asanyarray(dj)
    m = 0.5 * (di + dj)  # midpoint distribution
    return 0.5 * (KL(di, m) + KL(dj, m))


def Intra_Cluster_dist(C):
    """Average over clusters of the mean pairwise ``dis`` inside each cluster.

    ``C`` is a sequence of clusters; each cluster is a 2-D array whose rows
    are distributions.  Singleton clusters contribute 0 (their pair set is
    empty), matching the original code.

    Fixes vs. the original:
    - removed a stray uncommented leftover line that referenced ``dj``
      before assignment (NameError at runtime);
    - iterate over index pairs instead of masking out rows equal to ``di``
      by value.  Equivalent because ``dis(x, x) == 0`` — pairs of identical
      rows never contribute — and it avoids building a boolean mask per
      point.
    """
    clusters = [np.asanyarray(cluster) for cluster in C]
    total = 0.0
    for cluster in clusters:
        n = cluster.shape[0]
        if n < 2:
            continue  # no pairs to average; original summed nothing here too
        denom = float(n * (n - 1))
        for i in range(n):
            for j in range(n):
                if i != j:
                    total += 2.0 * dis(cluster[i], cluster[j]) / denom
    return total / len(clusters)


def Inter_Cluster_dist(C):
    """Mean ``dis`` between points belonging to distinct clusters.

    Fix vs. the original: it copied ``C`` and called ``.remove(cluster)``,
    which raises ValueError for numpy-array clusters (truth value of an
    element-wise comparison is ambiguous).  We skip the current cluster by
    index instead.
    """
    K = len(C)
    factor = 1.0 / (K * (K - 1))
    total = 0.0
    for a, cluster in enumerate(C):
        na = len(cluster)
        for b, other in enumerate(C):
            if b == a:
                continue  # only distinct cluster pairs
            weight = 1.0 / (na * len(other))
            for di in cluster:
                for dj in other:
                    total += dis(di, dj) * weight
    return factor * total


def H_score(C):
    """Ratio of intra- to inter-cluster distance; lower means tighter clusters."""
    return Intra_Cluster_dist(C) / Inter_Cluster_dist(C)


Link to example data:



The data file is an xlsx. It has three columns: label (cluster label), feature_1, and feature_2.



The process to get C from the file and get the functions working should be something like this:



# Example of building the cluster list C expected by H_score from the
# spreadsheet (columns: 'labels', 'feature_1', 'feature_2').
import pandas as pd
import numpy as np
df = pd.read_excel('example_data.xlsx')
# One object array of [feature_1, feature_2] pairs per cluster label.
# NOTE(review): row-wise .apply returns a Series of Python lists; presumably
# df[df['labels'] == 0][['feature_1', 'feature_2']].values is the faster
# equivalent — verify before swapping in.
c1 = np.asanyarray(df[df['labels'] == 0].apply(lambda row: ([row['feature_1'], row['feature_2']]), axis=1))
c2 = np.asanyarray(df[df['labels'] == 1].apply(lambda row: ([row['feature_1'], row['feature_2']]), axis=1))
C = [c1,c2]
# Compute the clustering metric for the two clusters.
H_score(C)






share|improve this question

















  • 2




    Can you give us example data demonstrating the performance problem? (Or link to it, if too large to add to the post?)
    – Gareth Rees
    Feb 6 at 9:18










  • It may help speed-wise to incorporate numpy array functionality in place of nested for-loops (or at least comprehensions).
    – MPath
    Feb 7 at 8:55










  • Could you post the data in a format other than "pickle", please? Sorry to be a nuisance, but the "pickle" format can run arbitrary Python code when unpickling, and so it is a security risk.
    – Gareth Rees
    Feb 9 at 11:00










  • Don't worry, I understand. I've updated the link to an excel file with three columns: Labels (cluster label) , Feature 1, Feature 2.
    – kibs
    Feb 9 at 16:08










  • Mikey, that sounds promising. Could you please provide an example?
    – kibs
    Feb 9 at 16:23
















up vote
2
down vote

favorite












When I try to run the following code for arrays with more than 10k elements, it takes hours and I don't know how to make it in the most efficient way.



Any ideas?



from scipy.stats import entropy as KL
import numpy as np


def dis(di, dj):
    """Jensen-Shannon divergence between two distributions.

    scipy's ``entropy`` normalises its inputs, so ``di`` and ``dj`` need
    only be non-negative weight vectors of equal length.  Returns 0.0 for
    identical inputs.
    """
    di = np.asanyarray(di)
    dj = np.asanyarray(dj)
    m = 0.5 * (di + dj)  # midpoint distribution
    return 0.5 * (KL(di, m) + KL(dj, m))


def Intra_Cluster_dist(C):
    """Average over clusters of the mean pairwise ``dis`` inside each cluster.

    ``C`` is a sequence of clusters; each cluster is a 2-D array whose rows
    are distributions.  Singleton clusters contribute 0 (their pair set is
    empty), matching the original code.

    Fixes vs. the original:
    - removed a stray uncommented leftover line that referenced ``dj``
      before assignment (NameError at runtime);
    - iterate over index pairs instead of masking out rows equal to ``di``
      by value.  Equivalent because ``dis(x, x) == 0`` — pairs of identical
      rows never contribute — and it avoids building a boolean mask per
      point.
    """
    clusters = [np.asanyarray(cluster) for cluster in C]
    total = 0.0
    for cluster in clusters:
        n = cluster.shape[0]
        if n < 2:
            continue  # no pairs to average; original summed nothing here too
        denom = float(n * (n - 1))
        for i in range(n):
            for j in range(n):
                if i != j:
                    total += 2.0 * dis(cluster[i], cluster[j]) / denom
    return total / len(clusters)


def Inter_Cluster_dist(C):
    """Mean ``dis`` between points belonging to distinct clusters.

    Fix vs. the original: it copied ``C`` and called ``.remove(cluster)``,
    which raises ValueError for numpy-array clusters (truth value of an
    element-wise comparison is ambiguous).  We skip the current cluster by
    index instead.
    """
    K = len(C)
    factor = 1.0 / (K * (K - 1))
    total = 0.0
    for a, cluster in enumerate(C):
        na = len(cluster)
        for b, other in enumerate(C):
            if b == a:
                continue  # only distinct cluster pairs
            weight = 1.0 / (na * len(other))
            for di in cluster:
                for dj in other:
                    total += dis(di, dj) * weight
    return factor * total


def H_score(C):
    """Ratio of intra- to inter-cluster distance; lower means tighter clusters."""
    return Intra_Cluster_dist(C) / Inter_Cluster_dist(C)


Link to example data:



The data file is a xlsx. It has three columns: label (cluster label), feature_1 and feature_2.



The process to get C from the file and get the functions working should be something like this:



# Example of building the cluster list C expected by H_score from the
# spreadsheet (columns: 'labels', 'feature_1', 'feature_2').
import pandas as pd
import numpy as np
df = pd.read_excel('example_data.xlsx')
# One object array of [feature_1, feature_2] pairs per cluster label.
# NOTE(review): row-wise .apply returns a Series of Python lists; presumably
# df[df['labels'] == 0][['feature_1', 'feature_2']].values is the faster
# equivalent — verify before swapping in.
c1 = np.asanyarray(df[df['labels'] == 0].apply(lambda row: ([row['feature_1'], row['feature_2']]), axis=1))
c2 = np.asanyarray(df[df['labels'] == 1].apply(lambda row: ([row['feature_1'], row['feature_2']]), axis=1))
C = [c1,c2]
# Compute the clustering metric for the two clusters.
H_score(C)






share|improve this question

















  • 2




    Can you give us example data demonstrating the performance problem? (Or link to it, if too large to add to the post?)
    – Gareth Rees
    Feb 6 at 9:18










  • It may help speed-wise to incorporate numpy array functionality in place of nested for-loops (or at least comprehensions).
    – MPath
    Feb 7 at 8:55










  • Could you post the data in a format other than "pickle", please? Sorry to be a nuisance, but the "pickle" format can run arbitrary Python code when unpickling, and so it is a security risk.
    – Gareth Rees
    Feb 9 at 11:00










  • Don't worry, I understand. I've updated the link to an excel file with three columns: Labels (cluster label) , Feature 1, Feature 2.
    – kibs
    Feb 9 at 16:08










  • Mikey, that sounds promising. Could you please provide an example?
    – kibs
    Feb 9 at 16:23












up vote
2
down vote

favorite









up vote
2
down vote

favorite











When I try to run the following code for arrays with more than 10k elements, it takes hours and I don't know how to make it in the most efficient way.



Any ideas?



from scipy.stats import entropy as KL
import numpy as np


def dis(di, dj):
    """Jensen-Shannon divergence between two distributions.

    scipy's ``entropy`` normalises its inputs, so ``di`` and ``dj`` need
    only be non-negative weight vectors of equal length.  Returns 0.0 for
    identical inputs.
    """
    di = np.asanyarray(di)
    dj = np.asanyarray(dj)
    m = 0.5 * (di + dj)  # midpoint distribution
    return 0.5 * (KL(di, m) + KL(dj, m))


def Intra_Cluster_dist(C):
    """Average over clusters of the mean pairwise ``dis`` inside each cluster.

    ``C`` is a sequence of clusters; each cluster is a 2-D array whose rows
    are distributions.  Singleton clusters contribute 0 (their pair set is
    empty), matching the original code.

    Fixes vs. the original:
    - removed a stray uncommented leftover line that referenced ``dj``
      before assignment (NameError at runtime);
    - iterate over index pairs instead of masking out rows equal to ``di``
      by value.  Equivalent because ``dis(x, x) == 0`` — pairs of identical
      rows never contribute — and it avoids building a boolean mask per
      point.
    """
    clusters = [np.asanyarray(cluster) for cluster in C]
    total = 0.0
    for cluster in clusters:
        n = cluster.shape[0]
        if n < 2:
            continue  # no pairs to average; original summed nothing here too
        denom = float(n * (n - 1))
        for i in range(n):
            for j in range(n):
                if i != j:
                    total += 2.0 * dis(cluster[i], cluster[j]) / denom
    return total / len(clusters)


def Inter_Cluster_dist(C):
    """Mean ``dis`` between points belonging to distinct clusters.

    Fix vs. the original: it copied ``C`` and called ``.remove(cluster)``,
    which raises ValueError for numpy-array clusters (truth value of an
    element-wise comparison is ambiguous).  We skip the current cluster by
    index instead.
    """
    K = len(C)
    factor = 1.0 / (K * (K - 1))
    total = 0.0
    for a, cluster in enumerate(C):
        na = len(cluster)
        for b, other in enumerate(C):
            if b == a:
                continue  # only distinct cluster pairs
            weight = 1.0 / (na * len(other))
            for di in cluster:
                for dj in other:
                    total += dis(di, dj) * weight
    return factor * total


def H_score(C):
    """Ratio of intra- to inter-cluster distance; lower means tighter clusters."""
    return Intra_Cluster_dist(C) / Inter_Cluster_dist(C)


Link to example data:



The data file is a xlsx. It has three columns: label (cluster label), feature_1 and feature_2.



The process to get C from the file and get the functions working should be something like this:



# Example of building the cluster list C expected by H_score from the
# spreadsheet (columns: 'labels', 'feature_1', 'feature_2').
import pandas as pd
import numpy as np
df = pd.read_excel('example_data.xlsx')
# One object array of [feature_1, feature_2] pairs per cluster label.
# NOTE(review): row-wise .apply returns a Series of Python lists; presumably
# df[df['labels'] == 0][['feature_1', 'feature_2']].values is the faster
# equivalent — verify before swapping in.
c1 = np.asanyarray(df[df['labels'] == 0].apply(lambda row: ([row['feature_1'], row['feature_2']]), axis=1))
c2 = np.asanyarray(df[df['labels'] == 1].apply(lambda row: ([row['feature_1'], row['feature_2']]), axis=1))
C = [c1,c2]
# Compute the clustering metric for the two clusters.
H_score(C)






share|improve this question













When I try to run the following code for arrays with more than 10k elements, it takes hours and I don't know how to make it in the most efficient way.



Any ideas?



from scipy.stats import entropy as KL
import numpy as np


def dis(di, dj):
    """Jensen-Shannon divergence between two distributions.

    scipy's ``entropy`` normalises its inputs, so ``di`` and ``dj`` need
    only be non-negative weight vectors of equal length.  Returns 0.0 for
    identical inputs.
    """
    di = np.asanyarray(di)
    dj = np.asanyarray(dj)
    m = 0.5 * (di + dj)  # midpoint distribution
    return 0.5 * (KL(di, m) + KL(dj, m))


def Intra_Cluster_dist(C):
    """Average over clusters of the mean pairwise ``dis`` inside each cluster.

    ``C`` is a sequence of clusters; each cluster is a 2-D array whose rows
    are distributions.  Singleton clusters contribute 0 (their pair set is
    empty), matching the original code.

    Fixes vs. the original:
    - removed a stray uncommented leftover line that referenced ``dj``
      before assignment (NameError at runtime);
    - iterate over index pairs instead of masking out rows equal to ``di``
      by value.  Equivalent because ``dis(x, x) == 0`` — pairs of identical
      rows never contribute — and it avoids building a boolean mask per
      point.
    """
    clusters = [np.asanyarray(cluster) for cluster in C]
    total = 0.0
    for cluster in clusters:
        n = cluster.shape[0]
        if n < 2:
            continue  # no pairs to average; original summed nothing here too
        denom = float(n * (n - 1))
        for i in range(n):
            for j in range(n):
                if i != j:
                    total += 2.0 * dis(cluster[i], cluster[j]) / denom
    return total / len(clusters)


def Inter_Cluster_dist(C):
    """Mean ``dis`` between points belonging to distinct clusters.

    Fix vs. the original: it copied ``C`` and called ``.remove(cluster)``,
    which raises ValueError for numpy-array clusters (truth value of an
    element-wise comparison is ambiguous).  We skip the current cluster by
    index instead.
    """
    K = len(C)
    factor = 1.0 / (K * (K - 1))
    total = 0.0
    for a, cluster in enumerate(C):
        na = len(cluster)
        for b, other in enumerate(C):
            if b == a:
                continue  # only distinct cluster pairs
            weight = 1.0 / (na * len(other))
            for di in cluster:
                for dj in other:
                    total += dis(di, dj) * weight
    return factor * total


def H_score(C):
    """Ratio of intra- to inter-cluster distance; lower means tighter clusters."""
    return Intra_Cluster_dist(C) / Inter_Cluster_dist(C)


Link to example data:



The data file is a xlsx. It has three columns: label (cluster label), feature_1 and feature_2.



The process to get C from the file and get the functions working should be something like this:



# Example of building the cluster list C expected by H_score from the
# spreadsheet (columns: 'labels', 'feature_1', 'feature_2').
import pandas as pd
import numpy as np
df = pd.read_excel('example_data.xlsx')
# One object array of [feature_1, feature_2] pairs per cluster label.
# NOTE(review): row-wise .apply returns a Series of Python lists; presumably
# df[df['labels'] == 0][['feature_1', 'feature_2']].values is the faster
# equivalent — verify before swapping in.
c1 = np.asanyarray(df[df['labels'] == 0].apply(lambda row: ([row['feature_1'], row['feature_2']]), axis=1))
c2 = np.asanyarray(df[df['labels'] == 1].apply(lambda row: ([row['feature_1'], row['feature_2']]), axis=1))
C = [c1,c2]
# Compute the clustering metric for the two clusters.
H_score(C)








share|improve this question












share|improve this question




share|improve this question








edited Feb 27 at 15:45









Billal BEGUERADJ

1




1









asked Feb 5 at 21:08









kibs

112




112







  • 2




    Can you give us example data demonstrating the performance problem? (Or link to it, if too large to add to the post?)
    – Gareth Rees
    Feb 6 at 9:18










  • It may help speed-wise to incorporate numpy array functionality in place of nested for-loops (or at least comprehensions).
    – MPath
    Feb 7 at 8:55










  • Could you post the data in a format other than "pickle", please? Sorry to be a nuisance, but the "pickle" format can run arbitrary Python code when unpickling, and so it is a security risk.
    – Gareth Rees
    Feb 9 at 11:00










  • Don't worry, I understand. I've updated the link to an excel file with three columns: Labels (cluster label) , Feature 1, Feature 2.
    – kibs
    Feb 9 at 16:08










  • Mikey, that sounds promising. Could you please provide an example?
    – kibs
    Feb 9 at 16:23












  • 2




    Can you give us example data demonstrating the performance problem? (Or link to it, if too large to add to the post?)
    – Gareth Rees
    Feb 6 at 9:18










  • It may help speed-wise to incorporate numpy array functionality in place of nested for-loops (or at least comprehensions).
    – MPath
    Feb 7 at 8:55










  • Could you post the data in a format other than "pickle", please? Sorry to be a nuisance, but the "pickle" format can run arbitrary Python code when unpickling, and so it is a security risk.
    – Gareth Rees
    Feb 9 at 11:00










  • Don't worry, I understand. I've updated the link to an excel file with three columns: Labels (cluster label) , Feature 1, Feature 2.
    – kibs
    Feb 9 at 16:08










  • Mikey, that sounds promising. Could you please provide an example?
    – kibs
    Feb 9 at 16:23







2




2




Can you give us example data demonstrating the performance problem? (Or link to it, if too large to add to the post?)
– Gareth Rees
Feb 6 at 9:18




Can you give us example data demonstrating the performance problem? (Or link to it, if too large to add to the post?)
– Gareth Rees
Feb 6 at 9:18












It may help speed-wise to incorporate numpy array functionality in place of nested for-loops (or at least comprehensions).
– MPath
Feb 7 at 8:55




It may help speed-wise to incorporate numpy array functionality in place of nested for-loops (or at least comprehensions).
– MPath
Feb 7 at 8:55












Could you post the data in a format other than "pickle", please? Sorry to be a nuisance, but the "pickle" format can run arbitrary Python code when unpickling, and so it is a security risk.
– Gareth Rees
Feb 9 at 11:00




Could you post the data in a format other than "pickle", please? Sorry to be a nuisance, but the "pickle" format can run arbitrary Python code when unpickling, and so it is a security risk.
– Gareth Rees
Feb 9 at 11:00












Don't worry, I understand. I've updated the link to an excel file with three columns: Labels (cluster label) , Feature 1, Feature 2.
– kibs
Feb 9 at 16:08




Don't worry, I understand. I've updated the link to an excel file with three columns: Labels (cluster label) , Feature 1, Feature 2.
– kibs
Feb 9 at 16:08












Mikey, that sounds promising. Could you please provide an example?
– kibs
Feb 9 at 16:23




Mikey, that sounds promising. Could you please provide an example?
– kibs
Feb 9 at 16:23















active

oldest

votes











Your Answer




StackExchange.ifUsing("editor", function ()
return StackExchange.using("mathjaxEditing", function ()
StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix)
StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
);
);
, "mathjax-editing");

StackExchange.ifUsing("editor", function ()
StackExchange.using("externalEditor", function ()
StackExchange.using("snippets", function ()
StackExchange.snippets.init();
);
);
, "code-snippets");

StackExchange.ready(function()
var channelOptions =
tags: "".split(" "),
id: "196"
;
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function()
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled)
StackExchange.using("snippets", function()
createEditor();
);

else
createEditor();

);

function createEditor()
StackExchange.prepareEditor(
heartbeatType: 'answer',
convertImagesToLinks: false,
noModals: false,
showLowRepImageUploadWarning: true,
reputationToPostImages: null,
bindNavPrevention: true,
postfix: "",
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
);



);








 

draft saved


draft discarded


















StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f186870%2fcalculation-of-clustering-metric-in-python%23new-answer', 'question_page');

);

Post as a guest



































active

oldest

votes













active

oldest

votes









active

oldest

votes






active

oldest

votes










 

draft saved


draft discarded


























 


draft saved


draft discarded














StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f186870%2fcalculation-of-clustering-metric-in-python%23new-answer', 'question_page');

);

Post as a guest













































































Popular posts from this blog

Chat program with C++ and SFML

Function to Return a JSON Like Objects Using VBA Collections and Arrays

Will my employers contract hold up in court?