Calculation of clustering metric in Python

The name of the pictureThe name of the pictureThe name of the pictureClash Royale CLAN TAG#URR8PPP





.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty margin-bottom:0;







up vote
2
down vote

favorite












When I try to run the following code for arrays with more than 10k elements, it takes hours and I don't know how to make it in the most efficient way.



Any ideas?



from scipy.stats import entropy as KL
import numpy as np


def dis(di, dj):
    """Jensen-Shannon divergence between two distributions.

    scipy's ``entropy`` normalises its inputs, so ``di`` and ``dj`` need
    only be non-negative weight vectors of equal length.  Returns 0.0 for
    identical inputs.
    """
    di = np.asanyarray(di)
    dj = np.asanyarray(dj)
    m = 0.5 * (di + dj)  # midpoint distribution
    return 0.5 * (KL(di, m) + KL(dj, m))


def Intra_Cluster_dist(C):
    """Average over clusters of the mean pairwise ``dis`` inside each cluster.

    ``C`` is a sequence of clusters; each cluster is a 2-D array whose rows
    are distributions.  Singleton clusters contribute 0 (their pair set is
    empty), matching the original code.

    Fixes vs. the original:
    - removed a stray uncommented leftover line that referenced ``dj``
      before assignment (NameError at runtime);
    - iterate over index pairs instead of masking out rows equal to ``di``
      by value.  Equivalent because ``dis(x, x) == 0`` — pairs of identical
      rows never contribute — and it avoids building a boolean mask per
      point.
    """
    clusters = [np.asanyarray(cluster) for cluster in C]
    total = 0.0
    for cluster in clusters:
        n = cluster.shape[0]
        if n < 2:
            continue  # no pairs to average; original summed nothing here too
        denom = float(n * (n - 1))
        for i in range(n):
            for j in range(n):
                if i != j:
                    total += 2.0 * dis(cluster[i], cluster[j]) / denom
    return total / len(clusters)


def Inter_Cluster_dist(C):
    """Mean ``dis`` between points belonging to distinct clusters.

    Fix vs. the original: it copied ``C`` and called ``.remove(cluster)``,
    which raises ValueError for numpy-array clusters (truth value of an
    element-wise comparison is ambiguous).  We skip the current cluster by
    index instead.
    """
    K = len(C)
    factor = 1.0 / (K * (K - 1))
    total = 0.0
    for a, cluster in enumerate(C):
        na = len(cluster)
        for b, other in enumerate(C):
            if b == a:
                continue  # only distinct cluster pairs
            weight = 1.0 / (na * len(other))
            for di in cluster:
                for dj in other:
                    total += dis(di, dj) * weight
    return factor * total


def H_score(C):
    """Ratio of intra- to inter-cluster distance; lower means tighter clusters."""
    return Intra_Cluster_dist(C) / Inter_Cluster_dist(C)


Link to example data:



The data file is an xlsx. It has three columns: label (cluster label), feature_1, and feature_2.



The process to get C from the file and get the functions working should be something like this:



# Example of building the cluster list C expected by H_score from the
# spreadsheet (columns: 'labels', 'feature_1', 'feature_2').
import pandas as pd
import numpy as np
df = pd.read_excel('example_data.xlsx')
# One object array of [feature_1, feature_2] pairs per cluster label.
# NOTE(review): row-wise .apply returns a Series of Python lists; presumably
# df[df['labels'] == 0][['feature_1', 'feature_2']].values is the faster
# equivalent — verify before swapping in.
c1 = np.asanyarray(df[df['labels'] == 0].apply(lambda row: ([row['feature_1'], row['feature_2']]), axis=1))
c2 = np.asanyarray(df[df['labels'] == 1].apply(lambda row: ([row['feature_1'], row['feature_2']]), axis=1))
C = [c1,c2]
# Compute the clustering metric for the two clusters.
H_score(C)






share|improve this question

















  • 2




    Can you give us example data demonstrating the performance problem? (Or link to it, if too large to add to the post?)
    – Gareth Rees
    Feb 6 at 9:18










  • It may help speed-wise to incorporate numpy array functionality in place of nested for-loops (or at least comprehensions).
    – MPath
    Feb 7 at 8:55










  • Could you post the data in a format other than "pickle", please? Sorry to be a nuisance, but the "pickle" format can run arbitrary Python code when unpickling, and so it is a security risk.
    – Gareth Rees
    Feb 9 at 11:00










  • Don't worry, I understand. I've updated the link to an excel file with three columns: Labels (cluster label) , Feature 1, Feature 2.
    – kibs
    Feb 9 at 16:08










  • Mikey, that sounds promising. Could you please provide an example?
    – kibs
    Feb 9 at 16:23
















up vote
2
down vote

favorite












When I try to run the following code for arrays with more than 10k elements, it takes hours and I don't know how to make it in the most efficient way.



Any ideas?



from scipy.stats import entropy as KL
import numpy as np


def dis(di, dj):
    """Jensen-Shannon divergence between two distributions.

    scipy's ``entropy`` normalises its inputs, so ``di`` and ``dj`` need
    only be non-negative weight vectors of equal length.  Returns 0.0 for
    identical inputs.
    """
    di = np.asanyarray(di)
    dj = np.asanyarray(dj)
    m = 0.5 * (di + dj)  # midpoint distribution
    return 0.5 * (KL(di, m) + KL(dj, m))


def Intra_Cluster_dist(C):
    """Average over clusters of the mean pairwise ``dis`` inside each cluster.

    ``C`` is a sequence of clusters; each cluster is a 2-D array whose rows
    are distributions.  Singleton clusters contribute 0 (their pair set is
    empty), matching the original code.

    Fixes vs. the original:
    - removed a stray uncommented leftover line that referenced ``dj``
      before assignment (NameError at runtime);
    - iterate over index pairs instead of masking out rows equal to ``di``
      by value.  Equivalent because ``dis(x, x) == 0`` — pairs of identical
      rows never contribute — and it avoids building a boolean mask per
      point.
    """
    clusters = [np.asanyarray(cluster) for cluster in C]
    total = 0.0
    for cluster in clusters:
        n = cluster.shape[0]
        if n < 2:
            continue  # no pairs to average; original summed nothing here too
        denom = float(n * (n - 1))
        for i in range(n):
            for j in range(n):
                if i != j:
                    total += 2.0 * dis(cluster[i], cluster[j]) / denom
    return total / len(clusters)


def Inter_Cluster_dist(C):
    """Mean ``dis`` between points belonging to distinct clusters.

    Fix vs. the original: it copied ``C`` and called ``.remove(cluster)``,
    which raises ValueError for numpy-array clusters (truth value of an
    element-wise comparison is ambiguous).  We skip the current cluster by
    index instead.
    """
    K = len(C)
    factor = 1.0 / (K * (K - 1))
    total = 0.0
    for a, cluster in enumerate(C):
        na = len(cluster)
        for b, other in enumerate(C):
            if b == a:
                continue  # only distinct cluster pairs
            weight = 1.0 / (na * len(other))
            for di in cluster:
                for dj in other:
                    total += dis(di, dj) * weight
    return factor * total


def H_score(C):
    """Ratio of intra- to inter-cluster distance; lower means tighter clusters."""
    return Intra_Cluster_dist(C) / Inter_Cluster_dist(C)


Link to example data:



The data file is a xlsx. It has three columns: label (cluster label), feature_1 and feature_2.



The process to get C from the file and get the functions working should be something like this:



# Example of building the cluster list C expected by H_score from the
# spreadsheet (columns: 'labels', 'feature_1', 'feature_2').
import pandas as pd
import numpy as np
df = pd.read_excel('example_data.xlsx')
# One object array of [feature_1, feature_2] pairs per cluster label.
# NOTE(review): row-wise .apply returns a Series of Python lists; presumably
# df[df['labels'] == 0][['feature_1', 'feature_2']].values is the faster
# equivalent — verify before swapping in.
c1 = np.asanyarray(df[df['labels'] == 0].apply(lambda row: ([row['feature_1'], row['feature_2']]), axis=1))
c2 = np.asanyarray(df[df['labels'] == 1].apply(lambda row: ([row['feature_1'], row['feature_2']]), axis=1))
C = [c1,c2]
# Compute the clustering metric for the two clusters.
H_score(C)






share|improve this question

















  • 2




    Can you give us example data demonstrating the performance problem? (Or link to it, if too large to add to the post?)
    – Gareth Rees
    Feb 6 at 9:18










  • It may help speed-wise to incorporate numpy array functionality in place of nested for-loops (or at least comprehensions).
    – MPath
    Feb 7 at 8:55










  • Could you post the data in a format other than "pickle", please? Sorry to be a nuisance, but the "pickle" format can run arbitrary Python code when unpickling, and so it is a security risk.
    – Gareth Rees
    Feb 9 at 11:00










  • Don't worry, I understand. I've updated the link to an excel file with three columns: Labels (cluster label) , Feature 1, Feature 2.
    – kibs
    Feb 9 at 16:08










  • Mikey, that sounds promising. Could you please provide an example?
    – kibs
    Feb 9 at 16:23












up vote
2
down vote

favorite









up vote
2
down vote

favorite











When I try to run the following code for arrays with more than 10k elements, it takes hours and I don't know how to make it in the most efficient way.



Any ideas?



from scipy.stats import entropy as KL
import numpy as np


def dis(di, dj):
    """Jensen-Shannon divergence between two distributions.

    scipy's ``entropy`` normalises its inputs, so ``di`` and ``dj`` need
    only be non-negative weight vectors of equal length.  Returns 0.0 for
    identical inputs.
    """
    di = np.asanyarray(di)
    dj = np.asanyarray(dj)
    m = 0.5 * (di + dj)  # midpoint distribution
    return 0.5 * (KL(di, m) + KL(dj, m))


def Intra_Cluster_dist(C):
    """Average over clusters of the mean pairwise ``dis`` inside each cluster.

    ``C`` is a sequence of clusters; each cluster is a 2-D array whose rows
    are distributions.  Singleton clusters contribute 0 (their pair set is
    empty), matching the original code.

    Fixes vs. the original:
    - removed a stray uncommented leftover line that referenced ``dj``
      before assignment (NameError at runtime);
    - iterate over index pairs instead of masking out rows equal to ``di``
      by value.  Equivalent because ``dis(x, x) == 0`` — pairs of identical
      rows never contribute — and it avoids building a boolean mask per
      point.
    """
    clusters = [np.asanyarray(cluster) for cluster in C]
    total = 0.0
    for cluster in clusters:
        n = cluster.shape[0]
        if n < 2:
            continue  # no pairs to average; original summed nothing here too
        denom = float(n * (n - 1))
        for i in range(n):
            for j in range(n):
                if i != j:
                    total += 2.0 * dis(cluster[i], cluster[j]) / denom
    return total / len(clusters)


def Inter_Cluster_dist(C):
    """Mean ``dis`` between points belonging to distinct clusters.

    Fix vs. the original: it copied ``C`` and called ``.remove(cluster)``,
    which raises ValueError for numpy-array clusters (truth value of an
    element-wise comparison is ambiguous).  We skip the current cluster by
    index instead.
    """
    K = len(C)
    factor = 1.0 / (K * (K - 1))
    total = 0.0
    for a, cluster in enumerate(C):
        na = len(cluster)
        for b, other in enumerate(C):
            if b == a:
                continue  # only distinct cluster pairs
            weight = 1.0 / (na * len(other))
            for di in cluster:
                for dj in other:
                    total += dis(di, dj) * weight
    return factor * total


def H_score(C):
    """Ratio of intra- to inter-cluster distance; lower means tighter clusters."""
    return Intra_Cluster_dist(C) / Inter_Cluster_dist(C)


Link to example data:



The data file is a xlsx. It has three columns: label (cluster label), feature_1 and feature_2.



The process to get C from the file and get the functions working should be something like this:



# Example of building the cluster list C expected by H_score from the
# spreadsheet (columns: 'labels', 'feature_1', 'feature_2').
import pandas as pd
import numpy as np
df = pd.read_excel('example_data.xlsx')
# One object array of [feature_1, feature_2] pairs per cluster label.
# NOTE(review): row-wise .apply returns a Series of Python lists; presumably
# df[df['labels'] == 0][['feature_1', 'feature_2']].values is the faster
# equivalent — verify before swapping in.
c1 = np.asanyarray(df[df['labels'] == 0].apply(lambda row: ([row['feature_1'], row['feature_2']]), axis=1))
c2 = np.asanyarray(df[df['labels'] == 1].apply(lambda row: ([row['feature_1'], row['feature_2']]), axis=1))
C = [c1,c2]
# Compute the clustering metric for the two clusters.
H_score(C)






share|improve this question













When I try to run the following code for arrays with more than 10k elements, it takes hours and I don't know how to make it in the most efficient way.



Any ideas?



from scipy.stats import entropy as KL
import numpy as np


def dis(di, dj):
    """Jensen-Shannon divergence between two distributions.

    scipy's ``entropy`` normalises its inputs, so ``di`` and ``dj`` need
    only be non-negative weight vectors of equal length.  Returns 0.0 for
    identical inputs.
    """
    di = np.asanyarray(di)
    dj = np.asanyarray(dj)
    m = 0.5 * (di + dj)  # midpoint distribution
    return 0.5 * (KL(di, m) + KL(dj, m))


def Intra_Cluster_dist(C):
    """Average over clusters of the mean pairwise ``dis`` inside each cluster.

    ``C`` is a sequence of clusters; each cluster is a 2-D array whose rows
    are distributions.  Singleton clusters contribute 0 (their pair set is
    empty), matching the original code.

    Fixes vs. the original:
    - removed a stray uncommented leftover line that referenced ``dj``
      before assignment (NameError at runtime);
    - iterate over index pairs instead of masking out rows equal to ``di``
      by value.  Equivalent because ``dis(x, x) == 0`` — pairs of identical
      rows never contribute — and it avoids building a boolean mask per
      point.
    """
    clusters = [np.asanyarray(cluster) for cluster in C]
    total = 0.0
    for cluster in clusters:
        n = cluster.shape[0]
        if n < 2:
            continue  # no pairs to average; original summed nothing here too
        denom = float(n * (n - 1))
        for i in range(n):
            for j in range(n):
                if i != j:
                    total += 2.0 * dis(cluster[i], cluster[j]) / denom
    return total / len(clusters)


def Inter_Cluster_dist(C):
    """Mean ``dis`` between points belonging to distinct clusters.

    Fix vs. the original: it copied ``C`` and called ``.remove(cluster)``,
    which raises ValueError for numpy-array clusters (truth value of an
    element-wise comparison is ambiguous).  We skip the current cluster by
    index instead.
    """
    K = len(C)
    factor = 1.0 / (K * (K - 1))
    total = 0.0
    for a, cluster in enumerate(C):
        na = len(cluster)
        for b, other in enumerate(C):
            if b == a:
                continue  # only distinct cluster pairs
            weight = 1.0 / (na * len(other))
            for di in cluster:
                for dj in other:
                    total += dis(di, dj) * weight
    return factor * total


def H_score(C):
    """Ratio of intra- to inter-cluster distance; lower means tighter clusters."""
    return Intra_Cluster_dist(C) / Inter_Cluster_dist(C)


Link to example data:



The data file is a xlsx. It has three columns: label (cluster label), feature_1 and feature_2.



The process to get C from the file and get the functions working should be something like this:



# Example of building the cluster list C expected by H_score from the
# spreadsheet (columns: 'labels', 'feature_1', 'feature_2').
import pandas as pd
import numpy as np
df = pd.read_excel('example_data.xlsx')
# One object array of [feature_1, feature_2] pairs per cluster label.
# NOTE(review): row-wise .apply returns a Series of Python lists; presumably
# df[df['labels'] == 0][['feature_1', 'feature_2']].values is the faster
# equivalent — verify before swapping in.
c1 = np.asanyarray(df[df['labels'] == 0].apply(lambda row: ([row['feature_1'], row['feature_2']]), axis=1))
c2 = np.asanyarray(df[df['labels'] == 1].apply(lambda row: ([row['feature_1'], row['feature_2']]), axis=1))
C = [c1,c2]
# Compute the clustering metric for the two clusters.
H_score(C)








share|improve this question












share|improve this question




share|improve this question








edited Feb 27 at 15:45









Billal BEGUERADJ

1




1









asked Feb 5 at 21:08









kibs

112




112







  • 2




    Can you give us example data demonstrating the performance problem? (Or link to it, if too large to add to the post?)
    – Gareth Rees
    Feb 6 at 9:18










  • It may help speed-wise to incorporate numpy array functionality in place of nested for-loops (or at least comprehensions).
    – MPath
    Feb 7 at 8:55










  • Could you post the data in a format other than "pickle", please? Sorry to be a nuisance, but the "pickle" format can run arbitrary Python code when unpickling, and so it is a security risk.
    – Gareth Rees
    Feb 9 at 11:00










  • Don't worry, I understand. I've updated the link to an excel file with three columns: Labels (cluster label) , Feature 1, Feature 2.
    – kibs
    Feb 9 at 16:08










  • Mikey, that sounds promising. Could you please provide an example?
    – kibs
    Feb 9 at 16:23












  • 2




    Can you give us example data demonstrating the performance problem? (Or link to it, if too large to add to the post?)
    – Gareth Rees
    Feb 6 at 9:18










  • It may help speed-wise to incorporate numpy array functionality in place of nested for-loops (or at least comprehensions).
    – MPath
    Feb 7 at 8:55










  • Could you post the data in a format other than "pickle", please? Sorry to be a nuisance, but the "pickle" format can run arbitrary Python code when unpickling, and so it is a security risk.
    – Gareth Rees
    Feb 9 at 11:00










  • Don't worry, I understand. I've updated the link to an excel file with three columns: Labels (cluster label) , Feature 1, Feature 2.
    – kibs
    Feb 9 at 16:08










  • Mikey, that sounds promising. Could you please provide an example?
    – kibs
    Feb 9 at 16:23







2




2




Can you give us example data demonstrating the performance problem? (Or link to it, if too large to add to the post?)
– Gareth Rees
Feb 6 at 9:18




Can you give us example data demonstrating the performance problem? (Or link to it, if too large to add to the post?)
– Gareth Rees
Feb 6 at 9:18












It may help speed-wise to incorporate numpy array functionality in place of nested for-loops (or at least comprehensions).
– MPath
Feb 7 at 8:55




It may help speed-wise to incorporate numpy array functionality in place of nested for-loops (or at least comprehensions).
– MPath
Feb 7 at 8:55












Could you post the data in a format other than "pickle", please? Sorry to be a nuisance, but the "pickle" format can run arbitrary Python code when unpickling, and so it is a security risk.
– Gareth Rees
Feb 9 at 11:00




Could you post the data in a format other than "pickle", please? Sorry to be a nuisance, but the "pickle" format can run arbitrary Python code when unpickling, and so it is a security risk.
– Gareth Rees
Feb 9 at 11:00












Don't worry, I understand. I've updated the link to an excel file with three columns: Labels (cluster label) , Feature 1, Feature 2.
– kibs
Feb 9 at 16:08




Don't worry, I understand. I've updated the link to an excel file with three columns: Labels (cluster label) , Feature 1, Feature 2.
– kibs
Feb 9 at 16:08












Mikey, that sounds promising. Could you please provide an example?
– kibs
Feb 9 at 16:23




Mikey, that sounds promising. Could you please provide an example?
– kibs
Feb 9 at 16:23















active

oldest

votes











Your Answer




StackExchange.ifUsing("editor", function ()
return StackExchange.using("mathjaxEditing", function ()
StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix)
StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
);
);
, "mathjax-editing");

StackExchange.ifUsing("editor", function ()
StackExchange.using("externalEditor", function ()
StackExchange.using("snippets", function ()
StackExchange.snippets.init();
);
);
, "code-snippets");

StackExchange.ready(function()
var channelOptions =
tags: "".split(" "),
id: "196"
;
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function()
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled)
StackExchange.using("snippets", function()
createEditor();
);

else
createEditor();

);

function createEditor()
StackExchange.prepareEditor(
heartbeatType: 'answer',
convertImagesToLinks: false,
noModals: false,
showLowRepImageUploadWarning: true,
reputationToPostImages: null,
bindNavPrevention: true,
postfix: "",
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
);



);








 

draft saved


draft discarded


















StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f186870%2fcalculation-of-clustering-metric-in-python%23new-answer', 'question_page');

);

Post as a guest



































active

oldest

votes













active

oldest

votes









active

oldest

votes






active

oldest

votes










 

draft saved


draft discarded


























 


draft saved


draft discarded














StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f186870%2fcalculation-of-clustering-metric-in-python%23new-answer', 'question_page');

);

Post as a guest













































































Popular posts from this blog

Chat program with C++ and SFML

Function to Return a JSON Like Objects Using VBA Collections and Arrays

Will my employers contract hold up in court?