Fuzzy c-Means in Python
This is my implementation of Fuzzy c-Means in Python. In the main section of the code, I compare the time it takes against the scikit-learn implementation of KMeans.
import time

import numpy as np
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans


def fcm(data, n_clusters=1, n_init=30, m=2, max_iter=300, tol=1e-16):
    min_cost = np.inf
    for iter_init in range(n_init):
        # Randomly initialize centers
        centers = data[np.random.choice(
            data.shape[0], size=n_clusters, replace=False
        ), :]

        # Compute initial distances
        # Zeros are replaced by eps to avoid division issues
        dist = np.fmax(
            cdist(centers, data, metric='sqeuclidean'),
            np.finfo(np.float64).eps
        )

        for iter1 in range(max_iter):
            # Compute memberships
            u = (1 / dist) ** (1 / (m - 1))
            um = (u / u.sum(axis=0)) ** m

            # Recompute centers
            prev_centers = centers
            centers = um.dot(data) / um.sum(axis=1)[:, None]
            dist = cdist(centers, data, metric='sqeuclidean')

            if np.linalg.norm(centers - prev_centers) < tol:
                break

        # Compute cost
        cost = np.sum(um * dist)
        if cost < min_cost:
            min_cost = cost
            min_centers = centers
            mem = um.argmax(axis=0)

    return min_centers, mem


if __name__ == '__main__':
    data = np.loadtxt('grid5_edit.txt')
    labels = data[:, -1]
    k = len(np.unique(labels))
    data = data[:, 0:-1]

    repeats = 10

    # Time this
    fcm_time = 0
    for iter1 in range(repeats):
        fcm_start = time.time()
        centers, mem = fcm(
            data, n_clusters=k, n_init=30, m=2, max_iter=300, tol=1e-16
        )
        fcm_time += (time.time() - fcm_start)
    print('Average FCM time =', fcm_time / repeats)

    # Time this (as well)
    km_time = 0
    for iter1 in range(repeats):
        km_start = time.time()
        km1 = KMeans(
            n_clusters=k, n_init=30, max_iter=300, tol=1e-16
        ).fit(data)
        km_time += (time.time() - km_start)
    print('Average kMeans time =', km_time / repeats)

    print('Ratio of time =', fcm_time / km_time)
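For reference, the inner loop is meant to implement the standard fuzzy c-means membership and center updates, and the cost being minimised, written with squared Euclidean distances $d_{ij}^2 = \lVert \mathbf{c}_i - \mathbf{x}_j \rVert^2$ between center $i$ and point $j$:

$$
u_{ij} = \frac{\left(1/d_{ij}^2\right)^{1/(m-1)}}{\sum_{k=1}^{C}\left(1/d_{kj}^2\right)^{1/(m-1)}},
\qquad
\mathbf{c}_i = \frac{\sum_{j=1}^{N} u_{ij}^m \, \mathbf{x}_j}{\sum_{j=1}^{N} u_{ij}^m},
\qquad
J = \sum_{i=1}^{C}\sum_{j=1}^{N} u_{ij}^m \, d_{ij}^2 .
$$

In the code, `u` holds the unnormalised $(1/d_{ij}^2)^{1/(m-1)}$, `um` is $u_{ij}^m$ after normalising over the cluster axis, and `cost` is $J$.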
The code seems to be around 15 to 16 times slower than k-means:
Average FCM time = 3.7663435697555543
Average kMeans time = 0.24237003326416015
Ratio of time = 15.539642087892112
Other than a code review, I'm also hoping for any suggestions to make the code faster.
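One idea I have wondered about but not benchmarked: with the default m = 2, the exponent 1 / (m - 1) is exactly 1, so the fractional power over the whole clusters × points array could simply be skipped in that case. A minimal sketch of the special case (a hypothetical helper, not part of the code above):

import numpy as np

def memberships(dist, m):
    # dist: (n_clusters, n_points) squared distances, assumed already
    # clamped away from zero (as done with np.fmax in fcm above).
    if m == 2:
        u = 1 / dist                      # 1/(m-1) == 1, so no fractional power needed
    else:
        u = (1 / dist) ** (1 / (m - 1))
    return (u / u.sum(axis=0)) ** m

# Quick check that the special case matches the general formula for m == 2
dist = np.random.rand(5, 1000) + 1e-9
u_general = (1 / dist) ** (1 / (2 - 1))
expected = (u_general / u_general.sum(axis=0)) ** 2
assert np.allclose(memberships(dist, 2), expected)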
python numpy clustering benchmarking scipy
edited Feb 27 at 15:43 by Billal BEGUERADJ
asked Feb 27 at 14:53 by Avisek
Did you consider using scikit-fuzzy? Implementation here. – Gareth Rees, Feb 28 at 9:39

I seem to be getting slower times with scikit-fuzzy, somewhere between 3 and 9 times slower. The scikit-fuzzy source code is more general; for example, it allows for negative exponents, and it also does a lot of checks. When comparing my code with k-means, I guess the slower time is due to the divisions and exponentiation: each additional cluster adds N more divisions and exponentiations (where N is the number of data rows). – Avisek, Mar 5 at 13:50
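A rough sketch for checking this claim, on randomly generated data of an assumed size (10000 points, 2 features, 5 clusters) rather than the grid5_edit.txt set: time one cdist call against one membership update in isolation.

import timeit

import numpy as np
from scipy.spatial.distance import cdist

# Assumed problem size: 10000 points, 2 features, 5 clusters
np.random.seed(0)
data = np.random.rand(10000, 2)
centers = data[:5]
m = 2

dist = np.fmax(cdist(centers, data, metric='sqeuclidean'),
               np.finfo(np.float64).eps)

def membership_update():
    u = (1 / dist) ** (1 / (m - 1))
    return (u / u.sum(axis=0)) ** m

print('one cdist call   :',
      timeit.timeit(lambda: cdist(centers, data, metric='sqeuclidean'), number=200))
print('membership update:', timeit.timeit(membership_update, number=200))

If the second number dominates, that supports the explanation above.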