Fuzzy c-Means in Python

This is my implementation of Fuzzy c-Means in Python. In the main section of the code, I compare its running time against the scikit-learn implementation of KMeans.



import time
import numpy as np
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans


def fcm(data, n_clusters=1, n_init=30, m=2, max_iter=300, tol=1e-16):

    min_cost = np.inf
    for iter_init in range(n_init):

        # Randomly initialize centers from the data points
        centers = data[np.random.choice(
            data.shape[0], size=n_clusters, replace=False
        ), :]

        # Compute initial distances
        # Zeros are replaced by eps to avoid division issues
        dist = np.fmax(
            cdist(centers, data, metric='sqeuclidean'),
            np.finfo(np.float64).eps
        )

        for iter1 in range(max_iter):

            # Compute memberships
            u = (1 / dist) ** (1 / (m - 1))
            um = (u / u.sum(axis=0)) ** m

            # Recompute centers
            prev_centers = centers
            centers = um.dot(data) / um.sum(axis=1)[:, None]

            # Clamp again so the next membership update cannot divide by zero
            # if a center lands exactly on a data point
            dist = np.fmax(
                cdist(centers, data, metric='sqeuclidean'),
                np.finfo(np.float64).eps
            )

            if np.linalg.norm(centers - prev_centers) < tol:
                break

        # Compute cost and keep the best run
        cost = np.sum(um * dist)
        if cost < min_cost:
            min_cost = cost
            min_centers = centers
            mem = um.argmax(axis=0)

    return min_centers, mem


if __name__ == '__main__':
    data = np.loadtxt('grid5_edit.txt')
    labels = data[:, -1]
    k = len(np.unique(labels))
    data = data[:, 0:-1]

    repeats = 10

    # Time this implementation
    fcm_time = 0
    for iter1 in range(repeats):
        fcm_start = time.time()
        centers, mem = fcm(
            data, n_clusters=k, n_init=30, m=2, max_iter=300, tol=1e-16
        )
        fcm_time += (time.time() - fcm_start)
    print('Average FCM time =', fcm_time / repeats)

    # Time scikit-learn's KMeans for comparison
    km_time = 0
    for iter1 in range(repeats):
        km_start = time.time()
        km1 = KMeans(
            n_clusters=k, n_init=30, max_iter=300, tol=1e-16
        ).fit(data)
        km_time += (time.time() - km_start)
    print('Average kMeans time =', km_time / repeats)

    print('Ratio of time =', fcm_time / km_time)


It seems the code is around 15 to 16 times slower than scikit-learn's KMeans:



Average FCM time = 3.7663435697555543
Average kMeans time = 0.24237003326416015
Ratio of time = 15.539642087892112
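
A side note on the measurement itself: time.time has relatively coarse resolution on some platforms, and time.perf_counter is the clock recommended for wall-time benchmarking. A minimal sketch of the same timing loop (the helper name avg_seconds is my own, not from the code above):

import time

def avg_seconds(fn, repeats=10):
    # Average wall time of fn() over several runs, using the
    # high-resolution monotonic clock intended for benchmarking.
    total = 0.0
    for _ in range(repeats):
        start = time.perf_counter()
        fn()
        total += time.perf_counter() - start
    return total / repeats

# e.g. avg_seconds(lambda: fcm(data, n_clusters=k), repeats=10)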


Besides a general code review, I'm hoping for any suggestions to make the code faster.
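
One concrete direction, given that m=2 is the default: for m = 2 the exponent 1/(m-1) equals 1, so the elementwise power in the membership update is a no-op that NumPy still evaluates over the whole array on every iteration. A sketch of a special-cased update follows (the helper name memberships is mine, and the actual speed-up depends on the NumPy build, so it would need profiling):

import numpy as np

def memberships(dist, m=2):
    # Return u**m for the FCM center update, given squared
    # distances dist of shape (n_clusters, n_samples).
    if m == 2:
        u = 1.0 / dist            # (1/dist)**(1/(m-1)) with exponent 1
        u /= u.sum(axis=0)
        return np.square(u)       # u**2 without a general float power
    u = (1.0 / dist) ** (1.0 / (m - 1))
    u /= u.sum(axis=0)
    return u ** m

Inside the inner loop, um = memberships(dist, m) would then replace the two lines that compute u and um.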

asked Feb 27 at 14:53 by Avisek (edited Feb 27 at 15:43 by Billal BEGUERADJ)

  • Did you consider using scikit-fuzzy? Implementation here. (A sketch of the call appears after these comments.)
    – Gareth Rees
    Feb 28 at 9:39










  • I seem to be getting slower times with scikit-fuzzy, somewhere between 3 and 9 times slower. The scikit-fuzzy source is more general; for example, it handles the possibility of negative exponents, and it performs a lot of input checks. Comparing my code with k-means, I suspect the slowdown comes from the divisions and exponentiation: each additional cluster adds N more divisions and exponentiations, where N is the number of data rows.
    – Avisek
    Mar 5 at 13:50
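
For reference, following up on the two comments above, this is roughly how the scikit-fuzzy comparison can be run. Treat it as a sketch to check against the scikit-fuzzy documentation: the data layout (features by samples, i.e. transposed relative to the code above) and the tuple of return values are recalled from the skfuzzy docs, and the data and k used here are hypothetical stand-ins:

import numpy as np
import skfuzzy as fuzz

data = np.random.rand(1000, 2)   # hypothetical stand-in for grid5_edit.txt
k = 5                            # hypothetical cluster count

# cmeans expects data shaped (n_features, n_samples), hence data.T
cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(
    data.T, c=k, m=2, error=1e-5, maxiter=300
)
mem = u.argmax(axis=0)           # hard labels from the fuzzy memberships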