Fuzzy c-Means in Python
This is my implementation of Fuzzy c-Means in Python. In the main section of the code, I compare the time it takes against the scikit-learn implementation of KMeans.
import time

import numpy as np
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans


def fcm(data, n_clusters=1, n_init=30, m=2, max_iter=300, tol=1e-16):
    min_cost = np.inf
    for iter_init in range(n_init):
        # Randomly initialize centers
        centers = data[np.random.choice(
            data.shape[0], size=n_clusters, replace=False
        ), :]

        # Compute initial distances
        # Zeros are replaced by eps to avoid division issues
        dist = np.fmax(
            cdist(centers, data, metric='sqeuclidean'),
            np.finfo(np.float64).eps
        )

        for iter1 in range(max_iter):
            # Compute memberships
            u = (1 / dist) ** (1 / (m - 1))
            um = (u / u.sum(axis=0)) ** m

            # Recompute centers
            prev_centers = centers
            centers = um.dot(data) / um.sum(axis=1)[:, None]
            dist = cdist(centers, data, metric='sqeuclidean')

            if np.linalg.norm(centers - prev_centers) < tol:
                break

        # Compute cost
        cost = np.sum(um * dist)
        if cost < min_cost:
            min_cost = cost
            min_centers = centers
            mem = um.argmax(axis=0)

    return min_centers, mem


if __name__ == '__main__':
    data = np.loadtxt('grid5_edit.txt')
    labels = data[:, -1]
    k = len(np.unique(labels))
    data = data[:, 0:-1]

    repeats = 10

    # Time this
    fcm_time = 0
    for iter1 in range(repeats):
        fcm_start = time.time()
        centers, mem = fcm(
            data, n_clusters=k, n_init=30, m=2, max_iter=300, tol=1e-16
        )
        fcm_time += (time.time() - fcm_start)
    print('Average FCM time =', fcm_time / repeats)

    # Time this (as well)
    km_time = 0
    for iter1 in range(repeats):
        km_start = time.time()
        km1 = KMeans(
            n_clusters=k, n_init=30, max_iter=300, tol=1e-16
        ).fit(data)
        km_time += (time.time() - km_start)
    print('Average kMeans time =', km_time / repeats)

    print('Ratio of time =', fcm_time / km_time)
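For reference, the inner loop is meant to implement the standard fuzzy c-means membership and center updates, and the cost being minimised, written with squared Euclidean distances $d_{ij}^2 = \lVert \mathbf{c}_i - \mathbf{x}_j \rVert^2$ between center $i$ and point $j$:

$$
u_{ij} = \frac{\left(1/d_{ij}^2\right)^{1/(m-1)}}{\sum_{k=1}^{C}\left(1/d_{kj}^2\right)^{1/(m-1)}},
\qquad
\mathbf{c}_i = \frac{\sum_{j=1}^{N} u_{ij}^m \, \mathbf{x}_j}{\sum_{j=1}^{N} u_{ij}^m},
\qquad
J = \sum_{i=1}^{C}\sum_{j=1}^{N} u_{ij}^m \, d_{ij}^2 .
$$

In the code, `u` holds the unnormalised $(1/d_{ij}^2)^{1/(m-1)}$, `um` is $u_{ij}^m$ after normalising over the cluster axis, and `cost` is $J$.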
The code seems to be around 15 to 16 times slower than k-means:
Average FCM time = 3.7663435697555543
Average kMeans time = 0.24237003326416015
Ratio of time = 15.539642087892112
Other than a code review, I'm also hoping for any suggestions to make the code faster.
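One idea I have wondered about but not benchmarked: with the default m = 2, the exponent 1 / (m - 1) is exactly 1, so the fractional power over the whole clusters × points array could simply be skipped in that case. A minimal sketch of the special case (a hypothetical helper, not part of the code above):

import numpy as np

def memberships(dist, m):
    # dist: (n_clusters, n_points) squared distances, assumed already
    # clamped away from zero (as done with np.fmax in fcm above).
    if m == 2:
        u = 1 / dist                      # 1/(m-1) == 1, so no fractional power needed
    else:
        u = (1 / dist) ** (1 / (m - 1))
    return (u / u.sum(axis=0)) ** m

# Quick check that the special case matches the general formula for m == 2
dist = np.random.rand(5, 1000) + 1e-9
u_general = (1 / dist) ** (1 / (2 - 1))
expected = (u_general / u_general.sum(axis=0)) ** 2
assert np.allclose(memberships(dist, 2), expected)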
python numpy clustering benchmarking scipy
edited Feb 27 at 15:43 by Billal BEGUERADJ
asked Feb 27 at 14:53 by Avisek
Did you consider using scikit-fuzzy? Implementation here. – Gareth Rees, Feb 28 at 9:39

I seem to be getting slower times with scikit-fuzzy, somewhere between 3 and 9 times slower. The scikit-fuzzy source code is more general; for example, it allows for negative exponents, and it also does a lot of checks. When comparing my code with k-means, I guess the slower time is due to the divisions and exponentiation: each additional cluster adds N more divisions and exponentiations (where N is the number of data rows). – Avisek, Mar 5 at 13:50
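A rough sketch for checking this claim, on randomly generated data of an assumed size (10000 points, 2 features, 5 clusters) rather than the grid5_edit.txt set: time one cdist call against one membership update in isolation.

import timeit

import numpy as np
from scipy.spatial.distance import cdist

# Assumed problem size: 10000 points, 2 features, 5 clusters
np.random.seed(0)
data = np.random.rand(10000, 2)
centers = data[:5]
m = 2

dist = np.fmax(cdist(centers, data, metric='sqeuclidean'),
               np.finfo(np.float64).eps)

def membership_update():
    u = (1 / dist) ** (1 / (m - 1))
    return (u / u.sum(axis=0)) ** m

print('one cdist call   :',
      timeit.timeit(lambda: cdist(centers, data, metric='sqeuclidean'), number=200))
print('membership update:', timeit.timeit(membership_update, number=200))

If the second number dominates, that supports the explanation above.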