[Zad3] Po odpowiedzi
This commit is contained in:
parent
d30f72f4c1
commit
28d5d702ac
BIN
zad3/data1_errors.png
(Stored with Git LFS)
BIN
zad3/data1_errors.png
(Stored with Git LFS)
Binary file not shown.
145
zad3/kmeans.py
Normal file
145
zad3/kmeans.py
Normal file
@ -0,0 +1,145 @@
|
||||
import matplotlib.pyplot as plt
|
||||
import utils
|
||||
from matplotlib.animation import FuncAnimation
|
||||
from random import sample, shuffle
|
||||
import numpy as np
|
||||
|
||||
|
||||
def plot_kmeans(all_data, k, name_suffix):
    """Animate the iterations of one k-means run, save it as a GIF and show it.

    Args:
        all_data: Sequence of ``(centroids, clusters)`` snapshots, one per
            iteration. ``centroids`` is indexed by cluster key and
            ``clusters`` maps each key to a list of 2-D points.
        k: Number of clusters; shown in the title and used to spread the
            cluster colours over the colormap (``key / k``).
        name_suffix: Text inserted into the output file name
            ``animationKMEANS{name_suffix}.gif``.
    """
    fig, ax = plt.subplots()
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_title(f'k={k}')
    time_text = ax.text(0.05, 0.95, 'iter=0', horizontalalignment='left',
                        verticalalignment='top', transform=ax.transAxes)
    plt.grid(True)

    centroid_scatters = []
    # Keyed by cluster so that clusters which start empty can get their
    # scatter created lazily in a later frame.
    cluster_scatters = {}

    def draw_cluster(key, points):
        """Create the scatter artist for the (non-empty) cluster *key*."""
        lst_x, lst_y = zip(*points)
        return ax.scatter(list(lst_x), list(lst_y),
                          color=utils.get_color(key / k))

    centroids, clusters = all_data[0]
    for key in clusters:
        if clusters[key]:
            cluster_scatters[key] = draw_cluster(key, clusters[key])
        # Appended in key order, so the list position doubles as the key.
        centroid_scatters.append(ax.scatter(
            [centroids[key][0]], [centroids[key][1]],
            color=utils.get_color(key / k), marker='X'))

    def update_plot_kmeans(i):
        centroids, clusters = all_data[i]
        time_text.set_text(f'iter={i}')
        for key in clusters:
            centroid_scatters[key].set_offsets(centroids[key])
            points = clusters[key]
            if key in cluster_scatters:
                # A cluster that lost all its points must be cleared
                # explicitly, otherwise the previous frame's points stay
                # on screen.
                cluster_scatters[key].set_offsets(
                    points if points else np.empty((0, 2)))
            elif points:
                cluster_scatters[key] = draw_cluster(key, points)
        return centroid_scatters + list(cluster_scatters.values()) + [time_text, ]

    anim = FuncAnimation(fig, update_plot_kmeans,
                         frames=len(all_data), blit=True)
    anim.save(f'animationKMEANS{name_suffix}.gif')

    plt.show()
|
||||
|
||||
|
||||
def calc_error(centroids, clusters, k):
    """Return the clustering error summed over clusters ``0 .. k-1``.

    For every non-empty cluster the squared coordinate differences between
    its points and its centroid are averaged (over all points and all
    coordinates); the per-cluster averages are then added up. Empty clusters
    contribute nothing.

    Args:
        centroids: Sequence indexed by cluster number, one centroid each.
        clusters: Mapping from cluster number to its list of points.
        k: Number of clusters to evaluate.
    """
    total = 0
    for idx in range(k):
        members = np.array(clusters[idx])
        if len(members) == 0:
            continue
        # Broadcasting subtracts the centroid from every member row.
        deviations = members - np.array(centroids[idx])
        total = total + np.mean(deviations ** 2)
    return total
|
||||
|
||||
|
||||
def plot_error_data(error_data):
    """Show a line plot of clustering error against the number of clusters.

    Args:
        error_data: Iterable of ``(k, err)`` pairs; the first items go on
            the x axis and the second items on the y axis.
    """
    fig, ax = plt.subplots()
    ax.set(xlabel='k', ylabel='err', xlim=(2, 20))
    ax.set_title('Errors')
    ax.grid(True)

    ks, errs = zip(*error_data)
    # 'ro-' = red, round markers, joined by a solid line.
    ax.plot(list(ks), list(errs), 'ro-')

    plt.show()
|
||||
|
||||
|
||||
def print_stats(k, data):
    """Print summary statistics over the final states of several runs.

    Args:
        k: Number of clusters; only echoed in the output.
        data: Sequence of ``((centroids, clusters), err)`` tuples, one per
            run, where ``clusters`` maps a cluster key to its points.

    Printed are the mean (labelled ``MSE``), standard deviation and minimum
    of the errors, followed by the mean and standard deviation of the number
    of empty clusters per run.
    """
    print(f'k={k}')
    final_states, errs = zip(*data)
    _, cluster_maps = zip(*final_states)

    report = [
        f'MSE={np.mean(errs)}',
        f'std={np.std(errs)}',
        f'min(err)={np.min(errs)}',
    ]

    # One entry per run: how many of its clusters ended up without points.
    empty_counts = [
        sum(1 for members in run.values() if not members)
        for run in cluster_maps
    ]
    report.append(
        f'Mean of empty clusters count={sum(empty_counts) / len(empty_counts)}')
    report.append(
        f'Standard deviation of empty clusters count={np.std(empty_counts)}')
    report.append('=' * 20)

    for line in report:
        print(line)
|
||||
|
||||
|
||||
def _assign_to_nearest(data, centroids, k):
    """Group every point of *data* under the index of its closest centroid."""
    clusters = {i: [] for i in range(k)}
    for point in data:
        lengths = [utils.calc_length(c, point) for c in centroids]
        clusters[int(np.argmin(lengths))].append(point)
    return clusters


def kmeans(data, method, k, n_runs=100, max_iter=100):
    """Run k-means several times from fresh initialisations.

    Args:
        data: Sequence of 2-D points.
        method: Initialisation method name passed on to ``init_units``.
        k: Number of clusters.
        n_runs: How many independent runs to perform.
        max_iter: Upper bound on update steps per run; a run stops earlier
            once no centroid moved (within ``numpy.allclose`` tolerance).

    Returns:
        A list with one ``(history, err)`` tuple per run. ``history`` is the
        list of ``(centroids, clusters)`` snapshots, starting with the
        initial assignment, and ``err`` is ``calc_error`` of the final state.
    """
    kmeans_with_err = []
    for _run in range(n_runs):
        centroids = init_units(data, k, method=method)
        clusters = _assign_to_nearest(data, centroids, k)
        # list(centroids) snapshots the current centroid objects; later
        # updates rebind list slots rather than mutating those objects.
        history = [(list(centroids), clusters)]

        for _step in range(max_iter):
            previous = history[-1][0]
            for key, members in clusters.items():
                # An empty cluster keeps its previous centroid.
                if members:
                    centroids[key] = np.mean(members, axis=0)
            clusters = _assign_to_nearest(data, centroids, k)
            history.append((list(centroids), clusters))
            if all(np.allclose(centroids[i], previous[i]) for i in range(k)):
                break

        err = calc_error(centroids, clusters, k)
        kmeans_with_err.append((history, err))
    return kmeans_with_err
|
||||
|
||||
|
||||
def init_units(data, k, method='forgy'):  # TODO: add k-means++ initialisation
    """Choose the *k* initial centroids for k-means.

    Args:
        data: Sequence of points.
        k: Number of centroids to produce.
        method: ``'forgy'`` picks *k* distinct points of *data* at random;
            ``'random_partition'`` shuffles the data, cuts it into *k*
            consecutive, nearly equal parts and returns the mean of each.

    Returns:
        A list of *k* centroids.

    Raises:
        ValueError: For ``'random_partition'`` when *k* is not in
            ``1 .. len(data)`` (a part would be empty and its mean
            undefined). ``random.sample`` raises the same type for
            ``'forgy'`` when *k* exceeds ``len(data)``.
        NotImplementedError: For any other *method*.
    """
    if method == 'forgy':
        return sample(data, k)

    if method == 'random_partition':
        if not 1 <= k <= len(data):
            raise ValueError(
                f'k must be between 1 and {len(data)} for random_partition, '
                f'got {k}')
        shuffled = list(data)
        shuffle(shuffled)
        div = len(shuffled) / k
        # Consecutive rounded cut points; each part ends where the next begins.
        bounds = [int(round(div * i)) for i in range(k + 1)]
        return [np.mean(shuffled[lo:hi], axis=0)
                for lo, hi in zip(bounds, bounds[1:])]

    raise NotImplementedError(
        f'method {method} is not implemented yet')
|
56
zad3/utils.py
Normal file
56
zad3/utils.py
Normal file
@ -0,0 +1,56 @@
|
||||
import matplotlib.pyplot as plt
|
||||
from generate_points import get_random_point
|
||||
|
||||
|
||||
def get_color(i):
    """Look up *i* in matplotlib's 'tab20' colormap and return the result."""
    palette = plt.get_cmap('tab20')
    return palette(i)
|
||||
|
||||
|
||||
def calc_length(a, b):
    """Return the squared Euclidean distance between 2-D points *a* and *b*.

    The square root is deliberately left out: the value is only used to
    compare distances, and squaring preserves their order.
    """
    dx = b[0] - a[0]
    dy = b[1] - a[1]
    return dx ** 2 + dy ** 2
|
||||
|
||||
|
||||
def plot_data(data):
    """Show a scatter plot of the 2-D points in *data* on figure 1.

    Args:
        data: Iterable of ``(x, y)`` pairs.
    """
    xs, ys = zip(*data)
    xs, ys = list(xs), list(ys)

    plt.figure(1)
    ax = plt.axes()
    ax.scatter(xs, ys)
    ax.set(xlabel='X', ylabel='Y')
    ax.grid(True)
    plt.show()
|
||||
|
||||
|
||||
def plot_error_data(error_data):
    """Show clustering error as a function of k.

    Args:
        error_data: Iterable of ``(k, err)`` pairs; k values are plotted on
            the x axis (limited to 2..20) and errors on the y axis.
    """
    fig, ax = plt.subplots()
    ax.set(xlabel='k', ylabel='err', xlim=(2, 20))
    ax.set_title('Errors')
    ax.grid(True)

    k_values, err_values = zip(*error_data)
    # Red round markers joined by a solid line.
    ax.plot(list(k_values), list(err_values), 'ro-')

    plt.show()
|
||||
|
||||
|
||||
def get_data1():
    """Return 200 points, each from ``get_random_point((0, 0), 1)``."""
    return [get_random_point((0, 0), 1) for _ in range(200)]
|
||||
|
||||
|
||||
def get_data2():
    """Return 200 points in two groups of 100.

    The first group comes from ``get_random_point((3, 0), 0.5)`` and the
    second from ``get_random_point((-3, 0), 0.5)``.
    """
    return [
        get_random_point((3 * ((-1) ** i), 0), 0.5)
        for i in range(2)
        for _ in range(100)
    ]
|
202
zad3/zad3.py
202
zad3/zad3.py
@ -1,194 +1,50 @@
|
||||
import matplotlib.pyplot as plt
|
||||
from matplotlib.animation import FuncAnimation
|
||||
from random import sample, shuffle
|
||||
from generate_points import get_random_point
|
||||
import numpy as np
|
||||
import kmeans as km
|
||||
import utils
|
||||
import json
|
||||
|
||||
|
||||
METHODS = ['forgy', 'random_partition']
|
||||
|
||||
|
||||
def get_color(i):
|
||||
return plt.get_cmap('tab20')(i)
|
||||
def get_datas_from_json():
    """Load the data sets stored in ``data1.json`` and ``data2.json``.

    Returns:
        A two-element list with the parsed content of each file, in that
        order. The files are looked up in the current working directory.
    """
    datas = []
    for filename in ('data1.json', 'data2.json'):
        with open(filename, 'r') as handle:
            datas.append(json.load(handle))
    return datas
|
||||
|
||||
|
||||
def get_data1():
|
||||
data = []
|
||||
for _ in range(200):
|
||||
data.append(get_random_point((0, 0), 1))
|
||||
return data
|
||||
def get_datas_random():
    """Return freshly generated data sets from utils.get_data1 and get_data2."""
    generators = [utils.get_data1, utils.get_data2]
    return [generate() for generate in generators]
|
||||
|
||||
|
||||
def get_data2():
|
||||
data = []
|
||||
for i in range(2):
|
||||
for _ in range(100):
|
||||
data.append(get_random_point((3*((-1)**i), 0), 0.5))
|
||||
return data
|
||||
|
||||
|
||||
def plot_data(data):
|
||||
lst_x, lst_y = zip(*data)
|
||||
lst_x = list(lst_x)
|
||||
lst_y = list(lst_y)
|
||||
plt.figure(1)
|
||||
ax = plt.axes()
|
||||
ax.scatter(lst_x, lst_y)
|
||||
ax.set_xlabel('X')
|
||||
ax.set_ylabel('Y')
|
||||
plt.grid(True)
|
||||
plt.show()
|
||||
|
||||
|
||||
def plot_kmeans(all_data, k):
|
||||
fig, ax = plt.subplots()
|
||||
ax.set_xlabel('X')
|
||||
ax.set_ylabel('Y')
|
||||
ax.set_title(f'k={k}')
|
||||
time_text = ax.text(0.05, 0.95, 'iter=0', horizontalalignment='left',
|
||||
verticalalignment='top', transform=ax.transAxes)
|
||||
plt.grid(True)
|
||||
centroid_scatters = []
|
||||
cluster_scatters = []
|
||||
centroids, clusters = all_data[0]
|
||||
for key in clusters:
|
||||
color = get_color(key/k)
|
||||
if clusters[key]:
|
||||
lst_x, lst_y = zip(*clusters[key])
|
||||
lst_x = list(lst_x)
|
||||
lst_y = list(lst_y)
|
||||
cluster_scatters.append(ax.scatter(lst_x, lst_y, color=color))
|
||||
centroid_scatters.append(ax.scatter([centroids[key][0]], [
|
||||
centroids[key][1]], color=color, marker='X'))
|
||||
|
||||
def update_plot_kmeans(i):
|
||||
centroids, clusters = all_data[i]
|
||||
time_text.set_text(f'iter={i}')
|
||||
for key in clusters:
|
||||
centroid_scatters[key].set_offsets(centroids[key])
|
||||
cluster_scatters[key].set_offsets(clusters[key])
|
||||
return centroid_scatters+cluster_scatters+[time_text, ]
|
||||
anim = FuncAnimation(fig, update_plot_kmeans,
|
||||
frames=len(all_data), blit=True)
|
||||
# anim.save('animation.mp4')
|
||||
|
||||
plt.show()
|
||||
|
||||
|
||||
def calc_length(a, b):
|
||||
# return ((b[0]-a[0])**2+(b[1]-a[1])**2)**0.5
|
||||
# no need to calculate square root for comparison
|
||||
return (b[0]-a[0])**2+(b[1]-a[1])**2
|
||||
|
||||
|
||||
def init_centroids(data, k, method='forgy'): # TODO: Add k-means++ and Random Partition
|
||||
match method:
|
||||
case 'forgy':
|
||||
return sample(data, k)
|
||||
case 'random_partition':
|
||||
shuffled = list(data)
|
||||
shuffle(shuffled)
|
||||
div = len(shuffled)/k
|
||||
partition = [shuffled[int(round(div*i)):int(round(div*(i+1)))] for i in range(k)]
|
||||
return [np.mean(prt, axis=0) for prt in partition]
|
||||
case _:
|
||||
raise NotImplementedError(
|
||||
f'method {method} is not implemented yet')
|
||||
|
||||
|
||||
def calc_error(centroids, clusters, k):
|
||||
squared_errors = []
|
||||
for i in range(k):
|
||||
cluster = np.array(clusters[i])
|
||||
centroid = np.array([centroids[i] for _ in range(len(cluster))])
|
||||
errors = cluster - centroid
|
||||
squared_errors.append([e**2 for e in errors])
|
||||
return sum([np.mean(err) if err else 0 for err in squared_errors])
|
||||
|
||||
|
||||
def plot_error_data(error_data):
|
||||
fig, ax = plt.subplots()
|
||||
ax.set_xlabel('k')
|
||||
ax.set_ylabel('err')
|
||||
ax.set_xlim(2, 20)
|
||||
plt.title('Errors')
|
||||
plt.grid(True)
|
||||
|
||||
lst_x, lst_y = zip(*error_data)
|
||||
lst_x = list(lst_x)
|
||||
lst_y = list(lst_y)
|
||||
ax.plot(lst_x, lst_y, 'ro-')
|
||||
|
||||
plt.show()
|
||||
|
||||
|
||||
def print_stats(k, data):
|
||||
print('='*20)
|
||||
print(f'k={k}')
|
||||
errs = [x[1] for x in data]
|
||||
m = np.mean(errs)
|
||||
std = np.std(errs)
|
||||
min_err = np.min(errs)
|
||||
lst_empty = [sum([1 for cluster in centroids_with_clusters[1] if not cluster]) for centroids_with_clusters,_ in data]
|
||||
print(lst_empty)
|
||||
|
||||
|
||||
def main(datas):
|
||||
# for get_data in [get_data1, get_data2]:
|
||||
# data = get_data()
|
||||
def main():
|
||||
datas = get_datas_from_json()
|
||||
index = 1
|
||||
for data in datas:
|
||||
plot_data(data)
|
||||
utils.plot_data(data)
|
||||
for method in METHODS:
|
||||
print(f'Method: {method}')
|
||||
kmeans_data = {}
|
||||
for k in [20]: # range(2, 21):
|
||||
kmeans_with_err = []
|
||||
for _ in range(100):
|
||||
centroids_with_clusters = []
|
||||
centroids = init_centroids(data, k, method=method)
|
||||
clusters = {}
|
||||
for i in range(k):
|
||||
clusters[i] = []
|
||||
for point in data:
|
||||
lengths = [calc_length(c, point) for c in centroids]
|
||||
index_min = np.argmin(lengths)
|
||||
clusters[index_min].append(point)
|
||||
centroids_with_clusters.append((list(centroids), clusters))
|
||||
for _ in range(100):
|
||||
for key in clusters:
|
||||
if clusters[key]:
|
||||
centroids[key] = np.mean(clusters[key], axis=0)
|
||||
clusters = {}
|
||||
for i in range(k):
|
||||
clusters[i] = []
|
||||
for point in data:
|
||||
lengths = [calc_length(c, point)
|
||||
for c in centroids]
|
||||
index_min = np.argmin(lengths)
|
||||
clusters[index_min].append(point)
|
||||
centroids_with_clusters.append(
|
||||
(list(centroids), clusters))
|
||||
if all([all(np.isclose(centroids_with_clusters[-1][0][i], centroids_with_clusters[-2][0][i])) for i in range(k)]):
|
||||
break
|
||||
err = calc_error(centroids, clusters, k)
|
||||
kmeans_with_err.append((centroids_with_clusters, err))
|
||||
print_stats(k, [(iterations[-1],err) for iterations, err in kmeans_with_err])
|
||||
for k in range(2, 21):
|
||||
kmeans_with_err = km.kmeans(data, method, k)
|
||||
km.print_stats(k, [(iterations[-1], err)
|
||||
for iterations, err in kmeans_with_err])
|
||||
min_err = kmeans_with_err[0][1]
|
||||
kmeans = kmeans_with_err[0][0]
|
||||
for temp_kmeans, err in kmeans_with_err:
|
||||
if err < min_err:
|
||||
min_err = err
|
||||
kmeans = temp_kmeans
|
||||
kmeans_data[k]=(kmeans, min_err)
|
||||
plot_kmeans(kmeans, k)
|
||||
#error_data = [[i, kmeans_data[i][1]] for i in range(2, 21, 2)]
|
||||
#plot_error_data(error_data)
|
||||
kmeans_data[k] = (kmeans, min_err)
|
||||
km.plot_kmeans(kmeans, k, f'_{method}_{k}_{index}')
|
||||
error_data = [[i, kmeans_data[i][1]] for i in range(2, 21, 2)]
|
||||
utils.plot_error_data(error_data)
|
||||
index += 1
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
datas = []
|
||||
with open('data1.json', 'r') as d:
|
||||
datas.append(json.loads(d.read()))
|
||||
with open('data2.json', 'r') as d:
|
||||
datas.append(json.loads(d.read()))
|
||||
main(datas)
|
||||
main()
|
||||
|
Loading…
Reference in New Issue
Block a user