NOTE(review): the added ("+") side of kmeans.py, utils.py and zad3.py was
revised during review, so the blob ids on their "index" lines no longer
describe the new content. Hunk line counts were recomputed from the lines
shown here; confirm them against the real zad3.py before applying.

diff --git a/zad3/data1_errors.png b/zad3/data1_errors.png
index 290f4db..426121d 100644
--- a/zad3/data1_errors.png
+++ b/zad3/data1_errors.png
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ce4ddc242499f89264fdb8ba0d43d995cc617c2180a17c49be9e3a6a41612a82
-size 20193
+oid sha256:c9f2b395c6b354afc855fb1826b4908bbaa2e3c5fdabee0723d6cbc7c90eaeee
+size 17887
diff --git a/zad3/kmeans.py b/zad3/kmeans.py
new file mode 100644
index 0000000..aa9323b
--- /dev/null
+++ b/zad3/kmeans.py
@@ -0,0 +1,165 @@
+"""K-means clustering: initialisation, fitting, statistics and plotting."""
+import matplotlib.pyplot as plt
+import utils
+from matplotlib.animation import FuncAnimation
+from random import sample, shuffle
+import numpy as np
+
+
+def plot_kmeans(all_data, k, name_suffix):
+    """Animate the iterations of one k-means run and save them as a GIF.
+
+    all_data holds one (centroids, clusters) snapshot per iteration and
+    name_suffix is appended to the name of the saved animation file.
+    """
+    fig, ax = plt.subplots()
+    ax.set_xlabel('X')
+    ax.set_ylabel('Y')
+    ax.set_title(f'k={k}')
+    time_text = ax.text(0.05, 0.95, 'iter=0', horizontalalignment='left',
+                        verticalalignment='top', transform=ax.transAxes)
+    plt.grid(True)
+    centroid_scatters = []
+    cluster_scatters = {}
+    centroids, clusters = all_data[0]
+    for key in clusters:
+        color = utils.get_color(key / k)
+        if clusters[key]:
+            lst_x, lst_y = zip(*clusters[key])
+            lst_x = list(lst_x)
+            lst_y = list(lst_y)
+            cluster_scatters[key] = ax.scatter(lst_x, lst_y, color=color)
+        centroid_scatters.append(ax.scatter([centroids[key][0]], [
+            centroids[key][1]], color=color, marker='X'))
+
+    def update_plot_kmeans(i):
+        centroids, clusters = all_data[i]
+        time_text.set_text(f'iter={i}')
+        for key in clusters:
+            centroid_scatters[key].set_offsets(centroids[key])
+            if clusters[key]:
+                if key in cluster_scatters:
+                    cluster_scatters[key].set_offsets(clusters[key])
+                else:
+                    # First non-empty frame of this cluster: draw it now.
+                    color = utils.get_color(key/k)
+                    lst_x, lst_y = zip(*clusters[key])
+                    lst_x = list(lst_x)
+                    lst_y = list(lst_y)
+                    cluster_scatters[key] = ax.scatter(
+                        lst_x, lst_y, color=color)
+            elif key in cluster_scatters:
+                # The cluster lost all its points: clear stale markers.
+                cluster_scatters[key].set_offsets(np.empty((0, 2)))
+        return centroid_scatters + list(cluster_scatters.values()) + [time_text, ]
+
+    anim = FuncAnimation(fig, update_plot_kmeans,
+                         frames=len(all_data), blit=True)
+    anim.save(f'animationKMEANS{name_suffix}.gif')
+
+    plt.show()
+
+
+def calc_error(centroids, clusters, k):
+    """Return the sum over clusters of the mean squared coordinate error.
+
+    Empty clusters contribute 0 to the sum.
+    """
+    total = 0
+    for i in range(k):
+        if clusters[i]:
+            # Broadcasting subtracts the centroid from every point.
+            diff = np.asarray(clusters[i]) - np.asarray(centroids[i])
+            total += np.mean(diff ** 2)
+    return total
+
+
+def plot_error_data(error_data):
+    """Plot error against k by delegating to utils.plot_error_data."""
+    utils.plot_error_data(error_data)
+
+
+def print_stats(k, data):
+    """Print error and empty-cluster statistics of all runs for one k.
+
+    data is a sequence of ((centroids, clusters), err) pairs, one per run.
+    """
+    print(f'k={k}')
+    centroids_with_clusters, errs = zip(*data)
+    _, clusters = zip(*centroids_with_clusters)
+    m = np.mean(errs)
+    std = np.std(errs)
+    min_err = np.min(errs)
+    # Named run_clusters (not `sample`) so random.sample is not shadowed.
+    empty_clusters = [sum(1 for cluster in run_clusters.values() if not cluster)
+                      for run_clusters in clusters]
+    empty_clusters_mean = sum(empty_clusters)/len(empty_clusters)
+    empty_clusters_std = np.std(empty_clusters)
+    print(f'MSE={m}')
+    print(f'std={std}')
+    print(f'min(err)={min_err}')
+    print(f'Mean of empty clusters count={empty_clusters_mean}')
+    print(f'Standard deviation of empty clusters count={empty_clusters_std}')
+    print('='*20)
+
+
+def _assign_clusters(data, centroids, k):
+    """Group every point under the index of its nearest centroid."""
+    clusters = {i: [] for i in range(k)}
+    for point in data:
+        lengths = [utils.calc_length(c, point) for c in centroids]
+        clusters[int(np.argmin(lengths))].append(point)
+    return clusters
+
+
+def kmeans(data, method, k, runs=100, max_iter=100):
+    """Run k-means `runs` times, each from freshly initialised centroids.
+
+    Returns one (history, err) pair per run: history is the list of
+    (centroids, clusters) snapshots, one per iteration, and err is
+    calc_error of the final state. A run stops after max_iter updates or
+    as soon as the centroids stop moving.
+    """
+    kmeans_with_err = []
+    for _ in range(runs):
+        centroids = init_units(data, k, method=method)
+        clusters = _assign_clusters(data, centroids, k)
+        centroids_with_clusters = [(list(centroids), clusters)]
+        for _ in range(max_iter):
+            for key in clusters:
+                # An empty cluster keeps its previous centroid.
+                if clusters[key]:
+                    centroids[key] = np.mean(clusters[key], axis=0)
+            clusters = _assign_clusters(data, centroids, k)
+            centroids_with_clusters.append((list(centroids), clusters))
+            previous = centroids_with_clusters[-2][0]
+            if all(np.allclose(centroids[i], previous[i]) for i in range(k)):
+                break
+        err = calc_error(centroids, clusters, k)
+        kmeans_with_err.append((centroids_with_clusters, err))
+    return kmeans_with_err
+
+
+def init_units(data, k, method='forgy'):  # TODO: Add k-means++
+    """Return k initial centroids picked from data with the given method.
+
+    'forgy' samples k points without replacement; 'random_partition'
+    shuffles the data, splits it into k nearly equal parts and returns the
+    mean of each part. Raises NotImplementedError for any other method.
+    """
+    match method:
+        case 'forgy':
+            return sample(data, k)
+        case 'random_partition':
+            if k > len(data):
+                # An empty part would silently yield a NaN centroid.
+                raise ValueError(f'k={k} exceeds the number of points')
+            shuffled = list(data)
+            shuffle(shuffled)
+            div = len(shuffled) / k
+            partition = [
+                shuffled[int(round(div * i)):int(round(div * (i + 1)))] for i in range(k)]
+            return [np.mean(prt, axis=0) for prt in partition]
+        case _:
+            raise NotImplementedError(
+                f'method {method} is not implemented yet')
diff --git a/zad3/utils.py b/zad3/utils.py
new file mode 100644
index 0000000..efb266c
--- /dev/null
+++ b/zad3/utils.py
@@ -0,0 +1,60 @@
+"""Shared helpers: colours, distances, plots and random point sets."""
+import matplotlib.pyplot as plt
+from generate_points import get_random_point
+
+
+def get_color(i):
+    """Return the colour that the 'tab20' colormap assigns to i."""
+    return plt.get_cmap('tab20')(i)
+
+
+def calc_length(a, b):
+    """Return the squared Euclidean distance between 2-D points a and b."""
+    # return ((b[0]-a[0])**2+(b[1]-a[1])**2)**0.5
+    # no need to calculate square root for comparison
+    return (b[0] - a[0]) ** 2 + (b[1] - a[1]) ** 2
+
+
+def plot_data(data):
+    """Show a scatter plot of the (x, y) points in data."""
+    lst_x, lst_y = zip(*data)
+    lst_x = list(lst_x)
+    lst_y = list(lst_y)
+    plt.figure(1)
+    ax = plt.axes()
+    ax.scatter(lst_x, lst_y)
+    ax.set_xlabel('X')
+    ax.set_ylabel('Y')
+    plt.grid(True)
+    plt.show()
+
+
+def plot_error_data(error_data):
+    """Show a line plot of (k, err) pairs with the k axis fixed to 2..20."""
+    fig, ax = plt.subplots()
+    ax.set_xlabel('k')
+    ax.set_ylabel('err')
+    ax.set_xlim(2, 20)
+    plt.title('Errors')
+    plt.grid(True)
+
+    lst_x, lst_y = zip(*error_data)
+    lst_x = list(lst_x)
+    lst_y = list(lst_y)
+    ax.plot(lst_x, lst_y, 'ro-')
+
+    plt.show()
+
+
+def get_data1():
+    """Return 200 samples of get_random_point((0, 0), 1)."""
+    return [get_random_point((0, 0), 1) for _ in range(200)]
+
+
+def get_data2():
+    """Return 100 get_random_point samples for (3, 0) and 100 for (-3, 0)."""
+    data = []
+    for i in range(2):
+        for _ in range(100):
+            data.append(get_random_point((3 * ((-1) ** i), 0), 0.5))
+    return data
diff --git a/zad3/zad3.py b/zad3/zad3.py
index 9589a2f..32a8bf4 100644
--- a/zad3/zad3.py
+++ b/zad3/zad3.py
@@ -1,193 +1,50 @@
-import matplotlib.pyplot as plt
-from matplotlib.animation import FuncAnimation
-from random import sample, shuffle
-from generate_points import get_random_point
-import numpy as np
+"""Run k-means for k = 2..20 on both data sets and plot the results."""
+import kmeans as km
+import utils
 import json
 
-
 METHODS = ['forgy', 'random_partition']
 
 
-def get_color(i):
-    return plt.get_cmap('tab20')(i)
+def get_datas_from_json():
+    """Load the point sets stored in data1.json and data2.json."""
+    datas = []
+    for file_name in ('data1.json', 'data2.json'):
+        with open(file_name, 'r') as d:
+            datas.append(json.load(d))
+    return datas
 
 
-def get_data1():
-    data = []
-    for _ in range(200):
-        data.append(get_random_point((0, 0), 1))
-    return data
+def get_datas_random():
+    """Generate both point sets with utils.get_data1 and utils.get_data2."""
+    return [get_data() for get_data in (utils.get_data1, utils.get_data2)]
 
 
-def get_data2():
-    data = []
-    for i in range(2):
-        for _ in range(100):
-            data.append(get_random_point((3*((-1)**i), 0), 0.5))
-    return data
-
-
-def plot_data(data):
-    lst_x, lst_y = zip(*data)
-    lst_x = list(lst_x)
-    lst_y = list(lst_y)
-    plt.figure(1)
-    ax = plt.axes()
-    ax.scatter(lst_x, lst_y)
-    ax.set_xlabel('X')
-    ax.set_ylabel('Y')
-    plt.grid(True)
-    plt.show()
-
-
-def plot_kmeans(all_data, k):
-    fig, ax = plt.subplots()
-    ax.set_xlabel('X')
-    ax.set_ylabel('Y')
-    ax.set_title(f'k={k}')
-    time_text = ax.text(0.05, 0.95, 'iter=0', horizontalalignment='left',
-                        verticalalignment='top', transform=ax.transAxes)
-    plt.grid(True)
-    centroid_scatters = []
-    cluster_scatters = []
-    centroids, clusters = all_data[0]
-    for key in clusters:
-        color = get_color(key/k)
-        if clusters[key]:
-            lst_x, lst_y = zip(*clusters[key])
-            lst_x = list(lst_x)
-            lst_y = list(lst_y)
-            cluster_scatters.append(ax.scatter(lst_x, lst_y, color=color))
-        centroid_scatters.append(ax.scatter([centroids[key][0]], [
-            centroids[key][1]], color=color, marker='X'))
-    def update_plot_kmeans(i):
-        centroids, clusters = all_data[i]
-        time_text.set_text(f'iter={i}')
-        for key in clusters:
-            centroid_scatters[key].set_offsets(centroids[key])
-            cluster_scatters[key].set_offsets(clusters[key])
-        return centroid_scatters+cluster_scatters+[time_text, ]
-    anim = FuncAnimation(fig, update_plot_kmeans,
-                         frames=len(all_data), blit=True)
-    # anim.save('animation.mp4')
-
-    plt.show()
-
-
-def calc_length(a, b):
-    # return ((b[0]-a[0])**2+(b[1]-a[1])**2)**0.5
-    # no need to calculate square root for comparison
-    return (b[0]-a[0])**2+(b[1]-a[1])**2
-
-
-def init_centroids(data, k, method='forgy'):  # TODO: Add k-means++ and Random Partition
-    match method:
-        case 'forgy':
-            return sample(data, k)
-        case 'random_partition':
-            shuffled = list(data)
-            shuffle(shuffled)
-            div = len(shuffled)/k
-            partition = [shuffled[int(round(div*i)):int(round(div*(i+1)))] for i in range(k)]
-            return [np.mean(prt, axis=0) for prt in partition]
-        case _:
-            raise NotImplementedError(
-                f'method {method} is not implemented yet')
-
-
-def calc_error(centroids, clusters, k):
-    squared_errors = []
-    for i in range(k):
-        cluster = np.array(clusters[i])
-        centroid = np.array([centroids[i] for _ in range(len(cluster))])
-        errors = cluster - centroid
-        squared_errors.append([e**2 for e in errors])
-    return sum([np.mean(err) if err else 0 for err in squared_errors])
-
-
-def plot_error_data(error_data):
-    fig, ax = plt.subplots()
-    ax.set_xlabel('k')
-    ax.set_ylabel('err')
-    ax.set_xlim(2, 20)
-    plt.title('Errors')
-    plt.grid(True)
-
-    lst_x, lst_y = zip(*error_data)
-    lst_x = list(lst_x)
-    lst_y = list(lst_y)
-    ax.plot(lst_x, lst_y, 'ro-')
-
-    plt.show()
-
-
-def print_stats(k, data):
-    print('='*20)
-    print(f'k={k}')
-    errs = [x[1] for x in data]
-    m = np.mean(errs)
-    std = np.std(errs)
-    min_err = np.min(errs)
-    lst_empty = [sum([1 for cluster in centroids_with_clusters[1] if not cluster]) for centroids_with_clusters,_ in data]
-    print(lst_empty)
-
-
-def main(datas):
-    # for get_data in [get_data1, get_data2]:
-    #     data = get_data()
+def main():
+    """Cluster every data set with every method for k = 2..20 and plot it."""
+    datas = get_datas_from_json()
+    index = 1
     for data in datas:
-        plot_data(data)
+        utils.plot_data(data)
         for method in METHODS:
+            print(f'Method: {method}')
             kmeans_data = {}
-            for k in [20]:  # range(2, 21):
-                kmeans_with_err = []
-                for _ in range(100):
-                    centroids_with_clusters = []
-                    centroids = init_centroids(data, k, method=method)
-                    clusters = {}
-                    for i in range(k):
-                        clusters[i] = []
-                    for point in data:
-                        lengths = [calc_length(c, point) for c in centroids]
-                        index_min = np.argmin(lengths)
-                        clusters[index_min].append(point)
-                    centroids_with_clusters.append((list(centroids), clusters))
-                    for _ in range(100):
-                        for key in clusters:
-                            if clusters[key]:
-                                centroids[key] = np.mean(clusters[key], axis=0)
-                        clusters = {}
-                        for i in range(k):
-                            clusters[i] = []
-                        for point in data:
-                            lengths = [calc_length(c, point)
-                                       for c in centroids]
-                            index_min = np.argmin(lengths)
-                            clusters[index_min].append(point)
-                        centroids_with_clusters.append(
-                            (list(centroids), clusters))
-                        if all([all(np.isclose(centroids_with_clusters[-1][0][i], centroids_with_clusters[-2][0][i])) for i in range(k)]):
-                            break
-                    err = calc_error(centroids, clusters, k)
-                    kmeans_with_err.append((centroids_with_clusters, err))
-                print_stats(k, [(iterations[-1],err) for iterations, err in kmeans_with_err])
+            for k in range(2, 21):
+                kmeans_with_err = km.kmeans(data, method, k)
+                km.print_stats(k, [(iterations[-1], err)
+                                   for iterations, err in kmeans_with_err])
                 min_err = kmeans_with_err[0][1]
                 kmeans = kmeans_with_err[0][0]
                 for temp_kmeans, err in kmeans_with_err:
                     if err < min_err:
                         min_err = err
                         kmeans = temp_kmeans
-                kmeans_data[k]=(kmeans, min_err)
-                plot_kmeans(kmeans, k)
-            #error_data = [[i, kmeans_data[i][1]] for i in range(2, 21, 2)]
-            #plot_error_data(error_data)
+                kmeans_data[k] = (kmeans, min_err)
+                km.plot_kmeans(kmeans, k, f'_{method}_{k}_{index}')
+            error_data = [[i, kmeans_data[i][1]] for i in range(2, 21, 2)]
+            utils.plot_error_data(error_data)
+        index += 1
 
 
 if __name__ == '__main__':
-    datas = []
-    with open('data1.json', 'r') as d:
-        datas.append(json.loads(d.read()))
-    with open('data2.json', 'r') as d:
-        datas.append(json.loads(d.read()))
-    main(datas)
+    main()