From 28d5d702ac1cddc9fb28d49f1cee8fb4e30db738 Mon Sep 17 00:00:00 2001
From: Kapitan <kapitan@mlesniak.pl>
Date: Mon, 7 Feb 2022 15:24:23 +0100
Subject: [PATCH] [Zad3] Po odpowiedzi

---
 zad3/data1_errors.png |   4 +-
 zad3/kmeans.py        | 145 ++++++++++++++++++++++++++++++
 zad3/utils.py         |  56 ++++++++++++
 zad3/zad3.py          | 202 ++++++------------------------------------
 4 files changed, 232 insertions(+), 175 deletions(-)
 create mode 100644 zad3/kmeans.py
 create mode 100644 zad3/utils.py

diff --git a/zad3/data1_errors.png b/zad3/data1_errors.png
index 290f4db..426121d 100644
--- a/zad3/data1_errors.png
+++ b/zad3/data1_errors.png
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:ce4ddc242499f89264fdb8ba0d43d995cc617c2180a17c49be9e3a6a41612a82
-size 20193
+oid sha256:c9f2b395c6b354afc855fb1826b4908bbaa2e3c5fdabee0723d6cbc7c90eaeee
+size 17887
diff --git a/zad3/kmeans.py b/zad3/kmeans.py
new file mode 100644
index 0000000..aa9323b
--- /dev/null
+++ b/zad3/kmeans.py
@@ -0,0 +1,145 @@
+import matplotlib.pyplot as plt
+import utils
+from matplotlib.animation import FuncAnimation
+from random import sample, shuffle
+import numpy as np
+
+
+def plot_kmeans(all_data, k, name_suffix):
+    """Animate a k-means run, save it as a GIF named by name_suffix, show it.
+
+    all_data holds one (centroids, clusters) pair per iteration, where
+    clusters maps a cluster index to its list of (x, y) points.
+    """
+    fig, ax = plt.subplots()
+    ax.set_xlabel('X')
+    ax.set_ylabel('Y')
+    ax.set_title(f'k={k}')
+    time_text = ax.text(0.05, 0.95, 'iter=0', horizontalalignment='left',
+                        verticalalignment='top', transform=ax.transAxes)
+    plt.grid(True)
+    centroid_scatters = []
+    cluster_scatters = {}
+    centroids, clusters = all_data[0]
+    for key in clusters:
+        color = utils.get_color(key / k)
+        if clusters[key]:
+            lst_x, lst_y = (list(axis) for axis in zip(*clusters[key]))
+            cluster_scatters[key] = ax.scatter(lst_x, lst_y, color=color)
+        centroid_scatters.append(ax.scatter([centroids[key][0]], [
+            centroids[key][1]], color=color, marker='X'))
+
+    def update_plot_kmeans(i):
+        centroids, clusters = all_data[i]
+        time_text.set_text(f'iter={i}')
+        for key in clusters:
+            centroid_scatters[key].set_offsets(centroids[key])
+            if key in cluster_scatters:
+                # An empty cluster must clear the points drawn for it earlier.
+                cluster_scatters[key].set_offsets(clusters[key] or np.empty((0, 2)))
+            elif clusters[key]:
+                lst_x, lst_y = (list(axis) for axis in zip(*clusters[key]))
+                cluster_scatters[key] = ax.scatter(
+                    lst_x, lst_y, color=utils.get_color(key / k))
+        return centroid_scatters + list(cluster_scatters.values()) + [time_text, ]
+
+    anim = FuncAnimation(fig, update_plot_kmeans,
+                         frames=len(all_data), blit=True)
+    anim.save(f'animationKMEANS{name_suffix}.gif')
+
+    plt.show()
+
+
+def calc_error(centroids, clusters, k):
+    """Return the sum of per-cluster mean squared offsets from the centroid."""
+    total = 0
+    for idx in range(k):
+        if clusters[idx]:  # an empty cluster contributes nothing
+            offsets = np.array(clusters[idx]) - np.array(centroids[idx])
+            total += np.mean(offsets ** 2)
+    return total
+
+
+def plot_error_data(error_data):
+    """Draw the clustering error as a function of k and show the figure.
+
+    error_data is an iterable of (k, err) pairs.
+    """
+    _, axes = plt.subplots()
+    axes.set_xlabel('k')
+    axes.set_ylabel('err')
+    axes.set_xlim(2, 20)
+    plt.title('Errors')
+    plt.grid(True)
+    ks, errs = (list(column) for column in zip(*error_data))
+    axes.plot(ks, errs, 'ro-')
+    plt.show()
+
+
+def print_stats(k, data):
+    """Print error and empty-cluster statistics for the runs of one k.
+
+    data is a sequence of ((centroids, clusters), err) pairs, one per run.
+    """
+    print(f'k={k}')
+    states, errs = zip(*data)
+    empty_counts = [sum(1 for members in clusters.values() if not members)
+                    for _, clusters in states]
+    empty_mean = sum(empty_counts) / len(empty_counts)
+    print(f'MSE={np.mean(errs)}')
+    print(f'std={np.std(errs)}')
+    print(f'min(err)={np.min(errs)}')
+    print(f'Mean of empty clusters count={empty_mean}')
+    print(f'Standard deviation of empty clusters count={np.std(empty_counts)}')
+    print('=' * 20)
+
+
+def kmeans(data, method, k, runs=100, max_iter=100):
+    """Run k-means `runs` times and record every iteration of each run.
+
+    Returns one (history, err) pair per run: history lists the
+    (centroids, clusters) state after the initial assignment and after each
+    update, err is calc_error() of the last state.  A run ends after
+    max_iter updates or once no centroid moved (np.isclose).
+    """
+    def assign(centroids):
+        # Put every point into the cluster of its nearest centroid.
+        clusters = {i: [] for i in range(k)}
+        for point in data:
+            lengths = [utils.calc_length(c, point) for c in centroids]
+            clusters[int(np.argmin(lengths))].append(point)
+        return clusters
+
+    kmeans_with_err = []
+    for _ in range(runs):
+        centroids = init_units(data, k, method=method)
+        clusters = assign(centroids)
+        history = [(list(centroids), clusters)]
+        for _ in range(max_iter):
+            for key in clusters:
+                if clusters[key]:  # an empty cluster keeps its old centroid
+                    centroids[key] = np.mean(clusters[key], axis=0)
+            clusters = assign(centroids)
+            history.append((list(centroids), clusters))
+            if all(all(np.isclose(history[-1][0][i], history[-2][0][i]))
+                   for i in range(k)):
+                break
+        err = calc_error(centroids, clusters, k)
+        kmeans_with_err.append((history, err))
+    return kmeans_with_err
+
+
+def init_units(data, k, method='forgy'):  # TODO: Add k-means++
+    """Return k initial centroids of data chosen by method.
+
+    'forgy' samples k points; 'random_partition' averages k random parts.
+    """
+    if method == 'forgy':
+        return sample(data, k)
+    if method != 'random_partition':
+        raise NotImplementedError(f'method {method} is not implemented yet')
+    shuffled = list(data)
+    shuffle(shuffled)
+    div = len(shuffled) / k
+    bounds = [int(round(div * i)) for i in range(k + 1)]
+    return [np.mean(shuffled[lo:hi], axis=0) for lo, hi in zip(bounds, bounds[1:])]
diff --git a/zad3/utils.py b/zad3/utils.py
new file mode 100644
index 0000000..efb266c
--- /dev/null
+++ b/zad3/utils.py
@@ -0,0 +1,56 @@
+import matplotlib.pyplot as plt
+from generate_points import get_random_point
+
+
+def get_color(i):  # maps i through matplotlib's 'tab20' colormap
+    return plt.get_cmap('tab20')(i)
+
+
+def calc_length(a, b):
+    """Return the squared Euclidean distance between 2-D points a and b."""
+    dx, dy = b[0] - a[0], b[1] - a[1]  # no sqrt: only used for comparison
+    return dx ** 2 + dy ** 2
+
+
+def plot_data(data):
+    """Scatter-plot the (x, y) points of data on figure 1 and show it."""
+    # zip(*data) transposes the list of points into x and y columns.
+    xs, ys = zip(*data)
+    plt.figure(1)
+    axes = plt.axes()
+    axes.scatter(list(xs), list(ys))
+    axes.set_xlabel('X')
+    axes.set_ylabel('Y')
+    plt.grid(True)
+    plt.show()
+
+
+def plot_error_data(error_data):
+    """Plot err against k with red circle markers joined by a line.
+
+    error_data is an iterable of (k, err) pairs; the x axis spans 2..20.
+    """
+    figure, axis = plt.subplots()
+    axis.set_xlabel('k')
+    axis.set_ylabel('err')
+    axis.set_xlim(2, 20)
+    plt.title('Errors')
+    plt.grid(True)
+    k_values, err_values = map(list, zip(*error_data))
+    axis.plot(k_values, err_values, 'ro-')
+    plt.show()
+
+
+def get_data1():
+    """Return 200 points, each drawn with get_random_point((0, 0), 1)."""
+    return [
+        get_random_point((0, 0), 1) for _ in range(200)
+    ]
+
+
+def get_data2():
+    """Return 100 get_random_point((3, 0), 0.5) draws followed by 100
+    get_random_point((-3, 0), 0.5) draws."""
+    return [get_random_point((x, 0), 0.5)
+            for x in (3, -3)
+            for _ in range(100)]
diff --git a/zad3/zad3.py b/zad3/zad3.py
index 9589a2f..32a8bf4 100644
--- a/zad3/zad3.py
+++ b/zad3/zad3.py
@@ -1,194 +1,50 @@
-import matplotlib.pyplot as plt
-from matplotlib.animation import FuncAnimation
-from random import sample, shuffle
-from generate_points import get_random_point
-import numpy as np
+import kmeans as km
+import utils
 import json
 
-
 METHODS = ['forgy', 'random_partition']
 
 
-def get_color(i):
-    return plt.get_cmap('tab20')(i)
+def get_datas_from_json():
+    """Load the data sets stored in data1.json and data2.json, in order."""
+    datas = []
+    for path in ('data1.json', 'data2.json'):
+        with open(path, 'r') as handle:
+            datas.append(json.load(handle))
+    return datas
 
 
-def get_data1():
-    data = []
-    for _ in range(200):
-        data.append(get_random_point((0, 0), 1))
-    return data
+def get_datas_random():
+    """Generate both random data sets (utils.get_data1, then get_data2)."""
+    return [
+        make_data() for make_data in (utils.get_data1, utils.get_data2)
+    ]
 
 
-def get_data2():
-    data = []
-    for i in range(2):
-        for _ in range(100):
-            data.append(get_random_point((3*((-1)**i), 0), 0.5))
-    return data
-
-
-def plot_data(data):
-    lst_x, lst_y = zip(*data)
-    lst_x = list(lst_x)
-    lst_y = list(lst_y)
-    plt.figure(1)
-    ax = plt.axes()
-    ax.scatter(lst_x, lst_y)
-    ax.set_xlabel('X')
-    ax.set_ylabel('Y')
-    plt.grid(True)
-    plt.show()
-
-
-def plot_kmeans(all_data, k):
-    fig, ax = plt.subplots()
-    ax.set_xlabel('X')
-    ax.set_ylabel('Y')
-    ax.set_title(f'k={k}')
-    time_text = ax.text(0.05, 0.95, 'iter=0', horizontalalignment='left',
-                        verticalalignment='top', transform=ax.transAxes)
-    plt.grid(True)
-    centroid_scatters = []
-    cluster_scatters = []
-    centroids, clusters = all_data[0]
-    for key in clusters:
-        color = get_color(key/k)
-        if clusters[key]:
-            lst_x, lst_y = zip(*clusters[key])
-            lst_x = list(lst_x)
-            lst_y = list(lst_y)
-            cluster_scatters.append(ax.scatter(lst_x, lst_y, color=color))
-        centroid_scatters.append(ax.scatter([centroids[key][0]], [
-                                 centroids[key][1]], color=color, marker='X'))
-
-    def update_plot_kmeans(i):
-        centroids, clusters = all_data[i]
-        time_text.set_text(f'iter={i}')
-        for key in clusters:
-            centroid_scatters[key].set_offsets(centroids[key])
-            cluster_scatters[key].set_offsets(clusters[key])
-        return centroid_scatters+cluster_scatters+[time_text, ]
-    anim = FuncAnimation(fig, update_plot_kmeans,
-                         frames=len(all_data), blit=True)
-    # anim.save('animation.mp4')
-
-    plt.show()
-
-
-def calc_length(a, b):
-    # return ((b[0]-a[0])**2+(b[1]-a[1])**2)**0.5
-    # no need to calculate square root for comparison
-    return (b[0]-a[0])**2+(b[1]-a[1])**2
-
-
-def init_centroids(data, k, method='forgy'):  # TODO: Add k-means++ and Random Partition
-    match method:
-        case 'forgy':
-            return sample(data, k)
-        case 'random_partition':
-            shuffled = list(data)
-            shuffle(shuffled)
-            div = len(shuffled)/k
-            partition = [shuffled[int(round(div*i)):int(round(div*(i+1)))] for i in range(k)]
-            return [np.mean(prt, axis=0) for prt in partition]
-        case _:
-            raise NotImplementedError(
-                f'method {method} is not implemented yet')
-
-
-def calc_error(centroids, clusters, k):
-    squared_errors = []
-    for i in range(k):
-        cluster = np.array(clusters[i])
-        centroid = np.array([centroids[i] for _ in range(len(cluster))])
-        errors = cluster - centroid
-        squared_errors.append([e**2 for e in errors])
-    return sum([np.mean(err) if err else 0 for err in squared_errors])
-
-
-def plot_error_data(error_data):
-    fig, ax = plt.subplots()
-    ax.set_xlabel('k')
-    ax.set_ylabel('err')
-    ax.set_xlim(2, 20)
-    plt.title('Errors')
-    plt.grid(True)
-
-    lst_x, lst_y = zip(*error_data)
-    lst_x = list(lst_x)
-    lst_y = list(lst_y)
-    ax.plot(lst_x, lst_y, 'ro-')
-
-    plt.show()
-
-
-def print_stats(k, data):
-    print('='*20)
-    print(f'k={k}')
-    errs = [x[1] for x in data]
-    m = np.mean(errs)
-    std = np.std(errs)
-    min_err = np.min(errs)
-    lst_empty = [sum([1 for cluster in centroids_with_clusters[1] if not cluster]) for centroids_with_clusters,_ in data]
-    print(lst_empty)
-
-
-def main(datas):
-    # for get_data in [get_data1, get_data2]:
-    #    data = get_data()
+def main():  # cluster every JSON data set with each method for k = 2..20
+    datas = get_datas_from_json()
+    index = 1  # 1-based data-set number, used in the GIF name suffix
     for data in datas:
-        plot_data(data)
+        utils.plot_data(data)
         for method in METHODS:
+            print(f'Method: {method}')
             kmeans_data = {}
-            for k in [20]:  # range(2, 21):
-                kmeans_with_err = []
-                for _ in range(100):
-                    centroids_with_clusters = []
-                    centroids = init_centroids(data, k, method=method)
-                    clusters = {}
-                    for i in range(k):
-                        clusters[i] = []
-                    for point in data:
-                        lengths = [calc_length(c, point) for c in centroids]
-                        index_min = np.argmin(lengths)
-                        clusters[index_min].append(point)
-                    centroids_with_clusters.append((list(centroids), clusters))
-                    for _ in range(100):
-                        for key in clusters:
-                            if clusters[key]:
-                                centroids[key] = np.mean(clusters[key], axis=0)
-                        clusters = {}
-                        for i in range(k):
-                            clusters[i] = []
-                        for point in data:
-                            lengths = [calc_length(c, point)
-                                    for c in centroids]
-                            index_min = np.argmin(lengths)
-                            clusters[index_min].append(point)
-                        centroids_with_clusters.append(
-                            (list(centroids), clusters))
-                        if all([all(np.isclose(centroids_with_clusters[-1][0][i], centroids_with_clusters[-2][0][i])) for i in range(k)]):
-                            break
-                    err = calc_error(centroids, clusters, k)
-                    kmeans_with_err.append((centroids_with_clusters, err))
-                print_stats(k, [(iterations[-1],err) for iterations, err in kmeans_with_err])
+            for k in range(2, 21):
+                kmeans_with_err = km.kmeans(data, method, k)
+                km.print_stats(k, [(iterations[-1], err)  # final state of each run
+                               for iterations, err in kmeans_with_err])
                 min_err = kmeans_with_err[0][1]
                 kmeans = kmeans_with_err[0][0]
                 for temp_kmeans, err in kmeans_with_err:
                     if err < min_err:
                         min_err = err
                         kmeans = temp_kmeans
-                kmeans_data[k]=(kmeans, min_err)
-                plot_kmeans(kmeans, k)
-            #error_data = [[i, kmeans_data[i][1]] for i in range(2, 21, 2)]
-            #plot_error_data(error_data)
+                kmeans_data[k] = (kmeans, min_err)  # lowest-error run for this k
+                km.plot_kmeans(kmeans, k, f'_{method}_{k}_{index}')
+            error_data = [[i, kmeans_data[i][1]] for i in range(2, 21, 2)]
+            utils.plot_error_data(error_data)  # NOTE(review): even k only - confirm intended
+        index += 1
 
 
 if __name__ == '__main__':
-    datas = []
-    with open('data1.json', 'r') as d:
-        datas.append(json.loads(d.read()))
-    with open('data2.json', 'r') as d:
-        datas.append(json.loads(d.read()))
-    main(datas)
+    main()