diff --git a/zad3/.vscode/launch.json b/zad3/.vscode/launch.json new file mode 100644 index 0000000..c8d2773 --- /dev/null +++ b/zad3/.vscode/launch.json @@ -0,0 +1,15 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Python: Current File", + "type": "python", + "request": "launch", + "program": "zad3.py", + "console": "integratedTerminal" + } + ] +} \ No newline at end of file diff --git a/zad3/bz_236713_ml_195642_zad3.pdf b/zad3/bz_236713_ml_195642_zad3.pdf new file mode 100644 index 0000000..ee2624b --- /dev/null +++ b/zad3/bz_236713_ml_195642_zad3.pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4926ec2f52c6955b4436cca4bdc2db71fec86471081516189a9348f68879aad0 +size 557310 diff --git a/zad3/data1_errors.png b/zad3/data1_errors.png deleted file mode 100644 index 426121d..0000000 --- a/zad3/data1_errors.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c9f2b395c6b354afc855fb1826b4908bbaa2e3c5fdabee0723d6cbc7c90eaeee -size 17887 diff --git a/zad3/data2_errors.png b/zad3/data2_errors.png deleted file mode 100644 index bce4f2c..0000000 --- a/zad3/data2_errors.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c73cce33452f045cd249468cfa1de26a3e9048a0452680aceb217102ea9fe0e5 -size 23167 diff --git a/zad3/kmeans.py b/zad3/kmeans.py index aa9323b..ca2571f 100644 --- a/zad3/kmeans.py +++ b/zad3/kmeans.py @@ -50,14 +50,47 @@ def plot_kmeans(all_data, k, name_suffix): plt.show() +def plot_kmeans_change(all_data, k, name_suffix, show=True): + fig, ax = plt.subplots() + ax.set_xlabel('X') + ax.set_ylabel('Y') + ax.set_title(f'k={k}') + + plt.grid(True) + cluster_scatters = {} + _, clusters = all_data[-1] + for key in clusters: + color = utils.get_color(key / k) + if clusters[key]: + lst_x, lst_y = zip(*clusters[key]) + lst_x = list(lst_x) + lst_y = list(lst_y) + cluster_scatters[key] = ax.scatter(lst_x, lst_y, color=color) + all_centroids, _ = zip(*all_data) + key = 0 + for centroids in zip(*all_centroids): + color = utils.get_color(key / k) + lst_x, lst_y = zip(*centroids) + lst_x = list(lst_x) + lst_y = list(lst_y) + cluster_scatters[key] = ax.plot( + lst_x, lst_y, color=color, marker='X') + key += 1 + + fig.savefig(f'kmeans_change{name_suffix}') + if show: + plt.show() + else: + plt.close(fig) + + def calc_error(centroids, clusters, k): - squared_errors = [] + errors = 0 for i in range(k): - cluster = np.array(clusters[i]) - centroid = np.array([centroids[i] for _ in range(len(cluster))]) - errors = cluster - centroid - squared_errors.append([e ** 2 for e in errors]) - return sum([np.mean(err) if err else 0 for err in squared_errors]) + for point in clusters[i]: + errors += np.sqrt(utils.calc_length(point, centroids[i])) + points_count = sum([len(clusters[n]) for n in clusters]) + return errors/points_count def plot_error_data(error_data): @@ -83,10 +116,11 @@ def print_stats(k, data): m = np.mean(errs) std = np.std(errs) min_err = np.min(errs) - empty_clusters = [sum([1 for cluster in sample.values() if not cluster]) for sample in clusters] + empty_clusters = [ + sum([1 for cluster in sample.values() if not cluster]) for sample in clusters] empty_clusters_mean = sum(empty_clusters)/len(empty_clusters) empty_clusters_std = np.std(empty_clusters) - print(f'MSE={m}') + print(f'Średni błąd={m}') print(f'std={std}') print(f'min(err)={min_err}') print(f'Mean of empty clusters count={empty_clusters_mean}') @@ -129,7 +163,7 @@ def kmeans(data, method, k): return kmeans_with_err -def init_units(data, k, method='forgy'): # TODO: Add k-units++ and Random Partition +def init_units(data, k, method='forgy'): # TODO: Add k-units++ match method: case 'forgy': return sample(data, k) diff --git a/zad3/ml_195642_zad3.odt b/zad3/ml_195642_zad3.odt deleted file mode 100644 index b1b0004..0000000 --- a/zad3/ml_195642_zad3.odt +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6e7a4a417d8c0c27e15b86dc40de94e044ee3152320f923b2021c155c355a633 -size 708612 diff --git a/zad3/som.py b/zad3/som.py index 3a34bc7..00c8777 100644 --- a/zad3/som.py +++ b/zad3/som.py @@ -5,25 +5,26 @@ from random import shuffle import numpy as np -def find_bmu(som, x): +def find_bmu(som, exhausted, x): '''Return the (g,h) index of the BMU in the grid''' #wrong_dist_sq = np.asarray([u.calc_length(x, s) for s in som]) - dist_sq = (np.square(som - x)).sum(axis=2) + dist_sq = exhausted * (np.square(som - x)).sum(axis=2) return np.unravel_index(np.argmin(dist_sq, axis=None), dist_sq.shape) -def dist_comp(som, x): +def dist_comp(som, exhausted, x): distsq = [] for i in range(som.shape[0]): for j in range(som.shape[1]): - distsq.append([(i, j), u.calc_length(x, som[i][j])]) + distsq.append([(i, j), exhausted[i][j] * + u.calc_length(x, som[i][j])]) return sorted(distsq, key=lambda x: x[1]) -# Update the weights of the SOM cells when given a single training example -# and the model parameters along with BMU coordinates as a tuple -def update_weights(som, train_ex, learn_rate, radius_sq, - bmu_coord, algorithm, step=3): +def update_weights(som, exhausted, train_ex, learn_rate, radius_sq, + bmu_coord, algorithm): + '''Update the weights of the SOM cells when given a single training example + and the model parameters along with BMU coordinates as a tuple''' g, h = bmu_coord # if radius is close to zero then only BMU is changed if radius_sq < 1e-3: @@ -33,19 +34,19 @@ def update_weights(som, train_ex, learn_rate, radius_sq, match algorithm: case 'kohonen': # Change all cells in a neighborhood of BMU - for i in range(max(0, g-step), min(som.shape[0], g+step)): - for j in range(max(0, h-step), min(som.shape[1], h+step)): + for i in range(som.shape[0]): + for j in range(som.shape[1]): dist_sq = np.square(i - g) + np.square(j - h) dist_func = np.exp(-dist_sq / 2 / radius_sq) som[i, j, :] += learn_rate * \ dist_func * (train_ex - som[i, j, :]) case 'neuron gas': - dist_rank = dist_comp(som, train_ex) + dist_rank = dist_comp(som, exhausted, train_ex) for i in range(len(dist_rank)): dist_func = np.exp(-i / 2 / np.sqrt(radius_sq)) - som[dist_rank[i][0], dist_rank[i][1], :] += \ + som[dist_rank[i][0][0], dist_rank[i][0][1], :] += \ learn_rate * dist_func * \ - (train_ex - som[dist_rank[i][0], dist_rank[i][1], :]) + (train_ex - som[dist_rank[i][0][0], dist_rank[i][0][1], :]) case _: raise NotImplementedError( @@ -57,29 +58,33 @@ def train_som(som, train_data, learn_rate=.1, radius_sq=1, lr_decay=.1, radius_decay=.1, epochs=20, algorithm='kohonen'): '''Main routine for training an SOM. It requires an initialized SOM grid or a partially trained grid as parameter''' + exhausted = np.ones((som.shape[0], som.shape[1])) learn_rate_0 = learn_rate radius_0 = radius_sq - soms_with_error = [(som.copy(), calc_som_error(som, train_data))] + soms_with_error = [ + (som.copy(), calc_som_error(som, exhausted, train_data))] for epoch in np.arange(epochs): shuffle(train_data) for train_ex in train_data: - g, h = find_bmu(som, train_ex) - som = update_weights(som, train_ex, + g, h = find_bmu(som, exhausted, train_ex) + som = update_weights(som, exhausted, train_ex, learn_rate, radius_sq, (g, h), algorithm) + exhausted[g][h] += 1 # Update learning rate and radius learn_rate = learn_rate_0 * np.exp(-epoch * lr_decay) radius_sq = radius_0 * np.exp(-epoch * radius_decay) - error = calc_som_error(som, train_data) + exhausted = np.ones((som.shape[0], som.shape[1])) + error = calc_som_error(som, exhausted, train_data) soms_with_error.append((som.copy(), error)) if error < 1e-3: break return soms_with_error -def calc_som_error(som, train_data): +def calc_som_error(som, exhausted, train_data): errors = [] for train_ex in train_data: - g, h = find_bmu(som, train_ex) + g, h = find_bmu(som, exhausted, train_ex) errors.append(u.calc_length(train_ex, som[g][h])) return np.mean(np.sqrt(np.asarray(errors))) @@ -135,6 +140,8 @@ def init_neurons(data, k, rand: np.random.RandomState = None, method='random'): def print_som_stats(soms_with_errors, train_data): print('=' * 20) + exhausted = np.ones( + (soms_with_errors[0][0].shape[0], soms_with_errors[0][0].shape[1])) soms, errs = zip(*soms_with_errors) m = np.mean(errs) std = np.std(errs) @@ -142,7 +149,7 @@ def print_som_stats(soms_with_errors, train_data): dead_neurons_count = [] for som in soms: dead_neurons_count.append( - 20-len(set([find_bmu(som, x) for x in train_data]))) + 20-len(set([find_bmu(som, exhausted, x) for x in train_data]))) print("Średni błąd: ", m) print("Odchylenie standardowe: ", std) print("Błąd minimalny: ", min_err) @@ -150,3 +157,4 @@ def print_som_stats(soms_with_errors, train_data): f'Średnia liczba nieaktywnych neuronów: {np.mean(dead_neurons_count)}') print( f'Odchylenie standardowe liczby nieaktywnych neuronów: {np.std(dead_neurons_count)}') + print('=' * 20) diff --git a/zad3/utils.py b/zad3/utils.py index c678fd1..d6f84dd 100644 --- a/zad3/utils.py +++ b/zad3/utils.py @@ -9,7 +9,7 @@ def get_color(i): def calc_length(a, b): '''Calculate Euclidian distance between points''' - assert len(a)==len(b) + assert len(a) == len(b) return np.square(np.asarray(b)-np.asarray(a)).sum() @@ -26,7 +26,7 @@ def plot_data(data): plt.show() -def plot_error_data(error_data): +def plot_error_data(error_data, fname=None): fig, ax = plt.subplots() ax.set_xlabel('k') ax.set_ylabel('err') @@ -39,7 +39,10 @@ def plot_error_data(error_data): lst_y = list(lst_y) ax.plot(lst_x, lst_y, 'ro-') - plt.show() + if fname: + plt.savefig(fname) + else: + plt.show() def get_data1(): diff --git a/zad3/zad3.py b/zad3/zad3.py index 91168bf..56b405f 100644 --- a/zad3/zad3.py +++ b/zad3/zad3.py @@ -6,6 +6,9 @@ import json METHODS = ['forgy', 'random_partition'] SOM_INIT_METHODS = ['random', 'zeros'] +SOM_ALGORITHMS = ['kohonen', 'neuron gas'] +SOM_PARAMETERS_SETS = [(.1, .5), (.1, .5), (.1, 1), (.33, .1), (.33, .5), (.33, 1), (.66, .1), (.66, .5), (.66, 1), + (.99, .1), (.99, .5), (.99, 1)] def get_datas_from_json(): @@ -26,33 +29,46 @@ def get_datas_random(): def main(): datas = get_datas_from_json() + benchmark_errors = False rand = np.random.RandomState(0) index = 1 print("Self-organizing map") for data in datas: print(f'Data set: {index}') utils.plot_data(data) - for method in SOM_INIT_METHODS: - print(f'Initialization method: {method}') - errors = [] - for k in range(2, 21, 2): - som_data = som.init_neurons(data, k, rand, method) - soms_with_error = som.train_som(som_data, data, algorithm='kohonen') - error = soms_with_error[-1][1] - errors.append((k, error)) - soms,_ = zip(*soms_with_error) - #som.plot_with_data(soms, data, f'_{method}_{k}_data{index}') - utils.plot_error_data(errors) - soms_with_errors = [] - for _ in range(100): - som_data = som.init_neurons(data, k, rand, method) - soms_with_error = som.train_som(som_data, data, algorithm='kohonen') - soms_with_errors.append(soms_with_error[-1]) - som.print_som_stats(soms_with_errors, data) + for algorithm in SOM_ALGORITHMS: + print(f'Weights update algorithm: {algorithm}') + for method in SOM_INIT_METHODS: + print(f'Initialization method: {method}') + for param_set in SOM_PARAMETERS_SETS: + print( + f'Learn rate: {param_set[0]}, Radius square: {param_set[1]}') + errors = {} + for k in range(2, 21): + som_data = som.init_neurons(data, k, rand, method) + soms_with_error = som.train_som(som_data, data, learn_rate=param_set[0], radius_sq=param_set[1], + algorithm=algorithm) + error = soms_with_error[-1][1] + errors[k] = error + soms, _ = zip(*soms_with_error) + som.plot_with_data( + soms, data, f'_LR{param_set[0]}_RSQ{param_set[1]}_{algorithm}_{method}_neurons{k}_data{index}') + if all([i in errors for i in range(2, 21, 2)]): + fname = f'som_errors_data{index}_{SOM_PARAMETERS_SETS.index(param_set)}_{algorithm}_{method}.png' + utils.plot_error_data([(k, errors[k]) for k in range(2, 21, 2)], fname=fname) + if benchmark_errors: + soms_with_errors = [] + for _ in range(100): + som_data = som.init_neurons(data, 20, rand, method) + soms_with_error = som.train_som( + som_data, data, algorithm=algorithm) + soms_with_errors.append(soms_with_error[-1]) + som.print_som_stats(soms_with_errors, data) index += 1 index = 1 for data in datas: + print(f'Data set {index}') utils.plot_data(data) for method in METHODS: print(f'Method: {method}') @@ -60,7 +76,7 @@ def main(): for k in range(2, 21): kmeans_with_err = km.kmeans(data, method, k) km.print_stats(k, [(iterations[-1], err) - for iterations, err in kmeans_with_err]) + for iterations, err in kmeans_with_err]) min_err = kmeans_with_err[0][1] kmeans = kmeans_with_err[0][0] for temp_kmeans, err in kmeans_with_err: @@ -69,9 +85,12 @@ def main(): kmeans = temp_kmeans kmeans_data[k] = (kmeans, min_err) km.plot_kmeans(kmeans, k, f'_{method}_{k}_{index}') - error_data = [[i, kmeans_data[i][1]] for i in range(2, 21, 2)] - utils.plot_error_data(error_data) - index += 1 + if k in [2, 10]: + km.plot_kmeans_change(kmeans, k, f'_{method}_{k}_{index}') + if all([i in kmeans_data for i in range(2, 21, 2)]): + error_data = [[i, kmeans_data[i][1]] for i in range(2, 21, 2)] + utils.plot_error_data(error_data) + index += 1 if __name__ == '__main__':