"""
Computer data analysis
Assignment 2
Michał Leśniak 195642
"""
from math import sin
from statistics import mean

import matplotlib.pyplot as plt
import numpy as np

from chi2_normality import chi2normality_describe


def var(lst):
    """Return the population variance of *lst* (divides by ``len(lst)``)."""
    x_mean = mean(lst)
    return sum((x - x_mean) ** 2 for x in lst) / len(lst)


def cov(lst_x, lst_y):
    """Return the population covariance of two equally long sequences.

    Raises:
        ValueError: if the two sequences differ in length.
    """
    # Explicit check instead of ``assert`` so it is not stripped under -O.
    if len(lst_x) != len(lst_y):
        raise ValueError('lst_x and lst_y must have the same length')
    x_mean = mean(lst_x)
    y_mean = mean(lst_y)
    return sum(
        (x - x_mean) * (y - y_mean) for x, y in zip(lst_x, lst_y)
    ) / len(lst_x)


def load_data(*args):
    """Load comma-separated numeric files.

    Args:
        *args: paths of the files to read.

    Returns:
        A tuple with one entry per path; each entry is a list of tuples of
        floats, one tuple per non-blank line of the file.
    """
    datasets = []
    for path in args:
        with open(path, 'r', encoding='utf-8') as f:
            rows = [
                # float() tolerates surrounding whitespace on its own.
                tuple(float(field) for field in line.split(','))
                for line in f.read().splitlines()
                # Blank lines would otherwise crash on float('').
                if line.strip()
            ]
        datasets.append(rows)
    return tuple(datasets)


def reglin(data, name, model):
    """Fit *model* to *data*, print fit statistics and show diagnostic plots.

    Args:
        data: list of rows; rows of 2 values are treated as ``(x, y)`` and
            rows of 3 values as ``(x1, x2, y)``.
        name: label used in the printed report and the plot titles.
        model: ``(model_func, use_reglinw, func_str)`` tuple as stored in
            ``MODELS``; ``use_reglinw`` selects ``reglinw`` over ``reglinp``.

    Raises:
        RuntimeError: if the rows have neither 2 nor 3 values (raised after
            the textual report has been printed).
    """
    model_func, use_reglinw, func_str = model
    if use_reglinw:
        Y, Z, param_str = reglinw(data, model_func)
    else:
        Y, Z, param_str = reglinp(data, model_func)

    # Y, Z are column vectors; flatten them to plain lists for statistics.
    err = Y - Z
    lst_err = np.transpose(err)[0].tolist()
    lst_y = np.transpose(Y)[0].tolist()
    lst_z = np.transpose(Z)[0].tolist()

    mse = mean([e ** 2 for e in lst_err])
    md = max(abs(e) for e in lst_err)
    var_err = var(lst_err)
    var_y = var(lst_y)
    r2 = 1 - (var_err / var_y)

    n_columns = len(data[0])
    if n_columns > 2:
        print(f'Regresja liniowa wielu zmiennych dla {name}:')
    else:
        print(f'Prosta regresja liniowa jednej zmiennej dla {name}:')
    print(func_str)
    print(param_str)
    print(f'MSE={mse}')
    print(f'maxD={md}')
    print(f'VarErr<=VarY - {var_err<=var_y}')
    print(f'r2={r2}')
    # NOTE(review): chi2normality_describe is defined in chi2_normality and
    # not visible here; presumably it reports a normality test of the errors.
    chi2normality_describe(lst_err)

    if n_columns == 2:
        # 2D plot: data points plus the fitted curve.
        lst_x, lst_y = zip(*data)
        lst_x = list(lst_x)
        lst_y = list(lst_y)
        plt.figure(1)
        ax = plt.axes()
        ax.scatter(lst_x, lst_y)
        # Draw the curve in increasing-x order; connecting points in file
        # order zig-zags for non-linear models when x is not sorted.
        line_x, line_z = zip(*sorted(zip(lst_x, lst_z)))
        ax.plot(line_x, line_z, 'r-')
        ax.set_xlabel('X')
        ax.set_ylabel('Y')
        plt.grid(True)
    elif n_columns == 3:
        # 3D plot: measured points and fitted points (red).
        lst_x1, lst_x2, lst_y = zip(*data)
        lst_x1 = list(lst_x1)
        lst_x2 = list(lst_x2)
        lst_y = list(lst_y)
        plt.figure(1)
        ax = plt.axes(projection='3d')
        ax.scatter(lst_x1, lst_x2, lst_y)
        ax.scatter(lst_x1, lst_x2, lst_z, color='r')
        ax.set_xlabel('X1')
        ax.set_ylabel('X2')
        ax.set_zlabel('Y')
    else:
        raise RuntimeError(
            f'cannot plot rows with {n_columns} values; expected 2 or 3')
    plt.title(f'{name}\n{func_str}')

    # Histogram of the residuals.
    plt.figure(2)
    plt.hist(lst_err, 50)
    plt.xlabel('Err')
    plt.title(f'Histogram Err dla {name}\n{func_str}')
    plt.grid(True)
    plt.show()


def reglinp(data, model_func):
    """Fit a closed-form single-variable model.

    Splits the ``(x, y)`` rows of *data* into two lists and returns
    ``model_func(lst_x, lst_y)`` unchanged.
    """
    lst_x, lst_y = zip(*data)
    return model_func(list(lst_x), list(lst_y))


def reglinw(data, prepare_data):
    """Fit a linear-in-parameters model with the normal equations.

    Args:
        data: rows passed straight to *prepare_data*.
        prepare_data: callable returning ``(X, Y)``, the design matrix and
            the column vector of observations.

    Returns:
        ``(Y, Z, param_str)`` where ``Z = X @ A`` is the fitted column vector
        and ``param_str`` lists the coefficients, one ``letter = value`` per
        line.

    Raises:
        numpy.linalg.LinAlgError: if ``X^T X`` cannot be inverted.
    """
    X, Y = prepare_data(data)
    XT = np.transpose(X)
    XTX = np.matmul(XT, X)
    try:
        inv_XTX = np.linalg.inv(XTX)
    except np.linalg.LinAlgError:  # was misspelled ``np.linang``
        print("XTX is not inversible")
        raise
    A = np.matmul(np.matmul(inv_XTX, XT), Y)
    Z = np.matmul(X, A)

    params = [a[0] for a in A]
    # The first design-matrix column is the constant term; move it last so
    # the letters line up with the formulas in MODELS (constant named last).
    params = params[1:] + params[:1]
    param_str = [
        f'{chr(ord("a") + i)} = {value}' for i, value in enumerate(params)
    ]
    return Y, Z, '\n'.join(param_str)


def model_func1(lst_x, lst_y):
    """Fit ``f(X) = aX``; return ``(Y, Z, param_str)`` with column vectors."""
    a = mean([y * x for x, y in zip(lst_x, lst_y)]) / \
        mean([x ** 2 for x in lst_x])
    Y = np.array([[y] for y in lst_y])
    Z = np.array([[a * x] for x in lst_x])
    return Y, Z, f'a = {a}'


def model_func2(lst_x, lst_y):
    """Fit ``f(X) = aX + b``; return ``(Y, Z, param_str)`` with column vectors."""
    a = cov(lst_x, lst_y) / var(lst_x)
    b = mean(lst_y) - a * mean(lst_x)
    Y = np.array([[y] for y in lst_y])
    Z = np.array([[a * x + b] for x in lst_x])
    return Y, Z, f'a = {a}\nb = {b}'


def model_func3(data):
    """Return ``(X, Y)`` for ``f(X) = aX^2 + b sin(X) + c``.

    Design-matrix columns are ``1, x^2, sin(x)``.
    """
    X = np.array([[1.0, x ** 2, sin(x)] for x, _ in data])
    Y = np.array([[y] for _, y in data])
    return X, Y


def model_func4(data):
    """Return ``(X, Y)`` for ``f(X1, X2) = aX1 + bX2 + c``.

    Design-matrix columns are ``1, x1, x2``.
    """
    X = np.array([[1.0, x1, x2] for x1, x2, _ in data])
    Y = np.array([[y] for _, _, y in data])
    return X, Y


def model_func5(data):
    """Return ``(X, Y)`` for the full quadratic model in two variables.

    Design-matrix columns are ``1, x1^2, x1*x2, x2^2, x1, x2``.
    """
    X = np.array(
        [[1.0, x1 ** 2, x1 * x2, x2 ** 2, x1, x2] for x1, x2, _ in data])
    Y = np.array([[y] for _, _, y in data])
    return X, Y


# Each entry: (model function, fit with reglinw instead of reglinp,
# formula shown in the report and plot titles).
MODELS = [
    (model_func1, False, '$f(X) = aX$'),
    (model_func2, False, '$f(X) = aX + b$'),
    (model_func3, True, '$f(X) = aX^2 + bsin(X) + c$'),
    (model_func4, True, '$f(X_1, X_2) = aX_1 + bX_2 + c$'),
    (model_func5, True,
     r'$f(X_1, X_2) = a{X_1}^2 + bX_1 X_2 + c{X_2}^2 +dX_1 +eX_2 +f$')
]


def main():
    """Run the first three models on data1/data2 and the last two on data3/data4."""
    data1, data2, data3, data4 = load_data(
        'data1.csv', 'data2.csv', 'data3.csv', 'data4.csv')
    for i in range(3):
        reglin(data1, 'data1.csv', MODELS[i])
        reglin(data2, 'data2.csv', MODELS[i])
    for i in range(3, 5):
        reglin(data3, 'data3.csv', MODELS[i])
        reglin(data4, 'data4.csv', MODELS[i])


if __name__ == '__main__':
    main()