From 2efbd423ada8bc638ce3d361e3c44c6cf6a1f64e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20Le=C5=9Bniak?= Date: Sun, 5 Dec 2021 19:59:44 +0100 Subject: [PATCH] Zad 2 wip --- zad2/README.md | 31 +++++++ zad2/chi2_normality.py | 159 ++++++++++++++++++++++++++++++++++ zad2/data1.csv | 100 +++++++++++++++++++++ zad2/data2.csv | 100 +++++++++++++++++++++ zad2/data3.csv | 100 +++++++++++++++++++++ zad2/data4.csv | 100 +++++++++++++++++++++ zad2/regresja-liniowa (1).pdf | 3 + zad2/zad2.py | 71 +++++++++++++++ 8 files changed, 664 insertions(+) create mode 100644 zad2/README.md create mode 100644 zad2/chi2_normality.py create mode 100644 zad2/data1.csv create mode 100644 zad2/data2.csv create mode 100644 zad2/data3.csv create mode 100644 zad2/data4.csv create mode 100644 zad2/regresja-liniowa (1).pdf create mode 100644 zad2/zad2.py diff --git a/zad2/README.md b/zad2/README.md new file mode 100644 index 0000000..14ddf19 --- /dev/null +++ b/zad2/README.md @@ -0,0 +1,31 @@ +# Zadanie 2: Instrukcja +Dla podanych zestawów danych i proponowanych modeli oszacuj wartości parametrów a..f, które minimalizują średni błąd kwadratowy modelowanej funkcji w podanych punktach. + +Dla każdego modelu przedstaw: +- sposób przygotowania danych do zastosowania prostej regresji liniowej +- wyznaczone wartości parametrów +- wykres przedstawiający modelowaną funkcję na tle danych punktów +- średni błąd kwadratowy dotyczący wartości funkcji w danych punktach +- największą wartość odchylenia wartości funkcji od danych punktów +- wartość współczynnika R**2 +- histogram odchyleń wartości funkcji od danych +- (*) test hipotezy statystycznej, że błędy mają rozkład normalny (test chi-kwadrat Pearsona lub test Shapiro-Wilka) +- komentarz na temat przydatności zastosowania rozważanego modelu + + + +Dla zestawów danych: dane1.csv, dane2.csv (dwie kolumny: X, wartość) +Należy rozważyć modele: +``` +f(X) = a * X +f(X) = a * X + b +f(X) = a * X**2 + b * sin(X) + c +``` + +Dla zestawów danych: dane3.csv, dane4.csv (trzy kolumny: X1, X2, wartość) +Należy rozważyć modele: +``` +f(X1, X2) = a * X1 + b * X2 + c +f(X1, X2) = a * X1**2 + b * X1*X2 + c * X2**2 + d * X1 + e * X2 + f +``` +Ostatnia modyfikacja: czwartek, 5 listopada 2020, 14:05 \ No newline at end of file diff --git a/zad2/chi2_normality.py b/zad2/chi2_normality.py new file mode 100644 index 0000000..20899e8 --- /dev/null +++ b/zad2/chi2_normality.py @@ -0,0 +1,159 @@ +#!/usr/bin/env python3 + +import math +import random + +# for stats.chi2.cdf (cumulative chi squared distribution) +from scipy import stats + + +# X - list, b - begin (first index), e - end (too big index), +# k - index of the element to set in the right place +def quickselect(X, b, e, k): + # end condition + if e - b <= 1: return + # random pivot + r = random.randint(b, e - 1) + X[b], X[r] = X[r], X[b] + # partition + p0 = b + p1 = b + 1 + while p1 < e: + if X[p1] < X[p0]: + if p1 == p0 + 1: + X[p0], X[p1] = X[p1], X[p0] + p0 = p1 + else: + X[p0], X[p0+1], X[p1] = X[p1], X[p0], X[p0+1] + p0 += 1 + p1 += 1 + # recursion + if k < p0: + quickselect(X, b, p0, k) + elif k > p0: + quickselect(X, p0+1, e, k) + + + +# quantile calculator (no libraries needed!) +def quantile(X, q, copy=True): + # edge cases and exceeded range of q + if q >= 1.0: + return max(X) + elif q <= 0.0: + return min(X) + + n = len(X) + pos = (n - 1) * q # target position + left = int(pos) # index of the left element + rcoeff = pos - left # coefficient for the right element + + # optional copy + if copy: + X = X[:] + + quickselect(X, 0, n, left) # quickselect! + + # X[left] is in order now, and elements on the right are greater + xleft = X[left] # left element + + if rcoeff == 0.0: # the simple case + return xleft + else: # the general case + xright = min(X[i] for i in range(left+1, n)) # right element + return xleft * (1.0 - rcoeff) + xright * rcoeff + + +# interquantile range +def IQR(X, copy=True): + if copy: + X = X[:] + q1 = quantile(X, 0.25, copy=False) + q3 = quantile(X, 0.75, copy=False) + return q3 - q1 + + +# cumulative distribution function of normal distribution N(mu, sigma) +def normal_cdf(mu, sigma, x): + return 0.5 * (1.0 + math.erf((x - mu) / (2.0**0.5 * sigma))) + + +# Chi-squared normality test +def chi2normality(X, var_df=1, bins=None): + n = len(X) + assert n >= 4 + mu = sum(X) / n # mean + evar = sum((x - mu)**2 for x in X) / (n - var_df) # estimated variance + sigma = evar**0.5 # the expected distribution will be N(mu, sigma) + + minx = min(X) + gap = max(X) - minx + assert gap != 0.0 + if bins is None: # guess the number of bins + # see https://en.wikipedia.org/wiki/Freedman%E2%80%93Diaconis_rule + h = 2.0 * IQR(X) / (n**(1.0/3.0)) # bin size by the Freedman–Diaconis rule + k = int(math.ceil(gap / h)) # number of histogram bins + else: + k = bins + k = min(4, k) # k should be at least 4 + h = gap / k # actual bin size + + # section points between bins + points = [minx + h * i for i in range(k + 1)] + points[0] = -math.inf + points[-1] = math.inf + + # actual frequency + freq = [0] * k + for x in X: + freq[min(int((x - minx) / h), k-1)] += 1 + + # cumulative distributions for the section points + cdfs = [normal_cdf(mu, sigma, p) for p in points] + + # expected frequencies + expected = [(cdfs[i+1] - cdfs[i]) * n for i in range(k)] + + # Chi-squared statistic + chi2stat = sum((freq[i] - expected[i])**2 / expected[i] for i in range(k)) + # Chi-squared degrees of freedom + # i.e. number of bins - 1 - number of parameters of of distribution + # which is number of bins - 3, as normal distribution has two parameters + chi2df = k - 3 + # p-value for the "X is sampled from a normal distribution" hypothesis + # note: for very big chi2stat values, it is close to 0 + pvalue = 1.0 - stats.chi2.cdf(chi2stat, chi2df) + + # significance (alpha) is a probability, that the hypothesis will be rejected + # despite of being actually true + + # when p-value is lower than alpha, the hypothesis is rejected + # otherwise, the test fails to reject the hypothesis (this usually happens + # when the hypothesis is true) + + return pvalue + + +# perform the test, print some messages +def chi2normality_describe(X, alpha=0.05): + print('Hypothesis: X is sampled from a normal distribution') + pvalue = chi2normality(X) + print('Significance level:', alpha) + print('p-value: %.7f' % pvalue) + if pvalue < alpha: + print('Hypothesis rejected. X doesn\'t seem to be sampled from a normal distribution.') + else: + print('Failed to reject the hypothesis.') + print() + + + +if __name__ == '__main__': + # test for some uniform distribution + X = [random.random() * 2.0 + 3.0 for i in range(200)] + chi2normality_describe(X) + + # test for actual normal distribution + X = [random.normalvariate(4.0, 5.0) for i in range(200)] + chi2normality_describe(X) + diff --git a/zad2/data1.csv b/zad2/data1.csv new file mode 100644 index 0000000..8b80e87 --- /dev/null +++ b/zad2/data1.csv @@ -0,0 +1,100 @@ + 0.60, 0.06 + 0.90, 0.21 + 0.93, 0.21 + 1.06, 0.24 + 1.30, 0.29 + 1.52, 0.27 + 1.58, 0.16 + 1.69, 0.34 + 1.75, 0.37 + 1.83, 0.35 + 1.94, 0.37 + 2.00, 0.36 + 2.03, 0.42 + 2.18, 0.46 + 2.22, 0.32 + 2.22, 0.45 + 2.22, 0.47 + 2.22, 0.46 + 2.27, 0.45 + 2.28, 0.41 + 2.29, 0.40 + 2.31, 0.40 + 2.35, 0.42 + 2.38, 0.46 + 2.39, 0.45 + 2.43, 0.51 + 2.53, 0.49 + 2.55, 0.52 + 2.57, 0.50 + 2.60, 0.55 + 2.60, 0.54 + 2.61, 0.46 + 2.62, 0.46 + 2.62, 0.49 + 2.62, 0.51 + 2.62, 0.54 + 2.67, 0.62 + 2.68, 0.51 + 2.72, 0.53 + 2.73, 0.46 + 2.74, 0.60 + 2.74, 0.48 + 2.75, 0.50 + 2.76, 0.59 + 2.79, 0.47 + 2.83, 0.59 + 2.84, 0.45 + 2.84, 0.51 + 2.92, 0.50 + 2.94, 0.54 + 2.96, 0.47 + 2.96, 0.59 + 3.05, 0.48 + 3.08, 0.50 + 3.08, 0.61 + 3.33, 0.61 + 3.33, 0.60 + 3.33, 0.67 + 3.36, 0.63 + 3.36, 0.62 + 3.37, 0.60 + 3.42, 0.57 + 3.47, 0.63 + 3.49, 0.48 + 3.49, 0.61 + 3.50, 0.63 + 3.51, 0.64 + 3.51, 0.63 + 3.54, 0.66 + 3.63, 0.71 + 3.74, 0.76 + 3.77, 0.69 + 3.77, 0.68 + 3.80, 0.74 + 3.83, 0.59 + 3.85, 0.70 + 3.87, 0.63 + 3.87, 0.70 + 3.90, 0.74 + 3.93, 0.65 + 3.93, 0.66 + 3.98, 0.68 + 3.98, 0.68 + 4.02, 0.67 + 4.13, 0.73 + 4.20, 0.77 + 4.20, 0.80 + 4.20, 0.70 + 4.20, 0.78 + 4.27, 0.67 + 4.35, 0.78 + 4.35, 0.68 + 4.39, 0.86 + 4.41, 0.73 + 4.48, 0.80 + 4.55, 0.77 + 4.75, 0.77 + 4.89, 0.78 + 5.34, 0.86 + 5.45, 0.89 diff --git a/zad2/data2.csv b/zad2/data2.csv new file mode 100644 index 0000000..620cd8b --- /dev/null +++ b/zad2/data2.csv @@ -0,0 +1,100 @@ + 1.87, 4.71 + 1.95, 4.55 + 2.09, 4.67 + 2.20, 4.46 + 2.22, 4.55 + 2.42, 4.51 + 2.53, 4.57 + 2.56, 4.55 + 2.59, 4.58 + 2.59, 4.56 + 2.65, 4.46 + 2.80, 4.38 + 2.80, 4.51 + 2.83, 4.48 + 2.85, 4.45 + 2.87, 4.45 + 2.90, 4.46 + 2.90, 4.47 + 2.97, 4.45 + 2.97, 4.43 + 2.97, 4.41 + 2.98, 4.37 + 3.15, 4.33 + 3.18, 4.33 + 3.18, 4.54 + 3.23, 4.42 + 3.27, 4.29 + 3.32, 4.39 + 3.33, 4.24 + 3.35, 4.40 + 3.38, 4.35 + 3.40, 4.29 + 3.42, 4.34 + 3.45, 4.35 + 3.45, 4.32 + 3.50, 4.28 + 3.58, 4.31 + 3.69, 4.33 + 3.70, 4.33 + 3.72, 4.28 + 3.74, 4.24 + 3.75, 4.23 + 3.77, 4.31 + 3.85, 4.28 + 3.90, 4.28 + 3.90, 4.21 + 3.91, 4.24 + 3.92, 4.35 + 3.99, 4.26 + 4.04, 4.24 + 4.04, 4.28 + 4.05, 4.18 + 4.07, 4.25 + 4.10, 4.31 + 4.12, 4.16 + 4.16, 4.24 + 4.21, 4.20 + 4.23, 4.30 + 4.24, 4.28 + 4.24, 4.28 + 4.32, 4.23 + 4.35, 4.22 + 4.35, 4.12 + 4.36, 4.26 + 4.43, 4.36 + 4.45, 4.33 + 4.45, 4.32 + 4.47, 4.37 + 4.49, 4.40 + 4.57, 4.39 + 4.59, 4.27 + 4.60, 4.32 + 4.60, 4.35 + 4.61, 4.32 + 4.71, 4.34 + 4.75, 4.33 + 4.77, 4.46 + 4.78, 4.33 + 4.82, 4.46 + 4.90, 4.56 + 4.97, 4.40 + 4.99, 4.50 + 5.02, 4.56 + 5.05, 4.58 + 5.05, 4.59 + 5.06, 4.55 + 5.15, 4.62 + 5.20, 4.59 + 5.28, 4.78 + 5.28, 4.66 + 5.33, 4.71 + 5.38, 4.81 + 5.40, 4.71 + 5.54, 4.85 + 5.66, 4.93 + 5.69, 4.97 + 5.77, 5.05 + 5.84, 5.15 + 5.99, 5.15 + 6.21, 5.43 diff --git a/zad2/data3.csv b/zad2/data3.csv new file mode 100644 index 0000000..88aa557 --- /dev/null +++ b/zad2/data3.csv @@ -0,0 +1,100 @@ + 2.75, 3.79, 3.21 + 2.80, 1.42, 5.13 + 2.01, 3.11, 3.35 + 2.02, 1.32, 3.13 + 3.28, 4.61, 3.55 + 2.69, 3.06, 3.72 + 3.15, 1.10, 5.46 + 2.72, 2.61, 4.39 + 3.07, 4.20, 4.81 + 2.91, 3.77, 3.46 + 3.61, 3.45, 5.70 + 0.91, 1.42, 1.51 + 2.45, 5.00, 2.90 + 3.18, 2.82, 5.27 + 2.65, 2.93, 3.78 + 1.51, 4.66, 0.68 + 1.76, 3.16, 2.60 + 1.38, 4.05, 0.82 + 2.85, 2.65, 4.34 + 2.52, 2.99, 3.32 + 3.33, 3.79, 4.59 + 2.82, 3.34, 4.41 + 2.83, 2.89, 3.39 + 4.71, 4.31, 8.44 + 2.81, 2.42, 4.32 + 2.83, 2.35, 3.89 + 3.51, 1.58, 6.63 + 1.87, 3.49, 1.71 + 3.00, 3.24, 4.66 + 2.02, 3.67, 2.50 + 3.57, 4.36, 5.30 + 1.59, 1.37, 3.07 + 4.28, 4.27, 5.95 + 4.03, 4.00, 5.54 + 3.78, 2.30, 5.82 + 1.97, 1.27, 3.29 + 2.60, 2.16, 3.96 + 3.13, 3.38, 4.87 + 5.93, 3.52, 10.12 + 3.20, 3.66, 3.96 + 3.60, 1.95, 5.27 + 2.44, 4.78, 2.88 + 0.82, 1.95, 0.52 + 4.58, 2.19, 7.74 + 1.39, 2.26, 1.33 + 3.40, 1.77, 6.96 + 3.26, 3.38, 4.37 + 2.93, 1.09, 5.39 + 3.04, 4.57, 4.09 + 2.09, 3.28, 2.51 + 0.80, 3.91, -0.14 + 2.18, 2.72, 3.31 + 3.50, 1.13, 7.11 + 2.11, 2.32, 2.50 + 3.73, 2.41, 6.18 + 4.91, 2.87, 8.61 + 3.48, 0.20, 5.67 + 3.68, 4.18, 5.29 + 4.13, 4.18, 5.80 + 3.12, 2.72, 5.18 + 3.04, 4.76, 3.37 + 1.98, 3.33, 2.49 + 2.46, 3.60, 4.04 + 2.47, 4.05, 2.36 + 2.65, 1.80, 4.45 + 2.20, 3.82, 2.99 + 5.53, 3.55, 8.99 + 1.35, 4.95, 0.19 + 3.32, 3.79, 4.96 + 2.74, 3.75, 2.60 + 2.42, 1.24, 4.68 + 2.15, 3.61, 2.26 + 3.24, 3.58, 4.78 + 3.84, 2.43, 5.78 + 2.68, 3.08, 3.95 + 3.66, 3.86, 5.03 + 2.43, 1.20, 3.97 + 1.94, 1.67, 3.33 + 4.55, 2.78, 7.85 + 3.34, 3.50, 5.36 + 1.92, 5.04, 1.82 + 2.71, 2.45, 4.32 + 1.63, 4.45, -0.19 + 3.80, 5.57, 5.94 + 3.20, 5.54, 3.52 + 3.66, 2.38, 6.20 + 3.80, 4.36, 5.54 + 2.56, 4.35, 2.27 + 2.85, 2.99, 4.21 + 3.09, 1.80, 5.12 + 2.96, 3.96, 3.04 + 2.23, 4.21, 2.51 + 1.82, 3.40, 2.91 + 2.66, 1.51, 4.79 + 4.48, 3.93, 7.25 + 2.15, 2.77, 2.76 + 1.09, 1.58, 1.85 + 3.13, 4.81, 3.73 + 2.24, 1.74, 2.85 + 2.58, 3.51, 3.47 diff --git a/zad2/data4.csv b/zad2/data4.csv new file mode 100644 index 0000000..b967863 --- /dev/null +++ b/zad2/data4.csv @@ -0,0 +1,100 @@ + 2.17, 3.71, 1.62 + 2.51, 2.96, 3.37 + 2.39, 4.18, -0.20 + 4.57, 3.32, 6.29 + 1.90, 4.69, -0.94 + 2.78, 4.06, -0.22 + 2.18, 4.85, -2.46 + 1.78, 4.81, -1.21 + 2.60, 2.18, 5.03 + 2.69, 3.14, 2.95 + 2.03, 3.01, 3.54 + 4.78, 2.10, 12.28 + 3.07, 1.70, 6.67 + 4.06, 3.68, 2.84 + 1.66, 2.61, 4.53 + 1.94, 2.65, 4.21 + 3.85, 4.62, -2.49 + 3.77, 4.78, -3.54 + 0.62, 1.97, 6.96 + 0.82, 1.58, 6.27 + 1.65, 4.04, 1.90 + 2.32, 3.50, 2.03 + 1.65, 4.04, 1.66 + 2.81, 2.18, 5.30 + 2.66, 4.62, -2.39 + 4.39, 0.99, 13.59 + 2.48, 3.03, 3.21 + 3.31, 3.91, 0.41 + 4.19, 1.92, 9.94 + 2.18, 4.25, -0.15 + 3.64, 4.17, -0.40 + 2.65, 1.40, 6.24 + 2.22, 3.28, 2.69 + 4.05, 2.90, 6.07 + 1.46, 3.09, 4.18 + 4.41, 2.98, 7.14 + 3.47, 2.59, 5.46 + 2.80, 1.36, 6.68 + 2.39, 3.83, 0.95 + 3.14, 1.52, 7.38 + 3.23, 4.85, -3.88 + 0.83, 4.86, 2.76 + 2.95, 2.58, 4.59 + 2.07, 3.87, 1.29 + 1.88, 2.55, 4.39 + 4.53, 4.29, 0.77 + 2.44, 2.51, 4.35 + 3.65, 4.93, -4.36 + 3.41, 1.23, 8.68 + 1.18, 1.48, 5.63 + 3.43, 3.12, 3.63 + 2.54, 2.39, 4.64 + 2.70, 3.50, 1.81 + 4.29, 4.11, 1.11 + 1.68, 2.51, 4.62 + 3.02, 2.46, 4.89 + 2.20, 2.57, 4.15 + 3.31, 1.67, 7.41 + 1.49, 3.19, 3.98 + 2.82, 1.22, 6.81 + 3.00, 2.41, 5.07 + 2.96, 1.61, 6.62 + 1.68, 3.84, 2.25 + 4.49, 1.73, 11.90 + 3.68, 4.92, -4.27 + 3.37, 4.90, -4.15 + 3.39, 1.35, 8.35 + 4.82, 4.66, -0.92 + 2.74, 3.23, 2.67 + 2.10, 1.82, 5.19 + 1.77, 1.04, 5.33 + 2.41, 2.99, 3.25 + 2.08, 4.33, -0.38 + 0.74, 3.81, 5.32 + 1.61, 1.58, 5.32 + 2.69, 4.96, -4.06 + 2.20, 3.24, 2.84 + 2.83, 1.49, 6.54 + 1.64, 4.31, 0.96 + 3.51, 4.05, 0.05 + 2.84, 1.27, 6.77 + 4.02, 3.44, 3.99 + 3.25, 4.25, -1.11 + 3.72, 2.60, 6.10 + 3.85, 3.95, 1.00 + 4.63, 3.47, 5.56 + 2.30, 4.43, -1.04 + 1.44, 3.37, 3.76 + 3.16, 2.71, 4.62 + 1.85, 4.90, -1.60 + 4.37, 3.26, 5.59 + 3.55, 2.81, 5.05 + 3.28, 1.32, 7.88 + 3.93, 2.02, 8.84 + 3.21, 2.14, 6.19 + 2.67, 1.44, 6.23 + 1.68, 1.16, 5.32 + 3.36, 3.68, 1.56 + 2.31, 1.71, 5.40 + 3.87, 1.19, 10.51 diff --git a/zad2/regresja-liniowa (1).pdf b/zad2/regresja-liniowa (1).pdf new file mode 100644 index 0000000..c0ba30d --- /dev/null +++ b/zad2/regresja-liniowa (1).pdf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca67e3f4155a74e6938bcc7e8f7ab27c91c0e0a157e30ee3ec43eab1e24ff9f2 +size 97699 diff --git a/zad2/zad2.py b/zad2/zad2.py new file mode 100644 index 0000000..25bae13 --- /dev/null +++ b/zad2/zad2.py @@ -0,0 +1,71 @@ +""" +Komputerowa analiza danych +Zadanie 2 +Michał Leśniak 195642 +""" +from statistics import mean + + +def var(lst): + x_mean = mean(lst) + return sum((x-x_mean)**2 for x in lst)/len(lst) + + +def cov(lst_x, lst_y): + assert len(lst_x) == len(lst_y) + x_mean = mean(lst_x) + y_mean = mean(lst_y) + return sum((lst_x[i]-x_mean)*(lst_y[i]-y_mean) for i in range(len(lst_x)))/len(lst_x) + + +def load_data(*args): + ret = () + for arg in args: + with open(arg, 'r') as f: + lines = f.read().splitlines() + lst = [] + for line in lines: + lst.append(tuple([float(x.strip()) for x in line.split(',')])) + ret += lst, + return ret + + +def model1(data): + lst_x = [x for x, _ in data] + lst_y = [y for _, y in data] + + a = cov(lst_x, lst_y)/var(lst_x) + print(f'f(X) = {a} * X') + + +def model2(data): + lst_x = [x for x, _ in data] + lst_y = [y for _, y in data] + + a = cov(lst_x, lst_y)/var(lst_x) + b = mean(lst_y) - a*mean(lst_x) + print(f'f(X) = {a} * X + {b}') + + +def main(): + data1, data2, data3, data4 = load_data( + 'data1.csv', 'data2.csv', 'data3.csv', 'data4.csv') + print(var([x for x, _ in data1])) + print(cov([x for x, _ in data1], [y for _, y in data1])) + # print(data2) + # print(data3) + # print(data4) + model1(data1) + model1(data2) + model2(data1) + model2(data2) + x_mean = mean([x for x, _ in data1]) + y_mean = mean([y for _, y in data1]) + xy = sum([x*y for x, y in data1]) + x_2 = sum([x**2 for x, _ in data1]) + print(sum([2*x-2*x*y for x,y in data1])/len(data1)) + print(xy/x_2) + + +if __name__ == '__main__': + main()