Zad 2 wip

2021-12-05 19:59:44 +01:00
parent 3149b56948
commit 2efbd423ad
8 changed files with 664 additions and 0 deletions
--- a/zad2/README.md
+++ b/zad2/README.md
@ -0,0 +1,31 @@
+# Zadanie 2: Instrukcja
+Dla podanych zestawów danych i proponowanych modeli oszacuj wartości parametrów a..f, które minimalizują średni błąd kwadratowy modelowanej funkcji w podanych punktach.
+
+Dla każdego modelu przedstaw:
+- sposób przygotowania danych do zastosowania prostej regresji liniowej
+- wyznaczone wartości parametrów
+- wykres przedstawiający modelowaną funkcję na tle danych punktów
+- średni błąd kwadratowy dotyczący wartości funkcji w danych punktach
+- największą wartość odchylenia wartości funkcji od danych punktów
+- wartość współczynnika R**2
+- histogram odchyleń wartości funkcji od danych
+- (*) test hipotezy statystycznej, że błędy mają rozkład normalny (test chi-kwadrat Pearsona lub test Shapiro-Wilka)
+- komentarz na temat przydatności zastosowania rozważanego modelu
+
+
+
+Dla zestawów danych: dane1.csv, dane2.csv (dwie kolumny: X, wartość)
+Należy rozważyć modele:
+```
+f(X) = a * X
+f(X) = a * X + b
+f(X) = a * X**2 + b * sin(X) + c
+```
+
+Dla zestawów danych: dane3.csv, dane4.csv (trzy kolumny: X1, X2, wartość)
+Należy rozważyć modele:
+```
+f(X1, X2) = a * X1 + b * X2 + c
+f(X1, X2) = a * X1**2 + b * X1*X2 + c * X2**2 + d * X1 + e * X2 + f
+```
+Ostatnia modyfikacja: czwartek, 5 listopada 2020, 14:05
--- a/zad2/chi2_normality.py
+++ b/zad2/chi2_normality.py
@ -0,0 +1,159 @@
+#!/usr/bin/env python3
+
+import math
+import random
+
+# for stats.chi2.cdf (cumulative chi squared distribution)
+from scipy import stats
+
+
+# X - list, b - begin (first index), e - end (too big index),
+# k - index of the element to set in the right place
+def quickselect(X, b, e, k):
+  # end condition
+  if e - b <= 1: return
+  # random pivot
+  r = random.randint(b, e - 1)
+  X[b], X[r] = X[r], X[b]
+  # partition
+  p0 = b
+  p1 = b + 1
+  while p1 < e:
+    if X[p1] < X[p0]:
+      if p1 == p0 + 1:
+        X[p0], X[p1] = X[p1], X[p0]
+        p0 = p1
+      else:
+        X[p0], X[p0+1], X[p1] = X[p1], X[p0], X[p0+1]
+        p0 += 1
+    p1 += 1
+  # recursion
+  if k < p0:
+    quickselect(X, b, p0, k)
+  elif k > p0:
+    quickselect(X, p0+1, e, k)
+
+
+
+# quantile calculator (no libraries needed!)
+def quantile(X, q, copy=True):
+  # edge cases and exceeded range of q
+  if q >= 1.0:
+    return max(X)
+  elif q <= 0.0:
+    return min(X)
+  
+  n = len(X)
+  pos = (n - 1) * q # target position
+  left = int(pos) # index of the left element
+  rcoeff = pos - left # coefficient for the right element
+  
+  # optional copy
+  if copy:
+    X = X[:]
+  
+  quickselect(X, 0, n, left) # quickselect!
+  
+  # X[left] is in order now, and elements on the right are greater
+  xleft = X[left] # left element
+  
+  if rcoeff == 0.0: # the simple case
+    return xleft
+  else: # the general case
+    xright = min(X[i] for i in range(left+1, n)) # right element
+    return xleft * (1.0 - rcoeff) + xright * rcoeff
+
+
+# interquantile range
+def IQR(X, copy=True):
+  if copy:
+    X = X[:]
+  q1 = quantile(X, 0.25, copy=False)
+  q3 = quantile(X, 0.75, copy=False)
+  return q3 - q1
+
+
+# cumulative distribution function of normal distribution N(mu, sigma)
+def normal_cdf(mu, sigma, x):
+  return 0.5 * (1.0 + math.erf((x - mu) / (2.0**0.5 * sigma)))
+
+
+# Chi-squared normality test
+def chi2normality(X, var_df=1, bins=None):
+  n = len(X)
+  assert n >= 4
+  mu = sum(X) / n # mean
+  evar = sum((x - mu)**2 for x in X) / (n - var_df) # estimated variance
+  sigma = evar**0.5 # the expected distribution will be N(mu, sigma)
+  
+  minx = min(X)
+  gap = max(X) - minx
+  assert gap != 0.0
+  if bins is None: # guess the number of bins
+    # see https://en.wikipedia.org/wiki/Freedman%E2%80%93Diaconis_rule
+    h = 2.0 * IQR(X) / (n**(1.0/3.0)) # bin size by the Freedman–Diaconis rule
+    k = int(math.ceil(gap / h)) # number of histogram bins
+  else:
+    k = bins
+  k = min(4, k) # k should be at least 4
+  h = gap / k # actual bin size
+  
+  # section points between bins
+  points = [minx + h * i for i in range(k + 1)]
+  points[0] = -math.inf
+  points[-1] = math.inf
+  
+  # actual frequency
+  freq = [0] * k
+  for x in X:
+    freq[min(int((x - minx) / h), k-1)] += 1
+  
+  # cumulative distributions for the section points
+  cdfs = [normal_cdf(mu, sigma, p) for p in points]
+  
+  # expected frequencies
+  expected = [(cdfs[i+1] - cdfs[i]) * n for i in range(k)]
+  
+  # Chi-squared statistic
+  chi2stat = sum((freq[i] - expected[i])**2 / expected[i] for i in range(k))
+  # Chi-squared degrees of freedom
+  # i.e. number of bins - 1 - number of parameters of of distribution
+  # which is number of bins - 3, as normal distribution has two parameters
+  chi2df = k - 3
+  # p-value for the "X is sampled from a normal distribution" hypothesis
+  # note: for very big chi2stat values, it is close to 0
+  pvalue = 1.0 - stats.chi2.cdf(chi2stat, chi2df)
+  
+  # significance (alpha) is a probability, that the hypothesis will be rejected
+  # despite of being actually true
+  
+  # when p-value is lower than alpha, the hypothesis is rejected
+  # otherwise, the test fails to reject the hypothesis (this usually happens
+  # when the hypothesis is true) 
+  
+  return pvalue
+
+
+# perform the test, print some messages
+def chi2normality_describe(X, alpha=0.05):
+  print('Hypothesis: X is sampled from a normal distribution')
+  pvalue = chi2normality(X)
+  print('Significance level:', alpha)
+  print('p-value: %.7f' % pvalue)
+  if pvalue < alpha:
+    print('Hypothesis rejected. X doesn\'t seem to be sampled from a normal distribution.')
+  else:
+    print('Failed to reject the hypothesis.')
+  print()
+
+
+
+if __name__ == '__main__':
+  # test for some uniform distribution
+  X = [random.random() * 2.0 + 3.0 for i in range(200)]
+  chi2normality_describe(X)
+
+  # test for actual normal distribution
+  X = [random.normalvariate(4.0, 5.0) for i in range(200)]
+  chi2normality_describe(X)
+
--- a/zad2/data1.csv
+++ b/zad2/data1.csv
@ -0,0 +1,100 @@
+ 0.60,  0.06
+ 0.90,  0.21
+ 0.93,  0.21
+ 1.06,  0.24
+ 1.30,  0.29
+ 1.52,  0.27
+ 1.58,  0.16
+ 1.69,  0.34
+ 1.75,  0.37
+ 1.83,  0.35
+ 1.94,  0.37
+ 2.00,  0.36
+ 2.03,  0.42
+ 2.18,  0.46
+ 2.22,  0.32
+ 2.22,  0.45
+ 2.22,  0.47
+ 2.22,  0.46
+ 2.27,  0.45
+ 2.28,  0.41
+ 2.29,  0.40
+ 2.31,  0.40
+ 2.35,  0.42
+ 2.38,  0.46
+ 2.39,  0.45
+ 2.43,  0.51
+ 2.53,  0.49
+ 2.55,  0.52
+ 2.57,  0.50
+ 2.60,  0.55
+ 2.60,  0.54
+ 2.61,  0.46
+ 2.62,  0.46
+ 2.62,  0.49
+ 2.62,  0.51
+ 2.62,  0.54
+ 2.67,  0.62
+ 2.68,  0.51
+ 2.72,  0.53
+ 2.73,  0.46
+ 2.74,  0.60
+ 2.74,  0.48
+ 2.75,  0.50
+ 2.76,  0.59
+ 2.79,  0.47
+ 2.83,  0.59
+ 2.84,  0.45
+ 2.84,  0.51
+ 2.92,  0.50
+ 2.94,  0.54
+ 2.96,  0.47
+ 2.96,  0.59
+ 3.05,  0.48
+ 3.08,  0.50
+ 3.08,  0.61
+ 3.33,  0.61
+ 3.33,  0.60
+ 3.33,  0.67
+ 3.36,  0.63
+ 3.36,  0.62
+ 3.37,  0.60
+ 3.42,  0.57
+ 3.47,  0.63
+ 3.49,  0.48
+ 3.49,  0.61
+ 3.50,  0.63
+ 3.51,  0.64
+ 3.51,  0.63
+ 3.54,  0.66
+ 3.63,  0.71
+ 3.74,  0.76
+ 3.77,  0.69
+ 3.77,  0.68
+ 3.80,  0.74
+ 3.83,  0.59
+ 3.85,  0.70
+ 3.87,  0.63
+ 3.87,  0.70
+ 3.90,  0.74
+ 3.93,  0.65
+ 3.93,  0.66
+ 3.98,  0.68
+ 3.98,  0.68
+ 4.02,  0.67
+ 4.13,  0.73
+ 4.20,  0.77
+ 4.20,  0.80
+ 4.20,  0.70
+ 4.20,  0.78
+ 4.27,  0.67
+ 4.35,  0.78
+ 4.35,  0.68
+ 4.39,  0.86
+ 4.41,  0.73
+ 4.48,  0.80
+ 4.55,  0.77
+ 4.75,  0.77
+ 4.89,  0.78
+ 5.34,  0.86
+ 5.45,  0.89
--- a/zad2/data2.csv
+++ b/zad2/data2.csv
@ -0,0 +1,100 @@
+ 1.87,  4.71
+ 1.95,  4.55
+ 2.09,  4.67
+ 2.20,  4.46
+ 2.22,  4.55
+ 2.42,  4.51
+ 2.53,  4.57
+ 2.56,  4.55
+ 2.59,  4.58
+ 2.59,  4.56
+ 2.65,  4.46
+ 2.80,  4.38
+ 2.80,  4.51
+ 2.83,  4.48
+ 2.85,  4.45
+ 2.87,  4.45
+ 2.90,  4.46
+ 2.90,  4.47
+ 2.97,  4.45
+ 2.97,  4.43
+ 2.97,  4.41
+ 2.98,  4.37
+ 3.15,  4.33
+ 3.18,  4.33
+ 3.18,  4.54
+ 3.23,  4.42
+ 3.27,  4.29
+ 3.32,  4.39
+ 3.33,  4.24
+ 3.35,  4.40
+ 3.38,  4.35
+ 3.40,  4.29
+ 3.42,  4.34
+ 3.45,  4.35
+ 3.45,  4.32
+ 3.50,  4.28
+ 3.58,  4.31
+ 3.69,  4.33
+ 3.70,  4.33
+ 3.72,  4.28
+ 3.74,  4.24
+ 3.75,  4.23
+ 3.77,  4.31
+ 3.85,  4.28
+ 3.90,  4.28
+ 3.90,  4.21
+ 3.91,  4.24
+ 3.92,  4.35
+ 3.99,  4.26
+ 4.04,  4.24
+ 4.04,  4.28
+ 4.05,  4.18
+ 4.07,  4.25
+ 4.10,  4.31
+ 4.12,  4.16
+ 4.16,  4.24
+ 4.21,  4.20
+ 4.23,  4.30
+ 4.24,  4.28
+ 4.24,  4.28
+ 4.32,  4.23
+ 4.35,  4.22
+ 4.35,  4.12
+ 4.36,  4.26
+ 4.43,  4.36
+ 4.45,  4.33
+ 4.45,  4.32
+ 4.47,  4.37
+ 4.49,  4.40
+ 4.57,  4.39
+ 4.59,  4.27
+ 4.60,  4.32
+ 4.60,  4.35
+ 4.61,  4.32
+ 4.71,  4.34
+ 4.75,  4.33
+ 4.77,  4.46
+ 4.78,  4.33
+ 4.82,  4.46
+ 4.90,  4.56
+ 4.97,  4.40
+ 4.99,  4.50
+ 5.02,  4.56
+ 5.05,  4.58
+ 5.05,  4.59
+ 5.06,  4.55
+ 5.15,  4.62
+ 5.20,  4.59
+ 5.28,  4.78
+ 5.28,  4.66
+ 5.33,  4.71
+ 5.38,  4.81
+ 5.40,  4.71
+ 5.54,  4.85
+ 5.66,  4.93
+ 5.69,  4.97
+ 5.77,  5.05
+ 5.84,  5.15
+ 5.99,  5.15
+ 6.21,  5.43
--- a/zad2/data3.csv
+++ b/zad2/data3.csv
@ -0,0 +1,100 @@
+ 2.75,  3.79,  3.21
+ 2.80,  1.42,  5.13
+ 2.01,  3.11,  3.35
+ 2.02,  1.32,  3.13
+ 3.28,  4.61,  3.55
+ 2.69,  3.06,  3.72
+ 3.15,  1.10,  5.46
+ 2.72,  2.61,  4.39
+ 3.07,  4.20,  4.81
+ 2.91,  3.77,  3.46
+ 3.61,  3.45,  5.70
+ 0.91,  1.42,  1.51
+ 2.45,  5.00,  2.90
+ 3.18,  2.82,  5.27
+ 2.65,  2.93,  3.78
+ 1.51,  4.66,  0.68
+ 1.76,  3.16,  2.60
+ 1.38,  4.05,  0.82
+ 2.85,  2.65,  4.34
+ 2.52,  2.99,  3.32
+ 3.33,  3.79,  4.59
+ 2.82,  3.34,  4.41
+ 2.83,  2.89,  3.39
+ 4.71,  4.31,  8.44
+ 2.81,  2.42,  4.32
+ 2.83,  2.35,  3.89
+ 3.51,  1.58,  6.63
+ 1.87,  3.49,  1.71
+ 3.00,  3.24,  4.66
+ 2.02,  3.67,  2.50
+ 3.57,  4.36,  5.30
+ 1.59,  1.37,  3.07
+ 4.28,  4.27,  5.95
+ 4.03,  4.00,  5.54
+ 3.78,  2.30,  5.82
+ 1.97,  1.27,  3.29
+ 2.60,  2.16,  3.96
+ 3.13,  3.38,  4.87
+ 5.93,  3.52, 10.12
+ 3.20,  3.66,  3.96
+ 3.60,  1.95,  5.27
+ 2.44,  4.78,  2.88
+ 0.82,  1.95,  0.52
+ 4.58,  2.19,  7.74
+ 1.39,  2.26,  1.33
+ 3.40,  1.77,  6.96
+ 3.26,  3.38,  4.37
+ 2.93,  1.09,  5.39
+ 3.04,  4.57,  4.09
+ 2.09,  3.28,  2.51
+ 0.80,  3.91, -0.14
+ 2.18,  2.72,  3.31
+ 3.50,  1.13,  7.11
+ 2.11,  2.32,  2.50
+ 3.73,  2.41,  6.18
+ 4.91,  2.87,  8.61
+ 3.48,  0.20,  5.67
+ 3.68,  4.18,  5.29
+ 4.13,  4.18,  5.80
+ 3.12,  2.72,  5.18
+ 3.04,  4.76,  3.37
+ 1.98,  3.33,  2.49
+ 2.46,  3.60,  4.04
+ 2.47,  4.05,  2.36
+ 2.65,  1.80,  4.45
+ 2.20,  3.82,  2.99
+ 5.53,  3.55,  8.99
+ 1.35,  4.95,  0.19
+ 3.32,  3.79,  4.96
+ 2.74,  3.75,  2.60
+ 2.42,  1.24,  4.68
+ 2.15,  3.61,  2.26
+ 3.24,  3.58,  4.78
+ 3.84,  2.43,  5.78
+ 2.68,  3.08,  3.95
+ 3.66,  3.86,  5.03
+ 2.43,  1.20,  3.97
+ 1.94,  1.67,  3.33
+ 4.55,  2.78,  7.85
+ 3.34,  3.50,  5.36
+ 1.92,  5.04,  1.82
+ 2.71,  2.45,  4.32
+ 1.63,  4.45, -0.19
+ 3.80,  5.57,  5.94
+ 3.20,  5.54,  3.52
+ 3.66,  2.38,  6.20
+ 3.80,  4.36,  5.54
+ 2.56,  4.35,  2.27
+ 2.85,  2.99,  4.21
+ 3.09,  1.80,  5.12
+ 2.96,  3.96,  3.04
+ 2.23,  4.21,  2.51
+ 1.82,  3.40,  2.91
+ 2.66,  1.51,  4.79
+ 4.48,  3.93,  7.25
+ 2.15,  2.77,  2.76
+ 1.09,  1.58,  1.85
+ 3.13,  4.81,  3.73
+ 2.24,  1.74,  2.85
+ 2.58,  3.51,  3.47
--- a/zad2/data4.csv
+++ b/zad2/data4.csv
@ -0,0 +1,100 @@
+ 2.17,  3.71,  1.62
+ 2.51,  2.96,  3.37
+ 2.39,  4.18, -0.20
+ 4.57,  3.32,  6.29
+ 1.90,  4.69, -0.94
+ 2.78,  4.06, -0.22
+ 2.18,  4.85, -2.46
+ 1.78,  4.81, -1.21
+ 2.60,  2.18,  5.03
+ 2.69,  3.14,  2.95
+ 2.03,  3.01,  3.54
+ 4.78,  2.10, 12.28
+ 3.07,  1.70,  6.67
+ 4.06,  3.68,  2.84
+ 1.66,  2.61,  4.53
+ 1.94,  2.65,  4.21
+ 3.85,  4.62, -2.49
+ 3.77,  4.78, -3.54
+ 0.62,  1.97,  6.96
+ 0.82,  1.58,  6.27
+ 1.65,  4.04,  1.90
+ 2.32,  3.50,  2.03
+ 1.65,  4.04,  1.66
+ 2.81,  2.18,  5.30
+ 2.66,  4.62, -2.39
+ 4.39,  0.99, 13.59
+ 2.48,  3.03,  3.21
+ 3.31,  3.91,  0.41
+ 4.19,  1.92,  9.94
+ 2.18,  4.25, -0.15
+ 3.64,  4.17, -0.40
+ 2.65,  1.40,  6.24
+ 2.22,  3.28,  2.69
+ 4.05,  2.90,  6.07
+ 1.46,  3.09,  4.18
+ 4.41,  2.98,  7.14
+ 3.47,  2.59,  5.46
+ 2.80,  1.36,  6.68
+ 2.39,  3.83,  0.95
+ 3.14,  1.52,  7.38
+ 3.23,  4.85, -3.88
+ 0.83,  4.86,  2.76
+ 2.95,  2.58,  4.59
+ 2.07,  3.87,  1.29
+ 1.88,  2.55,  4.39
+ 4.53,  4.29,  0.77
+ 2.44,  2.51,  4.35
+ 3.65,  4.93, -4.36
+ 3.41,  1.23,  8.68
+ 1.18,  1.48,  5.63
+ 3.43,  3.12,  3.63
+ 2.54,  2.39,  4.64
+ 2.70,  3.50,  1.81
+ 4.29,  4.11,  1.11
+ 1.68,  2.51,  4.62
+ 3.02,  2.46,  4.89
+ 2.20,  2.57,  4.15
+ 3.31,  1.67,  7.41
+ 1.49,  3.19,  3.98
+ 2.82,  1.22,  6.81
+ 3.00,  2.41,  5.07
+ 2.96,  1.61,  6.62
+ 1.68,  3.84,  2.25
+ 4.49,  1.73, 11.90
+ 3.68,  4.92, -4.27
+ 3.37,  4.90, -4.15
+ 3.39,  1.35,  8.35
+ 4.82,  4.66, -0.92
+ 2.74,  3.23,  2.67
+ 2.10,  1.82,  5.19
+ 1.77,  1.04,  5.33
+ 2.41,  2.99,  3.25
+ 2.08,  4.33, -0.38
+ 0.74,  3.81,  5.32
+ 1.61,  1.58,  5.32
+ 2.69,  4.96, -4.06
+ 2.20,  3.24,  2.84
+ 2.83,  1.49,  6.54
+ 1.64,  4.31,  0.96
+ 3.51,  4.05,  0.05
+ 2.84,  1.27,  6.77
+ 4.02,  3.44,  3.99
+ 3.25,  4.25, -1.11
+ 3.72,  2.60,  6.10
+ 3.85,  3.95,  1.00
+ 4.63,  3.47,  5.56
+ 2.30,  4.43, -1.04
+ 1.44,  3.37,  3.76
+ 3.16,  2.71,  4.62
+ 1.85,  4.90, -1.60
+ 4.37,  3.26,  5.59
+ 3.55,  2.81,  5.05
+ 3.28,  1.32,  7.88
+ 3.93,  2.02,  8.84
+ 3.21,  2.14,  6.19
+ 2.67,  1.44,  6.23
+ 1.68,  1.16,  5.32
+ 3.36,  3.68,  1.56
+ 2.31,  1.71,  5.40
+ 3.87,  1.19, 10.51
--- a/zad2/regresja-liniowa
+++ b/zad2/regresja-liniowa
--- a/zad2/zad2.py
+++ b/zad2/zad2.py
@ -0,0 +1,71 @@
+"""
+Komputerowa analiza danych
+Zadanie 2
+Michał Leśniak 195642
+"""
+from statistics import mean
+
+
+def var(lst):
+    x_mean = mean(lst)
+    return sum((x-x_mean)**2 for x in lst)/len(lst)
+
+
+def cov(lst_x, lst_y):
+    assert len(lst_x) == len(lst_y)
+    x_mean = mean(lst_x)
+    y_mean = mean(lst_y)
+    return sum((lst_x[i]-x_mean)*(lst_y[i]-y_mean) for i in range(len(lst_x)))/len(lst_x)
+
+
+def load_data(*args):
+    ret = ()
+    for arg in args:
+        with open(arg, 'r') as f:
+            lines = f.read().splitlines()
+        lst = []
+        for line in lines:
+            lst.append(tuple([float(x.strip()) for x in line.split(',')]))
+        ret += lst,
+    return ret
+
+
+def model1(data):
+    lst_x = [x for x, _ in data]
+    lst_y = [y for _, y in data]
+
+    a = cov(lst_x, lst_y)/var(lst_x)
+    print(f'f(X) = {a} * X')
+
+
+def model2(data):
+    lst_x = [x for x, _ in data]
+    lst_y = [y for _, y in data]
+
+    a = cov(lst_x, lst_y)/var(lst_x)
+    b = mean(lst_y) - a*mean(lst_x)
+    print(f'f(X) = {a} * X + {b}')
+
+
+def main():
+    data1, data2, data3, data4 = load_data(
+        'data1.csv', 'data2.csv', 'data3.csv', 'data4.csv')
+    print(var([x for x, _ in data1]))
+    print(cov([x for x, _ in data1], [y for _, y in data1]))
+    # print(data2)
+    # print(data3)
+    # print(data4)
+    model1(data1)
+    model1(data2)
+    model2(data1)
+    model2(data2)
+    x_mean = mean([x for x, _ in data1])
+    y_mean = mean([y for _, y in data1])
+    xy = sum([x*y for x, y in data1])
+    x_2 = sum([x**2 for x, _ in data1])
+    print(sum([2*x-2*x*y for x,y in data1])/len(data1))
+    print(xy/x_2)
+
+
+if __name__ == '__main__':
+    main()