KAD/zad2/chi2_normality.py

160 lines
4.3 KiB
Python
Raw Permalink Normal View History

2021-12-05 19:59:44 +01:00
#!/usr/bin/env python3
import math
import random
# for stats.chi2.cdf (cumulative chi squared distribution)
from scipy import stats
# X - list, b - begin (first index), e - end (too big index),
# k - index of the element to set in the right place
def quickselect(X, b, e, k):
# end condition
if e - b <= 1: return
# random pivot
r = random.randint(b, e - 1)
X[b], X[r] = X[r], X[b]
# partition
p0 = b
p1 = b + 1
while p1 < e:
if X[p1] < X[p0]:
if p1 == p0 + 1:
X[p0], X[p1] = X[p1], X[p0]
p0 = p1
else:
X[p0], X[p0+1], X[p1] = X[p1], X[p0], X[p0+1]
p0 += 1
p1 += 1
# recursion
if k < p0:
quickselect(X, b, p0, k)
elif k > p0:
quickselect(X, p0+1, e, k)
# quantile calculator (no libraries needed!)
def quantile(X, q, copy=True):
# edge cases and exceeded range of q
if q >= 1.0:
return max(X)
elif q <= 0.0:
return min(X)
n = len(X)
pos = (n - 1) * q # target position
left = int(pos) # index of the left element
rcoeff = pos - left # coefficient for the right element
# optional copy
if copy:
X = X[:]
quickselect(X, 0, n, left) # quickselect!
# X[left] is in order now, and elements on the right are greater
xleft = X[left] # left element
if rcoeff == 0.0: # the simple case
return xleft
else: # the general case
xright = min(X[i] for i in range(left+1, n)) # right element
return xleft * (1.0 - rcoeff) + xright * rcoeff
# interquantile range
def IQR(X, copy=True):
if copy:
X = X[:]
q1 = quantile(X, 0.25, copy=False)
q3 = quantile(X, 0.75, copy=False)
return q3 - q1
# cumulative distribution function of normal distribution N(mu, sigma)
def normal_cdf(mu, sigma, x):
return 0.5 * (1.0 + math.erf((x - mu) / (2.0**0.5 * sigma)))
# Chi-squared normality test
def chi2normality(X, var_df=1, bins=None):
n = len(X)
assert n >= 4
mu = sum(X) / n # mean
evar = sum((x - mu)**2 for x in X) / (n - var_df) # estimated variance
sigma = evar**0.5 # the expected distribution will be N(mu, sigma)
minx = min(X)
gap = max(X) - minx
assert gap != 0.0
if bins is None: # guess the number of bins
# see https://en.wikipedia.org/wiki/Freedman%E2%80%93Diaconis_rule
h = 2.0 * IQR(X) / (n**(1.0/3.0)) # bin size by the FreedmanDiaconis rule
k = int(math.ceil(gap / h)) # number of histogram bins
else:
k = bins
k = min(4, k) # k should be at least 4
h = gap / k # actual bin size
# section points between bins
points = [minx + h * i for i in range(k + 1)]
points[0] = -math.inf
points[-1] = math.inf
# actual frequency
freq = [0] * k
for x in X:
freq[min(int((x - minx) / h), k-1)] += 1
# cumulative distributions for the section points
cdfs = [normal_cdf(mu, sigma, p) for p in points]
# expected frequencies
expected = [(cdfs[i+1] - cdfs[i]) * n for i in range(k)]
# Chi-squared statistic
chi2stat = sum((freq[i] - expected[i])**2 / expected[i] for i in range(k))
# Chi-squared degrees of freedom
# i.e. number of bins - 1 - number of parameters of of distribution
# which is number of bins - 3, as normal distribution has two parameters
chi2df = k - 3
# p-value for the "X is sampled from a normal distribution" hypothesis
# note: for very big chi2stat values, it is close to 0
pvalue = 1.0 - stats.chi2.cdf(chi2stat, chi2df)
# significance (alpha) is a probability, that the hypothesis will be rejected
# despite of being actually true
# when p-value is lower than alpha, the hypothesis is rejected
# otherwise, the test fails to reject the hypothesis (this usually happens
# when the hypothesis is true)
return pvalue
# perform the test, print some messages
def chi2normality_describe(X, alpha=0.05):
print('Hypothesis: X is sampled from a normal distribution')
pvalue = chi2normality(X)
print('Significance level:', alpha)
print('p-value: %.7f' % pvalue)
if pvalue < alpha:
print('Hypothesis rejected. X doesn\'t seem to be sampled from a normal distribution.')
else:
print('Failed to reject the hypothesis.')
print()
if __name__ == '__main__':
# test for some uniform distribution
X = [random.random() * 2.0 + 3.0 for i in range(200)]
chi2normality_describe(X)
# test for actual normal distribution
X = [random.normalvariate(4.0, 5.0) for i in range(200)]
chi2normality_describe(X)