KAD/zad2/chi2_normality.py
2021-12-05 19:59:44 +01:00

160 lines
4.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
import math
import random
# for stats.chi2.cdf (cumulative chi squared distribution)
from scipy import stats
# X - list, b - begin (first index), e - end (too big index),
# k - index of the element to set in the right place
def quickselect(X, b, e, k):
# end condition
if e - b <= 1: return
# random pivot
r = random.randint(b, e - 1)
X[b], X[r] = X[r], X[b]
# partition
p0 = b
p1 = b + 1
while p1 < e:
if X[p1] < X[p0]:
if p1 == p0 + 1:
X[p0], X[p1] = X[p1], X[p0]
p0 = p1
else:
X[p0], X[p0+1], X[p1] = X[p1], X[p0], X[p0+1]
p0 += 1
p1 += 1
# recursion
if k < p0:
quickselect(X, b, p0, k)
elif k > p0:
quickselect(X, p0+1, e, k)
# quantile calculator (no libraries needed!)
def quantile(X, q, copy=True):
# edge cases and exceeded range of q
if q >= 1.0:
return max(X)
elif q <= 0.0:
return min(X)
n = len(X)
pos = (n - 1) * q # target position
left = int(pos) # index of the left element
rcoeff = pos - left # coefficient for the right element
# optional copy
if copy:
X = X[:]
quickselect(X, 0, n, left) # quickselect!
# X[left] is in order now, and elements on the right are greater
xleft = X[left] # left element
if rcoeff == 0.0: # the simple case
return xleft
else: # the general case
xright = min(X[i] for i in range(left+1, n)) # right element
return xleft * (1.0 - rcoeff) + xright * rcoeff
# interquantile range
def IQR(X, copy=True):
if copy:
X = X[:]
q1 = quantile(X, 0.25, copy=False)
q3 = quantile(X, 0.75, copy=False)
return q3 - q1
# cumulative distribution function of normal distribution N(mu, sigma)
def normal_cdf(mu, sigma, x):
return 0.5 * (1.0 + math.erf((x - mu) / (2.0**0.5 * sigma)))
# Chi-squared normality test
def chi2normality(X, var_df=1, bins=None):
n = len(X)
assert n >= 4
mu = sum(X) / n # mean
evar = sum((x - mu)**2 for x in X) / (n - var_df) # estimated variance
sigma = evar**0.5 # the expected distribution will be N(mu, sigma)
minx = min(X)
gap = max(X) - minx
assert gap != 0.0
if bins is None: # guess the number of bins
# see https://en.wikipedia.org/wiki/Freedman%E2%80%93Diaconis_rule
h = 2.0 * IQR(X) / (n**(1.0/3.0)) # bin size by the FreedmanDiaconis rule
k = int(math.ceil(gap / h)) # number of histogram bins
else:
k = bins
k = min(4, k) # k should be at least 4
h = gap / k # actual bin size
# section points between bins
points = [minx + h * i for i in range(k + 1)]
points[0] = -math.inf
points[-1] = math.inf
# actual frequency
freq = [0] * k
for x in X:
freq[min(int((x - minx) / h), k-1)] += 1
# cumulative distributions for the section points
cdfs = [normal_cdf(mu, sigma, p) for p in points]
# expected frequencies
expected = [(cdfs[i+1] - cdfs[i]) * n for i in range(k)]
# Chi-squared statistic
chi2stat = sum((freq[i] - expected[i])**2 / expected[i] for i in range(k))
# Chi-squared degrees of freedom
# i.e. number of bins - 1 - number of parameters of of distribution
# which is number of bins - 3, as normal distribution has two parameters
chi2df = k - 3
# p-value for the "X is sampled from a normal distribution" hypothesis
# note: for very big chi2stat values, it is close to 0
pvalue = 1.0 - stats.chi2.cdf(chi2stat, chi2df)
# significance (alpha) is a probability, that the hypothesis will be rejected
# despite of being actually true
# when p-value is lower than alpha, the hypothesis is rejected
# otherwise, the test fails to reject the hypothesis (this usually happens
# when the hypothesis is true)
return pvalue
# perform the test, print some messages
def chi2normality_describe(X, alpha=0.05):
print('Hypothesis: X is sampled from a normal distribution')
pvalue = chi2normality(X)
print('Significance level:', alpha)
print('p-value: %.7f' % pvalue)
if pvalue < alpha:
print('Hypothesis rejected. X doesn\'t seem to be sampled from a normal distribution.')
else:
print('Failed to reject the hypothesis.')
print()
if __name__ == '__main__':
# test for some uniform distribution
X = [random.random() * 2.0 + 3.0 for i in range(200)]
chi2normality_describe(X)
# test for actual normal distribution
X = [random.normalvariate(4.0, 5.0) for i in range(200)]
chi2normality_describe(X)