160 lines
4.3 KiB
Python
160 lines
4.3 KiB
Python
#!/usr/bin/env python3
|
||
|
||
import math
|
||
import random
|
||
|
||
# for stats.chi2.cdf (cumulative chi squared distribution)
|
||
from scipy import stats
|
||
|
||
|
||
# X - list, b - begin (first index), e - end (too big index),
|
||
# k - index of the element to set in the right place
|
||
def quickselect(X, b, e, k):
|
||
# end condition
|
||
if e - b <= 1: return
|
||
# random pivot
|
||
r = random.randint(b, e - 1)
|
||
X[b], X[r] = X[r], X[b]
|
||
# partition
|
||
p0 = b
|
||
p1 = b + 1
|
||
while p1 < e:
|
||
if X[p1] < X[p0]:
|
||
if p1 == p0 + 1:
|
||
X[p0], X[p1] = X[p1], X[p0]
|
||
p0 = p1
|
||
else:
|
||
X[p0], X[p0+1], X[p1] = X[p1], X[p0], X[p0+1]
|
||
p0 += 1
|
||
p1 += 1
|
||
# recursion
|
||
if k < p0:
|
||
quickselect(X, b, p0, k)
|
||
elif k > p0:
|
||
quickselect(X, p0+1, e, k)
|
||
|
||
|
||
|
||
# quantile calculator (no libraries needed!)
|
||
def quantile(X, q, copy=True):
|
||
# edge cases and exceeded range of q
|
||
if q >= 1.0:
|
||
return max(X)
|
||
elif q <= 0.0:
|
||
return min(X)
|
||
|
||
n = len(X)
|
||
pos = (n - 1) * q # target position
|
||
left = int(pos) # index of the left element
|
||
rcoeff = pos - left # coefficient for the right element
|
||
|
||
# optional copy
|
||
if copy:
|
||
X = X[:]
|
||
|
||
quickselect(X, 0, n, left) # quickselect!
|
||
|
||
# X[left] is in order now, and elements on the right are greater
|
||
xleft = X[left] # left element
|
||
|
||
if rcoeff == 0.0: # the simple case
|
||
return xleft
|
||
else: # the general case
|
||
xright = min(X[i] for i in range(left+1, n)) # right element
|
||
return xleft * (1.0 - rcoeff) + xright * rcoeff
|
||
|
||
|
||
# interquantile range
|
||
def IQR(X, copy=True):
|
||
if copy:
|
||
X = X[:]
|
||
q1 = quantile(X, 0.25, copy=False)
|
||
q3 = quantile(X, 0.75, copy=False)
|
||
return q3 - q1
|
||
|
||
|
||
# cumulative distribution function of normal distribution N(mu, sigma)
|
||
def normal_cdf(mu, sigma, x):
|
||
return 0.5 * (1.0 + math.erf((x - mu) / (2.0**0.5 * sigma)))
|
||
|
||
|
||
# Chi-squared normality test
|
||
def chi2normality(X, var_df=1, bins=None):
|
||
n = len(X)
|
||
assert n >= 4
|
||
mu = sum(X) / n # mean
|
||
evar = sum((x - mu)**2 for x in X) / (n - var_df) # estimated variance
|
||
sigma = evar**0.5 # the expected distribution will be N(mu, sigma)
|
||
|
||
minx = min(X)
|
||
gap = max(X) - minx
|
||
assert gap != 0.0
|
||
if bins is None: # guess the number of bins
|
||
# see https://en.wikipedia.org/wiki/Freedman%E2%80%93Diaconis_rule
|
||
h = 2.0 * IQR(X) / (n**(1.0/3.0)) # bin size by the Freedman–Diaconis rule
|
||
k = int(math.ceil(gap / h)) # number of histogram bins
|
||
else:
|
||
k = bins
|
||
k = min(4, k) # k should be at least 4
|
||
h = gap / k # actual bin size
|
||
|
||
# section points between bins
|
||
points = [minx + h * i for i in range(k + 1)]
|
||
points[0] = -math.inf
|
||
points[-1] = math.inf
|
||
|
||
# actual frequency
|
||
freq = [0] * k
|
||
for x in X:
|
||
freq[min(int((x - minx) / h), k-1)] += 1
|
||
|
||
# cumulative distributions for the section points
|
||
cdfs = [normal_cdf(mu, sigma, p) for p in points]
|
||
|
||
# expected frequencies
|
||
expected = [(cdfs[i+1] - cdfs[i]) * n for i in range(k)]
|
||
|
||
# Chi-squared statistic
|
||
chi2stat = sum((freq[i] - expected[i])**2 / expected[i] for i in range(k))
|
||
# Chi-squared degrees of freedom
|
||
# i.e. number of bins - 1 - number of parameters of of distribution
|
||
# which is number of bins - 3, as normal distribution has two parameters
|
||
chi2df = k - 3
|
||
# p-value for the "X is sampled from a normal distribution" hypothesis
|
||
# note: for very big chi2stat values, it is close to 0
|
||
pvalue = 1.0 - stats.chi2.cdf(chi2stat, chi2df)
|
||
|
||
# significance (alpha) is a probability, that the hypothesis will be rejected
|
||
# despite of being actually true
|
||
|
||
# when p-value is lower than alpha, the hypothesis is rejected
|
||
# otherwise, the test fails to reject the hypothesis (this usually happens
|
||
# when the hypothesis is true)
|
||
|
||
return pvalue
|
||
|
||
|
||
# perform the test, print some messages
|
||
def chi2normality_describe(X, alpha=0.05):
|
||
print('Hypothesis: X is sampled from a normal distribution')
|
||
pvalue = chi2normality(X)
|
||
print('Significance level:', alpha)
|
||
print('p-value: %.7f' % pvalue)
|
||
if pvalue < alpha:
|
||
print('Hypothesis rejected. X doesn\'t seem to be sampled from a normal distribution.')
|
||
else:
|
||
print('Failed to reject the hypothesis.')
|
||
print()
|
||
|
||
|
||
|
||
if __name__ == '__main__':
|
||
# test for some uniform distribution
|
||
X = [random.random() * 2.0 + 3.0 for i in range(200)]
|
||
chi2normality_describe(X)
|
||
|
||
# test for actual normal distribution
|
||
X = [random.normalvariate(4.0, 5.0) for i in range(200)]
|
||
chi2normality_describe(X)
|
||
|