2021-10-17 21:28:23 +02:00
|
|
|
"""
|
|
|
|
Komputerowa analiza danych
|
|
|
|
Zadanie 1
|
|
|
|
Michał Leśniak 195642
|
|
|
|
"""
|
|
|
|
import locale
|
|
|
|
# Switch every locale category to the user's environment default (that is what
# the empty string requests) so the locale.format_string() calls in this file
# use the local number formatting conventions.
locale.setlocale(locale.LC_ALL, '')
|
|
|
|
|
|
|
|
# Display name of each species, keyed by the species code used as the
# dictionary key of the per-species counters.
SPECIES_NAMES = {
    '0': 'setosa',
    '1': 'versicolor',
    '2': 'virginica',
}
|
|
|
|
|
|
|
|
# Row labels of the trait table; looked up by the position of each trait's
# statistics in the list handed to print_data().
ATTRIBUTE_NAMES = [
    'Długość działki kielicha [cm]',
    'Szerokość działki kielicha [cm]',
    'Długość płatka [cm]',
    'Szerokość płatka [cm]',
]
|
|
|
|
|
|
|
|
|
|
|
|
def mean(lst):
    """Return the arithmetic mean of the numbers in ``lst``.

    Args:
        lst: Sized iterable of numbers.

    Returns:
        The sum of the values divided by their count.

    Raises:
        ZeroDivisionError: If ``lst`` is empty.
    """
    # The built-in sum() replaces the hand-written accumulator, whose local
    # variable name used to shadow that very built-in.
    return sum(lst) / len(lst)
|
|
|
|
|
|
|
|
|
2021-11-07 20:39:10 +01:00
|
|
|
def median(lst):
    """Return the median of ``lst``.

    The middle element(s) are picked purely by position, so ``lst`` has to be
    sorted already for the result to be the actual median.

    Args:
        lst: Non-empty indexable sequence of numbers.

    Returns:
        The middle element for an odd length, otherwise the mean of the two
        middle elements.
    """
    size = len(lst)
    middle = size // 2
    if size % 2:
        return lst[middle]
    return (lst[middle - 1] + lst[middle]) / 2
|
|
|
|
|
|
|
|
|
|
|
|
def sample_standard_deviation(lst, lst_mean):
    """Return the sample standard deviation of the numbers in ``lst``.

    Args:
        lst: Sized iterable of numbers.
        lst_mean: Mean of ``lst``; passed in so it is not computed twice.

    Returns:
        Square root of the summed squared deviations from ``lst_mean``
        divided by ``len(lst) - 1``.

    Raises:
        ZeroDivisionError: If ``lst`` holds exactly one element.
    """
    # A generator fed to the built-in sum() avoids both the throwaway list and
    # the local accumulator that used to shadow ``sum``.
    squared_deviations = sum((value - lst_mean) ** 2 for value in lst)
    return (squared_deviations / (len(lst) - 1)) ** 0.5
|
2021-10-17 21:28:23 +02:00
|
|
|
|
|
|
|
|
2021-11-07 20:39:10 +01:00
|
|
|
def q1(lst, lst_median):
    """Return the first quartile of the sorted list ``lst``.

    The quartile is the median of the lower half of ``lst``. For an odd
    length the supplied median is counted as part of that half.

    Args:
        lst: Sorted list of numbers; it is not modified (a slice is used).
        lst_median: Median of ``lst``.

    Returns:
        The median of the lower half.
    """
    size = len(lst)
    lower_half = lst[:size // 2]
    if size % 2 == 0:
        return median(lower_half)
    # Odd length: the median itself closes the lower half.
    lower_half.append(lst_median)
    return median(lower_half)
|
2021-10-17 21:28:23 +02:00
|
|
|
|
2021-11-07 22:33:57 +01:00
|
|
|
|
2021-11-07 20:39:10 +01:00
|
|
|
def q3(lst, lst_median):
    """Return the third quartile of the sorted list ``lst``.

    The quartile is the median of the upper half of ``lst``. For an odd
    length the supplied median is counted as part of that half.

    Args:
        lst: Sorted list of numbers; it is not modified (a slice is used).
        lst_median: Median of ``lst``.

    Returns:
        The median of the upper half.
    """
    size = len(lst)
    if size % 2 == 0:
        return median(lst[size // 2:])
    # Odd length: skip the middle element and put the supplied median in
    # front of the remaining upper elements instead.
    upper_half = lst[size // 2 + 1:]
    upper_half.insert(0, lst_median)
    return median(upper_half)
|
2021-10-17 21:28:23 +02:00
|
|
|
|
|
|
|
|
|
|
|
def min(lst):
    """Return the smallest element of ``lst``.

    NOTE: this definition shadows the built-in ``min`` for the whole module.

    The previous implementation returned ``lst[0]``, which is only the minimum
    when ``lst`` is sorted in ascending order. Scanning the sequence gives the
    same answer for sorted input and the right answer for unsorted input.

    Args:
        lst: Non-empty indexable sequence of comparable values.

    Returns:
        The smallest element.

    Raises:
        IndexError: If ``lst`` is empty.
    """
    smallest = lst[0]
    for value in lst:
        if value < smallest:
            smallest = value
    return smallest
|
|
|
|
|
|
|
|
|
|
|
|
def max(lst):
    """Return the largest element of ``lst``.

    NOTE: this definition shadows the built-in ``max`` for the whole module.

    The previous implementation returned ``lst[-1]``, which is only the
    maximum when ``lst`` is sorted in ascending order. Scanning the sequence
    gives the same answer for sorted input and the right answer for unsorted
    input.

    Args:
        lst: Non-empty indexable sequence of comparable values.

    Returns:
        The largest element.

    Raises:
        IndexError: If ``lst`` is empty.
    """
    largest = lst[0]
    for value in lst:
        if value > largest:
            largest = value
    return largest
|
|
|
|
|
|
|
|
|
|
|
|
def calc_data(lst):
    """Compute the summary statistics of one list of measurements.

    Args:
        lst: List of numbers, passed unchanged to every statistic helper.

    Returns:
        A 7-tuple ``(minimum, mean, sample standard deviation, median, Q1,
        Q3, maximum)``.
    """
    average = mean(lst)
    middle = median(lst)

    lowest = min(lst)
    deviation = sample_standard_deviation(lst, average)
    lower_quartile = q1(lst, middle)
    upper_quartile = q3(lst, middle)
    highest = max(lst)

    return (lowest, average, deviation, middle,
            lower_quartile, upper_quartile, highest)
|
2021-10-17 21:28:23 +02:00
|
|
|
|
|
|
|
|
|
|
|
def calc_species_data(species):
    """Turn per-species counts into rows of (name, count, share).

    Args:
        species: Mapping from species code (a key of ``SPECIES_NAMES``) to
            the number of occurrences.

    Returns:
        A list with one ``(name, count, count / total)`` tuple per entry of
        ``species`` in iteration order, followed by the summary row
        ``("Razem", total, 1.0)``.
    """
    total = sum(species.values())

    rows = [
        (SPECIES_NAMES[code], count, count / total)
        for code, count in species.items()
    ]
    rows.append(("Razem", total, 1.0))

    return rows
|
|
|
|
|
|
|
|
|
|
|
|
def percentage_format(x):
    """Format the fraction ``x`` as a percentage with one decimal place.

    Args:
        x: Fraction to show; it is multiplied by 100 before formatting.

    Returns:
        The text produced by ``locale.format_string`` followed by ``%``.
    """
    percent = x * 100
    return locale.format_string('%.1f%%', percent)
|
|
|
|
|
|
|
|
|
|
|
|
def float_format(x):
    """Format the number ``x`` with two decimal places.

    Args:
        x: Number to format.

    Returns:
        The text produced by ``locale.format_string``.
    """
    two_decimals = '%.2f'
    return locale.format_string(two_decimals, x)
|
|
|
|
|
|
|
|
|
|
|
|
def print_table(table):
    """Print ``table`` to standard output as an aligned text table.

    The first row is the header: every header cell is centred. In all other
    rows the first cell is left-aligned and the remaining cells are centred.
    Cells are joined with ``' | '``. A falsy row (``None`` or an empty
    sequence) is printed as a horizontal rule of dashes. Rules of ``=`` are
    printed above the header, below the header and after the last row.

    Args:
        table: Sequence of rows. Each regular row is a sequence of strings
            with at least as many cells as the header. Nothing is printed
            for an empty table.
    """
    if not table:
        return

    # Column width = longest cell of that column; separator rows are skipped.
    # Plain loops are used on purpose: this module defines its own ``max``.
    cell_sizes = [0] * len(table[0])
    for row in table:
        if not row:
            continue
        for i, size in enumerate(cell_sizes):
            if len(row[i]) > size:
                cell_sizes[i] = len(row[i])

    header_row_format = ' | '.join(f'{{:^{x}}}' for x in cell_sizes)
    row_format = f'{{:<{cell_sizes[0]}}} | ' + \
        ' | '.join(f'{{:^{x}}}' for x in cell_sizes[1:])

    # The header is recognised by position. Comparing rows with ``==`` would
    # also render any data row that happens to equal the header as a header.
    lines = []
    for index, row in enumerate(table):
        if index == 0:
            lines.append(header_row_format.format(*row))
        elif not row:
            lines.append(None)  # placeholder until the rule width is known
        else:
            lines.append(row_format.format(*row))

    # Rules are one character wider than the widest formatted row.
    row_size = 0
    for line in lines:
        if line and len(line) > row_size:
            row_size = len(line)
    row_size += 1

    double_rule = '=' * row_size
    single_rule = '-' * row_size

    print(double_rule)
    print(lines[0])
    print(double_rule)
    for line in lines[1:]:
        print(single_rule if line is None else line)
    print(double_rule)
|
|
|
|
|
|
|
|
|
|
|
|
def print_data(species_data, trait_data):
    """Print the species table and the trait table, separated by a blank line.

    Args:
        species_data: Sequence of ``(name, count, share)`` rows. A separator
            is inserted before every row that equals the last one.
        trait_data: Sequence of 7-element statistics rows ``(minimum, mean,
            standard deviation, median, Q1, Q3, maximum)``; row ``i`` is
            labelled with ``ATTRIBUTE_NAMES[i]``.
    """
    species_rows = [("Gatunek", "Liczebność (%)")]
    for entry in species_data:
        if entry == species_data[-1]:
            # None is rendered by print_table() as a horizontal rule.
            species_rows.append(None)
        count_text = "{} ({})".format(entry[1], percentage_format(entry[2]))
        species_rows.append((entry[0], count_text))

    trait_rows = [
        ("Cecha", "Minimum", "Śr. arytm. (± odch. stand.)", "Mediana (Q1 - Q3)", "Maksimum"),
    ]
    for index, stats in enumerate(trait_data):
        label = ATTRIBUTE_NAMES[index]
        minimum_text = float_format(stats[0])
        mean_text = "{} (±{})".format(
            float_format(stats[1]), float_format(stats[2]))
        median_text = "{} ({} - {})".format(
            float_format(stats[3]),
            float_format(stats[4]),
            float_format(stats[5]))
        maximum_text = float_format(stats[6])
        trait_rows.append(
            (label, minimum_text, mean_text, median_text, maximum_text))

    print_table(species_rows)
    print()
    print_table(trait_rows)
|
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Read ``data.csv``, compute the statistics and print both tables.

    Every non-blank line of the file must consist of exactly five
    comma-separated fields: sepal length, sepal width, petal length, petal
    width and a species code (``'0'``, ``'1'`` or ``'2'``).

    Raises:
        ValueError: If a line does not have exactly five fields or a
            measurement is not a valid number.
        KeyError: If a species code is not one of the known codes.
    """
    # read data
    with open("data.csv", "r") as f:
        data = f.read().splitlines()

    # parse data
    sepal_length_list = []
    sepal_width_list = []
    petal_length_list = []
    petal_width_list = []
    species_dict = {
        '0': 0,
        '1': 0,
        '2': 0,
    }
    for line in data:
        # Blank lines (e.g. extra newlines at the end of the file) carry no
        # record; without this guard the unpacking below would fail on them.
        if not line.strip():
            continue
        sepal_length, sepal_width, petal_length, petal_width, species = \
            line.split(',')
        sepal_length_list.append(float(sepal_length))
        sepal_width_list.append(float(sepal_width))
        petal_length_list.append(float(petal_length))
        petal_width_list.append(float(petal_width))
        # float() already ignores surrounding whitespace; do the same for the
        # species code so that "1 " is not rejected as an unknown key.
        species_dict[species.strip()] += 1

    # The statistic helpers pick elements by position and need sorted input.
    sepal_length_list.sort()
    sepal_width_list.sort()
    petal_length_list.sort()
    petal_width_list.sort()

    # calculate results
    species_data = calc_species_data(species_dict)

    trait_data = [
        calc_data(lst)
        for lst in (sepal_length_list, sepal_width_list,
                    petal_length_list, petal_width_list)
    ]

    # print results
    print_data(species_data, trait_data)
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    import errno
    import os.path

    # Explicit check instead of ``assert``: assertions are removed when Python
    # runs with -O, which would silently skip the validation.
    if not os.path.isfile('data.csv'):
        raise FileNotFoundError(
            errno.ENOENT, os.strerror(errno.ENOENT), 'data.csv')
    main()
|