# gender-survey-statistics/main.py

def load_data(filename):
    """Read the survey responses from a CSV file.

    Args:
        filename: Location of the CSV file; passed straight to
            ``pandas.read_csv``.

    Returns:
        The ``pandas.DataFrame`` produced by ``pandas.read_csv``.
    """
    # pandas is imported lazily, inside the function, like everywhere else
    # in this module.
    from pandas import read_csv

    frame = read_csv(filename)
    return frame


def get_questions(data):
    """Return the names of the columns that are treated as questions.

    Args:
        data: Object exposing ``columns`` (e.g. a ``pandas.DataFrame``).

    Returns:
        A list of every column name except the excluded ones below, in the
        original column order.
    """
    # Columns that are never analysed as questions.
    excluded = ("Timestamp", "Ваш пол", "Ваш возраст", "googlehui")

    questions = []
    for column in data.columns:
        if column in excluded:
            continue
        questions.append(column)
    return questions


def get_counts(data, questions):
    """Count the answers to each question separately for women and men.

    Args:
        data: DataFrame with a "Ваш пол" column and one column per question.
        questions: Iterable of question column names.

    Returns:
        A ``(women_answers, men_answers)`` tuple. Each element maps a
        question to the ``{answer: count}`` dict obtained from
        ``Series.value_counts()`` on that gender's rows.
    """
    gender = data["Ваш пол"]
    female_rows = data[gender == "женский"]
    male_rows = data[gender == "мужской"]

    female_counts, male_counts = {}, {}
    groups = ((female_rows, female_counts), (male_rows, male_counts))

    # Single pass over `questions`, filling both result dicts per question.
    for question in questions:
        for rows, counts in groups:
            counts[question] = rows[question].value_counts().to_dict()

    return female_counts, male_counts


def fisher_for_question(data, question):
    """Run Fisher's exact test of gender against each answer of a question.

    For every distinct non-missing answer, the question column is binarized
    ("equals this answer" vs. not) and cross-tabulated with "Ваш пол".

    Args:
        data: DataFrame with a "Ваш пол" column and the ``question`` column.
        question: Name of the question column.

    Returns:
        Dict mapping each answer to the p-value returned by
        ``scipy.stats.fisher_exact``. Answers whose contingency table is not
        2x2 are left out.
    """
    import pandas as pd
    from scipy.stats import fisher_exact

    answers = data[question]
    p_values = {}

    for option in answers.dropna().unique():
        # Binarize: True where the respondent gave exactly this answer.
        chose_option = answers == option
        contingency = pd.crosstab(data["Ваш пол"], chose_option)

        # Only a 2x2 table is tested; any other shape is skipped silently.
        if contingency.shape != (2, 2):
            continue

        _odds_ratio, p_value = fisher_exact(contingency)
        p_values[option] = p_value

    return p_values


def chi2_for_question(data, question):
    """Return the chi-square p-value for gender vs. the answers to a question.

    Args:
        data: DataFrame with a "Ваш пол" column and the ``question`` column.
        question: Name of the question column.

    Returns:
        The p-value reported by ``scipy.stats.chi2_contingency`` for the
        gender-by-answer contingency table.
    """
    import pandas as pd
    from scipy.stats import chi2_contingency

    contingency = pd.crosstab(data["Ваш пол"], data[question])

    # Only the p-value is needed; the other three results are discarded.
    _statistic, p_value, _dof, _expected = chi2_contingency(contingency)
    return p_value


def expand_counts(count_dict, sep=";"):
    """Split combined answers into single options and sum their counts.

    Each key of ``count_dict`` is converted to ``str`` and split on ``sep``;
    every resulting option (with surrounding whitespace stripped) receives
    the full count of the key it came from.

    Args:
        count_dict: Mapping of answer (possibly several options joined by
            ``sep``) to its count.
        sep: Separator between options inside one answer.

    Returns:
        A plain dict mapping each individual option to its total count.
    """
    from collections import Counter

    totals = Counter()

    for key, value in count_dict.items():
        # Split the key on the separator.
        for item in str(key).split(sep):
            item = item.strip()
            # A trailing or doubled separator ("a;b;") produces an empty
            # token; it is not an answer option, so it is not counted.
            if item:
                totals[item] += value

    return dict(totals)


def expand_all_counts(data_dict):
    """Apply ``expand_counts`` to the answer counts of every question.

    Args:
        data_dict: Mapping of question to its ``{answer: count}`` dict.

    Returns:
        A new dict with the same questions, each mapped to the result of
        ``expand_counts`` (called with its default separator).
    """
    return {
        question: expand_counts(answer_counts)
        for question, answer_counts in data_dict.items()
    }


data = load_data("MEN.fixed.csv")
questions = get_questions(data)

# Just take a look at the answer frequencies, split by gender.
women_ans, men_ans = get_counts(data, questions)

# Break combined answers into individual options before printing.
women_ans = expand_all_counts(women_ans)
men_ans = expand_all_counts(men_ans)

print("women_ans: ", women_ans)
print("men_ans: ", men_ans)

# NOTE(review): the script stops here, so the statistics section below never
# runs. The early exit is kept as found; remove it to enable the tests.
# SystemExit is raised directly instead of calling exit(): exit() is injected
# by the `site` module for interactive use and is not guaranteed to exist
# (e.g. under `python -S`).
raise SystemExit(0)

# Statistics (currently unreachable because of the early exit above).
for q in questions:
    fisher_res = fisher_for_question(data, q)
    chi2_p = chi2_for_question(data, q)

    print(f"\nВопрос: {q}")
    print("Фишер:", fisher_res)
    print("Хи-квадрат p:", chi2_p)