Подсчёт ответов, сломана статистика

2026-03-22 15:44:59 +04:00
commit 87addcda63
5 changed files with 201 additions and 0 deletions
--- a/main.py
+++ b/main.py
@@ -0,0 +1,108 @@
+def load_data(filename):
+    """Load a CSV file into a pandas DataFrame.
+
+    Args:
+        filename: Location of the CSV file; passed unchanged to
+            ``pandas.read_csv``.
+
+    Returns:
+        Whatever ``pandas.read_csv`` returns for ``filename`` with default
+        options.
+    """
+    # Imported inside the function, so pandas is only loaded on first call.
+    from pandas import read_csv
+
+    frame = read_csv(filename)
+    return frame
+
+
+
+def get_questions(data):
+    """Return the column names of ``data`` that are treated as questions.
+
+    Every column is kept except the ones named in the exclusion tuple below;
+    the original column order is preserved.
+
+    Args:
+        data: Object exposing an iterable ``columns`` attribute
+            (e.g. a pandas DataFrame).
+
+    Returns:
+        A list with the remaining column names.
+    """
+    excluded_columns = ("Timestamp", "Ваш пол", "Ваш возраст", "googlehui")
+    column_names = data.columns
+    return [name for name in column_names if name not in excluded_columns]
+
+
+
+
+def get_counts(data, questions):
+    """Count the answers to each question separately for women and men.
+
+    Rows are selected by comparing the ``"Ваш пол"`` column with
+    ``"женский"`` and ``"мужской"``; rows with any other value end up in
+    neither group.
+
+    Args:
+        data: DataFrame with a ``"Ваш пол"`` column and one column per
+            question.
+        questions: Iterable of column names to count.
+
+    Returns:
+        A ``(women_answers, men_answers)`` tuple. Each element maps a question
+        to a ``{answer: count}`` dict built from ``Series.value_counts()``.
+    """
+    gender = data["Ваш пол"]
+
+    per_gender = []
+    for label in ("женский", "мужской"):
+        respondents = data[gender == label]
+        per_gender.append(
+            {q: respondents[q].value_counts().to_dict() for q in questions}
+        )
+
+    women_answers, men_answers = per_gender
+    return women_answers, men_answers
+
+
+
+def fisher_for_question(data, question):
+    """Run Fisher's exact test of gender against each answer to a question.
+
+    For every distinct answer the respondents are split into "gave exactly
+    this answer" and "gave some other answer", and that split is
+    cross-tabulated against the ``"Ваш пол"`` column.
+
+    Only respondents who actually answered the question are included.
+    Counting a missing answer as "gave some other answer" would inflate that
+    cell, and it would disagree with ``pd.crosstab``, which drops missing
+    values.
+
+    Args:
+        data: DataFrame with a ``"Ваш пол"`` column and a ``question`` column.
+        question: Name of the column holding the answers.
+
+    Returns:
+        Dict mapping each answer to the p-value returned by
+        ``scipy.stats.fisher_exact``. Answers whose contingency table is not
+        2x2 are left out. That happens when fewer or more than two genders
+        are present, or when every respondent gave the same answer.
+    """
+    import pandas as pd
+    from scipy.stats import fisher_exact
+
+    # Respondents who skipped the question carry no information about it.
+    answered = data[data[question].notna()]
+    gender = answered["Ваш пол"]
+    answers = answered[question]
+
+    results = {}
+
+    for answer in answers.unique():
+        # Binarise: True where the respondent gave exactly this answer.
+        chose_answer = answers == answer
+
+        table = pd.crosstab(gender, chose_answer)
+
+        # Fisher's exact test as used here needs a 2x2 table.
+        if table.shape != (2, 2):
+            continue
+
+        _, p = fisher_exact(table)
+        results[answer] = p
+
+    return results
+
+
+
+def chi2_for_question(data, question):
+    """Chi-square test of independence between gender and a question.
+
+    Args:
+        data: DataFrame with a ``"Ваш пол"`` column and a ``question`` column.
+        question: Name of the column holding the answers.
+
+    Returns:
+        The p-value reported by ``scipy.stats.chi2_contingency`` for the
+        gender-by-answer contingency table. Returns ``float("nan")`` when no
+        row has both a gender and an answer, because the test is undefined
+        without observations.
+    """
+    import pandas as pd
+    from scipy.stats import chi2_contingency
+
+    # Keep only rows usable for the table: both gender and answer present.
+    usable = data[data["Ваш пол"].notna() & data[question].notna()]
+
+    # An empty table would make chi2_contingency fail; report "no result"
+    # instead so one unanswered question cannot abort a loop over questions.
+    if usable.empty:
+        return float("nan")
+
+    table = pd.crosstab(usable["Ваш пол"], usable[question])
+
+    _, p, _, _ = chi2_contingency(table)
+
+    return p
+
+
+
+def expand_counts(count_dict, sep=";"):
+    """Split multi-option answers and sum the counts per single option.
+
+    Each key of ``count_dict`` is converted to ``str``, split on ``sep`` and
+    stripped of surrounding whitespace; the key's count is added to every
+    resulting option. Empty pieces are ignored, so a trailing or doubled
+    separator such as ``"a; b;"`` does not create a bogus ``""`` option.
+
+    Args:
+        count_dict: Mapping of raw answer to its number of occurrences.
+        sep: Separator between options inside one raw answer.
+
+    Returns:
+        A plain dict mapping each single option to its total count.
+    """
+    from collections import Counter
+
+    totals = Counter()
+
+    for key, count in count_dict.items():
+        for piece in str(key).split(sep):
+            option = piece.strip()
+            if option:
+                totals[option] += count
+
+    return dict(totals)
+
+
+
+def expand_all_counts(data_dict):
+    """Apply ``expand_counts`` to the answer counts of every question.
+
+    Args:
+        data_dict: Mapping of question to its ``{answer: count}`` dict.
+
+    Returns:
+        A new dict with the same questions, each mapped to the result of
+        ``expand_counts`` called with the default separator.
+    """
+    return {
+        question: expand_counts(answer_counts)
+        for question, answer_counts in data_dict.items()
+    }
+
+
+
+data = load_data("MEN.fixed.csv")
+questions = get_questions(data)
+
+# Frequency overview: per-gender answer counts for every question.
+women_ans, men_ans = get_counts(data, questions)
+
+# Split ";"-separated multi-option answers into per-option totals.
+women_ans = expand_all_counts(women_ans)
+men_ans = expand_all_counts(men_ans)
+
+print("women_ans: ", women_ans)
+print("men_ans: ", men_ans)
+
+# NOTE(review): the script stops here, so the statistics loop below never
+# runs. Presumably it is disabled on purpose until the statistics are fixed;
+# confirm before removing this line.
+# SystemExit is raised directly because the bare exit() helper is injected by
+# the site module for interactive use and is not guaranteed to exist.
+raise SystemExit(0)
+
+# Statistics: per-answer Fisher p-values and a chi-square p-value per question.
+for q in questions:
+    fisher_res = fisher_for_question(data, q)
+    chi2_p = chi2_for_question(data, q)
+
+    print(f"\nВопрос: {q}")
+    print("Фишер:", fisher_res)
+    print("Хи-квадрат p:", chi2_p)