# Source code for retentioneering.core.core_functions.compare

# * Copyright (C) 2020 Maxim Godzi, Anatoly Zaytsev, Retentioneering Team
# * This Source Code Form is subject to the terms of the Retentioneering Software Non-Exclusive License (License)
# * By using, sharing or editing this code you agree with the License terms and conditions.
# * You can obtain License text at https://github.com/retentioneering/retentioneering-tools/blob/master/LICENSE.md

from scipy.stats import ks_2samp, mannwhitneyu

from ...visualization import plot_compare
from statsmodels.stats.power import TTestIndPower
from statsmodels.stats.weightstats import ttest_ind
import numpy as np
import math

# Names of the statistical tests supported by ``compare`` / ``_stat_test``.
# 'ttest' was missing even though the dispatch in _stat_test accepts it.
TESTS_LIST = ['ks_2samp', 'mannwhitneyu', 'ttest']


def compare(self, *, groups, function, test, group_names=('group_1', 'group_2'), alpha=0.05):
    """
    Tests selected metric between two groups of users.

    Parameters
    ----------
    groups: tuple
        Must contain tuple of two elements (g_1, g_2): where g_1 and g_2
        are collections of user_id`s (list, tuple or set).
    function: function(x) -> number
        Selected metric. Must be a function which takes as an argument a
        dataset for a single user trajectory and returns a single
        numerical value.
    test: {'mannwhitneyu', 'ttest', 'ks_2samp'}
        Test the null hypothesis that 2 independent samples are drawn
        from the same distribution. One-sided tests are used, meaning
        that distributions are compared 'less' or 'greater'. Rule of
        thumb is: for discrete variables (like conversions or number of
        purchases) use Mann-Whitney ('mannwhitneyu') test or t-test
        ('ttest'). For continuous variables (like average_check) use
        Kolmogorov-Smirnov test ('ks_2samp').
    group_names: tuple (optional, default: ('group_1', 'group_2'))
        Names for selected groups g_1 and g_2.
    alpha: float (optional, default 0.05)
        Selected level of significance.

    Returns
    -------
    Prints statistical comparison between two groups over selected metric
    and test. Plots a distribution of the selected metric for both groups.
    """
    # obtain two populations for each group
    index_col = self.retention_config['user_col']
    data = self._obj
    g1 = data[data[index_col].isin(groups[0])].copy()
    g2 = data[data[index_col].isin(groups[1])].copy()

    # obtain two distributions: one metric value per user, NaNs dropped
    g1_data = g1.groupby(index_col).apply(function).dropna().values
    g2_data = g2.groupby(index_col).apply(function).dropna().values

    # plot graphs
    plot_compare.compare(num_data=(g1_data, g2_data),
                         group_names=group_names)

    # calculate test statistics
    if test:
        print(f"{group_names[0]} (mean \u00B1 SD): {g1_data.mean():.3f} \u00B1 {g1_data.std():.3f}, n = {len(g1_data)}")
        print(f"{group_names[1]} (mean \u00B1 SD): {g2_data.mean():.3f} \u00B1 {g2_data.std():.3f}, n = {len(g2_data)}")
        # run the one-sided test in both directions and report the
        # direction with the smaller p-value
        p_val_norm, power_norm = _stat_test(g1_data, g2_data, test, alpha)
        p_val_rev, power_rev = _stat_test(g2_data, g1_data, test, alpha)
        if p_val_norm < p_val_rev:
            p_val, power = p_val_norm, power_norm
            label_max, label_min = group_names[0], group_names[1]
        else:
            p_val, power = p_val_rev, power_rev
            label_max, label_min = group_names[1], group_names[0]
        print(f"'{label_max}' is greater than '{label_min}' with P-value: {p_val:.5f}")
        print(f"power of the test: {100*power:.2f}%")
def _stat_test(data_max, data_min, test, alpha):
    """
    Run a one-sided test that ``data_max`` is stochastically greater
    than ``data_min`` and estimate the power of the comparison.

    Parameters
    ----------
    data_max, data_min: array-like of numbers
        Per-user metric values for the two groups.
    test: {'ks_2samp', 'mannwhitneyu', 'ttest'}
        Which one-sided test to apply.
    alpha: float
        Significance level used for the power calculation.

    Returns
    -------
    tuple (p_val, power)

    Raises
    ------
    ValueError
        If ``test`` is not one of the supported test names.
    """
    # calculate effect size
    if (max(data_max) <= 1 and min(data_max) >= 0
            and max(data_min) <= 1 and min(data_min) >= 0):
        # all values in [0, 1]: treat as proportions and use Cohen's h
        effect_size = _cohenh(data_max, data_min)
    else:
        # for other variables use Cohen's d
        effect_size = _cohend(data_max, data_min)

    # power of a one-sided two-sample t-test with this effect size
    power = TTestIndPower().power(effect_size=effect_size,
                                  nobs1=len(data_max),
                                  ratio=len(data_min) / len(data_max),
                                  alpha=alpha,
                                  alternative='larger')

    if test == 'ks_2samp':
        p_val = ks_2samp(data_max, data_min, alternative='less')[1]
    elif test == 'mannwhitneyu':
        p_val = mannwhitneyu(data_max, data_min, alternative='greater')[1]
    elif test == 'ttest':
        p_val = ttest_ind(data_max, data_min, alternative='larger')[1]
    else:
        # was: fell through all branches and raised an opaque
        # UnboundLocalError on p_val for an unknown test name
        raise ValueError(
            f"test must be one of 'ks_2samp', 'mannwhitneyu', 'ttest'; got {test!r}")
    return p_val, power


def _cohend(d1, d2):
    """Cohen's d effect size of d1 vs d2, using the pooled standard deviation."""
    n1, n2 = len(d1), len(d2)
    s1, s2 = np.var(d1, ddof=1), np.var(d2, ddof=1)
    # pooled standard deviation
    s = math.sqrt(((n1 - 1) * s1 + (n2 - 1) * s2) / (n1 + n2 - 2))
    u1, u2 = np.mean(d1), np.mean(d2)
    return (u1 - u2) / s


def _cohenh(d1, d2):
    """Cohen's h effect size for two proportions (arcsine-sqrt transform)."""
    u1, u2 = np.mean(d1), np.mean(d2)
    return 2 * (math.asin(math.sqrt(u1)) - math.asin(math.sqrt(u2)))