""" Downloaded from: https://gist.github.com/alimuldal/fbb19b73fa25423f02e8 """ import numpy as np from scipy import stats from itertools import combinations from statsmodels.stats.multitest import multipletests from statsmodels.stats.libqsturng import psturng import warnings from collections import namedtuple def kw_dunn(groups, pairs=None, alpha=0.05, method='bonf'): return KWDunn.compute(groups=groups, pairs=pairs, alpha=alpha, method=method) def chisqprob(chisq, df): """a helper function to deal with recent SciPy versions.""" return stats.chi2.sf(chisq, df) Dunn = namedtuple('Dunn', ["Z", "p_corrected", "alpha", "reject"] ) class KWDunn(namedtuple('_KWDunn', [ "H", "p_omnibus", "pairwise", ] )): @classmethod def compute(cls, groups, pairs=None, alpha=0.05, method='bonf'): H, p_omni, Z_pair, p_pair, rej = kw_dunn_raw(groups, to_compare=pairs, alpha=alpha, method=method) pairwise = {} if (pairs is None) or (len(pairs) == 0): pass else: for pair, Z, p, r in zip(pairs, Z_pair, p_pair, rej): pairwise[pair] = Dunn(Z, p, alpha, r) return cls(H, p_omni, pairwise) def kw_dunn_raw(groups, to_compare=None, alpha=0.05, method='bonf'): """ Kruskal-Wallis 1-way ANOVA with Dunn's multiple comparison test Arguments: --------------- groups: sequence arrays corresponding to k mutually independent samples from continuous populations to_compare: sequence tuples specifying the indices of pairs of groups to compare, e.g. [(0, 1), (0, 2)] would compare group 0 with 1 & 2. by default, all possible pairwise comparisons between groups are performed. alpha: float family-wise error rate used for correcting for multiple comparisons (see statsmodels.stats.multitest.multipletests for details) method: string method used to adjust p-values to account for multiple corrections (see statsmodels.stats.multitest.multipletests for options) Returns: --------------- H: float Kruskal-Wallis H-statistic p_omnibus: float p-value corresponding to the global null hypothesis that the medians of the groups are all equal Z_pairs: float array Z-scores computed for the absolute difference in mean ranks for each pairwise comparison p_corrected: float array corrected p-values for each pairwise comparison, corresponding to the null hypothesis that the pair of groups has equal medians. note that these are only meaningful if the global null hypothesis is rejected. reject: bool array True for pairs where the null hypothesis can be rejected for the given alpha Reference: --------------- Gibbons, J. D., & Chakraborti, S. (2011). Nonparametric Statistical Inference (5th ed., pp. 353-357). Boca Raton, FL: Chapman & Hall. """ # omnibus test (K-W ANOVA) # ------------------------------------------------------------------------- groups = [np.array(gg) for gg in groups] k = len(groups) n = np.array([len(gg) for gg in groups]) if np.any(n < 5): warnings.warn("Sample sizes < 5 are not recommended (K-W test assumes " "a chi square distribution)") allgroups = np.concatenate(groups) N = len(allgroups) ranked = stats.rankdata(allgroups) # correction factor for ties T = stats.tiecorrect(ranked) if T == 0: raise ValueError('All numbers are identical in kruskal') # sum of ranks for each group j = np.insert(np.cumsum(n), 0, 0) R = np.empty(k, dtype=np.float) for ii in range(k): R[ii] = ranked[j[ii]:j[ii + 1]].sum() # the Kruskal-Wallis H-statistic H = (12. / (N * (N + 1.))) * ((R ** 2.) / n).sum() - 3 * (N + 1) # apply correction factor for ties H /= T df_omnibus = k - 1 p_omnibus = chisqprob(H, df_omnibus) # multiple comparisons # ------------------------------------------------------------------------- # by default we compare every possible pair of groups if to_compare is None: to_compare = tuple(combinations(range(k), 2)) ncomp = len(to_compare) Z_pairs = np.empty(ncomp, dtype=np.float) p_uncorrected = np.empty(ncomp, dtype=np.float) Rmean = R / n for pp, (ii, jj) in enumerate(to_compare): # standardized score Zij = (np.abs(Rmean[ii] - Rmean[jj]) / np.sqrt((1. / 12.) * N * (N + 1) * (1. / n[ii] + 1. / n[jj]))) Z_pairs[pp] = Zij # corresponding p-values obtained from upper quantiles of the standard # normal distribution p_uncorrected = stats.norm.sf(Z_pairs) * 2. # correction for multiple comparisons reject, p_corrected, alphac_sidak, alphac_bonf = multipletests( p_uncorrected, method=method ) return H, p_omnibus, Z_pairs, p_corrected, reject