kw_dunn.py 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165
  1. """
  2. Downloaded from: https://gist.github.com/alimuldal/fbb19b73fa25423f02e8
  3. """
  4. import numpy as np
  5. from scipy import stats
  6. from itertools import combinations
  7. from statsmodels.stats.multitest import multipletests
  8. from statsmodels.stats.libqsturng import psturng
  9. import warnings
  10. from collections import namedtuple
  11. def kw_dunn(groups, pairs=None, alpha=0.05, method='bonf'):
  12. return KWDunn.compute(groups=groups,
  13. pairs=pairs,
  14. alpha=alpha,
  15. method=method)
  16. def chisqprob(chisq, df):
  17. """a helper function to deal with recent SciPy versions."""
  18. return stats.chi2.sf(chisq, df)
  19. Dunn = namedtuple('Dunn',
  20. ["Z", "p_corrected", "alpha", "reject"]
  21. )
  22. class KWDunn(namedtuple('_KWDunn',
  23. [
  24. "H",
  25. "p_omnibus",
  26. "pairwise",
  27. ]
  28. )):
  29. @classmethod
  30. def compute(cls, groups, pairs=None, alpha=0.05, method='bonf'):
  31. H, p_omni, Z_pair, p_pair, rej = kw_dunn_raw(groups, to_compare=pairs,
  32. alpha=alpha, method=method)
  33. pairwise = {}
  34. if (pairs is None) or (len(pairs) == 0):
  35. pass
  36. else:
  37. for pair, Z, p, r in zip(pairs, Z_pair, p_pair, rej):
  38. pairwise[pair] = Dunn(Z, p, alpha, r)
  39. return cls(H, p_omni, pairwise)
  40. def kw_dunn_raw(groups, to_compare=None, alpha=0.05, method='bonf'):
  41. """
  42. Kruskal-Wallis 1-way ANOVA with Dunn's multiple comparison test
  43. Arguments:
  44. ---------------
  45. groups: sequence
  46. arrays corresponding to k mutually independent samples from
  47. continuous populations
  48. to_compare: sequence
  49. tuples specifying the indices of pairs of groups to compare, e.g.
  50. [(0, 1), (0, 2)] would compare group 0 with 1 & 2. by default, all
  51. possible pairwise comparisons between groups are performed.
  52. alpha: float
  53. family-wise error rate used for correcting for multiple comparisons
  54. (see statsmodels.stats.multitest.multipletests for details)
  55. method: string
  56. method used to adjust p-values to account for multiple corrections (see
  57. statsmodels.stats.multitest.multipletests for options)
  58. Returns:
  59. ---------------
  60. H: float
  61. Kruskal-Wallis H-statistic
  62. p_omnibus: float
  63. p-value corresponding to the global null hypothesis that the medians of
  64. the groups are all equal
  65. Z_pairs: float array
  66. Z-scores computed for the absolute difference in mean ranks for each
  67. pairwise comparison
  68. p_corrected: float array
  69. corrected p-values for each pairwise comparison, corresponding to the
  70. null hypothesis that the pair of groups has equal medians. note that
  71. these are only meaningful if the global null hypothesis is rejected.
  72. reject: bool array
  73. True for pairs where the null hypothesis can be rejected for the given
  74. alpha
  75. Reference:
  76. ---------------
  77. Gibbons, J. D., & Chakraborti, S. (2011). Nonparametric Statistical
  78. Inference (5th ed., pp. 353-357). Boca Raton, FL: Chapman & Hall.
  79. """
  80. # omnibus test (K-W ANOVA)
  81. # -------------------------------------------------------------------------
  82. groups = [np.array(gg) for gg in groups]
  83. k = len(groups)
  84. n = np.array([len(gg) for gg in groups])
  85. if np.any(n < 5):
  86. warnings.warn("Sample sizes < 5 are not recommended (K-W test assumes "
  87. "a chi square distribution)")
  88. allgroups = np.concatenate(groups)
  89. N = len(allgroups)
  90. ranked = stats.rankdata(allgroups)
  91. # correction factor for ties
  92. T = stats.tiecorrect(ranked)
  93. if T == 0:
  94. raise ValueError('All numbers are identical in kruskal')
  95. # sum of ranks for each group
  96. j = np.insert(np.cumsum(n), 0, 0)
  97. R = np.empty(k, dtype=np.float)
  98. for ii in range(k):
  99. R[ii] = ranked[j[ii]:j[ii + 1]].sum()
  100. # the Kruskal-Wallis H-statistic
  101. H = (12. / (N * (N + 1.))) * ((R ** 2.) / n).sum() - 3 * (N + 1)
  102. # apply correction factor for ties
  103. H /= T
  104. df_omnibus = k - 1
  105. p_omnibus = chisqprob(H, df_omnibus)
  106. # multiple comparisons
  107. # -------------------------------------------------------------------------
  108. # by default we compare every possible pair of groups
  109. if to_compare is None:
  110. to_compare = tuple(combinations(range(k), 2))
  111. ncomp = len(to_compare)
  112. Z_pairs = np.empty(ncomp, dtype=np.float)
  113. p_uncorrected = np.empty(ncomp, dtype=np.float)
  114. Rmean = R / n
  115. for pp, (ii, jj) in enumerate(to_compare):
  116. # standardized score
  117. Zij = (np.abs(Rmean[ii] - Rmean[jj]) /
  118. np.sqrt((1. / 12.) * N * (N + 1) * (1. / n[ii] + 1. / n[jj])))
  119. Z_pairs[pp] = Zij
  120. # corresponding p-values obtained from upper quantiles of the standard
  121. # normal distribution
  122. p_uncorrected = stats.norm.sf(Z_pairs) * 2.
  123. # correction for multiple comparisons
  124. reject, p_corrected, alphac_sidak, alphac_bonf = multipletests(
  125. p_uncorrected, method=method
  126. )
  127. return H, p_omnibus, Z_pairs, p_corrected, reject