import numpy as np
import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import pdist


def cluster_corr(corr_array, threshold=None, inplace=False):
    """
    Rearrange a correlation matrix so that groups of highly correlated
    variables end up next to each other.

    The rows of ``corr_array`` are treated as observations: pairwise
    distances between rows are computed, clustered hierarchically with
    complete linkage, and cut into flat clusters at a distance threshold.
    Rows and columns are then reordered by cluster label.

    Parameters
    ----------
    corr_array : pandas.DataFrame or numpy.ndarray
        a NxN correlation matrix
    threshold : float, optional
        Distance at which the hierarchy is cut into flat clusters
        (``criterion='distance'``). When ``None`` (default), half of the
        largest pairwise distance is used.
    inplace : bool, optional
        Kept for backward compatibility. The input is never modified:
        reordering by integer index always produces a new object, so this
        flag has no observable effect.

    Returns
    -------
    corr_array : a NxN correlation matrix with the columns and rows
        rearranged (same container kind as the input: DataFrame in,
        DataFrame out, with index/column labels carried along)
    linkage : linkage of distances
    labels : cluster labels, one per original row
    idx : sorted indices for original labels (``np.argsort(labels)``)
    """
    # Use the public pdist entry point on a plain array view of the input
    # so DataFrames and ndarrays go through the same code path.
    pairwise_distances = pdist(np.asarray(corr_array))
    linkage = sch.linkage(pairwise_distances, method='complete')

    if threshold is None:
        cluster_distance_threshold = pairwise_distances.max() / 2
    else:
        cluster_distance_threshold = threshold

    labels = sch.fcluster(
        linkage, cluster_distance_threshold, criterion='distance'
    )
    idx = np.argsort(labels)

    # DataFrames do not support ``obj[idx, :]``; they need positional
    # indexing through ``.iloc``. Duck-type on ``.iloc`` so pandas does not
    # have to be imported here.
    if hasattr(corr_array, "iloc"):
        reordered = corr_array.iloc[idx, idx]
    else:
        # Integer-array indexing returns a fresh array, so no explicit
        # copy is required to keep the caller's data untouched.
        reordered = np.asanyarray(corr_array)[idx, :][:, idx]

    return reordered, linkage, labels, idx