dataGen.m 2.7 KB

1
  %%% data generator
% Builds a synthetic data set of isotropic Gaussian clusters, plots a few
% diagnostics, and saves the points and labels to a .mat file.
%
% Variables written to the .mat file (everything else is cleared first):
%   allPts            - (numClusters*ptsPerCluster) x numDims, shuffled points
%   allPts_dm         - same rows as allPts, with per-column means removed
%   allPts_unshuff    - points grouped by cluster (cluster 1 first, ...)
%   allPts_dm_unshuff - de-meaned points grouped by cluster
%   clustIDs          - 1 x N row vector, cluster label of each row of allPts
%   clustIDs_unshuff  - 1 x N row vector, labels for the unshuffled rows
%   savename          - name of the .mat file that was written
clear
close all

%% Configs
numDims = 100;
numClusters = 10;
stdClust = 0.5;       % std within cluster - not so bad at 1
clusterSep = 25;      % cluster centers are drawn uniformly from [0, clusterSep] per dim
ptsPerCluster = 250;

%% Generate the clusters
numPts = numClusters * ptsPerCluster;
stds = stdClust .* ones(numClusters, 1);
clustMeans = clusterSep*rand(numClusters, numDims); % centers are strictly positive

% Preallocate instead of growing the arrays inside the loop.
allPts_unshuff = zeros(numPts, numDims);
clustIDs_unshuff = zeros(1, numPts);
for clustIdx = 1:numClusters
    rows = (clustIdx-1)*ptsPerCluster + (1:ptsPerCluster);
    % randn fills column by column, so one ptsPerCluster x numDims draw
    % consumes the random stream in the same order as drawing one
    % dimension (column) at a time.
    noise = stds(clustIdx) .* randn(ptsPerCluster, numDims);
    allPts_unshuff(rows, :) = bsxfun(@plus, clustMeans(clustIdx, :), noise);
    clustIDs_unshuff(rows) = clustIdx;
end

%% Remove the per-dimension mean of the whole data set
allPts_dm_unshuff = bsxfun(@minus, allPts_unshuff, mean(allPts_unshuff, 1));

%% Unit-normalized cluster means of the de-meaned data
% calc clust means for use in dot product calculation
clustMeans = zeros(numClusters, numDims);
for clustIdx = 1:numClusters
    rows = (clustIdx-1)*ptsPerCluster + (1:ptsPerCluster);
    thisMean = mean(allPts_dm_unshuff(rows, :), 1);
    clustMeans(clustIdx, :) = thisMean ./ norm(thisMean); % normalized
end

%% Shuffle points and labels with one common permutation
shuffIdx = randperm(numPts);
allPts = allPts_unshuff(shuffIdx, :);
allPts_dm = allPts_dm_unshuff(shuffIdx, :);
clustIDs = clustIDs_unshuff(shuffIdx);

%% Cluster-to-cluster diagnostics (on the unit-normalized means)
ccDot = abs(clustMeans * clustMeans');
ccDist = zeros(numClusters);
for clustIdx = 1:numClusters
    delta = bsxfun(@minus, clustMeans, clustMeans(clustIdx, :));
    ccDist(:, clustIdx) = sqrt(sum(delta.^2, 2));
end

%% Point-to-point Euclidean distances
% Computed on the UNSHUFFLED points, so rows/columns are grouped by cluster.
% One vectorized pass per column replaces a scalar norm() call per pair.
ptDist = zeros(numPts);
for ptIdx = 1:numPts
    delta = bsxfun(@minus, allPts_unshuff, allPts_unshuff(ptIdx, :));
    ptDist(:, ptIdx) = sqrt(sum(delta.^2, 2));
end

%% Plots
if numDims == 2
    % Scatter of the de-meaned points, with the origin marked.
    figure(10);
    plot(allPts_dm(:,1)', allPts_dm(:,2)', '.')
    grid on;
    hold on;
    plot(0, 0, 'r*')
    hold off;
end

figure(1); % useful bc weight vectors should be as separate as possible
imagesc(ccDot);
colorbar
title('Dot Products of Cluster Means')
xlabel('Cluster')
ylabel('Cluster')

figure(2) % distances between the unit-normalized cluster means
imagesc(ccDist);
colorbar
title('Average Distance between Clusters')
xlabel('Cluster')
ylabel('Cluster')

figure(3)
imagesc(ptDist);
colorbar
title('Pointwise Distances')

%% Save
savename = ['genData' num2str(numDims) 'D_' num2str(numClusters) ...
    'c_' num2str(ptsPerCluster) 'ppc.mat'];
% Keep only the data set itself (plus the file name) in the saved workspace.
clearvars -except allPts allPts_dm allPts_unshuff allPts_dm_unshuff clustIDs clustIDs_unshuff savename
save(savename)