kde.m 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131
  1. function [bandwidth,density,xmesh,cdf]=kde(data,n,MIN,MAX)
  2. % Reliable and extremely fast kernel density estimator for one-dimensional data;
  3. % Gaussian kernel is assumed and the bandwidth is chosen automatically;
  4. % Unlike many other implementations, this one is immune to problems
  5. % caused by multimodal densities with widely separated modes (see example). The
  6. % estimation does not deteriorate for multimodal densities, because we never assume
  7. % a parametric model for the data.
  8. % INPUTS:
  9. % data - a vector of data from which the density estimate is constructed;
  10. % n - the number of mesh points used in the uniform discretization of the
  11. % interval [MIN, MAX]; n has to be a power of two; if n is not a power of two, then
  12. % n is rounded up to the next power of two, i.e., n is set to n=2^ceil(log2(n));
  13. % the default value of n is n=2^12;
  14. % MIN, MAX - defines the interval [MIN,MAX] on which the density estimate is constructed;
  15. % the default values of MIN and MAX are:
  16. % MIN=min(data)-Range/10 and MAX=max(data)+Range/10, where Range=max(data)-min(data);
  17. % OUTPUTS:
  18. % bandwidth - the optimal bandwidth (Gaussian kernel assumed);
  19. % density - column vector of length 'n' with the values of the density
  20. % estimate at the grid points;
  21. % xmesh - the grid over which the density estimate is computed;
  22. % - If no output is requested, then the code automatically plots a graph of
  23. % the density estimate.
  24. % cdf - column vector of length 'n' with the values of the cdf
  25. % Reference:
  26. % Kernel density estimation via diffusion
  27. % Z. I. Botev, J. F. Grotowski, and D. P. Kroese (2010)
  28. % Annals of Statistics, Volume 38, Number 5, pages 2916-2957.
  29. %
  30. % Example:
  31. % data=[randn(100,1);randn(100,1)*2+35 ;randn(100,1)+55];
  32. % kde(data,2^14,min(data)-5,max(data)+5);
  33. data=data(:); %make data a column vector
  34. if nargin<2 % if n is not supplied switch to the default
  35. n=2^14;
  36. end
  37. n=2^ceil(log2(n)); % round up n to the next power of 2;
  38. if nargin<4 %define the default interval [MIN,MAX]
  39. minimum=min(data); maximum=max(data);
  40. Range=maximum-minimum;
  41. MIN=minimum-Range/2; MAX=maximum+Range/2;
  42. end
  43. % set up the grid over which the density estimate is computed;
  44. R=MAX-MIN; dx=R/(n-1); xmesh=MIN+[0:dx:R]; N=length(unique(data));
  45. %bin the data uniformly using the grid defined above;
  46. initial_data=histc(data,xmesh)/N; initial_data=initial_data/sum(initial_data);
  47. a=dct1d(initial_data); % discrete cosine transform of initial data
  48. % now compute the optimal bandwidth^2 using the referenced method
  49. I=[1:n-1]'.^2; a2=(a(2:end)/2).^2;
  50. % use fzero to solve the equation t=zeta*gamma^[5](t)
  51. t_star=root(@(t)fixed_point(t,N,I,a2),N);
  52. % smooth the discrete cosine transform of initial data using t_star
  53. a_t=a.*exp(-[0:n-1]'.^2*pi^2*t_star/2);
  54. % now apply the inverse discrete cosine transform
  55. if (nargout>1)|(nargout==0)
  56. density=idct1d(a_t)/R;
  57. end
  58. % take the rescaling of the data into account
  59. bandwidth=sqrt(t_star)*R;
  60. density(density<0)=eps; % remove negatives due to round-off error
  61. if nargout==0
  62. figure(1), plot(xmesh,density)
  63. end
  64. % for cdf estimation
  65. if nargout>3
  66. f=2*pi^2*sum(I.*a2.*exp(-I*pi^2*t_star));
  67. t_cdf=(sqrt(pi)*f*N)^(-2/3);
  68. % now get values of cdf on grid points using IDCT and cumsum function
  69. a_cdf=a.*exp(-[0:n-1]'.^2*pi^2*t_cdf/2);
  70. cdf=cumsum(idct1d(a_cdf))*(dx/R);
  71. % take the rescaling into account if the bandwidth value is required
  72. bandwidth_cdf=sqrt(t_cdf)*R;
  73. end
  74. end
  75. %################################################################
  76. function out=fixed_point(t,N,I,a2)
  77. % this implements the function t-zeta*gamma^[l](t)
  78. l=7;
  79. f=2*pi^(2*l)*sum(I.^l.*a2.*exp(-I*pi^2*t));
  80. for s=l-1:-1:2
  81. K0=prod([1:2:2*s-1])/sqrt(2*pi); const=(1+(1/2)^(s+1/2))/3;
  82. time=(2*const*K0/N/f)^(2/(3+2*s));
  83. f=2*pi^(2*s)*sum(I.^s.*a2.*exp(-I*pi^2*time));
  84. end
  85. out=t-(2*N*sqrt(pi)*f)^(-2/5);
  86. end
  87. %##############################################################
  88. function out = idct1d(data)
  89. % computes the inverse discrete cosine transform
  90. [nrows,ncols]=size(data);
  91. % Compute weights
  92. weights = nrows*exp(i*(0:nrows-1)*pi/(2*nrows)).';
  93. % Compute x tilde using equation (5.93) in Jain
  94. data = real(ifft(weights.*data));
  95. % Re-order elements of each column according to equations (5.93) and
  96. % (5.94) in Jain
  97. out = zeros(nrows,1);
  98. out(1:2:nrows) = data(1:nrows/2);
  99. out(2:2:nrows) = data(nrows:-1:nrows/2+1);
  100. % Reference:
  101. % A. K. Jain, "Fundamentals of Digital Image
  102. % Processing", pp. 150-153.
  103. end
  104. %##############################################################
  105. function data=dct1d(data)
  106. % computes the discrete cosine transform of the column vector data
  107. [nrows,ncols]= size(data);
  108. % Compute weights to multiply DFT coefficients
  109. weight = [1;2*(exp(-i*(1:nrows-1)*pi/(2*nrows))).'];
  110. % Re-order the elements of the columns of x
  111. data = [ data(1:2:end,:); data(end:-2:2,:) ];
  112. % Multiply FFT by weights:
  113. data= real(weight.* fft(data));
  114. end
  115. function t=root(f,N)
  116. % try to find smallest root whenever there is more than one
  117. N=50*(N<=50)+1050*(N>=1050)+N*((N<1050)&(N>50));
  118. tol=10^-12+0.01*(N-50)/1000;
  119. flag=0;
  120. while flag==0
  121. try
  122. t=fzero(f,[0,tol]);
  123. flag=1;
  124. catch
  125. tol=min(tol*2,.1); % double search interval
  126. end
  127. if tol==.1 % if all else fails
  128. t=fminbnd(@(x)abs(f(x)),0,.1); flag=1;
  129. end
  130. end
  131. end