locfit.m 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468
  1. function fit=locfit(varargin)
  2. % Smoothing noisy data using Local Regression and Likelihood.
  3. %
  4. % arguments still to add: dc maxit
  5. %
  6. % Usage: fit = locfit(x,y) % local regression fit of x and y.
  7. % fit = locfit(x) % density estimation of x.
  8. %
  9. % Smoothing with locfit is a two-step procedure. The locfit()
  10. % function evaluates the local regression smooth at a set of points
  11. % (can be specified through an evaluation structure). Then, use
  12. % the predict() function to interpolate this fit to other points.
  13. %
  14. % Additional arguments to locfit() are specified as 'name',value pairs, e.g.:
  15. % locfit( x, 'alpha',[0.7,1.5] , 'family','rate' , 'ev','grid' , 'mg',100 );
  16. %
  17. %
  18. % Data-related inputs:
  19. %
  20. % x is a vector or matrix of the independent (or predictor) variables.
  21. % Rows of x represent subjects, columns represent variables.
  22. % Generally, local regression would be used with 1-4 independent
  23. % variables. In higher dimensions, the curse-of-dimensionality,
  24. % as well as the difficulty of visualizing higher dimensional
  25. % surfaces, may limit usefulness.
  26. %
  27. % y is the column vector of the dependent (or response) variable.
  28. % For density families, 'y' is omitted.
  29. % NOTE: x and y are the first two arguments. All other arguments require
  30. % the 'name',value notation.
  31. %
  32. % 'weights' Prior weights for observations (reciprocal of variance, or
  33. % sample size).
  34. % 'cens' Censoring indicators for hazard rate or censored regression.
  35. % The coding is '1' (or 'TRUE') for a censored observation, and
  36. % '0' (or 'FALSE') for uncensored observations.
  37. % 'base' Baseline parameter estimate. If a baseline is provided,
  38. % the local regression model is fitted as
  39. % Y_i = b_i + m(x_i) + epsilon_i,
  40. % with Locfit estimating the m(x) term. For regression models,
  41. % this effectively subtracts b_i from Y_i. The advantage of the
  42. % 'base' formulation is that it extends to likelihood
  43. % regression models.
  44. % 'scale' A scale to apply to each variable. This is especially
  45. % important for multivariate fitting, where variables may be
  46. % measured in non-comparable units. It is also used to specify
  47. % the frequency for variables with the 'a' (angular) style.
  48. % 'sty' Character string (length d) of styles for each predictor variable.
  49. % n denotes `normal'; a denotes angular (or periodic); l and r
  50. % denotes one-sided left and right; c is conditionally parametric.
  51. %
  52. %
  53. % Smoothing Parameters and Bandwidths:
  54. % The bandwidth (or more accurately, half-width) of the smoothing window
  55. % controls the amount of smoothing. Locfit allows specification of constant
  56. % (fixed), nearest neighbor, certain locally adaptive variable bandwidths,
  57. % and combinations of these. Also related to the smoothing parameter
  58. % are the local polynmial degree and weight function.
  59. %
  60. % 'nn' 'Nearest neighbor' smoothing parameter. Specifying 'nn',0.5
  61. % means that the width of each smoothing neighborhood is chosen
  62. % to cover 50% of the data.
  63. %
  64. % 'h' A constant (or fixed) bandwidth parameter. For example, 'h',2
  65. % means that the smoothing windows have constant half-width
  66. % (or radius) 2. Note that h is applied after scaling.
  67. %
  68. % 'pen' penalty parameter for adaptive smoothing. Needs to be used
  69. % with care.
  70. %
  71. % 'alpha' The old way of specifying smoothing parameters, as used in
  72. % my book. alpha is equivalent to the vector [nn,h,pen].
  73. % If multiple componenents are non-zero, the largest corresponding
  74. % bandwidth is used. The default (if none of alpha,nn,h,pen
  75. % are provided) is [0.7 0 0].
  76. %
  77. % 'deg' Degree of local polynomial. Default: 2 (local quadratic).
  78. % Degrees 0 to 3 are supported by almost all parts of the
  79. % Locfit code. Higher degrees may work in some cases.
  80. %
  81. % 'kern' Weight function, default = 'tcub'. Other choices are
  82. % 'rect', 'trwt', 'tria', 'epan', 'bisq' and 'gauss'.
  83. % Choices may be restricted when derivatives are
  84. % required; e.g. for confidence bands and some bandwidth
  85. % selectors.
  86. %
  87. % 'kt' Kernel type, 'sph' (default); 'prod'. In multivariate
  88. % problems, 'prod' uses a simplified product model which
  89. % speeds up computations.
  90. %
  91. % 'acri' Criterion for adaptive bandwidth selection.
  92. %
  93. %
  94. % Derivative Estimation.
  95. % Generally I recommend caution when using derivative estimation
  96. % (and especially higher order derivative estimation) -- can you
  97. % really estimate derivatives from noisy data? Any derivative
  98. % estimate is inherently more dependent on an assumed smoothness
  99. % (expressed through the bandwidth) than the data. Warnings aside...
  100. %
  101. % 'deriv' Derivative estimation. 'deriv',1 specifies the first derivative
  102. % (or more correctly, an estimate of the local slope is returned.
  103. % 'deriv',[1 1] specifies the second derivative. For bivariate fits
  104. % 'deriv',2 specifies the first partial derivative wrt x2.
  105. % 'deriv',[1 2] is mixed second-order derivative.
  106. %
  107. % Fitting family.
  108. % 'family' is used to specify the local likelihood family.
  109. % Regression-type families are 'gaussian', 'binomial',
  110. % 'poisson', 'gamma' and 'geom'. If the family is preceded
  111. % by a q (e.g. 'qgauss', or 'qpois') then quasi-likelihood is
  112. % used; in particular, a dispersion estimate is computed.
  113. % Preceding by an 'r' makes an attempt at robust (outlier-resistant)
  114. % estimation. Combining q and r (e.g. 'family','qrpois') may
  115. % work, if you're lucky.
  116. % Density estimation-type families are 'dens', 'rate' and 'hazard'
  117. % (hazard or failure rate). Note that `dens' scales the output
  118. % to be a statistical density estimate (i.e. scaled to integrate
  119. % to 1). 'rate' estimates the rate or intensity function (events
  120. % per unit time, or events per unit area), which may be called
  121. % density in some fields.
  122. % The default family is 'qgauss' if a response (y argument) has been
  123. % provided, and 'dens' if no response is given.
  124. % 'link' Link function for local likelihood fitting. Depending on the
  125. % family, choices may be 'ident', 'log', 'logit',
  126. % 'inverse', 'sqrt' and 'arcsin'.
  127. %
  128. % Evaluation structures.
  129. % By default, locfit chooses a set of points, depending on the data
  130. % and smoothing parameters, to evaluate at. This is controlled by
  131. % the evaluation structure.
  132. % 'ev' Specify the evaluation structure. Default is 'tree'.
  133. % Other choices include 'phull' (triangulation), 'grid' (a grid
  134. % of points), 'data' (each data point), 'crossval' (data,
  135. % but use leave-one-out cross validation), 'none' (no evaluation
  136. % points, effectively producing the global parametric fit).
  137. % Alternatively, a vector/matrix of evaluation points may be
  138. % provided.
  139. % (kd trees not currently supported in mlocfit)
  140. % 'll' and 'ur' -- row vectors specifying the upper and lower limits
  141. % for the bounding box used by the evaluation structure.
  142. % They default to the data range.
  143. % 'mg' For the 'grid' evaluation structure, 'mg' specifies the
  144. % number of points on each margin. Default 10. Can be either a
  145. % single number or vector.
  146. % 'cut' Refinement parameter for adaptive partitions. Default 0.8;
  147. % smaller values result in more refined partitions.
  148. % 'maxk' Controls space assignment for evaluation structures. For the
  149. % adaptive evaluation structures, it is impossible to be sure
  150. % in advance how many vertices will be generated. If you get
  151. % warnings about `Insufficient vertex space', Locfit's default
  152. % assigment can be increased by increasing 'maxk'. The default
  153. % is 'maxk','100'.
  154. %
  155. % 'xlim' For density estimation, Locfit allows the density to be
  156. % supported on a bounded interval (or rectangle, in more than
  157. % one dimension). The format should be [ll;ul] (ie, matrix with
  158. % two rows, d columns) where ll is the lower left corner of
  159. % the rectangle, and ur is the upper right corner.
  160. % One-sided bounds, such as [0,infty), are not supported, but can be
  161. % effectively specified by specifying a very large upper
  162. % bound.
  163. %
  164. % 'module' either 'name' or {'name','/path/to/module',parameters}.
  165. %
  166. % Density Estimation
  167. % 'renorm',1 will attempt to renormalize the local likelihood
  168. % density estimate so that it integrates to 1. The llde
  169. % (specified by 'family','dens') is scaled to estimate the
  170. % density, but since the estimation is pointwise, there is
  171. % no guarantee that the resulting density integrates exactly
  172. % to 1. Renormalization attempts to achieve this.
  173. %
  174. % The output of locfit() is a Matlab structure:
  175. %
  176. % fit.data.x (n*d)
  177. % fit.data.y (n*1)
  178. % fit.data.weights (n*1 or 1*1)
  179. % fit.data.censor (n*1 or 1*1)
  180. % fit.data.baseline (n*1 or 1*1)
  181. % fit.data.style (string length d)
  182. % fit.data.scales (1*d)
  183. % fit.data.xlim (2*d)
  184. %
  185. % fit.evaluation_structure.type (string)
  186. % fit.evaluation_structure.module.name (string)
  187. % fit.evaluation_structure.module.directory (string)
  188. % fit.evaluation_structure.module.parameters (string)
  189. % fit.evaluation_structure.lower_left (numeric 1*d)
  190. % fit.evaluation_structure.upper_right (numeric 1*d)
  191. % fit.evaluation_structure.grid (numeric 1*d)
  192. % fit.evaluation_structure.cut (numeric 1*d)
  193. % fit.evaluation_structure.maxk
  194. % fit.evaluation_structure.derivative
  195. %
  196. % fit.smoothing_parameters.alpha = (nn h pen) vector
  197. % fit.smoothing_parameters.adaptive_criterion (string)
  198. % fit.smoothing_parameters.degree (numeric)
  199. % fit.smoothing_parameters.family (string)
  200. % fit.smoothing_parameters.link (string)
  201. % fit.smoothing_parameters.kernel (string)
  202. % fit.smoothing_parameters.kernel_type (string)
  203. % fit.smoothing_parameters.deren
  204. % fit.smoothing_parameters.deit
  205. % fit.smoothing_parameters.demint
  206. % fit.smoothing_parameters.debug
  207. %
  208. % fit.fit_points.evaluation_points (d*nv matrix)
  209. % fit.fit_points.fitted_values (matrix, nv rows, many columns)
  210. % fit.fit_points.evaluation_vectors.cell
  211. % fit.fit_points.evaluation_vectors.splitvar
  212. % fit.fit_points.evaluation_vectors.lo
  213. % fit.fit_points.evaluation_vectors.hi
  214. % fit.fit_points.fit_limits (d*2 matrix)
  215. % fit.fit_points.family_link (numeric values)
  216. % fit.fit_points.kappa (likelihood, degrees of freedom, etc)
  217. %
  218. % fit.parametric_component
  219. %
  220. %
  221. % The OLD format:
  222. %
  223. % fit{1} = data.
  224. % fit{2} = evaluation structure.
  225. % fit{3} = smoothing parameter structure.
  226. % fit{4}{1} = fit points matrix.
  227. % fit{4}{2} = matrix of fitted values etc.
  228. % Note that these are not back-transformed, and may have the
  229. % parametric component removed.
  230. % (exact content varies according to module).
  231. % fit{4}{3} = various details of the evaluation points.
  232. % fit{4}{4} = fit limits.
  233. % fit{4}{5} = family,link.
  234. % fit{5} = parametric component values.
  235. %
  236. % Minimal input validation
  237. if nargin < 1
  238. error( 'At least one input argument required' );
  239. end
  240. xdata = double(varargin{1});
  241. d = size(xdata,2);
  242. n = size(xdata,1);
  243. if ((nargin>1) && (~ischar(varargin{2})))
  244. ydata = double(varargin{2});
  245. if (any(size(ydata) ~= [n 1])); error('y must be n*1 column vector'); end;
  246. family = 'qgauss';
  247. na = 3;
  248. else
  249. ydata = 0;
  250. family = 'density';
  251. na = 2;
  252. end;
  253. if mod(nargin-na,2)==0
  254. error( 'All arguments other than x, y must be name,value pairs' );
  255. end
  256. wdata = ones(n,1);
  257. cdata = 0;
  258. base = 0;
  259. style = 'n';
  260. scale = 1;
  261. xl = zeros(2,d);
  262. alpha = [0 0 0];
  263. deg = 2;
  264. link = 'default';
  265. acri = 'none';
  266. kern = 'tcub';
  267. kt = 'sph';
  268. deren = 0;
  269. deit = 'default';
  270. demint= 20;
  271. debug = 0;
  272. ev = 'tree';
  273. ll = zeros(1,d);
  274. ur = zeros(1,d);
  275. mg = 10;
  276. maxk = 100;
  277. deriv=0;
  278. cut = 0.8;
  279. mdl = struct('name','std', 'directory','', 'parameters',0 );
  280. while na < length(varargin)
  281. inc = 0;
  282. if (varargin{na}=='y')
  283. ydata = double(varargin{na+1});
  284. family = 'qgauss';
  285. inc = 2;
  286. if (any(size(ydata) ~= [n 1])); error('y must be n*1 column vector'); end;
  287. end
  288. if (strcmp(varargin{na},'weights'))
  289. wdata = double(varargin{na+1});
  290. inc = 2;
  291. if (any(size(wdata) ~= [n 1])); error('weights must be n*1 column vector'); end;
  292. end
  293. if (strcmp(varargin{na},'cens'))
  294. cdata = double(varargin{na+1});
  295. inc = 2;
  296. if (any(size(cdata) ~= [n 1])); error('cens must be n*1 column vector'); end;
  297. end
  298. if (strcmp(varargin{na},'base')) % numeric vector, n*1 or 1*1.
  299. base = double(varargin{na+1});
  300. if (length(base)==1); base = base*ones(n,1); end;
  301. inc = 2;
  302. end
  303. if (strcmp(varargin{na},'style')) % character string of length d.
  304. style = varargin{na+1};
  305. inc = 2;
  306. end;
  307. if (strcmp(varargin{na},'scale')) % row vector, length 1 or d.
  308. scale = varargin{na+1};
  309. if (scale==0)
  310. scale = zeros(1,d);
  311. for i=1:d
  312. scale(i) = sqrt(var(xdata(:,i)));
  313. end;
  314. end;
  315. inc = 2;
  316. end;
  317. if (strcmp(varargin{na},'xlim')) % 2*d numeric matrix.
  318. xl = varargin{na+1};
  319. inc = 2;
  320. end
  321. if (strcmp(varargin{na},'alpha')) % row vector of length 1, 2 or 3.
  322. alpha = [varargin{na+1} 0 0 0];
  323. alpha = alpha(1:3);
  324. inc = 2;
  325. end
  326. if (strcmp(varargin{na},'nn')) % scalar
  327. alpha(1) = varargin{na+1};
  328. inc = 2;
  329. end
  330. if (strcmp(varargin{na},'h')) % scalar
  331. alpha(2) = varargin{na+1};
  332. inc = 2;
  333. end;
  334. if (strcmp(varargin{na},'pen')) % scalar
  335. alpha(3) = varargin{na+1};
  336. inc = 2;
  337. end;
  338. if (strcmp(varargin{na},'acri')) % string
  339. acri = varargin{na+1};
  340. inc = 2;
  341. end
  342. if (strcmp(varargin{na},'deg')) % positive integer.
  343. deg = varargin{na+1};
  344. inc = 2;
  345. end;
  346. if (strcmp(varargin{na},'family')) % character string.
  347. family = varargin{na+1};
  348. inc = 2;
  349. end;
  350. if (strcmp(varargin{na},'link')) % character string.
  351. link = varargin{na+1};
  352. inc = 2;
  353. end;
  354. if (strcmp(varargin{na},'kern')) % character string.
  355. kern = varargin{na+1};
  356. inc = 2;
  357. end;
  358. if (strcmp(varargin{na},'kt')) % character string.
  359. kt = varargin{na+1};
  360. inc = 2;
  361. end;
  362. if (strcmp(varargin{na},'ev')) % char. string, or matrix with d columns.
  363. ev = varargin{na+1};
  364. if (isnumeric(ev)); ev = ev'; end;
  365. inc = 2;
  366. end;
  367. if (strcmp(varargin{na},'ll')) % row vector of length d.
  368. ll = varargin{na+1};
  369. inc = 2;
  370. end;
  371. if (strcmp(varargin{na},'ur')) % row vector of length d.
  372. ur = varargin{na+1};
  373. inc = 2;
  374. end;
  375. if (strcmp(varargin{na},'mg')) % row vector of length d.
  376. mg = varargin{na+1};
  377. inc = 2;
  378. end;
  379. if (strcmp(varargin{na},'cut')) % positive scalar.
  380. cut = varargin{na+1};
  381. inc = 2;
  382. end;
  383. if (strcmp(varargin{na},'module')) % string.
  384. mdl = struct('name',varargin{na+1}, 'directory','', 'parameters',0 );
  385. inc = 2;
  386. end;
  387. if (strcmp(varargin{na},'maxk')) % positive integer.
  388. maxk = varargin{na+1};
  389. inc = 2;
  390. end;
  391. if (strcmp(varargin{na},'deriv')) % numeric row vector, up to deg elements.
  392. deriv = varargin{na+1};
  393. inc = 2;
  394. end;
  395. if (strcmp(varargin{na},'renorm')) % density renormalization.
  396. deren = varargin{na+1};
  397. inc = 2;
  398. end;
  399. if (strcmp(varargin{na},'itype')) % density - integration type.
  400. deit = varargin{na+1};
  401. inc = 2;
  402. end;
  403. if (strcmp(varargin{na},'mint')) % density - # of integration points.
  404. demint = varargin{na+1};
  405. inc = 2;
  406. end;
  407. if (strcmp(varargin{na},'debug')) % debug level.
  408. debug = varargin{na+1};
  409. inc = 2;
  410. end;
  411. if (inc==0)
  412. disp(varargin{na});
  413. error('Unknown Input Argument.');
  414. end;
  415. na=na+inc;
  416. end
  417. fit.data.x = xdata;
  418. fit.data.y = ydata;
  419. fit.data.weights = wdata;
  420. fit.data.censor = cdata;
  421. fit.data.baseline = base;
  422. fit.data.style = style;
  423. fit.data.scales = scale;
  424. fit.data.xlim = xl;
  425. fit.evaluation_structure.type = ev;
  426. fit.evaluation_structure.module = mdl;
  427. fit.evaluation_structure.lower_left = ll;
  428. fit.evaluation_structure.upper_right = ur;
  429. fit.evaluation_structure.grid = mg;
  430. fit.evaluation_structure.cut = cut;
  431. fit.evaluation_structure.maxk = maxk;
  432. fit.evaluation_structure.derivative = deriv;
  433. if (alpha==0); alpha = [0.7 0 0]; end;
  434. fit.smoothing_parameters.alpha = alpha;
  435. fit.smoothing_parameters.adaptive_criterion = acri;
  436. fit.smoothing_parameters.degree = deg;
  437. fit.smoothing_parameters.family = family;
  438. fit.smoothing_parameters.link = link;
  439. fit.smoothing_parameters.kernel = kern;
  440. fit.smoothing_parameters.kernel_type = kt;
  441. fit.smoothing_parameters.deren = deren;
  442. fit.smoothing_parameters.deit = deit;
  443. fit.smoothing_parameters.demint = demint;
  444. fit.smoothing_parameters.debug = debug;
  445. [fpc pcomp] = mexlf(fit.data,fit.evaluation_structure,fit.smoothing_parameters);
  446. fit.fit_points = fpc;
  447. fit.parametric_component = pcomp;
  448. return