doxysearch.php 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329
  1. <?php
  2. /******************************************************************************
  3. *
  4. * $Id:$
  5. *
  6. * Copyright (C) 1997-2003 by Dimitri van Heesch.
  7. *
  8. * Permission to use, copy, modify, and distribute this software and its
  9. * documentation under the terms of the GNU General Public License is hereby
  10. * granted. No representations are made about the suitability of this software
  11. * for any purpose. It is provided "as is" without express or implied warranty.
  12. * See the GNU General Public License for more details.
  13. *
  14. */
  /**
   * Reads a 4-byte big-endian integer from the current position of $file.
   *
   * Bytes that cannot be read (end of file or read error) count as zero,
   * so reading at end of file yields 0.
   *
   * @param resource $file open, readable stream
   * @return int the decoded value
   */
  function readInt($file)
  {
    $bytes = fread($file,4);
    if ($bytes===false) $bytes = "";
    // pad a short read with NUL bytes so the missing bytes decode as zero
    $bytes = str_pad($bytes,4,"\0");
    return (ord($bytes[0])<<24)|(ord($bytes[1])<<16)|
           (ord($bytes[2])<<8)|ord($bytes[3]);
  }
  /**
   * Reads characters from $file up to, and consuming, the next NUL byte.
   *
   * Reading also stops when fgetc() reports end of file.
   *
   * @param resource $file open, readable stream
   * @return string the characters read, without the terminating NUL
   */
  function readString($file)
  {
    $text = "";
    for (;;)
    {
      $ch = fgetc($file);
      // a NUL byte terminates the string; false means nothing is left to read
      if ($ch===false || $ch==="\0") break;
      $text .= $ch;
    }
    return $text;
  }
  /**
   * Reads the next four characters from $file and returns them as one string.
   *
   * main() compares the result against the "DOXS" signature.
   *
   * @param resource $file open, readable stream
   * @return string the characters read
   */
  function readHeader($file)
  {
    $signature = "";
    for ($n=0;$n<4;$n++)
    {
      $signature .= fgetc($file);
    }
    return $signature;
  }
  /**
   * Computes the lookup-table slot for a word from its first two characters.
   *
   * @param string $word the search word
   * @return int first byte * 256 + second byte, or -1 when the word is
   *             shorter than two characters or either byte is NUL
   */
  function computeIndex($word)
  {
    if (strlen($word)<2) return -1;
    // high char of the index
    $hi = ord($word[0]);
    if ($hi==0) return -1;
    // low char of the index
    $lo = ord($word[1]);
    if ($lo==0) return -1;
    // return index
    return $hi*256+$lo;
  }
  /**
   * Looks up $word in the index file and appends one entry to $statsList for
   * every indexed word that starts with $word.
   *
   * The first two characters of $word select a 4-byte slot in the table that
   * follows the 4-byte file header; the slot holds the file offset of a list
   * of NUL-terminated words, each followed by a 4-byte offset to its
   * statistics. The list ends at an empty word.
   *
   * Each appended entry has the keys "word" (the query word), "match" (the
   * indexed word), "index" (offset of the statistics), "full" (true when the
   * indexed word equals the query word) and "docs" (list of arrays with the
   * keys "idx", "freq", "rank", "name" and "url").
   *
   * @param resource $file      open index file
   * @param string   $word      the word to search for
   * @param array    $statsList result list, extended in place
   * @return array the updated $statsList
   */
  function search($file,$word,&$statsList)
  {
    $index = computeIndex($word);
    if ($index==-1) return $statsList; // word cannot be looked up

    fseek($file,$index*4+4); // 4 bytes per entry, skip header
    $listOffset = readInt($file);
    if (!$listOffset) return $statsList; // no words with these two characters

    $start = sizeof($statsList);
    $count = $start;
    $wordLen = strlen($word);
    fseek($file,$listOffset);
    // compare against "" rather than testing truthiness, so that a word
    // such as "0" does not end the list early
    for ($w=readString($file);$w!=="";$w=readString($file))
    {
      $statIdx = readInt($file);
      if ($word==substr($w,0,$wordLen))
      { // found word that matches (as prefix)
        $statsList[$count++]=array(
          "word"=>$word,
          "match"=>$w,
          "index"=>$statIdx,
          "full"=>strlen($w)==$wordLen,
          "docs"=>array()
        );
      }
    }

    // every match, full or partial, contributes its frequency once
    $totalFreq=0;
    for ($count=$start;$count<sizeof($statsList);$count++)
    {
      fseek($file,$statsList[$count]["index"]);
      $numDocs = readInt($file);
      $docInfo = array();
      // read docs info + occurrence frequency of the word
      for ($i=0;$i<$numDocs;$i++)
      {
        $idx=readInt($file);
        $freq=readInt($file);
        $docInfo[$i]=array("idx"=>$idx,"freq"=>$freq,"rank"=>0.0);
        $totalFreq+=$freq;
      }
      // read name and url info for the doc
      for ($i=0;$i<$numDocs;$i++)
      {
        fseek($file,$docInfo[$i]["idx"]);
        $docInfo[$i]["name"]=readString($file);
        $docInfo[$i]["url"]=readString($file);
      }
      $statsList[$count]["docs"]=$docInfo;
    }

    // compute frequency rank of the word in each doc
    for ($count=$start;$count<sizeof($statsList);$count++)
    {
      foreach ($statsList[$count]["docs"] as $i=>$doc)
      {
        // a total of zero would otherwise divide by zero
        $statsList[$count]["docs"][$i]["rank"]=
          ($totalFreq>0) ? (float)$doc["freq"]/$totalFreq : 0.0;
      }
    }
    return $statsList;
  }
  /**
   * Merges per-word search results into one entry per document.
   *
   * Documents are keyed by their url. The first time a document is seen it
   * takes the rank of that word; every further word found in the same
   * document adds its rank and then doubles the sum. Each entry also collects
   * a "words" list with the keys "word", "match" and "freq".
   *
   * @param array $results list of word entries as built by search()
   * @param array $docs    map of url => document entry, extended in place
   * @return array the updated $docs
   */
  function combine_results($results,&$docs)
  {
    foreach ($results as $wordInfo)
    {
      foreach ($wordInfo["docs"] as $di)
      {
        $key=$di["url"];
        $rank=$di["rank"];
        // direct key lookup: exact match and constant time, unlike a loose
        // in_array() scan over all keys
        if (isset($docs[$key]))
        {
          $docs[$key]["rank"]+=$rank;
          $docs[$key]["rank"]*=2; // multiple matches increases rank
        }
        else
        {
          $docs[$key] = array("url"=>$key,
                              "name"=>$di["name"],
                              "rank"=>$rank
                             );
        }
        $docs[$key]["words"][] = array(
          "word"=>$wordInfo["word"],
          "match"=>$wordInfo["match"],
          "freq"=>$di["freq"]
        );
      }
    }
    return $docs;
  }
  /**
   * Rescales the "rank" of every document so that the highest rank is 100.
   *
   * The maximum starts at a small positive value, so an empty list or a list
   * with only zero ranks never divides by zero.
   *
   * @param array $docs map of document entries, modified in place
   * @return void
   */
  function normalize_ranking(&$docs)
  {
    $maxRank = 0.0000001;
    // compute maximal rank
    foreach ($docs as $doc)
    {
      if ($doc["rank"]>$maxRank)
      {
        $maxRank=$doc["rank"];
      }
    }
    // normalize rankings
    foreach (array_keys($docs) as $key)
    {
      $docs[$key]["rank"]*=100/$maxRank;
    }
  }
  /**
   * Returns the documents that contain every required word and none of the
   * forbidden words.
   *
   * A document contains a word when one of its "words" entries has that word
   * as its "word" value. Keys of $docs are preserved in the result.
   *
   * @param array $docs           map of document entries
   * @param array $requiredWords  words every returned document must contain
   * @param array $forbiddenWords words no returned document may contain
   * @return array the filtered map
   */
  function filter_results($docs,&$requiredWords,&$forbiddenWords)
  {
    $filteredDocs=array();
    foreach ($docs as $key=>$doc)
    {
      $words = $doc["words"];
      $copy=true; // copy entry by default
      foreach ($requiredWords as $reqWord)
      {
        $found=false;
        foreach ($words as $wordInfo)
        {
          if ($wordInfo["word"]===$reqWord)
          {
            $found=true;
            break;
          }
        }
        if (!$found)
        {
          $copy=false; // document lacks one of the required words
          break;
        }
      }
      if ($copy && sizeof($forbiddenWords)>0)
      {
        foreach ($words as $wordInfo)
        {
          if (in_array($wordInfo["word"],$forbiddenWords,true))
          {
            $copy=false; // document contains a forbidden word
            break;
          }
        }
      }
      if ($copy) $filteredDocs[$key]=$doc;
    }
    return $filteredDocs;
  }
  /**
   * Comparison callback that orders documents by descending "rank".
   *
   * @param array $a document entry
   * @param array $b document entry
   * @return int -1 when $a ranks higher, 1 when $b ranks higher, 0 when equal
   */
  function compare_rank($a,$b)
  {
    // equal ranks must compare as 0, otherwise the ordering is inconsistent
    if ($a["rank"]==$b["rank"]) return 0;
    return ($a["rank"]>$b["rank"]) ? -1 : 1;
  }
  /**
   * Sorts the documents with compare_rank() and stores the re-indexed list
   * in $sorted.
   *
   * @param array $docs   document entries to sort
   * @param array $sorted receives the sorted list
   * @return array the sorted list
   */
  function sort_results($docs,&$sorted)
  {
    $ordered = $docs;
    usort($ordered,"compare_rank");
    $sorted = $ordered;
    return $sorted;
  }
  /**
   * Prints the search results as an HTML table.
   *
   * Each document produces two rows: a numbered link to the document and a
   * line listing the matched words with their frequencies. The part of a
   * match that equals the query word is printed in bold.
   *
   * @param array $docs list of document entries with the keys "url", "name"
   *                    and "words"
   * @return void
   */
  function report_results(&$docs)
  {
    echo "<table cellspacing=\"2\">\n";
    echo " <tr>\n";
    echo " <td colspan=\"2\"><h2>Search Results</h2></td>\n";
    echo " </tr>\n";
    $numDocs = sizeof($docs);
    if ($numDocs==0)
    {
      echo " <tr>\n";
      echo " <td colspan=\"2\">".matches_text(0)."</td>\n";
      echo " </tr>\n";
    }
    else
    {
      echo " <tr>\n";
      echo " <td colspan=\"2\">".matches_text($numDocs);
      echo "\n";
      echo " </td>\n";
      echo " </tr>\n";
      $num=1;
      foreach ($docs as $doc)
      {
        // Escape everything written into the page: the words come from the
        // query string. A single-byte charset keeps the call byte-safe for
        // any index encoding, and double_encode=false leaves entities that
        // are already present alone.
        // NOTE(review): assumes names and urls in the index are plain text,
        // not intentional HTML markup - confirm against the index writer.
        $url  = htmlspecialchars($doc["url"],ENT_QUOTES,'ISO-8859-1',false);
        $name = htmlspecialchars($doc["name"],ENT_QUOTES,'ISO-8859-1',false);
        echo " <tr>\n";
        echo " <td align=\"right\">$num.</td>";
        echo "<td><a class=\"el\" href=\"".$url."\">".$name."</a></td>\n";
        echo " </tr>\n"; // close the link row before opening the matches row
        echo " <tr>\n";
        echo " <td></td><td class=\"tiny\">Matches: ";
        foreach ($doc["words"] as $wordInfo)
        {
          $word = $wordInfo["word"];
          $matchRight = substr($wordInfo["match"],strlen($word));
          $word = htmlspecialchars($word,ENT_QUOTES,'ISO-8859-1',false);
          $matchRight = htmlspecialchars((string)$matchRight,ENT_QUOTES,'ISO-8859-1',false);
          echo "<b>$word</b>$matchRight(".$wordInfo["freq"].") ";
        }
        echo " </td>\n";
        echo " </tr>\n";
        $num++;
      }
    }
    echo "</table>\n";
  }
  /**
   * Returns the summary sentence for a given number of matching documents.
   *
   * @param int $num number of documents found
   * @return string the message for zero, one or several documents
   */
  function matches_text($num)
  {
    switch ($num)
    {
      case 0:
        return 'Sorry, no documents matching your query.';
      case 1:
        return 'Found 1 document matching your query.';
      default: // any other count
        return 'Found '.$num.' documents matching your query. Showing best matches first.';
    }
  }
  /**
   * Runs a search against the index file and prints the results as HTML.
   *
   * The query is taken from the "query" GET parameter and split on spaces.
   * A word prefixed with '+' is required, a word prefixed with '-' is
   * forbidden. Every distinct word is searched once; the results are
   * combined per document, filtered, normalized to the range [0-100],
   * sorted by rank and reported.
   *
   * Terminates the script with an error message when the PHP version is too
   * old, the index file cannot be opened or its header is not "DOXS".
   *
   * @param string $idxfile path of the search index file
   * @return void
   */
  function main($idxfile)
  {
    // compare versions numerically; a plain string comparison misorders
    // versions such as 10.0.0
    if (version_compare(phpversion(),'4.1.0','<'))
    {
      die("Error: PHP version 4.1.0 or above required!");
    }
    if (!($file=fopen($idxfile,"rb")))
    {
      die("Error: Search index file could NOT be opened!");
    }
    if (readHeader($file)!="DOXS")
    {
      fclose($file);
      die("Error: Header of index file is invalid!");
    }
    $query="";
    // ignore non-string values such as query[]=x
    if (array_key_exists("query", $_GET) && is_string($_GET["query"]))
    {
      $query=$_GET["query"];
    }
    $results = array();
    $requiredWords = array();
    $forbiddenWords = array();
    $foundWords = array();
    // for each word in the search query; strtok() returns false when no
    // tokens are left, so a word such as "0" does not end the loop
    for ($token=strtok($query," ");$token!==false;$token=strtok(" "))
    {
      $word=strtolower($token);
      if ($word!=="" && $word[0]=='+')
      {
        $word=(string)substr($word,1);
        if ($word!=="") $requiredWords[]=$word;
      }
      if ($word!=="" && $word[0]=='-')
      {
        $word=(string)substr($word,1);
        if ($word!=="") $forbiddenWords[]=$word;
      }
      if ($word==="") continue; // a bare "+" or "-" carries no word
      if (!in_array($word,$foundWords,true))
      {
        $foundWords[]=$word;
        search($file,$word,$results);
      }
    }
    $docs = array();
    combine_results($results,$docs);
    // filter out documents with forbidden word or that do not contain
    // required words
    $filteredDocs = filter_results($docs,$requiredWords,$forbiddenWords);
    // normalize rankings so they are in the range [0-100]
    normalize_ranking($filteredDocs);
    // sort the results based on rank
    $sorted = array();
    sort_results($filteredDocs,$sorted);
    // report results to the user
    report_results($sorted);
    fclose($file);
  }
  315. ?>