123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329 |
- <?php
- /******************************************************************************
- *
- * $Id:$
- *
- * Copyright (C) 1997-2003 by Dimitri van Heesch.
- *
- * Permission to use, copy, modify, and distribute this software and its
- * documentation under the terms of the GNU General Public License is hereby
- * granted. No representations are made about the suitability of this software
- * for any purpose. It is provided "as is" without express or implied warranty.
- * See the GNU General Public License for more details.
- *
- */
- function readInt($file)
- {
- $b1 = ord(fgetc($file)); $b2 = ord(fgetc($file));
- $b3 = ord(fgetc($file)); $b4 = ord(fgetc($file));
- return ($b1<<24)|($b2<<16)|($b3<<8)|$b4;
- }
- function readString($file)
- {
- $result="";
- while (ord($c=fgetc($file))) $result.=$c;
- return $result;
- }
- function readHeader($file)
- {
- $header =fgetc($file); $header.=fgetc($file);
- $header.=fgetc($file); $header.=fgetc($file);
- return $header;
- }
- function computeIndex($word)
- {
- if (strlen($word)<2) return -1;
- // high char of the index
- $hi = ord($word{0});
- if ($hi==0) return -1;
- // low char of the index
- $lo = ord($word{1});
- if ($lo==0) return -1;
- // return index
- return $hi*256+$lo;
- }
- function search($file,$word,&$statsList)
- {
- $index = computeIndex($word);
- if ($index!=-1) // found a valid index
- {
- fseek($file,$index*4+4); // 4 bytes per entry, skip header
- $index = readInt($file);
- if ($index) // found words matching first two characters
- {
- $start=sizeof($statsList);
- $count=$start;
- fseek($file,$index);
- $w = readString($file);
- while ($w)
- {
- $statIdx = readInt($file);
- if ($word==substr($w,0,strlen($word)))
- { // found word that matches (as substring)
- $statsList[$count++]=array(
- "word"=>$word,
- "match"=>$w,
- "index"=>$statIdx,
- "full"=>strlen($w)==strlen($word),
- "docs"=>array()
- );
- }
- $w = readString($file);
- }
- $totalFreq=0;
- for ($count=$start;$count<sizeof($statsList);$count++)
- {
- $statInfo = &$statsList[$count];
- fseek($file,$statInfo["index"]);
- $numDocs = readInt($file);
- $docInfo = array();
- // read docs info + occurrence frequency of the word
- for ($i=0;$i<$numDocs;$i++)
- {
- $idx=readInt($file);
- $freq=readInt($file);
- $docInfo[$i]=array("idx"=>$idx,"freq"=>$freq,"rank"=>0.0);
- $totalFreq+=$freq;
- if ($statInfo["full"]) $totalfreq+=$freq;
- }
- // read name an url info for the doc
- for ($i=0;$i<$numDocs;$i++)
- {
- fseek($file,$docInfo[$i]["idx"]);
- $docInfo[$i]["name"]=readString($file);
- $docInfo[$i]["url"]=readString($file);
- }
- $statInfo["docs"]=$docInfo;
- }
- for ($count=$start;$count<sizeof($statsList);$count++)
- {
- $statInfo = &$statsList[$count];
- for ($i=0;$i<sizeof($statInfo["docs"]);$i++)
- {
- $docInfo = &$statInfo["docs"];
- // compute frequency rank of the word in each doc
- $statInfo["docs"][$i]["rank"]=
- (float)$docInfo[$i]["freq"]/$totalFreq;
- }
- }
- }
- }
- return $statsList;
- }
- function combine_results($results,&$docs)
- {
- foreach ($results as $wordInfo)
- {
- $docsList = &$wordInfo["docs"];
- foreach ($docsList as $di)
- {
- $key=$di["url"];
- $rank=$di["rank"];
- if (in_array($key, array_keys($docs)))
- {
- $docs[$key]["rank"]+=$rank;
- $docs[$key]["rank"]*=2; // multiple matches increases rank
- }
- else
- {
- $docs[$key] = array("url"=>$key,
- "name"=>$di["name"],
- "rank"=>$rank
- );
- }
- $docs[$key]["words"][] = array(
- "word"=>$wordInfo["word"],
- "match"=>$wordInfo["match"],
- "freq"=>$di["freq"]
- );
- }
- }
- return $docs;
- }
- function normalize_ranking(&$docs)
- {
- $maxRank = 0.0000001;
- // compute maximal rank
- foreach ($docs as $doc)
- {
- if ($doc["rank"]>$maxRank)
- {
- $maxRank=$doc["rank"];
- }
- }
- reset($docs);
- // normalize rankings
- while (list ($key, $val) = each ($docs))
- {
- $docs[$key]["rank"]*=100/$maxRank;
- }
- }
- function filter_results($docs,&$requiredWords,&$forbiddenWords)
- {
- $filteredDocs=array();
- while (list ($key, $val) = each ($docs))
- {
- $words = &$docs[$key]["words"];
- $copy=1; // copy entry by default
- if (sizeof($requiredWords)>0)
- {
- foreach ($requiredWords as $reqWord)
- {
- $found=0;
- foreach ($words as $wordInfo)
- {
- $found = $wordInfo["word"]==$reqWord;
- if ($found) break;
- }
- if (!$found)
- {
- $copy=0; // document contains none of the required words
- break;
- }
- }
- }
- if (sizeof($forbiddenWords)>0)
- {
- foreach ($words as $wordInfo)
- {
- if (in_array($wordInfo["word"],$forbiddenWords))
- {
- $copy=0; // document contains a forbidden word
- break;
- }
- }
- }
- if ($copy) $filteredDocs[$key]=$docs[$key];
- }
- return $filteredDocs;
- }
- function compare_rank($a,$b)
- {
- return ($a["rank"]>$b["rank"]) ? -1 : 1;
- }
- function sort_results($docs,&$sorted)
- {
- $sorted = $docs;
- usort($sorted,"compare_rank");
- return $sorted;
- }
- function report_results(&$docs)
- {
- echo "<table cellspacing=\"2\">\n";
- echo " <tr>\n";
- echo " <td colspan=\"2\"><h2>Search Results</h2></td>\n";
- echo " </tr>\n";
- $numDocs = sizeof($docs);
- if ($numDocs==0)
- {
- echo " <tr>\n";
- echo " <td colspan=\"2\">".matches_text(0)."</td>\n";
- echo " </tr>\n";
- }
- else
- {
- echo " <tr>\n";
- echo " <td colspan=\"2\">".matches_text($numDocs);
- echo "\n";
- echo " </td>\n";
- echo " </tr>\n";
- $num=1;
- foreach ($docs as $doc)
- {
- echo " <tr>\n";
- echo " <td align=\"right\">$num.</td>";
- echo "<td><a class=\"el\" href=\"".$doc["url"]."\">".$doc["name"]."</a></td>\n";
- echo " <tr>\n";
- echo " <td></td><td class=\"tiny\">Matches: ";
- foreach ($doc["words"] as $wordInfo)
- {
- $word = $wordInfo["word"];
- $matchRight = substr($wordInfo["match"],strlen($word));
- echo "<b>$word</b>$matchRight(".$wordInfo["freq"].") ";
- }
- echo " </td>\n";
- echo " </tr>\n";
- $num++;
- }
- }
- echo "</table>\n";
- }
- function matches_text($num)
- {
- if ($num==0)
- {
- return 'Sorry, no documents matching your query.';
- }
- else if ($num==1)
- {
- return 'Found 1 document matching your query.';
- }
- else // $num>1
- {
- return 'Found '.$num.' documents matching your query. Showing best matches first.';
- }
- }
- function main($idxfile)
- {
- if(strcmp('4.1.0', phpversion()) > 0)
- {
- die("Error: PHP version 4.1.0 or above required!");
- }
- if (!($file=fopen($idxfile,"rb")))
- {
- die("Error: Search index file could NOT be opened!");
- }
- if (readHeader($file)!="DOXS")
- {
- die("Error: Header of index file is invalid!");
- }
- $query="";
- if (array_key_exists("query", $_GET))
- {
- $query=$_GET["query"];
- }
- $results = array();
- $requiredWords = array();
- $forbiddenWords = array();
- $foundWords = array();
- $word=strtolower(strtok($query," "));
- while ($word) // for each word in the search query
- {
- if (($word{0}=='+')) { $word=substr($word,1); $requiredWords[]=$word; }
- if (($word{0}=='-')) { $word=substr($word,1); $forbiddenWords[]=$word; }
- if (!in_array($word,$foundWords))
- {
- $foundWords[]=$word;
- search($file,$word,$results);
- }
- $word=strtolower(strtok(" "));
- }
- $docs = array();
- combine_results($results,$docs);
- // filter out documents with forbidden word or that do not contain
- // required words
- $filteredDocs = filter_results($docs,$requiredWords,$forbiddenWords);
- // normalize rankings so they are in the range [0-100]
- normalize_ranking($filteredDocs);
- // sort the results based on rank
- $sorted = array();
- sort_results($filteredDocs,$sorted);
- // report results to the user
- report_results($sorted);
- fclose($file);
- }
- ?>
|