Root/
Source at commit 1322 created 12 years 8 months ago. By meklort, Add doxygen to utils folder | |
---|---|
/**
 * Reads four consecutive bytes from the stream and combines them into a
 * single integer, with the first byte read being the most significant.
 *
 * @param resource $file open stream positioned at the value to read
 * @return int the combined 32-bit value
 */
function readInt($file)
{
    $value = 0;
    // Each new byte is less significant than the ones read before it, so
    // shift the accumulator left by one byte before merging it in.
    for ($n = 0; $n < 4; $n++)
    {
        $value = ($value << 8) | ord(fgetc($file));
    }
    return $value;
}
7 | ␊ |
/**
 * Reads characters from the stream up to (and consuming) the first
 * character whose ord() value is 0, and returns the characters before it.
 *
 * @param resource $file open stream positioned at the start of the string
 * @return string the characters read, without the terminator
 */
function readString($file)
{
    $text = "";
    for (;;)
    {
        $ch = fgetc($file);
        // A zero byte ends the string; a failed read also yields ord() == 0.
        if (!ord($ch)) break;
        $text .= $ch;
    }
    return $text;
}
14 | ␊ |
/**
 * Reads the next four characters from the stream and returns them as one
 * string (main() compares the result against the "DOXS" signature).
 *
 * @param resource $file open stream positioned at the header
 * @return string the characters read
 */
function readHeader($file)
{
    $tag = "";
    for ($n = 0; $n < 4; $n++)
    {
        $tag .= fgetc($file);
    }
    return $tag;
}
21 | ␊ |
/**
 * Computes the hash-table index for a word from its first two bytes.
 *
 * Simple hashing that allows for substring (prefix) search: every word that
 * starts with the same two bytes maps to the same index.
 *
 * @param string $word the word to hash
 * @return int first byte * 256 + second byte, or -1 when the word is shorter
 *             than two bytes or either of its first two bytes is zero
 */
function computeIndex($word)
{
    if (strlen($word) < 2) return -1;
    // Square-bracket offsets: the {} string-offset syntax was removed in PHP 8.0.
    // high char of the index
    $hi = ord($word[0]);
    if ($hi == 0) return -1;
    // low char of the index
    $lo = ord($word[1]);
    if ($lo == 0) return -1;
    return $hi * 256 + $lo;
}
35 | ␊ |
/**
 * Looks up all indexed words that start with $word and appends one entry per
 * matching word to $statsList, including the documents it occurs in and a
 * relative rank for each of those documents.
 *
 * File layout as read by this function:
 *  - a table of 4-byte entries located after a 4-byte header, addressed by
 *    computeIndex($word); an entry of 0 means no words for that key;
 *  - at the entry's offset: a list of (zero-terminated word, 4-byte offset)
 *    pairs, ended by an empty word;
 *  - at each word's offset: a 4-byte document count followed by that many
 *    (4-byte document offset, 4-byte frequency) pairs, where bit 0 of the
 *    frequency flags a high priority document and the remaining bits are
 *    the occurrence count;
 *  - at each document offset: the document name and URL as two
 *    zero-terminated strings.
 *
 * Each appended entry has the keys "word" (the query word), "match" (the
 * indexed word), "index", "full" (true for a whole-word match) and "docs"
 * (list of arrays with "idx", "freq", "rank", "hi", "name" and "url").
 *
 * @param resource $file       open search index stream
 * @param string   $word       query word (main() passes it lowercased)
 * @param array    $statsList  result list, extended in place
 * @return array the (possibly extended) $statsList
 */
function search($file,$word,&$statsList)
{
    $index = computeIndex($word);
    if ($index==-1) return $statsList; // word cannot be hashed

    fseek($file,$index*4+4); // 4 bytes per entry, skip header
    $index = readInt($file);
    if (!$index) return $statsList; // no words stored for this hash key

    $start=sizeof($statsList);
    $count=$start;
    $wordLen=strlen($word);
    fseek($file,$index);
    $w = readString($file);
    // The list ends with an empty string; compare explicitly so that a
    // falsy-but-valid word such as "0" cannot end the scan early.
    while ($w!=="")
    {
        $statIdx = readInt($file);
        // Byte-wise prefix test. A loose == would compare numeric-looking
        // strings as numbers (e.g. "100" == "1e2") and report false matches.
        if (strncmp($w,$word,$wordLen)===0)
        { // found word that matches (as substring)
            $statsList[$count++]=array(
                "word"=>$word,
                "match"=>$w,
                "index"=>$statIdx,
                "full"=>strlen($w)==$wordLen,
                "docs"=>array()
            );
        }
        $w = readString($file);
    }

    $totalHi=0;
    $totalFreqHi=0;
    $totalFreqLo=0;
    for ($count=$start;$count<sizeof($statsList);$count++)
    {
        $statInfo = &$statsList[$count];
        // whole word matches have a double weight
        $multiplier = $statInfo["full"] ? 2 : 1;
        fseek($file,$statInfo["index"]);
        $numDocs = readInt($file);
        $docInfo = array();
        // read docs info + occurrence frequency of the word
        for ($i=0;$i<$numDocs;$i++)
        {
            $idx=readInt($file);
            $freq=readInt($file);
            $docInfo[$i]=array("idx"  => $idx,
                               "freq" => $freq>>1,
                               "rank" => 0.0,
                               "hi"   => $freq&1
                              );
            // NOTE(review): the totals below add the raw value (flag bit
            // included) while the per-document rank uses the shifted count;
            // kept as in the original -- confirm whether this is intended.
            if ($freq&1) // word occurs in high priority doc
            {
                $totalHi++;
                $totalFreqHi+=$freq*$multiplier;
            }
            else // word occurs in low priority doc
            {
                $totalFreqLo+=$freq*$multiplier;
            }
        }
        // read name and url info for the doc
        for ($i=0;$i<$numDocs;$i++)
        {
            fseek($file,$docInfo[$i]["idx"]);
            $docInfo[$i]["name"]=readString($file);
            $docInfo[$i]["url"]=readString($file);
        }
        $statInfo["docs"]=$docInfo;
    }
    unset($statInfo); // drop the reference before reusing the name

    $totalFreq=($totalHi+1)*$totalFreqLo + $totalFreqHi;
    for ($count=$start;$count<sizeof($statsList);$count++)
    {
        $statInfo = &$statsList[$count];
        // whole word matches have a double weight
        $multiplier = $statInfo["full"] ? 2 : 1;
        $numDocs = sizeof($statInfo["docs"]);
        for ($i=0;$i<$numDocs;$i++)
        {
            // compute frequency rank of the word in each doc
            $weight = $statInfo["docs"][$i]["freq"]*$multiplier;
            if ($statInfo["docs"][$i]["hi"])
            {
                $weight += $totalFreqLo;
            }
            // Guard against an all-zero frequency table, which would
            // otherwise divide by zero.
            $statInfo["docs"][$i]["rank"] =
                $totalFreq ? (float)$weight/$totalFreq : 0.0;
        }
    }
    unset($statInfo);

    return $statsList;
}
133 | ␊ |
/**
 * Merges per-word search results into a per-document table keyed by URL.
 *
 * For every document of every word entry, the document's rank is added to
 * the entry in $docs (created with "url", "name" and "rank" on first sight),
 * and a record with "word", "match" and "freq" is appended to the
 * document's "words" list.
 *
 * @param array $results  word entries as produced by search()
 * @param array $docs     per-document table, updated in place
 * @return array the updated $docs
 */
function combine_results($results,&$docs)
{
    foreach ($results as $entry)
    {
        foreach ($entry["docs"] as $doc)
        {
            $url = $doc["url"];
            $score = $doc["rank"];
            if (!isset($docs[$url]))
            {
                // first time this document shows up
                $docs[$url] = array("url"=>$url,
                                    "name"=>$doc["name"],
                                    "rank"=>$score
                                   );
            }
            else
            {
                // seen before: accumulate the rank contributed by this word
                $docs[$url]["rank"] += $score;
            }
            $docs[$url]["words"][] = array(
                "word"=>$entry["word"],
                "match"=>$entry["match"],
                "freq"=>$doc["freq"]
            );
        }
    }
    return $docs;
}
163 | ␊ |
/**
 * Returns the documents that contain every required word and none of the
 * forbidden words.
 *
 * A document "contains" a word when one of the records in its "words" list
 * has that word under the "word" key. Keys of $docs are preserved.
 *
 * @param array $docs            per-document table (see combine_results())
 * @param array $requiredWords   words that must all be present
 * @param array $forbiddenWords  words of which none may be present
 * @return array the subset of $docs that passes both filters
 */
function filter_results($docs,&$requiredWords,&$forbiddenWords)
{
    $filteredDocs=array();
    // foreach instead of each(): each() was removed in PHP 8.0.
    foreach ($docs as $key => $doc)
    {
        // collect the query words that matched this document
        $matched=array();
        if (isset($doc["words"]))
        {
            foreach ($doc["words"] as $wordInfo)
            {
                $matched[]=$wordInfo["word"];
            }
        }
        $copy=true; // copy entry by default
        // Strict comparisons throughout: a loose match would treat
        // numeric-looking words such as "10" and "1e1" as equal.
        foreach ($requiredWords as $reqWord)
        {
            if (!in_array($reqWord,$matched,true))
            {
                $copy=false; // document lacks at least one required word
                break;
            }
        }
        if ($copy)
        {
            foreach ($matched as $word)
            {
                if (in_array($word,$forbiddenWords,true))
                {
                    $copy=false; // document contains a forbidden word
                    break;
                }
            }
        }
        if ($copy) $filteredDocs[$key]=$doc;
    }
    return $filteredDocs;
}
203 | ␊ |
/**
 * Comparison callback that orders entries by descending "rank".
 *
 * @param array $a first entry (must have a "rank" key)
 * @param array $b second entry (must have a "rank" key)
 * @return int -1 when $a ranks higher, 1 when $b ranks higher, 0 when equal
 */
function compare_rank($a,$b)
{
    $left  = $a["rank"];
    $right = $b["rank"];
    if ($left == $right)
    {
        return 0;
    }
    if ($left > $right)
    {
        return -1; // higher rank sorts first
    }
    return 1;
}
212 | ␊ |
/**
 * Produces a copy of $docs ordered with compare_rank() (highest rank first).
 *
 * usort() reindexes the copy, so the original keys are not kept.
 *
 * @param array $docs    documents to sort; left untouched
 * @param array $sorted  receives the sorted copy
 * @return array the same sorted copy
 */
function sort_results($docs,&$sorted)
{
    // work on a copy so the caller's $docs keeps its order and keys
    $sorted = $docs;
    usort($sorted, 'compare_rank');
    return $sorted;
}
219 | ␊ |
/**
 * Prints the search results as an HTML table.
 *
 * Emits a header, a summary row with the number of matches, and for each
 * document two rows: one with its position and a link to it, and one that
 * lists the matched words with their occurrence counts. The heading and
 * label texts come from search_results(), matches_text() and
 * report_matches(), which are defined outside this file section.
 *
 * @param array $docs documents to print, in display order; each needs the
 *                    keys "url", "name" and "words"
 * @return void
 */
function report_results(&$docs)
{
    echo "<div class=\"header\">";
    echo " <div class=\"headertitle\">\n";
    echo " <h1>".search_results()."</h1>\n";
    echo " </div>\n";
    echo "</div>\n";
    echo "<div class=\"searchresults\">\n";
    echo "<table cellspacing=\"2\">\n";
    $numDocs = sizeof($docs);
    if ($numDocs==0)
    {
        echo " <tr>\n";
        echo " <td colspan=\"2\">".matches_text(0)."</td>\n";
        echo " </tr>\n";
    }
    else
    {
        echo " <tr>\n";
        echo " <td colspan=\"2\">".matches_text($numDocs);
        echo "\n";
        echo " </td>\n";
        echo " </tr>\n";
        $num=1;
        foreach ($docs as $doc)
        {
            // Escape everything that originates from the index or the query
            // before placing it in markup. ISO-8859-1 is used as the charset
            // so that only the HTML special characters are touched, whatever
            // the page encoding is; double_encode=false leaves entities that
            // are already present alone.
            $url  = htmlspecialchars($doc["url"],ENT_QUOTES,'ISO-8859-1',false);
            $name = htmlspecialchars($doc["name"],ENT_QUOTES,'ISO-8859-1',false);
            echo " <tr>\n";
            echo " <td align=\"right\">$num.</td>";
            echo "<td><a class=\"el\" href=\"".$url."\">".$name."</a></td>\n";
            // close the link row before opening the row with the matches
            echo " </tr>\n";
            echo " <tr>\n";
            echo " <td></td><td class=\"tiny\">".report_matches()." ";
            foreach ($doc["words"] as $wordInfo)
            {
                $word = $wordInfo["word"];
                // part of the indexed word that follows the typed prefix
                $matchRight = substr($wordInfo["match"],strlen($word));
                $word = htmlspecialchars($word,ENT_QUOTES,'ISO-8859-1',false);
                $matchRight = htmlspecialchars($matchRight,ENT_QUOTES,'ISO-8859-1',false);
                echo "<b>$word</b>$matchRight(".$wordInfo["freq"].") ";
            }
            echo " </td>\n";
            echo " </tr>\n";
            $num++;
        }
    }
    echo "</table>\n";
}
264 | ␊ |
/**
 * Handles one search request: opens the index, parses the "query" request
 * parameter, searches each word, and prints the ranked results.
 *
 * Query syntax: words are separated by spaces; a leading '+' marks a word
 * as required and a leading '-' marks it as forbidden. Words are searched
 * in lower case. end_form() and end_page() are defined outside this file
 * section.
 *
 * Terminates the script with an error message when the PHP version is too
 * old, the index file cannot be opened, or its header is not "DOXS".
 *
 * @return void
 */
function main()
{
    // version_compare() orders versions numerically; a plain strcmp() would
    // rank e.g. "10.0.0" below "4.1.0".
    if (version_compare(phpversion(), '4.1.0', '<'))
    {
        die("Error: PHP version 4.1.0 or above required!");
    }
    if (!($file=fopen("search/search.idx","rb")))
    {
        die("Error: Search index file could NOT be opened!");
    }
    if (readHeader($file)!="DOXS")
    {
        die("Error: Header of index file is invalid!");
    }
    $query="";
    // Only accept a plain string; "?query[]=x" would deliver an array.
    if (isset($_GET["query"]) && is_string($_GET["query"]))
    {
        $query=$_GET["query"];
    }
    end_form(preg_replace("/[^a-zA-Z0-9\-\_\.]/i", " ", $query ));
    $results = array();
    $requiredWords = array();
    $forbiddenWords = array();
    $foundWords = array();
    $word=strtok($query," ");
    // strtok() returns false when the tokens are exhausted; testing for
    // that explicitly keeps a token such as "0" from ending the loop.
    while ($word!==false) // for each word in the search query
    {
        // search() records the lowercased word in its results, so the
        // required/forbidden lists must hold the same form for
        // filter_results() to be able to match them.
        $word=strtolower($word);
        if ($word[0]=='+')
        {
            $word=(string)substr($word,1);
            if ($word!=='') $requiredWords[]=$word;
        }
        if ($word!=='' && $word[0]=='-')
        {
            $word=(string)substr($word,1);
            if ($word!=='') $forbiddenWords[]=$word;
        }
        // skip words that are empty after stripping the prefix, and words
        // that were already searched
        if ($word!=='' && !in_array($word,$foundWords,true))
        {
            $foundWords[]=$word;
            search($file,$word,$results);
        }
        $word=strtok(" ");
    }
    $docs = array();
    combine_results($results,$docs);
    // filter out documents with forbidden word or that do not contain
    // required words
    $filteredDocs = filter_results($docs,$requiredWords,$forbiddenWords);
    // sort the results based on rank
    $sorted = array();
    sort_results($filteredDocs,$sorted);
    // report results to the user
    report_results($sorted);
    echo "</div>\n";
    end_page();
    fclose($file);
}
315 | ␊ |
// Script entry point: handle the search request when this file is executed.
main();
317 | ␊ |
318 |