Root/
Source at commit 1322 created 12 years 7 months ago. By meklort, Add doxygen to utils folder | |
---|---|
/**
 * Reads the next four bytes from $file and combines them into a single
 * integer, with the first byte read becoming the most significant one.
 *
 * @param resource $file open stream positioned at the value to read
 * @return int the combined value
 */
function readInt($file)
{
  $value = 0;
  for ($n = 0; $n < 4; $n++)
  {
    // shift what was read so far up by one byte and append the next byte
    $value = ($value << 8) | ord(fgetc($file));
  }
  return $value;
}
7 | "\n"␊ |
/**
 * Reads characters from $file up to (and consuming) the next byte whose
 * ordinal value is 0, and returns the characters read before it.
 *
 * @param resource $file open stream positioned at the start of the string
 * @return string the characters read, without the terminator
 */
function readString($file)
{
  $text = "";
  for (;;)
  {
    $ch = fgetc($file);
    // a zero ordinal ends the string
    if (ord($ch) == 0) break;
    $text .= $ch;
  }
  return $text;
}
14 | "\n"␊ |
/**
 * Reads the next four characters from $file and returns them
 * concatenated; main() compares the result against the "DOXS" signature.
 *
 * @param resource $file open stream positioned at the header
 * @return string the characters read
 */
function readHeader($file)
{
  $magic = "";
  for ($n = 0; $n < 4; $n++)
  {
    $magic .= fgetc($file);
  }
  return $magic;
}
21 | "\n"␊ |
/**
 * Computes the hash key of a word from its first two characters.
 *
 * The key is (first byte * 256 + second byte), so all words sharing a
 * two-character prefix map to the same key, which is what makes the
 * prefix (substring) search in search() possible.
 *
 * @param string $word the word to hash
 * @return int the key, or -1 when the word is shorter than two characters
 *             or one of its first two bytes is NUL
 */
function computeIndex($word)
{
  // Simple hashing that allows for substring search
  if (strlen($word)<2) return -1;
  // Square brackets instead of the curly-brace offset syntax, which was
  // deprecated in PHP 7.4 and no longer parses in PHP 8.
  $hi = ord($word[0]); // high char of the index
  $lo = ord($word[1]); // low char of the index
  if ($hi==0 || $lo==0) return -1;
  return $hi*256+$lo;
}
35 | "\n"␊ |
/**
 * Looks up $word in the search index and appends one entry per matching
 * indexed word to $statsList.
 *
 * Layout as read by this function:
 *  - after the 4 byte header comes a table of 4 byte entries, addressed by
 *    computeIndex(); an entry holds the file offset of a word list (0 if
 *    there is none);
 *  - a word list is a sequence of (NUL terminated word, 4 byte offset of
 *    its statistics) and ends with an empty word;
 *  - the statistics are a 4 byte document count followed by, per document,
 *    a 4 byte document offset and a 4 byte value whose lowest bit flags a
 *    high priority document and whose remaining bits are the frequency;
 *  - at a document offset are two NUL terminated strings: name and url.
 *
 * Each appended entry has the keys "word" (the search word), "match" (the
 * indexed word it is a prefix of), "index", "full" (true for a whole word
 * match) and "docs" (list of arrays with idx, freq, rank, hi, name, url).
 *
 * @param resource $file       opened index file
 * @param string   $word       word to look for
 * @param array    $statsList  result list, extended in place
 * @return array the (extended) $statsList
 */
function search($file,$word,&$statsList)
{
  $index = computeIndex($word);
  if ($index==-1) return $statsList; // no valid hash key for this word

  fseek($file,$index*4+4); // 4 bytes per entry, skip header
  $listOffset = readInt($file);
  if (!$listOffset) return $statsList; // no words stored for this hash key

  $start = sizeof($statsList);
  $count = $start;
  $wordLen = strlen($word);

  // Collect all indexed words that start with $word. The list ends with an
  // empty string; testing for "" explicitly (instead of truthiness) keeps a
  // stored word "0" from ending the scan early.
  fseek($file,$listOffset);
  for ($w = readString($file); $w !== ""; $w = readString($file))
  {
    $statIdx = readInt($file);
    // strict comparison: with == two numeric-looking strings such as
    // "1e1" and "10" would be compared as numbers and wrongly match
    if (substr($w,0,$wordLen) === $word)
    {
      $statsList[$count++] = array(
        "word"  => $word,
        "match" => $w,
        "index" => $statIdx,
        "full"  => strlen($w)==$wordLen,
        "docs"  => array()
      );
    }
  }
  $end = $count;

  // First pass: read the documents of every match and accumulate totals.
  $totalHi=0;
  $totalFreqHi=0;
  $totalFreqLo=0;
  for ($s=$start;$s<$end;$s++)
  {
    // whole word matches have a double weight
    $multiplier = $statsList[$s]["full"] ? 2 : 1;
    fseek($file,$statsList[$s]["index"]);
    $numDocs = readInt($file);
    $docInfo = array();
    // read docs info + occurrence frequency of the word
    for ($i=0;$i<$numDocs;$i++)
    {
      $idx  = readInt($file);
      $freq = readInt($file);
      $docInfo[$i] = array("idx"  => $idx,
                           "freq" => $freq>>1,
                           "rank" => 0.0,
                           "hi"   => $freq&1
                          );
      // NOTE(review): the totals below use the raw value including the
      // priority bit, not the shifted frequency; kept as in the original.
      if ($freq&1) // word occurs in high priority doc
      {
        $totalHi++;
        $totalFreqHi += $freq*$multiplier;
      }
      else // word occurs in low priority doc
      {
        $totalFreqLo += $freq*$multiplier;
      }
    }
    // read name and url info for the doc
    for ($i=0;$i<$numDocs;$i++)
    {
      fseek($file,$docInfo[$i]["idx"]);
      $docInfo[$i]["name"] = readString($file);
      $docInfo[$i]["url"]  = readString($file);
    }
    $statsList[$s]["docs"] = $docInfo;
  }

  // Second pass: compute the frequency rank of the word in each doc.
  $totalFreq = ($totalHi+1)*$totalFreqLo + $totalFreqHi;
  for ($s=$start;$s<$end;$s++)
  {
    $multiplier = $statsList[$s]["full"] ? 2 : 1;
    $numDocs = sizeof($statsList[$s]["docs"]);
    for ($i=0;$i<$numDocs;$i++)
    {
      $weight = $statsList[$s]["docs"][$i]["freq"]*$multiplier;
      if ($statsList[$s]["docs"][$i]["hi"]) $weight += $totalFreqLo;
      // a zero total (all stored frequencies 0) would otherwise be a
      // division by zero; such docs keep rank 0.0
      $statsList[$s]["docs"][$i]["rank"] =
        ($totalFreq!=0) ? (float)$weight/$totalFreq : 0.0;
    }
  }
  return $statsList;
}
133 | "\n"␊ |
/**
 * Merges the per-word search results into one entry per document.
 *
 * Documents are keyed by their url. The rank of a document is the sum of
 * the ranks it received for the individual words, and every word that hit
 * the document is recorded in its "words" list.
 *
 * @param array $results list of word entries as produced by search()
 * @param array $docs    map url => document entry, extended in place
 * @return array the (extended) $docs
 */
function combine_results($results,&$docs)
{
  foreach ($results as $entry)
  {
    foreach ($entry["docs"] as $hit)
    {
      $url = $hit["url"];
      if (!isset($docs[$url]))
      {
        // first hit for this document: create its entry
        $docs[$url] = array("url"  => $url,
                            "name" => $hit["name"],
                            "rank" => $hit["rank"]
                           );
      }
      else
      {
        // document already known: accumulate the rank
        $docs[$url]["rank"] += $hit["rank"];
      }
      $docs[$url]["words"][] = array(
        "word"  => $entry["word"],
        "match" => $entry["match"],
        "freq"  => $hit["freq"]
      );
    }
  }
  return $docs;
}
163 | "\n"␊ |
/**
 * Returns the documents that contain every required word and none of the
 * forbidden words.
 *
 * A document "contains" a word when one of the entries in its "words"
 * list has that word as its "word" value. Keys of $docs are preserved.
 *
 * @param array $docs           map key => document entry (see combine_results)
 * @param array $requiredWords  words that must all be present
 * @param array $forbiddenWords words of which none may be present
 * @return array the documents that passed both checks
 */
function filter_results($docs,&$requiredWords,&$forbiddenWords)
{
  $filteredDocs = array();
  // foreach instead of each(), which was removed in PHP 8.0
  foreach ($docs as $key => $doc)
  {
    // collect the search words that hit this document
    $present = array();
    if (isset($doc["words"]) && is_array($doc["words"]))
    {
      foreach ($doc["words"] as $wordInfo)
      {
        $present[] = $wordInfo["word"];
      }
    }

    $copy = true; // copy entry by default

    // Strict comparisons throughout: with loose comparison two
    // numeric-looking words such as "1e1" and "10" are considered equal.
    foreach ($requiredWords as $reqWord)
    {
      if (!in_array($reqWord,$present,true))
      {
        $copy = false; // document lacks at least one required word
        break;
      }
    }
    if ($copy)
    {
      foreach ($forbiddenWords as $badWord)
      {
        if (in_array($badWord,$present,true))
        {
          $copy = false; // document contains a forbidden word
          break;
        }
      }
    }
    if ($copy) $filteredDocs[$key] = $doc;
  }
  return $filteredDocs;
}
203 | "\n"␊ |
/**
 * Comparison callback ordering two document entries by descending "rank".
 *
 * @param array $a first entry
 * @param array $b second entry
 * @return int -1 when $a ranks higher, 1 when $b ranks higher, 0 when equal
 */
function compare_rank($a,$b)
{
  $left  = $a["rank"];
  $right = $b["rank"];
  if ($left == $right) return 0;
  if ($left > $right) return -1; // higher rank sorts first
  return 1;
}
212 | "\n"␊ |
/**
 * Sorts the documents with compare_rank() and stores the result in
 * $sorted. usort() assigns new numeric keys, so the keys of $docs are
 * not kept.
 *
 * @param array $docs   documents to sort (left untouched)
 * @param array $sorted receives the sorted list
 * @return array the sorted list
 */
function sort_results($docs,&$sorted)
{
  $ranked = $docs;
  usort($ranked,"compare_rank");
  $sorted = $ranked;
  return $sorted;
}
219 | "\n"␊ |
/**
 * Writes the search result page body: a header, followed by a table with
 * one numbered link per document and, below each link, the matched words
 * with their frequencies.
 *
 * The "searchresults" div opened here is not closed by this function;
 * main() echoes the closing tag after calling it.
 *
 * @param array $docs list of document entries with "url", "name" and
 *                    "words" (each with "word", "match" and "freq")
 */
function report_results(&$docs)
{
  echo "<div class=\"header\">";
  echo " <div class=\"headertitle\">\n";
  echo " <h1>".search_results()."</h1>\n";
  echo " </div>\n";
  echo "</div>\n";
  echo "<div class=\"searchresults\">\n";
  echo "<table cellspacing=\"2\">\n";
  $numDocs = sizeof($docs);
  if ($numDocs==0)
  {
    echo " <tr>\n";
    echo " <td colspan=\"2\">".matches_text(0)."</td>\n";
    echo " </tr>\n";
  }
  else
  {
    echo " <tr>\n";
    echo " <td colspan=\"2\">".matches_text($numDocs);
    echo "\n";
    echo " </td>\n";
    echo " </tr>\n";
    $num=1;
    foreach ($docs as $doc)
    {
      // NOTE(review): url and name come from the index file and are
      // written as-is; confirm whether the index stores them HTML-escaped
      // before adding escaping here.
      echo " <tr>\n";
      echo " <td align=\"right\">$num.</td>";
      echo "<td><a class=\"el\" href=\"".$doc["url"]."\">".$doc["name"]."</a></td>\n";
      // close the link row before opening the row with the matches
      // (the original emitted a second opening tag here)
      echo " </tr>\n";
      echo " <tr>\n";
      echo " <td></td><td class=\"tiny\">".report_matches()." ";
      foreach ($doc["words"] as $wordInfo)
      {
        $word = $wordInfo["word"];
        // part of the indexed word that follows the search word
        $matchRight = substr($wordInfo["match"],strlen($word));
        // the search word originates from the request, so escape it (and
        // the remainder of the match) before embedding it in the page
        $wordHtml  = htmlspecialchars($word,ENT_QUOTES);
        $rightHtml = htmlspecialchars((string)$matchRight,ENT_QUOTES);
        echo "<b>$wordHtml</b>$rightHtml(".$wordInfo["freq"].") ";
      }
      echo " </td>\n";
      echo " </tr>\n";
      $num++;
    }
  }
  echo "</table>\n";
}
264 | "\n"␊ |
/**
 * Entry point of the search page: opens and validates the index file,
 * splits the "query" request parameter into words, searches each word and
 * prints the combined, filtered and ranked results.
 *
 * A word prefixed with '+' is required, a word prefixed with '-' is
 * forbidden (see filter_results()).
 */
function main()
{
  // version_compare() instead of strcmp(): a plain string comparison is
  // lexicographic and would e.g. rank "10.0.0" below "4.1.0"
  if (version_compare(phpversion(),'4.1.0','<'))
  {
    die("Error: PHP version 4.1.0 or above required!");
  }
  if (!($file=fopen("search/search.idx","rb")))
  {
    die("Error: Search index file could NOT be opened!");
  }
  if (readHeader($file)!="DOXS")
  {
    die("Error: Header of index file is invalid!");
  }
  $query="";
  // only accept a plain string; "query[]=x" would deliver an array
  if (array_key_exists("query", $_GET) && is_string($_GET["query"]))
  {
    $query=$_GET["query"];
  }
  end_form(preg_replace("/[^a-zA-Z0-9\-\_\.]/i", " ", $query ));
  $results = array();
  $requiredWords = array();
  $forbiddenWords = array();
  $foundWords = array();
  $word=strtok($query," ");
  // strtok() returns false when no token is left; comparing against false
  // (rather than testing truthiness) keeps a token "0" from ending the loop
  while ($word!==false) // for each word in the search query
  {
    $required = false;
    $forbidden = false;
    // square bracket offsets: the curly-brace form no longer parses in PHP 8
    if ($word[0]=='+')     { $word=substr($word,1); $required=true; }
    elseif ($word[0]=='-') { $word=substr($word,1); $forbidden=true; }
    // search() is fed the lower-case word and records it in the results,
    // so the required/forbidden lists must hold the same form or the
    // filter could never match a word typed with capitals
    $word = strtolower((string)$word);
    if ($word!=="") // a lone "+" or "-" leaves nothing to search for
    {
      if ($required)  $requiredWords[]=$word;
      if ($forbidden) $forbiddenWords[]=$word;
      if (!in_array($word,$foundWords,true))
      {
        $foundWords[]=$word;
        search($file,$word,$results);
      }
    }
    $word=strtok(" ");
  }
  $docs = array();
  combine_results($results,$docs);
  // filter out documents with forbidden word or that do not contain
  // required words
  $filteredDocs = filter_results($docs,$requiredWords,$forbiddenWords);
  // sort the results based on rank
  $sorted = array();
  sort_results($filteredDocs,$sorted);
  // report results to the user
  report_results($sorted);
  echo "</div>\n"; // closes the "searchresults" div opened by report_results()
  end_page();
  fclose($file);
}

main();
317 | "\n"␊ |
318 |