Root/
Source at commit 1322 created 12 years 8 months ago. By meklort, Add doxygen to utils folder | |
---|---|
1 | /******************************************************************************␊ |
2 | *␊ |
3 | * $Id: searchindex.cpp,v 1.7 2001/03/19 19:27:41 root Exp $␊ |
4 | *␊ |
5 | * Copyright (C) 1997-2011 by Dimitri van Heesch.␊ |
6 | *␊ |
7 | * Permission to use, copy, modify, and distribute this software and its␊ |
8 | * documentation under the terms of the GNU General Public License is hereby ␊ |
9 | * granted. No representations are made about the suitability of this software ␊ |
10 | * for any purpose. It is provided "as is" without express or implied warranty.␊ |
11 | * See the GNU General Public License for more details.␊ |
12 | *␊ |
13 | * Documents produced by Doxygen are derivative works derived from the␊ |
14 | * input used in their production; they are not affected by this license.␊ |
15 | *␊ |
16 | */␊ |
17 | ␊ |
18 | #include "qtbc.h"␊ |
19 | #include "searchindex.h"␊ |
20 | #include "config.h"␊ |
21 | #include "util.h"␊ |
22 | #include <qfile.h>␊ |
23 | #include <ctype.h>␊ |
24 | #include <qregexp.h>␊ |
25 | ␊ |
26 | ␊ |
27 | // file format: (all multi-byte values are stored in big endian format)␊ |
28 | // 4 byte header␊ |
29 | // 256*256*4 byte index (4 bytes)␊ |
30 | // for each index entry: a zero terminated list of words ␊ |
31 | // for each word: a \0 terminated string + 4 byte offset to the stats info␊ |
32 | // padding bytes to align at 4 byte boundary␊ |
33 | // for each word: the number of urls (4 bytes) ␊ |
34 | // + for each url containing the word 8 bytes statistics␊ |
35 | // (4 bytes index to url string + 4 bytes frequency counter)␊ |
36 | // for each url: a \0 terminated string␊ |
37 | ␊ |
38 | const int numIndexEntries = 256*256;␊ |
39 | ␊ |
40 | //--------------------------------------------------------------------␊ |
41 | ␊ |
42 | IndexWord::IndexWord(const char *word) : m_word(word), m_urls(17)␊ |
43 | {␊ |
44 | m_urls.setAutoDelete(TRUE);␊ |
45 | //printf("IndexWord::IndexWord(%s)\n",word);␊ |
46 | }␊ |
47 | ␊ |
48 | void IndexWord::addUrlIndex(int idx,bool hiPriority)␊ |
49 | {␊ |
50 | //printf("IndexWord::addUrlIndex(%d,%d)\n",idx,hiPriority);␊ |
51 | URLInfo *ui = m_urls.find(idx);␊ |
52 | if (ui==0)␊ |
53 | {␊ |
54 | //printf("URLInfo::URLInfo(%d)\n",idx);␊ |
55 | ui=new URLInfo(idx,0);␊ |
56 | m_urls.insert(idx,ui);␊ |
57 | }␊ |
58 | ui->freq+=2;␊ |
59 | if (hiPriority) ui->freq|=1; // mark as high priority document␊ |
60 | }␊ |
61 | ␊ |
62 | //--------------------------------------------------------------------␊ |
63 | ␊ |
64 | SearchIndex::SearchIndex() : m_words(328829), m_index(numIndexEntries), m_urlIndex(-1)␊ |
65 | {␊ |
66 | int i;␊ |
67 | m_words.setAutoDelete(TRUE);␊ |
68 | m_urls.setAutoDelete(TRUE);␊ |
69 | m_index.setAutoDelete(TRUE);␊ |
70 | for (i=0;i<numIndexEntries;i++) m_index.insert(i,new QList<IndexWord>);␊ |
71 | }␊ |
72 | ␊ |
73 | void SearchIndex::setCurrentDoc(const char *name,const char *baseName,const char *anchor)␊ |
74 | {␊ |
75 | if (name==0 || baseName==0) return;␊ |
76 | //printf("SearchIndex::setCurrentDoc(%s,%s,%s)\n",name,baseName,anchor);␊ |
77 | QCString url=baseName+Config_getString("HTML_FILE_EXTENSION");␊ |
78 | if (anchor) url+=(QCString)"#"+anchor; ␊ |
79 | m_urlIndex++;␊ |
80 | m_urls.insert(m_urlIndex,new URL(name,url));␊ |
81 | }␊ |
82 | ␊ |
83 | static int charsToIndex(const char *word)␊ |
84 | {␊ |
85 | if (word==0) return -1;␊ |
86 | ␊ |
87 | // Fast string hashing algorithm␊ |
88 | //register ushort h=0;␊ |
89 | //const char *k = word;␊ |
90 | //ushort mask=0xfc00;␊ |
91 | //while ( *k ) ␊ |
92 | //{␊ |
93 | // h = (h&mask)^(h<<6)^(*k++);␊ |
94 | //}␊ |
95 | //return h;␊ |
96 | ␊ |
97 | // Simple hashing that allows for substring searching␊ |
98 | uint c1=word[0];␊ |
99 | if (c1==0) return -1;␊ |
100 | uint c2=word[1];␊ |
101 | if (c2==0) return -1;␊ |
102 | return c1*256+c2;␊ |
103 | }␊ |
104 | ␊ |
105 | void SearchIndex::addWord(const char *word,bool hiPriority,bool recurse)␊ |
106 | {␊ |
107 | static QRegExp nextPart("[_a-z:][A-Z]");␊ |
108 | //printf("SearchIndex::addWord(%s,%d)\n",word,hiPriority);␊ |
109 | QString wStr(word);␊ |
110 | if (wStr.isEmpty()) return;␊ |
111 | wStr=wStr.lower();␊ |
112 | IndexWord *w = m_words[wStr];␊ |
113 | if (w==0)␊ |
114 | {␊ |
115 | int idx=charsToIndex(wStr);␊ |
116 | if (idx<0) return;␊ |
117 | w = new IndexWord(wStr);␊ |
118 | //fprintf(stderr,"addWord(%s) at index %d\n",word,idx);␊ |
119 | m_index[idx]->append(w);␊ |
120 | m_words.insert(wStr,w);␊ |
121 | }␊ |
122 | w->addUrlIndex(m_urlIndex,hiPriority);␊ |
123 | int i;␊ |
124 | bool found=FALSE;␊ |
125 | if (!recurse) // the first time we check if we can strip the prefix␊ |
126 | {␊ |
127 | i=getPrefixIndex(word);␊ |
128 | if (i>0)␊ |
129 | {␊ |
130 | addWord(word+i,hiPriority,TRUE);␊ |
131 | found=TRUE;␊ |
132 | }␊ |
133 | }␊ |
134 | if (!found) // no prefix stripped␊ |
135 | {␊ |
136 | if ((i=nextPart.match(word))>=1)␊ |
137 | {␊ |
138 | addWord(word+i+1,hiPriority,TRUE);␊ |
139 | }␊ |
140 | }␊ |
141 | }␊ |
142 | ␊ |
143 | ␊ |
144 | static void writeInt(QFile &f,int index)␊ |
145 | {␊ |
146 | f.putch(((uint)index)>>24);␊ |
147 | f.putch((((uint)index)>>16)&0xff);␊ |
148 | f.putch((((uint)index)>>8)&0xff);␊ |
149 | f.putch(((uint)index)&0xff);␊ |
150 | }␊ |
151 | ␊ |
152 | static void writeString(QFile &f,const char *s)␊ |
153 | {␊ |
154 | const char *p = s;␊ |
155 | while (*p) f.putch(*p++);␊ |
156 | f.putch(0);␊ |
157 | }␊ |
158 | ␊ |
159 | void SearchIndex::write(const char *fileName)␊ |
160 | {␊ |
161 | int i;␊ |
162 | int size=4; // for the header␊ |
163 | size+=4*numIndexEntries; // for the index␊ |
164 | int wordsOffset = size;␊ |
165 | // first pass: compute the size of the wordlist␊ |
166 | for (i=0;i<numIndexEntries;i++)␊ |
167 | {␊ |
168 | QList<IndexWord> *wlist = m_index[i];␊ |
169 | if (!wlist->isEmpty())␊ |
170 | {␊ |
171 | QListIterator<IndexWord> iwi(*wlist);␊ |
172 | IndexWord *iw;␊ |
173 | for (iwi.toFirst();(iw=iwi.current());++iwi)␊ |
174 | {␊ |
175 | int ws = iw->word().length()+1; ␊ |
176 | size+=ws+4; // word + url info list offset␊ |
177 | }␊ |
178 | size+=1; // zero list terminator␊ |
179 | }␊ |
180 | }␊ |
181 | ␊ |
182 | // second pass: compute the offsets in the index␊ |
183 | int indexOffsets[numIndexEntries];␊ |
184 | int offset=wordsOffset;␊ |
185 | for (i=0;i<numIndexEntries;i++)␊ |
186 | {␊ |
187 | QList<IndexWord> *wlist = m_index[i];␊ |
188 | if (!wlist->isEmpty())␊ |
189 | {␊ |
190 | indexOffsets[i]=offset;␊ |
191 | QListIterator<IndexWord> iwi(*wlist);␊ |
192 | IndexWord *iw;␊ |
193 | for (iwi.toFirst();(iw=iwi.current());++iwi)␊ |
194 | {␊ |
195 | offset+= iw->word().length()+1; ␊ |
196 | offset+=4; // word + offset to url info array ␊ |
197 | }␊ |
198 | offset+=1; // zero list terminator␊ |
199 | }␊ |
200 | else␊ |
201 | {␊ |
202 | indexOffsets[i]=0;␊ |
203 | }␊ |
204 | }␊ |
205 | int padding = size;␊ |
206 | size = (size+3)&~3; // round up to 4 byte boundary␊ |
207 | padding = size - padding;␊ |
208 | ␊ |
209 | //int statsOffset = size;␊ |
210 | QDictIterator<IndexWord> wdi(m_words);␊ |
211 | //IndexWord *iw;␊ |
212 | int *wordStatOffsets = new int[m_words.count()];␊ |
213 | ␊ |
214 | int count=0;␊ |
215 | ␊ |
216 | // third pass: compute offset to stats info for each word␊ |
217 | for (i=0;i<numIndexEntries;i++)␊ |
218 | {␊ |
219 | QList<IndexWord> *wlist = m_index[i];␊ |
220 | if (!wlist->isEmpty())␊ |
221 | {␊ |
222 | QListIterator<IndexWord> iwi(*wlist);␊ |
223 | IndexWord *iw;␊ |
224 | for (iwi.toFirst();(iw=iwi.current());++iwi)␊ |
225 | {␊ |
226 | //printf("wordStatOffsets[%d]=%d\n",count,size);␊ |
227 | wordStatOffsets[count++] = size;␊ |
228 | size+=4+iw->urls().count()*8; // count + (url_index,freq) per url␊ |
229 | }␊ |
230 | }␊ |
231 | }␊ |
232 | int *urlOffsets = new int[m_urls.count()];␊ |
233 | //int urlsOffset = size;␊ |
234 | QIntDictIterator<URL> udi(m_urls);␊ |
235 | URL *url;␊ |
236 | for (udi.toFirst();(url=udi.current());++udi)␊ |
237 | {␊ |
238 | urlOffsets[udi.currentKey()]=size;␊ |
239 | size+=url->name.length()+1+␊ |
240 | url->url.length()+1;␊ |
241 | }␊ |
242 | //printf("Total size %x bytes (word=%x stats=%x urls=%x)\n",size,wordsOffset,statsOffset,urlsOffset);␊ |
243 | QFile f(fileName);␊ |
244 | if (f.open(IO_WriteOnly))␊ |
245 | {␊ |
246 | // write header␊ |
247 | f.putch('D'); f.putch('O'); f.putch('X'); f.putch('S');␊ |
248 | // write index␊ |
249 | for (i=0;i<numIndexEntries;i++)␊ |
250 | {␊ |
251 | writeInt(f,indexOffsets[i]);␊ |
252 | }␊ |
253 | // write word lists␊ |
254 | count=0;␊ |
255 | for (i=0;i<numIndexEntries;i++)␊ |
256 | {␊ |
257 | QList<IndexWord> *wlist = m_index[i];␊ |
258 | if (!wlist->isEmpty())␊ |
259 | {␊ |
260 | QListIterator<IndexWord> iwi(*wlist);␊ |
261 | IndexWord *iw;␊ |
262 | for (iwi.toFirst();(iw=iwi.current());++iwi)␊ |
263 | {␊ |
264 | writeString(f,iw->word());␊ |
265 | writeInt(f,wordStatOffsets[count++]);␊ |
266 | }␊ |
267 | f.putch(0);␊ |
268 | }␊ |
269 | }␊ |
270 | // write extra padding bytes␊ |
271 | for (i=0;i<padding;i++) f.putch(0);␊ |
272 | // write word statistics␊ |
273 | for (i=0;i<numIndexEntries;i++)␊ |
274 | {␊ |
275 | QList<IndexWord> *wlist = m_index[i];␊ |
276 | if (!wlist->isEmpty())␊ |
277 | {␊ |
278 | QListIterator<IndexWord> iwi(*wlist);␊ |
279 | IndexWord *iw;␊ |
280 | for (iwi.toFirst();(iw=iwi.current());++iwi)␊ |
281 | {␊ |
282 | int numUrls = iw->urls().count();␊ |
283 | writeInt(f,numUrls);␊ |
284 | QIntDictIterator<URLInfo> uli(iw->urls());␊ |
285 | URLInfo *ui;␊ |
286 | for (uli.toFirst();(ui=uli.current());++uli)␊ |
287 | {␊ |
288 | writeInt(f,urlOffsets[ui->urlIdx]);␊ |
289 | writeInt(f,ui->freq);␊ |
290 | }␊ |
291 | }␊ |
292 | }␊ |
293 | }␊ |
294 | // write urls␊ |
295 | QIntDictIterator<URL> udi(m_urls);␊ |
296 | URL *url;␊ |
297 | for (udi.toFirst();(url=udi.current());++udi)␊ |
298 | {␊ |
299 | writeString(f,url->name);␊ |
300 | writeString(f,url->url);␊ |
301 | }␊ |
302 | }␊ |
303 | ␊ |
304 | delete[] urlOffsets;␊ |
305 | delete[] wordStatOffsets;␊ |
306 | }␊ |
307 | ␊ |
308 |