1 | /*␊ |
2 | * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved.␊ |
3 | *␊ |
4 | * @APPLE_LICENSE_HEADER_START@␊ |
5 | * ␊ |
6 | * The contents of this file constitute Original Code as defined in and␊ |
7 | * are subject to the Apple Public Source License Version 2.0 (the␊ |
8 | * "License"). You may not use this file except in compliance with the␊ |
9 | * License. Please obtain a copy of the License at␊ |
10 | * http://www.apple.com/publicsource and read it before using this file.␊ |
11 | * ␊ |
12 | * This Original Code and all software distributed under the License are␊ |
13 | * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER␊ |
14 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,␊ |
15 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,␊ |
16 | * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the␊ |
17 | * License for the specific language governing rights and limitations␊ |
18 | * under the License.␊ |
19 | * ␊ |
20 | * @APPLE_LICENSE_HEADER_END@␊ |
21 | */␊ |
22 | /*␊ |
23 | * HFSCompare.c - Functions for working with and comparing HFS nams.␊ |
24 | *␊ |
25 | * Copyright (c) 1999-2000 Apple Computer, Inc.␊ |
26 | *␊ |
27 | * DRI: Josh de Cesare␊ |
28 | */␊ |
29 | ␊ |
30 | #include "sl.h"␊ |
31 | #include "hfs_CaseTables.h"␊ |
32 | #include "hfs.h"␊ |
33 | ␊ |
34 | #if ! UNCOMPRESSED␊ |
35 | ␊ |
36 | static unsigned short *␊ |
37 | UncompressStructure(struct compressed_block *bp, int count, int size)␊ |
38 | {␊ |
39 | unsigned short *out = malloc(size);␊ |
40 | ␉if (!out) ␊ |
41 | ␉{␊ |
42 | ␉␉stop("UncompressStructure unable to allocate memory\n");␊ |
43 | ␉␉return 0;␊ |
44 | ␉}␊ |
45 | ␉bzero(out,size);␊ |
46 | unsigned short *op = out;␊ |
47 | unsigned short data;␊ |
48 | int i, j;␊ |
49 | ␉␊ |
50 | for (i=0; i<count; i++, bp++) {␊ |
51 | // If this happens (it shouldn't) please fix size and/or double check that count really is␊ |
52 | // the number of elements in the array.␊ |
53 | // This was a very hard bug to find, so please leave this code here.␊ |
54 | if(out + size <= op + bp->count)␊ |
55 | {␊ |
56 | stop("HFS+ Unicode tables are malformed\n");␊ |
57 | return 0;␊ |
58 | }␊ |
59 | data = bp->data;␊ |
60 | for (j=0; j<bp->count; j++) {␊ |
61 | *op++ = data;␊ |
62 | if (bp->type == kTypeAscending) data++;␊ |
63 | else if (bp->type == kTypeAscending256) data += 256;␊ |
64 | }␊ |
65 | }␊ |
66 | return out;␊ |
67 | }␊ |
68 | ␊ |
69 | static void␊ |
70 | InitCompareTables(void)␊ |
71 | {␊ |
72 | if (gCompareTable == 0) {␊ |
73 | gCompareTable = UncompressStructure(gCompareTableCompressed,␊ |
74 | kCompareTableNBlocks, kCompareTableDataSize);␊ |
75 | gLowerCaseTable = UncompressStructure(gLowerCaseTableCompressed,␊ |
76 | ␉␉␉␉␉␉␉␉␉␉␉ kLowerCaseTableNBlocks, kLowerCaseTableDataSize);␊ |
77 | }␊ |
78 | }␊ |
79 | ␊ |
80 | #endif /* ! UNCOMPRESSED */␊ |
81 | ␊ |
82 | //_______________________________________________________________________␊ |
83 | //␊ |
84 | //␉Routine:␉FastRelString␊ |
85 | //␊ |
86 | //␉Output:␉␉returns -1 if str1 < str2␊ |
87 | //␉␉␉␉returns 1 if str1 > str2␊ |
88 | //␉␉␉␉return␉ 0 if equal␊ |
89 | //␊ |
90 | //_______________________________________________________________________␊ |
91 | ␊ |
92 | int32_t␉FastRelString(u_int8_t * str1, u_int8_t * str2)␊ |
93 | {␊ |
94 | ␉int32_t bestGuess;␊ |
95 | ␉u_int8_t length, length2;␊ |
96 | ␉␊ |
97 | #if ! UNCOMPRESED␊ |
98 | ␉InitCompareTables();␊ |
99 | #endif␊ |
100 | ␉␊ |
101 | ␉length = *(str1++);␊ |
102 | ␉length2 = *(str2++);␊ |
103 | ␉␊ |
104 | ␉if (length == length2)␊ |
105 | ␉␉bestGuess = 0;␊ |
106 | ␉else if (length < length2)␊ |
107 | ␉␉bestGuess = -1;␊ |
108 | ␉else␊ |
109 | ␉{␊ |
110 | ␉␉bestGuess = 1;␊ |
111 | ␉␉length = length2;␊ |
112 | ␉}␊ |
113 | ␉␊ |
114 | ␉while (length--)␊ |
115 | ␉{␊ |
116 | ␉␉u_int32_t aChar, bChar;␊ |
117 | ␉␉␊ |
118 | ␉␉aChar = *(str1++);␊ |
119 | ␉␉bChar = *(str2++);␊ |
120 | ␉␉␊ |
121 | ␉␉if (aChar != bChar)␉/* If they don't match exacly, do case conversion */␊ |
122 | ␉␉{␊ |
123 | ␉␉␉u_int16_t aSortWord, bSortWord;␊ |
124 | ␉␉␉␊ |
125 | ␉␉␉aSortWord = gCompareTable[aChar];␊ |
126 | ␉␉␉bSortWord = gCompareTable[bChar];␊ |
127 | ␉␉␉␊ |
128 | ␉␉␉if (aSortWord > bSortWord)␊ |
129 | ␉␉␉␉return 1;␊ |
130 | ␉␉␉␊ |
131 | ␉␉␉if (aSortWord < bSortWord)␊ |
132 | ␉␉␉␉return -1;␊ |
133 | ␉␉}␊ |
134 | ␉␉␊ |
135 | ␉␉/*␊ |
136 | ␉␉ * If characters match exactly, then go on to next character␊ |
137 | ␉␉ * immediately without doing any extra work.␊ |
138 | ␉␉ */␊ |
139 | ␉}␊ |
140 | ␉␊ |
141 | ␉/* if you got to here, then return bestGuess */␊ |
142 | ␉return bestGuess;␊ |
143 | }␉␊ |
144 | ␊ |
145 | ␊ |
146 | //␊ |
147 | //␉FastUnicodeCompare - Compare two Unicode strings; produce a relative ordering␊ |
148 | //␊ |
149 | //␉ IF␉␉␉␉RESULT␊ |
150 | //␉--------------------------␊ |
151 | //␉str1 < str2␉␉=>␉-1␊ |
152 | //␉str1 = str2␉␉=>␉ 0␊ |
153 | //␉str1 > str2␉␉=>␉+1␊ |
154 | //␊ |
155 | //␉The lower case table starts with 256 entries (one for each of the upper bytes␊ |
156 | //␉of the original Unicode char). If that entry is zero, then all characters with␊ |
157 | //␉that upper byte are already case folded. If the entry is non-zero, then it is␊ |
158 | //␉the _index_ (not byte offset) of the start of the sub-table for the characters␊ |
159 | //␉with that upper byte. All ignorable characters are folded to the value zero.␊ |
160 | //␊ |
161 | //␉In pseudocode:␊ |
162 | //␊ |
163 | //␉␉Let c = source Unicode character␊ |
164 | //␉␉Let table[] = lower case table␊ |
165 | //␊ |
166 | //␉␉lower = table[highbyte(c)]␊ |
167 | //␉␉if (lower == 0)␊ |
168 | //␉␉␉lower = c␊ |
169 | //␉␉else␊ |
170 | //␉␉␉lower = table[lower+lowbyte(c)]␊ |
171 | //␊ |
172 | //␉␉if (lower == 0)␊ |
173 | //␉␉␉ignore this character␊ |
174 | //␊ |
175 | //␉To handle ignorable characters, we now need a loop to find the next valid character.␊ |
176 | //␉Also, we can't pre-compute the number of characters to compare; the string length might␊ |
177 | //␉be larger than the number of non-ignorable characters. Further, we must be able to handle␊ |
178 | //␉ignorable characters at any point in the string, including as the first or last characters.␊ |
179 | //␉We use a zero value as a sentinel to detect both end-of-string and ignorable characters.␊ |
180 | //␉Since the File Manager doesn't prevent the NUL character (value zero) as part of a filename,␊ |
181 | //␉the case mapping table is assumed to map u+0000 to some non-zero value (like 0xFFFF, which is␊ |
182 | //␉an invalid Unicode character).␊ |
183 | //␊ |
184 | //␉Pseudocode:␊ |
185 | //␊ |
186 | //␉␉while (1) {␊ |
187 | //␉␉␉c1 = GetNextValidChar(str1)␉␉␉//␉returns zero if at end of string␊ |
188 | //␉␉␉c2 = GetNextValidChar(str2)␊ |
189 | //␊ |
190 | //␉␉␉if (c1 != c2) break␉␉␉␉␉//␉found a difference␊ |
191 | //␊ |
192 | //␉␉␉if (c1 == 0)␉␉␉␉␉␉//␉reached end of string on both strings at once?␊ |
193 | //␉␉␉␉return 0;␉␉␉␉␉␉//␉yes, so strings are equal␊ |
194 | //␉␉}␊ |
195 | //␊ |
196 | //␉␉// When we get here, c1 != c2. So, we just need to determine which one is less.␊ |
197 | //␉␉if (c1 < c2)␊ |
198 | //␉␉␉return -1;␊ |
199 | //␉␉else␊ |
200 | //␉␉␉return 1;␊ |
201 | //␊ |
202 | ␊ |
203 | int32_t FastUnicodeCompare( u_int16_t * str1, register u_int32_t length1,␊ |
204 | ␉␉␉␉␉␉ u_int16_t * str2, register u_int32_t length2, int byte_order )␊ |
205 | {␊ |
206 | ␉register u_int16_t c1,c2;␊ |
207 | ␉register u_int16_t temp;␊ |
208 | ␉␊ |
209 | #if ! UNCOMPRESSED␊ |
210 | ␉InitCompareTables();␊ |
211 | #endif␊ |
212 | ␉␊ |
213 | ␉while (1) {␊ |
214 | ␉␉/* Set default values for c1, c2 in case there are no more valid chars */␊ |
215 | ␉␉c1 = 0;␊ |
216 | ␉␉c2 = 0;␊ |
217 | ␉␉␊ |
218 | ␉␉/* Find next non-ignorable char from str1, or zero if no more */␊ |
219 | ␉␉while (length1 && c1 == 0) {␊ |
220 | ␉␉␉if (byte_order == OSBigEndian)␊ |
221 | ␉␉␉␉c1 = SWAP_BE16(*(str1++));␊ |
222 | ␉␉␉else␊ |
223 | ␉␉␉␉c1 = SWAP_LE16(*(str1++));␊ |
224 | ␉␉␉--length1;␊ |
225 | ␉␉␉if ((temp = gLowerCaseTable[c1>>8]) != 0)␉␉// is there a subtable for this upper byte?␊ |
226 | ␉␉␉␉c1 = gLowerCaseTable[temp + (c1 & 0x00FF)];␉// yes, so fold the char␊ |
227 | ␉␉}␊ |
228 | ␉␉␊ |
229 | ␉␉/* Find next non-ignorable char from str2, or zero if no more */␊ |
230 | ␉␉while (length2 && c2 == 0) {␊ |
231 | ␉␉␉if (byte_order == OSBigEndian)␊ |
232 | ␉␉␉␉c2 = SWAP_BE16(*(str2++));␊ |
233 | ␉␉␉else␊ |
234 | ␉␉␉␉c2 = SWAP_LE16(*(str2++));␊ |
235 | ␉␉␉--length2;␊ |
236 | ␉␉␉if ((temp = gLowerCaseTable[c2>>8]) != 0)␉␉// is there a subtable for this upper byte?␊ |
237 | ␉␉␉␉c2 = gLowerCaseTable[temp + (c2 & 0x00FF)];␉// yes, so fold the char␊ |
238 | ␉␉}␊ |
239 | ␉␉␊ |
240 | ␉␉if (c1 != c2)␉/* found a difference, so stop looping */␊ |
241 | ␉␉␉break;␊ |
242 | ␉␉␊ |
243 | ␉␉if (c1 == 0)␉␉/* did we reach the end of both strings at the same time? */␊ |
244 | ␉␉␉return 0;␉/* yes, so strings are equal */␊ |
245 | ␉}␊ |
246 | ␉␊ |
247 | ␉if (c1 < c2)␊ |
248 | ␉␉return -1;␊ |
249 | ␉else␊ |
250 | ␉␉return 1;␊ |
251 | }␊ |
252 | ␊ |
253 | ␊ |
254 | //␊ |
255 | // BinaryUnicodeCompare - Compare two Unicode strings; produce a relative ordering␊ |
256 | // Compared using a 16-bit binary comparison (no case folding)␊ |
257 | //␊ |
258 | int32_t BinaryUnicodeCompare (u_int16_t * str1, u_int32_t length1,␊ |
259 | u_int16_t * str2, u_int32_t length2)␊ |
260 | {␊ |
261 | ␉register u_int16_t c1, c2;␊ |
262 | ␉int32_t bestGuess;␊ |
263 | ␉u_int32_t length;␊ |
264 | ␉␊ |
265 | ␉bestGuess = 0;␊ |
266 | ␉␊ |
267 | ␉if (length1 < length2) {␊ |
268 | ␉␉length = length1;␊ |
269 | ␉␉--bestGuess;␊ |
270 | ␉} else if (length1 > length2) {␊ |
271 | ␉␉length = length2;␊ |
272 | ␉␉++bestGuess;␊ |
273 | ␉} else {␊ |
274 | ␉␉length = length1;␊ |
275 | ␉}␊ |
276 | ␉␊ |
277 | ␉while (length--) {␊ |
278 | ␉␉c1 = *(str1++);␊ |
279 | ␉␉c2 = *(str2++);␊ |
280 | ␉␉␊ |
281 | ␉␉if (c1 > c2)␊ |
282 | ␉␉␉return (1);␊ |
283 | ␉␉if (c1 < c2)␊ |
284 | ␉␉␉return (-1);␊ |
285 | ␉}␊ |
286 | ␉␊ |
287 | ␉return (bestGuess);␊ |
288 | }␊ |
289 | ␊ |
290 | ␊ |
291 | /*␊ |
292 | * UTF-8 (UCS Transformation Format)␊ |
293 | *␊ |
294 | * The following subset of UTF-8 is used to encode UCS-2 filenames. It␊ |
295 | * requires a maximum of three 3 bytes per UCS-2 character. Only the␊ |
296 | * shortest encoding required to represent the significant UCS-2 bits␊ |
297 | * is legal.␊ |
298 | * ␊ |
299 | * UTF-8 Multibyte Codes␊ |
300 | *␊ |
301 | * Bytes Bits UCS-2 Min UCS-2 Max UTF-8 Byte Sequence (binary)␊ |
302 | * -------------------------------------------------------------------␊ |
303 | * 1 7 0x0000 0x007F 0xxxxxxx␊ |
304 | * 2 11 0x0080 0x07FF 110xxxxx 10xxxxxx␊ |
305 | * 3 16 0x0800 0xFFFF 1110xxxx 10xxxxxx 10xxxxxx␊ |
306 | * -------------------------------------------------------------------␊ |
307 | */␊ |
308 | ␊ |
309 | ␊ |
310 | /*␊ |
311 | * utf_encodestr - Encodes the UCS-2 (Unicode) string at ucsp into a␊ |
312 | * null terminated UTF-8 string at utf8p.␊ |
313 | *␊ |
314 | * ucslen is the number of UCS-2 input characters (not bytes)␊ |
315 | * bufsize is the size of the output buffer in bytes␊ |
316 | */␊ |
317 | void␊ |
318 | utf_encodestr( const u_int16_t * ucsp, int ucslen,␊ |
319 | ␉␉␉ u_int8_t * utf8p, u_int32_t bufsize, int byte_order )␊ |
320 | {␊ |
321 | ␉u_int8_t *bufend;␊ |
322 | ␉u_int16_t ucs_ch;␊ |
323 | ␉␊ |
324 | ␉bufend = utf8p + bufsize;␊ |
325 | ␉␊ |
326 | ␉while (ucslen-- > 0) {␊ |
327 | ␉␉if (byte_order == OSBigEndian)␊ |
328 | ␉␉ ucs_ch = SWAP_BE16(*ucsp++);␊ |
329 | ␉␉else␊ |
330 | ␉␉␉ucs_ch = SWAP_LE16(*ucsp++);␊ |
331 | ␉␉␊ |
332 | ␉␉if (ucs_ch < 0x0080) {␊ |
333 | ␉␉␉if (utf8p >= bufend)␊ |
334 | ␉␉␉␉break;␊ |
335 | ␉␉␉if (ucs_ch == '\0')␊ |
336 | ␉␉␉␉continue;␉/* skip over embedded NULLs */␊ |
337 | ␉␉␉*utf8p++ = ucs_ch;␊ |
338 | ␉␉␉␊ |
339 | ␉␉} else if (ucs_ch < 0x800) {␊ |
340 | ␉␉␉if ((utf8p + 1) >= bufend)␊ |
341 | ␉␉␉␉break;␊ |
342 | ␉␉␉*utf8p++ = (ucs_ch >> 6) | 0xc0;␊ |
343 | ␉␉␉*utf8p++ = (ucs_ch & 0x3f) | 0x80;␊ |
344 | ␉␉␉␊ |
345 | ␉␉} else {␊ |
346 | ␉␉␉if ((utf8p + 2) >= bufend)␊ |
347 | ␉␉␉␉break;␊ |
348 | ␉␉␉*utf8p++ = (ucs_ch >> 12) | 0xe0;␊ |
349 | ␉␉␉*utf8p++ = ((ucs_ch >> 6) & 0x3f) | 0x80;␊ |
350 | ␉␉␉*utf8p++ = ((ucs_ch) & 0x3f) | 0x80;␊ |
351 | ␉␉}␊ |
352 | ␉}␊ |
353 | ␉␊ |
354 | ␉*utf8p = '\0';␊ |
355 | }␊ |
356 | ␊ |
357 | ␊ |
358 | /*␊ |
359 | * utf_decodestr - Decodes the null terminated UTF-8 string at␊ |
360 | * utf8p into a UCS-2 (Unicode) string at ucsp.␊ |
361 | *␊ |
362 | * ucslen is the number of UCS-2 output characters (not bytes)␊ |
363 | * bufsize is the size of the output buffer in bytes␊ |
364 | */␊ |
365 | void utf_decodestr(const u_int8_t * utf8p, u_int16_t * ucsp, u_int16_t * ucslen, u_int32_t bufsize, int byte_order)␊ |
366 | {␊ |
367 | ␉u_int16_t *bufstart;␊ |
368 | ␉u_int16_t *bufend;␊ |
369 | ␉u_int16_t ucs_ch;␊ |
370 | ␉u_int8_t byte;␊ |
371 | ␉␊ |
372 | ␉bufstart = ucsp;␊ |
373 | ␉bufend = (u_int16_t *)((u_int8_t *)ucsp + bufsize);␊ |
374 | ␉␊ |
375 | ␉while ((byte = *utf8p++) != '\0') {␊ |
376 | ␉␉if (ucsp >= bufend)␊ |
377 | ␉␉␉break;␊ |
378 | ␉␉␊ |
379 | ␉␉/* check for ascii */␊ |
380 | ␉␉if (byte < 0x80) {␊ |
381 | ␉␉␉ucs_ch = byte;␊ |
382 | ␉␉␉␊ |
383 | ␉␉␉if (byte_order == OSBigEndian)␊ |
384 | ␉␉␉␉*ucsp++ = SWAP_BE16(ucs_ch);␊ |
385 | ␉␉␉else␊ |
386 | ␉␉␉␉*ucsp++ = SWAP_LE16(ucs_ch);␊ |
387 | ␉␉␉␊ |
388 | ␉␉␉continue;␊ |
389 | ␉␉}␊ |
390 | ␉␉␊ |
391 | ␉␉switch (byte & 0xf0) {␊ |
392 | ␉␉␉␉/* 2 byte sequence*/␊ |
393 | ␉␉␉case 0xc0:␊ |
394 | ␉␉␉case 0xd0:␊ |
395 | ␉␉␉␉/* extract bits 6 - 10 from first byte */␊ |
396 | ␉␉␉␉ucs_ch = (byte & 0x1F) << 6; ␊ |
397 | ␉␉␉␉break;␊ |
398 | ␉␉␉␉/* 3 byte sequence*/␊ |
399 | ␉␉␉case 0xe0:␊ |
400 | ␉␉␉␉/* extract bits 12 - 15 from first byte */␊ |
401 | ␉␉␉␉ucs_ch = (byte & 0x0F) << 6;␊ |
402 | ␉␉␉␉␊ |
403 | ␉␉␉␉/* extract bits 6 - 11 from second byte */␊ |
404 | ␉␉␉␉if (((byte = *utf8p++) & 0xc0) != 0x80)␊ |
405 | ␉␉␉␉␉goto stop;␊ |
406 | ␉␉␉␉␊ |
407 | ␉␉␉␉ucs_ch += (byte & 0x3F);␊ |
408 | ␉␉␉␉ucs_ch <<= 6;␊ |
409 | ␉␉␉␉break;␊ |
410 | ␉␉␉default:␊ |
411 | ␉␉␉␉goto stop;␊ |
412 | ␉␉}␊ |
413 | ␉␉␊ |
414 | ␉␉/* extract bits 0 - 5 from final byte */␊ |
415 | ␉␉if (((byte = *utf8p++) & 0xc0) != 0x80)␊ |
416 | ␉␉␉goto stop;␊ |
417 | ␉␉ucs_ch += (byte & 0x3F); ␊ |
418 | ␉␉␊ |
419 | ␉␉if (byte_order == OSBigEndian)␊ |
420 | ␉␉␉*ucsp++ = SWAP_BE16(ucs_ch);␊ |
421 | ␉␉else␊ |
422 | ␉␉␉*ucsp++ = SWAP_LE16(ucs_ch);␊ |
423 | ␉}␊ |
424 | stop:␊ |
425 | ␉if (byte_order == OSBigEndian)␊ |
426 | ␉␉*ucslen = SWAP_BE16(ucsp - bufstart);␊ |
427 | ␉else␊ |
428 | ␉␉*ucslen = SWAP_LE16(ucsp - bufstart);␊ |
429 | }␊ |
430 | |