1 | /*␊ |
2 | * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved.␊ |
3 | *␊ |
4 | * @APPLE_LICENSE_HEADER_START@␊ |
5 | * ␊ |
6 | * The contents of this file constitute Original Code as defined in and␊ |
7 | * are subject to the Apple Public Source License Version 2.0 (the␊ |
8 | * "License"). You may not use this file except in compliance with the␊ |
9 | * License. Please obtain a copy of the License at␊ |
10 | * http://www.apple.com/publicsource and read it before using this file.␊ |
11 | * ␊ |
12 | * This Original Code and all software distributed under the License are␊ |
13 | * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER␊ |
14 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,␊ |
15 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,␊ |
16 | * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the␊ |
17 | * License for the specific language governing rights and limitations␊ |
18 | * under the License.␊ |
19 | * ␊ |
20 | * @APPLE_LICENSE_HEADER_END@␊ |
21 | */␊ |
22 | /*␊ |
23 | * HFSCompare.c - Functions for working with and comparing HFS nams.␊ |
24 | *␊ |
25 | * Copyright (c) 1999-2000 Apple Computer, Inc.␊ |
26 | *␊ |
27 | * DRI: Josh de Cesare␊ |
28 | */␊ |
29 | ␊ |
30 | #include "sl.h"␊ |
31 | #include "hfs_CaseTables.h"␊ |
32 | #include "hfs.h"␊ |
33 | ␊ |
34 | #if ! UNCOMPRESSED␊ |
35 | ␊ |
36 | static unsigned short *␊ |
37 | UncompressStructure(struct compressed_block *bp, int count, int size)␊ |
38 | {␊ |
39 | unsigned short *out = malloc(size);␊ |
40 | ␉if (!out) ␊ |
41 | ␉{␊ |
42 | ␉␉stop("UncompressStructure unable to allocate memory\n");␊ |
43 | ␉␉return 0;␊ |
44 | ␉}␊ |
45 | unsigned short *op = out;␊ |
46 | unsigned short data;␊ |
47 | int i, j;␊ |
48 | ␊ |
49 | for (i=0; i<count; i++, bp++) {␊ |
50 | // If this happens (it shouldn't) please fix size and/or double check that count really is␊ |
51 | // the number of elements in the array.␊ |
52 | // This was a very hard bug to find, so please leave this code here.␊ |
53 | if(out + size <= op + bp->count)␊ |
54 | {␊ |
55 | stop("HFS+ Unicode tables are malformed\n");␊ |
56 | return 0;␊ |
57 | }␊ |
58 | data = bp->data;␊ |
59 | for (j=0; j<bp->count; j++) {␊ |
60 | *op++ = data;␊ |
61 | if (bp->type == kTypeAscending) data++;␊ |
62 | else if (bp->type == kTypeAscending256) data += 256;␊ |
63 | }␊ |
64 | }␊ |
65 | return out;␊ |
66 | }␊ |
67 | ␊ |
68 | static void␊ |
69 | InitCompareTables(void)␊ |
70 | {␊ |
71 | if (gCompareTable == 0) {␊ |
72 | gCompareTable = UncompressStructure(gCompareTableCompressed,␊ |
73 | kCompareTableNBlocks, kCompareTableDataSize);␊ |
74 | gLowerCaseTable = UncompressStructure(gLowerCaseTableCompressed,␊ |
75 | kLowerCaseTableNBlocks, kLowerCaseTableDataSize);␊ |
76 | }␊ |
77 | }␊ |
78 | ␊ |
79 | #endif /* ! UNCOMPRESSED */␊ |
80 | ␊ |
81 | //_______________________________________________________________________␊ |
82 | //␊ |
83 | //␉Routine:␉FastRelString␊ |
84 | //␊ |
85 | //␉Output:␉␉returns -1 if str1 < str2␊ |
86 | //␉␉␉␉returns 1 if str1 > str2␊ |
87 | //␉␉␉␉return␉ 0 if equal␊ |
88 | //␊ |
89 | //_______________________________________________________________________␊ |
90 | ␊ |
91 | int32_t␉FastRelString(u_int8_t * str1, u_int8_t * str2)␊ |
92 | {␊ |
93 | ␉int32_t bestGuess;␊ |
94 | ␉u_int8_t length, length2;␊ |
95 | ␊ |
96 | #if ! UNCOMPRESED␊ |
97 | InitCompareTables();␊ |
98 | #endif␊ |
99 | ␊ |
100 | ␉length = *(str1++);␊ |
101 | ␉length2 = *(str2++);␊ |
102 | ␊ |
103 | ␉if (length == length2)␊ |
104 | ␉␉bestGuess = 0;␊ |
105 | ␉else if (length < length2)␊ |
106 | ␉␉bestGuess = -1;␊ |
107 | ␉else␊ |
108 | ␉{␊ |
109 | ␉␉bestGuess = 1;␊ |
110 | ␉␉length = length2;␊ |
111 | ␉}␊ |
112 | ␊ |
113 | ␉while (length--)␊ |
114 | ␉{␊ |
115 | ␉␉u_int32_t aChar, bChar;␊ |
116 | ␊ |
117 | ␉␉aChar = *(str1++);␊ |
118 | ␉␉bChar = *(str2++);␊ |
119 | ␉␉␊ |
120 | ␉␉if (aChar != bChar)␉/* If they don't match exacly, do case conversion */␊ |
121 | ␉␉{␊ |
122 | ␉␉␉u_int16_t aSortWord, bSortWord;␊ |
123 | ␊ |
124 | ␉␉␉aSortWord = gCompareTable[aChar];␊ |
125 | ␉␉␉bSortWord = gCompareTable[bChar];␊ |
126 | ␊ |
127 | ␉␉␉if (aSortWord > bSortWord)␊ |
128 | ␉␉␉␉return 1;␊ |
129 | ␊ |
130 | ␉␉␉if (aSortWord < bSortWord)␊ |
131 | ␉␉␉␉return -1;␊ |
132 | ␉␉}␊ |
133 | ␉␉␊ |
134 | ␉␉/*␊ |
135 | ␉␉ * If characters match exactly, then go on to next character␊ |
136 | ␉␉ * immediately without doing any extra work.␊ |
137 | ␉␉ */␊ |
138 | ␉}␊ |
139 | ␉␊ |
140 | ␉/* if you got to here, then return bestGuess */␊ |
141 | ␉return bestGuess;␊ |
142 | }␉␊ |
143 | ␊ |
144 | ␊ |
145 | //␊ |
146 | //␉FastUnicodeCompare - Compare two Unicode strings; produce a relative ordering␊ |
147 | //␊ |
148 | //␉ IF␉␉␉␉RESULT␊ |
149 | //␉--------------------------␊ |
150 | //␉str1 < str2␉␉=>␉-1␊ |
151 | //␉str1 = str2␉␉=>␉ 0␊ |
152 | //␉str1 > str2␉␉=>␉+1␊ |
153 | //␊ |
154 | //␉The lower case table starts with 256 entries (one for each of the upper bytes␊ |
155 | //␉of the original Unicode char). If that entry is zero, then all characters with␊ |
156 | //␉that upper byte are already case folded. If the entry is non-zero, then it is␊ |
157 | //␉the _index_ (not byte offset) of the start of the sub-table for the characters␊ |
158 | //␉with that upper byte. All ignorable characters are folded to the value zero.␊ |
159 | //␊ |
160 | //␉In pseudocode:␊ |
161 | //␊ |
162 | //␉␉Let c = source Unicode character␊ |
163 | //␉␉Let table[] = lower case table␊ |
164 | //␊ |
165 | //␉␉lower = table[highbyte(c)]␊ |
166 | //␉␉if (lower == 0)␊ |
167 | //␉␉␉lower = c␊ |
168 | //␉␉else␊ |
169 | //␉␉␉lower = table[lower+lowbyte(c)]␊ |
170 | //␊ |
171 | //␉␉if (lower == 0)␊ |
172 | //␉␉␉ignore this character␊ |
173 | //␊ |
174 | //␉To handle ignorable characters, we now need a loop to find the next valid character.␊ |
175 | //␉Also, we can't pre-compute the number of characters to compare; the string length might␊ |
176 | //␉be larger than the number of non-ignorable characters. Further, we must be able to handle␊ |
177 | //␉ignorable characters at any point in the string, including as the first or last characters.␊ |
178 | //␉We use a zero value as a sentinel to detect both end-of-string and ignorable characters.␊ |
179 | //␉Since the File Manager doesn't prevent the NUL character (value zero) as part of a filename,␊ |
180 | //␉the case mapping table is assumed to map u+0000 to some non-zero value (like 0xFFFF, which is␊ |
181 | //␉an invalid Unicode character).␊ |
182 | //␊ |
183 | //␉Pseudocode:␊ |
184 | //␊ |
185 | //␉␉while (1) {␊ |
186 | //␉␉␉c1 = GetNextValidChar(str1)␉␉␉//␉returns zero if at end of string␊ |
187 | //␉␉␉c2 = GetNextValidChar(str2)␊ |
188 | //␊ |
189 | //␉␉␉if (c1 != c2) break␉␉␉␉␉//␉found a difference␊ |
190 | //␊ |
191 | //␉␉␉if (c1 == 0)␉␉␉␉␉␉//␉reached end of string on both strings at once?␊ |
192 | //␉␉␉␉return 0;␉␉␉␉␉␉//␉yes, so strings are equal␊ |
193 | //␉␉}␊ |
194 | //␊ |
195 | //␉␉// When we get here, c1 != c2. So, we just need to determine which one is less.␊ |
196 | //␉␉if (c1 < c2)␊ |
197 | //␉␉␉return -1;␊ |
198 | //␉␉else␊ |
199 | //␉␉␉return 1;␊ |
200 | //␊ |
201 | ␊ |
202 | int32_t FastUnicodeCompare( u_int16_t * str1, register u_int32_t length1,␊ |
203 | u_int16_t * str2, register u_int32_t length2, int byte_order )␊ |
204 | {␊ |
205 | ␉register u_int16_t c1,c2;␊ |
206 | ␉register u_int16_t temp;␊ |
207 | ␊ |
208 | #if ! UNCOMPRESSED␊ |
209 | InitCompareTables();␊ |
210 | #endif␊ |
211 | ␊ |
212 | ␉while (1) {␊ |
213 | ␉␉/* Set default values for c1, c2 in case there are no more valid chars */␊ |
214 | ␉␉c1 = 0;␊ |
215 | ␉␉c2 = 0;␊ |
216 | ␊ |
217 | ␉␉/* Find next non-ignorable char from str1, or zero if no more */␊ |
218 | ␉␉while (length1 && c1 == 0) {␊ |
219 | ␉␉␉if (byte_order == OSBigEndian)␊ |
220 | ␉␉␉␉c1 = SWAP_BE16(*(str1++));␊ |
221 | ␉␉␉else␊ |
222 | ␉␉␉␉c1 = SWAP_LE16(*(str1++));␊ |
223 | ␉␉␉--length1;␊ |
224 | ␉␉␉if ((temp = gLowerCaseTable[c1>>8]) != 0)␉␉// is there a subtable for this upper byte?␊ |
225 | ␉␉␉␉c1 = gLowerCaseTable[temp + (c1 & 0x00FF)];␉// yes, so fold the char␊ |
226 | ␉␉}␊ |
227 | ␊ |
228 | ␉␉/* Find next non-ignorable char from str2, or zero if no more */␊ |
229 | ␉␉while (length2 && c2 == 0) {␊ |
230 | ␉␉␉if (byte_order == OSBigEndian)␊ |
231 | ␉␉␉␉c2 = SWAP_BE16(*(str2++));␊ |
232 | ␉␉␉else␊ |
233 | ␉␉␉␉c2 = SWAP_LE16(*(str2++));␊ |
234 | ␉␉␉--length2;␊ |
235 | ␉␉␉if ((temp = gLowerCaseTable[c2>>8]) != 0)␉␉// is there a subtable for this upper byte?␊ |
236 | ␉␉␉␉c2 = gLowerCaseTable[temp + (c2 & 0x00FF)];␉// yes, so fold the char␊ |
237 | ␉␉}␊ |
238 | ␊ |
239 | ␉␉if (c1 != c2)␉/* found a difference, so stop looping */␊ |
240 | ␉␉␉break;␊ |
241 | ␉␉␊ |
242 | ␉␉if (c1 == 0)␉␉/* did we reach the end of both strings at the same time? */␊ |
243 | ␉␉␉return 0;␉/* yes, so strings are equal */␊ |
244 | ␉}␊ |
245 | ␉␊ |
246 | ␉if (c1 < c2)␊ |
247 | ␉␉return -1;␊ |
248 | ␉else␊ |
249 | ␉␉return 1;␊ |
250 | }␊ |
251 | ␊ |
252 | ␊ |
253 | //␊ |
254 | // BinaryUnicodeCompare - Compare two Unicode strings; produce a relative ordering␊ |
255 | // Compared using a 16-bit binary comparison (no case folding)␊ |
256 | //␊ |
257 | int32_t BinaryUnicodeCompare (u_int16_t * str1, u_int32_t length1,␊ |
258 | u_int16_t * str2, u_int32_t length2)␊ |
259 | {␊ |
260 | register u_int16_t c1, c2;␊ |
261 | int32_t bestGuess;␊ |
262 | u_int32_t length;␊ |
263 | ␊ |
264 | bestGuess = 0;␊ |
265 | ␊ |
266 | if (length1 < length2) {␊ |
267 | length = length1;␊ |
268 | --bestGuess;␊ |
269 | } else if (length1 > length2) {␊ |
270 | length = length2;␊ |
271 | ++bestGuess;␊ |
272 | } else {␊ |
273 | length = length1;␊ |
274 | }␊ |
275 | ␊ |
276 | while (length--) {␊ |
277 | c1 = *(str1++);␊ |
278 | c2 = *(str2++);␊ |
279 | ␊ |
280 | if (c1 > c2)␊ |
281 | return (1);␊ |
282 | if (c1 < c2)␊ |
283 | return (-1);␊ |
284 | }␊ |
285 | ␊ |
286 | return (bestGuess);␊ |
287 | }␊ |
288 | ␊ |
289 | ␊ |
290 | /*␊ |
291 | * UTF-8 (UCS Transformation Format)␊ |
292 | *␊ |
293 | * The following subset of UTF-8 is used to encode UCS-2 filenames. It␊ |
294 | * requires a maximum of three 3 bytes per UCS-2 character. Only the␊ |
295 | * shortest encoding required to represent the significant UCS-2 bits␊ |
296 | * is legal.␊ |
297 | * ␊ |
298 | * UTF-8 Multibyte Codes␊ |
299 | *␊ |
300 | * Bytes Bits UCS-2 Min UCS-2 Max UTF-8 Byte Sequence (binary)␊ |
301 | * -------------------------------------------------------------------␊ |
302 | * 1 7 0x0000 0x007F 0xxxxxxx␊ |
303 | * 2 11 0x0080 0x07FF 110xxxxx 10xxxxxx␊ |
304 | * 3 16 0x0800 0xFFFF 1110xxxx 10xxxxxx 10xxxxxx␊ |
305 | * -------------------------------------------------------------------␊ |
306 | */␊ |
307 | ␊ |
308 | ␊ |
309 | /*␊ |
310 | * utf_encodestr - Encodes the UCS-2 (Unicode) string at ucsp into a␊ |
311 | * null terminated UTF-8 string at utf8p.␊ |
312 | *␊ |
313 | * ucslen is the number of UCS-2 input characters (not bytes)␊ |
314 | * bufsize is the size of the output buffer in bytes␊ |
315 | */␊ |
316 | void␊ |
317 | utf_encodestr( const u_int16_t * ucsp, int ucslen,␊ |
318 | u_int8_t * utf8p, u_int32_t bufsize, int byte_order )␊ |
319 | {␊ |
320 | ␉u_int8_t *bufend;␊ |
321 | ␉u_int16_t ucs_ch;␊ |
322 | ␊ |
323 | ␉bufend = utf8p + bufsize;␊ |
324 | ␊ |
325 | ␉while (ucslen-- > 0) {␊ |
326 | if (byte_order == OSBigEndian)␊ |
327 | ␉␉ ucs_ch = SWAP_BE16(*ucsp++);␊ |
328 | else␊ |
329 | ucs_ch = SWAP_LE16(*ucsp++);␊ |
330 | ␊ |
331 | ␉␉if (ucs_ch < 0x0080) {␊ |
332 | ␉␉␉if (utf8p >= bufend)␊ |
333 | ␉␉␉␉break;␊ |
334 | ␉␉␉if (ucs_ch == '\0')␊ |
335 | ␉␉␉␉continue;␉/* skip over embedded NULLs */␊ |
336 | ␉␉␉*utf8p++ = ucs_ch;␊ |
337 | ␊ |
338 | ␉␉} else if (ucs_ch < 0x800) {␊ |
339 | ␉␉␉if ((utf8p + 1) >= bufend)␊ |
340 | ␉␉␉␉break;␊ |
341 | ␉␉␉*utf8p++ = (ucs_ch >> 6) | 0xc0;␊ |
342 | ␉␉␉*utf8p++ = (ucs_ch & 0x3f) | 0x80;␊ |
343 | ␊ |
344 | ␉␉} else {␊ |
345 | ␉␉␉if ((utf8p + 2) >= bufend)␊ |
346 | ␉␉␉␉break;␊ |
347 | ␉␉␉*utf8p++ = (ucs_ch >> 12) | 0xe0;␊ |
348 | ␉␉␉*utf8p++ = ((ucs_ch >> 6) & 0x3f) | 0x80;␊ |
349 | ␉␉␉*utf8p++ = ((ucs_ch) & 0x3f) | 0x80;␊ |
350 | ␉␉}␊ |
351 | ␉}␊ |
352 | ␊ |
353 | ␉*utf8p = '\0';␊ |
354 | }␊ |
355 | ␊ |
356 | ␊ |
357 | /*␊ |
358 | * utf_decodestr - Decodes the null terminated UTF-8 string at␊ |
359 | * utf8p into a UCS-2 (Unicode) string at ucsp.␊ |
360 | *␊ |
361 | * ucslen is the number of UCS-2 output characters (not bytes)␊ |
362 | * bufsize is the size of the output buffer in bytes␊ |
363 | */␊ |
364 | void utf_decodestr(const u_int8_t * utf8p, u_int16_t * ucsp, u_int16_t * ucslen, u_int32_t bufsize, int byte_order)␊ |
365 | {␊ |
366 | ␉u_int16_t *bufstart;␊ |
367 | ␉u_int16_t *bufend;␊ |
368 | ␉u_int16_t ucs_ch;␊ |
369 | ␉u_int8_t byte;␊ |
370 | ␊ |
371 | ␉bufstart = ucsp;␊ |
372 | ␉bufend = (u_int16_t *)((u_int8_t *)ucsp + bufsize);␊ |
373 | ␊ |
374 | ␉while ((byte = *utf8p++) != '\0') {␊ |
375 | ␉␉if (ucsp >= bufend)␊ |
376 | ␉␉␉break;␊ |
377 | ␊ |
378 | ␉␉/* check for ascii */␊ |
379 | ␉␉if (byte < 0x80) {␊ |
380 | ␉␉␉ucs_ch = byte;␊ |
381 | ␉␉␉␊ |
382 | if (byte_order == OSBigEndian)␊ |
383 | ␉␉ *ucsp++ = SWAP_BE16(ucs_ch);␊ |
384 | else␊ |
385 | *ucsp++ = SWAP_LE16(ucs_ch);␊ |
386 | ␉␉␉␊ |
387 | ␉␉␉continue;␊ |
388 | ␉␉}␊ |
389 | ␊ |
390 | ␉␉switch (byte & 0xf0) {␊ |
391 | ␉␉/* 2 byte sequence*/␊ |
392 | ␉␉case 0xc0:␊ |
393 | ␉␉case 0xd0:␊ |
394 | ␉␉␉/* extract bits 6 - 10 from first byte */␊ |
395 | ␉␉␉ucs_ch = (byte & 0x1F) << 6; ␊ |
396 | ␉␉␉break;␊ |
397 | ␉␉/* 3 byte sequence*/␊ |
398 | ␉␉case 0xe0:␊ |
399 | ␉␉␉/* extract bits 12 - 15 from first byte */␊ |
400 | ␉␉␉ucs_ch = (byte & 0x0F) << 6;␊ |
401 | ␊ |
402 | ␉␉␉/* extract bits 6 - 11 from second byte */␊ |
403 | ␉␉␉if (((byte = *utf8p++) & 0xc0) != 0x80)␊ |
404 | ␉␉␉␉goto stop;␊ |
405 | ␊ |
406 | ␉␉␉ucs_ch += (byte & 0x3F);␊ |
407 | ␉␉␉ucs_ch <<= 6;␊ |
408 | ␉␉␉break;␊ |
409 | ␉␉default:␊ |
410 | ␉␉␉goto stop;␊ |
411 | ␉␉}␊ |
412 | ␊ |
413 | ␉␉/* extract bits 0 - 5 from final byte */␊ |
414 | ␉␉if (((byte = *utf8p++) & 0xc0) != 0x80)␊ |
415 | ␉␉␉goto stop;␊ |
416 | ␉␉ucs_ch += (byte & 0x3F); ␊ |
417 | ␊ |
418 | if (byte_order == OSBigEndian)␊ |
419 | *ucsp++ = SWAP_BE16(ucs_ch);␊ |
420 | else␊ |
421 | *ucsp++ = SWAP_LE16(ucs_ch);␊ |
422 | ␉}␊ |
423 | stop:␊ |
424 | if (byte_order == OSBigEndian)␊ |
425 | *ucslen = SWAP_BE16(ucsp - bufstart);␊ |
426 | else␊ |
427 | *ucslen = SWAP_LE16(ucsp - bufstart);␊ |
428 | }␊ |
429 | |