1 | /*␊ |
2 | * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved.␊ |
3 | *␊ |
4 | * @APPLE_LICENSE_HEADER_START@␊ |
5 | * ␊ |
6 | * The contents of this file constitute Original Code as defined in and␊ |
7 | * are subject to the Apple Public Source License Version 2.0 (the␊ |
8 | * "License"). You may not use this file except in compliance with the␊ |
9 | * License. Please obtain a copy of the License at␊ |
10 | * http://www.apple.com/publicsource and read it before using this file.␊ |
11 | * ␊ |
12 | * This Original Code and all software distributed under the License are␊ |
13 | * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER␊ |
14 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,␊ |
15 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,␊ |
16 | * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the␊ |
17 | * License for the specific language governing rights and limitations␊ |
18 | * under the License.␊ |
19 | * ␊ |
20 | * @APPLE_LICENSE_HEADER_END@␊ |
21 | */␊ |
22 | /*␊ |
23 | * HFSCompare.c - Functions for working with and comparing HFS nams.␊ |
24 | *␊ |
25 | * Copyright (c) 1999-2000 Apple Computer, Inc.␊ |
26 | *␊ |
27 | * DRI: Josh de Cesare␊ |
28 | */␊ |
29 | ␊ |
30 | //#include <sl.h>␊ |
31 | #include "sl.h" //Azi:???␊ |
32 | #include "hfs_CaseTables.h"␊ |
33 | ␊ |
34 | #if ! UNCOMPRESSED␊ |
35 | ␊ |
36 | static unsigned short *␊ |
37 | UncompressStructure(struct compressed_block *bp, int count, int size)␊ |
38 | {␊ |
39 | unsigned short *out = malloc(size);␊ |
40 | unsigned short *op = out;␊ |
41 | unsigned short data;␊ |
42 | int i, j;␊ |
43 | ␊ |
44 | for (i=0; i<count; i++, bp++) {␊ |
45 | // If this happens (it shouldn't) please fix size and/or double check that count really is␊ |
46 | // the number of elements in the array.␊ |
47 | // This was a very hard bug to find, so please leave this code here.␊ |
48 | if(out + size <= op + bp->count)␊ |
49 | {␊ |
50 | stop("HFS+ Unicode tables are malformed\n");␊ |
51 | }␊ |
52 | data = bp->data;␊ |
53 | for (j=0; j<bp->count; j++) {␊ |
54 | *op++ = data;␊ |
55 | if (bp->type == kTypeAscending) data++;␊ |
56 | else if (bp->type == kTypeAscending256) data += 256;␊ |
57 | }␊ |
58 | }␊ |
59 | return out;␊ |
60 | }␊ |
61 | ␊ |
62 | static void␊ |
63 | InitCompareTables(void)␊ |
64 | {␊ |
65 | if (gCompareTable == 0) {␊ |
66 | gCompareTable = UncompressStructure(gCompareTableCompressed,␊ |
67 | kCompareTableNBlocks, kCompareTableDataSize);␊ |
68 | gLowerCaseTable = UncompressStructure(gLowerCaseTableCompressed,␊ |
69 | kLowerCaseTableNBlocks, kLowerCaseTableDataSize);␊ |
70 | }␊ |
71 | }␊ |
72 | ␊ |
73 | #endif /* ! UNCOMPRESSED */␊ |
74 | ␊ |
75 | //_______________________________________________________________________␊ |
76 | //␊ |
77 | //␉Routine:␉FastRelString␊ |
78 | //␊ |
79 | //␉Output:␉␉returns -1 if str1 < str2␊ |
80 | //␉␉␉␉returns 1 if str1 > str2␊ |
81 | //␉␉␉␉return␉ 0 if equal␊ |
82 | //␊ |
83 | //_______________________________________________________________________␊ |
84 | ␊ |
85 | int32_t␉FastRelString(u_int8_t * str1, u_int8_t * str2)␊ |
86 | {␊ |
87 | ␉int32_t bestGuess;␊ |
88 | ␉u_int8_t length, length2;␊ |
89 | ␊ |
90 | #if ! UNCOMPRESED␊ |
91 | InitCompareTables();␊ |
92 | #endif␊ |
93 | ␊ |
94 | ␉length = *(str1++);␊ |
95 | ␉length2 = *(str2++);␊ |
96 | ␊ |
97 | ␉if (length == length2)␊ |
98 | ␉␉bestGuess = 0;␊ |
99 | ␉else if (length < length2)␊ |
100 | ␉␉bestGuess = -1;␊ |
101 | ␉else␊ |
102 | ␉{␊ |
103 | ␉␉bestGuess = 1;␊ |
104 | ␉␉length = length2;␊ |
105 | ␉}␊ |
106 | ␊ |
107 | ␉while (length--)␊ |
108 | ␉{␊ |
109 | ␉␉u_int32_t aChar, bChar;␊ |
110 | ␊ |
111 | ␉␉aChar = *(str1++);␊ |
112 | ␉␉bChar = *(str2++);␊ |
113 | ␉␉␊ |
114 | ␉␉if (aChar != bChar)␉/* If they don't match exacly, do case conversion */␊ |
115 | ␉␉{␊ |
116 | ␉␉␉u_int16_t aSortWord, bSortWord;␊ |
117 | ␊ |
118 | ␉␉␉aSortWord = gCompareTable[aChar];␊ |
119 | ␉␉␉bSortWord = gCompareTable[bChar];␊ |
120 | ␊ |
121 | ␉␉␉if (aSortWord > bSortWord)␊ |
122 | ␉␉␉␉return 1;␊ |
123 | ␊ |
124 | ␉␉␉if (aSortWord < bSortWord)␊ |
125 | ␉␉␉␉return -1;␊ |
126 | ␉␉}␊ |
127 | ␉␉␊ |
128 | ␉␉/*␊ |
129 | ␉␉ * If characters match exactly, then go on to next character␊ |
130 | ␉␉ * immediately without doing any extra work.␊ |
131 | ␉␉ */␊ |
132 | ␉}␊ |
133 | ␉␊ |
134 | ␉/* if you got to here, then return bestGuess */␊ |
135 | ␉return bestGuess;␊ |
136 | }␉␊ |
137 | ␊ |
138 | ␊ |
139 | //␊ |
140 | //␉FastUnicodeCompare - Compare two Unicode strings; produce a relative ordering␊ |
141 | //␊ |
142 | //␉ IF␉␉␉␉RESULT␊ |
143 | //␉--------------------------␊ |
144 | //␉str1 < str2␉␉=>␉-1␊ |
145 | //␉str1 = str2␉␉=>␉ 0␊ |
146 | //␉str1 > str2␉␉=>␉+1␊ |
147 | //␊ |
148 | //␉The lower case table starts with 256 entries (one for each of the upper bytes␊ |
149 | //␉of the original Unicode char). If that entry is zero, then all characters with␊ |
150 | //␉that upper byte are already case folded. If the entry is non-zero, then it is␊ |
151 | //␉the _index_ (not byte offset) of the start of the sub-table for the characters␊ |
152 | //␉with that upper byte. All ignorable characters are folded to the value zero.␊ |
153 | //␊ |
154 | //␉In pseudocode:␊ |
155 | //␊ |
156 | //␉␉Let c = source Unicode character␊ |
157 | //␉␉Let table[] = lower case table␊ |
158 | //␊ |
159 | //␉␉lower = table[highbyte(c)]␊ |
160 | //␉␉if (lower == 0)␊ |
161 | //␉␉␉lower = c␊ |
162 | //␉␉else␊ |
163 | //␉␉␉lower = table[lower+lowbyte(c)]␊ |
164 | //␊ |
165 | //␉␉if (lower == 0)␊ |
166 | //␉␉␉ignore this character␊ |
167 | //␊ |
168 | //␉To handle ignorable characters, we now need a loop to find the next valid character.␊ |
169 | //␉Also, we can't pre-compute the number of characters to compare; the string length might␊ |
170 | //␉be larger than the number of non-ignorable characters. Further, we must be able to handle␊ |
171 | //␉ignorable characters at any point in the string, including as the first or last characters.␊ |
172 | //␉We use a zero value as a sentinel to detect both end-of-string and ignorable characters.␊ |
173 | //␉Since the File Manager doesn't prevent the NUL character (value zero) as part of a filename,␊ |
174 | //␉the case mapping table is assumed to map u+0000 to some non-zero value (like 0xFFFF, which is␊ |
175 | //␉an invalid Unicode character).␊ |
176 | //␊ |
177 | //␉Pseudocode:␊ |
178 | //␊ |
179 | //␉␉while (1) {␊ |
180 | //␉␉␉c1 = GetNextValidChar(str1)␉␉␉//␉returns zero if at end of string␊ |
181 | //␉␉␉c2 = GetNextValidChar(str2)␊ |
182 | //␊ |
183 | //␉␉␉if (c1 != c2) break␉␉␉␉␉//␉found a difference␊ |
184 | //␊ |
185 | //␉␉␉if (c1 == 0)␉␉␉␉␉␉//␉reached end of string on both strings at once?␊ |
186 | //␉␉␉␉return 0;␉␉␉␉␉␉//␉yes, so strings are equal␊ |
187 | //␉␉}␊ |
188 | //␊ |
189 | //␉␉// When we get here, c1 != c2. So, we just need to determine which one is less.␊ |
190 | //␉␉if (c1 < c2)␊ |
191 | //␉␉␉return -1;␊ |
192 | //␉␉else␊ |
193 | //␉␉␉return 1;␊ |
194 | //␊ |
195 | ␊ |
196 | int32_t FastUnicodeCompare( u_int16_t * str1, register u_int32_t length1,␊ |
197 | u_int16_t * str2, register u_int32_t length2, int byte_order )␊ |
198 | {␊ |
199 | ␉register u_int16_t c1,c2;␊ |
200 | ␉register u_int16_t temp;␊ |
201 | ␊ |
202 | #if ! UNCOMPRESSED␊ |
203 | InitCompareTables();␊ |
204 | #endif␊ |
205 | ␊ |
206 | ␉while (1) {␊ |
207 | ␉␉/* Set default values for c1, c2 in case there are no more valid chars */␊ |
208 | ␉␉c1 = 0;␊ |
209 | ␉␉c2 = 0;␊ |
210 | ␊ |
211 | ␉␉/* Find next non-ignorable char from str1, or zero if no more */␊ |
212 | ␉␉while (length1 && c1 == 0) {␊ |
213 | ␉␉␉if (byte_order == OSBigEndian)␊ |
214 | ␉␉␉␉c1 = SWAP_BE16(*(str1++));␊ |
215 | ␉␉␉else␊ |
216 | ␉␉␉␉c1 = SWAP_LE16(*(str1++));␊ |
217 | ␉␉␉--length1;␊ |
218 | ␉␉␉if ((temp = gLowerCaseTable[c1>>8]) != 0)␉␉// is there a subtable for this upper byte?␊ |
219 | ␉␉␉␉c1 = gLowerCaseTable[temp + (c1 & 0x00FF)];␉// yes, so fold the char␊ |
220 | ␉␉}␊ |
221 | ␊ |
222 | ␉␉/* Find next non-ignorable char from str2, or zero if no more */␊ |
223 | ␉␉while (length2 && c2 == 0) {␊ |
224 | ␉␉␉if (byte_order == OSBigEndian)␊ |
225 | ␉␉␉␉c2 = SWAP_BE16(*(str2++));␊ |
226 | ␉␉␉else␊ |
227 | ␉␉␉␉c2 = SWAP_LE16(*(str2++));␊ |
228 | ␉␉␉--length2;␊ |
229 | ␉␉␉if ((temp = gLowerCaseTable[c2>>8]) != 0)␉␉// is there a subtable for this upper byte?␊ |
230 | ␉␉␉␉c2 = gLowerCaseTable[temp + (c2 & 0x00FF)];␉// yes, so fold the char␊ |
231 | ␉␉}␊ |
232 | ␊ |
233 | ␉␉if (c1 != c2)␉/* found a difference, so stop looping */␊ |
234 | ␉␉␉break;␊ |
235 | ␉␉␊ |
236 | ␉␉if (c1 == 0)␉␉/* did we reach the end of both strings at the same time? */␊ |
237 | ␉␉␉return 0;␉/* yes, so strings are equal */␊ |
238 | ␉}␊ |
239 | ␉␊ |
240 | ␉if (c1 < c2)␊ |
241 | ␉␉return -1;␊ |
242 | ␉else␊ |
243 | ␉␉return 1;␊ |
244 | }␊ |
245 | ␊ |
246 | ␊ |
247 | //␊ |
248 | // BinaryUnicodeCompare - Compare two Unicode strings; produce a relative ordering␊ |
249 | // Compared using a 16-bit binary comparison (no case folding)␊ |
250 | //␊ |
251 | int32_t BinaryUnicodeCompare (u_int16_t * str1, u_int32_t length1,␊ |
252 | u_int16_t * str2, u_int32_t length2)␊ |
253 | {␊ |
254 | register u_int16_t c1, c2;␊ |
255 | int32_t bestGuess;␊ |
256 | u_int32_t length;␊ |
257 | ␊ |
258 | bestGuess = 0;␊ |
259 | ␊ |
260 | if (length1 < length2) {␊ |
261 | length = length1;␊ |
262 | --bestGuess;␊ |
263 | } else if (length1 > length2) {␊ |
264 | length = length2;␊ |
265 | ++bestGuess;␊ |
266 | } else {␊ |
267 | length = length1;␊ |
268 | }␊ |
269 | ␊ |
270 | while (length--) {␊ |
271 | c1 = *(str1++);␊ |
272 | c2 = *(str2++);␊ |
273 | ␊ |
274 | if (c1 > c2)␊ |
275 | return (1);␊ |
276 | if (c1 < c2)␊ |
277 | return (-1);␊ |
278 | }␊ |
279 | ␊ |
280 | return (bestGuess);␊ |
281 | }␊ |
282 | ␊ |
283 | ␊ |
284 | /*␊ |
285 | * UTF-8 (UCS Transformation Format)␊ |
286 | *␊ |
287 | * The following subset of UTF-8 is used to encode UCS-2 filenames. It␊ |
288 | * requires a maximum of three 3 bytes per UCS-2 character. Only the␊ |
289 | * shortest encoding required to represent the significant UCS-2 bits␊ |
290 | * is legal.␊ |
291 | * ␊ |
292 | * UTF-8 Multibyte Codes␊ |
293 | *␊ |
294 | * Bytes Bits UCS-2 Min UCS-2 Max UTF-8 Byte Sequence (binary)␊ |
295 | * -------------------------------------------------------------------␊ |
296 | * 1 7 0x0000 0x007F 0xxxxxxx␊ |
297 | * 2 11 0x0080 0x07FF 110xxxxx 10xxxxxx␊ |
298 | * 3 16 0x0800 0xFFFF 1110xxxx 10xxxxxx 10xxxxxx␊ |
299 | * -------------------------------------------------------------------␊ |
300 | */␊ |
301 | ␊ |
302 | ␊ |
303 | /*␊ |
304 | * utf_encodestr - Encodes the UCS-2 (Unicode) string at ucsp into a␊ |
305 | * null terminated UTF-8 string at utf8p.␊ |
306 | *␊ |
307 | * ucslen is the number of UCS-2 input characters (not bytes)␊ |
308 | * bufsize is the size of the output buffer in bytes␊ |
309 | */␊ |
310 | void␊ |
311 | utf_encodestr( const u_int16_t * ucsp, int ucslen,␊ |
312 | u_int8_t * utf8p, u_int32_t bufsize, int byte_order )␊ |
313 | {␊ |
314 | ␉u_int8_t *bufend;␊ |
315 | ␉u_int16_t ucs_ch;␊ |
316 | ␊ |
317 | ␉bufend = utf8p + bufsize;␊ |
318 | ␊ |
319 | ␉while (ucslen-- > 0) {␊ |
320 | if (byte_order == OSBigEndian)␊ |
321 | ␉␉ ucs_ch = SWAP_BE16(*ucsp++);␊ |
322 | else␊ |
323 | ucs_ch = SWAP_LE16(*ucsp++);␊ |
324 | ␊ |
325 | ␉␉if (ucs_ch < 0x0080) {␊ |
326 | ␉␉␉if (utf8p >= bufend)␊ |
327 | ␉␉␉␉break;␊ |
328 | ␉␉␉if (ucs_ch == '\0')␊ |
329 | ␉␉␉␉continue;␉/* skip over embedded NULLs */␊ |
330 | ␉␉␉*utf8p++ = ucs_ch;␊ |
331 | ␊ |
332 | ␉␉} else if (ucs_ch < 0x800) {␊ |
333 | ␉␉␉if ((utf8p + 1) >= bufend)␊ |
334 | ␉␉␉␉break;␊ |
335 | ␉␉␉*utf8p++ = (ucs_ch >> 6) | 0xc0;␊ |
336 | ␉␉␉*utf8p++ = (ucs_ch & 0x3f) | 0x80;␊ |
337 | ␊ |
338 | ␉␉} else {␊ |
339 | ␉␉␉if ((utf8p + 2) >= bufend)␊ |
340 | ␉␉␉␉break;␊ |
341 | ␉␉␉*utf8p++ = (ucs_ch >> 12) | 0xe0;␊ |
342 | ␉␉␉*utf8p++ = ((ucs_ch >> 6) & 0x3f) | 0x80;␊ |
343 | ␉␉␉*utf8p++ = ((ucs_ch) & 0x3f) | 0x80;␊ |
344 | ␉␉}␊ |
345 | ␉}␊ |
346 | ␊ |
347 | ␉*utf8p = '\0';␊ |
348 | }␊ |
349 | ␊ |
350 | ␊ |
351 | /*␊ |
352 | * utf_decodestr - Decodes the null terminated UTF-8 string at␊ |
353 | * utf8p into a UCS-2 (Unicode) string at ucsp.␊ |
354 | *␊ |
355 | * ucslen is the number of UCS-2 output characters (not bytes)␊ |
356 | * bufsize is the size of the output buffer in bytes␊ |
357 | */␊ |
358 | void utf_decodestr(const u_int8_t * utf8p, u_int16_t * ucsp, u_int16_t * ucslen, u_int32_t bufsize, int byte_order)␊ |
359 | {␊ |
360 | ␉u_int16_t *bufstart;␊ |
361 | ␉u_int16_t *bufend;␊ |
362 | ␉u_int16_t ucs_ch;␊ |
363 | ␉u_int8_t byte;␊ |
364 | ␊ |
365 | ␉bufstart = ucsp;␊ |
366 | ␉bufend = (u_int16_t *)((u_int8_t *)ucsp + bufsize);␊ |
367 | ␊ |
368 | ␉while ((byte = *utf8p++) != '\0') {␊ |
369 | ␉␉if (ucsp >= bufend)␊ |
370 | ␉␉␉break;␊ |
371 | ␊ |
372 | ␉␉/* check for ascii */␊ |
373 | ␉␉if (byte < 0x80) {␊ |
374 | ␉␉␉ucs_ch = byte;␊ |
375 | ␉␉␉␊ |
376 | if (byte_order == OSBigEndian)␊ |
377 | ␉␉ *ucsp++ = SWAP_BE16(ucs_ch);␊ |
378 | else␊ |
379 | *ucsp++ = SWAP_LE16(ucs_ch);␊ |
380 | ␉␉␉␊ |
381 | ␉␉␉continue;␊ |
382 | ␉␉}␊ |
383 | ␊ |
384 | ␉␉switch (byte & 0xf0) {␊ |
385 | ␉␉/* 2 byte sequence*/␊ |
386 | ␉␉case 0xc0:␊ |
387 | ␉␉case 0xd0:␊ |
388 | ␉␉␉/* extract bits 6 - 10 from first byte */␊ |
389 | ␉␉␉ucs_ch = (byte & 0x1F) << 6; ␊ |
390 | ␉␉␉break;␊ |
391 | ␉␉/* 3 byte sequence*/␊ |
392 | ␉␉case 0xe0:␊ |
393 | ␉␉␉/* extract bits 12 - 15 from first byte */␊ |
394 | ␉␉␉ucs_ch = (byte & 0x0F) << 6;␊ |
395 | ␊ |
396 | ␉␉␉/* extract bits 6 - 11 from second byte */␊ |
397 | ␉␉␉if (((byte = *utf8p++) & 0xc0) != 0x80)␊ |
398 | ␉␉␉␉goto stop;␊ |
399 | ␊ |
400 | ␉␉␉ucs_ch += (byte & 0x3F);␊ |
401 | ␉␉␉ucs_ch <<= 6;␊ |
402 | ␉␉␉break;␊ |
403 | ␉␉default:␊ |
404 | ␉␉␉goto stop;␊ |
405 | ␉␉}␊ |
406 | ␊ |
407 | ␉␉/* extract bits 0 - 5 from final byte */␊ |
408 | ␉␉if (((byte = *utf8p++) & 0xc0) != 0x80)␊ |
409 | ␉␉␉goto stop;␊ |
410 | ␉␉ucs_ch += (byte & 0x3F); ␊ |
411 | ␊ |
412 | if (byte_order == OSBigEndian)␊ |
413 | *ucsp++ = SWAP_BE16(ucs_ch);␊ |
414 | else␊ |
415 | *ucsp++ = SWAP_LE16(ucs_ch);␊ |
416 | ␉}␊ |
417 | stop:␊ |
418 | if (byte_order == OSBigEndian)␊ |
419 | *ucslen = SWAP_BE16(ucsp - bufstart);␊ |
420 | else␊ |
421 | *ucslen = SWAP_LE16(ucsp - bufstart);␊ |
422 | }␊ |
423 | |