1 | /*␊ |
2 | * Copyright (c) 2000-2003 Apple Computer, Inc. All rights reserved.␊ |
3 | *␊ |
4 | * @APPLE_LICENSE_HEADER_START@␊ |
5 | * ␊ |
6 | * The contents of this file constitute Original Code as defined in and␊ |
7 | * are subject to the Apple Public Source License Version 2.0 (the␊ |
8 | * "License"). You may not use this file except in compliance with the␊ |
9 | * License. Please obtain a copy of the License at␊ |
10 | * http://www.apple.com/publicsource and read it before using this file.␊ |
11 | * ␊ |
12 | * This Original Code and all software distributed under the License are␊ |
13 | * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER␊ |
14 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,␊ |
15 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,␊ |
16 | * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the␊ |
17 | * License for the specific language governing rights and limitations␊ |
18 | * under the License.␊ |
19 | * ␊ |
20 | * @APPLE_LICENSE_HEADER_END@␊ |
21 | */␊ |
22 | /*␊ |
23 | * HFSCompare.c - Functions for working with and comparing HFS nams.␊ |
24 | *␊ |
25 | * Copyright (c) 1999-2000 Apple Computer, Inc.␊ |
26 | *␊ |
27 | * DRI: Josh de Cesare␊ |
28 | */␊ |
29 | ␊ |
30 | #include <sl.h>␊ |
31 | #include "hfs_CaseTables.h"␊ |
32 | ␊ |
33 | #if ! UNCOMPRESSED␊ |
34 | ␊ |
35 | static unsigned short * UncompressStructure(struct compressed_block *bp, int count, int size)␊ |
36 | {␊ |
37 | unsigned short *out = malloc(size);␊ |
38 | unsigned short *op = out;␊ |
39 | unsigned short data;␊ |
40 | int i, j;␊ |
41 | ␊ |
42 | for (i=0; i<count; i++, bp++) {␊ |
43 | // If this happens (it shouldn't) please fix size and/or double check that count really is␊ |
44 | // the number of elements in the array.␊ |
45 | // This was a very hard bug to find, so please leave this code here.␊ |
46 | if(out + size <= op + bp->count)␊ |
47 | {␊ |
48 | stop("HFS+ Unicode tables are malformed\n");␊ |
49 | }␊ |
50 | data = bp->data;␊ |
51 | for (j=0; j<bp->count; j++) {␊ |
52 | *op++ = data;␊ |
53 | if (bp->type == kTypeAscending) data++;␊ |
54 | else if (bp->type == kTypeAscending256) data += 256;␊ |
55 | }␊ |
56 | }␊ |
57 | return out;␊ |
58 | }␊ |
59 | ␊ |
60 | static void InitCompareTables(void)␊ |
61 | {␊ |
62 | if (gCompareTable == 0) {␊ |
63 | gCompareTable = UncompressStructure(gCompareTableCompressed,␊ |
64 | kCompareTableNBlocks, kCompareTableDataSize);␊ |
65 | gLowerCaseTable = UncompressStructure(gLowerCaseTableCompressed,␊ |
66 | kLowerCaseTableNBlocks, kLowerCaseTableDataSize);␊ |
67 | }␊ |
68 | }␊ |
69 | ␊ |
70 | #endif /* ! UNCOMPRESSED */␊ |
71 | ␊ |
72 | //_______________________________________________________________________␊ |
73 | //␊ |
74 | //␉Routine:␉FastRelString␊ |
75 | //␊ |
76 | //␉Output:␉␉returns -1 if str1 < str2␊ |
77 | //␉␉␉␉returns 1 if str1 > str2␊ |
78 | //␉␉␉␉return␉ 0 if equal␊ |
79 | //␊ |
80 | //_______________________________________________________________________␊ |
81 | ␊ |
82 | int32_t␉FastRelString(u_int8_t * str1, u_int8_t * str2)␊ |
83 | {␊ |
84 | ␉int32_t bestGuess;␊ |
85 | ␉u_int8_t length, length2;␊ |
86 | ␊ |
87 | #if ! UNCOMPRESED␊ |
88 | InitCompareTables();␊ |
89 | #endif␊ |
90 | ␊ |
91 | ␉length = *(str1++);␊ |
92 | ␉length2 = *(str2++);␊ |
93 | ␊ |
94 | ␉if (length == length2)␊ |
95 | ␉␉bestGuess = 0;␊ |
96 | ␉else if (length < length2)␊ |
97 | ␉␉bestGuess = -1;␊ |
98 | ␉else␊ |
99 | ␉{␊ |
100 | ␉␉bestGuess = 1;␊ |
101 | ␉␉length = length2;␊ |
102 | ␉}␊ |
103 | ␊ |
104 | ␉while (length--)␊ |
105 | ␉{␊ |
106 | ␉␉u_int32_t aChar, bChar;␊ |
107 | ␊ |
108 | ␉␉aChar = *(str1++);␊ |
109 | ␉␉bChar = *(str2++);␊ |
110 | ␉␉␊ |
111 | ␉␉if (aChar != bChar)␉/* If they don't match exacly, do case conversion */␊ |
112 | ␉␉{␊ |
113 | ␉␉␉u_int16_t aSortWord, bSortWord;␊ |
114 | ␊ |
115 | ␉␉␉aSortWord = gCompareTable[aChar];␊ |
116 | ␉␉␉bSortWord = gCompareTable[bChar];␊ |
117 | ␊ |
118 | ␉␉␉if (aSortWord > bSortWord)␊ |
119 | ␉␉␉␉return 1;␊ |
120 | ␊ |
121 | ␉␉␉if (aSortWord < bSortWord)␊ |
122 | ␉␉␉␉return -1;␊ |
123 | ␉␉}␊ |
124 | ␉␉␊ |
125 | ␉␉/*␊ |
126 | ␉␉ * If characters match exactly, then go on to next character␊ |
127 | ␉␉ * immediately without doing any extra work.␊ |
128 | ␉␉ */␊ |
129 | ␉}␊ |
130 | ␉␊ |
131 | ␉/* if you got to here, then return bestGuess */␊ |
132 | ␉return bestGuess;␊ |
133 | }␉␊ |
134 | ␊ |
135 | ␊ |
136 | //␊ |
137 | //␉FastUnicodeCompare - Compare two Unicode strings; produce a relative ordering␊ |
138 | //␊ |
139 | //␉ IF␉␉␉␉RESULT␊ |
140 | //␉--------------------------␊ |
141 | //␉str1 < str2␉␉=>␉-1␊ |
142 | //␉str1 = str2␉␉=>␉ 0␊ |
143 | //␉str1 > str2␉␉=>␉+1␊ |
144 | //␊ |
145 | //␉The lower case table starts with 256 entries (one for each of the upper bytes␊ |
146 | //␉of the original Unicode char). If that entry is zero, then all characters with␊ |
147 | //␉that upper byte are already case folded. If the entry is non-zero, then it is␊ |
148 | //␉the _index_ (not byte offset) of the start of the sub-table for the characters␊ |
149 | //␉with that upper byte. All ignorable characters are folded to the value zero.␊ |
150 | //␊ |
151 | //␉In pseudocode:␊ |
152 | //␊ |
153 | //␉␉Let c = source Unicode character␊ |
154 | //␉␉Let table[] = lower case table␊ |
155 | //␊ |
156 | //␉␉lower = table[highbyte(c)]␊ |
157 | //␉␉if (lower == 0)␊ |
158 | //␉␉␉lower = c␊ |
159 | //␉␉else␊ |
160 | //␉␉␉lower = table[lower+lowbyte(c)]␊ |
161 | //␊ |
162 | //␉␉if (lower == 0)␊ |
163 | //␉␉␉ignore this character␊ |
164 | //␊ |
165 | //␉To handle ignorable characters, we now need a loop to find the next valid character.␊ |
166 | //␉Also, we can't pre-compute the number of characters to compare; the string length might␊ |
167 | //␉be larger than the number of non-ignorable characters. Further, we must be able to handle␊ |
168 | //␉ignorable characters at any point in the string, including as the first or last characters.␊ |
169 | //␉We use a zero value as a sentinel to detect both end-of-string and ignorable characters.␊ |
170 | //␉Since the File Manager doesn't prevent the NUL character (value zero) as part of a filename,␊ |
171 | //␉the case mapping table is assumed to map u+0000 to some non-zero value (like 0xFFFF, which is␊ |
172 | //␉an invalid Unicode character).␊ |
173 | //␊ |
174 | //␉Pseudocode:␊ |
175 | //␊ |
176 | //␉␉while (1) {␊ |
177 | //␉␉␉c1 = GetNextValidChar(str1)␉␉␉//␉returns zero if at end of string␊ |
178 | //␉␉␉c2 = GetNextValidChar(str2)␊ |
179 | //␊ |
180 | //␉␉␉if (c1 != c2) break␉␉␉␉␉//␉found a difference␊ |
181 | //␊ |
182 | //␉␉␉if (c1 == 0)␉␉␉␉␉␉//␉reached end of string on both strings at once?␊ |
183 | //␉␉␉␉return 0;␉␉␉␉␉␉//␉yes, so strings are equal␊ |
184 | //␉␉}␊ |
185 | //␊ |
186 | //␉␉// When we get here, c1 != c2. So, we just need to determine which one is less.␊ |
187 | //␉␉if (c1 < c2)␊ |
188 | //␉␉␉return -1;␊ |
189 | //␉␉else␊ |
190 | //␉␉␉return 1;␊ |
191 | //␊ |
192 | ␊ |
193 | int32_t FastUnicodeCompare( u_int16_t * str1, register u_int32_t length1,␊ |
194 | u_int16_t * str2, register u_int32_t length2, int byte_order )␊ |
195 | {␊ |
196 | ␉register u_int16_t c1,c2;␊ |
197 | ␉register u_int16_t temp;␊ |
198 | ␊ |
199 | #if ! UNCOMPRESSED␊ |
200 | InitCompareTables();␊ |
201 | #endif␊ |
202 | ␊ |
203 | ␉while (1) {␊ |
204 | ␉␉/* Set default values for c1, c2 in case there are no more valid chars */␊ |
205 | ␉␉c1 = 0;␊ |
206 | ␉␉c2 = 0;␊ |
207 | ␊ |
208 | ␉␉/* Find next non-ignorable char from str1, or zero if no more */␊ |
209 | ␉␉while (length1 && c1 == 0) {␊ |
210 | ␉␉␉if (byte_order == OSBigEndian)␊ |
211 | ␉␉␉␉c1 = SWAP_BE16(*(str1++));␊ |
212 | ␉␉␉else␊ |
213 | ␉␉␉␉c1 = SWAP_LE16(*(str1++));␊ |
214 | ␉␉␉--length1;␊ |
215 | ␉␉␉if ((temp = gLowerCaseTable[c1>>8]) != 0)␉␉// is there a subtable for this upper byte?␊ |
216 | ␉␉␉␉c1 = gLowerCaseTable[temp + (c1 & 0x00FF)];␉// yes, so fold the char␊ |
217 | ␉␉}␊ |
218 | ␊ |
219 | ␉␉/* Find next non-ignorable char from str2, or zero if no more */␊ |
220 | ␉␉while (length2 && c2 == 0) {␊ |
221 | ␉␉␉if (byte_order == OSBigEndian)␊ |
222 | ␉␉␉␉c2 = SWAP_BE16(*(str2++));␊ |
223 | ␉␉␉else␊ |
224 | ␉␉␉␉c2 = SWAP_LE16(*(str2++));␊ |
225 | ␉␉␉--length2;␊ |
226 | ␉␉␉if ((temp = gLowerCaseTable[c2>>8]) != 0)␉␉// is there a subtable for this upper byte?␊ |
227 | ␉␉␉␉c2 = gLowerCaseTable[temp + (c2 & 0x00FF)];␉// yes, so fold the char␊ |
228 | ␉␉}␊ |
229 | ␊ |
230 | ␉␉if (c1 != c2)␉/* found a difference, so stop looping */␊ |
231 | ␉␉␉break;␊ |
232 | ␉␉␊ |
233 | ␉␉if (c1 == 0)␉␉/* did we reach the end of both strings at the same time? */␊ |
234 | ␉␉␉return 0;␉/* yes, so strings are equal */␊ |
235 | ␉}␊ |
236 | ␉␊ |
237 | ␉if (c1 < c2)␊ |
238 | ␉␉return -1;␊ |
239 | ␉else␊ |
240 | ␉␉return 1;␊ |
241 | }␊ |
242 | ␊ |
243 | ␊ |
244 | //␊ |
245 | // BinaryUnicodeCompare - Compare two Unicode strings; produce a relative ordering␊ |
246 | // Compared using a 16-bit binary comparison (no case folding)␊ |
247 | //␊ |
248 | int32_t BinaryUnicodeCompare (u_int16_t * str1, u_int32_t length1,␊ |
249 | u_int16_t * str2, u_int32_t length2)␊ |
250 | {␊ |
251 | register u_int16_t c1, c2;␊ |
252 | int32_t bestGuess;␊ |
253 | u_int32_t length;␊ |
254 | ␊ |
255 | bestGuess = 0;␊ |
256 | ␊ |
257 | if (length1 < length2) {␊ |
258 | length = length1;␊ |
259 | --bestGuess;␊ |
260 | } else if (length1 > length2) {␊ |
261 | length = length2;␊ |
262 | ++bestGuess;␊ |
263 | } else {␊ |
264 | length = length1;␊ |
265 | }␊ |
266 | ␊ |
267 | while (length--) {␊ |
268 | c1 = *(str1++);␊ |
269 | c2 = *(str2++);␊ |
270 | ␊ |
271 | if (c1 > c2)␊ |
272 | return (1);␊ |
273 | if (c1 < c2)␊ |
274 | return (-1);␊ |
275 | }␊ |
276 | ␊ |
277 | return (bestGuess);␊ |
278 | }␊ |
279 | ␊ |
280 | ␊ |
281 | /*␊ |
282 | * UTF-8 (UCS Transformation Format)␊ |
283 | *␊ |
284 | * The following subset of UTF-8 is used to encode UCS-2 filenames. It␊ |
285 | * requires a maximum of three 3 bytes per UCS-2 character. Only the␊ |
286 | * shortest encoding required to represent the significant UCS-2 bits␊ |
287 | * is legal.␊ |
288 | * ␊ |
289 | * UTF-8 Multibyte Codes␊ |
290 | *␊ |
291 | * Bytes Bits UCS-2 Min UCS-2 Max UTF-8 Byte Sequence (binary)␊ |
292 | * -------------------------------------------------------------------␊ |
293 | * 1 7 0x0000 0x007F 0xxxxxxx␊ |
294 | * 2 11 0x0080 0x07FF 110xxxxx 10xxxxxx␊ |
295 | * 3 16 0x0800 0xFFFF 1110xxxx 10xxxxxx 10xxxxxx␊ |
296 | * -------------------------------------------------------------------␊ |
297 | */␊ |
298 | ␊ |
299 | ␊ |
300 | /*␊ |
301 | * utf_encodestr - Encodes the UCS-2 (Unicode) string at ucsp into a␊ |
302 | * null terminated UTF-8 string at utf8p.␊ |
303 | *␊ |
304 | * ucslen is the number of UCS-2 input characters (not bytes)␊ |
305 | * bufsize is the size of the output buffer in bytes␊ |
306 | */␊ |
307 | void utf_encodestr( const u_int16_t * ucsp, int ucslen,␊ |
308 | u_int8_t * utf8p, u_int32_t bufsize, int byte_order )␊ |
309 | {␊ |
310 | ␉u_int8_t *bufend;␊ |
311 | ␉u_int16_t ucs_ch;␊ |
312 | ␊ |
313 | ␉bufend = utf8p + bufsize;␊ |
314 | ␊ |
315 | ␉while (ucslen-- > 0) {␊ |
316 | if (byte_order == OSBigEndian)␊ |
317 | ␉␉ ucs_ch = SWAP_BE16(*ucsp++);␊ |
318 | else␊ |
319 | ucs_ch = SWAP_LE16(*ucsp++);␊ |
320 | ␊ |
321 | ␉␉if (ucs_ch < 0x0080) {␊ |
322 | ␉␉␉if (utf8p >= bufend)␊ |
323 | ␉␉␉␉break;␊ |
324 | ␉␉␉if (ucs_ch == '\0')␊ |
325 | ␉␉␉␉continue;␉/* skip over embedded NULLs */␊ |
326 | ␉␉␉*utf8p++ = ucs_ch;␊ |
327 | ␊ |
328 | ␉␉} else if (ucs_ch < 0x800) {␊ |
329 | ␉␉␉if ((utf8p + 1) >= bufend)␊ |
330 | ␉␉␉␉break;␊ |
331 | ␉␉␉*utf8p++ = (ucs_ch >> 6) | 0xc0;␊ |
332 | ␉␉␉*utf8p++ = (ucs_ch & 0x3f) | 0x80;␊ |
333 | ␊ |
334 | ␉␉} else {␊ |
335 | ␉␉␉if ((utf8p + 2) >= bufend)␊ |
336 | ␉␉␉␉break;␊ |
337 | ␉␉␉*utf8p++ = (ucs_ch >> 12) | 0xe0;␊ |
338 | ␉␉␉*utf8p++ = ((ucs_ch >> 6) & 0x3f) | 0x80;␊ |
339 | ␉␉␉*utf8p++ = ((ucs_ch) & 0x3f) | 0x80;␊ |
340 | ␉␉}␊ |
341 | ␉}␊ |
342 | ␊ |
343 | ␉*utf8p = '\0';␊ |
344 | }␊ |
345 | ␊ |
346 | ␊ |
347 | /*␊ |
348 | * utf_decodestr - Decodes the null terminated UTF-8 string at␊ |
349 | * utf8p into a UCS-2 (Unicode) string at ucsp.␊ |
350 | *␊ |
351 | * ucslen is the number of UCS-2 output characters (not bytes)␊ |
352 | * bufsize is the size of the output buffer in bytes␊ |
353 | */␊ |
354 | void utf_decodestr(const u_int8_t * utf8p, u_int16_t * ucsp, u_int16_t * ucslen, u_int32_t bufsize, int byte_order)␊ |
355 | {␊ |
356 | ␉u_int16_t *bufstart;␊ |
357 | ␉u_int16_t *bufend;␊ |
358 | ␉u_int16_t ucs_ch;␊ |
359 | ␉u_int8_t byte;␊ |
360 | ␊ |
361 | ␉bufstart = ucsp;␊ |
362 | ␉bufend = (u_int16_t *)((u_int8_t *)ucsp + bufsize);␊ |
363 | ␊ |
364 | ␉while ((byte = *utf8p++) != '\0') {␊ |
365 | ␉␉if (ucsp >= bufend)␊ |
366 | ␉␉␉break;␊ |
367 | ␊ |
368 | ␉␉/* check for ascii */␊ |
369 | ␉␉if (byte < 0x80) {␊ |
370 | ␉␉␉ucs_ch = byte;␊ |
371 | ␉␉␉␊ |
372 | if (byte_order == OSBigEndian)␊ |
373 | ␉␉ *ucsp++ = SWAP_BE16(ucs_ch);␊ |
374 | else␊ |
375 | *ucsp++ = SWAP_LE16(ucs_ch);␊ |
376 | ␉␉␉␊ |
377 | ␉␉␉continue;␊ |
378 | ␉␉}␊ |
379 | ␊ |
380 | ␉␉switch (byte & 0xf0) {␊ |
381 | ␉␉/* 2 byte sequence*/␊ |
382 | ␉␉case 0xc0:␊ |
383 | ␉␉case 0xd0:␊ |
384 | ␉␉␉/* extract bits 6 - 10 from first byte */␊ |
385 | ␉␉␉ucs_ch = (byte & 0x1F) << 6; ␊ |
386 | ␉␉␉break;␊ |
387 | ␉␉/* 3 byte sequence*/␊ |
388 | ␉␉case 0xe0:␊ |
389 | ␉␉␉/* extract bits 12 - 15 from first byte */␊ |
390 | ␉␉␉ucs_ch = (byte & 0x0F) << 6;␊ |
391 | ␊ |
392 | ␉␉␉/* extract bits 6 - 11 from second byte */␊ |
393 | ␉␉␉if (((byte = *utf8p++) & 0xc0) != 0x80)␊ |
394 | ␉␉␉␉goto stop;␊ |
395 | ␊ |
396 | ␉␉␉ucs_ch += (byte & 0x3F);␊ |
397 | ␉␉␉ucs_ch <<= 6;␊ |
398 | ␉␉␉break;␊ |
399 | ␉␉default:␊ |
400 | ␉␉␉goto stop;␊ |
401 | ␉␉}␊ |
402 | ␊ |
403 | ␉␉/* extract bits 0 - 5 from final byte */␊ |
404 | ␉␉if (((byte = *utf8p++) & 0xc0) != 0x80)␊ |
405 | ␉␉␉goto stop;␊ |
406 | ␉␉ucs_ch += (byte & 0x3F); ␊ |
407 | ␊ |
408 | if (byte_order == OSBigEndian)␊ |
409 | *ucsp++ = SWAP_BE16(ucs_ch);␊ |
410 | else␊ |
411 | *ucsp++ = SWAP_LE16(ucs_ch);␊ |
412 | ␉}␊ |
413 | stop:␊ |
414 | if (byte_order == OSBigEndian)␊ |
415 | *ucslen = SWAP_BE16(ucsp - bufstart);␊ |
416 | else␊ |
417 | *ucslen = SWAP_LE16(ucsp - bufstart);␊ |
418 | }␊ |
419 | |