diff options
Diffstat (limited to 'trunk/ext/json/ext/parser/unicode.c')
-rw-r--r-- | trunk/ext/json/ext/parser/unicode.c | 156 |
1 files changed, 0 insertions, 156 deletions
diff --git a/trunk/ext/json/ext/parser/unicode.c b/trunk/ext/json/ext/parser/unicode.c deleted file mode 100644 index f196727354..0000000000 --- a/trunk/ext/json/ext/parser/unicode.c +++ /dev/null @@ -1,156 +0,0 @@ -/* vim: set cin et sw=4 ts=4: */ - -#include "unicode.h" - -/* - * Copyright 2001-2004 Unicode, Inc. - * - * Disclaimer - * - * This source code is provided as is by Unicode, Inc. No claims are - * made as to fitness for any particular purpose. No warranties of any - * kind are expressed or implied. The recipient agrees to determine - * applicability of information provided. If this file has been - * purchased on magnetic or optical media from Unicode, Inc., the - * sole remedy for any claim will be exchange of defective media - * within 90 days of receipt. - * - * Limitations on Rights to Redistribute This Code - * - * Unicode, Inc. hereby grants the right to freely use the information - * supplied in this file in the creation of products supporting the - * Unicode Standard, and to make copies of this file in any form - * for internal or external distribution as long as this notice - * remains attached. - */ - -/* - * Index into the table below with the first byte of a UTF-8 sequence to - * get the number of trailing bytes that are supposed to follow it. - * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is - * left as-is for anyone who may want to do such conversion, which was - * allowed in earlier algorithms. - */ -static const char trailingBytesForUTF8[256] = { - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 -}; - -/* - * Magic values subtracted from a buffer value during UTF8 conversion. - * This table contains as many values as there might be trailing bytes - * in a UTF-8 sequence. - */ -static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, - 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; - -/* - * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed - * into the first byte, depending on how many bytes follow. There are - * as many entries in this table as there are UTF-8 sequence types. - * (I.e., one byte sequence, two byte... etc.). Remember that sequencs - * for *legal* UTF-8 will be 4 or fewer bytes total. - */ -static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; - -char *JSON_convert_UTF16_to_UTF8 ( - VALUE buffer, - char *source, - char *sourceEnd, - ConversionFlags flags) -{ - UTF16 *tmp, *tmpPtr, *tmpEnd; - char buf[5]; - long n = 0, i; - char *p = source - 1; - - while (p < sourceEnd && p[0] == '\\' && p[1] == 'u') { - p += 6; - n++; - } - p = source + 1; - buf[4] = 0; - tmpPtr = tmp = ALLOC_N(UTF16, n); - tmpEnd = tmp + n; - for (i = 0; i < n; i++) { - buf[0] = *p++; - buf[1] = *p++; - buf[2] = *p++; - buf[3] = *p++; - tmpPtr[i] = strtol(buf, NULL, 16); - p += 2; - } - - while (tmpPtr < tmpEnd) { - UTF32 ch; - unsigned short bytesToWrite = 0; - const UTF32 byteMask = 0xBF; - const UTF32 byteMark = 0x80; - ch = *tmpPtr++; - /* If we have a surrogate pair, convert to UTF32 first. */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { - /* If the 16 bits following the high surrogate are in the source - * buffer... */ - if (tmpPtr < tmpEnd) { - UTF32 ch2 = *tmpPtr; - /* If it's a low surrogate, convert to UTF32. */ - if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { - ch = ((ch - UNI_SUR_HIGH_START) << halfShift) - + (ch2 - UNI_SUR_LOW_START) + halfBase; - ++tmpPtr; - } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ - ruby_xfree(tmp); - rb_raise(rb_path2class("JSON::ParserError"), - "source sequence is illegal/malformed near %s", source); - } - } else { /* We don't have the 16 bits following the high surrogate. */ - ruby_xfree(tmp); - rb_raise(rb_path2class("JSON::ParserError"), - "partial character in source, but hit end near %s", source); - break; - } - } else if (flags == strictConversion) { - /* UTF-16 surrogate values are illegal in UTF-32 */ - if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { - ruby_xfree(tmp); - rb_raise(rb_path2class("JSON::ParserError"), - "source sequence is illegal/malformed near %s", source); - } - } - /* Figure out how many bytes the result will require */ - if (ch < (UTF32) 0x80) { - bytesToWrite = 1; - } else if (ch < (UTF32) 0x800) { - bytesToWrite = 2; - } else if (ch < (UTF32) 0x10000) { - bytesToWrite = 3; - } else if (ch < (UTF32) 0x110000) { - bytesToWrite = 4; - } else { - bytesToWrite = 3; - ch = UNI_REPLACEMENT_CHAR; - } - - buf[0] = 0; - buf[1] = 0; - buf[2] = 0; - buf[3] = 0; - p = buf + bytesToWrite; - switch (bytesToWrite) { /* note: everything falls through. */ - case 4: *--p = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6; - case 3: *--p = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6; - case 2: *--p = (UTF8) ((ch | byteMark) & byteMask); ch >>= 6; - case 1: *--p = (UTF8) (ch | firstByteMark[bytesToWrite]); - } - rb_str_buf_cat(buffer, p, bytesToWrite); - } - ruby_xfree(tmp); - source += 5 + (n - 1) * 6; - return source; -} |