diff options
Diffstat (limited to 'include/ruby/internal/encoding')
| -rw-r--r-- | include/ruby/internal/encoding/coderange.h | 88 | ||||
| -rw-r--r-- | include/ruby/internal/encoding/ctype.h | 184 | ||||
| -rw-r--r-- | include/ruby/internal/encoding/encoding.h | 315 | ||||
| -rw-r--r-- | include/ruby/internal/encoding/string.h | 72 | ||||
| -rw-r--r-- | include/ruby/internal/encoding/transcode.h | 18 |
5 files changed, 486 insertions, 191 deletions
diff --git a/include/ruby/internal/encoding/coderange.h b/include/ruby/internal/encoding/coderange.h index 84daddeeb3..c89f871518 100644 --- a/include/ruby/internal/encoding/coderange.h +++ b/include/ruby/internal/encoding/coderange.h @@ -22,7 +22,9 @@ */ #include "ruby/internal/attr/const.h" +#include "ruby/internal/attr/pure.h" #include "ruby/internal/dllexport.h" +#include "ruby/internal/fl_type.h" #include "ruby/internal/value.h" RBIMPL_SYMBOL_EXPORT_BEGIN() @@ -65,6 +67,7 @@ rb_enc_coderange_clean_p(int cr) return (cr ^ (cr >> 1)) & RUBY_ENC_CODERANGE_7BIT; } +RBIMPL_ATTR_CONST() /** * Queries if a code range is "clean". "Clean" in this context means it is * known and valid. @@ -73,8 +76,13 @@ rb_enc_coderange_clean_p(int cr) * @retval 1 It is. * @retval 0 It isn't. */ -#define RB_ENC_CODERANGE_CLEAN_P(cr) rb_enc_coderange_clean_p(cr) +static inline bool +RB_ENC_CODERANGE_CLEAN_P(enum ruby_coderange_type cr) +{ + return rb_enc_coderange_clean_p(RBIMPL_CAST((int)cr)); +} +RBIMPL_ATTR_PURE_UNLESS_DEBUG() /** * Queries the (inline) code range of the passed object. The object must be * capable of having inline encoding. Using this macro needs deep @@ -83,8 +91,15 @@ rb_enc_coderange_clean_p(int cr) * @param[in] obj Target object. * @return An enum ::ruby_coderange_type. */ -#define RB_ENC_CODERANGE(obj) ((int)RBASIC(obj)->flags & RUBY_ENC_CODERANGE_MASK) +static inline enum ruby_coderange_type +RB_ENC_CODERANGE(VALUE obj) +{ + VALUE ret = RB_FL_TEST_RAW(obj, RUBY_ENC_CODERANGE_MASK); + + return RBIMPL_CAST((enum ruby_coderange_type)ret); +} +RBIMPL_ATTR_PURE_UNLESS_DEBUG() /** * Queries the (inline) code range of the passed object is * ::RUBY_ENC_CODERANGE_7BIT. The object must be capable of having inline @@ -95,7 +110,11 @@ rb_enc_coderange_clean_p(int cr) * @retval 1 It is ascii only. * @retval 0 Otherwise (including cases when the range is not known). 
*/ -#define RB_ENC_CODERANGE_ASCIIONLY(obj) (RB_ENC_CODERANGE(obj) == RUBY_ENC_CODERANGE_7BIT) +static inline bool +RB_ENC_CODERANGE_ASCIIONLY(VALUE obj) +{ + return RB_ENC_CODERANGE(obj) == RUBY_ENC_CODERANGE_7BIT; +} /** * Destructively modifies the passed object so that its (inline) code range is @@ -106,9 +125,12 @@ rb_enc_coderange_clean_p(int cr) * @param[out] cr An enum ::ruby_coderange_type. * @post `obj`'s code range is `cr`. */ -#define RB_ENC_CODERANGE_SET(obj,cr) (\ - RBASIC(obj)->flags = \ - (RBASIC(obj)->flags & ~RUBY_ENC_CODERANGE_MASK) | (cr)) +static inline void +RB_ENC_CODERANGE_SET(VALUE obj, enum ruby_coderange_type cr) +{ + RB_FL_UNSET_RAW(obj, RUBY_ENC_CODERANGE_MASK); + RB_FL_SET_RAW(obj, cr); +} /** * Destructively clears the passed object's (inline) code range. The object @@ -118,8 +140,13 @@ rb_enc_coderange_clean_p(int cr) * @param[out] obj Target object. * @post `obj`'s code range is ::RUBY_ENC_CODERANGE_UNKNOWN. */ -#define RB_ENC_CODERANGE_CLEAR(obj) RB_ENC_CODERANGE_SET((obj),0) +static inline void +RB_ENC_CODERANGE_CLEAR(VALUE obj) +{ + RB_FL_UNSET_RAW(obj, RUBY_ENC_CODERANGE_MASK); +} +RBIMPL_ATTR_CONST() /* assumed ASCII compatibility */ /** * "Mix" two code ranges into one. This is handy for instance when you @@ -131,28 +158,22 @@ rb_enc_coderange_clean_p(int cr) * @param[in] b Another enum ::ruby_coderange_type. * @return The `a` "and" `b`. */ -#define RB_ENC_CODERANGE_AND(a, b) \ - ((a) == RUBY_ENC_CODERANGE_7BIT ? (b) : \ - (a) != RUBY_ENC_CODERANGE_VALID ? RUBY_ENC_CODERANGE_UNKNOWN : \ - (b) == RUBY_ENC_CODERANGE_7BIT ? RUBY_ENC_CODERANGE_VALID : (b)) - -/** - * This is #RB_ENCODING_SET + RB_ENC_CODERANGE_SET combo. The object must be - * capable of having inline encoding. Using this macro needs deep - * understanding of bit level object binary layout. - * - * @param[out] obj Target object. - * @param[in] encindex Encoding in encindex format. - * @param[in] cr An enum ::ruby_coderange_type. 
- * @post `obj`'s encoding is `encindex`. - * @post `obj`'s code range is `cr`. - */ -#define RB_ENCODING_CODERANGE_SET(obj, encindex, cr) \ - do { \ - VALUE rb_encoding_coderange_obj = (obj); \ - RB_ENCODING_SET(rb_encoding_coderange_obj, (encindex)); \ - RB_ENC_CODERANGE_SET(rb_encoding_coderange_obj, (cr)); \ - } while (0) +static inline enum ruby_coderange_type +RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b) +{ + if (a == RUBY_ENC_CODERANGE_7BIT) { + return b; + } + else if (a != RUBY_ENC_CODERANGE_VALID) { + return RUBY_ENC_CODERANGE_UNKNOWN; + } + else if (b == RUBY_ENC_CODERANGE_7BIT) { + return RUBY_ENC_CODERANGE_VALID; + } + else { + return b; + } +} #define ENC_CODERANGE_MASK RUBY_ENC_CODERANGE_MASK /**< @old{RUBY_ENC_CODERANGE_MASK} */ #define ENC_CODERANGE_UNKNOWN RUBY_ENC_CODERANGE_UNKNOWN /**< @old{RUBY_ENC_CODERANGE_UNKNOWN} */ @@ -167,6 +188,15 @@ rb_enc_coderange_clean_p(int cr) #define ENC_CODERANGE_AND(a, b) RB_ENC_CODERANGE_AND(a, b) /**< @old{RB_ENC_CODERANGE_AND} */ #define ENCODING_CODERANGE_SET(obj, encindex, cr) RB_ENCODING_CODERANGE_SET(obj, encindex, cr) /**< @old{RB_ENCODING_CODERANGE_SET} */ +/** @cond INTERNAL_MACRO */ +#define RB_ENC_CODERANGE RB_ENC_CODERANGE +#define RB_ENC_CODERANGE_AND RB_ENC_CODERANGE_AND +#define RB_ENC_CODERANGE_ASCIIONLY RB_ENC_CODERANGE_ASCIIONLY +#define RB_ENC_CODERANGE_CLEAN_P RB_ENC_CODERANGE_CLEAN_P +#define RB_ENC_CODERANGE_CLEAR RB_ENC_CODERANGE_CLEAR +#define RB_ENC_CODERANGE_SET RB_ENC_CODERANGE_SET +/** @endcond */ + RBIMPL_SYMBOL_EXPORT_END() #endif /* RUBY_INTERNAL_ENCODING_CODERANGE_H */ diff --git a/include/ruby/internal/encoding/ctype.h b/include/ruby/internal/encoding/ctype.h index e0b95f93b2..05c314aeb3 100644 --- a/include/ruby/internal/encoding/ctype.h +++ b/include/ruby/internal/encoding/ctype.h @@ -36,118 +36,179 @@ RBIMPL_SYMBOL_EXPORT_BEGIN() * @param[in] p Pointer to a possibly-middle of a character. * @param[in] end End of the string. 
* @param[in] enc Encoding. - * @retval 0 It isn't. - * @retval otherwise It is. + * @retval false It isn't. + * @retval true It is. */ -#define rb_enc_is_newline(p,end,enc) ONIGENC_IS_MBC_NEWLINE((enc),(UChar*)(p),(UChar*)(end)) +static inline bool +rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc) +{ + OnigUChar *up = RBIMPL_CAST((OnigUChar *)p); + OnigUChar *ue = RBIMPL_CAST((OnigUChar *)e); + + return ONIGENC_IS_MBC_NEWLINE(enc, up, ue); +} /** * Queries if the passed code point is of passed character type in the passed * encoding. The "character type" here is a set of macros defined in onigmo.h, * like `ONIGENC_CTYPE_PUNCT`. * - * @param[in] c An `OnigCodePoint` value. - * @param[in] t An `OnigCtype` value. - * @param[in] enc A `rb_encoding*` value. - * @retval 1 `c` is of `t` in `enc`. - * @retval 0 Otherwise. + * @param[in] c An `OnigCodePoint` value. + * @param[in] t An `OnigCtype` value. + * @param[in] enc A `rb_encoding*` value. + * @retval true `c` is of `t` in `enc`. + * @retval false Otherwise. */ -#define rb_enc_isctype(c,t,enc) ONIGENC_IS_CODE_CTYPE((enc),(c),(t)) +static inline bool +rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc) +{ + return ONIGENC_IS_CODE_CTYPE(enc, c, t); +} /** * Identical to rb_isascii(), except it additionally takes an encoding. * - * @param[in] c A code point. - * @param[in] enc An encoding. - * @retval 0 `c` is out of range of ASCII character set in `enc`. - * @retval 1 Otherwise. + * @param[in] c A code point. + * @param[in] enc An encoding. + * @retval false `c` is out of range of ASCII character set in `enc`. + * @retval true Otherwise. * * @internal * * `enc` is ignored. This is at least an intentional implementation detail * (not a bug). But there could be rooms for future extensions. 
*/ -#define rb_enc_isascii(c,enc) ONIGENC_IS_CODE_ASCII(c) +static inline bool +rb_enc_isascii(OnigCodePoint c, rb_encoding *enc) +{ + return ONIGENC_IS_CODE_ASCII(c); +} /** * Identical to rb_isalpha(), except it additionally takes an encoding. * - * @param[in] c A code point. - * @param[in] enc An encoding. - * @retval 1 `enc` classifies `c` as "ALPHA". - * @retval 0 Otherwise. + * @param[in] c A code point. + * @param[in] enc An encoding. + * @retval true `enc` classifies `c` as "ALPHA". + * @retval false Otherwise. */ -#define rb_enc_isalpha(c,enc) ONIGENC_IS_CODE_ALPHA((enc),(c)) +static inline bool +rb_enc_isalpha(OnigCodePoint c, rb_encoding *enc) +{ + return ONIGENC_IS_CODE_ALPHA(enc, c); +} /** * Identical to rb_islower(), except it additionally takes an encoding. * - * @param[in] c A code point. - * @param[in] enc An encoding. - * @retval 1 `enc` classifies `c` as "LOWER". - * @retval 0 Otherwise. + * @param[in] c A code point. + * @param[in] enc An encoding. + * @retval true `enc` classifies `c` as "LOWER". + * @retval false Otherwise. */ -#define rb_enc_islower(c,enc) ONIGENC_IS_CODE_LOWER((enc),(c)) +static inline bool +rb_enc_islower(OnigCodePoint c, rb_encoding *enc) +{ + return ONIGENC_IS_CODE_LOWER(enc, c); +} /** * Identical to rb_isupper(), except it additionally takes an encoding. * - * @param[in] c A code point. - * @param[in] enc An encoding. - * @retval 1 `enc` classifies `c` as "UPPER". - * @retval 0 Otherwise. + * @param[in] c A code point. + * @param[in] enc An encoding. + * @retval true `enc` classifies `c` as "UPPER". + * @retval false Otherwise. + */ +static inline bool +rb_enc_isupper(OnigCodePoint c, rb_encoding *enc) +{ + return ONIGENC_IS_CODE_UPPER(enc, c); +} + +/** + * Identical to rb_iscntrl(), except it additionally takes an encoding. + * + * @param[in] c A code point. + * @param[in] enc An encoding. + * @retval true `enc` classifies `c` as "CNTRL". + * @retval false Otherwise. 
*/ -#define rb_enc_isupper(c,enc) ONIGENC_IS_CODE_UPPER((enc),(c)) +static inline bool +rb_enc_iscntrl(OnigCodePoint c, rb_encoding *enc) +{ + return ONIGENC_IS_CODE_CNTRL(enc, c); +} /** * Identical to rb_ispunct(), except it additionally takes an encoding. * - * @param[in] c A code point. - * @param[in] enc An encoding. - * @retval 1 `enc` classifies `c` as "PUNCT". - * @retval 0 Otherwise. + * @param[in] c A code point. + * @param[in] enc An encoding. + * @retval true `enc` classifies `c` as "PUNCT". + * @retval false Otherwise. */ -#define rb_enc_ispunct(c,enc) ONIGENC_IS_CODE_PUNCT((enc),(c)) +static inline bool +rb_enc_ispunct(OnigCodePoint c, rb_encoding *enc) +{ + return ONIGENC_IS_CODE_PUNCT(enc, c); +} /** * Identical to rb_isalnum(), except it additionally takes an encoding. * - * @param[in] c A code point. - * @param[in] enc An encoding. - * @retval 1 `enc` classifies `c` as "ANUM". - * @retval 0 Otherwise. + * @param[in] c A code point. + * @param[in] enc An encoding. + * @retval true `enc` classifies `c` as "ANUM". + * @retval false Otherwise. */ -#define rb_enc_isalnum(c,enc) ONIGENC_IS_CODE_ALNUM((enc),(c)) +static inline bool +rb_enc_isalnum(OnigCodePoint c, rb_encoding *enc) +{ + return ONIGENC_IS_CODE_ALNUM(enc, c); +} /** * Identical to rb_isprint(), except it additionally takes an encoding. * - * @param[in] c A code point. - * @param[in] enc An encoding. - * @retval 1 `enc` classifies `c` as "PRINT". - * @retval 0 Otherwise. + * @param[in] c A code point. + * @param[in] enc An encoding. + * @retval true `enc` classifies `c` as "PRINT". + * @retval false Otherwise. */ -#define rb_enc_isprint(c,enc) ONIGENC_IS_CODE_PRINT((enc),(c)) +static inline bool +rb_enc_isprint(OnigCodePoint c, rb_encoding *enc) +{ + return ONIGENC_IS_CODE_PRINT(enc, c); +} /** * Identical to rb_isspace(), except it additionally takes an encoding. * - * @param[in] c A code point. - * @param[in] enc An encoding. - * @retval 1 `enc` classifies `c` as "PRINT". 
- * @retval 0 Otherwise. + * @param[in] c A code point. + * @param[in] enc An encoding. + * @retval true `enc` classifies `c` as "SPACE". + * @retval false Otherwise. */ -#define rb_enc_isspace(c,enc) ONIGENC_IS_CODE_SPACE((enc),(c)) +static inline bool +rb_enc_isspace(OnigCodePoint c, rb_encoding *enc) +{ + return ONIGENC_IS_CODE_SPACE(enc, c); +} /** * Identical to rb_isdigit(), except it additionally takes an encoding. * - * @param[in] c A code point. - * @param[in] enc An encoding. - * @retval 1 `enc` classifies `c` as "DIGIT". - * @retval 0 Otherwise. + * @param[in] c A code point. + * @param[in] enc An encoding. + * @retval true `enc` classifies `c` as "DIGIT". + * @retval false Otherwise. */ -#define rb_enc_isdigit(c,enc) ONIGENC_IS_CODE_DIGIT((enc),(c)) +static inline bool +rb_enc_isdigit(OnigCodePoint c, rb_encoding *enc) +{ + return ONIGENC_IS_CODE_DIGIT(enc, c); +} RBIMPL_ATTR_CONST() /** @@ -179,4 +240,19 @@ int rb_enc_tolower(int c, rb_encoding *enc); RBIMPL_SYMBOL_EXPORT_END() +/** @cond INTERNAL_MACRO */ +#define rb_enc_is_newline rb_enc_is_newline +#define rb_enc_isalnum rb_enc_isalnum +#define rb_enc_isalpha rb_enc_isalpha +#define rb_enc_isascii rb_enc_isascii +#define rb_enc_isctype rb_enc_isctype +#define rb_enc_isdigit rb_enc_isdigit +#define rb_enc_islower rb_enc_islower +#define rb_enc_isprint rb_enc_isprint +#define rb_enc_iscntrl rb_enc_iscntrl +#define rb_enc_ispunct rb_enc_ispunct +#define rb_enc_isspace rb_enc_isspace +#define rb_enc_isupper rb_enc_isupper +/** @endcond */ + #endif /* RUBY_INTERNAL_ENCODING_CTYPE_H */ diff --git a/include/ruby/internal/encoding/encoding.h b/include/ruby/internal/encoding/encoding.h index 0e6463ad78..a58f9f2b15 100644 --- a/include/ruby/internal/encoding/encoding.h +++ b/include/ruby/internal/encoding/encoding.h @@ -28,6 +28,7 @@ #include "ruby/internal/attr/pure.h" #include "ruby/internal/attr/returns_nonnull.h" #include "ruby/internal/dllexport.h" +#include "ruby/internal/encoding/coderange.h" #include 
"ruby/internal/value.h" #include "ruby/internal/core/rbasic.h" #include "ruby/internal/fl_type.h" @@ -65,24 +66,26 @@ enum ruby_encoding_consts { #define ENCODING_INLINE_MAX RUBY_ENCODING_INLINE_MAX /**< @old{RUBY_ENCODING_INLINE_MAX} */ #define ENCODING_SHIFT RUBY_ENCODING_SHIFT /**< @old{RUBY_ENCODING_SHIFT} */ -#define ENCODING_MASK RUBY_ENCODING_MASK /**< @old{RUBY_ENCODING_SHIFT} */ +#define ENCODING_MASK RUBY_ENCODING_MASK /**< @old{RUBY_ENCODING_MASK} */ /** * Destructively assigns the passed encoding to the passed object. The object * must be capable of having inline encoding. Using this macro needs deep * understanding of bit level object binary layout. * - * @param[out] obj Target object to modify. - * @param[in] i Encoding in encindex format. - * @post `obj`'s encoding is `i`. + * @param[out] obj Target object to modify. + * @param[in] encindex Encoding in encindex format. + * @post `obj`'s encoding is `encindex`. */ -#define RB_ENCODING_SET_INLINED(obj,i) do {\ - RBASIC(obj)->flags &= ~RUBY_ENCODING_MASK;\ - RBASIC(obj)->flags |= (VALUE)(i) << RUBY_ENCODING_SHIFT;\ -} while (0) +static inline void +RB_ENCODING_SET_INLINED(VALUE obj, int encindex) +{ + VALUE f = /* upcast */ RBIMPL_CAST((VALUE)encindex); -/** @alias{rb_enc_set_index} */ -#define RB_ENCODING_SET(obj,i) rb_enc_set_index((obj), (i)) + f <<= RUBY_ENCODING_SHIFT; + RB_FL_UNSET_RAW(obj, RUBY_ENCODING_MASK); + RB_FL_SET_RAW(obj, f); +} /** * Queries the encoding of the passed object. The encoding must be smaller @@ -92,32 +95,13 @@ enum ruby_encoding_consts { * @param[in] obj Target object. * @return `obj`'s encoding index. */ -#define RB_ENCODING_GET_INLINED(obj) \ - (int)((RBASIC(obj)->flags & RUBY_ENCODING_MASK)>>RUBY_ENCODING_SHIFT) - -/** - * @alias{rb_enc_get_index} - * - * @internal - * - * Implementation wise this is not a verbatim alias of rb_enc_get_index(). But - * the API is consistent. Don't bother. 
- */ -#define RB_ENCODING_GET(obj) \ - (RB_ENCODING_GET_INLINED(obj) != RUBY_ENCODING_INLINE_MAX ? \ - RB_ENCODING_GET_INLINED(obj) : \ - rb_enc_get_index(obj)) +static inline int +RB_ENCODING_GET_INLINED(VALUE obj) +{ + VALUE ret = RB_FL_TEST_RAW(obj, RUBY_ENCODING_MASK) >> RUBY_ENCODING_SHIFT; -/** - * Queries if the passed object is in ascii 8bit (== binary) encoding. The - * object must be capable of having inline encoding. Using this macro needs - * deep understanding of bit level object binary layout. - * - * @param[in] obj An object to check. - * @retval 1 It is. - * @retval 0 It isn't. - */ -#define RB_ENCODING_IS_ASCII8BIT(obj) (RB_ENCODING_GET_INLINED(obj) == 0) + return RBIMPL_CAST((int)ret); +} #define ENCODING_SET_INLINED(obj,i) RB_ENCODING_SET_INLINED(obj,i) /**< @old{RB_ENCODING_SET_INLINED} */ #define ENCODING_SET(obj,i) RB_ENCODING_SET(obj,i) /**< @old{RB_ENCODING_SET} */ @@ -126,7 +110,6 @@ enum ruby_encoding_consts { #define ENCODING_IS_ASCII8BIT(obj) RB_ENCODING_IS_ASCII8BIT(obj) /**< @old{RB_ENCODING_IS_ASCII8BIT} */ #define ENCODING_MAXNAMELEN RUBY_ENCODING_MAXNAMELEN /**< @old{RUBY_ENCODING_MAXNAMELEN} */ - /** * The type of encoding. Our design here is we take Oniguruma/Onigmo's * multilingualisation schema as our base data structure. @@ -157,23 +140,6 @@ RBIMPL_ATTR_NOALIAS() int rb_char_to_option_kcode(int c, int *option, int *kcode); /** - * Creates a new encoding, using the passed one as a template. - * - * @param[in] name Name of the creating encoding. - * @param[in] src Template. - * @exception rb_eArgError Duplicated or malformed `name`. - * @return Replicated new encoding's index. - * @post Encoding named `name` is created as a copy of `src`, whose index - * is the return value. - * - * @internal - * - * `name` can be `NULL`, but that just raises an exception. OTOH it seems no - * sanity check is done against `src`...? - */ -int rb_enc_replicate(const char *name, rb_encoding *src); - -/** * Creates a new "dummy" encoding. 
Roughly speaking, an encoding is dummy when * it is stateful. Notable example of dummy encoding are those defined in * ISO/IEC 2022 @@ -218,6 +184,27 @@ int rb_enc_to_index(rb_encoding *enc); int rb_enc_get_index(VALUE obj); /** + * @alias{rb_enc_get_index} + * + * @internal + * + * Implementation wise this is not a verbatim alias of rb_enc_get_index(). But + * the API is consistent. Don't bother. + */ +static inline int +RB_ENCODING_GET(VALUE obj) +{ + int encindex = RB_ENCODING_GET_INLINED(obj); + + if (encindex == RUBY_ENCODING_INLINE_MAX) { + return rb_enc_get_index(obj); + } + else { + return encindex; + } +} + +/** * Destructively assigns an encoding (via its index) to an object. * * @param[out] obj Object in question. @@ -229,6 +216,31 @@ int rb_enc_get_index(VALUE obj); */ void rb_enc_set_index(VALUE obj, int encindex); +/** @alias{rb_enc_set_index} */ +static inline void +RB_ENCODING_SET(VALUE obj, int encindex) +{ + rb_enc_set_index(obj, encindex); +} + +/** + * This is #RB_ENCODING_SET + RB_ENC_CODERANGE_SET combo. The object must be + * capable of having inline encoding. Using this macro needs deep + * understanding of bit level object binary layout. + * + * @param[out] obj Target object. + * @param[in] encindex Encoding in encindex format. + * @param[in] cr An enum ::ruby_coderange_type. + * @post `obj`'s encoding is `encindex`. + * @post `obj`'s code range is `cr`. + */ +static inline void +RB_ENCODING_CODERANGE_SET(VALUE obj, int encindex, enum ruby_coderange_type cr) +{ + RB_ENCODING_SET(obj, encindex); + RB_ENC_CODERANGE_SET(obj, cr); +} + RBIMPL_ATTR_PURE() /** * Queries if the passed object can have its encoding. @@ -347,8 +359,8 @@ rb_encoding *rb_enc_check(VALUE str1,VALUE str2); VALUE rb_enc_associate_index(VALUE obj, int encindex); /** - * Identical to rb_enc_associate(), except it takes an encoding itself instead - * of its index. + * Identical to rb_enc_associate_index(), except it takes an encoding itself + * instead of its index. 
* * @param[out] obj Object in question. * @param[in] enc An encoding. @@ -401,7 +413,11 @@ rb_encoding *rb_enc_find(const char *name); * @param[in] enc An encoding. * @return Its name. */ -#define rb_enc_name(enc) (enc)->name +static inline const char * +rb_enc_name(rb_encoding *enc) +{ + return enc->name; +} /** * Queries the minimum number of bytes that the passed encoding needs to @@ -412,7 +428,11 @@ rb_encoding *rb_enc_find(const char *name); * @param[in] enc An encoding. * @return Its least possible number of bytes except 0. */ -#define rb_enc_mbminlen(enc) (enc)->min_enc_len +static inline int +rb_enc_mbminlen(rb_encoding *enc) +{ + return enc->min_enc_len; +} /** * Queries the maximum number of bytes that the passed encoding needs to @@ -423,7 +443,11 @@ rb_encoding *rb_enc_find(const char *name); * @param[in] enc An encoding. * @return Its maximum possible number of bytes of a character. */ -#define rb_enc_mbmaxlen(enc) (enc)->max_enc_len +static inline int +rb_enc_mbmaxlen(rb_encoding *enc) +{ + return enc->max_enc_len; +} /** * Queries the number of bytes of the character at the passed pointer. @@ -525,7 +549,6 @@ int rb_enc_ascget(const char *p, const char *e, int *len, rb_encoding *enc); */ unsigned int rb_enc_codepoint_len(const char *p, const char *e, int *len, rb_encoding *enc); -RBIMPL_ATTR_DEPRECATED(("use rb_enc_codepoint_len instead.")) /** * Queries the code point of character pointed by the passed pointer. * Exceptions happen in case of broken input. @@ -536,12 +559,24 @@ RBIMPL_ATTR_DEPRECATED(("use rb_enc_codepoint_len instead.")) * @param[in] enc Encoding of the string. * @exception rb_eArgError `p` is broken. * @return Code point of the character pointed by `p`. + * + * @internal + * + * @matz says in commit 91e5ba1cb865a2385d3e1cbfacd824496898e098 that the line + * below is a "prototype for obsolete function". However even today there + * still are some use cases of it throughout our repository. It seems it has + * its own niche. 
*/ -unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc); +static inline unsigned int +rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc) +{ + return rb_enc_codepoint_len(p, e, 0, enc); + /* ^^^ + * This can be `NULL` in C, `nullptr` in C++, and `0` for both. + * We choose the most portable one here. + */ +} -/** @cond INTERNAL_MACRO */ -#define rb_enc_codepoint(p,e,enc) rb_enc_codepoint_len((p),(e),0,(enc)) -/** @endcond */ /** * Identical to rb_enc_codepoint(), except it assumes the passed character is @@ -552,7 +587,14 @@ unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc); * @param[in] enc Encoding of the string. * @return Code point of the character pointed by `p`. */ -#define rb_enc_mbc_to_codepoint(p, e, enc) ONIGENC_MBC_TO_CODE((enc),(UChar*)(p),(UChar*)(e)) +static inline OnigCodePoint +rb_enc_mbc_to_codepoint(const char *p, const char *e, rb_encoding *enc) +{ + const OnigUChar *up = RBIMPL_CAST((const OnigUChar *)p); + const OnigUChar *ue = RBIMPL_CAST((const OnigUChar *)e); + + return ONIGENC_MBC_TO_CODE(enc, up, ue); +} /** * Queries the number of bytes requested to represent the passed code point @@ -568,27 +610,43 @@ int rb_enc_codelen(int code, rb_encoding *enc); /** * Identical to rb_enc_codelen(), except it returns 0 for invalid code points. * - * @param[in] code Code point in question. - * @param[in] enc Encoding to convert the code into a byte sequence. - * @retval 0 `code` is invalid. - * @return otherwise Number of bytes used for `enc` to encode `code`. + * @param[in] c Code point in question. + * @param[in] enc Encoding to convert `c` into a byte sequence. + * @retval 0 `c` is invalid. + * @return otherwise Number of bytes needed for `enc` to encode `c`. 
*/ -int rb_enc_code_to_mbclen(int code, rb_encoding *enc); +static inline int +rb_enc_code_to_mbclen(int c, rb_encoding *enc) +{ + OnigCodePoint uc = RBIMPL_CAST((OnigCodePoint)c); -/** @cond INTERNAL_MACRO */ -#define rb_enc_code_to_mbclen(c, enc) ONIGENC_CODE_TO_MBCLEN((enc), (c)); -/** @endcond */ + return ONIGENC_CODE_TO_MBCLEN(enc, uc); +} /** * Identical to rb_enc_uint_chr(), except it writes back to the passed buffer * instead of allocating one. * - * @param[in] c Code point. - * @param[out] buf Return buffer. - * @param[in] enc Target encoding scheme. - * @post `c` is encoded according to `enc`, then written to `buf`. + * @param[in] c Code point. + * @param[out] buf Return buffer. + * @param[in] enc Target encoding scheme. + * @retval <= 0 `c` is invalid in `enc`. + * @return otherwise Number of bytes written to `buf`. + * @post `c` is encoded according to `enc`, then written to `buf`. + * + * @internal + * + * The second argument must be typed. But its current usages prevent us from + * being any stricter than this. :FIXME: */ -#define rb_enc_mbcput(c,buf,enc) ONIGENC_CODE_TO_MBC((enc),(c),(UChar*)(buf)) +static inline int +rb_enc_mbcput(unsigned int c, void *buf, rb_encoding *enc) +{ + OnigCodePoint uc = RBIMPL_CAST((OnigCodePoint)c); + OnigUChar *ubuf = RBIMPL_CAST((OnigUChar *)buf); + + return ONIGENC_CODE_TO_MBC(enc, uc, ubuf); +} /** * Queries the previous (left) character. @@ -600,7 +658,16 @@ int rb_enc_code_to_mbclen(int code, rb_encoding *enc); * @retval NULL No previous character. * @retval otherwise Pointer to the head of the previous character. 
*/ -#define rb_enc_prev_char(s,p,e,enc) ((char *)onigenc_get_prev_char_head((enc),(UChar*)(s),(UChar*)(p),(UChar*)(e))) +static inline char * +rb_enc_prev_char(const char *s, const char *p, const char *e, rb_encoding *enc) +{ + const OnigUChar *us = RBIMPL_CAST((const OnigUChar *)s); + const OnigUChar *up = RBIMPL_CAST((const OnigUChar *)p); + const OnigUChar *ue = RBIMPL_CAST((const OnigUChar *)e); + OnigUChar *ur = onigenc_get_prev_char_head(enc, us, up, ue); + + return RBIMPL_CAST((char *)ur); +} /** * Queries the left boundary of a character. This function takes a pointer @@ -612,7 +679,16 @@ int rb_enc_code_to_mbclen(int code, rb_encoding *enc); * @param[in] enc Encoding. * @return Pointer to the head of the character that contains `p`. */ -#define rb_enc_left_char_head(s,p,e,enc) ((char *)onigenc_get_left_adjust_char_head((enc),(UChar*)(s),(UChar*)(p),(UChar*)(e))) +static inline char * +rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc) +{ + const OnigUChar *us = RBIMPL_CAST((const OnigUChar *)s); + const OnigUChar *up = RBIMPL_CAST((const OnigUChar *)p); + const OnigUChar *ue = RBIMPL_CAST((const OnigUChar *)e); + OnigUChar *ur = onigenc_get_left_adjust_char_head(enc, us, up, ue); + + return RBIMPL_CAST((char *)ur); +} /** * Queries the right boundary of a character. This function takes a pointer @@ -624,7 +700,16 @@ int rb_enc_code_to_mbclen(int code, rb_encoding *enc); * @param[in] enc Encoding. * @return Pointer to the end of the character that contains `p`. 
*/ -#define rb_enc_right_char_head(s,p,e,enc) ((char *)onigenc_get_right_adjust_char_head((enc),(UChar*)(s),(UChar*)(p),(UChar*)(e))) +static inline char * +rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc) +{ + const OnigUChar *us = RBIMPL_CAST((const OnigUChar *)s); + const OnigUChar *up = RBIMPL_CAST((const OnigUChar *)p); + const OnigUChar *ue = RBIMPL_CAST((const OnigUChar *)e); + OnigUChar *ur = onigenc_get_right_adjust_char_head(enc, us, up, ue); + + return RBIMPL_CAST((char *)ur); +} /** * Scans the string backwards for n characters. @@ -637,7 +722,16 @@ int rb_enc_code_to_mbclen(int code, rb_encoding *enc); * @retval NULL There are no `n` characters left. * @retval otherwise Pointer to `n` character before `p`. */ -#define rb_enc_step_back(s,p,e,n,enc) ((char *)onigenc_step_back((enc),(UChar*)(s),(UChar*)(p),(UChar*)(e),(int)(n))) +static inline char * +rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc) +{ + const OnigUChar *us = RBIMPL_CAST((const OnigUChar *)s); + const OnigUChar *up = RBIMPL_CAST((const OnigUChar *)p); + const OnigUChar *ue = RBIMPL_CAST((const OnigUChar *)e); + const OnigUChar *ur = onigenc_step_back(enc, us, up, ue, n); + + return RBIMPL_CAST((char *)ur); +} /** * @private @@ -670,8 +764,19 @@ rb_enc_asciicompat_inline(rb_encoding *enc) * @retval 0 It is incompatible. * @retval 1 It is compatible. */ -#define rb_enc_asciicompat(enc) rb_enc_asciicompat_inline(enc) - +static inline bool +rb_enc_asciicompat(rb_encoding *enc) +{ + if (rb_enc_mbminlen(enc) != 1) { + return false; + } + else if (rb_enc_dummy_p(enc)) { + return false; + } + else { + return true; + } +} /** * Queries if the passed string is in an ASCII-compatible encoding. @@ -680,7 +785,13 @@ rb_enc_asciicompat_inline(rb_encoding *enc) * @retval 0 `str` is not a String, or an ASCII-incompatible string. * @retval 1 Otherwise. 
*/ -#define rb_enc_str_asciicompat_p(str) rb_enc_asciicompat(rb_enc_get(str)) +static inline bool +rb_enc_str_asciicompat_p(VALUE str) +{ + rb_encoding *enc = rb_enc_get(str); + + return rb_enc_asciicompat(enc); +} /** * Queries the Ruby-level counterpart instance of ::rb_cEncoding that @@ -803,6 +914,21 @@ RBIMPL_ATTR_CONST() int rb_ascii8bit_encindex(void); #endif +/** + * Queries if the passed object is in ascii 8bit (== binary) encoding. The + * object must be capable of having inline encoding. Using this macro needs + * deep understanding of bit level object binary layout. + * + * @param[in] obj An object to check. + * @retval 1 It is. + * @retval 0 It isn't. + */ +static inline bool +RB_ENCODING_IS_ASCII8BIT(VALUE obj) +{ + return RB_ENCODING_GET_INLINED(obj) == rb_ascii8bit_encindex(); +} + #ifndef rb_utf8_encindex RBIMPL_ATTR_CONST() /** @@ -894,4 +1020,25 @@ VALUE rb_locale_charmap(VALUE klass); RBIMPL_SYMBOL_EXPORT_END() +/** @cond INTERNAL_MACRO */ +#define RB_ENCODING_GET RB_ENCODING_GET +#define RB_ENCODING_GET_INLINED RB_ENCODING_GET_INLINED +#define RB_ENCODING_IS_ASCII8BIT RB_ENCODING_IS_ASCII8BIT +#define RB_ENCODING_SET RB_ENCODING_SET +#define RB_ENCODING_SET_INLINED RB_ENCODING_SET_INLINED +#define rb_enc_asciicompat rb_enc_asciicompat +#define rb_enc_code_to_mbclen rb_enc_code_to_mbclen +#define rb_enc_codepoint rb_enc_codepoint +#define rb_enc_left_char_head rb_enc_left_char_head +#define rb_enc_mbc_to_codepoint rb_enc_mbc_to_codepoint +#define rb_enc_mbcput rb_enc_mbcput +#define rb_enc_mbmaxlen rb_enc_mbmaxlen +#define rb_enc_mbminlen rb_enc_mbminlen +#define rb_enc_name rb_enc_name +#define rb_enc_prev_char rb_enc_prev_char +#define rb_enc_right_char_head rb_enc_right_char_head +#define rb_enc_step_back rb_enc_step_back +#define rb_enc_str_asciicompat_p rb_enc_str_asciicompat_p +/** @endcond */ + #endif /* RUBY_INTERNAL_ENCODING_ENCODING_H */ diff --git a/include/ruby/internal/encoding/string.h b/include/ruby/internal/encoding/string.h 
index 87226bec10..ea78cf23f3 100644 --- a/include/ruby/internal/encoding/string.h +++ b/include/ruby/internal/encoding/string.h @@ -25,11 +25,12 @@ #include "ruby/internal/value.h" #include "ruby/internal/encoding/encoding.h" #include "ruby/internal/attr/nonnull.h" +#include "ruby/internal/intern/string.h" /* rbimpl_strlen */ RBIMPL_SYMBOL_EXPORT_BEGIN() /** - * Identical to rb_enc_str_new(), except it additionally takes an encoding. + * Identical to rb_str_new(), except it additionally takes an encoding. * * @param[in] ptr A memory region of `len` bytes length. * @param[in] len Length of `ptr`, in bytes, not including the @@ -189,7 +190,7 @@ VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc) * In other languages, APIs like this one could be seen as the primitive * routines where encodings' "encode" feature are implemented. However in case * of Ruby this is not the primitive one. We directly manipulate encoded - * strings. Encoding conversion routines transocde an encoded string directly + * strings. Encoding conversion routines transcode an encoded string directly * to another one; not via a code point array. */ VALUE rb_enc_uint_chr(unsigned int code, rb_encoding *enc); @@ -263,6 +264,14 @@ VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to); VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts); /** + * @private + * + * This is an implementation detail of rb_enc_str_coderange(). Don't use this + * directly. + **/ +int rbimpl_enc_str_coderange_scan(VALUE str); + +/** * Scans the passed string to collect its code range. Because a Ruby's string * is mutable, its contents change from time to time; so does its code range. * A long-lived string tends to fall back to ::RUBY_ENC_CODERANGE_UNKNOWN. 
@@ -274,6 +283,27 @@ VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ec int rb_enc_str_coderange(VALUE str); /** + * Scans the passed string to collect its code range. Because a Ruby's string + * is mutable, its contents change from time to time; so does its code range. + * A long-lived string tends to fall back to ::RUBY_ENC_CODERANGE_UNKNOWN. + * This API scans it and re-assigns a fine-grained code range constant. + * + * @param[out] str A string. + * @return An enum ::ruby_coderange_type. + */ +static inline int +rb_enc_str_coderange_inline(VALUE str) +{ + int cr = ENC_CODERANGE(str); + if (cr == ENC_CODERANGE_UNKNOWN) { + cr = rbimpl_enc_str_coderange_scan(str); + } + return cr; +} + +#define rb_enc_str_coderange rb_enc_str_coderange_inline + +/** * Scans the passed string until it finds something odd. Returns the number of * bytes scanned. As the name implies this is suitable for repeated call. One * of its application is `IO#readlines`. The method reads from its receiver's @@ -306,30 +336,38 @@ RBIMPL_ATTR_NONNULL(()) /** * Looks for the passed string in the passed buffer. * - * @param[in] x Buffer that potentially includes `y`. + * @param[in] x Query string. * @param[in] m Number of bytes of `x`. - * @param[in] y Query string. + * @param[in] y Buffer that potentially includes `x`. * @param[in] n Number of bytes of `y`. * @param[in] enc Encoding of both `x` and `y`. * @retval -1 Not found. - * @retval otherwise Found index in `x`. + * @retval otherwise Found index in `y`. * @note This API can match at a non-character-boundary. */ long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc); /** @cond INTERNAL_MACRO */ -#ifdef HAVE_BUILTIN___BUILTIN_CONSTANT_P -#define rb_enc_str_new(str, len, enc) RB_GNUC_EXTENSION_BLOCK( \ - (__builtin_constant_p(str) && __builtin_constant_p(len)) ? 
\ - rb_enc_str_new_static((str), (len), (enc)) : \ - rb_enc_str_new((str), (len), (enc)) \ -) -#define rb_enc_str_new_cstr(str, enc) RB_GNUC_EXTENSION_BLOCK( \ - (__builtin_constant_p(str)) ? \ - rb_enc_str_new_static((str), (long)strlen(str), (enc)) : \ - rb_enc_str_new_cstr((str), (enc)) \ -) -#endif +RBIMPL_ATTR_NONNULL(()) +static inline VALUE +rbimpl_enc_str_new_cstr(const char *str, rb_encoding *enc) +{ + long len = rbimpl_strlen(str); + + return rb_enc_str_new_static(str, len, enc); +} + +#define rb_enc_str_new(str, len, enc) \ + ((RBIMPL_CONSTANT_P(str) && \ + RBIMPL_CONSTANT_P(len) ? \ + rb_enc_str_new_static: \ + rb_enc_str_new) ((str), (len), (enc))) + +#define rb_enc_str_new_cstr(str, enc) \ + ((RBIMPL_CONSTANT_P(str) ? \ + rbimpl_enc_str_new_cstr : \ + rb_enc_str_new_cstr) ((str), (enc))) + /** @endcond */ RBIMPL_SYMBOL_EXPORT_END() diff --git a/include/ruby/internal/encoding/transcode.h b/include/ruby/internal/encoding/transcode.h index 60c96a41c9..7f26d2eae9 100644 --- a/include/ruby/internal/encoding/transcode.h +++ b/include/ruby/internal/encoding/transcode.h @@ -476,16 +476,16 @@ enum ruby_econv_flag_type { RUBY_ECONV_UNDEF_HEX_CHARREF = 0x00000030, /** Decorators are there. */ - RUBY_ECONV_DECORATOR_MASK = 0x0000ff00, + RUBY_ECONV_DECORATOR_MASK = 0x0001ff00, /** Newline converters are there. */ - RUBY_ECONV_NEWLINE_DECORATOR_MASK = 0x00003f00, + RUBY_ECONV_NEWLINE_DECORATOR_MASK = 0x00007f00, /** (Unclear; seems unused). */ RUBY_ECONV_NEWLINE_DECORATOR_READ_MASK = 0x00000f00, /** (Unclear; seems unused). */ - RUBY_ECONV_NEWLINE_DECORATOR_WRITE_MASK = 0x00003000, + RUBY_ECONV_NEWLINE_DECORATOR_WRITE_MASK = 0x00007000, /** Universal newline mode. */ RUBY_ECONV_UNIVERSAL_NEWLINE_DECORATOR = 0x00000100, @@ -496,11 +496,14 @@ enum ruby_econv_flag_type { /** CRLF to CR conversion shall happen. */ RUBY_ECONV_CR_NEWLINE_DECORATOR = 0x00002000, + /** CRLF to LF conversion shall happen. 
*/ + RUBY_ECONV_LF_NEWLINE_DECORATOR = 0x00004000, + /** Texts shall be XML-escaped. */ - RUBY_ECONV_XML_TEXT_DECORATOR = 0x00004000, + RUBY_ECONV_XML_TEXT_DECORATOR = 0x00008000, /** Texts shall be AttrValue escaped */ - RUBY_ECONV_XML_ATTR_CONTENT_DECORATOR = 0x00008000, + RUBY_ECONV_XML_ATTR_CONTENT_DECORATOR = 0x00010000, /** (Unclear; seems unused). */ RUBY_ECONV_STATEFUL_DECORATOR_MASK = 0x00f00000, @@ -529,6 +532,7 @@ enum ruby_econv_flag_type { #define ECONV_UNIVERSAL_NEWLINE_DECORATOR RUBY_ECONV_UNIVERSAL_NEWLINE_DECORATOR /**< @old{RUBY_ECONV_UNIVERSAL_NEWLINE_DECORATOR} */ #define ECONV_CRLF_NEWLINE_DECORATOR RUBY_ECONV_CRLF_NEWLINE_DECORATOR /**< @old{RUBY_ECONV_CRLF_NEWLINE_DECORATOR} */ #define ECONV_CR_NEWLINE_DECORATOR RUBY_ECONV_CR_NEWLINE_DECORATOR /**< @old{RUBY_ECONV_CR_NEWLINE_DECORATOR} */ +#define ECONV_LF_NEWLINE_DECORATOR RUBY_ECONV_LF_NEWLINE_DECORATOR /**< @old{RUBY_ECONV_LF_NEWLINE_DECORATOR} */ #define ECONV_XML_TEXT_DECORATOR RUBY_ECONV_XML_TEXT_DECORATOR /**< @old{RUBY_ECONV_XML_TEXT_DECORATOR} */ #define ECONV_XML_ATTR_CONTENT_DECORATOR RUBY_ECONV_XML_ATTR_CONTENT_DECORATOR /**< @old{RUBY_ECONV_XML_ATTR_CONTENT_DECORATOR} */ #define ECONV_STATEFUL_DECORATOR_MASK RUBY_ECONV_STATEFUL_DECORATOR_MASK /**< @old{RUBY_ECONV_STATEFUL_DECORATOR_MASK} */ @@ -543,10 +547,10 @@ enum ruby_econv_flag_type { */ /** Indicates the input is a part of much larger one. */ - RUBY_ECONV_PARTIAL_INPUT = 0x00010000, + RUBY_ECONV_PARTIAL_INPUT = 0x00020000, /** Instructs the converter to stop after output. */ - RUBY_ECONV_AFTER_OUTPUT = 0x00020000, + RUBY_ECONV_AFTER_OUTPUT = 0x00040000, #define ECONV_PARTIAL_INPUT RUBY_ECONV_PARTIAL_INPUT /**< @old{RUBY_ECONV_PARTIAL_INPUT} */ #define ECONV_AFTER_OUTPUT RUBY_ECONV_AFTER_OUTPUT /**< @old{RUBY_ECONV_AFTER_OUTPUT} */ |
