From 312668cf031ce5e018f78d6a7cad9bcdcdac6ae6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=8D=9C=E9=83=A8=E6=98=8C=E5=B9=B3?= Date: Fri, 24 Sep 2021 10:34:32 +0900 Subject: split include/ruby/encoding.h 2,291 lines are too much! include/ruby/encoding.h became the biggest header file once it had doxygen comments. Let us split it into smaller parts, so that we can better organise their contents. --- include/ruby/encoding.h | 2278 +--------------------------- include/ruby/internal/encoding/coderange.h | 172 +++ include/ruby/internal/encoding/ctype.h | 182 +++ include/ruby/internal/encoding/encoding.h | 897 +++++++++++ include/ruby/internal/encoding/pathname.h | 184 +++ include/ruby/internal/encoding/re.h | 46 + include/ruby/internal/encoding/sprintf.h | 78 + include/ruby/internal/encoding/string.h | 337 ++++ include/ruby/internal/encoding/symbol.h | 100 ++ include/ruby/internal/encoding/transcode.h | 558 +++++++ 10 files changed, 2563 insertions(+), 2269 deletions(-) create mode 100644 include/ruby/internal/encoding/coderange.h create mode 100644 include/ruby/internal/encoding/ctype.h create mode 100644 include/ruby/internal/encoding/encoding.h create mode 100644 include/ruby/internal/encoding/pathname.h create mode 100644 include/ruby/internal/encoding/re.h create mode 100644 include/ruby/internal/encoding/sprintf.h create mode 100644 include/ruby/internal/encoding/string.h create mode 100644 include/ruby/internal/encoding/symbol.h create mode 100644 include/ruby/internal/encoding/transcode.h (limited to 'include/ruby') diff --git a/include/ruby/encoding.h b/include/ruby/encoding.h index 95cf74a3b8..1256393701 100644 --- a/include/ruby/encoding.h +++ b/include/ruby/encoding.h @@ -16,2276 +16,16 @@ * relatively less rooms for things in this header file be useful when writing * an extension library. */ -#include "ruby/internal/config.h" -#include #include "ruby/ruby.h" -#include "ruby/oniguruma.h" -#include "ruby/internal/attr/const.h" -#include "ruby/internal/attr/deprecated.h" -#include "ruby/internal/attr/format.h" -#include "ruby/internal/attr/noalias.h" -#include "ruby/internal/attr/nonnull.h" -#include "ruby/internal/attr/noreturn.h" -#include "ruby/internal/attr/returns_nonnull.h" -#include "ruby/internal/attr/pure.h" -#include "ruby/internal/core/rbasic.h" -#include "ruby/internal/dllexport.h" -#include "ruby/internal/fl_type.h" -RBIMPL_SYMBOL_EXPORT_BEGIN() - -/** - * @private - * - * Bit constants used when embedding encodings into ::RBasic::flags. Extension - * libraries must not bother such things. - */ -enum ruby_encoding_consts { - - /** Max possible number of embeddable encodings. */ - RUBY_ENCODING_INLINE_MAX = 127, - - /** Where inline encodings reside. */ - RUBY_ENCODING_SHIFT = (RUBY_FL_USHIFT+10), - - /** Bits we use to store inline encodings. */ - RUBY_ENCODING_MASK = (RUBY_ENCODING_INLINE_MAX<flags &= ~RUBY_ENCODING_MASK;\ - RBASIC(obj)->flags |= (VALUE)(i) << RUBY_ENCODING_SHIFT;\ -} while (0) - -/** @alias{rb_enc_set_index} */ -#define RB_ENCODING_SET(obj,i) rb_enc_set_index((obj), (i)) - -/** - * Queries the encoding of the passed object. The encoding must be smaller - * than ::RUBY_ENCODING_INLINE_MAX, which means you have some assumption on the - * return value. This means the API is for internal use only. - * - * @param[in] obj Target object. - * @return `obj`'s encoding index. - */ -#define RB_ENCODING_GET_INLINED(obj) \ - (int)((RBASIC(obj)->flags & RUBY_ENCODING_MASK)>>RUBY_ENCODING_SHIFT) - -/** - * @alias{rb_enc_get_index} - * - * @internal - * - * Implementation wise this is not a verbatim alias of rb_enc_get_index(). But - * the API is consistent. Don't bother. - */ -#define RB_ENCODING_GET(obj) \ - (RB_ENCODING_GET_INLINED(obj) != RUBY_ENCODING_INLINE_MAX ? \ - RB_ENCODING_GET_INLINED(obj) : \ - rb_enc_get_index(obj)) - -/** - * Queries if the passed object is in ascii 8bit (== binary) encoding. The - * object must be capable of having inline encoding. Using this macro needs - * deep understanding of bit level object binary layout. - * - * @param[in] obj An object to check. - * @retval 1 It is. - * @retval 0 It isn't. - */ -#define RB_ENCODING_IS_ASCII8BIT(obj) (RB_ENCODING_GET_INLINED(obj) == 0) - -#define ENCODING_SET_INLINED(obj,i) RB_ENCODING_SET_INLINED(obj,i) /**< @old{RB_ENCODING_SET_INLINED} */ -#define ENCODING_SET(obj,i) RB_ENCODING_SET(obj,i) /**< @old{RB_ENCODING_SET} */ -#define ENCODING_GET_INLINED(obj) RB_ENCODING_GET_INLINED(obj) /**< @old{RB_ENCODING_GET_INLINED} */ -#define ENCODING_GET(obj) RB_ENCODING_GET(obj) /**< @old{RB_ENCODING_GET} */ -#define ENCODING_IS_ASCII8BIT(obj) RB_ENCODING_IS_ASCII8BIT(obj) /**< @old{RB_ENCODING_IS_ASCII8BIT} */ -#define ENCODING_MAXNAMELEN RUBY_ENCODING_MAXNAMELEN /**< @old{RUBY_ENCODING_MAXNAMELEN} */ - -/** What rb_enc_str_coderange() returns. */ -enum ruby_coderange_type { - - /** The object's coderange is unclear yet. */ - RUBY_ENC_CODERANGE_UNKNOWN = 0, - - /** The object holds 0 to 127 inclusive and nothing else. */ - RUBY_ENC_CODERANGE_7BIT = ((int)RUBY_FL_USER8), - - /** The object's encoding and contents are consistent each other */ - RUBY_ENC_CODERANGE_VALID = ((int)RUBY_FL_USER9), - - /** The object holds invalid/malformed/broken character(s). */ - RUBY_ENC_CODERANGE_BROKEN = ((int)(RUBY_FL_USER8|RUBY_FL_USER9)), - - /** Where the coderange resides. */ - RUBY_ENC_CODERANGE_MASK = (RUBY_ENC_CODERANGE_7BIT| - RUBY_ENC_CODERANGE_VALID| - RUBY_ENC_CODERANGE_BROKEN) -}; - -RBIMPL_ATTR_CONST() -/** - * @private - * - * This is an implementation detail of #RB_ENC_CODERANGE_CLEAN_P. People don't - * use it directly. - * - * @param[in] cr An enum ::ruby_coderange_type. - * @retval 1 It is. - * @retval 0 It isn't. - */ -static inline int -rb_enc_coderange_clean_p(int cr) -{ - return (cr ^ (cr >> 1)) & RUBY_ENC_CODERANGE_7BIT; -} - -/** - * Queries if a code range is "clean". "Clean" in this context means it is - * known and valid. - * - * @param[in] cr An enum ::ruby_coderange_type. - * @retval 1 It is. - * @retval 0 It isn't. - */ -#define RB_ENC_CODERANGE_CLEAN_P(cr) rb_enc_coderange_clean_p(cr) - -/** - * Queries the (inline) code range of the passed object. The object must be - * capable of having inline encoding. Using this macro needs deep - * understanding of bit level object binary layout. - * - * @param[in] obj Target object. - * @return An enum ::ruby_coderange_type. - */ -#define RB_ENC_CODERANGE(obj) ((int)RBASIC(obj)->flags & RUBY_ENC_CODERANGE_MASK) - -/** - * Queries the (inline) code range of the passed object is - * ::RUBY_ENC_CODERANGE_7BIT. The object must be capable of having inline - * encoding. Using this macro needs deep understanding of bit level object - * binary layout. - * - * @param[in] obj Target object. - * @retval 1 It is ascii only. - * @retval 0 Otherwise (including cases when the range is not known). - */ -#define RB_ENC_CODERANGE_ASCIIONLY(obj) (RB_ENC_CODERANGE(obj) == RUBY_ENC_CODERANGE_7BIT) - -/** - * Destructively modifies the passed object so that its (inline) code range is - * the passed one. The object must be capable of having inline encoding. - * Using this macro needs deep understanding of bit level object binary layout. - * - * @param[out] obj Target object. - * @param[out] cr An enum ::ruby_coderange_type. - * @post `obj`'s code range is `cr`. - */ -#define RB_ENC_CODERANGE_SET(obj,cr) (\ - RBASIC(obj)->flags = \ - (RBASIC(obj)->flags & ~RUBY_ENC_CODERANGE_MASK) | (cr)) - -/** - * Destructively clears the passed object's (inline) code range. The object - * must be capable of having inline encoding. Using this macro needs deep - * understanding of bit level object binary layout. - * - * @param[out] obj Target object. - * @post `obj`'s code range is ::RUBY_ENC_CODERANGE_UNKNOWN. - */ -#define RB_ENC_CODERANGE_CLEAR(obj) RB_ENC_CODERANGE_SET((obj),0) - -/* assumed ASCII compatibility */ -/** - * "Mix" two code ranges into one. This is handy for instance when you - * concatenate two strings into one. Consider one of then is valid but the - * other isn't. The result must be invalid. This macro computes that kind of - * mixture. - * - * @param[in] a An enum ::ruby_coderange_type. - * @param[in] b Another enum ::ruby_coderange_type. - * @return The `a` "and" `b`. - */ -#define RB_ENC_CODERANGE_AND(a, b) \ - ((a) == RUBY_ENC_CODERANGE_7BIT ? (b) : \ - (a) != RUBY_ENC_CODERANGE_VALID ? RUBY_ENC_CODERANGE_UNKNOWN : \ - (b) == RUBY_ENC_CODERANGE_7BIT ? RUBY_ENC_CODERANGE_VALID : (b)) - -/** - * This is #RB_ENCODING_SET + RB_ENC_CODERANGE_SET combo. The object must be - * capable of having inline encoding. Using this macro needs deep - * understanding of bit level object binary layout. - * - * @param[out] obj Target object. - * @param[in] encindex Encoding in encindex format. - * @param[in] cr An enum ::ruby_coderange_type. - * @post `obj`'s encoding is `encindex`. - * @post `obj`'s code range is `cr`. - */ -#define RB_ENCODING_CODERANGE_SET(obj, encindex, cr) \ - do { \ - VALUE rb_encoding_coderange_obj = (obj); \ - RB_ENCODING_SET(rb_encoding_coderange_obj, (encindex)); \ - RB_ENC_CODERANGE_SET(rb_encoding_coderange_obj, (cr)); \ - } while (0) - -#define ENC_CODERANGE_MASK RUBY_ENC_CODERANGE_MASK /**< @old{RUBY_ENC_CODERANGE_MASK} */ -#define ENC_CODERANGE_UNKNOWN RUBY_ENC_CODERANGE_UNKNOWN /**< @old{RUBY_ENC_CODERANGE_UNKNOWN} */ -#define ENC_CODERANGE_7BIT RUBY_ENC_CODERANGE_7BIT /**< @old{RUBY_ENC_CODERANGE_7BIT} */ -#define ENC_CODERANGE_VALID RUBY_ENC_CODERANGE_VALID /**< @old{RUBY_ENC_CODERANGE_VALID} */ -#define ENC_CODERANGE_BROKEN RUBY_ENC_CODERANGE_BROKEN /**< @old{RUBY_ENC_CODERANGE_BROKEN} */ -#define ENC_CODERANGE_CLEAN_P(cr) RB_ENC_CODERANGE_CLEAN_P(cr) /**< @old{RB_ENC_CODERANGE_CLEAN_P} */ -#define ENC_CODERANGE(obj) RB_ENC_CODERANGE(obj) /**< @old{RB_ENC_CODERANGE} */ -#define ENC_CODERANGE_ASCIIONLY(obj) RB_ENC_CODERANGE_ASCIIONLY(obj) /**< @old{RB_ENC_CODERANGE_ASCIIONLY} */ -#define ENC_CODERANGE_SET(obj,cr) RB_ENC_CODERANGE_SET(obj,cr) /**< @old{RB_ENC_CODERANGE_SET} */ -#define ENC_CODERANGE_CLEAR(obj) RB_ENC_CODERANGE_CLEAR(obj) /**< @old{RB_ENC_CODERANGE_CLEAR} */ -#define ENC_CODERANGE_AND(a, b) RB_ENC_CODERANGE_AND(a, b) /**< @old{RB_ENC_CODERANGE_AND} */ -#define ENCODING_CODERANGE_SET(obj, encindex, cr) RB_ENCODING_CODERANGE_SET(obj, encindex, cr) /**< @old{RB_ENCODING_CODERANGE_SET} */ - -/** - * The type of encoding. Our design here is we take Oniguruma/Onigmo's - * multilingualisation schema as our base data structure. - */ -typedef const OnigEncodingType rb_encoding; - -RBIMPL_ATTR_NOALIAS() -/** - * Converts a character option to its encoding. It only supports a very - * limited set of Japanese encodings due to its Japanese origin. Ruby still - * has this in-core for backwards compatibility. But new codes must not bother - * such concept like one-character encoding option. Consider deprecated in - * practice. - * - * @param[in] c One of `['n', 'e', 's', 'u', 'i', 'x', 'm']`. - * @param[out] option Return buffer. - * @param[out] kcode Return buffer. - * @retval 1 `c` understood properly. - * @retval 0 `c` is not understood. - * @post `option` is a ::OnigOptionType. - * @post `kcode` is an enum `ruby_preserved_encindex`. - * - * @internal - * - * `kcode` is opaque because `ruby_preserved_encindex` is not visible from - * extension libraries. But who cares? - */ -int rb_char_to_option_kcode(int c, int *option, int *kcode); - -/** - * Creates a new encoding, using the passed one as a template. - * - * @param[in] name Name of the creating encoding. - * @param[in] src Template. - * @exception rb_eArgError Duplicated or malformed `name`. - * @return Replicated new encoding's index. - * @post Encoding named `name` is created as a copy of `src`, whose index - * is the return value. - * - * @internal - * - * `name` can be `NULL`, but that just raises an exception. OTOH it seems no - * sanity check is done against `src`...? - */ -int rb_enc_replicate(const char *name, rb_encoding *src); - -/** - * Creates a new "dummy" encoding. Roughly speaking, an encoding is dummy when - * it is stateful. Notable example of dummy encoding are those defined in - * ISO/IEC 2022 - * - * @param[in] name Name of the creating encoding. - * @exception rb_eArgError Duplicated or malformed `name`. - * @return New dummy encoding's index. - * @post Encoding named `name` is created, whose index is the return - * value. - */ -int rb_define_dummy_encoding(const char *name); - -RBIMPL_ATTR_PURE() -/** - * Queries if the passed encoding is dummy. - * - * @param[in] enc Encoding in question. - * @retval 1 It is. - * @retval 0 It isn't. - */ -int rb_enc_dummy_p(rb_encoding *enc); - -RBIMPL_ATTR_PURE() -/** - * Queries the index of the encoding. An encoding's index is a Ruby-local - * concept. It is a (sequential) number assigned to each encoding. - * - * @param[in] enc Encoding in question. - * @return Its index. - * @note You can pass null pointers to this function. It is equivalent - * to rb_usascii_encindex() then. - */ -int rb_enc_to_index(rb_encoding *enc); - -/** - * Queries the index of the encoding of the passed object, if any. - * - * @param[in] obj Object in question. - * @retval -1 `obj` is incapable of having an encoding. - * @retval otherwise `obj`'s encoding's index. - */ -int rb_enc_get_index(VALUE obj); - -/** - * Destructively assigns an encoding (via its index) to an object. - * - * @param[out] obj Object in question. - * @param[in] encindex An encoding index. - * @exception rb_eFrozenError `obj` is frozen. - * @exception rb_eArgError `obj` is incapable of having an encoding. - * @exception rb_eEncodingError `encindex` is out of bounds. - * @exception rb_eLoadError Failed to load the encoding. - */ -void rb_enc_set_index(VALUE obj, int encindex); - -RBIMPL_ATTR_PURE() -/** - * Queries if the passed object can have its encoding. - * - * @param[in] obj Object in question. - * @retval 1 It can. - * @retval 0 It cannot. - */ -int rb_enc_capable(VALUE obj); - -/** - * Queries the index of the encoding. - * - * @param[in] name Name of the encoding to find. - * @exception rb_eArgError No such encoding named `name`. - * @retval -1 `name` exists, but unable to load. - * @retval otherwise Index of encoding named `name`. - */ -int rb_enc_find_index(const char *name); - -/** - * Registers an "alias" name. In the wild, an encoding can be called using - * multiple names. For instance an encoding known as `"CP932"` is also called - * `"SJIS"` on occasions. This API registers such relationships. - * - * @param[in] alias New name. - * @param[in] orig Old name. - * @exception rb_eArgError `alias` is duplicated or malformed. - * @retval -1 Failed to load `orig`. - * @retval otherwise The index of `orig` and `alias`. - * @post `alias` is a synonym of `orig`. They refer to the identical - * encoding. - */ -int rb_enc_alias(const char *alias, const char *orig); - -/** - * Obtains a encoding index from a wider range of objects (than - * rb_enc_find_index()). - * - * @param[in] obj An ::rb_cEncoding, or its name in ::rb_cString. - * @retval -1 `obj` is unexpected type/contents. - * @retval otherwise Index corresponding to `obj`. - */ -int rb_to_encoding_index(VALUE obj); - -/** - * Identical to rb_find_encoding(), except it raises an exception instead of - * returning NULL. - * - * @param[in] obj An ::rb_cEncoding, or its name in ::rb_cString. - * @exception rb_eTypeError `obj` is neither ::rb_cEncoding nor ::rb_cString. - * @exception rb_eArgError `obj` is an unknown encoding name. - * @return Encoding of `obj`. - */ -rb_encoding *rb_to_encoding(VALUE obj); - -/** - * Identical to rb_to_encoding_index(), except the return type. - * - * @param[in] obj An ::rb_cEncoding, or its name in ::rb_cString. - * @exception rb_eTypeError `obj` is neither ::rb_cEncoding nor ::rb_cString. - * @retval NULL No such encoding. - * @return otherwise Encoding of `obj`. - */ -rb_encoding *rb_find_encoding(VALUE obj); - -/** - * Identical to rb_enc_get_index(), except the return type. - * - * @param[in] obj Object in question. - * @retval NULL Obj is incapable of having an encoding. - * @retval otherwise `obj`'s encoding. - */ -rb_encoding *rb_enc_get(VALUE obj); - -/** - * Look for the "common" encoding between the two. One character can or cannot - * be expressed depending on an encoding. This function finds the super-set of - * encodings that satisfy contents of both arguments. If that is impossible - * returns NULL. - * - * @param[in] str1 An object. - * @param[in] str2 Another object. - * @retval NULL No encoding can satisfy both at once. - * @retval otherwise Common encoding between the two. - * @note Arguments can be non-string, e.g. Regexp. - */ -rb_encoding *rb_enc_compatible(VALUE str1, VALUE str2); - -/** - * Identical to rb_enc_compatible(), except it raises an exception instead of - * returning NULL. - * - * @param[in] str1 An object. - * @param[in] str2 Another object. - * @exception rb_eEncCompatError No encoding can satisfy both. - * @return Common encoding between the two. - * @note Arguments can be non-string, e.g. Regexp. - */ -rb_encoding *rb_enc_check(VALUE str1,VALUE str2); - -/** - * Identical to rb_enc_set_index(), except it additionally does contents fix-up - * depending on the passed object. It for instance changes the byte length of - * terminating `U+0000` according to the passed encoding. - * - * @param[out] obj Object in question. - * @param[in] encindex An encoding index. - * @exception rb_eFrozenError `obj` is frozen. - * @exception rb_eArgError `obj` is incapable of having an encoding. - * @exception rb_eEncodingError `encindex` is out of bounds. - * @exception rb_eLoadError Failed to load the encoding. - * @return The passed `obj`. - * @post `obj`'s contents might be fixed according to `encindex`. - */ -VALUE rb_enc_associate_index(VALUE obj, int encindex); - -/** - * Identical to rb_enc_associate(), except it takes an encoding itself instead - * of its index. - * - * @param[out] obj Object in question. - * @param[in] enc An encoding. - * @exception rb_eFrozenError `obj` is frozen. - * @exception rb_eArgError `obj` is incapable of having an encoding. - * @return The passed `obj`. - * @post `obj`'s contents might be fixed according to `enc`. - */ -VALUE rb_enc_associate(VALUE obj, rb_encoding *enc); - -/** - * Destructively copies the encoding of the latter object to that of former - * one. It can also be seen as a routine identical to - * rb_enc_associate_index(), except it takes an object's encoding instead of an - * encoding's index. - * - * @param[out] dst Object to modify. - * @param[in] src Object to reference. - * @exception rb_eFrozenError `dst` is frozen. - * @exception rb_eArgError `dst` is incapable of having an encoding. - * @exception rb_eEncodingError `src` is incapable of having an encoding. - * @post `dst`'s encoding is that of `src`'s. - */ -void rb_enc_copy(VALUE dst, VALUE src); - -/** - * Identical to rb_enc_str_new(), except it additionally takes an encoding. - * - * @param[in] ptr A memory region of `len` bytes length. - * @param[in] len Length of `ptr`, in bytes, not including the - * terminating NUL character. - * @param[in] enc Encoding of `ptr`. - * @exception rb_eNoMemError Failed to allocate `len+1` bytes. - * @exception rb_eArgError `len` is negative. - * @return An instance of ::rb_cString, of `len` bytes length, of `enc` - * encoding, whose contents are verbatim copy of `ptr`. - * @pre At least `len` bytes of continuous memory region shall be - * accessible via `ptr`. - * @note `enc` can be a null pointer. It can also be seen as a routine - * identical to rb_usascii_str_new() then. - */ -VALUE rb_enc_str_new(const char *ptr, long len, rb_encoding *enc); - -RBIMPL_ATTR_NONNULL((1)) -/** - * Identical to rb_enc_str_new(), except it assumes the passed pointer is a - * pointer to a C string. It can also be seen as a routine identical to - * rb_str_new_cstr(), except it additionally takes an encoding. - * - * @param[in] ptr A C string. - * @param[in] enc Encoding of `ptr`. - * @exception rb_eNoMemError Failed to allocate memory. - * @return An instance of ::rb_cString, of `enc` encoding, whose contents - * are verbatim copy of `ptr`. - * @pre `ptr` must not be a null pointer. - * @pre Because `ptr` is a C string it makes no sense for `enc` to be - * something like UTF-32. - * @note `enc` can be a null pointer. It can also be seen as a routine - * identical to rb_usascii_str_new_cstr() then. - */ -VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc); - -/** - * Identical to rb_enc_str_new(), except it takes a C string literal. It can - * also be seen as a routine identical to rb_str_new_static(), except it - * additionally takes an encoding. - * - * @param[in] ptr A C string literal. - * @param[in] len `strlen(ptr)`. - * @param[in] enc Encoding of `ptr`. - * @exception rb_eArgError `len` out of range of `size_t`. - * @pre `ptr` must be a C string constant. - * @return An instance of ::rb_cString, of `enc` encoding, whose backend - * storage is the passed C string literal. - * @warning It is a very bad idea to write to a C string literal (often - * immediate SEGV shall occur). Consider return values of this - * function be read-only. - * @note `enc` can be a null pointer. It can also be seen as a routine - * identical to rb_usascii_str_new_static() then. - */ -VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc); - -/** - * Identical to rb_enc_str_new(), except it returns a "f"string. It can also - * be seen as a routine identical to rb_interned_str(), except it additionally - * takes an encoding. - * - * @param[in] ptr A memory region of `len` bytes length. - * @param[in] len Length of `ptr`, in bytes, not including the - * terminating NUL character. - * @param[in] enc Encoding of `ptr`. - * @exception rb_eArgError `len` is negative. - * @return A found or created instance of ::rb_cString, of `len` bytes - * length, of `enc` encoding, whose contents are identical to that - * of `ptr`. - * @pre At least `len` bytes of continuous memory region shall be - * accessible via `ptr`. - * @note `enc` can be a null pointer. - */ -VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc); - -RBIMPL_ATTR_NONNULL((1)) -/** - * Identical to rb_enc_str_new_cstr(), except it returns a "f"string. It can - * also be seen as a routine identical to rb_interned_str_cstr(), except it - * additionally takes an encoding. - * - * @param[in] ptr A memory region of `len` bytes length. - * @param[in] enc Encoding of `ptr`. - * @return A found or created instance of ::rb_cString of `enc` encoding, - * whose contents are identical to that of `ptr`. - * @pre At least `len` bytes of continuous memory region shall be - * accessible via `ptr`. - * @note `enc` can be a null pointer. - */ -VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc); - -/** - * Identical to rb_reg_new(), except it additionally takes an encoding. - * - * @param[in] ptr A memory region of `len` bytes length. - * @param[in] len Length of `ptr`, in bytes, not including the - * terminating NUL character. - * @param[in] enc Encoding of `ptr`. - * @param[in] opts Options e.g. ONIG_OPTION_MULTILINE. - * @exception rb_eRegexpError Failed to compile `ptr`. - * @return An allocated new instance of ::rb_cRegexp, of `enc` encoding, - * whose expression is compiled according to `ptr`. - */ -VALUE rb_enc_reg_new(const char *ptr, long len, rb_encoding *enc, int opts); - -RBIMPL_ATTR_NONNULL((2)) -RBIMPL_ATTR_FORMAT(RBIMPL_PRINTF_FORMAT, 2, 3) -/** - * Identical to rb_sprintf(), except it additionally takes an encoding. The - * passed encoding rules both the incoming format specifier and the resulting - * string. - * - * @param[in] enc Encoding of `fmt`. - * @param[in] fmt A `printf`-like format specifier. - * @param[in] ... Variadic number of contents to format. - * @return A rendered new instance of ::rb_cString, of `enc` encoding. - */ -VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt, ...); - -RBIMPL_ATTR_NONNULL((2)) -RBIMPL_ATTR_FORMAT(RBIMPL_PRINTF_FORMAT, 2, 0) -/** - * Identical to rb_enc_sprintf(), except it takes a `va_list` instead of - * variadic arguments. It can also be seen as a routine identical to - * rb_vsprintf(), except it additionally takes an encoding. - * - * @param[in] enc Encoding of `fmt`. - * @param[in] fmt A `printf`-like format specifier. - * @param[in] ap Contents to format. - * @return A rendered new instance of ::rb_cString, of `enc` encoding. - */ -VALUE rb_enc_vsprintf(rb_encoding *enc, const char *fmt, va_list ap); - -/** - * Counts the number of characters of the passed string, according to the - * passed encoding. This has to be complicated. The passed string could be - * invalid and/or broken. This routine would scan from the beginning til the - * end, byte by byte, to seek out character boundaries. Could be super slow. - * - * @param[in] head Leftmost pointer to the string. - * @param[in] tail Rightmost pointer to the string. - * @param[in] enc Encoding of the string. - * @return Number of characters exist in `head` .. `tail`. The definition - * of "character" depends on the passed `enc`. - */ -long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc); - -/** - * Queries the n-th character. Like rb_enc_strlen() this function can be fast - * or slow depending on the contents. Don't expect characters to be uniformly - * distributed across the entire string. - * - * @param[in] head Leftmost pointer to the string. - * @param[in] tail Rightmost pointer to the string. - * @param[in] nth Requested index of characters. - * @param[in] enc Encoding of the string. - * @return Pointer to the first byte of the character that is `nth` - * character ahead of `head`, or `tail` if there is no such - * character (OOB etc). The definition of "character" depends on - * the passed `enc`. - */ -char *rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc); - -/** - * Identical to rb_enc_get_index(), except the return type. - * - * @param[in] obj Object in question. - * @exception rb_eTypeError `obj` is incapable of having an encoding. - * @return `obj`'s encoding. - */ -VALUE rb_obj_encoding(VALUE obj); - -/** - * Identical to rb_str_cat(), except it additionally takes an encoding. - * - * @param[out] str Destination object. - * @param[in] ptr Contents to append. - * @param[in] len Length of `src`, in bytes. - * @param[in] enc Encoding of `ptr`. - * @exception rb_eArgError `len` is negative. - * @exception rb_eEncCompatError `enc` is not compatible with `str`. - * @return The passed `dst`. - * @post The contents of `ptr` is copied, transcoded into `dst`'s - * encoding, then pasted into `dst`'s end. - */ -VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc); - -/** - * Encodes the passed code point into a series of bytes. - * - * @param[in] code Code point. - * @param[in] enc Target encoding scheme. - * @exception rb_eRangeError `enc` does not glean `code`. - * @return An instance of ::rb_cString, of `enc` encoding, whose sole - * contents is `code` represented in `enc`. - * @note No way to encode code points bigger than UINT_MAX. - * - * @internal - * - * In other languages, APIs like this one could be seen as the primitive - * routines where encodings' "encode" feature are implemented. However in case - * of Ruby this is not the primitive one. We directly manipulate encoded - * strings. Encoding conversion routines transocde an encoded string directly - * to another one; not via a code point array. - */ -VALUE rb_enc_uint_chr(unsigned int code, rb_encoding *enc); - -/** - * Identical to rb_external_str_new(), except it additionally takes an - * encoding. However the whole point of rb_external_str_new() is to encode a - * string into default external encoding. Being able to specify arbitrary - * encoding just ruins the designed purpose the function meseems. - * - * @param[in] ptr A memory region of `len` bytes length. - * @param[in] len Length of `ptr`, in bytes, not including the - * terminating NUL character. - * @param[in] enc Target encoding scheme. - * @exception rb_eArgError `len` is negative. - * @return An instance of ::rb_cString. In case encoding conversion from - * "default internal" to `enc` is fully defined over the given - * contents, then the return value is a string of `enc` encoding, - * whose contents are the converted ones. Otherwise the string is - * a junk. - * @warning It doesn't raise on a conversion failure and silently ends up in - * a corrupted output. You can know the failure by querying - * `valid_encoding?` of the result object. - * - * @internal - * - * @shyouhei has no idea why this one does not follow the naming convention - * that others obey. It seems to him that this should have been called - * `rb_enc_external_str_new`. - */ -VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc); - -/** - * Identical to rb_str_export(), except it additionally takes an encoding. - * - * @param[in] obj Target object. - * @param[in] enc Target encoding. - * @exception rb_eTypeError No implicit conversion to String. - * @return Converted ruby string of `enc` encoding. - */ -VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc); - -/** - * Encoding conversion main routine. - * - * @param[in] str String to convert. - * @param[in] from Source encoding. - * @param[in] to Destination encoding. - * @return A copy of `str`, with conversion from `from` to `to` applied. - * @note `from` can be a null pointer. `str`'s encoding is taken then. - * @note `to` can be a null pointer. No-op then. - */ -VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to); - -/** - * Identical to rb_str_conv_enc(), except it additionally takes IO encoder - * options. The extra arguments can be constructed using io_extract_modeenc() - * etc. - * - * @param[in] str String to convert. - * @param[in] from Source encoding. - * @param[in] to Destination encoding. - * @param[in] ecflags A set of enum ::ruby_econv_flag_type. - * @param[in] ecopts Optional hash. - * @return A copy of `str`, with conversion from `from` to `to` applied. - * @note `from` can be a null pointer. `str`'s encoding is taken then. - * @note `to` can be a null pointer. No-op then. - * @note `ecopts` can be ::RUBY_Qnil, which is equivalent to passing an - * empty hash. - */ -VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts); - -/** @cond INTERNAL_MACRO */ -#ifdef HAVE_BUILTIN___BUILTIN_CONSTANT_P -#define rb_enc_str_new(str, len, enc) RB_GNUC_EXTENSION_BLOCK( \ - (__builtin_constant_p(str) && __builtin_constant_p(len)) ? \ - rb_enc_str_new_static((str), (len), (enc)) : \ - rb_enc_str_new((str), (len), (enc)) \ -) -#define rb_enc_str_new_cstr(str, enc) RB_GNUC_EXTENSION_BLOCK( \ - (__builtin_constant_p(str)) ? \ - rb_enc_str_new_static((str), (long)strlen(str), (enc)) : \ - rb_enc_str_new_cstr((str), (enc)) \ -) -#endif -/** @endcond */ - -RBIMPL_ATTR_NORETURN() -RBIMPL_ATTR_NONNULL((3)) -RBIMPL_ATTR_FORMAT(RBIMPL_PRINTF_FORMAT, 3, 4) -/** - * Identical to rb_raise(), except it additionally takes an encoding. - * - * @param[in] enc Encoding of the generating exception. - * @param[in] exc A subclass of ::rb_eException. - * @param[in] fmt Format specifier string compatible with rb_sprintf(). - * @param[in] ... Contents of the message. - * @exception exc The specified exception. - * @note It never returns. - */ -void rb_enc_raise(rb_encoding *enc, VALUE exc, const char *fmt, ...); - -/** - * Identical to rb_find_encoding(), except it takes an encoding index instead - * of a Ruby object. - * - * @param[in] idx An encoding index. - * @retval NULL No such encoding. - * @retval otherwise An encoding whose index is `idx`. - */ -rb_encoding *rb_enc_from_index(int idx); - -/** - * Identical to rb_find_encoding(), except it takes a C's string instead of - * Ruby's. - * - * @param[in] name Name of the encoding to query. - * @retval NULL No such encoding. - * @retval otherwise An encoding whose index is `idx`. - */ -rb_encoding *rb_enc_find(const char *name); - -/** - * Queries the (canonical) name of the passed encoding. - * - * @param[in] enc An encoding. - * @return Its name. - */ -#define rb_enc_name(enc) (enc)->name - -/** - * Queries the minimum number of bytes that the passed encoding needs to - * represent a character. For ASCII and compatible encodings this is typically - * 1. There are however encodings whose minimum is not 1; they are - * historically called wide characters. - * - * @param[in] enc An encoding. - * @return Its least possible number of bytes except 0. - */ -#define rb_enc_mbminlen(enc) (enc)->min_enc_len - -/** - * Queries the maximum number of bytes that the passed encoding needs to - * represent a character. Fixed-width encodings have the same value for this - * one and #rb_enc_mbminlen. However there are variable-width encodings. - * UTF-8, for instance, takes from 1 up to 6 bytes. - * - * @param[in] enc An encoding. - * @return Its maximum possible number of bytes of a character. - */ -#define rb_enc_mbmaxlen(enc) (enc)->max_enc_len - -/** - * Queries the number of bytes of the character at the passed pointer. - * - * @param[in] p Pointer to a character's first byte. - * @param[in] e End of the string that has `p`. - * @param[in] enc Encoding of the string. - * @return If the character at `p` does not end until `e`, number of bytes - * between `p` and `e`. Otherwise the number of bytes that the - * character at `p` is encoded. - * - * @internal - * - * Strictly speaking there are chances when `p` points to a middle byte of a - * wide character. This function returns "the number of bytes from `p` to - * nearest of either `e` or the next character boundary", if you go strict. - */ -int rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc); - -/** - * Identical to rb_enc_mbclen() unless the character at `p` overruns `e`. That - * can happen for instance when you read from a socket and its partial read - * cuts a wide character in-between. In those situations this function - * "estimates" theoretical length of the character in question. Typically it - * tends to be possible to know how many bytes a character needs before - * actually reaching its end; for instance UTF-8 encodes a character's length - * in the first byte of it. This function returns that info. - * - * @note This implies that the string is not broken. - * - * @param[in] p Pointer to the character's first byte. - * @param[in] e End of the string that has `p`. - * @param[in] enc Encoding of the string. - * @return Number of bytes of character at `p`, measured or estimated. - */ -int rb_enc_fast_mbclen(const char *p, const char *e, rb_encoding *enc); - -/** - * Queries the number of bytes of the character at the passed pointer. This - * function returns 3 different types of information: - * - * ```CXX - * auto n = rb_enc_precise_mbclen(p, q, r); - * - * if (ONIGENC_MBCLEN_CHARFOUND_P(n)) { - * // Character found. Normal return. - * auto found_length = ONIGENC_MBCLEN_CHARFOUND_LEN(n); - * } - * else if (ONIGENC_MBCLEN_NEEDMORE_P(n)) { - * // Character overruns past `q`; needs more. - * auto requested_length = ONIGENC_MBCLEN_NEEDMORE_LEN(n); - * } - * else { - * // `p` is broken. - * assert(ONIGENC_MBCLEN_INVALID_P(n)); - * } - * ``` - * - * @param[in] p Pointer to the character's first byte. - * @param[in] e End of the string that has `p`. - * @param[in] enc Encoding of the string. - * @return Encoded read/needed number of bytes (see above). - */ -int rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc); - -#define MBCLEN_CHARFOUND_P(ret) ONIGENC_MBCLEN_CHARFOUND_P(ret) /**< @old{ONIGENC_MBCLEN_CHARFOUND_P} */ -#define MBCLEN_CHARFOUND_LEN(ret) ONIGENC_MBCLEN_CHARFOUND_LEN(ret) /**< @old{ONIGENC_MBCLEN_CHARFOUND_LEN} */ -#define MBCLEN_INVALID_P(ret) ONIGENC_MBCLEN_INVALID_P(ret) /**< @old{ONIGENC_MBCLEN_INVALID_P} */ -#define MBCLEN_NEEDMORE_P(ret) ONIGENC_MBCLEN_NEEDMORE_P(ret) /**< @old{ONIGENC_MBCLEN_NEEDMORE_P} */ -#define MBCLEN_NEEDMORE_LEN(ret) ONIGENC_MBCLEN_NEEDMORE_LEN(ret) /**< @old{ONIGENC_MBCLEN_NEEDMORE_LEN} */ - -/** - * Queries the code point of character pointed by the passed pointer. If that - * code point is included in ASCII that code point is returned. Otherwise -1. - * This can be different from just looking at the first byte. For instance it - * reads 2 bytes in case of UTF-16BE. - * - * @param[in] p Pointer to the character's first byte. - * @param[in] e End of the string that has `p`. - * @param[in] len Return buffer. - * @param[in] enc Encoding of the string. - * @retval -1 The character at `p` is not i ASCII. - * @retval otherwise A code point of the character at `p`. - * @post `len` (if set) is the number of bytes of `p`. - */ -int rb_enc_ascget(const char *p, const char *e, int *len, rb_encoding *enc); - -/** - * Queries the code point of character pointed by the passed pointer. - * Exceptions happen in case of broken input. - * - * @param[in] p Pointer to the character's first byte. - * @param[in] e End of the string that has `p`. - * @param[in] len Return buffer. - * @param[in] enc Encoding of the string. - * @exception rb_eArgError `p` is broken. - * @return Code point of the character pointed by `p`. - * @post `len` (if set) is the number of bytes of `p`. - */ -unsigned int rb_enc_codepoint_len(const char *p, const char *e, int *len, rb_encoding *enc); - -RBIMPL_ATTR_DEPRECATED(("use rb_enc_codepoint_len instead.")) -/** - * Queries the code point of character pointed by the passed pointer. - * Exceptions happen in case of broken input. - * - * @deprecated Use rb_enc_codepoint_len() instead. - * @param[in] p Pointer to the character's first byte. - * @param[in] e End of the string that has `p`. - * @param[in] enc Encoding of the string. - * @exception rb_eArgError `p` is broken. - * @return Code point of the character pointed by `p`. - */ -unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc); - -/** @cond INTERNAL_MACRO */ -#define rb_enc_codepoint(p,e,enc) rb_enc_codepoint_len((p),(e),0,(enc)) -/** @endcond */ - -/** - * Identical to rb_enc_codepoint(), except it assumes the passed character is - * not broken. - * - * @param[in] p Pointer to the character's first byte. - * @param[in] e End of the string that has `p`. - * @param[in] enc Encoding of the string. - * @return Code point of the character pointed by `p`. - */ -#define rb_enc_mbc_to_codepoint(p, e, enc) ONIGENC_MBC_TO_CODE((enc),(UChar*)(p),(UChar*)(e)) - -/** - * Queries the number of bytes requested to represent the passed code point - * using the passed encoding. - * - * @param[in] code Code point in question. - * @param[in] enc Encoding to convert the code into a byte sequence. - * @exception rb_eArgError `enc` does not glean `code`. - * @return Number of bytes requested to represent `code` using `enc`. - */ -int rb_enc_codelen(int code, rb_encoding *enc); - -/** - * Identical to rb_enc_codelen(), except it returns 0 for invalid code points. - * - * @param[in] code Code point in question. - * @param[in] enc Encoding to convert the code into a byte sequence. - * @retval 0 `code` is invalid. - * @return otherwise Number of bytes used for `enc` to encode `code`. - */ -int rb_enc_code_to_mbclen(int code, rb_encoding *enc); - -/** @cond INTERNAL_MACRO */ -#define rb_enc_code_to_mbclen(c, enc) ONIGENC_CODE_TO_MBCLEN((enc), (c)); -/** @endcond */ - -/** - * Identical to rb_enc_uint_chr(), except it writes back to the passed buffer - * instead of allocating one. - * - * @param[in] c Code point. - * @param[out] buf Return buffer. - * @param[in] enc Target encoding scheme. - * @post `c` is encoded according to `enc`, then written to `buf`. - */ -#define rb_enc_mbcput(c,buf,enc) ONIGENC_CODE_TO_MBC((enc),(c),(UChar*)(buf)) - -/** - * Queries the previous (left) character. - * - * @param[in] s Start of the string. - * @param[in] p Pointer to a character. - * @param[in] e End of the string. - * @param[in] enc Encoding. - * @retval NULL No previous character. - * @retval otherwise Pointer to the head of the previous character. - */ -#define rb_enc_prev_char(s,p,e,enc) ((char *)onigenc_get_prev_char_head((enc),(UChar*)(s),(UChar*)(p),(UChar*)(e))) - -/** - * Queries the left boundary of a character. This function takes a pointer - * that is not necessarily a head of a character, and searches for its head. - * - * @param[in] s Start of the string. - * @param[in] p Pointer to a possibly-middle of a character. - * @param[in] e End of the string. - * @param[in] enc Encoding. - * @return Pointer to the head of the character that contains `p`. - */ -#define rb_enc_left_char_head(s,p,e,enc) ((char *)onigenc_get_left_adjust_char_head((enc),(UChar*)(s),(UChar*)(p),(UChar*)(e))) - -/** - * Queries the right boundary of a character. This function takes a pointer - * that is not necessarily a head of a character, and searches for its tail. - * - * @param[in] s Start of the string. - * @param[in] p Pointer to a possibly-middle of a character. - * @param[in] e End of the string. - * @param[in] enc Encoding. - * @return Pointer to the end of the character that contains `p`. - */ -#define rb_enc_right_char_head(s,p,e,enc) ((char *)onigenc_get_right_adjust_char_head((enc),(UChar*)(s),(UChar*)(p),(UChar*)(e))) - -/** - * Scans the string backwards for n characters. - * - * @param[in] s Start of the string. - * @param[in] p Pointer to a character. - * @param[in] e End of the string. - * @param[in] n Steps. - * @param[in] enc Encoding. - * @retval NULL There are no `n` characters left. - * @retval otherwise Pointer to `n` character before `p`. - */ -#define rb_enc_step_back(s,p,e,n,enc) ((char *)onigenc_step_back((enc),(UChar*)(s),(UChar*)(p),(UChar*)(e),(int)(n))) - -/** - * Queries if the passed pointer points to a newline character. What is a - * newline and what is not depends on the passed encoding. - * - * @param[in] p Pointer to a possibly-middle of a character. - * @param[in] end End of the string. - * @param[in] enc Encoding. - * @retval 0 It isn't. - * @retval otherwise It is. - */ -#define rb_enc_is_newline(p,end,enc) ONIGENC_IS_MBC_NEWLINE((enc),(UChar*)(p),(UChar*)(end)) - -/** - * Queries if the passed code point is of passed character type in the passed - * encoding. The "character type" here is a set of macros defined in onigmo.h, - * like `ONIGENC_CTYPE_PUNCT`. - * - * @param[in] c A code point. - * @param[in] t Type (see above). - * @param[in] enc Encoding. - * @retval 1 `c` is of `t` in `enc`. - * @retval 0 Otherwise. - */ -#define rb_enc_isctype(c,t,enc) ONIGENC_IS_CODE_CTYPE((enc),(c),(t)) - -/** - * Identical to rb_isascii(), except it additionally takes an encoding. - * - * @param[in] c A code point. - * @param[in] enc An encoding. - * @retval 0 `c` is out of range of ASCII character set in `enc`. - * @retval 1 Otherwise. - * - * @internal - * - * `enc` is ignored. This is at least an intentional implementation detail - * (not a bug). But there could be rooms for future extensions. - */ -#define rb_enc_isascii(c,enc) ONIGENC_IS_CODE_ASCII(c) - -/** - * Identical to rb_isalpha(), except it additionally takes an encoding. - * - * @param[in] c A code point. - * @param[in] enc An encoding. - * @retval 1 `enc` classifies `c` as "ALPHA". - * @retval 0 Otherwise. - */ -#define rb_enc_isalpha(c,enc) ONIGENC_IS_CODE_ALPHA((enc),(c)) - -/** - * Identical to rb_islower(), except it additionally takes an encoding. - * - * @param[in] c A code point. - * @param[in] enc An encoding. - * @retval 1 `enc` classifies `c` as "LOWER". - * @retval 0 Otherwise. - */ -#define rb_enc_islower(c,enc) ONIGENC_IS_CODE_LOWER((enc),(c)) - -/** - * Identical to rb_isupper(), except it additionally takes an encoding. - * - * @param[in] c A code point. - * @param[in] enc An encoding. - * @retval 1 `enc` classifies `c` as "UPPER". - * @retval 0 Otherwise. - */ -#define rb_enc_isupper(c,enc) ONIGENC_IS_CODE_UPPER((enc),(c)) - -/** - * Identical to rb_ispunct(), except it additionally takes an encoding. - * - * @param[in] c A code point. - * @param[in] enc An encoding. - * @retval 1 `enc` classifies `c` as "PUNCT". - * @retval 0 Otherwise. - */ -#define rb_enc_ispunct(c,enc) ONIGENC_IS_CODE_PUNCT((enc),(c)) - -/** - * Identical to rb_isalnum(), except it additionally takes an encoding. - * - * @param[in] c A code point. - * @param[in] enc An encoding. - * @retval 1 `enc` classifies `c` as "ANUM". - * @retval 0 Otherwise. - */ -#define rb_enc_isalnum(c,enc) ONIGENC_IS_CODE_ALNUM((enc),(c)) - -/** - * Identical to rb_isprint(), except it additionally takes an encoding. - * - * @param[in] c A code point. - * @param[in] enc An encoding. - * @retval 1 `enc` classifies `c` as "PRINT". - * @retval 0 Otherwise. - */ -#define rb_enc_isprint(c,enc) ONIGENC_IS_CODE_PRINT((enc),(c)) - -/** - * Identical to rb_isspace(), except it additionally takes an encoding. - * - * @param[in] c A code point. - * @param[in] enc An encoding. - * @retval 1 `enc` classifies `c` as "PRINT". - * @retval 0 Otherwise. - */ -#define rb_enc_isspace(c,enc) ONIGENC_IS_CODE_SPACE((enc),(c)) - -/** - * Identical to rb_isdigit(), except it additionally takes an encoding. - * - * @param[in] c A code point. - * @param[in] enc An encoding. - * @retval 1 `enc` classifies `c` as "DIGIT". - * @retval 0 Otherwise. - */ -#define rb_enc_isdigit(c,enc) ONIGENC_IS_CODE_DIGIT((enc),(c)) - -/** - * @private - * - * This is an implementation detail of rb_enc_asciicompat(). People don't use - * it directly. Just always use rb_enc_asciicompat(). - * - * @param[in] enc Encoding in question. - * @retval 1 It is ASCII compatible. - * @retval 0 It isn't. - */ -static inline int -rb_enc_asciicompat_inline(rb_encoding *enc) -{ - return rb_enc_mbminlen(enc)==1 && !rb_enc_dummy_p(enc); -} - -/** - * Queries if the passed encoding is _in some sense_ compatible with ASCII. - * The concept of ASCII compatibility is nuanced, and private to our - * implementation. For instance SJIS is ASCII compatible to us, despite their - * having different characters at code point `0x5C`. This is based on some - * practical consideration that Japanese people confuses SJIS to be "upper - * compatible" with ASCII (which is in fact a wrong idea, but we just don't go - * strict here). An example of ASCII incompatible encoding is UTF-16. UTF-16 - * shares code points with ASCII, but employs a completely different encoding - * scheme. - * - * @param[in] enc Encoding in question. - * @retval 0 It is incompatible. - * @retval 1 It is compatible. - */ -#define rb_enc_asciicompat(enc) rb_enc_asciicompat_inline(enc) - -RBIMPL_ATTR_CONST() -/** - * Identical to rb_toupper(), except it additionally takes an encoding. - * - * @param[in] c A code point. - * @param[in] enc An encoding. - * @return `c`'s (Ruby's definition of) upper case counterpart. - * - * @internal - * - * As `RBIMPL_ATTR_CONST` implies this function ignores `enc`. - */ -int rb_enc_toupper(int c, rb_encoding *enc); - -RBIMPL_ATTR_CONST() -/** - * Identical to rb_tolower(), except it additionally takes an encoding. - * - * @param[in] c A code point. - * @param[in] enc An encoding. - * @return `c`'s (Ruby's definition of) lower case counterpart. - * - * @internal - * - * As `RBIMPL_ATTR_CONST` implies this function ignores `enc`. - */ -int rb_enc_tolower(int c, rb_encoding *enc); - -/** - * Identical to rb_intern2(), except it additionally takes an encoding. - * - * @param[in] name The name of the id. - * @param[in] len Length of `name`. - * @param[in] enc `name`'s encoding. - * @exception rb_eRuntimeError Too many symbols. - * @return A (possibly new) id whose value is the given name. - * @note These days Ruby internally has two kinds of symbols - * (static/dynamic). Symbols created using this function would - * become static ones; i.e. would never be garbage collected. It - * is up to you to avoid memory leaks. Think twice before using - * it. - */ -ID rb_intern3(const char *name, long len, rb_encoding *enc); - -RBIMPL_ATTR_NONNULL(()) -/** - * Identical to rb_symname_p(), except it additionally takes an encoding. - * - * @param[in] str A C string to check. - * @param[in] enc `str`'s encoding. - * @retval 1 It is a valid symbol name. - * @retval 0 It is invalid as a symbol name. - */ -int rb_enc_symname_p(const char *str, rb_encoding *enc); - -/** - * Identical to rb_enc_symname_p(), except it additionally takes the passed - * string's length. This is needed for strings containing NUL bytes, like in - * case of UTF-32. - * - * @param[in] name A C string to check. - * @param[in] len Number of bytes of `str`. - * @param[in] enc `str`'s encoding. - * @retval 1 It is a valid symbol name. - * @retval 0 It is invalid as a symbol name. - */ -int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc); - -/** - * Scans the passed string to collect its code range. Because a Ruby's string - * is mutable, its contents change from time to time; so does its code range. - * A long-lived string tends to fall back to ::RUBY_ENC_CODERANGE_UNKNOWN. - * This API scans it and re-assigns a fine-grained code range constant. - * - * @param[out] str A string. - * @return An enum ::ruby_coderange_type. - */ -int rb_enc_str_coderange(VALUE str); - -/** - * Scans the passed string until it finds something odd. Returns the number of - * bytes scanned. As the name implies this is suitable for repeated call. One - * of its application is `IO#readlines`. The method reads from its receiver's - * read buffer, maybe more than once, looking for newlines. But "newline" can - * be different among encodings. This API is used to detect broken contents to - * properly mark them as such. - * - * @param[in] str String to scan. - * @param[in] end End of `str`. - * @param[in] enc `str`'s encoding. - * @param[out] cr Return buffer. - * @return Distance between `str` and first such byte where broken. - * @post `cr` has the code range type. - */ -long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr); - -/** - * Queries if the passed string is "ASCII only". An ASCII only string is a - * string who doesn't have any non-ASCII characters at all. This doesn't - * necessarily mean the string is in ASCII encoding. For instance a String of - * CP932 encoding can quite much be ASCII only, depending on its contents. - * - * @param[in] str String in question. - * @retval 1 It doesn't have non-ASCII characters. - * @retval 0 It has characters that are out of ASCII. - */ -int rb_enc_str_asciionly_p(VALUE str); - -/** - * Queries if the passed string is in an ASCII-compatible encoding. - * - * @param[in] str A Ruby's string to query. - * @retval 0 `str` is not a String, or an ASCII-incompatible string. - * @retval 1 Otherwise. - */ -#define rb_enc_str_asciicompat_p(str) rb_enc_asciicompat(rb_enc_get(str)) - -/** - * Queries the Ruby-level counterpart instance of ::rb_cEncoding that - * corresponds to the passed encoding. - * - * @param[in] enc An encoding - * @retval RUBY_Qnil `enc` is a null pointer. - * @retval otherwise An instance of ::rb_cEncoding. - */ -VALUE rb_enc_from_encoding(rb_encoding *enc); - -RBIMPL_ATTR_PURE() -/** - * Queries if the passed encoding is either one of UTF-8/16/32. - * - * @note It does not take UTF-7, which we actually support, into account. - * - * @param[in] enc Encoding in question. - * @retval 0 It is not a Unicode variant. - * @retval otherwise It is. - * - * @internal - * - * In reality it returns 1/0, but the value is abstracted as - * `ONIGENC_FLAG_UNICODE`. - */ -int rb_enc_unicode_p(rb_encoding *enc); - -RBIMPL_ATTR_RETURNS_NONNULL() -/** - * Queries the encoding that represents ASCII-8BIT a.k.a. binary. - * - * @return The encoding that represents ASCII-8BIT. - * - * @internal - * - * This can not return NULL once the process properly boots up. - */ -rb_encoding *rb_ascii8bit_encoding(void); - -RBIMPL_ATTR_RETURNS_NONNULL() -/** - * Queries the encoding that represents UTF-8. - * - * @return The encoding that represents UTF-8. - * - * @internal - * - * This can not return NULL once the process properly boots up. - */ -rb_encoding *rb_utf8_encoding(void); - -RBIMPL_ATTR_RETURNS_NONNULL() -/** - * Queries the encoding that represents US-ASCII. - * - * @return The encoding that represents US-ASCII. - * - * @internal - * - * This can not return NULL once the process properly boots up. - */ -rb_encoding *rb_usascii_encoding(void); - -/** - * Queries the encoding that represents the current locale. - * - * @return The encoding that represents the process' locale. - * - * @internal - * - * This is dynamic. If you change the process' locale by e.g. calling - * `setlocale(3)`, that should also change the return value of this function. - * - * There is no official way for Ruby scripts to manipulate locales, though. - */ -rb_encoding *rb_locale_encoding(void); - -/** - * Queries the "filesystem" encoding. This is the encoding that ruby expects - * info from the OS' file system are in. This affects for instance return - * value of rb_dir_getwd(). Most notably on Windows it can be an alias of OS - * codepage. Most notably on Linux users can set this via default external - * encoding. - * - * @return The "filesystem" encoding. - */ -rb_encoding *rb_filesystem_encoding(void); - -/** - * Queries the "default external" encoding. This is used to interact with - * outer-process things such as File. Though not recommended, you can set this - * using rb_enc_set_default_external(). - * - * @return The "default external" encoding. - */ -rb_encoding *rb_default_external_encoding(void); - -/** - * Queries the "default internal" encoding. This could be a null pointer. - * Otherwise, outer-process info are transcoded from default external encoding - * to this one during reading from an IO. - * - * @return The "default internal" encoding (if any). - */ -rb_encoding *rb_default_internal_encoding(void); - -#ifndef rb_ascii8bit_encindex -RBIMPL_ATTR_CONST() -/** - * Identical to rb_ascii8bit_encoding(), except it returns the encoding's index - * instead of the encoding itself. - * - * @return The index of encoding of ASCII-8BIT. - * - * @internal - * - * This happens to be 0. - */ -int rb_ascii8bit_encindex(void); -#endif - -#ifndef rb_utf8_encindex -RBIMPL_ATTR_CONST() -/** - * Identical to rb_utf8_encoding(), except it returns the encoding's index - * instead of the encoding itself. - * - * @return The index of encoding of UTF-8. - */ -int rb_utf8_encindex(void); -#endif - -#ifndef rb_usascii_encindex -RBIMPL_ATTR_CONST() -/** - * Identical to rb_usascii_encoding(), except it returns the encoding's index - * instead of the encoding itself. - * - * @return The index of encoding of UTF-8. - */ -int rb_usascii_encindex(void); -#endif - -/** - * Identical to rb_locale_encoding(), except it returns the encoding's index - * instead of the encoding itself. - * - * @return The index of the locale encoding. - */ -int rb_locale_encindex(void); - -/** - * Identical to rb_filesystem_encoding(), except it returns the encoding's - * index instead of the encoding itself. - * - * @return The index of the filesystem encoding. - */ -int rb_filesystem_encindex(void); - -/** - * Identical to rb_default_external_encoding(), except it returns the - * Ruby-level counterpart instance of ::rb_cEncoding that corresponds to the - * default external encoding. - * - * @return An instance of ::rb_cEncoding of default external. - */ -VALUE rb_enc_default_external(void); - -/** - * Identical to rb_default_internal_encoding(), except it returns the - * Ruby-level counterpart instance of ::rb_cEncoding that corresponds to the - * default internal encoding. - * - * @return An instance of ::rb_cEncoding of default internal. - */ -VALUE rb_enc_default_internal(void); - -/** - * Destructively assigns the passed encoding as the default external encoding. - * You should not use this API. It has process-global side effects. Also it - * doesn't change encodings of strings that have already been read. - * - * @param[in] encoding Ruby level encoding. - * @exception rb_eArgError `encoding` is ::RUBY_Qnil. - * @post The default external encoding is `encoding`. - */ -void rb_enc_set_default_external(VALUE encoding); - -/** - * Destructively assigns the passed encoding as the default internal encoding. - * You should not use this API. It has process-global side effects. Also it - * doesn't change encodings of strings that have already been read. - * - * @param[in] encoding Ruby level encoding. - * @post The default internal encoding is `encoding`. - * @note Unlike rb_enc_set_default_external() you can pass ::RUBY_Qnil. - */ -void rb_enc_set_default_internal(VALUE encoding); - -/** - * Returns a platform-depended "charmap" of the current locale. This - * information is called a "Codeset name" in IEEE 1003.1 section 13 - * (``). This is a very low-level API. The return value can have - * no corresponding encoding when passed to rb_find_encoding(). - * - * @param[in] klass Ignored for no reason (why...) - * @return The low-level locale charmap, in Ruby's String. - */ -VALUE rb_locale_charmap(VALUE klass); - -RBIMPL_ATTR_NONNULL(()) -/** - * Looks for the passed string in the passed buffer. - * - * @param[in] x Buffer that potentially includes `y`. - * @param[in] m Number of bytes of `x`. - * @param[in] y Query string. - * @param[in] n Number of bytes of `y`. - * @param[in] enc Encoding of both `x` and `y`. - * @retval -1 Not found. - * @retval otherwise Found index in `x`. - * @note This API can match at a non-character-boundary. - */ -long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc); - -RBIMPL_ATTR_NONNULL(()) -/** - * Returns a path component directly adjacent to the passed pointer. - * - * ``` - * "/multi/byte/encoded/pathname.txt" - * ^ ^ ^ - * | | +--- end - * | +--- @return - * +--- path - * ``` - * - * @param[in] path Where to start scanning. - * @param[in] end End of the path string. - * @param[in] enc Encoding of the string. - * @return A pointer in the passed string where the next path component - * resides, or `end` if there is no next path component. - */ -char *rb_enc_path_next(const char *path, const char *end, rb_encoding *enc); - -RBIMPL_ATTR_NONNULL(()) -/** - * Seeks for non-prefix part of a pathname. This can be a no-op when the OS - * has no such concept like a path prefix. But there are OSes where path - * prefixes do exist. - * - * ``` - * "C:\multi\byte\encoded\pathname.txt" - * ^ ^ ^ - * | | +--- end - * | +--- @return - * +--- path - * ``` - * - * @param[in] path Where to start scanning. - * @param[in] end End of the path string. - * @param[in] enc Encoding of the string. - * @return A pointer in the passed string where non-prefix part starts, or - * `path` if the OS does not have path prefix. - */ -char *rb_enc_path_skip_prefix(const char *path, const char *end, rb_encoding *enc); - -RBIMPL_ATTR_NONNULL(()) -/** - * Returns the last path component. - * - * ``` - * "/multi/byte/encoded/pathname.txt" - * ^ ^ ^ - * | | +--- end - * | +--- @return - * +--- path - * ``` - * - * @param[in] path Where to start scanning. - * @param[in] end End of the path string. - * @param[in] enc Encoding of the string. - * @return A pointer in the passed string where the last path component - * resides, or `end` if there is no more path component. - */ -char *rb_enc_path_last_separator(const char *path, const char *end, rb_encoding *enc); - -RBIMPL_ATTR_NONNULL(()) -/** - * This just returns the passed end basically. It makes difference in case the - * passed string ends with tons of path separators like the following: - * - * ``` - * "/path/that/ends/with/lots/of/slashes//////////////" - * ^ ^ ^ - * | | +--- end - * | +--- @return - * +--- path - * ``` - * - * @param[in] path Where to start scanning. - * @param[in] end End of the path string. - * @param[in] enc Encoding of the string. - * @return A pointer in the passed string where the trailing path - * separators start, or `end` if there is no trailing path - * separators. - * - * @internal - * - * It seems this function was introduced to mimic what POSIX says about - * `basename(3)`. - */ -char *rb_enc_path_end(const char *path, const char *end, rb_encoding *enc); - -RBIMPL_ATTR_NONNULL((1, 4)) -/** - * Our own encoding-aware version of `basename(3)`. Normally, this function - * returns the last path component of the given name. However in case the - * passed name ends with a path separator, it returns the name of the - * directory, not the last (empty) component. Also if the passed name is a - * root directory, it returns that root directory. Note however that Windows - * filesystem have drive letters, which this function does not return. - * - * @param[in] name Target path. - * @param[out] baselen Return buffer. - * @param[in,out] alllen Number of bytes of `name`. - * @param[enc] enc Encoding of `name`. - * @return The rightmost component of `name`. - * @post `baselen`, if passed, is updated to be the number of bytes - * of the returned basename. - * @post `alllen`, if passed, is updated to be the number of bytes of - * strings not considered as the basename. - */ -const char *ruby_enc_find_basename(const char *name, long *baselen, long *alllen, rb_encoding *enc); - -RBIMPL_ATTR_NONNULL((1, 3)) -/** - * Our own encoding-aware version of `extname`. This function first applies - * rb_enc_path_last_separator() to the passed name and only concerns its return - * value (ignores any parent directories). This function returns complicated - * results: - * - * ```CXX - * auto path = "..."; - * auto len = strlen(path); - * auto ret = ruby_enc_find_extname(path, &len, rb_ascii8bit_encoding()); - * - * switch(len) { - * case 0: - * if (ret == 0) { - * // `path` is a file without extensions. - * } - * else { - * // `path` is a dotfile. - * // `ret` is the file's name. - * } - * break; - * - * case 1: - * // `path` _ends_ with a dot. - * // `ret` is that dot. - * break; - * - * default: - * // `path` has an extension. - * // `ret` is that extension. - * } - * ``` - * - * @param[in] name Target path. - * @param[in,out] len Number of bytes of `name`. - * @param[in] enc Encoding of `name`. - * @return See above. - * @post `len`, if passed, is updated (see above). - */ -const char *ruby_enc_find_extname(const char *name, long *len, rb_encoding *enc); - -/** - * Identical to rb_check_id(), except it takes a pointer to a memory region - * instead of Ruby's string. - * - * @param[in] ptr A pointer to a memory region. - * @param[in] len Number of bytes of `ptr`. - * @param[in] enc Encoding of `ptr`. - * @exception rb_eEncodingError `ptr` contains non-ASCII according to `enc`. - * @retval 0 No such id ever existed in the history. - * @retval otherwise The id that represents the given name. - */ -ID rb_check_id_cstr(const char *ptr, long len, rb_encoding *enc); - -/** - * Identical to rb_check_id_cstr(), except for the return type. It can also be - * seen as a routine identical to rb_check_symbol(), except it takes a pointer - * to a memory region instead of Ruby's string. - * - * @param[in] ptr A pointer to a memory region. - * @param[in] len Number of bytes of `ptr`. - * @param[in] enc Encoding of `ptr`. - * @exception rb_eEncodingError `ptr` contains non-ASCII according to `enc`. - * @retval RUBY_Qnil No such id ever existed in the history. - * @retval otherwise The id that represents the given name. - */ -VALUE rb_check_symbol_cstr(const char *ptr, long len, rb_encoding *enc); - -/** - * `Encoding` class. - * - * @ingroup object - */ -RUBY_EXTERN VALUE rb_cEncoding; - -/* econv stuff */ - -/** return value of rb_econv_convert() */ -typedef enum { - - /** - * The conversion stopped when it found an invalid sequence. - */ - econv_invalid_byte_sequence, - - /** - * The conversion stopped when it found a character in the input which - * cannot be representable in the output. - */ - econv_undefined_conversion, - - /** - * The conversion stopped because there is no destination. - */ - econv_destination_buffer_full, - - /** - * The conversion stopped because there is no input. - */ - econv_source_buffer_empty, - - /** - * The conversion stopped after converting everything. This is arguably - * the expected normal end of conversion. - */ - econv_finished, - - /** - * The conversion stopped after writing something to somewhere, before - * reading everything. - */ - econv_after_output, - - /** - * The conversion stopped in middle of reading a character, possibly due to - * a partial read of a socket etc. - */ - econv_incomplete_input -} rb_econv_result_t; - -/** An opaque struct that represents a lowest level of encoding conversion. */ -typedef struct rb_econv_t rb_econv_t; - -/** - * Converts the contents of the passed string from its encoding to the passed - * one. - * - * @param[in] str Target string. - * @param[in] to Destination encoding. - * @param[in] ecflags A set of enum - * ::ruby_econv_flag_type. - * @param[in] ecopts A keyword hash, like - * ::rb_io_t::rb_io_enc_t::ecopts. - * @exception rb_eArgError Not fully converted. - * @exception rb_eInvalidByteSequenceError `str` is malformed. - * @exception rb_eUndefinedConversionError `str` has a character not - * representable using `to`. - * @exception rb_eConversionNotFoundError There is no known conversion from - * `str`'s encoding to `to`. - * @return A string whose encoding is `to`, and whose contents is converted - * contents of `str`. - * @note Use rb_econv_prepare_options() to generate `ecopts`. - */ -VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts); - -/** - * Queries if there is more than one way to convert between the passed two - * encodings. Encoding conversion are has_and_belongs_to_many relationships. - * There could be no direct conversion defined for the passed pair. Ruby tries - * to find an indirect way to do so then. For instance ISO-8859-1 has no - * direct conversion to ISO-2022-JP. But there is ISO-8859-1 to UTF-8 - * conversion; then there is UTF-8 to EUC-JP conversion; finally there also is - * EUC-JP to ISO-2022-JP conversion. So in short ISO-8859-1 can be converted - * to ISO-2022-JP using that path. This function returns true. Obviously not - * everything that can be represented using UTF-8 can also be represented using - * EUC-JP. Conversions in practice can fail depending on the actual input, and - * that renders exceptions in case of rb_str_encode(). - * - * @param[in] from_encoding One encoding. - * @param[in] to_encoding Another encoding. - * @retval 0 No way to convert the two. - * @retval 1 At least one way to convert the two. - * - * @internal - * - * Practically @shyouhei knows no way for this function to return 0. It seems - * everything can eventually be converted to/from UTF-8, which connects - * everything. - */ -int rb_econv_has_convpath_p(const char* from_encoding, const char* to_encoding); - -/** - * Identical to rb_econv_prepare_opts(), except it additionally takes the - * initial value of flags. The extra bits are bitwise-ORed to the return - * value. - * - * @param[in] opthash Keyword arguments. - * @param[out] ecopts Return buffer. - * @param[in] ecflags Default set of enum ::ruby_econv_flag_type. - * @exception rb_eArgError Unknown/Broken values passed. - * @return Calculated set of enum ::ruby_econv_flag_type. - * @post `ecopts` holds a hash object suitable for - * ::rb_io_t::rb_io_enc_t::ecopts. - */ -int rb_econv_prepare_options(VALUE opthash, VALUE *ecopts, int ecflags); - -/** - * Splits a keyword arguments hash (that for instance `String#encode` took) - * into a set of enum ::ruby_econv_flag_type and a hash storing replacement - * characters etc. - * - * @param[in] opthash Keyword arguments. - * @param[out] ecopts Return buffer. - * @exception rb_eArgError Unknown/Broken values passed. - * @return Calculated set of enum ::ruby_econv_flag_type. - * @post `ecopts` holds a hash object suitable for - * ::rb_io_t::rb_io_enc_t::ecopts. - */ -int rb_econv_prepare_opts(VALUE opthash, VALUE *ecopts); - -/** - * Creates a new instance of struct ::rb_econv_t. - * - * @param[in] source_encoding Name of an encoding. - * @param[in] destination_encoding Name of another encoding. - * @param[in] ecflags A set of enum ::ruby_econv_flag_type. - * @exception rb_eArgError No such encoding. - * @retval NULL Failed to create a struct ::rb_econv_t. - * @retval otherwise Allocated struct ::rb_econv_t. - * @warning Return value must be passed to rb_econv_close() exactly once. - */ -rb_econv_t *rb_econv_open(const char *source_encoding, const char *destination_encoding, int ecflags); - -/** - * Identical to rb_econv_open(), except it additionally takes a hash of - * optional strings. - * - * - * @param[in] source_encoding Name of an encoding. - * @param[in] destination_encoding Name of another encoding. - * @param[in] ecflags A set of enum ::ruby_econv_flag_type. - * @param[in] ecopts Optional set of strings. - * @exception rb_eArgError No such encoding. - * @retval NULL Failed to create a struct ::rb_econv_t. - * @retval otherwise Allocated struct ::rb_econv_t. - * @warning Return value must be passed to rb_econv_close() exactly once. - */ -rb_econv_t *rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts); - -/** - * Converts a string from an encoding to another. - * - * Possible flags are either ::RUBY_ECONV_PARTIAL_INPUT (means the source - * buffer is a part of much larger one), ::RUBY_ECONV_AFTER_OUTPUT (instructs - * the converter to stop after output before input), or both of them. - * - * @param[in,out] ec Conversion specification/state etc. - * @param[in] source_buffer_ptr Target string. - * @param[in] source_buffer_end End of target string. - * @param[out] destination_buffer_ptr Return buffer. - * @param[out] destination_buffer_end End of return buffer. - * @param[in] flags Flags (see above). - * @return The status of the conversion. - * @post `destination_buffer_ptr` holds conversion results. - */ -rb_econv_result_t rb_econv_convert(rb_econv_t *ec, - const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, - unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, - int flags); - -/** - * Destructs a converter. Note that a converter can have a buffer, and can be - * non-empty. Calling this would lose your data then. - * - * @param[out] ec The converter to destroy. - * @post `ec` is no longer a valid pointer. - */ -void rb_econv_close(rb_econv_t *ec); - -/** - * Assigns the replacement string. The string passed here would appear in - * converted string when it cannot represent its source counterpart. This can - * happen for instance you convert an emoji to ISO-8859-1. - * - * @param[out] ec Target converter. - * @param[in] str Replacement string. - * @param[in] len Number of bytes of `str`. - * @param[in] encname Name of encoding of `str`. - * @retval 0 Success. - * @retval -1 Failure (ENOMEM etc.). - * @post `ec`'s replacement string is set to `str`. - */ -int rb_econv_set_replacement(rb_econv_t *ec, const unsigned char *str, size_t len, const char *encname); - -/** - * "Decorate"s a converter. There are special kind of converters that - * transforms the contents, like replacing CR into CRLF. You can add such - * decorators to a converter using this API. By using this function a - * decorator is prepended at the beginning of a conversion sequence: in case of - * CRLF conversion, newlines are converted before encodings are converted. - * - * @param[out] ec Target converter to decorate. - * @param[in] decorator_name Name of decorator to prepend. - * @retval 0 Success. - * @retval -1 Failure (no such decorator etc.). - * @post Decorator works before encoding conversion happens. - * - * @internal - * - * What is the possible value of the `decorator_name` is not public. You have - * to read through `transcode.c` carefully. - */ -int rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name); - -/** - * Identical to rb_econv_decorate_at_first(), except it adds to the opposite - * direction. For instance CRLF conversion would run _after_ encodings are - * converted. - * - * @param[out] ec Target converter to decorate. - * @param[in] decorator_name Name of decorator to prepend. - * @retval 0 Success. - * @retval -1 Failure (no such decorator etc.). - * @post Decorator works after encoding conversion happens. - */ -int rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name); - -/** - * Creates a `rb_eConverterNotFoundError` exception object (but does not - * raise). - * - * @param[in] senc Name of source encoding. - * @param[in] denc Name of destination encoding. - * @param[in] ecflags A set of enum ::ruby_econv_flag_type. - * @return An instance of `rb_eConverterNotFoundError`. - */ -VALUE rb_econv_open_exc(const char *senc, const char *denc, int ecflags); - -/** - * Appends the passed string to the passed converter's output buffer. This can - * be handy when an encoding needs bytes out of thin air; for instance - * ISO-2022-JP has "shift function" which does not correspond to any - * characters. - * - * @param[out] ec Target converter. - * @param[in] str String to insert. - * @param[in] len Number of bytes of `str`. - * @param[in] str_encoding Encoding of `str`. - * @retval 0 Success. - * @retval -1 Failure (conversion error etc.). - * @note `str_encoding` can be anything, and `str` itself is converted - * when necessary. - */ -int rb_econv_insert_output(rb_econv_t *ec, - const unsigned char *str, size_t len, const char *str_encoding); - -/** - * Queries an encoding name which best suits for rb_econv_insert_output()'s - * last parameter. Strings in this encoding need no conversion when inserted; - * can be both time/space efficient. - * - * @param[in] ec Target converter. - * @return Its encoding for insertion. - */ -const char *rb_econv_encoding_to_insert_output(rb_econv_t *ec); - -/** - * This is a rb_econv_make_exception() + rb_exc_raise() combo. - * - * @param[in] ec (Possibly failed) conversion. - * @exception rb_eInvalidByteSequenceError Invalid byte sequence. - * @exception rb_eUndefinedConversionError Conversion undefined. - * @note This function can return when no error. - */ -void rb_econv_check_error(rb_econv_t *ec); - -/** - * This function makes sense right after rb_econv_convert() returns. As listed - * in ::rb_econv_result_t, rb_econv_convert() can bail out for various reasons. - * This function checks the passed converter's internal state and convert it to - * an appropriate exception object. - * - * @param[in] ec Target converter. - * @retval RUBY_Qnil The converter has no error. - * @retval otherwise Conversion error turned into an exception. - */ -VALUE rb_econv_make_exception(rb_econv_t *ec); - -/** - * Queries if rb_econv_putback() makes sense, i.e. there are invalid byte - * sequences remain in the buffer. - * - * @param[in] ec Target converter. - * @return Number of bytes that can be pushed back. - */ -int rb_econv_putbackable(rb_econv_t *ec); - -/** - * Puts back the bytes. In case of ::econv_invalid_byte_sequence, some of - * those invalid bytes are discarded and the others are buffered to be - * converted later. The latter bytes can be put back using this API. - * - * @param[out] ec Target converter (invalid byte sequence). - * @param[out] p Return buffer. - * @param[in] n Max number of bytes to put back. - * @post At most `n` bytes of what was put back is written to `p`. - */ -void rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n); - -/** - * Queries the passed encoding's corresponding ASCII compatible encoding. "The - * corresponding ASCII compatible encoding" in this context is an ASCII - * compatible encoding which can represent exactly the same character sets as - * the given ASCII incompatible encoding. For instance that of UTF-16LE is - * UTF-8. - * - * @param[in] encname Name of an ASCII incompatible encoding. - * @retval NULL `encname` is already ASCII compatible. - * @retval otherwise The corresponding ASCII compatible encoding. - */ -const char *rb_econv_asciicompat_encoding(const char *encname); - -/** - * Identical to rb_econv_convert(), except it takes Ruby's string instead of - * C's pointer. - * - * @param[in,out] ec Target converter. - * @param[in] src Source string. - * @param[in] flags Flags (see rb_econv_convert). - * @exception rb_eArgError Converted string is too long. - * @exception rb_eInvalidByteSequenceError Invalid byte sequence. - * @exception rb_eUndefinedConversionError Conversion undefined. - * @return The conversion result. - */ -VALUE rb_econv_str_convert(rb_econv_t *ec, VALUE src, int flags); - -/** - * Identical to rb_econv_str_convert(), except it converts only a part of the - * passed string. Can be handy when you for instance want to do line-buffered - * conversion. - * - * @param[in,out] ec Target converter. - * @param[in] src Source string. - * @param[in] byteoff Number of bytes to seek. - * @param[in] bytesize Number of bytes to read. - * @param[in] flags Flags (see rb_econv_convert). - * @exception rb_eArgError Converted string is too long. - * @exception rb_eInvalidByteSequenceError Invalid byte sequence. - * @exception rb_eUndefinedConversionError Conversion undefined. - * @return The conversion result. - */ -VALUE rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags); - -/** - * Identical to rb_econv_str_convert(), except it appends the conversion result - * to the additionally passed string instead of creating a new string. It can - * also be seen as a routine identical to rb_econv_append(), except it takes a - * Ruby's string instead of C's pointer. - * - * @param[in,out] ec Target converter. - * @param[in] src Source string. - * @param[in] dst Return buffer. - * @param[in] flags Flags (see rb_econv_convert). - * @exception rb_eArgError Converted string is too long. - * @exception rb_eInvalidByteSequenceError Invalid byte sequence. - * @exception rb_eUndefinedConversionError Conversion undefined. - * @return The conversion result. - */ -VALUE rb_econv_str_append(rb_econv_t *ec, VALUE src, VALUE dst, int flags); - -/** - * Identical to rb_econv_str_append(), except it appends only a part of the - * passed string with conversion. It can also be seen as a routine identical - * to rb_econv_substr_convert(), except it appends the conversion result to the - * additionally passed string instead of creating a new string. - * - * @param[in,out] ec Target converter. - * @param[in] src Source string. - * @param[in] byteoff Number of bytes to seek. - * @param[in] bytesize Number of bytes to read. - * @param[in] dst Return buffer. - * @param[in] flags Flags (see rb_econv_convert). - * @exception rb_eArgError Converted string is too long. - * @exception rb_eInvalidByteSequenceError Invalid byte sequence. - * @exception rb_eUndefinedConversionError Conversion undefined. - * @return The conversion result. - */ -VALUE rb_econv_substr_append(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, VALUE dst, int flags); - -/** - * Converts the passed C's pointer according to the passed converter, then - * append the conversion result to the passed Ruby's string. This way buffer - * overflow is properly avoided to resize the destination properly. - * - * @param[in,out] ec Target converter. - * @param[in] bytesrc Target string. - * @param[in] bytesize Number of bytes of `bytesrc`. - * @param[in] dst Return buffer. - * @param[in] flags Flags (see rb_econv_convert). - * @exception rb_eArgError Converted string is too long. - * @exception rb_eInvalidByteSequenceError Invalid byte sequence. - * @exception rb_eUndefinedConversionError Conversion undefined. - * @return The conversion result. - */ -VALUE rb_econv_append(rb_econv_t *ec, const char *bytesrc, long bytesize, VALUE dst, int flags); - -/** - * This badly named function does not set the destination encoding to binary, - * but instead just nullifies newline conversion decorators if any. Other - * ordinal character conversions still happen after this; something non-binary - * would still be generated. - * - * @param[out] ec Target converter to modify. - * @post Any newline conversions, if any, would be killed. - */ -void rb_econv_binmode(rb_econv_t *ec); - -/** - * This enum is kind of omnibus. Gathers various constants. - */ -enum ruby_econv_flag_type { - - /** - * @name Flags for rb_econv_open() - * - * @{ - */ - - /** Mask for error handling related bits. */ - RUBY_ECONV_ERROR_HANDLER_MASK = 0x000000ff, - - /** Special handling of invalid sequences are there. */ - RUBY_ECONV_INVALID_MASK = 0x0000000f, - - /** Invalid sequences shall be replaced. */ - RUBY_ECONV_INVALID_REPLACE = 0x00000002, - - /** Special handling of undefined conversion are there. */ - RUBY_ECONV_UNDEF_MASK = 0x000000f0, - - /** Undefined characters shall be replaced. */ - RUBY_ECONV_UNDEF_REPLACE = 0x00000020, - - /** Undefined characters shall be escaped. */ - RUBY_ECONV_UNDEF_HEX_CHARREF = 0x00000030, - - /** Decorators are there. */ - RUBY_ECONV_DECORATOR_MASK = 0x0000ff00, - - /** Newline converters are there. */ - RUBY_ECONV_NEWLINE_DECORATOR_MASK = 0x00003f00, - - /** (Unclear; seems unused). */ - RUBY_ECONV_NEWLINE_DECORATOR_READ_MASK = 0x00000f00, - - /** (Unclear; seems unused). */ - RUBY_ECONV_NEWLINE_DECORATOR_WRITE_MASK = 0x00003000, - - /** Universal newline mode. */ - RUBY_ECONV_UNIVERSAL_NEWLINE_DECORATOR = 0x00000100, - - /** CR to CRLF conversion shall happen. */ - RUBY_ECONV_CRLF_NEWLINE_DECORATOR = 0x00001000, - - /** CRLF to CR conversion shall happen. */ - RUBY_ECONV_CR_NEWLINE_DECORATOR = 0x00002000, - - /** Texts shall be XML-escaped. */ - RUBY_ECONV_XML_TEXT_DECORATOR = 0x00004000, - - /** Texts shall be AttrValue escaped */ - RUBY_ECONV_XML_ATTR_CONTENT_DECORATOR = 0x00008000, - - /** (Unclear; seems unused). */ - RUBY_ECONV_STATEFUL_DECORATOR_MASK = 0x00f00000, - - /** Texts shall be AttrValue escaped. */ - RUBY_ECONV_XML_ATTR_QUOTE_DECORATOR = 0x00100000, - - /** Newline decorator's default. */ - RUBY_ECONV_DEFAULT_NEWLINE_DECORATOR = -#if defined(RUBY_TEST_CRLF_ENVIRONMENT) || defined(_WIN32) - RUBY_ECONV_CRLF_NEWLINE_DECORATOR, -#else - 0, -#endif - -#define ECONV_ERROR_HANDLER_MASK RUBY_ECONV_ERROR_HANDLER_MASK /**< @old{RUBY_ECONV_ERROR_HANDLER_MASK} */ -#define ECONV_INVALID_MASK RUBY_ECONV_INVALID_MASK /**< @old{RUBY_ECONV_INVALID_MASK} */ -#define ECONV_INVALID_REPLACE RUBY_ECONV_INVALID_REPLACE /**< @old{RUBY_ECONV_INVALID_REPLACE} */ -#define ECONV_UNDEF_MASK RUBY_ECONV_UNDEF_MASK /**< @old{RUBY_ECONV_UNDEF_MASK} */ -#define ECONV_UNDEF_REPLACE RUBY_ECONV_UNDEF_REPLACE /**< @old{RUBY_ECONV_UNDEF_REPLACE} */ -#define ECONV_UNDEF_HEX_CHARREF RUBY_ECONV_UNDEF_HEX_CHARREF /**< @old{RUBY_ECONV_UNDEF_HEX_CHARREF} */ -#define ECONV_DECORATOR_MASK RUBY_ECONV_DECORATOR_MASK /**< @old{RUBY_ECONV_DECORATOR_MASK} */ -#define ECONV_NEWLINE_DECORATOR_MASK RUBY_ECONV_NEWLINE_DECORATOR_MASK /**< @old{RUBY_ECONV_NEWLINE_DECORATOR_MASK} */ -#define ECONV_NEWLINE_DECORATOR_READ_MASK RUBY_ECONV_NEWLINE_DECORATOR_READ_MASK /**< @old{RUBY_ECONV_NEWLINE_DECORATOR_READ_MASK} */ -#define ECONV_NEWLINE_DECORATOR_WRITE_MASK RUBY_ECONV_NEWLINE_DECORATOR_WRITE_MASK /**< @old{RUBY_ECONV_NEWLINE_DECORATOR_WRITE_MASK} */ -#define ECONV_UNIVERSAL_NEWLINE_DECORATOR RUBY_ECONV_UNIVERSAL_NEWLINE_DECORATOR /**< @old{RUBY_ECONV_UNIVERSAL_NEWLINE_DECORATOR} */ -#define ECONV_CRLF_NEWLINE_DECORATOR RUBY_ECONV_CRLF_NEWLINE_DECORATOR /**< @old{RUBY_ECONV_CRLF_NEWLINE_DECORATOR} */ -#define ECONV_CR_NEWLINE_DECORATOR RUBY_ECONV_CR_NEWLINE_DECORATOR /**< @old{RUBY_ECONV_CR_NEWLINE_DECORATOR} */ -#define ECONV_XML_TEXT_DECORATOR RUBY_ECONV_XML_TEXT_DECORATOR /**< @old{RUBY_ECONV_XML_TEXT_DECORATOR} */ -#define ECONV_XML_ATTR_CONTENT_DECORATOR RUBY_ECONV_XML_ATTR_CONTENT_DECORATOR /**< @old{RUBY_ECONV_XML_ATTR_CONTENT_DECORATOR} */ -#define ECONV_STATEFUL_DECORATOR_MASK RUBY_ECONV_STATEFUL_DECORATOR_MASK /**< @old{RUBY_ECONV_STATEFUL_DECORATOR_MASK} */ -#define ECONV_XML_ATTR_QUOTE_DECORATOR RUBY_ECONV_XML_ATTR_QUOTE_DECORATOR /**< @old{RUBY_ECONV_XML_ATTR_QUOTE_DECORATOR} */ -#define ECONV_DEFAULT_NEWLINE_DECORATOR RUBY_ECONV_DEFAULT_NEWLINE_DECORATOR /**< @old{RUBY_ECONV_DEFAULT_NEWLINE_DECORATOR} */ - /** @} */ - - /** - * @name Flags for rb_econv_convert() - * - * @{ - */ - - /** Indicates the input is a part of much larger one. */ - RUBY_ECONV_PARTIAL_INPUT = 0x00010000, - - /** Instructs the converter to stop after output. */ - RUBY_ECONV_AFTER_OUTPUT = 0x00020000, -#define ECONV_PARTIAL_INPUT RUBY_ECONV_PARTIAL_INPUT /**< @old{RUBY_ECONV_PARTIAL_INPUT} */ -#define ECONV_AFTER_OUTPUT RUBY_ECONV_AFTER_OUTPUT /**< @old{RUBY_ECONV_AFTER_OUTPUT} */ - - RUBY_ECONV_FLAGS_PLACEHOLDER /**< Placeholder (not used) */ -}; - -RBIMPL_SYMBOL_EXPORT_END() +#include "ruby/internal/encoding/coderange.h" +#include "ruby/internal/encoding/ctype.h" +#include "ruby/internal/encoding/encoding.h" +#include "ruby/internal/encoding/pathname.h" +#include "ruby/internal/encoding/re.h" +#include "ruby/internal/encoding/sprintf.h" +#include "ruby/internal/encoding/string.h" +#include "ruby/internal/encoding/symbol.h" +#include "ruby/internal/encoding/transcode.h" #endif /* RUBY_ENCODING_H */ diff --git a/include/ruby/internal/encoding/coderange.h b/include/ruby/internal/encoding/coderange.h new file mode 100644 index 0000000000..84daddeeb3 --- /dev/null +++ b/include/ruby/internal/encoding/coderange.h @@ -0,0 +1,172 @@ +#ifndef RUBY_INTERNAL_ENCODING_CODERANGE_H /*-*-C++-*-vi:se ft=cpp:*/ +#define RUBY_INTERNAL_ENCODING_CODERANGE_H +/** + * @file + * @author Ruby developers + * @copyright This file is a part of the programming language Ruby. + * Permission is hereby granted, to either redistribute and/or + * modify this file, provided that the conditions mentioned in the + * file COPYING are met. Consult the file for details. + * @warning Symbols prefixed with either `RBIMPL` or `rbimpl` are + * implementation details. Don't take them as canon. They could + * rapidly appear then vanish. The name (path) of this header file + * is also an implementation detail. Do not expect it to persist + * at the place it is now. Developers are free to move it anywhere + * anytime at will. + * @note To ruby-core: remember that this header can be possibly + * recursively included from extension libraries written in C++. + * Do not expect for instance `__VA_ARGS__` is always available. + * We assume C99 for ruby itself but we don't assume languages of + * extension libraries. They could be written in C++98. + * @brief Routines for code ranges. + */ + +#include "ruby/internal/attr/const.h" +#include "ruby/internal/dllexport.h" +#include "ruby/internal/value.h" + +RBIMPL_SYMBOL_EXPORT_BEGIN() + +/** What rb_enc_str_coderange() returns. */ +enum ruby_coderange_type { + + /** The object's coderange is unclear yet. */ + RUBY_ENC_CODERANGE_UNKNOWN = 0, + + /** The object holds 0 to 127 inclusive and nothing else. */ + RUBY_ENC_CODERANGE_7BIT = ((int)RUBY_FL_USER8), + + /** The object's encoding and contents are consistent each other */ + RUBY_ENC_CODERANGE_VALID = ((int)RUBY_FL_USER9), + + /** The object holds invalid/malformed/broken character(s). */ + RUBY_ENC_CODERANGE_BROKEN = ((int)(RUBY_FL_USER8|RUBY_FL_USER9)), + + /** Where the coderange resides. */ + RUBY_ENC_CODERANGE_MASK = (RUBY_ENC_CODERANGE_7BIT| + RUBY_ENC_CODERANGE_VALID| + RUBY_ENC_CODERANGE_BROKEN) +}; + +RBIMPL_ATTR_CONST() +/** + * @private + * + * This is an implementation detail of #RB_ENC_CODERANGE_CLEAN_P. People don't + * use it directly. + * + * @param[in] cr An enum ::ruby_coderange_type. + * @retval 1 It is. + * @retval 0 It isn't. + */ +static inline int +rb_enc_coderange_clean_p(int cr) +{ + return (cr ^ (cr >> 1)) & RUBY_ENC_CODERANGE_7BIT; +} + +/** + * Queries if a code range is "clean". "Clean" in this context means it is + * known and valid. + * + * @param[in] cr An enum ::ruby_coderange_type. + * @retval 1 It is. + * @retval 0 It isn't. + */ +#define RB_ENC_CODERANGE_CLEAN_P(cr) rb_enc_coderange_clean_p(cr) + +/** + * Queries the (inline) code range of the passed object. The object must be + * capable of having inline encoding. Using this macro needs deep + * understanding of bit level object binary layout. + * + * @param[in] obj Target object. + * @return An enum ::ruby_coderange_type. + */ +#define RB_ENC_CODERANGE(obj) ((int)RBASIC(obj)->flags & RUBY_ENC_CODERANGE_MASK) + +/** + * Queries the (inline) code range of the passed object is + * ::RUBY_ENC_CODERANGE_7BIT. The object must be capable of having inline + * encoding. Using this macro needs deep understanding of bit level object + * binary layout. + * + * @param[in] obj Target object. + * @retval 1 It is ascii only. + * @retval 0 Otherwise (including cases when the range is not known). + */ +#define RB_ENC_CODERANGE_ASCIIONLY(obj) (RB_ENC_CODERANGE(obj) == RUBY_ENC_CODERANGE_7BIT) + +/** + * Destructively modifies the passed object so that its (inline) code range is + * the passed one. The object must be capable of having inline encoding. + * Using this macro needs deep understanding of bit level object binary layout. + * + * @param[out] obj Target object. + * @param[out] cr An enum ::ruby_coderange_type. + * @post `obj`'s code range is `cr`. + */ +#define RB_ENC_CODERANGE_SET(obj,cr) (\ + RBASIC(obj)->flags = \ + (RBASIC(obj)->flags & ~RUBY_ENC_CODERANGE_MASK) | (cr)) + +/** + * Destructively clears the passed object's (inline) code range. The object + * must be capable of having inline encoding. Using this macro needs deep + * understanding of bit level object binary layout. + * + * @param[out] obj Target object. + * @post `obj`'s code range is ::RUBY_ENC_CODERANGE_UNKNOWN. + */ +#define RB_ENC_CODERANGE_CLEAR(obj) RB_ENC_CODERANGE_SET((obj),0) + +/* assumed ASCII compatibility */ +/** + * "Mix" two code ranges into one. This is handy for instance when you + * concatenate two strings into one. Consider one of then is valid but the + * other isn't. The result must be invalid. This macro computes that kind of + * mixture. + * + * @param[in] a An enum ::ruby_coderange_type. + * @param[in] b Another enum ::ruby_coderange_type. + * @return The `a` "and" `b`. + */ +#define RB_ENC_CODERANGE_AND(a, b) \ + ((a) == RUBY_ENC_CODERANGE_7BIT ? (b) : \ + (a) != RUBY_ENC_CODERANGE_VALID ? RUBY_ENC_CODERANGE_UNKNOWN : \ + (b) == RUBY_ENC_CODERANGE_7BIT ? RUBY_ENC_CODERANGE_VALID : (b)) + +/** + * This is #RB_ENCODING_SET + RB_ENC_CODERANGE_SET combo. The object must be + * capable of having inline encoding. Using this macro needs deep + * understanding of bit level object binary layout. + * + * @param[out] obj Target object. + * @param[in] encindex Encoding in encindex format. + * @param[in] cr An enum ::ruby_coderange_type. + * @post `obj`'s encoding is `encindex`. + * @post `obj`'s code range is `cr`. + */ +#define RB_ENCODING_CODERANGE_SET(obj, encindex, cr) \ + do { \ + VALUE rb_encoding_coderange_obj = (obj); \ + RB_ENCODING_SET(rb_encoding_coderange_obj, (encindex)); \ + RB_ENC_CODERANGE_SET(rb_encoding_coderange_obj, (cr)); \ + } while (0) + +#define ENC_CODERANGE_MASK RUBY_ENC_CODERANGE_MASK /**< @old{RUBY_ENC_CODERANGE_MASK} */ +#define ENC_CODERANGE_UNKNOWN RUBY_ENC_CODERANGE_UNKNOWN /**< @old{RUBY_ENC_CODERANGE_UNKNOWN} */ +#define ENC_CODERANGE_7BIT RUBY_ENC_CODERANGE_7BIT /**< @old{RUBY_ENC_CODERANGE_7BIT} */ +#define ENC_CODERANGE_VALID RUBY_ENC_CODERANGE_VALID /**< @old{RUBY_ENC_CODERANGE_VALID} */ +#define ENC_CODERANGE_BROKEN RUBY_ENC_CODERANGE_BROKEN /**< @old{RUBY_ENC_CODERANGE_BROKEN} */ +#define ENC_CODERANGE_CLEAN_P(cr) RB_ENC_CODERANGE_CLEAN_P(cr) /**< @old{RB_ENC_CODERANGE_CLEAN_P} */ +#define ENC_CODERANGE(obj) RB_ENC_CODERANGE(obj) /**< @old{RB_ENC_CODERANGE} */ +#define ENC_CODERANGE_ASCIIONLY(obj) RB_ENC_CODERANGE_ASCIIONLY(obj) /**< @old{RB_ENC_CODERANGE_ASCIIONLY} */ +#define ENC_CODERANGE_SET(obj,cr) RB_ENC_CODERANGE_SET(obj,cr) /**< @old{RB_ENC_CODERANGE_SET} */ +#define ENC_CODERANGE_CLEAR(obj) RB_ENC_CODERANGE_CLEAR(obj) /**< @old{RB_ENC_CODERANGE_CLEAR} */ +#define ENC_CODERANGE_AND(a, b) RB_ENC_CODERANGE_AND(a, b) /**< @old{RB_ENC_CODERANGE_AND} */ +#define ENCODING_CODERANGE_SET(obj, encindex, cr) RB_ENCODING_CODERANGE_SET(obj, encindex, cr) /**< @old{RB_ENCODING_CODERANGE_SET} */ + +RBIMPL_SYMBOL_EXPORT_END() + +#endif /* RUBY_INTERNAL_ENCODING_CODERANGE_H */ diff --git a/include/ruby/internal/encoding/ctype.h b/include/ruby/internal/encoding/ctype.h new file mode 100644 index 0000000000..e0b95f93b2 --- /dev/null +++ b/include/ruby/internal/encoding/ctype.h @@ -0,0 +1,182 @@ +#ifndef RUBY_INTERNAL_ENCODING_CTYPE_H /*-*-C++-*-vi:se ft=cpp:*/ +#define RUBY_INTERNAL_ENCODING_CTYPE_H +/** + * @file + * @author Ruby developers + * @copyright This file is a part of the programming language Ruby. + * Permission is hereby granted, to either redistribute and/or + * modify this file, provided that the conditions mentioned in the + * file COPYING are met. Consult the file for details. + * @warning Symbols prefixed with either `RBIMPL` or `rbimpl` are + * implementation details. Don't take them as canon. They could + * rapidly appear then vanish. The name (path) of this header file + * is also an implementation detail. Do not expect it to persist + * at the place it is now. Developers are free to move it anywhere + * anytime at will. + * @note To ruby-core: remember that this header can be possibly + * recursively included from extension libraries written in C++. + * Do not expect for instance `__VA_ARGS__` is always available. + * We assume C99 for ruby itself but we don't assume languages of + * extension libraries. They could be written in C++98. + * @brief Routines to query chacater types. + */ + +#include "ruby/onigmo.h" +#include "ruby/internal/attr/const.h" +#include "ruby/internal/dllexport.h" +#include "ruby/internal/encoding/encoding.h" +#include "ruby/internal/value.h" + +RBIMPL_SYMBOL_EXPORT_BEGIN() + +/** + * Queries if the passed pointer points to a newline character. What is a + * newline and what is not depends on the passed encoding. + * + * @param[in] p Pointer to a possibly-middle of a character. + * @param[in] end End of the string. + * @param[in] enc Encoding. + * @retval 0 It isn't. + * @retval otherwise It is. + */ +#define rb_enc_is_newline(p,end,enc) ONIGENC_IS_MBC_NEWLINE((enc),(UChar*)(p),(UChar*)(end)) + +/** + * Queries if the passed code point is of passed character type in the passed + * encoding. The "character type" here is a set of macros defined in onigmo.h, + * like `ONIGENC_CTYPE_PUNCT`. + * + * @param[in] c An `OnigCodePoint` value. + * @param[in] t An `OnigCtype` value. + * @param[in] enc A `rb_encoding*` value. + * @retval 1 `c` is of `t` in `enc`. + * @retval 0 Otherwise. + */ +#define rb_enc_isctype(c,t,enc) ONIGENC_IS_CODE_CTYPE((enc),(c),(t)) + +/** + * Identical to rb_isascii(), except it additionally takes an encoding. + * + * @param[in] c A code point. + * @param[in] enc An encoding. + * @retval 0 `c` is out of range of ASCII character set in `enc`. + * @retval 1 Otherwise. + * + * @internal + * + * `enc` is ignored. This is at least an intentional implementation detail + * (not a bug). But there could be rooms for future extensions. + */ +#define rb_enc_isascii(c,enc) ONIGENC_IS_CODE_ASCII(c) + +/** + * Identical to rb_isalpha(), except it additionally takes an encoding. + * + * @param[in] c A code point. + * @param[in] enc An encoding. + * @retval 1 `enc` classifies `c` as "ALPHA". + * @retval 0 Otherwise. + */ +#define rb_enc_isalpha(c,enc) ONIGENC_IS_CODE_ALPHA((enc),(c)) + +/** + * Identical to rb_islower(), except it additionally takes an encoding. + * + * @param[in] c A code point. + * @param[in] enc An encoding. + * @retval 1 `enc` classifies `c` as "LOWER". + * @retval 0 Otherwise. + */ +#define rb_enc_islower(c,enc) ONIGENC_IS_CODE_LOWER((enc),(c)) + +/** + * Identical to rb_isupper(), except it additionally takes an encoding. + * + * @param[in] c A code point. + * @param[in] enc An encoding. + * @retval 1 `enc` classifies `c` as "UPPER". + * @retval 0 Otherwise. + */ +#define rb_enc_isupper(c,enc) ONIGENC_IS_CODE_UPPER((enc),(c)) + +/** + * Identical to rb_ispunct(), except it additionally takes an encoding. + * + * @param[in] c A code point. + * @param[in] enc An encoding. + * @retval 1 `enc` classifies `c` as "PUNCT". + * @retval 0 Otherwise. + */ +#define rb_enc_ispunct(c,enc) ONIGENC_IS_CODE_PUNCT((enc),(c)) + +/** + * Identical to rb_isalnum(), except it additionally takes an encoding. + * + * @param[in] c A code point. + * @param[in] enc An encoding. + * @retval 1 `enc` classifies `c` as "ANUM". + * @retval 0 Otherwise. + */ +#define rb_enc_isalnum(c,enc) ONIGENC_IS_CODE_ALNUM((enc),(c)) + +/** + * Identical to rb_isprint(), except it additionally takes an encoding. + * + * @param[in] c A code point. + * @param[in] enc An encoding. + * @retval 1 `enc` classifies `c` as "PRINT". + * @retval 0 Otherwise. + */ +#define rb_enc_isprint(c,enc) ONIGENC_IS_CODE_PRINT((enc),(c)) + +/** + * Identical to rb_isspace(), except it additionally takes an encoding. + * + * @param[in] c A code point. + * @param[in] enc An encoding. + * @retval 1 `enc` classifies `c` as "PRINT". + * @retval 0 Otherwise. + */ +#define rb_enc_isspace(c,enc) ONIGENC_IS_CODE_SPACE((enc),(c)) + +/** + * Identical to rb_isdigit(), except it additionally takes an encoding. + * + * @param[in] c A code point. + * @param[in] enc An encoding. + * @retval 1 `enc` classifies `c` as "DIGIT". + * @retval 0 Otherwise. + */ +#define rb_enc_isdigit(c,enc) ONIGENC_IS_CODE_DIGIT((enc),(c)) + +RBIMPL_ATTR_CONST() +/** + * Identical to rb_toupper(), except it additionally takes an encoding. + * + * @param[in] c A code point. + * @param[in] enc An encoding. + * @return `c`'s (Ruby's definition of) upper case counterpart. + * + * @internal + * + * As `RBIMPL_ATTR_CONST` implies this function ignores `enc`. + */ +int rb_enc_toupper(int c, rb_encoding *enc); + +RBIMPL_ATTR_CONST() +/** + * Identical to rb_tolower(), except it additionally takes an encoding. + * + * @param[in] c A code point. + * @param[in] enc An encoding. + * @return `c`'s (Ruby's definition of) lower case counterpart. + * + * @internal + * + * As `RBIMPL_ATTR_CONST` implies this function ignores `enc`. + */ +int rb_enc_tolower(int c, rb_encoding *enc); + +RBIMPL_SYMBOL_EXPORT_END() + +#endif /* RUBY_INTERNAL_ENCODING_CTYPE_H */ diff --git a/include/ruby/internal/encoding/encoding.h b/include/ruby/internal/encoding/encoding.h new file mode 100644 index 0000000000..0e6463ad78 --- /dev/null +++ b/include/ruby/internal/encoding/encoding.h @@ -0,0 +1,897 @@ +#ifndef RUBY_INTERNAL_ENCODING_ENCODING_H /*-*-C++-*-vi:se ft=cpp:*/ +#define RUBY_INTERNAL_ENCODING_ENCODING_H +/** + * @file + * @author Ruby developers + * @copyright This file is a part of the programming language Ruby. + * Permission is hereby granted, to either redistribute and/or + * modify this file, provided that the conditions mentioned in the + * file COPYING are met. Consult the file for details. + * @warning Symbols prefixed with either `RBIMPL` or `rbimpl` are + * implementation details. Don't take them as canon. They could + * rapidly appear then vanish. The name (path) of this header file + * is also an implementation detail. Do not expect it to persist + * at the place it is now. Developers are free to move it anywhere + * anytime at will. + * @note To ruby-core: remember that this header can be possibly + * recursively included from extension libraries written in C++. + * Do not expect for instance `__VA_ARGS__` is always available. + * We assume C99 for ruby itself but we don't assume languages of + * extension libraries. They could be written in C++98. + * @brief Defines ::rb_encoding + */ + +#include "ruby/oniguruma.h" +#include "ruby/internal/attr/const.h" +#include "ruby/internal/attr/deprecated.h" +#include "ruby/internal/attr/noalias.h" +#include "ruby/internal/attr/pure.h" +#include "ruby/internal/attr/returns_nonnull.h" +#include "ruby/internal/dllexport.h" +#include "ruby/internal/value.h" +#include "ruby/internal/core/rbasic.h" +#include "ruby/internal/fl_type.h" + +RBIMPL_SYMBOL_EXPORT_BEGIN() + +/** + * `Encoding` class. + * + * @ingroup object + */ +RUBY_EXTERN VALUE rb_cEncoding; + +/** + * @private + * + * Bit constants used when embedding encodings into ::RBasic::flags. Extension + * libraries must not bother such things. + */ +enum ruby_encoding_consts { + + /** Max possible number of embeddable encodings. */ + RUBY_ENCODING_INLINE_MAX = 127, + + /** Where inline encodings reside. */ + RUBY_ENCODING_SHIFT = (RUBY_FL_USHIFT+10), + + /** Bits we use to store inline encodings. */ + RUBY_ENCODING_MASK = (RUBY_ENCODING_INLINE_MAX<flags &= ~RUBY_ENCODING_MASK;\ + RBASIC(obj)->flags |= (VALUE)(i) << RUBY_ENCODING_SHIFT;\ +} while (0) + +/** @alias{rb_enc_set_index} */ +#define RB_ENCODING_SET(obj,i) rb_enc_set_index((obj), (i)) + +/** + * Queries the encoding of the passed object. The encoding must be smaller + * than ::RUBY_ENCODING_INLINE_MAX, which means you have some assumption on the + * return value. This means the API is for internal use only. + * + * @param[in] obj Target object. + * @return `obj`'s encoding index. + */ +#define RB_ENCODING_GET_INLINED(obj) \ + (int)((RBASIC(obj)->flags & RUBY_ENCODING_MASK)>>RUBY_ENCODING_SHIFT) + +/** + * @alias{rb_enc_get_index} + * + * @internal + * + * Implementation wise this is not a verbatim alias of rb_enc_get_index(). But + * the API is consistent. Don't bother. + */ +#define RB_ENCODING_GET(obj) \ + (RB_ENCODING_GET_INLINED(obj) != RUBY_ENCODING_INLINE_MAX ? \ + RB_ENCODING_GET_INLINED(obj) : \ + rb_enc_get_index(obj)) + +/** + * Queries if the passed object is in ascii 8bit (== binary) encoding. The + * object must be capable of having inline encoding. Using this macro needs + * deep understanding of bit level object binary layout. + * + * @param[in] obj An object to check. + * @retval 1 It is. + * @retval 0 It isn't. + */ +#define RB_ENCODING_IS_ASCII8BIT(obj) (RB_ENCODING_GET_INLINED(obj) == 0) + +#define ENCODING_SET_INLINED(obj,i) RB_ENCODING_SET_INLINED(obj,i) /**< @old{RB_ENCODING_SET_INLINED} */ +#define ENCODING_SET(obj,i) RB_ENCODING_SET(obj,i) /**< @old{RB_ENCODING_SET} */ +#define ENCODING_GET_INLINED(obj) RB_ENCODING_GET_INLINED(obj) /**< @old{RB_ENCODING_GET_INLINED} */ +#define ENCODING_GET(obj) RB_ENCODING_GET(obj) /**< @old{RB_ENCODING_GET} */ +#define ENCODING_IS_ASCII8BIT(obj) RB_ENCODING_IS_ASCII8BIT(obj) /**< @old{RB_ENCODING_IS_ASCII8BIT} */ +#define ENCODING_MAXNAMELEN RUBY_ENCODING_MAXNAMELEN /**< @old{RUBY_ENCODING_MAXNAMELEN} */ + + +/** + * The type of encoding. Our design here is we take Oniguruma/Onigmo's + * multilingualisation schema as our base data structure. + */ +typedef const OnigEncodingType rb_encoding; + +RBIMPL_ATTR_NOALIAS() +/** + * Converts a character option to its encoding. It only supports a very + * limited set of Japanese encodings due to its Japanese origin. Ruby still + * has this in-core for backwards compatibility. But new codes must not bother + * such concept like one-character encoding option. Consider deprecated in + * practice. + * + * @param[in] c One of `['n', 'e', 's', 'u', 'i', 'x', 'm']`. + * @param[out] option Return buffer. + * @param[out] kcode Return buffer. + * @retval 1 `c` understood properly. + * @retval 0 `c` is not understood. + * @post `option` is a ::OnigOptionType. + * @post `kcode` is an enum `ruby_preserved_encindex`. + * + * @internal + * + * `kcode` is opaque because `ruby_preserved_encindex` is not visible from + * extension libraries. But who cares? + */ +int rb_char_to_option_kcode(int c, int *option, int *kcode); + +/** + * Creates a new encoding, using the passed one as a template. + * + * @param[in] name Name of the creating encoding. + * @param[in] src Template. + * @exception rb_eArgError Duplicated or malformed `name`. + * @return Replicated new encoding's index. + * @post Encoding named `name` is created as a copy of `src`, whose index + * is the return value. + * + * @internal + * + * `name` can be `NULL`, but that just raises an exception. OTOH it seems no + * sanity check is done against `src`...? + */ +int rb_enc_replicate(const char *name, rb_encoding *src); + +/** + * Creates a new "dummy" encoding. Roughly speaking, an encoding is dummy when + * it is stateful. Notable example of dummy encoding are those defined in + * ISO/IEC 2022 + * + * @param[in] name Name of the creating encoding. + * @exception rb_eArgError Duplicated or malformed `name`. + * @return New dummy encoding's index. + * @post Encoding named `name` is created, whose index is the return + * value. + */ +int rb_define_dummy_encoding(const char *name); + +RBIMPL_ATTR_PURE() +/** + * Queries if the passed encoding is dummy. + * + * @param[in] enc Encoding in question. + * @retval 1 It is. + * @retval 0 It isn't. + */ +int rb_enc_dummy_p(rb_encoding *enc); + +RBIMPL_ATTR_PURE() +/** + * Queries the index of the encoding. An encoding's index is a Ruby-local + * concept. It is a (sequential) number assigned to each encoding. + * + * @param[in] enc Encoding in question. + * @return Its index. + * @note You can pass null pointers to this function. It is equivalent + * to rb_usascii_encindex() then. + */ +int rb_enc_to_index(rb_encoding *enc); + +/** + * Queries the index of the encoding of the passed object, if any. + * + * @param[in] obj Object in question. + * @retval -1 `obj` is incapable of having an encoding. + * @retval otherwise `obj`'s encoding's index. + */ +int rb_enc_get_index(VALUE obj); + +/** + * Destructively assigns an encoding (via its index) to an object. + * + * @param[out] obj Object in question. + * @param[in] encindex An encoding index. + * @exception rb_eFrozenError `obj` is frozen. + * @exception rb_eArgError `obj` is incapable of having an encoding. + * @exception rb_eEncodingError `encindex` is out of bounds. + * @exception rb_eLoadError Failed to load the encoding. + */ +void rb_enc_set_index(VALUE obj, int encindex); + +RBIMPL_ATTR_PURE() +/** + * Queries if the passed object can have its encoding. + * + * @param[in] obj Object in question. + * @retval 1 It can. + * @retval 0 It cannot. + */ +int rb_enc_capable(VALUE obj); + +/** + * Queries the index of the encoding. + * + * @param[in] name Name of the encoding to find. + * @exception rb_eArgError No such encoding named `name`. + * @retval -1 `name` exists, but unable to load. + * @retval otherwise Index of encoding named `name`. + */ +int rb_enc_find_index(const char *name); + +/** + * Registers an "alias" name. In the wild, an encoding can be called using + * multiple names. For instance an encoding known as `"CP932"` is also called + * `"SJIS"` on occasions. This API registers such relationships. + * + * @param[in] alias New name. + * @param[in] orig Old name. + * @exception rb_eArgError `alias` is duplicated or malformed. + * @retval -1 Failed to load `orig`. + * @retval otherwise The index of `orig` and `alias`. + * @post `alias` is a synonym of `orig`. They refer to the identical + * encoding. + */ +int rb_enc_alias(const char *alias, const char *orig); + +/** + * Obtains a encoding index from a wider range of objects (than + * rb_enc_find_index()). + * + * @param[in] obj An ::rb_cEncoding, or its name in ::rb_cString. + * @retval -1 `obj` is unexpected type/contents. + * @retval otherwise Index corresponding to `obj`. + */ +int rb_to_encoding_index(VALUE obj); + +/** + * Identical to rb_find_encoding(), except it raises an exception instead of + * returning NULL. + * + * @param[in] obj An ::rb_cEncoding, or its name in ::rb_cString. + * @exception rb_eTypeError `obj` is neither ::rb_cEncoding nor ::rb_cString. + * @exception rb_eArgError `obj` is an unknown encoding name. + * @return Encoding of `obj`. + */ +rb_encoding *rb_to_encoding(VALUE obj); + +/** + * Identical to rb_to_encoding_index(), except the return type. + * + * @param[in] obj An ::rb_cEncoding, or its name in ::rb_cString. + * @exception rb_eTypeError `obj` is neither ::rb_cEncoding nor ::rb_cString. + * @retval NULL No such encoding. + * @return otherwise Encoding of `obj`. + */ +rb_encoding *rb_find_encoding(VALUE obj); + +/** + * Identical to rb_enc_get_index(), except the return type. + * + * @param[in] obj Object in question. + * @retval NULL Obj is incapable of having an encoding. + * @retval otherwise `obj`'s encoding. + */ +rb_encoding *rb_enc_get(VALUE obj); + +/** + * Look for the "common" encoding between the two. One character can or cannot + * be expressed depending on an encoding. This function finds the super-set of + * encodings that satisfy contents of both arguments. If that is impossible + * returns NULL. + * + * @param[in] str1 An object. + * @param[in] str2 Another object. + * @retval NULL No encoding can satisfy both at once. + * @retval otherwise Common encoding between the two. + * @note Arguments can be non-string, e.g. Regexp. + */ +rb_encoding *rb_enc_compatible(VALUE str1, VALUE str2); + +/** + * Identical to rb_enc_compatible(), except it raises an exception instead of + * returning NULL. + * + * @param[in] str1 An object. + * @param[in] str2 Another object. + * @exception rb_eEncCompatError No encoding can satisfy both. + * @return Common encoding between the two. + * @note Arguments can be non-string, e.g. Regexp. + */ +rb_encoding *rb_enc_check(VALUE str1,VALUE str2); + +/** + * Identical to rb_enc_set_index(), except it additionally does contents fix-up + * depending on the passed object. It for instance changes the byte length of + * terminating `U+0000` according to the passed encoding. + * + * @param[out] obj Object in question. + * @param[in] encindex An encoding index. + * @exception rb_eFrozenError `obj` is frozen. + * @exception rb_eArgError `obj` is incapable of having an encoding. + * @exception rb_eEncodingError `encindex` is out of bounds. + * @exception rb_eLoadError Failed to load the encoding. + * @return The passed `obj`. + * @post `obj`'s contents might be fixed according to `encindex`. + */ +VALUE rb_enc_associate_index(VALUE obj, int encindex); + +/** + * Identical to rb_enc_associate(), except it takes an encoding itself instead + * of its index. + * + * @param[out] obj Object in question. + * @param[in] enc An encoding. + * @exception rb_eFrozenError `obj` is frozen. + * @exception rb_eArgError `obj` is incapable of having an encoding. + * @return The passed `obj`. + * @post `obj`'s contents might be fixed according to `enc`. + */ +VALUE rb_enc_associate(VALUE obj, rb_encoding *enc); + +/** + * Destructively copies the encoding of the latter object to that of former + * one. It can also be seen as a routine identical to + * rb_enc_associate_index(), except it takes an object's encoding instead of an + * encoding's index. + * + * @param[out] dst Object to modify. + * @param[in] src Object to reference. + * @exception rb_eFrozenError `dst` is frozen. + * @exception rb_eArgError `dst` is incapable of having an encoding. + * @exception rb_eEncodingError `src` is incapable of having an encoding. + * @post `dst`'s encoding is that of `src`'s. + */ +void rb_enc_copy(VALUE dst, VALUE src); + + +/** + * Identical to rb_find_encoding(), except it takes an encoding index instead + * of a Ruby object. + * + * @param[in] idx An encoding index. + * @retval NULL No such encoding. + * @retval otherwise An encoding whose index is `idx`. + */ +rb_encoding *rb_enc_from_index(int idx); + +/** + * Identical to rb_find_encoding(), except it takes a C's string instead of + * Ruby's. + * + * @param[in] name Name of the encoding to query. + * @retval NULL No such encoding. + * @retval otherwise An encoding whose index is `idx`. + */ +rb_encoding *rb_enc_find(const char *name); + +/** + * Queries the (canonical) name of the passed encoding. + * + * @param[in] enc An encoding. + * @return Its name. + */ +#define rb_enc_name(enc) (enc)->name + +/** + * Queries the minimum number of bytes that the passed encoding needs to + * represent a character. For ASCII and compatible encodings this is typically + * 1. There are however encodings whose minimum is not 1; they are + * historically called wide characters. + * + * @param[in] enc An encoding. + * @return Its least possible number of bytes except 0. + */ +#define rb_enc_mbminlen(enc) (enc)->min_enc_len + +/** + * Queries the maximum number of bytes that the passed encoding needs to + * represent a character. Fixed-width encodings have the same value for this + * one and #rb_enc_mbminlen. However there are variable-width encodings. + * UTF-8, for instance, takes from 1 up to 6 bytes. + * + * @param[in] enc An encoding. + * @return Its maximum possible number of bytes of a character. + */ +#define rb_enc_mbmaxlen(enc) (enc)->max_enc_len + +/** + * Queries the number of bytes of the character at the passed pointer. + * + * @param[in] p Pointer to a character's first byte. + * @param[in] e End of the string that has `p`. + * @param[in] enc Encoding of the string. + * @return If the character at `p` does not end until `e`, number of bytes + * between `p` and `e`. Otherwise the number of bytes that the + * character at `p` is encoded. + * + * @internal + * + * Strictly speaking there are chances when `p` points to a middle byte of a + * wide character. This function returns "the number of bytes from `p` to + * nearest of either `e` or the next character boundary", if you go strict. + */ +int rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc); + +/** + * Identical to rb_enc_mbclen() unless the character at `p` overruns `e`. That + * can happen for instance when you read from a socket and its partial read + * cuts a wide character in-between. In those situations this function + * "estimates" theoretical length of the character in question. Typically it + * tends to be possible to know how many bytes a character needs before + * actually reaching its end; for instance UTF-8 encodes a character's length + * in the first byte of it. This function returns that info. + * + * @note This implies that the string is not broken. + * + * @param[in] p Pointer to the character's first byte. + * @param[in] e End of the string that has `p`. + * @param[in] enc Encoding of the string. + * @return Number of bytes of character at `p`, measured or estimated. + */ +int rb_enc_fast_mbclen(const char *p, const char *e, rb_encoding *enc); + +/** + * Queries the number of bytes of the character at the passed pointer. This + * function returns 3 different types of information: + * + * ```CXX + * auto n = rb_enc_precise_mbclen(p, q, r); + * + * if (ONIGENC_MBCLEN_CHARFOUND_P(n)) { + * // Character found. Normal return. + * auto found_length = ONIGENC_MBCLEN_CHARFOUND_LEN(n); + * } + * else if (ONIGENC_MBCLEN_NEEDMORE_P(n)) { + * // Character overruns past `q`; needs more. + * auto requested_length = ONIGENC_MBCLEN_NEEDMORE_LEN(n); + * } + * else { + * // `p` is broken. + * assert(ONIGENC_MBCLEN_INVALID_P(n)); + * } + * ``` + * + * @param[in] p Pointer to the character's first byte. + * @param[in] e End of the string that has `p`. + * @param[in] enc Encoding of the string. + * @return Encoded read/needed number of bytes (see above). + */ +int rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc); + +#define MBCLEN_CHARFOUND_P(ret) ONIGENC_MBCLEN_CHARFOUND_P(ret) /**< @old{ONIGENC_MBCLEN_CHARFOUND_P} */ +#define MBCLEN_CHARFOUND_LEN(ret) ONIGENC_MBCLEN_CHARFOUND_LEN(ret) /**< @old{ONIGENC_MBCLEN_CHARFOUND_LEN} */ +#define MBCLEN_INVALID_P(ret) ONIGENC_MBCLEN_INVALID_P(ret) /**< @old{ONIGENC_MBCLEN_INVALID_P} */ +#define MBCLEN_NEEDMORE_P(ret) ONIGENC_MBCLEN_NEEDMORE_P(ret) /**< @old{ONIGENC_MBCLEN_NEEDMORE_P} */ +#define MBCLEN_NEEDMORE_LEN(ret) ONIGENC_MBCLEN_NEEDMORE_LEN(ret) /**< @old{ONIGENC_MBCLEN_NEEDMORE_LEN} */ + +/** + * Queries the code point of character pointed by the passed pointer. If that + * code point is included in ASCII that code point is returned. Otherwise -1. + * This can be different from just looking at the first byte. For instance it + * reads 2 bytes in case of UTF-16BE. + * + * @param[in] p Pointer to the character's first byte. + * @param[in] e End of the string that has `p`. + * @param[in] len Return buffer. + * @param[in] enc Encoding of the string. + * @retval -1 The character at `p` is not i ASCII. + * @retval otherwise A code point of the character at `p`. + * @post `len` (if set) is the number of bytes of `p`. + */ +int rb_enc_ascget(const char *p, const char *e, int *len, rb_encoding *enc); + +/** + * Queries the code point of character pointed by the passed pointer. + * Exceptions happen in case of broken input. + * + * @param[in] p Pointer to the character's first byte. + * @param[in] e End of the string that has `p`. + * @param[in] len Return buffer. + * @param[in] enc Encoding of the string. + * @exception rb_eArgError `p` is broken. + * @return Code point of the character pointed by `p`. + * @post `len` (if set) is the number of bytes of `p`. + */ +unsigned int rb_enc_codepoint_len(const char *p, const char *e, int *len, rb_encoding *enc); + +RBIMPL_ATTR_DEPRECATED(("use rb_enc_codepoint_len instead.")) +/** + * Queries the code point of character pointed by the passed pointer. + * Exceptions happen in case of broken input. + * + * @deprecated Use rb_enc_codepoint_len() instead. + * @param[in] p Pointer to the character's first byte. + * @param[in] e End of the string that has `p`. + * @param[in] enc Encoding of the string. + * @exception rb_eArgError `p` is broken. + * @return Code point of the character pointed by `p`. + */ +unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc); + +/** @cond INTERNAL_MACRO */ +#define rb_enc_codepoint(p,e,enc) rb_enc_codepoint_len((p),(e),0,(enc)) +/** @endcond */ + +/** + * Identical to rb_enc_codepoint(), except it assumes the passed character is + * not broken. + * + * @param[in] p Pointer to the character's first byte. + * @param[in] e End of the string that has `p`. + * @param[in] enc Encoding of the string. + * @return Code point of the character pointed by `p`. + */ +#define rb_enc_mbc_to_codepoint(p, e, enc) ONIGENC_MBC_TO_CODE((enc),(UChar*)(p),(UChar*)(e)) + +/** + * Queries the number of bytes requested to represent the passed code point + * using the passed encoding. + * + * @param[in] code Code point in question. + * @param[in] enc Encoding to convert the code into a byte sequence. + * @exception rb_eArgError `enc` does not glean `code`. + * @return Number of bytes requested to represent `code` using `enc`. + */ +int rb_enc_codelen(int code, rb_encoding *enc); + +/** + * Identical to rb_enc_codelen(), except it returns 0 for invalid code points. + * + * @param[in] code Code point in question. + * @param[in] enc Encoding to convert the code into a byte sequence. + * @retval 0 `code` is invalid. + * @return otherwise Number of bytes used for `enc` to encode `code`. + */ +int rb_enc_code_to_mbclen(int code, rb_encoding *enc); + +/** @cond INTERNAL_MACRO */ +#define rb_enc_code_to_mbclen(c, enc) ONIGENC_CODE_TO_MBCLEN((enc), (c)); +/** @endcond */ + +/** + * Identical to rb_enc_uint_chr(), except it writes back to the passed buffer + * instead of allocating one. + * + * @param[in] c Code point. + * @param[out] buf Return buffer. + * @param[in] enc Target encoding scheme. + * @post `c` is encoded according to `enc`, then written to `buf`. + */ +#define rb_enc_mbcput(c,buf,enc) ONIGENC_CODE_TO_MBC((enc),(c),(UChar*)(buf)) + +/** + * Queries the previous (left) character. + * + * @param[in] s Start of the string. + * @param[in] p Pointer to a character. + * @param[in] e End of the string. + * @param[in] enc Encoding. + * @retval NULL No previous character. + * @retval otherwise Pointer to the head of the previous character. + */ +#define rb_enc_prev_char(s,p,e,enc) ((char *)onigenc_get_prev_char_head((enc),(UChar*)(s),(UChar*)(p),(UChar*)(e))) + +/** + * Queries the left boundary of a character. This function takes a pointer + * that is not necessarily a head of a character, and searches for its head. + * + * @param[in] s Start of the string. + * @param[in] p Pointer to a possibly-middle of a character. + * @param[in] e End of the string. + * @param[in] enc Encoding. + * @return Pointer to the head of the character that contains `p`. + */ +#define rb_enc_left_char_head(s,p,e,enc) ((char *)onigenc_get_left_adjust_char_head((enc),(UChar*)(s),(UChar*)(p),(UChar*)(e))) + +/** + * Queries the right boundary of a character. This function takes a pointer + * that is not necessarily a head of a character, and searches for its tail. + * + * @param[in] s Start of the string. + * @param[in] p Pointer to a possibly-middle of a character. + * @param[in] e End of the string. + * @param[in] enc Encoding. + * @return Pointer to the end of the character that contains `p`. + */ +#define rb_enc_right_char_head(s,p,e,enc) ((char *)onigenc_get_right_adjust_char_head((enc),(UChar*)(s),(UChar*)(p),(UChar*)(e))) + +/** + * Scans the string backwards for n characters. + * + * @param[in] s Start of the string. + * @param[in] p Pointer to a character. + * @param[in] e End of the string. + * @param[in] n Steps. + * @param[in] enc Encoding. + * @retval NULL There are no `n` characters left. + * @retval otherwise Pointer to `n` character before `p`. + */ +#define rb_enc_step_back(s,p,e,n,enc) ((char *)onigenc_step_back((enc),(UChar*)(s),(UChar*)(p),(UChar*)(e),(int)(n))) + +/** + * @private + * + * This is an implementation detail of rb_enc_asciicompat(). People don't use + * it directly. Just always use rb_enc_asciicompat(). + * + * @param[in] enc Encoding in question. + * @retval 1 It is ASCII compatible. + * @retval 0 It isn't. + */ +static inline int +rb_enc_asciicompat_inline(rb_encoding *enc) +{ + return rb_enc_mbminlen(enc)==1 && !rb_enc_dummy_p(enc); +} + +/** + * Queries if the passed encoding is _in some sense_ compatible with ASCII. + * The concept of ASCII compatibility is nuanced, and private to our + * implementation. For instance SJIS is ASCII compatible to us, despite their + * having different characters at code point `0x5C`. This is based on some + * practical consideration that Japanese people confuses SJIS to be "upper + * compatible" with ASCII (which is in fact a wrong idea, but we just don't go + * strict here). An example of ASCII incompatible encoding is UTF-16. UTF-16 + * shares code points with ASCII, but employs a completely different encoding + * scheme. + * + * @param[in] enc Encoding in question. + * @retval 0 It is incompatible. + * @retval 1 It is compatible. + */ +#define rb_enc_asciicompat(enc) rb_enc_asciicompat_inline(enc) + + +/** + * Queries if the passed string is in an ASCII-compatible encoding. + * + * @param[in] str A Ruby's string to query. + * @retval 0 `str` is not a String, or an ASCII-incompatible string. + * @retval 1 Otherwise. + */ +#define rb_enc_str_asciicompat_p(str) rb_enc_asciicompat(rb_enc_get(str)) + +/** + * Queries the Ruby-level counterpart instance of ::rb_cEncoding that + * corresponds to the passed encoding. + * + * @param[in] enc An encoding + * @retval RUBY_Qnil `enc` is a null pointer. + * @retval otherwise An instance of ::rb_cEncoding. + */ +VALUE rb_enc_from_encoding(rb_encoding *enc); + +RBIMPL_ATTR_PURE() +/** + * Queries if the passed encoding is either one of UTF-8/16/32. + * + * @note It does not take UTF-7, which we actually support, into account. + * + * @param[in] enc Encoding in question. + * @retval 0 It is not a Unicode variant. + * @retval otherwise It is. + * + * @internal + * + * In reality it returns 1/0, but the value is abstracted as + * `ONIGENC_FLAG_UNICODE`. + */ +int rb_enc_unicode_p(rb_encoding *enc); + +RBIMPL_ATTR_RETURNS_NONNULL() +/** + * Queries the encoding that represents ASCII-8BIT a.k.a. binary. + * + * @return The encoding that represents ASCII-8BIT. + * + * @internal + * + * This can not return NULL once the process properly boots up. + */ +rb_encoding *rb_ascii8bit_encoding(void); + +RBIMPL_ATTR_RETURNS_NONNULL() +/** + * Queries the encoding that represents UTF-8. + * + * @return The encoding that represents UTF-8. + * + * @internal + * + * This can not return NULL once the process properly boots up. + */ +rb_encoding *rb_utf8_encoding(void); + +RBIMPL_ATTR_RETURNS_NONNULL() +/** + * Queries the encoding that represents US-ASCII. + * + * @return The encoding that represents US-ASCII. + * + * @internal + * + * This can not return NULL once the process properly boots up. + */ +rb_encoding *rb_usascii_encoding(void); + +/** + * Queries the encoding that represents the current locale. + * + * @return The encoding that represents the process' locale. + * + * @internal + * + * This is dynamic. If you change the process' locale by e.g. calling + * `setlocale(3)`, that should also change the return value of this function. + * + * There is no official way for Ruby scripts to manipulate locales, though. + */ +rb_encoding *rb_locale_encoding(void); + +/** + * Queries the "filesystem" encoding. This is the encoding that ruby expects + * info from the OS' file system are in. This affects for instance return + * value of rb_dir_getwd(). Most notably on Windows it can be an alias of OS + * codepage. Most notably on Linux users can set this via default external + * encoding. + * + * @return The "filesystem" encoding. + */ +rb_encoding *rb_filesystem_encoding(void); + +/** + * Queries the "default external" encoding. This is used to interact with + * outer-process things such as File. Though not recommended, you can set this + * using rb_enc_set_default_external(). + * + * @return The "default external" encoding. + */ +rb_encoding *rb_default_external_encoding(void); + +/** + * Queries the "default internal" encoding. This could be a null pointer. + * Otherwise, outer-process info are transcoded from default external encoding + * to this one during reading from an IO. + * + * @return The "default internal" encoding (if any). + */ +rb_encoding *rb_default_internal_encoding(void); + +#ifndef rb_ascii8bit_encindex +RBIMPL_ATTR_CONST() +/** + * Identical to rb_ascii8bit_encoding(), except it returns the encoding's index + * instead of the encoding itself. + * + * @return The index of encoding of ASCII-8BIT. + * + * @internal + * + * This happens to be 0. + */ +int rb_ascii8bit_encindex(void); +#endif + +#ifndef rb_utf8_encindex +RBIMPL_ATTR_CONST() +/** + * Identical to rb_utf8_encoding(), except it returns the encoding's index + * instead of the encoding itself. + * + * @return The index of encoding of UTF-8. + */ +int rb_utf8_encindex(void); +#endif + +#ifndef rb_usascii_encindex +RBIMPL_ATTR_CONST() +/** + * Identical to rb_usascii_encoding(), except it returns the encoding's index + * instead of the encoding itself. + * + * @return The index of encoding of UTF-8. + */ +int rb_usascii_encindex(void); +#endif + +/** + * Identical to rb_locale_encoding(), except it returns the encoding's index + * instead of the encoding itself. + * + * @return The index of the locale encoding. + */ +int rb_locale_encindex(void); + +/** + * Identical to rb_filesystem_encoding(), except it returns the encoding's + * index instead of the encoding itself. + * + * @return The index of the filesystem encoding. + */ +int rb_filesystem_encindex(void); + +/** + * Identical to rb_default_external_encoding(), except it returns the + * Ruby-level counterpart instance of ::rb_cEncoding that corresponds to the + * default external encoding. + * + * @return An instance of ::rb_cEncoding of default external. + */ +VALUE rb_enc_default_external(void); + +/** + * Identical to rb_default_internal_encoding(), except it returns the + * Ruby-level counterpart instance of ::rb_cEncoding that corresponds to the + * default internal encoding. + * + * @return An instance of ::rb_cEncoding of default internal. + */ +VALUE rb_enc_default_internal(void); + +/** + * Destructively assigns the passed encoding as the default external encoding. + * You should not use this API. It has process-global side effects. Also it + * doesn't change encodings of strings that have already been read. + * + * @param[in] encoding Ruby level encoding. + * @exception rb_eArgError `encoding` is ::RUBY_Qnil. + * @post The default external encoding is `encoding`. + */ +void rb_enc_set_default_external(VALUE encoding); + +/** + * Destructively assigns the passed encoding as the default internal encoding. + * You should not use this API. It has process-global side effects. Also it + * doesn't change encodings of strings that have already been read. + * + * @param[in] encoding Ruby level encoding. + * @post The default internal encoding is `encoding`. + * @note Unlike rb_enc_set_default_external() you can pass ::RUBY_Qnil. + */ +void rb_enc_set_default_internal(VALUE encoding); + +/** + * Returns a platform-depended "charmap" of the current locale. This + * information is called a "Codeset name" in IEEE 1003.1 section 13 + * (``). This is a very low-level API. The return value can have + * no corresponding encoding when passed to rb_find_encoding(). + * + * @param[in] klass Ignored for no reason (why...) + * @return The low-level locale charmap, in Ruby's String. + */ +VALUE rb_locale_charmap(VALUE klass); + +RBIMPL_SYMBOL_EXPORT_END() + +#endif /* RUBY_INTERNAL_ENCODING_ENCODING_H */ diff --git a/include/ruby/internal/encoding/pathname.h b/include/ruby/internal/encoding/pathname.h new file mode 100644 index 0000000000..0b5e85a524 --- /dev/null +++ b/include/ruby/internal/encoding/pathname.h @@ -0,0 +1,184 @@ +#ifndef RUBY_INTERNAL_ENCODING_PATHNAME_H /*-*-C++-*-vi:se ft=cpp:*/ +#define RUBY_INTERNAL_ENCODING_PATHNAME_H +/** + * @file + * @author Ruby developers + * @copyright This file is a part of the programming language Ruby. + * Permission is hereby granted, to either redistribute and/or + * modify this file, provided that the conditions mentioned in the + * file COPYING are met. Consult the file for details. + * @warning Symbols prefixed with either `RBIMPL` or `rbimpl` are + * implementation details. Don't take them as canon. They could + * rapidly appear then vanish. The name (path) of this header file + * is also an implementation detail. Do not expect it to persist + * at the place it is now. Developers are free to move it anywhere + * anytime at will. + * @note To ruby-core: remember that this header can be possibly + * recursively included from extension libraries written in C++. + * Do not expect for instance `__VA_ARGS__` is always available. + * We assume C99 for ruby itself but we don't assume languages of + * extension libraries. They could be written in C++98. + * @brief Routines to manipulate encodings of pathnames. + */ + +#include "ruby/internal/attr/nonnull.h" +#include "ruby/internal/dllexport.h" +#include "ruby/internal/encoding/encoding.h" +#include "ruby/internal/value.h" + +RBIMPL_SYMBOL_EXPORT_BEGIN() +RBIMPL_ATTR_NONNULL(()) +/** + * Returns a path component directly adjacent to the passed pointer. + * + * ``` + * "/multi/byte/encoded/pathname.txt" + * ^ ^ ^ + * | | +--- end + * | +--- @return + * +--- path + * ``` + * + * @param[in] path Where to start scanning. + * @param[in] end End of the path string. + * @param[in] enc Encoding of the string. + * @return A pointer in the passed string where the next path component + * resides, or `end` if there is no next path component. + */ +char *rb_enc_path_next(const char *path, const char *end, rb_encoding *enc); + +RBIMPL_ATTR_NONNULL(()) +/** + * Seeks for non-prefix part of a pathname. This can be a no-op when the OS + * has no such concept like a path prefix. But there are OSes where path + * prefixes do exist. + * + * ``` + * "C:\multi\byte\encoded\pathname.txt" + * ^ ^ ^ + * | | +--- end + * | +--- @return + * +--- path + * ``` + * + * @param[in] path Where to start scanning. + * @param[in] end End of the path string. + * @param[in] enc Encoding of the string. + * @return A pointer in the passed string where non-prefix part starts, or + * `path` if the OS does not have path prefix. + */ +char *rb_enc_path_skip_prefix(const char *path, const char *end, rb_encoding *enc); + +RBIMPL_ATTR_NONNULL(()) +/** + * Returns the last path component. + * + * ``` + * "/multi/byte/encoded/pathname.txt" + * ^ ^ ^ + * | | +--- end + * | +--- @return + * +--- path + * ``` + * + * @param[in] path Where to start scanning. + * @param[in] end End of the path string. + * @param[in] enc Encoding of the string. + * @return A pointer in the passed string where the last path component + * resides, or `end` if there is no more path component. + */ +char *rb_enc_path_last_separator(const char *path, const char *end, rb_encoding *enc); + +RBIMPL_ATTR_NONNULL(()) +/** + * This just returns the passed end basically. It makes difference in case the + * passed string ends with tons of path separators like the following: + * + * ``` + * "/path/that/ends/with/lots/of/slashes//////////////" + * ^ ^ ^ + * | | +--- end + * | +--- @return + * +--- path + * ``` + * + * @param[in] path Where to start scanning. + * @param[in] end End of the path string. + * @param[in] enc Encoding of the string. + * @return A pointer in the passed string where the trailing path + * separators start, or `end` if there is no trailing path + * separators. + * + * @internal + * + * It seems this function was introduced to mimic what POSIX says about + * `basename(3)`. + */ +char *rb_enc_path_end(const char *path, const char *end, rb_encoding *enc); + +RBIMPL_ATTR_NONNULL((1, 4)) +/** + * Our own encoding-aware version of `basename(3)`. Normally, this function + * returns the last path component of the given name. However in case the + * passed name ends with a path separator, it returns the name of the + * directory, not the last (empty) component. Also if the passed name is a + * root directory, it returns that root directory. Note however that Windows + * filesystem have drive letters, which this function does not return. + * + * @param[in] name Target path. + * @param[out] baselen Return buffer. + * @param[in,out] alllen Number of bytes of `name`. + * @param[enc] enc Encoding of `name`. + * @return The rightmost component of `name`. + * @post `baselen`, if passed, is updated to be the number of bytes + * of the returned basename. + * @post `alllen`, if passed, is updated to be the number of bytes of + * strings not considered as the basename. + */ +const char *ruby_enc_find_basename(const char *name, long *baselen, long *alllen, rb_encoding *enc); + +RBIMPL_ATTR_NONNULL((1, 3)) +/** + * Our own encoding-aware version of `extname`. This function first applies + * rb_enc_path_last_separator() to the passed name and only concerns its return + * value (ignores any parent directories). This function returns complicated + * results: + * + * ```CXX + * auto path = "..."; + * auto len = strlen(path); + * auto ret = ruby_enc_find_extname(path, &len, rb_ascii8bit_encoding()); + * + * switch(len) { + * case 0: + * if (ret == 0) { + * // `path` is a file without extensions. + * } + * else { + * // `path` is a dotfile. + * // `ret` is the file's name. + * } + * break; + * + * case 1: + * // `path` _ends_ with a dot. + * // `ret` is that dot. + * break; + * + * default: + * // `path` has an extension. + * // `ret` is that extension. + * } + * ``` + * + * @param[in] name Target path. + * @param[in,out] len Number of bytes of `name`. + * @param[in] enc Encoding of `name`. + * @return See above. + * @post `len`, if passed, is updated (see above). + */ +const char *ruby_enc_find_extname(const char *name, long *len, rb_encoding *enc); + +RBIMPL_SYMBOL_EXPORT_END() + +#endif /* RUBY_INTERNAL_ENCODING_PATHNAME_H */ diff --git a/include/ruby/internal/encoding/re.h b/include/ruby/internal/encoding/re.h new file mode 100644 index 0000000000..d0de23bc83 --- /dev/null +++ b/include/ruby/internal/encoding/re.h @@ -0,0 +1,46 @@ +#ifndef RUBY_INTERNAL_ENCODING_RE_H /*-*-C++-*-vi:se ft=cpp:*/ +#define RUBY_INTERNAL_ENCODING_RE_H +/** + * @file + * @author Ruby developers + * @copyright This file is a part of the programming language Ruby. + * Permission is hereby granted, to either redistribute and/or + * modify this file, provided that the conditions mentioned in the + * file COPYING are met. Consult the file for details. + * @warning Symbols prefixed with either `RBIMPL` or `rbimpl` are + * implementation details. Don't take them as canon. They could + * rapidly appear then vanish. The name (path) of this header file + * is also an implementation detail. Do not expect it to persist + * at the place it is now. Developers are free to move it anywhere + * anytime at will. + * @note To ruby-core: remember that this header can be possibly + * recursively included from extension libraries written in C++. + * Do not expect for instance `__VA_ARGS__` is always available. + * We assume C99 for ruby itself but we don't assume languages of + * extension libraries. They could be written in C++98. + * @brief Routines to manipulate encodings of symbols. + */ + +#include "ruby/internal/dllexport.h" +#include "ruby/internal/encoding/encoding.h" +#include "ruby/internal/value.h" + +RBIMPL_SYMBOL_EXPORT_BEGIN() + +/** + * Identical to rb_reg_new(), except it additionally takes an encoding. + * + * @param[in] ptr A memory region of `len` bytes length. + * @param[in] len Length of `ptr`, in bytes, not including the + * terminating NUL character. + * @param[in] enc Encoding of `ptr`. + * @param[in] opts Options e.g. ONIG_OPTION_MULTILINE. + * @exception rb_eRegexpError Failed to compile `ptr`. + * @return An allocated new instance of ::rb_cRegexp, of `enc` encoding, + * whose expression is compiled according to `ptr`. + */ +VALUE rb_enc_reg_new(const char *ptr, long len, rb_encoding *enc, int opts); + +RBIMPL_SYMBOL_EXPORT_END() + +#endif /* RUBY_INTERNAL_ENCODING_RE_H */ diff --git a/include/ruby/internal/encoding/sprintf.h b/include/ruby/internal/encoding/sprintf.h new file mode 100644 index 0000000000..cb8737b414 --- /dev/null +++ b/include/ruby/internal/encoding/sprintf.h @@ -0,0 +1,78 @@ +#ifndef RUBY_INTERNAL_ENCODING_SPRINTF_H /*-*-C++-*-vi:se ft=cpp:*/ +#define RUBY_INTERNAL_ENCODING_SPRINTF_H +/** + * @file + * @author Ruby developers + * @copyright This file is a part of the programming language Ruby. + * Permission is hereby granted, to either redistribute and/or + * modify this file, provided that the conditions mentioned in the + * file COPYING are met. Consult the file for details. + * @warning Symbols prefixed with either `RBIMPL` or `rbimpl` are + * implementation details. Don't take them as canon. They could + * rapidly appear then vanish. The name (path) of this header file + * is also an implementation detail. Do not expect it to persist + * at the place it is now. Developers are free to move it anywhere + * anytime at will. + * @note To ruby-core: remember that this header can be possibly + * recursively included from extension libraries written in C++. + * Do not expect for instance `__VA_ARGS__` is always available. + * We assume C99 for ruby itself but we don't assume languages of + * extension libraries. They could be written in C++98. + * @brief Routines to manipulate encodings of symbols. + */ +#include "ruby/internal/config.h" +#include +#include "ruby/internal/attr/format.h" +#include "ruby/internal/attr/nonnull.h" +#include "ruby/internal/attr/noreturn.h" +#include "ruby/internal/dllexport.h" +#include "ruby/internal/encoding/encoding.h" +#include "ruby/internal/value.h" + +RBIMPL_SYMBOL_EXPORT_BEGIN() +RBIMPL_ATTR_NONNULL((2)) +RBIMPL_ATTR_FORMAT(RBIMPL_PRINTF_FORMAT, 2, 3) +/** + * Identical to rb_sprintf(), except it additionally takes an encoding. The + * passed encoding rules both the incoming format specifier and the resulting + * string. + * + * @param[in] enc Encoding of `fmt`. + * @param[in] fmt A `printf`-like format specifier. + * @param[in] ... Variadic number of contents to format. + * @return A rendered new instance of ::rb_cString, of `enc` encoding. + */ +VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt, ...); + +RBIMPL_ATTR_NONNULL((2)) +RBIMPL_ATTR_FORMAT(RBIMPL_PRINTF_FORMAT, 2, 0) +/** + * Identical to rb_enc_sprintf(), except it takes a `va_list` instead of + * variadic arguments. It can also be seen as a routine identical to + * rb_vsprintf(), except it additionally takes an encoding. + * + * @param[in] enc Encoding of `fmt`. + * @param[in] fmt A `printf`-like format specifier. + * @param[in] ap Contents to format. + * @return A rendered new instance of ::rb_cString, of `enc` encoding. + */ +VALUE rb_enc_vsprintf(rb_encoding *enc, const char *fmt, va_list ap); + +RBIMPL_ATTR_NORETURN() +RBIMPL_ATTR_NONNULL((3)) +RBIMPL_ATTR_FORMAT(RBIMPL_PRINTF_FORMAT, 3, 4) +/** + * Identical to rb_raise(), except it additionally takes an encoding. + * + * @param[in] enc Encoding of the generating exception. + * @param[in] exc A subclass of ::rb_eException. + * @param[in] fmt Format specifier string compatible with rb_sprintf(). + * @param[in] ... Contents of the message. + * @exception exc The specified exception. + * @note It never returns. + */ +void rb_enc_raise(rb_encoding *enc, VALUE exc, const char *fmt, ...); + +RBIMPL_SYMBOL_EXPORT_END() + +#endif /* RUBY_INTERNAL_ENCODING_SPRINTF_H */ diff --git a/include/ruby/internal/encoding/string.h b/include/ruby/internal/encoding/string.h new file mode 100644 index 0000000000..87226bec10 --- /dev/null +++ b/include/ruby/internal/encoding/string.h @@ -0,0 +1,337 @@ +#ifndef RUBY_INTERNAL_ENCODING_STRING_H /*-*-C++-*-vi:se ft=cpp:*/ +#define RUBY_INTERNAL_ENCODING_STRING_H +/** + * @file + * @author Ruby developers + * @copyright This file is a part of the programming language Ruby. + * Permission is hereby granted, to either redistribute and/or + * modify this file, provided that the conditions mentioned in the + * file COPYING are met. Consult the file for details. + * @warning Symbols prefixed with either `RBIMPL` or `rbimpl` are + * implementation details. Don't take them as canon. They could + * rapidly appear then vanish. The name (path) of this header file + * is also an implementation detail. Do not expect it to persist + * at the place it is now. Developers are free to move it anywhere + * anytime at will. + * @note To ruby-core: remember that this header can be possibly + * recursively included from extension libraries written in C++. + * Do not expect for instance `__VA_ARGS__` is always available. + * We assume C99 for ruby itself but we don't assume languages of + * extension libraries. They could be written in C++98. + * @brief Routines to manipulate encodings of strings. + */ + +#include "ruby/internal/dllexport.h" +#include "ruby/internal/value.h" +#include "ruby/internal/encoding/encoding.h" +#include "ruby/internal/attr/nonnull.h" + +RBIMPL_SYMBOL_EXPORT_BEGIN() + +/** + * Identical to rb_enc_str_new(), except it additionally takes an encoding. + * + * @param[in] ptr A memory region of `len` bytes length. + * @param[in] len Length of `ptr`, in bytes, not including the + * terminating NUL character. + * @param[in] enc Encoding of `ptr`. + * @exception rb_eNoMemError Failed to allocate `len+1` bytes. + * @exception rb_eArgError `len` is negative. + * @return An instance of ::rb_cString, of `len` bytes length, of `enc` + * encoding, whose contents are verbatim copy of `ptr`. + * @pre At least `len` bytes of continuous memory region shall be + * accessible via `ptr`. + * @note `enc` can be a null pointer. It can also be seen as a routine + * identical to rb_usascii_str_new() then. + */ +VALUE rb_enc_str_new(const char *ptr, long len, rb_encoding *enc); + +RBIMPL_ATTR_NONNULL((1)) +/** + * Identical to rb_enc_str_new(), except it assumes the passed pointer is a + * pointer to a C string. It can also be seen as a routine identical to + * rb_str_new_cstr(), except it additionally takes an encoding. + * + * @param[in] ptr A C string. + * @param[in] enc Encoding of `ptr`. + * @exception rb_eNoMemError Failed to allocate memory. + * @return An instance of ::rb_cString, of `enc` encoding, whose contents + * are verbatim copy of `ptr`. + * @pre `ptr` must not be a null pointer. + * @pre Because `ptr` is a C string it makes no sense for `enc` to be + * something like UTF-32. + * @note `enc` can be a null pointer. It can also be seen as a routine + * identical to rb_usascii_str_new_cstr() then. + */ +VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc); + +/** + * Identical to rb_enc_str_new(), except it takes a C string literal. It can + * also be seen as a routine identical to rb_str_new_static(), except it + * additionally takes an encoding. + * + * @param[in] ptr A C string literal. + * @param[in] len `strlen(ptr)`. + * @param[in] enc Encoding of `ptr`. + * @exception rb_eArgError `len` out of range of `size_t`. + * @pre `ptr` must be a C string constant. + * @return An instance of ::rb_cString, of `enc` encoding, whose backend + * storage is the passed C string literal. + * @warning It is a very bad idea to write to a C string literal (often + * immediate SEGV shall occur). Consider return values of this + * function be read-only. + * @note `enc` can be a null pointer. It can also be seen as a routine + * identical to rb_usascii_str_new_static() then. + */ +VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc); + +/** + * Identical to rb_enc_str_new(), except it returns a "f"string. It can also + * be seen as a routine identical to rb_interned_str(), except it additionally + * takes an encoding. + * + * @param[in] ptr A memory region of `len` bytes length. + * @param[in] len Length of `ptr`, in bytes, not including the + * terminating NUL character. + * @param[in] enc Encoding of `ptr`. + * @exception rb_eArgError `len` is negative. + * @return A found or created instance of ::rb_cString, of `len` bytes + * length, of `enc` encoding, whose contents are identical to that + * of `ptr`. + * @pre At least `len` bytes of continuous memory region shall be + * accessible via `ptr`. + * @note `enc` can be a null pointer. + */ +VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc); + +RBIMPL_ATTR_NONNULL((1)) +/** + * Identical to rb_enc_str_new_cstr(), except it returns a "f"string. It can + * also be seen as a routine identical to rb_interned_str_cstr(), except it + * additionally takes an encoding. + * + * @param[in] ptr A memory region of `len` bytes length. + * @param[in] enc Encoding of `ptr`. + * @return A found or created instance of ::rb_cString of `enc` encoding, + * whose contents are identical to that of `ptr`. + * @pre At least `len` bytes of continuous memory region shall be + * accessible via `ptr`. + * @note `enc` can be a null pointer. + */ +VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc); + +/** + * Counts the number of characters of the passed string, according to the + * passed encoding. This has to be complicated. The passed string could be + * invalid and/or broken. This routine would scan from the beginning til the + * end, byte by byte, to seek out character boundaries. Could be super slow. + * + * @param[in] head Leftmost pointer to the string. + * @param[in] tail Rightmost pointer to the string. + * @param[in] enc Encoding of the string. + * @return Number of characters exist in `head` .. `tail`. The definition + * of "character" depends on the passed `enc`. + */ +long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc); + +/** + * Queries the n-th character. Like rb_enc_strlen() this function can be fast + * or slow depending on the contents. Don't expect characters to be uniformly + * distributed across the entire string. + * + * @param[in] head Leftmost pointer to the string. + * @param[in] tail Rightmost pointer to the string. + * @param[in] nth Requested index of characters. + * @param[in] enc Encoding of the string. + * @return Pointer to the first byte of the character that is `nth` + * character ahead of `head`, or `tail` if there is no such + * character (OOB etc). The definition of "character" depends on + * the passed `enc`. + */ +char *rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc); + +/** + * Identical to rb_enc_get_index(), except the return type. + * + * @param[in] obj Object in question. + * @exception rb_eTypeError `obj` is incapable of having an encoding. + * @return `obj`'s encoding. + */ +VALUE rb_obj_encoding(VALUE obj); + +/** + * Identical to rb_str_cat(), except it additionally takes an encoding. + * + * @param[out] str Destination object. + * @param[in] ptr Contents to append. + * @param[in] len Length of `src`, in bytes. + * @param[in] enc Encoding of `ptr`. + * @exception rb_eArgError `len` is negative. + * @exception rb_eEncCompatError `enc` is not compatible with `str`. + * @return The passed `dst`. + * @post The contents of `ptr` is copied, transcoded into `dst`'s + * encoding, then pasted into `dst`'s end. + */ +VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc); + +/** + * Encodes the passed code point into a series of bytes. + * + * @param[in] code Code point. + * @param[in] enc Target encoding scheme. + * @exception rb_eRangeError `enc` does not glean `code`. + * @return An instance of ::rb_cString, of `enc` encoding, whose sole + * contents is `code` represented in `enc`. + * @note No way to encode code points bigger than UINT_MAX. + * + * @internal + * + * In other languages, APIs like this one could be seen as the primitive + * routines where encodings' "encode" feature are implemented. However in case + * of Ruby this is not the primitive one. We directly manipulate encoded + * strings. Encoding conversion routines transocde an encoded string directly + * to another one; not via a code point array. + */ +VALUE rb_enc_uint_chr(unsigned int code, rb_encoding *enc); + +/** + * Identical to rb_external_str_new(), except it additionally takes an + * encoding. However the whole point of rb_external_str_new() is to encode a + * string into default external encoding. Being able to specify arbitrary + * encoding just ruins the designed purpose the function meseems. + * + * @param[in] ptr A memory region of `len` bytes length. + * @param[in] len Length of `ptr`, in bytes, not including the + * terminating NUL character. + * @param[in] enc Target encoding scheme. + * @exception rb_eArgError `len` is negative. + * @return An instance of ::rb_cString. In case encoding conversion from + * "default internal" to `enc` is fully defined over the given + * contents, then the return value is a string of `enc` encoding, + * whose contents are the converted ones. Otherwise the string is + * a junk. + * @warning It doesn't raise on a conversion failure and silently ends up in + * a corrupted output. You can know the failure by querying + * `valid_encoding?` of the result object. + * + * @internal + * + * @shyouhei has no idea why this one does not follow the naming convention + * that others obey. It seems to him that this should have been called + * `rb_enc_external_str_new`. + */ +VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc); + +/** + * Identical to rb_str_export(), except it additionally takes an encoding. + * + * @param[in] obj Target object. + * @param[in] enc Target encoding. + * @exception rb_eTypeError No implicit conversion to String. + * @return Converted ruby string of `enc` encoding. + */ +VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc); + +/** + * Encoding conversion main routine. + * + * @param[in] str String to convert. + * @param[in] from Source encoding. + * @param[in] to Destination encoding. + * @return A copy of `str`, with conversion from `from` to `to` applied. + * @note `from` can be a null pointer. `str`'s encoding is taken then. + * @note `to` can be a null pointer. No-op then. + */ +VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to); + +/** + * Identical to rb_str_conv_enc(), except it additionally takes IO encoder + * options. The extra arguments can be constructed using io_extract_modeenc() + * etc. + * + * @param[in] str String to convert. + * @param[in] from Source encoding. + * @param[in] to Destination encoding. + * @param[in] ecflags A set of enum ::ruby_econv_flag_type. + * @param[in] ecopts Optional hash. + * @return A copy of `str`, with conversion from `from` to `to` applied. + * @note `from` can be a null pointer. `str`'s encoding is taken then. + * @note `to` can be a null pointer. No-op then. + * @note `ecopts` can be ::RUBY_Qnil, which is equivalent to passing an + * empty hash. + */ +VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts); + +/** + * Scans the passed string to collect its code range. Because a Ruby's string + * is mutable, its contents change from time to time; so does its code range. + * A long-lived string tends to fall back to ::RUBY_ENC_CODERANGE_UNKNOWN. + * This API scans it and re-assigns a fine-grained code range constant. + * + * @param[out] str A string. + * @return An enum ::ruby_coderange_type. + */ +int rb_enc_str_coderange(VALUE str); + +/** + * Scans the passed string until it finds something odd. Returns the number of + * bytes scanned. As the name implies this is suitable for repeated call. One + * of its application is `IO#readlines`. The method reads from its receiver's + * read buffer, maybe more than once, looking for newlines. But "newline" can + * be different among encodings. This API is used to detect broken contents to + * properly mark them as such. + * + * @param[in] str String to scan. + * @param[in] end End of `str`. + * @param[in] enc `str`'s encoding. + * @param[out] cr Return buffer. + * @return Distance between `str` and first such byte where broken. + * @post `cr` has the code range type. + */ +long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr); + +/** + * Queries if the passed string is "ASCII only". An ASCII only string is a + * string who doesn't have any non-ASCII characters at all. This doesn't + * necessarily mean the string is in ASCII encoding. For instance a String of + * CP932 encoding can quite much be ASCII only, depending on its contents. + * + * @param[in] str String in question. + * @retval 1 It doesn't have non-ASCII characters. + * @retval 0 It has characters that are out of ASCII. + */ +int rb_enc_str_asciionly_p(VALUE str); + +RBIMPL_ATTR_NONNULL(()) +/** + * Looks for the passed string in the passed buffer. + * + * @param[in] x Buffer that potentially includes `y`. + * @param[in] m Number of bytes of `x`. + * @param[in] y Query string. + * @param[in] n Number of bytes of `y`. + * @param[in] enc Encoding of both `x` and `y`. + * @retval -1 Not found. + * @retval otherwise Found index in `x`. + * @note This API can match at a non-character-boundary. + */ +long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc); + +/** @cond INTERNAL_MACRO */ +#ifdef HAVE_BUILTIN___BUILTIN_CONSTANT_P +#define rb_enc_str_new(str, len, enc) RB_GNUC_EXTENSION_BLOCK( \ + (__builtin_constant_p(str) && __builtin_constant_p(len)) ? \ + rb_enc_str_new_static((str), (len), (enc)) : \ + rb_enc_str_new((str), (len), (enc)) \ +) +#define rb_enc_str_new_cstr(str, enc) RB_GNUC_EXTENSION_BLOCK( \ + (__builtin_constant_p(str)) ? \ + rb_enc_str_new_static((str), (long)strlen(str), (enc)) : \ + rb_enc_str_new_cstr((str), (enc)) \ +) +#endif +/** @endcond */ + +RBIMPL_SYMBOL_EXPORT_END() + +#endif /* RUBY_INTERNAL_ENCODING_STRING_H */ diff --git a/include/ruby/internal/encoding/symbol.h b/include/ruby/internal/encoding/symbol.h new file mode 100644 index 0000000000..9cd1b0dbf4 --- /dev/null +++ b/include/ruby/internal/encoding/symbol.h @@ -0,0 +1,100 @@ +#ifndef RUBY_INTERNAL_ENCODING_SYMBOL_H /*-*-C++-*-vi:se ft=cpp:*/ +#define RUBY_INTERNAL_ENCODING_SYMBOL_H +/** + * @file + * @author Ruby developers + * @copyright This file is a part of the programming language Ruby. + * Permission is hereby granted, to either redistribute and/or + * modify this file, provided that the conditions mentioned in the + * file COPYING are met. Consult the file for details. + * @warning Symbols prefixed with either `RBIMPL` or `rbimpl` are + * implementation details. Don't take them as canon. They could + * rapidly appear then vanish. The name (path) of this header file + * is also an implementation detail. Do not expect it to persist + * at the place it is now. Developers are free to move it anywhere + * anytime at will. + * @note To ruby-core: remember that this header can be possibly + * recursively included from extension libraries written in C++. + * Do not expect for instance `__VA_ARGS__` is always available. + * We assume C99 for ruby itself but we don't assume languages of + * extension libraries. They could be written in C++98. + * @brief Routines to manipulate encodings of symbols. + */ + +#include "ruby/internal/attr/nonnull.h" +#include "ruby/internal/dllexport.h" +#include "ruby/internal/encoding/encoding.h" +#include "ruby/internal/value.h" + +RBIMPL_SYMBOL_EXPORT_BEGIN() + +/** + * Identical to rb_intern2(), except it additionally takes an encoding. + * + * @param[in] name The name of the id. + * @param[in] len Length of `name`. + * @param[in] enc `name`'s encoding. + * @exception rb_eRuntimeError Too many symbols. + * @return A (possibly new) id whose value is the given name. + * @note These days Ruby internally has two kinds of symbols + * (static/dynamic). Symbols created using this function would + * become static ones; i.e. would never be garbage collected. It + * is up to you to avoid memory leaks. Think twice before using + * it. + */ +ID rb_intern3(const char *name, long len, rb_encoding *enc); + +RBIMPL_ATTR_NONNULL(()) +/** + * Identical to rb_symname_p(), except it additionally takes an encoding. + * + * @param[in] str A C string to check. + * @param[in] enc `str`'s encoding. + * @retval 1 It is a valid symbol name. + * @retval 0 It is invalid as a symbol name. + */ +int rb_enc_symname_p(const char *str, rb_encoding *enc); + +/** + * Identical to rb_enc_symname_p(), except it additionally takes the passed + * string's length. This is needed for strings containing NUL bytes, like in + * case of UTF-32. + * + * @param[in] name A C string to check. + * @param[in] len Number of bytes of `str`. + * @param[in] enc `str`'s encoding. + * @retval 1 It is a valid symbol name. + * @retval 0 It is invalid as a symbol name. + */ +int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc); + +/** + * Identical to rb_check_id(), except it takes a pointer to a memory region + * instead of Ruby's string. + * + * @param[in] ptr A pointer to a memory region. + * @param[in] len Number of bytes of `ptr`. + * @param[in] enc Encoding of `ptr`. + * @exception rb_eEncodingError `ptr` contains non-ASCII according to `enc`. + * @retval 0 No such id ever existed in the history. + * @retval otherwise The id that represents the given name. + */ +ID rb_check_id_cstr(const char *ptr, long len, rb_encoding *enc); + +/** + * Identical to rb_check_id_cstr(), except for the return type. It can also be + * seen as a routine identical to rb_check_symbol(), except it takes a pointer + * to a memory region instead of Ruby's string. + * + * @param[in] ptr A pointer to a memory region. + * @param[in] len Number of bytes of `ptr`. + * @param[in] enc Encoding of `ptr`. + * @exception rb_eEncodingError `ptr` contains non-ASCII according to `enc`. + * @retval RUBY_Qnil No such id ever existed in the history. + * @retval otherwise The id that represents the given name. + */ +VALUE rb_check_symbol_cstr(const char *ptr, long len, rb_encoding *enc); + +RBIMPL_SYMBOL_EXPORT_END() + +#endif /* RUBY_INTERNAL_ENCODING_SYMBOL_H */ diff --git a/include/ruby/internal/encoding/transcode.h b/include/ruby/internal/encoding/transcode.h new file mode 100644 index 0000000000..60c96a41c9 --- /dev/null +++ b/include/ruby/internal/encoding/transcode.h @@ -0,0 +1,558 @@ +#ifndef RUBY_INTERNAL_ENCODING_TRANSCODE_H /*-*-C++-*-vi:se ft=cpp:*/ +#define RUBY_INTERNAL_ENCODING_TRANSCODE_H +/** + * @file + * @author Ruby developers + * @copyright This file is a part of the programming language Ruby. + * Permission is hereby granted, to either redistribute and/or + * modify this file, provided that the conditions mentioned in the + * file COPYING are met. Consult the file for details. + * @warning Symbols prefixed with either `RBIMPL` or `rbimpl` are + * implementation details. Don't take them as canon. They could + * rapidly appear then vanish. The name (path) of this header file + * is also an implementation detail. Do not expect it to persist + * at the place it is now. Developers are free to move it anywhere + * anytime at will. + * @note To ruby-core: remember that this header can be possibly + * recursively included from extension libraries written in C++. + * Do not expect for instance `__VA_ARGS__` is always available. + * We assume C99 for ruby itself but we don't assume languages of + * extension libraries. They could be written in C++98. + * @brief econv stuff + */ + +#include "ruby/internal/dllexport.h" +#include "ruby/internal/value.h" + +RBIMPL_SYMBOL_EXPORT_BEGIN() + +/** return value of rb_econv_convert() */ +typedef enum { + + /** + * The conversion stopped when it found an invalid sequence. + */ + econv_invalid_byte_sequence, + + /** + * The conversion stopped when it found a character in the input which + * cannot be representable in the output. + */ + econv_undefined_conversion, + + /** + * The conversion stopped because there is no destination. + */ + econv_destination_buffer_full, + + /** + * The conversion stopped because there is no input. + */ + econv_source_buffer_empty, + + /** + * The conversion stopped after converting everything. This is arguably + * the expected normal end of conversion. + */ + econv_finished, + + /** + * The conversion stopped after writing something to somewhere, before + * reading everything. + */ + econv_after_output, + + /** + * The conversion stopped in middle of reading a character, possibly due to + * a partial read of a socket etc. + */ + econv_incomplete_input +} rb_econv_result_t; + +/** An opaque struct that represents a lowest level of encoding conversion. */ +typedef struct rb_econv_t rb_econv_t; + +/** + * Converts the contents of the passed string from its encoding to the passed + * one. + * + * @param[in] str Target string. + * @param[in] to Destination encoding. + * @param[in] ecflags A set of enum + * ::ruby_econv_flag_type. + * @param[in] ecopts A keyword hash, like + * ::rb_io_t::rb_io_enc_t::ecopts. + * @exception rb_eArgError Not fully converted. + * @exception rb_eInvalidByteSequenceError `str` is malformed. + * @exception rb_eUndefinedConversionError `str` has a character not + * representable using `to`. + * @exception rb_eConversionNotFoundError There is no known conversion from + * `str`'s encoding to `to`. + * @return A string whose encoding is `to`, and whose contents is converted + * contents of `str`. + * @note Use rb_econv_prepare_options() to generate `ecopts`. + */ +VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts); + +/** + * Queries if there is more than one way to convert between the passed two + * encodings. Encoding conversion are has_and_belongs_to_many relationships. + * There could be no direct conversion defined for the passed pair. Ruby tries + * to find an indirect way to do so then. For instance ISO-8859-1 has no + * direct conversion to ISO-2022-JP. But there is ISO-8859-1 to UTF-8 + * conversion; then there is UTF-8 to EUC-JP conversion; finally there also is + * EUC-JP to ISO-2022-JP conversion. So in short ISO-8859-1 can be converted + * to ISO-2022-JP using that path. This function returns true. Obviously not + * everything that can be represented using UTF-8 can also be represented using + * EUC-JP. Conversions in practice can fail depending on the actual input, and + * that renders exceptions in case of rb_str_encode(). + * + * @param[in] from_encoding One encoding. + * @param[in] to_encoding Another encoding. + * @retval 0 No way to convert the two. + * @retval 1 At least one way to convert the two. + * + * @internal + * + * Practically @shyouhei knows no way for this function to return 0. It seems + * everything can eventually be converted to/from UTF-8, which connects + * everything. + */ +int rb_econv_has_convpath_p(const char* from_encoding, const char* to_encoding); + +/** + * Identical to rb_econv_prepare_opts(), except it additionally takes the + * initial value of flags. The extra bits are bitwise-ORed to the return + * value. + * + * @param[in] opthash Keyword arguments. + * @param[out] ecopts Return buffer. + * @param[in] ecflags Default set of enum ::ruby_econv_flag_type. + * @exception rb_eArgError Unknown/Broken values passed. + * @return Calculated set of enum ::ruby_econv_flag_type. + * @post `ecopts` holds a hash object suitable for + * ::rb_io_t::rb_io_enc_t::ecopts. + */ +int rb_econv_prepare_options(VALUE opthash, VALUE *ecopts, int ecflags); + +/** + * Splits a keyword arguments hash (that for instance `String#encode` took) + * into a set of enum ::ruby_econv_flag_type and a hash storing replacement + * characters etc. + * + * @param[in] opthash Keyword arguments. + * @param[out] ecopts Return buffer. + * @exception rb_eArgError Unknown/Broken values passed. + * @return Calculated set of enum ::ruby_econv_flag_type. + * @post `ecopts` holds a hash object suitable for + * ::rb_io_t::rb_io_enc_t::ecopts. + */ +int rb_econv_prepare_opts(VALUE opthash, VALUE *ecopts); + +/** + * Creates a new instance of struct ::rb_econv_t. + * + * @param[in] source_encoding Name of an encoding. + * @param[in] destination_encoding Name of another encoding. + * @param[in] ecflags A set of enum ::ruby_econv_flag_type. + * @exception rb_eArgError No such encoding. + * @retval NULL Failed to create a struct ::rb_econv_t. + * @retval otherwise Allocated struct ::rb_econv_t. + * @warning Return value must be passed to rb_econv_close() exactly once. + */ +rb_econv_t *rb_econv_open(const char *source_encoding, const char *destination_encoding, int ecflags); + +/** + * Identical to rb_econv_open(), except it additionally takes a hash of + * optional strings. + * + * + * @param[in] source_encoding Name of an encoding. + * @param[in] destination_encoding Name of another encoding. + * @param[in] ecflags A set of enum ::ruby_econv_flag_type. + * @param[in] ecopts Optional set of strings. + * @exception rb_eArgError No such encoding. + * @retval NULL Failed to create a struct ::rb_econv_t. + * @retval otherwise Allocated struct ::rb_econv_t. + * @warning Return value must be passed to rb_econv_close() exactly once. + */ +rb_econv_t *rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts); + +/** + * Converts a string from an encoding to another. + * + * Possible flags are either ::RUBY_ECONV_PARTIAL_INPUT (means the source + * buffer is a part of much larger one), ::RUBY_ECONV_AFTER_OUTPUT (instructs + * the converter to stop after output before input), or both of them. + * + * @param[in,out] ec Conversion specification/state etc. + * @param[in] source_buffer_ptr Target string. + * @param[in] source_buffer_end End of target string. + * @param[out] destination_buffer_ptr Return buffer. + * @param[out] destination_buffer_end End of return buffer. + * @param[in] flags Flags (see above). + * @return The status of the conversion. + * @post `destination_buffer_ptr` holds conversion results. + */ +rb_econv_result_t rb_econv_convert(rb_econv_t *ec, + const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, + unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, + int flags); + +/** + * Destructs a converter. Note that a converter can have a buffer, and can be + * non-empty. Calling this would lose your data then. + * + * @param[out] ec The converter to destroy. + * @post `ec` is no longer a valid pointer. + */ +void rb_econv_close(rb_econv_t *ec); + +/** + * Assigns the replacement string. The string passed here would appear in + * converted string when it cannot represent its source counterpart. This can + * happen for instance you convert an emoji to ISO-8859-1. + * + * @param[out] ec Target converter. + * @param[in] str Replacement string. + * @param[in] len Number of bytes of `str`. + * @param[in] encname Name of encoding of `str`. + * @retval 0 Success. + * @retval -1 Failure (ENOMEM etc.). + * @post `ec`'s replacement string is set to `str`. + */ +int rb_econv_set_replacement(rb_econv_t *ec, const unsigned char *str, size_t len, const char *encname); + +/** + * "Decorate"s a converter. There are special kind of converters that + * transforms the contents, like replacing CR into CRLF. You can add such + * decorators to a converter using this API. By using this function a + * decorator is prepended at the beginning of a conversion sequence: in case of + * CRLF conversion, newlines are converted before encodings are converted. + * + * @param[out] ec Target converter to decorate. + * @param[in] decorator_name Name of decorator to prepend. + * @retval 0 Success. + * @retval -1 Failure (no such decorator etc.). + * @post Decorator works before encoding conversion happens. + * + * @internal + * + * What is the possible value of the `decorator_name` is not public. You have + * to read through `transcode.c` carefully. + */ +int rb_econv_decorate_at_first(rb_econv_t *ec, const char *decorator_name); + +/** + * Identical to rb_econv_decorate_at_first(), except it adds to the opposite + * direction. For instance CRLF conversion would run _after_ encodings are + * converted. + * + * @param[out] ec Target converter to decorate. + * @param[in] decorator_name Name of decorator to prepend. + * @retval 0 Success. + * @retval -1 Failure (no such decorator etc.). + * @post Decorator works after encoding conversion happens. + */ +int rb_econv_decorate_at_last(rb_econv_t *ec, const char *decorator_name); + +/** + * Creates a `rb_eConverterNotFoundError` exception object (but does not + * raise). + * + * @param[in] senc Name of source encoding. + * @param[in] denc Name of destination encoding. + * @param[in] ecflags A set of enum ::ruby_econv_flag_type. + * @return An instance of `rb_eConverterNotFoundError`. + */ +VALUE rb_econv_open_exc(const char *senc, const char *denc, int ecflags); + +/** + * Appends the passed string to the passed converter's output buffer. This can + * be handy when an encoding needs bytes out of thin air; for instance + * ISO-2022-JP has "shift function" which does not correspond to any + * characters. + * + * @param[out] ec Target converter. + * @param[in] str String to insert. + * @param[in] len Number of bytes of `str`. + * @param[in] str_encoding Encoding of `str`. + * @retval 0 Success. + * @retval -1 Failure (conversion error etc.). + * @note `str_encoding` can be anything, and `str` itself is converted + * when necessary. + */ +int rb_econv_insert_output(rb_econv_t *ec, + const unsigned char *str, size_t len, const char *str_encoding); + +/** + * Queries an encoding name which best suits for rb_econv_insert_output()'s + * last parameter. Strings in this encoding need no conversion when inserted; + * can be both time/space efficient. + * + * @param[in] ec Target converter. + * @return Its encoding for insertion. + */ +const char *rb_econv_encoding_to_insert_output(rb_econv_t *ec); + +/** + * This is a rb_econv_make_exception() + rb_exc_raise() combo. + * + * @param[in] ec (Possibly failed) conversion. + * @exception rb_eInvalidByteSequenceError Invalid byte sequence. + * @exception rb_eUndefinedConversionError Conversion undefined. + * @note This function can return when no error. + */ +void rb_econv_check_error(rb_econv_t *ec); + +/** + * This function makes sense right after rb_econv_convert() returns. As listed + * in ::rb_econv_result_t, rb_econv_convert() can bail out for various reasons. + * This function checks the passed converter's internal state and convert it to + * an appropriate exception object. + * + * @param[in] ec Target converter. + * @retval RUBY_Qnil The converter has no error. + * @retval otherwise Conversion error turned into an exception. + */ +VALUE rb_econv_make_exception(rb_econv_t *ec); + +/** + * Queries if rb_econv_putback() makes sense, i.e. there are invalid byte + * sequences remain in the buffer. + * + * @param[in] ec Target converter. + * @return Number of bytes that can be pushed back. + */ +int rb_econv_putbackable(rb_econv_t *ec); + +/** + * Puts back the bytes. In case of ::econv_invalid_byte_sequence, some of + * those invalid bytes are discarded and the others are buffered to be + * converted later. The latter bytes can be put back using this API. + * + * @param[out] ec Target converter (invalid byte sequence). + * @param[out] p Return buffer. + * @param[in] n Max number of bytes to put back. + * @post At most `n` bytes of what was put back is written to `p`. + */ +void rb_econv_putback(rb_econv_t *ec, unsigned char *p, int n); + +/** + * Queries the passed encoding's corresponding ASCII compatible encoding. "The + * corresponding ASCII compatible encoding" in this context is an ASCII + * compatible encoding which can represent exactly the same character sets as + * the given ASCII incompatible encoding. For instance that of UTF-16LE is + * UTF-8. + * + * @param[in] encname Name of an ASCII incompatible encoding. + * @retval NULL `encname` is already ASCII compatible. + * @retval otherwise The corresponding ASCII compatible encoding. + */ +const char *rb_econv_asciicompat_encoding(const char *encname); + +/** + * Identical to rb_econv_convert(), except it takes Ruby's string instead of + * C's pointer. + * + * @param[in,out] ec Target converter. + * @param[in] src Source string. + * @param[in] flags Flags (see rb_econv_convert). + * @exception rb_eArgError Converted string is too long. + * @exception rb_eInvalidByteSequenceError Invalid byte sequence. + * @exception rb_eUndefinedConversionError Conversion undefined. + * @return The conversion result. + */ +VALUE rb_econv_str_convert(rb_econv_t *ec, VALUE src, int flags); + +/** + * Identical to rb_econv_str_convert(), except it converts only a part of the + * passed string. Can be handy when you for instance want to do line-buffered + * conversion. + * + * @param[in,out] ec Target converter. + * @param[in] src Source string. + * @param[in] byteoff Number of bytes to seek. + * @param[in] bytesize Number of bytes to read. + * @param[in] flags Flags (see rb_econv_convert). + * @exception rb_eArgError Converted string is too long. + * @exception rb_eInvalidByteSequenceError Invalid byte sequence. + * @exception rb_eUndefinedConversionError Conversion undefined. + * @return The conversion result. + */ +VALUE rb_econv_substr_convert(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, int flags); + +/** + * Identical to rb_econv_str_convert(), except it appends the conversion result + * to the additionally passed string instead of creating a new string. It can + * also be seen as a routine identical to rb_econv_append(), except it takes a + * Ruby's string instead of C's pointer. + * + * @param[in,out] ec Target converter. + * @param[in] src Source string. + * @param[in] dst Return buffer. + * @param[in] flags Flags (see rb_econv_convert). + * @exception rb_eArgError Converted string is too long. + * @exception rb_eInvalidByteSequenceError Invalid byte sequence. + * @exception rb_eUndefinedConversionError Conversion undefined. + * @return The conversion result. + */ +VALUE rb_econv_str_append(rb_econv_t *ec, VALUE src, VALUE dst, int flags); + +/** + * Identical to rb_econv_str_append(), except it appends only a part of the + * passed string with conversion. It can also be seen as a routine identical + * to rb_econv_substr_convert(), except it appends the conversion result to the + * additionally passed string instead of creating a new string. + * + * @param[in,out] ec Target converter. + * @param[in] src Source string. + * @param[in] byteoff Number of bytes to seek. + * @param[in] bytesize Number of bytes to read. + * @param[in] dst Return buffer. + * @param[in] flags Flags (see rb_econv_convert). + * @exception rb_eArgError Converted string is too long. + * @exception rb_eInvalidByteSequenceError Invalid byte sequence. + * @exception rb_eUndefinedConversionError Conversion undefined. + * @return The conversion result. + */ +VALUE rb_econv_substr_append(rb_econv_t *ec, VALUE src, long byteoff, long bytesize, VALUE dst, int flags); + +/** + * Converts the passed C's pointer according to the passed converter, then + * append the conversion result to the passed Ruby's string. This way buffer + * overflow is properly avoided to resize the destination properly. + * + * @param[in,out] ec Target converter. + * @param[in] bytesrc Target string. + * @param[in] bytesize Number of bytes of `bytesrc`. + * @param[in] dst Return buffer. + * @param[in] flags Flags (see rb_econv_convert). + * @exception rb_eArgError Converted string is too long. + * @exception rb_eInvalidByteSequenceError Invalid byte sequence. + * @exception rb_eUndefinedConversionError Conversion undefined. + * @return The conversion result. + */ +VALUE rb_econv_append(rb_econv_t *ec, const char *bytesrc, long bytesize, VALUE dst, int flags); + +/** + * This badly named function does not set the destination encoding to binary, + * but instead just nullifies newline conversion decorators if any. Other + * ordinal character conversions still happen after this; something non-binary + * would still be generated. + * + * @param[out] ec Target converter to modify. + * @post Any newline conversions, if any, would be killed. + */ +void rb_econv_binmode(rb_econv_t *ec); + +/** + * This enum is kind of omnibus. Gathers various constants. + */ +enum ruby_econv_flag_type { + + /** + * @name Flags for rb_econv_open() + * + * @{ + */ + + /** Mask for error handling related bits. */ + RUBY_ECONV_ERROR_HANDLER_MASK = 0x000000ff, + + /** Special handling of invalid sequences are there. */ + RUBY_ECONV_INVALID_MASK = 0x0000000f, + + /** Invalid sequences shall be replaced. */ + RUBY_ECONV_INVALID_REPLACE = 0x00000002, + + /** Special handling of undefined conversion are there. */ + RUBY_ECONV_UNDEF_MASK = 0x000000f0, + + /** Undefined characters shall be replaced. */ + RUBY_ECONV_UNDEF_REPLACE = 0x00000020, + + /** Undefined characters shall be escaped. */ + RUBY_ECONV_UNDEF_HEX_CHARREF = 0x00000030, + + /** Decorators are there. */ + RUBY_ECONV_DECORATOR_MASK = 0x0000ff00, + + /** Newline converters are there. */ + RUBY_ECONV_NEWLINE_DECORATOR_MASK = 0x00003f00, + + /** (Unclear; seems unused). */ + RUBY_ECONV_NEWLINE_DECORATOR_READ_MASK = 0x00000f00, + + /** (Unclear; seems unused). */ + RUBY_ECONV_NEWLINE_DECORATOR_WRITE_MASK = 0x00003000, + + /** Universal newline mode. */ + RUBY_ECONV_UNIVERSAL_NEWLINE_DECORATOR = 0x00000100, + + /** CR to CRLF conversion shall happen. */ + RUBY_ECONV_CRLF_NEWLINE_DECORATOR = 0x00001000, + + /** CRLF to CR conversion shall happen. */ + RUBY_ECONV_CR_NEWLINE_DECORATOR = 0x00002000, + + /** Texts shall be XML-escaped. */ + RUBY_ECONV_XML_TEXT_DECORATOR = 0x00004000, + + /** Texts shall be AttrValue escaped */ + RUBY_ECONV_XML_ATTR_CONTENT_DECORATOR = 0x00008000, + + /** (Unclear; seems unused). */ + RUBY_ECONV_STATEFUL_DECORATOR_MASK = 0x00f00000, + + /** Texts shall be AttrValue escaped. */ + RUBY_ECONV_XML_ATTR_QUOTE_DECORATOR = 0x00100000, + + /** Newline decorator's default. */ + RUBY_ECONV_DEFAULT_NEWLINE_DECORATOR = +#if defined(RUBY_TEST_CRLF_ENVIRONMENT) || defined(_WIN32) + RUBY_ECONV_CRLF_NEWLINE_DECORATOR, +#else + 0, +#endif + +#define ECONV_ERROR_HANDLER_MASK RUBY_ECONV_ERROR_HANDLER_MASK /**< @old{RUBY_ECONV_ERROR_HANDLER_MASK} */ +#define ECONV_INVALID_MASK RUBY_ECONV_INVALID_MASK /**< @old{RUBY_ECONV_INVALID_MASK} */ +#define ECONV_INVALID_REPLACE RUBY_ECONV_INVALID_REPLACE /**< @old{RUBY_ECONV_INVALID_REPLACE} */ +#define ECONV_UNDEF_MASK RUBY_ECONV_UNDEF_MASK /**< @old{RUBY_ECONV_UNDEF_MASK} */ +#define ECONV_UNDEF_REPLACE RUBY_ECONV_UNDEF_REPLACE /**< @old{RUBY_ECONV_UNDEF_REPLACE} */ +#define ECONV_UNDEF_HEX_CHARREF RUBY_ECONV_UNDEF_HEX_CHARREF /**< @old{RUBY_ECONV_UNDEF_HEX_CHARREF} */ +#define ECONV_DECORATOR_MASK RUBY_ECONV_DECORATOR_MASK /**< @old{RUBY_ECONV_DECORATOR_MASK} */ +#define ECONV_NEWLINE_DECORATOR_MASK RUBY_ECONV_NEWLINE_DECORATOR_MASK /**< @old{RUBY_ECONV_NEWLINE_DECORATOR_MASK} */ +#define ECONV_NEWLINE_DECORATOR_READ_MASK RUBY_ECONV_NEWLINE_DECORATOR_READ_MASK /**< @old{RUBY_ECONV_NEWLINE_DECORATOR_READ_MASK} */ +#define ECONV_NEWLINE_DECORATOR_WRITE_MASK RUBY_ECONV_NEWLINE_DECORATOR_WRITE_MASK /**< @old{RUBY_ECONV_NEWLINE_DECORATOR_WRITE_MASK} */ +#define ECONV_UNIVERSAL_NEWLINE_DECORATOR RUBY_ECONV_UNIVERSAL_NEWLINE_DECORATOR /**< @old{RUBY_ECONV_UNIVERSAL_NEWLINE_DECORATOR} */ +#define ECONV_CRLF_NEWLINE_DECORATOR RUBY_ECONV_CRLF_NEWLINE_DECORATOR /**< @old{RUBY_ECONV_CRLF_NEWLINE_DECORATOR} */ +#define ECONV_CR_NEWLINE_DECORATOR RUBY_ECONV_CR_NEWLINE_DECORATOR /**< @old{RUBY_ECONV_CR_NEWLINE_DECORATOR} */ +#define ECONV_XML_TEXT_DECORATOR RUBY_ECONV_XML_TEXT_DECORATOR /**< @old{RUBY_ECONV_XML_TEXT_DECORATOR} */ +#define ECONV_XML_ATTR_CONTENT_DECORATOR RUBY_ECONV_XML_ATTR_CONTENT_DECORATOR /**< @old{RUBY_ECONV_XML_ATTR_CONTENT_DECORATOR} */ +#define ECONV_STATEFUL_DECORATOR_MASK RUBY_ECONV_STATEFUL_DECORATOR_MASK /**< @old{RUBY_ECONV_STATEFUL_DECORATOR_MASK} */ +#define ECONV_XML_ATTR_QUOTE_DECORATOR RUBY_ECONV_XML_ATTR_QUOTE_DECORATOR /**< @old{RUBY_ECONV_XML_ATTR_QUOTE_DECORATOR} */ +#define ECONV_DEFAULT_NEWLINE_DECORATOR RUBY_ECONV_DEFAULT_NEWLINE_DECORATOR /**< @old{RUBY_ECONV_DEFAULT_NEWLINE_DECORATOR} */ + /** @} */ + + /** + * @name Flags for rb_econv_convert() + * + * @{ + */ + + /** Indicates the input is a part of much larger one. */ + RUBY_ECONV_PARTIAL_INPUT = 0x00010000, + + /** Instructs the converter to stop after output. */ + RUBY_ECONV_AFTER_OUTPUT = 0x00020000, +#define ECONV_PARTIAL_INPUT RUBY_ECONV_PARTIAL_INPUT /**< @old{RUBY_ECONV_PARTIAL_INPUT} */ +#define ECONV_AFTER_OUTPUT RUBY_ECONV_AFTER_OUTPUT /**< @old{RUBY_ECONV_AFTER_OUTPUT} */ + + RUBY_ECONV_FLAGS_PLACEHOLDER /**< Placeholder (not used) */ +}; + +RBIMPL_SYMBOL_EXPORT_END() + +#endif /* RUBY_INTERNAL_ENCODING_TRANSCODE_H */ -- cgit v1.2.3