#ifndef RBIMPL_CTYPE_H /*-*-C++-*-vi:se ft=cpp:*/ #define RBIMPL_CTYPE_H /** * @file * @author Ruby developers * @copyright This file is a part of the programming language Ruby. * Permission is hereby granted, to either redistribute and/or * modify this file, provided that the conditions mentioned in the * file COPYING are met. Consult the file for details. * @warning Symbols prefixed with either `RBIMPL` or `rbimpl` are * implementation details. Don't take them as canon. They could * rapidly appear then vanish. The name (path) of this header file * is also an implementation detail. Do not expect it to persist * at the place it is now. Developers are free to move it anywhere * anytime at will. * @note To ruby-core: remember that this header can be possibly * recursively included from extension libraries written in C++. * Do not expect for instance `__VA_ARGS__` is always available. * We assume C99 for ruby itself but we don't assume languages of * extension libraries. They could be written in C++98. * @brief Our own, locale independent, character handling routines. */ #include "ruby/internal/config.h" #ifdef STDC_HEADERS # include #endif #include "ruby/internal/attr/artificial.h" #include "ruby/internal/attr/const.h" #include "ruby/internal/attr/constexpr.h" #include "ruby/internal/attr/nonnull.h" #include "ruby/internal/dllexport.h" /** * @name Old character classification macros * * What is this #ISPRINT business? Well, according to our VCS and some * internet surfing, it appears that the initial intent of these macros were to * mimic codes appear in common in several GNU projects. As far as @shyouhei * detects they seem to originate GNU regex (that standalone one rather than * Gnulib or Glibc), and at least date back to 1995. * * Let me lawfully quote from a GNU coreutils commit * https://git.savannah.gnu.org/cgit/coreutils.git/commit/?id=49803907f5dbd7646184a8912c9db9b09dcd0f22 * * > Jim Meyering writes: * > * > "... Some ctype macros are valid only for character codes that * > isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when * > using /bin/cc or gcc but without giving an ansi option). So, all * > ctype uses should be through macros like ISPRINT... If * > STDC_HEADERS is defined, then autoconf has verified that the ctype * > macros don't need to be guarded with references to isascii. ... * > Defining isascii to 1 should let any compiler worth its salt * > eliminate the && through constant folding." * > * > Bruno Haible adds: * > * > "... Furthermore, isupper(c) etc. have an undefined result if c is * > outside the range -1 <= c <= 255. One is tempted to write isupper(c) * > with c being of type `char', but this is wrong if c is an 8-bit * > character >= 128 which gets sign-extended to a negative value. * > The macro ISUPPER protects against this as well." * * So the intent was to reroute old problematic systems that no longer exist. * At the same time the problems described above no longer hurt us, because we * decided to completely avoid using system-provided isupper etc. to reinvent * the wheel. These macros are entirely legacy; please ignore them. * * But let me also put stress that GNU people are wise; they use those macros * only inside of their own implementations and never let them be public. On * the other hand ruby has thoughtlessly publicised them to 3rd party libraries * since its beginning, which is a very bad idea. These macros are too easy to * get conflicted with definitions elsewhere. * * New programs should stick to the `rb_` prefixed names. * * @note It seems we just mimic the API. We do not share their implementation * with GPL-ed programs. * * @{ */ #ifndef ISPRINT # define ISASCII rb_isascii /**< @old{rb_isascii}*/ # define ISPRINT rb_isprint /**< @old{rb_isprint}*/ # define ISGRAPH rb_isgraph /**< @old{rb_isgraph}*/ # define ISSPACE rb_isspace /**< @old{rb_isspace}*/ # define ISUPPER rb_isupper /**< @old{rb_isupper}*/ # define ISLOWER rb_islower /**< @old{rb_islower}*/ # define ISALNUM rb_isalnum /**< @old{rb_isalnum}*/ # define ISALPHA rb_isalpha /**< @old{rb_isalpha}*/ # define ISDIGIT rb_isdigit /**< @old{rb_isdigit}*/ # define ISXDIGIT rb_isxdigit /**< @old{rb_isxdigit}*/ # define ISBLANK rb_isblank /**< @old{rb_isblank}*/ # define ISCNTRL rb_iscntrl /**< @old{rb_iscntrl}*/ # define ISPUNCT rb_ispunct /**< @old{rb_ispunct}*/ #endif #define TOUPPER rb_toupper /**< @old{rb_toupper}*/ #define TOLOWER rb_tolower /**< @old{rb_tolower}*/ #define STRCASECMP st_locale_insensitive_strcasecmp /**< @old{st_locale_insensitive_strcasecmp}*/ #define STRNCASECMP st_locale_insensitive_strncasecmp /**< @old{st_locale_insensitive_strncasecmp}*/ #define STRTOUL ruby_strtoul /**< @old{ruby_strtoul}*/ /** @} */ RBIMPL_SYMBOL_EXPORT_BEGIN() /** @name locale insensitive functions * @{ */ /* In descriptions below, `the POSIX Locale` and `the "C" locale` are tactfully * used as to whether the described function mimics POSIX or C99. */ RBIMPL_ATTR_NONNULL(()) /** * Our own locale-insensitive version of `strcasecmp(3)`. The "case" here * always means that of the POSIX Locale. It doesn't depend on runtime locale * settings. * * @param[in] s1 Comparison LHS. * @param[in] s2 Comparison RHS. * @retval -1 `s1` is "less" than `s2`. * @retval 0 Both strings converted into lowercase would be identical. * @retval 1 `s1` is "greater" than `s2`. * @note Not only does this function works under the POSIX Locale, but * also assumes its execution character set be what ruby calls an * ASCII-compatible character set; which does not include for * instance EBCDIC or UTF-16LE. */ int st_locale_insensitive_strcasecmp(const char *s1, const char *s2); RBIMPL_ATTR_NONNULL(()) /** * Our own locale-insensitive version of `strcnasecmp(3)`. The "case" here * always means that of the POSIX Locale. It doesn't depend on runtime locale * settings. * * @param[in] s1 Comparison LHS. * @param[in] s2 Comparison RHS. * @param[in] n Comparison shall stop after first `n` bytes are scanned. * @retval -1 `s1` is "less" than `s2`. * @retval 0 Both strings converted into lowercase would be identical. * @retval 1 `s1` is "greater" than `s2`. * @note Not only does this function works under the POSIX Locale, but * also assumes its execution character set be what ruby calls an * ASCII-compatible character set; which does not include for * instance EBCDIC or UTF-16LE. * @warning This function is _not_ timing safe. */ int st_locale_insensitive_strncasecmp(const char *s1, const char *s2, size_t n); RBIMPL_ATTR_NONNULL((1)) /** * Our own locale-insensitive version of `strtoul(3)`. The conversion is done * as if the current locale is set to the "C" locale, no matter actual runtime * locale settings. * * @note This is needed because `strtoul("i", 0, 36)` would return zero * if it is locale sensitive and the current locale is `tr_TR`. * @param[in] str String of digits, optionally preceded with whitespaces * (ignored) and optionally `+` or `-` sign. * @param[out] endptr NULL, or an arbitrary pointer (overwritten on return). * @param[in] base `2` to `36` inclusive for each base, or special case * `0` to detect the base from the contents of the string. * @return Converted integer, casted to unsigned long. * @post If `endptr` is not NULL, it is updated to point the first such * byte where conversion failed. * @note This function sets `errno` on failure. * - `EINVAL`: Passed `base` is out of range. * - `ERANGE`: Converted integer is out of range of `long`. * @warning As far as @shyouhei reads ISO/IEC 9899:2018 section 7.22.1.4, a * conforming `strtoul` implementation shall render `ERANGE` * whenever it finds the input string represents a negative * integer. Such thing can never be representable using `unsigned * long`. However this implementation does not honour that * language. It just casts such negative value to the return * type, resulting a very big return value. This behaviour is at * least questionable. But we can no longer change that at this * point. * @note Not only does this function works under the "C" locale, but * also assumes its execution character set be what ruby calls an * ASCII-compatible character set; which does not include for * instance EBCDIC or UTF-16LE. */ unsigned long ruby_strtoul(const char *str, char **endptr, int base); RBIMPL_SYMBOL_EXPORT_END() /* * We are making the functions below to return `int` instead of `bool`. They * have been as such since their birth at 5f237d79033b2109afb768bc889611fa9630. */ RBIMPL_ATTR_CONST() RBIMPL_ATTR_CONSTEXPR(CXX11) RBIMPL_ATTR_ARTIFICIAL() /** * Our own locale-insensitive version of `isascii(3)`. * * @param[in] c Byte in question to query. * @retval false `c` is out of range of ASCII character set. * @retval true Yes it is. * @warning `c` is an int. This means that when you pass a `char` value * here, it experiences "integer promotion" as defined in ISO/IEC * 9899:2018 section 6.3.1.1 paragraph 1. */ static inline int rb_isascii(int c) { return '\0' <= c && c <= '\x7f'; } RBIMPL_ATTR_CONST() RBIMPL_ATTR_CONSTEXPR(CXX11) RBIMPL_ATTR_ARTIFICIAL() /** * Our own locale-insensitive version of `isupper(3)`. * * @param[in] c Byte in question to query. * @retval true `c` is listed in IEEE 1003.1 section 7.3.1.1 "upper". * @retval false Anything else. * @note Not only does this function works under the POSIX Locale, but * also assumes its execution character set be what ruby calls an * ASCII-compatible character set; which does not include for * instance EBCDIC or UTF-16LE. * @warning `c` is an int. This means that when you pass a `char` value * here, it experiences "integer promotion" as defined in ISO/IEC * 9899:2018 section 6.3.1.1 paragraph 1. */ static inline int rb_isupper(int c) { return 'A' <= c && c <= 'Z'; } RBIMPL_ATTR_CONST() RBIMPL_ATTR_CONSTEXPR(CXX11) RBIMPL_ATTR_ARTIFICIAL() /** * Our own locale-insensitive version of `islower(3)`. * * @param[in] c Byte in question to query. * @retval true `c` is listed in IEEE 1003.1 section 7.3.1.1 "lower". * @retval false Anything else. * @note Not only does this function works under the POSIX Locale, but * also assumes its execution character set be what ruby calls an * ASCII-compatible character set; which does not include for * instance EBCDIC or UTF-16LE. * @warning `c` is an int. This means that when you pass a `char` value * here, it experiences "integer promotion" as defined in ISO/IEC * 9899:2018 section 6.3.1.1 paragraph 1. */ static inline int rb_islower(int c) { return 'a' <= c && c <= 'z'; } RBIMPL_ATTR_CONST() RBIMPL_ATTR_CONSTEXPR(CXX11) RBIMPL_ATTR_ARTIFICIAL() /** * Our own locale-insensitive version of `isalpha(3)`. * * @param[in] c Byte in question to query. * @retval true `c` is listed in either IEEE 1003.1 section 7.3.1.1 * "upper" or "lower". * @retval false Anything else. * @note Not only does this function works under the POSIX Locale, but * also assumes its execution character set be what ruby calls an * ASCII-compatible character set; which does not include for * instance EBCDIC or UTF-16LE. * @warning `c` is an int. This means that when you pass a `char` value * here, it experiences "integer promotion" as defined in ISO/IEC * 9899:2018 section 6.3.1.1 paragraph 1. */ static inline int rb_isalpha(int c) { return rb_isupper(c) || rb_islower(c); } RBIMPL_ATTR_CONST() RBIMPL_ATTR_CONSTEXPR(CXX11) RBIMPL_ATTR_ARTIFICIAL() /** * Our own locale-insensitive version of `isdigit(3)`. * * @param[in] c Byte in question to query. * @retval true `c` is listed in IEEE 1003.1 section 7.3.1.1 "digit". * @retval false Anything else. * @note Not only does this function works under the POSIX Locale, but * also assumes its execution character set be what ruby calls an * ASCII-compatible character set; which does not include for * instance EBCDIC or UTF-16LE. * @warning `c` is an int. This means that when you pass a `char` value * here, it experiences "integer promotion" as defined in ISO/IEC * 9899:2018 section 6.3.1.1 paragraph 1. */ static inline int rb_isdigit(int c) { return '0' <= c && c <= '9'; } RBIMPL_ATTR_CONST() RBIMPL_ATTR_CONSTEXPR(CXX11) RBIMPL_ATTR_ARTIFICIAL() /** * Our own locale-insensitive version of `isalnum(3)`. * * @param[in] c Byte in question to query. * @retval true `c` is listed in either IEEE 1003.1 section 7.3.1.1 * "upper", "lower", or "digit". * @retval false Anything else. * @note Not only does this function works under the POSIX Locale, but * also assumes its execution character set be what ruby calls an * ASCII-compatible character set; which does not include for * instance EBCDIC or UTF-16LE. * @warning `c` is an int. This means that when you pass a `char` value * here, it experiences "integer promotion" as defined in ISO/IEC * 9899:2018 section 6.3.1.1 paragraph 1. */ static inline int rb_isalnum(int c) { return rb_isalpha(c) || rb_isdigit(c); } RBIMPL_ATTR_CONST() RBIMPL_ATTR_CONSTEXPR(CXX11) RBIMPL_ATTR_ARTIFICIAL() /** * Our own locale-insensitive version of `isxdigit(3)`. * * @param[in] c Byte in question to query. * @retval true `c` is listed in IEEE 1003.1 section 7.3.1.1 "xdigit". * @retval false Anything else. * @note Not only does this function works under the POSIX Locale, but * also assumes its execution character set be what ruby calls an * ASCII-compatible character set; which does not include for * instance EBCDIC or UTF-16LE. * @warning `c` is an int. This means that when you pass a `char` value * here, it experiences "integer promotion" as defined in ISO/IEC * 9899:2018 section 6.3.1.1 paragraph 1. */ static inline int rb_isxdigit(int c) { return rb_isdigit(c) || ('A' <= c && c <= 'F') || ('a' <= c && c <= 'f'); } RBIMPL_ATTR_CONST() RBIMPL_ATTR_CONSTEXPR(CXX11) RBIMPL_ATTR_ARTIFICIAL() /** * Our own locale-insensitive version of `isblank(3)`. * * @param[in] c Byte in question to query. * @retval true `c` is listed in IEEE 1003.1 section 7.3.1.1 "blank". * @retval false Anything else. * @note Not only does this function works under the POSIX Locale, but * also assumes its execution character set be what ruby calls an * ASCII-compatible character set; which does not include for * instance EBCDIC or UTF-16LE. * @warning `c` is an int. This means that when you pass a `char` value * here, it experiences "integer promotion" as defined in ISO/IEC * 9899:2018 section 6.3.1.1 paragraph 1. */ static inline int rb_isblank(int c) { return c == ' ' || c == '\t'; } RBIMPL_ATTR_CONST() RBIMPL_ATTR_CONSTEXPR(CXX11) RBIMPL_ATTR_ARTIFICIAL() /** * Our own locale-insensitive version of `isspace(3)`. * * @param[in] c Byte in question to query. * @retval true `c` is listed in IEEE 1003.1 section 7.3.1.1 "space". * @retval false Anything else. * @note Not only does this function works under the POSIX Locale, but * also assumes its execution character set be what ruby calls an * ASCII-compatible character set; which does not include for * instance EBCDIC or UTF-16LE. * @warning `c` is an int. This means that when you pass a `char` value * here, it experiences "integer promotion" as defined in ISO/IEC * 9899:2018 section 6.3.1.1 paragraph 1. */ static inline int rb_isspace(int c) { return c == ' ' || ('\t' <= c && c <= '\r'); } RBIMPL_ATTR_CONST() RBIMPL_ATTR_CONSTEXPR(CXX11) RBIMPL_ATTR_ARTIFICIAL() /** * Our own locale-insensitive version of `iscntrl(3)`. * * @param[in] c Byte in question to query. * @retval true `c` is listed in IEEE 1003.1 section 7.3.1.1 "cntrl". * @retval false Anything else. * @note Not only does this function works under the POSIX Locale, but * also assumes its execution character set be what ruby calls an * ASCII-compatible character set; which does not include for * instance EBCDIC or UTF-16LE. * @warning `c` is an int. This means that when you pass a `char` value * here, it experiences "integer promotion" as defined in ISO/IEC * 9899:2018 section 6.3.1.1 paragraph 1. */ static inline int rb_iscntrl(int c) { return ('\0' <= c && c < ' ') || c == '\x7f'; } RBIMPL_ATTR_CONST() RBIMPL_ATTR_CONSTEXPR(CXX11) RBIMPL_ATTR_ARTIFICIAL() /** * Identical to rb_isgraph(), except it also returns true for `' '`. * * @param[in] c Byte in question to query. * @retval true `c` is listed in either IEEE 1003.1 section 7.3.1.1 * "upper", "lower", "digit", "punct", or a `' '`. * @retval false Anything else. * @note Not only does this function works under the POSIX Locale, but * also assumes its execution character set be what ruby calls an * ASCII-compatible character set; which does not include for * instance EBCDIC or UTF-16LE. * @warning `c` is an int. This means that when you pass a `char` value * here, it experiences "integer promotion" as defined in ISO/IEC * 9899:2018 section 6.3.1.1 paragraph 1. */ static inline int rb_isprint(int c) { return ' ' <= c && c <= '\x7e'; } RBIMPL_ATTR_CONST() RBIMPL_ATTR_CONSTEXPR(CXX11) RBIMPL_ATTR_ARTIFICIAL() /** * Our own locale-insensitive version of `ispunct(3)`. * * @param[in] c Byte in question to query. * @retval true `c` is listed in IEEE 1003.1 section 7.3.1.1 "punct". * @retval false Anything else. * @note Not only does this function works under the POSIX Locale, but * also assumes its execution character set be what ruby calls an * ASCII-compatible character set; which does not include for * instance EBCDIC or UTF-16LE. * @warning `c` is an int. This means that when you pass a `char` value * here, it experiences "integer promotion" as defined in ISO/IEC * 9899:2018 section 6.3.1.1 paragraph 1. */ static inline int rb_ispunct(int c) { return !rb_isalnum(c); } RBIMPL_ATTR_CONST() RBIMPL_ATTR_CONSTEXPR(CXX11) RBIMPL_ATTR_ARTIFICIAL() /** * Our own locale-insensitive version of `isgraph(3)`. * * @param[in] c Byte in question to query. * @retval true `c` is listed in either IEEE 1003.1 section 7.3.1.1 * "upper", "lower", "digit", or "punct". * @retval false Anything else. * @note Not only does this function works under the POSIX Locale, but * also assumes its execution character set be what ruby calls an * ASCII-compatible character set; which does not include for * instance EBCDIC or UTF-16LE. * @warning `c` is an int. This means that when you pass a `char` value * here, it experiences "integer promotion" as defined in ISO/IEC * 9899:2018 section 6.3.1.1 paragraph 1. */ static inline int rb_isgraph(int c) { return '!' <= c && c <= '\x7e'; } RBIMPL_ATTR_CONST() RBIMPL_ATTR_CONSTEXPR(CXX11) RBIMPL_ATTR_ARTIFICIAL() /** * Our own locale-insensitive version of `tolower(3)`. * * @param[in] c Byte in question to convert. * @retval c The byte is not listed in in IEEE 1003.1 section * 7.3.1.1 "upper". * @retval otherwise Byte converted using the map defined in IEEE 1003.1 * section 7.3.1 "tolower". * @note Not only does this function works under the POSIX Locale, but * also assumes its execution character set be what ruby calls an * ASCII-compatible character set; which does not include for * instance EBCDIC or UTF-16LE. * @warning `c` is an int. This means that when you pass a `char` value * here, it experiences "integer promotion" as defined in ISO/IEC * 9899:2018 section 6.3.1.1 paragraph 1. */ static inline int rb_tolower(int c) { return rb_isupper(c) ? (c|0x20) : c; } RBIMPL_ATTR_CONST() RBIMPL_ATTR_CONSTEXPR(CXX11) RBIMPL_ATTR_ARTIFICIAL() /** * Our own locale-insensitive version of `toupper(3)`. * * @param[in] c Byte in question to convert. * @retval c The byte is not listed in in IEEE 1003.1 section * 7.3.1.1 "lower". * @retval otherwise Byte converted using the map defined in IEEE 1003.1 * section 7.3.1 "toupper". * @note Not only does this function works under the POSIX Locale, but * also assumes its execution character set be what ruby calls an * ASCII-compatible character set; which does not include for * instance EBCDIC or UTF-16LE. * @warning `c` is an int. This means that when you pass a `char` value * here, it experiences "integer promotion" as defined in ISO/IEC * 9899:2018 section 6.3.1.1 paragraph 1. */ static inline int rb_toupper(int c) { return rb_islower(c) ? (c&0x5f) : c; } /** @} */ #endif /* RBIMPL_CTYPE_H */