1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
|
#ifndef RUBY_INTERNAL_ENCODING_STRING_H /*-*-C++-*-vi:se ft=cpp:*/
#define RUBY_INTERNAL_ENCODING_STRING_H
/**
* @file
* @author Ruby developers <ruby-core@ruby-lang.org>
* @copyright This file is a part of the programming language Ruby.
* Permission is hereby granted, to either redistribute and/or
* modify this file, provided that the conditions mentioned in the
* file COPYING are met. Consult the file for details.
* @warning Symbols prefixed with either `RBIMPL` or `rbimpl` are
* implementation details. Don't take them as canon. They could
* rapidly appear then vanish. The name (path) of this header file
* is also an implementation detail. Do not expect it to persist
* at the place it is now. Developers are free to move it anywhere
* anytime at will.
* @note To ruby-core: remember that this header can be possibly
* recursively included from extension libraries written in C++.
* Do not expect for instance `__VA_ARGS__` is always available.
* We assume C99 for ruby itself but we don't assume languages of
* extension libraries. They could be written in C++98.
* @brief Routines to manipulate encodings of strings.
*/
#include "ruby/internal/dllexport.h"
#include "ruby/internal/value.h"
#include "ruby/internal/encoding/encoding.h"
#include "ruby/internal/attr/nonnull.h"
#include "ruby/internal/intern/string.h" /* rbimpl_strlen */
RBIMPL_SYMBOL_EXPORT_BEGIN()
/**
* Identical to rb_enc_str_new(), except it additionally takes an encoding.
*
* @param[in] ptr A memory region of `len` bytes length.
* @param[in] len Length of `ptr`, in bytes, not including the
* terminating NUL character.
* @param[in] enc Encoding of `ptr`.
* @exception rb_eNoMemError Failed to allocate `len+1` bytes.
* @exception rb_eArgError `len` is negative.
* @return An instance of ::rb_cString, of `len` bytes length, of `enc`
* encoding, whose contents are verbatim copy of `ptr`.
* @pre At least `len` bytes of continuous memory region shall be
* accessible via `ptr`.
* @note `enc` can be a null pointer. It can also be seen as a routine
* identical to rb_usascii_str_new() then.
*/
VALUE rb_enc_str_new(const char *ptr, long len, rb_encoding *enc);
RBIMPL_ATTR_NONNULL((1))
/**
* Identical to rb_enc_str_new(), except it assumes the passed pointer is a
* pointer to a C string. It can also be seen as a routine identical to
* rb_str_new_cstr(), except it additionally takes an encoding.
*
* @param[in] ptr A C string.
* @param[in] enc Encoding of `ptr`.
* @exception rb_eNoMemError Failed to allocate memory.
* @return An instance of ::rb_cString, of `enc` encoding, whose contents
* are verbatim copy of `ptr`.
* @pre `ptr` must not be a null pointer.
* @pre Because `ptr` is a C string it makes no sense for `enc` to be
* something like UTF-32.
* @note `enc` can be a null pointer. It can also be seen as a routine
* identical to rb_usascii_str_new_cstr() then.
*/
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc);
/**
* Identical to rb_enc_str_new(), except it takes a C string literal. It can
* also be seen as a routine identical to rb_str_new_static(), except it
* additionally takes an encoding.
*
* @param[in] ptr A C string literal.
* @param[in] len `strlen(ptr)`.
* @param[in] enc Encoding of `ptr`.
* @exception rb_eArgError `len` out of range of `size_t`.
* @pre `ptr` must be a C string constant.
* @return An instance of ::rb_cString, of `enc` encoding, whose backend
* storage is the passed C string literal.
* @warning It is a very bad idea to write to a C string literal (often
* immediate SEGV shall occur). Consider return values of this
* function be read-only.
* @note `enc` can be a null pointer. It can also be seen as a routine
* identical to rb_usascii_str_new_static() then.
*/
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc);
/**
* Identical to rb_enc_str_new(), except it returns a "f"string. It can also
* be seen as a routine identical to rb_interned_str(), except it additionally
* takes an encoding.
*
* @param[in] ptr A memory region of `len` bytes length.
* @param[in] len Length of `ptr`, in bytes, not including the
* terminating NUL character.
* @param[in] enc Encoding of `ptr`.
* @exception rb_eArgError `len` is negative.
* @return A found or created instance of ::rb_cString, of `len` bytes
* length, of `enc` encoding, whose contents are identical to that
* of `ptr`.
* @pre At least `len` bytes of continuous memory region shall be
* accessible via `ptr`.
* @note `enc` can be a null pointer.
*/
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc);
RBIMPL_ATTR_NONNULL((1))
/**
* Identical to rb_enc_str_new_cstr(), except it returns a "f"string. It can
* also be seen as a routine identical to rb_interned_str_cstr(), except it
* additionally takes an encoding.
*
* @param[in] ptr A memory region of `len` bytes length.
* @param[in] enc Encoding of `ptr`.
* @return A found or created instance of ::rb_cString of `enc` encoding,
* whose contents are identical to that of `ptr`.
* @pre At least `len` bytes of continuous memory region shall be
* accessible via `ptr`.
* @note `enc` can be a null pointer.
*/
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc);
/**
* Counts the number of characters of the passed string, according to the
* passed encoding. This has to be complicated. The passed string could be
* invalid and/or broken. This routine would scan from the beginning til the
* end, byte by byte, to seek out character boundaries. Could be super slow.
*
* @param[in] head Leftmost pointer to the string.
* @param[in] tail Rightmost pointer to the string.
* @param[in] enc Encoding of the string.
* @return Number of characters exist in `head` .. `tail`. The definition
* of "character" depends on the passed `enc`.
*/
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc);
/**
* Queries the n-th character. Like rb_enc_strlen() this function can be fast
* or slow depending on the contents. Don't expect characters to be uniformly
* distributed across the entire string.
*
* @param[in] head Leftmost pointer to the string.
* @param[in] tail Rightmost pointer to the string.
* @param[in] nth Requested index of characters.
* @param[in] enc Encoding of the string.
* @return Pointer to the first byte of the character that is `nth`
* character ahead of `head`, or `tail` if there is no such
* character (OOB etc). The definition of "character" depends on
* the passed `enc`.
*/
char *rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc);
/**
* Identical to rb_enc_get_index(), except the return type.
*
* @param[in] obj Object in question.
* @exception rb_eTypeError `obj` is incapable of having an encoding.
* @return `obj`'s encoding.
*/
VALUE rb_obj_encoding(VALUE obj);
/**
* Identical to rb_str_cat(), except it additionally takes an encoding.
*
* @param[out] str Destination object.
* @param[in] ptr Contents to append.
* @param[in] len Length of `src`, in bytes.
* @param[in] enc Encoding of `ptr`.
* @exception rb_eArgError `len` is negative.
* @exception rb_eEncCompatError `enc` is not compatible with `str`.
* @return The passed `dst`.
* @post The contents of `ptr` is copied, transcoded into `dst`'s
* encoding, then pasted into `dst`'s end.
*/
VALUE rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *enc);
/**
* Encodes the passed code point into a series of bytes.
*
* @param[in] code Code point.
* @param[in] enc Target encoding scheme.
* @exception rb_eRangeError `enc` does not glean `code`.
* @return An instance of ::rb_cString, of `enc` encoding, whose sole
* contents is `code` represented in `enc`.
* @note No way to encode code points bigger than UINT_MAX.
*
* @internal
*
* In other languages, APIs like this one could be seen as the primitive
* routines where encodings' "encode" feature are implemented. However in case
* of Ruby this is not the primitive one. We directly manipulate encoded
* strings. Encoding conversion routines transcode an encoded string directly
* to another one; not via a code point array.
*/
VALUE rb_enc_uint_chr(unsigned int code, rb_encoding *enc);
/**
* Identical to rb_external_str_new(), except it additionally takes an
* encoding. However the whole point of rb_external_str_new() is to encode a
* string into default external encoding. Being able to specify arbitrary
* encoding just ruins the designed purpose the function meseems.
*
* @param[in] ptr A memory region of `len` bytes length.
* @param[in] len Length of `ptr`, in bytes, not including the
* terminating NUL character.
* @param[in] enc Target encoding scheme.
* @exception rb_eArgError `len` is negative.
* @return An instance of ::rb_cString. In case encoding conversion from
* "default internal" to `enc` is fully defined over the given
* contents, then the return value is a string of `enc` encoding,
* whose contents are the converted ones. Otherwise the string is
* a junk.
* @warning It doesn't raise on a conversion failure and silently ends up in
* a corrupted output. You can know the failure by querying
* `valid_encoding?` of the result object.
*
* @internal
*
* @shyouhei has no idea why this one does not follow the naming convention
* that others obey. It seems to him that this should have been called
* `rb_enc_external_str_new`.
*/
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc);
/**
* Identical to rb_str_export(), except it additionally takes an encoding.
*
* @param[in] obj Target object.
* @param[in] enc Target encoding.
* @exception rb_eTypeError No implicit conversion to String.
* @return Converted ruby string of `enc` encoding.
*/
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc);
/**
* Encoding conversion main routine.
*
* @param[in] str String to convert.
* @param[in] from Source encoding.
* @param[in] to Destination encoding.
* @return A copy of `str`, with conversion from `from` to `to` applied.
* @note `from` can be a null pointer. `str`'s encoding is taken then.
* @note `to` can be a null pointer. No-op then.
*/
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to);
/**
* Identical to rb_str_conv_enc(), except it additionally takes IO encoder
* options. The extra arguments can be constructed using io_extract_modeenc()
* etc.
*
* @param[in] str String to convert.
* @param[in] from Source encoding.
* @param[in] to Destination encoding.
* @param[in] ecflags A set of enum ::ruby_econv_flag_type.
* @param[in] ecopts Optional hash.
* @return A copy of `str`, with conversion from `from` to `to` applied.
* @note `from` can be a null pointer. `str`'s encoding is taken then.
* @note `to` can be a null pointer. No-op then.
* @note `ecopts` can be ::RUBY_Qnil, which is equivalent to passing an
* empty hash.
*/
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts);
/**
* Scans the passed string to collect its code range. Because a Ruby's string
* is mutable, its contents change from time to time; so does its code range.
* A long-lived string tends to fall back to ::RUBY_ENC_CODERANGE_UNKNOWN.
* This API scans it and re-assigns a fine-grained code range constant.
*
* @param[out] str A string.
* @return An enum ::ruby_coderange_type.
*/
int rb_enc_str_coderange(VALUE str);
/**
* Scans the passed string until it finds something odd. Returns the number of
* bytes scanned. As the name implies this is suitable for repeated call. One
* of its application is `IO#readlines`. The method reads from its receiver's
* read buffer, maybe more than once, looking for newlines. But "newline" can
* be different among encodings. This API is used to detect broken contents to
* properly mark them as such.
*
* @param[in] str String to scan.
* @param[in] end End of `str`.
* @param[in] enc `str`'s encoding.
* @param[out] cr Return buffer.
* @return Distance between `str` and first such byte where broken.
* @post `cr` has the code range type.
*/
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr);
/**
* Queries if the passed string is "ASCII only". An ASCII only string is a
* string who doesn't have any non-ASCII characters at all. This doesn't
* necessarily mean the string is in ASCII encoding. For instance a String of
* CP932 encoding can quite much be ASCII only, depending on its contents.
*
* @param[in] str String in question.
* @retval 1 It doesn't have non-ASCII characters.
* @retval 0 It has characters that are out of ASCII.
*/
int rb_enc_str_asciionly_p(VALUE str);
RBIMPL_ATTR_NONNULL(())
/**
* Looks for the passed string in the passed buffer.
*
* @param[in] x Buffer that potentially includes `y`.
* @param[in] m Number of bytes of `x`.
* @param[in] y Query string.
* @param[in] n Number of bytes of `y`.
* @param[in] enc Encoding of both `x` and `y`.
* @retval -1 Not found.
* @retval otherwise Found index in `x`.
* @note This API can match at a non-character-boundary.
*/
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc);
/** @cond INTERNAL_MACRO */
RBIMPL_ATTR_NONNULL(())
static inline VALUE
rbimpl_enc_str_new_cstr(const char *str, rb_encoding *enc)
{
long len = rbimpl_strlen(str);
return rb_enc_str_new_static(str, len, enc);
}
#define rb_enc_str_new(str, len, enc) \
((RBIMPL_CONSTANT_P(str) && \
RBIMPL_CONSTANT_P(len) ? \
rb_enc_str_new_static: \
rb_enc_str_new) ((str), (len), (enc)))
#define rb_enc_str_new_cstr(str, enc) \
((RBIMPL_CONSTANT_P(str) ? \
rbimpl_enc_str_new_cstr : \
rb_enc_str_new_cstr) ((str), (enc)))
/** @endcond */
RBIMPL_SYMBOL_EXPORT_END()
#endif /* RUBY_INTERNAL_ENCODING_STRING_H */
|