diff options
Diffstat (limited to 'ext/strscan')
| -rw-r--r-- | ext/strscan/depend | 43 | ||||
| -rw-r--r-- | ext/strscan/extconf.rb | 14 | ||||
| -rw-r--r-- | ext/strscan/lib/strscan.rb | 20 | ||||
| -rw-r--r-- | ext/strscan/lib/strscan/strscan.rb | 49 | ||||
| -rw-r--r-- | ext/strscan/strscan.c | 1834 | ||||
| -rw-r--r-- | ext/strscan/strscan.gemspec | 35 |
6 files changed, 1406 insertions, 589 deletions
diff --git a/ext/strscan/depend b/ext/strscan/depend index f297134d3b..b40a025230 100644 --- a/ext/strscan/depend +++ b/ext/strscan/depend @@ -1,6 +1,20 @@ # AUTOGENERATED DEPENDENCIES START strscan.o: $(RUBY_EXTCONF_H) strscan.o: $(arch_hdrdir)/ruby/config.h +strscan.o: $(hdrdir)/ruby/assert.h +strscan.o: $(hdrdir)/ruby/backward.h +strscan.o: $(hdrdir)/ruby/backward/2/assume.h +strscan.o: $(hdrdir)/ruby/backward/2/attributes.h +strscan.o: $(hdrdir)/ruby/backward/2/bool.h +strscan.o: $(hdrdir)/ruby/backward/2/inttypes.h +strscan.o: $(hdrdir)/ruby/backward/2/limits.h +strscan.o: $(hdrdir)/ruby/backward/2/long_long.h +strscan.o: $(hdrdir)/ruby/backward/2/stdalign.h +strscan.o: $(hdrdir)/ruby/backward/2/stdarg.h +strscan.o: $(hdrdir)/ruby/defines.h +strscan.o: $(hdrdir)/ruby/encoding.h +strscan.o: $(hdrdir)/ruby/intern.h +strscan.o: $(hdrdir)/ruby/internal/abi.h strscan.o: $(hdrdir)/ruby/internal/anyargs.h strscan.o: $(hdrdir)/ruby/internal/arithmetic.h strscan.o: $(hdrdir)/ruby/internal/arithmetic/char.h @@ -38,6 +52,7 @@ strscan.o: $(hdrdir)/ruby/internal/attr/noexcept.h strscan.o: $(hdrdir)/ruby/internal/attr/noinline.h strscan.o: $(hdrdir)/ruby/internal/attr/nonnull.h strscan.o: $(hdrdir)/ruby/internal/attr/noreturn.h +strscan.o: $(hdrdir)/ruby/internal/attr/packed_struct.h strscan.o: $(hdrdir)/ruby/internal/attr/pure.h strscan.o: $(hdrdir)/ruby/internal/attr/restrict.h strscan.o: $(hdrdir)/ruby/internal/attr/returns_nonnull.h @@ -71,6 +86,15 @@ strscan.o: $(hdrdir)/ruby/internal/core/rtypeddata.h strscan.o: $(hdrdir)/ruby/internal/ctype.h strscan.o: $(hdrdir)/ruby/internal/dllexport.h strscan.o: $(hdrdir)/ruby/internal/dosish.h +strscan.o: $(hdrdir)/ruby/internal/encoding/coderange.h +strscan.o: $(hdrdir)/ruby/internal/encoding/ctype.h +strscan.o: $(hdrdir)/ruby/internal/encoding/encoding.h +strscan.o: $(hdrdir)/ruby/internal/encoding/pathname.h +strscan.o: $(hdrdir)/ruby/internal/encoding/re.h +strscan.o: $(hdrdir)/ruby/internal/encoding/sprintf.h +strscan.o: 
$(hdrdir)/ruby/internal/encoding/string.h +strscan.o: $(hdrdir)/ruby/internal/encoding/symbol.h +strscan.o: $(hdrdir)/ruby/internal/encoding/transcode.h strscan.o: $(hdrdir)/ruby/internal/error.h strscan.o: $(hdrdir)/ruby/internal/eval.h strscan.o: $(hdrdir)/ruby/internal/event.h @@ -98,7 +122,6 @@ strscan.o: $(hdrdir)/ruby/internal/intern/enumerator.h strscan.o: $(hdrdir)/ruby/internal/intern/error.h strscan.o: $(hdrdir)/ruby/internal/intern/eval.h strscan.o: $(hdrdir)/ruby/internal/intern/file.h -strscan.o: $(hdrdir)/ruby/internal/intern/gc.h strscan.o: $(hdrdir)/ruby/internal/intern/hash.h strscan.o: $(hdrdir)/ruby/internal/intern/io.h strscan.o: $(hdrdir)/ruby/internal/intern/load.h @@ -115,6 +138,7 @@ strscan.o: $(hdrdir)/ruby/internal/intern/re.h strscan.o: $(hdrdir)/ruby/internal/intern/ruby.h strscan.o: $(hdrdir)/ruby/internal/intern/select.h strscan.o: $(hdrdir)/ruby/internal/intern/select/largesize.h +strscan.o: $(hdrdir)/ruby/internal/intern/set.h strscan.o: $(hdrdir)/ruby/internal/intern/signal.h strscan.o: $(hdrdir)/ruby/internal/intern/sprintf.h strscan.o: $(hdrdir)/ruby/internal/intern/string.h @@ -129,33 +153,18 @@ strscan.o: $(hdrdir)/ruby/internal/memory.h strscan.o: $(hdrdir)/ruby/internal/method.h strscan.o: $(hdrdir)/ruby/internal/module.h strscan.o: $(hdrdir)/ruby/internal/newobj.h -strscan.o: $(hdrdir)/ruby/internal/rgengc.h strscan.o: $(hdrdir)/ruby/internal/scan_args.h strscan.o: $(hdrdir)/ruby/internal/special_consts.h strscan.o: $(hdrdir)/ruby/internal/static_assert.h strscan.o: $(hdrdir)/ruby/internal/stdalign.h strscan.o: $(hdrdir)/ruby/internal/stdbool.h +strscan.o: $(hdrdir)/ruby/internal/stdckdint.h strscan.o: $(hdrdir)/ruby/internal/symbol.h -strscan.o: $(hdrdir)/ruby/internal/token_paste.h strscan.o: $(hdrdir)/ruby/internal/value.h strscan.o: $(hdrdir)/ruby/internal/value_type.h strscan.o: $(hdrdir)/ruby/internal/variable.h strscan.o: $(hdrdir)/ruby/internal/warning_push.h strscan.o: $(hdrdir)/ruby/internal/xmalloc.h -strscan.o: 
$(hdrdir)/ruby/assert.h -strscan.o: $(hdrdir)/ruby/backward.h -strscan.o: $(hdrdir)/ruby/backward/2/assume.h -strscan.o: $(hdrdir)/ruby/backward/2/attributes.h -strscan.o: $(hdrdir)/ruby/backward/2/bool.h -strscan.o: $(hdrdir)/ruby/backward/2/gcc_version_since.h -strscan.o: $(hdrdir)/ruby/backward/2/inttypes.h -strscan.o: $(hdrdir)/ruby/backward/2/limits.h -strscan.o: $(hdrdir)/ruby/backward/2/long_long.h -strscan.o: $(hdrdir)/ruby/backward/2/stdalign.h -strscan.o: $(hdrdir)/ruby/backward/2/stdarg.h -strscan.o: $(hdrdir)/ruby/defines.h -strscan.o: $(hdrdir)/ruby/encoding.h -strscan.o: $(hdrdir)/ruby/intern.h strscan.o: $(hdrdir)/ruby/missing.h strscan.o: $(hdrdir)/ruby/onigmo.h strscan.o: $(hdrdir)/ruby/oniguruma.h diff --git a/ext/strscan/extconf.rb b/ext/strscan/extconf.rb index f0ecbf85d8..2b4ec25be3 100644 --- a/ext/strscan/extconf.rb +++ b/ext/strscan/extconf.rb @@ -1,5 +1,13 @@ # frozen_string_literal: true require 'mkmf' -$INCFLAGS << " -I$(top_srcdir)" if $extmk -have_func("onig_region_memsize", "ruby.h") -create_makefile 'strscan' +if RUBY_ENGINE == 'ruby' + $INCFLAGS << " -I$(top_srcdir)" if $extmk + have_func("onig_region_memsize(NULL)") + have_func("rb_reg_onig_match", "ruby/re.h") + have_func("rb_deprecate_constant") + have_func("rb_gc_location", "ruby.h") # RUBY_VERSION >= 2.7 + have_const("RUBY_TYPED_EMBEDDABLE", "ruby.h") # RUBY_VERSION >= 3.3 + create_makefile 'strscan' +else + File.write('Makefile', dummy_makefile("").join) +end diff --git a/ext/strscan/lib/strscan.rb b/ext/strscan/lib/strscan.rb new file mode 100644 index 0000000000..4e8910d141 --- /dev/null +++ b/ext/strscan/lib/strscan.rb @@ -0,0 +1,20 @@ +# frozen_string_literal: true + +case RUBY_ENGINE +when 'ruby' + require 'strscan.so' + require_relative 'strscan/strscan' +when 'jruby' + require 'strscan.jar' + JRuby::Util.load_ext('org.jruby.ext.strscan.StringScannerLibrary') + require_relative 'strscan/strscan' +when 'truffleruby' + if RUBY_ENGINE_VERSION.to_i >= 34 + require 
'strscan/truffleruby' + else + $LOAD_PATH.delete __dir__ + require 'strscan' + end +else + raise NotImplementedError, "Unknown Ruby: #{RUBY_ENGINE}" +end diff --git a/ext/strscan/lib/strscan/strscan.rb b/ext/strscan/lib/strscan/strscan.rb new file mode 100644 index 0000000000..07ed102d9a --- /dev/null +++ b/ext/strscan/lib/strscan/strscan.rb @@ -0,0 +1,49 @@ +# frozen_string_literal: true + +class StringScanner + # :markup: markdown + # + # call-seq: + # scan_integer(base: 10) -> integer or nil + # + # Returns an integer scanned from `self`, + # beginning at the current position; + # returns `nil` if no such integer was available. + # + # When `base` is `10` (the default), + # equivalent to calling #scan with argument `pattern` + # as `'[+-]?\d+'`: + # + # ```ruby + # scanner = StringScanner.new('Form 27B/6') + # scanner.scan_integer # => nil # No integer at position 0. + # scanner.pos = 5 + # scanner.scan_integer # => 27 + # scanner.matched # => "27" + # scanner.pos # => 7 + # ``` + # + # When `base` is `16` (the only other value allowed), + # equivalent to calling #scan with argument `pattern` + # as `'[+-]?(0x)?[0-9a-fA-F]+'`: + # + # ```ruby + # scanner.pos = 5 + # scanner.scan_integer(base: 16) # => 635 + # scanner.matched # => "27B" + # scanner.pos # => 8 + # ``` + # + # Raises Encoding::CompatibilityError if `self` does not have + # an ASCII compatible encoding. 
+ def scan_integer(base: 10) + case base + when 10 + scan_base10_integer + when 16 + scan_base16_integer + else + raise ArgumentError, "Unsupported integer base: #{base.inspect}, expected 10 or 16" + end + end +end diff --git a/ext/strscan/strscan.c b/ext/strscan/strscan.c index 88074a0a26..d35df7e43b 100644 --- a/ext/strscan/strscan.c +++ b/ext/strscan/strscan.c @@ -22,7 +22,15 @@ extern size_t onig_region_memsize(const struct re_registers *regs); #include <stdbool.h> -#define STRSCAN_VERSION "3.0.0" +#define STRSCAN_VERSION "3.1.9.dev" + + +#ifdef HAVE_RB_DEPRECATE_CONSTANT +/* In ruby 3.0, defined but not exposed in external headers */ +extern void rb_deprecate_constant(VALUE mod, const char *name); +#else +# define rb_deprecate_constant(mod, name) ((void)0) +#endif /* ======================================================================= Data Type Definitions @@ -30,7 +38,8 @@ extern size_t onig_region_memsize(const struct re_registers *regs); static VALUE StringScanner; static VALUE ScanError; -static ID id_byteslice; + +static int usascii_encindex, utf8_encindex, binary_encindex; struct strscanner { @@ -56,8 +65,13 @@ struct strscanner }; #define MATCHED_P(s) ((s)->flags & FLAG_MATCHED) -#define MATCHED(s) (s)->flags |= FLAG_MATCHED -#define CLEAR_MATCH_STATUS(s) (s)->flags &= ~FLAG_MATCHED +#define MATCHED(s) ((s)->flags |= FLAG_MATCHED) +#define CLEAR_MATCHED(s) ((s)->flags &= ~FLAG_MATCHED) +#define CLEAR_NAMED_CAPTURES(s) ((s)->regex = Qnil) +#define CLEAR_MATCH_STATUS(s) do {\ + CLEAR_MATCHED(s);\ + CLEAR_NAMED_CAPTURES(s);\ +} while (0) #define S_PBEG(s) (RSTRING_PTR((s)->str)) #define S_LEN(s) (RSTRING_LEN((s)->str)) @@ -90,7 +104,6 @@ static VALUE strscan_init_copy _((VALUE vself, VALUE vorig)); static VALUE strscan_s_mustc _((VALUE self)); static VALUE strscan_terminate _((VALUE self)); -static VALUE strscan_clear _((VALUE self)); static VALUE strscan_get_string _((VALUE self)); static VALUE strscan_set_string _((VALUE self, VALUE str)); static VALUE 
strscan_concat _((VALUE self, VALUE str)); @@ -112,13 +125,11 @@ static VALUE strscan_search_full _((VALUE self, VALUE re, static void adjust_registers_to_matched _((struct strscanner *p)); static VALUE strscan_getch _((VALUE self)); static VALUE strscan_get_byte _((VALUE self)); -static VALUE strscan_getbyte _((VALUE self)); static VALUE strscan_peek _((VALUE self, VALUE len)); -static VALUE strscan_peep _((VALUE self, VALUE len)); +static VALUE strscan_scan_base10_integer _((VALUE self)); static VALUE strscan_unscan _((VALUE self)); static VALUE strscan_bol_p _((VALUE self)); static VALUE strscan_eos_p _((VALUE self)); -static VALUE strscan_empty_p _((VALUE self)); static VALUE strscan_rest_p _((VALUE self)); static VALUE strscan_matched_p _((VALUE self)); static VALUE strscan_matched _((VALUE self)); @@ -171,12 +182,35 @@ extract_beg_len(struct strscanner *p, long beg_i, long len) Constructor ======================================================================= */ +#ifdef RUBY_TYPED_EMBEDDABLE +# define HAVE_RUBY_TYPED_EMBEDDABLE 1 +#else +# ifdef HAVE_CONST_RUBY_TYPED_EMBEDDABLE +# define RUBY_TYPED_EMBEDDABLE RUBY_TYPED_EMBEDDABLE +# define HAVE_RUBY_TYPED_EMBEDDABLE 1 +# else +# define RUBY_TYPED_EMBEDDABLE 0 +# endif +#endif + +#ifdef HAVE_RB_GC_LOCATION +static void +strscan_compact(void *ptr) +{ + struct strscanner *p = ptr; + p->str = rb_gc_location(p->str); + p->regex = rb_gc_location(p->regex); +} +#else +#define rb_gc_mark_movable rb_gc_mark +#endif + static void strscan_mark(void *ptr) { struct strscanner *p = ptr; - rb_gc_mark(p->str); - rb_gc_mark(p->regex); + rb_gc_mark_movable(p->str); + rb_gc_mark_movable(p->regex); } static void @@ -184,24 +218,37 @@ strscan_free(void *ptr) { struct strscanner *p = ptr; onig_region_free(&(p->regs), 0); +#ifndef HAVE_RUBY_TYPED_EMBEDDABLE ruby_xfree(p); +#endif } static size_t strscan_memsize(const void *ptr) { - const struct strscanner *p = ptr; - size_t size = sizeof(*p) - sizeof(p->regs); + size_t size = 0; 
+#ifndef HAVE_RUBY_TYPED_EMBEDDABLE + size += sizeof(struct strscanner); +#endif + #ifdef HAVE_ONIG_REGION_MEMSIZE - size += onig_region_memsize(&p->regs); + const struct strscanner *p = ptr; + size += onig_region_memsize(&p->regs) - sizeof(p->regs); #endif return size; } static const rb_data_type_t strscanner_type = { - "StringScanner", - {strscan_mark, strscan_free, strscan_memsize}, - 0, 0, RUBY_TYPED_FREE_IMMEDIATELY + .wrap_struct_name = "StringScanner", + .function = { + .dmark = strscan_mark, + .dfree = strscan_free, + .dsize = strscan_memsize, +#ifdef HAVE_RB_GC_LOCATION + .dcompact = strscan_compact, +#endif + }, + .flags = RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED | RUBY_TYPED_EMBEDDABLE }; static VALUE @@ -213,21 +260,32 @@ strscan_s_allocate(VALUE klass) CLEAR_MATCH_STATUS(p); onig_region_init(&(p->regs)); p->str = Qnil; - p->regex = Qnil; return obj; } /* - * call-seq: - * StringScanner.new(string, fixed_anchor: false) - * StringScanner.new(string, dup = false) - * - * Creates a new StringScanner object to scan over the given +string+. + * :markup: markdown + * :include: strscan/link_refs.txt * - * If +fixed_anchor+ is +true+, +\A+ always matches the beginning of - * the string. Otherwise, +\A+ always matches the current position. + * call-seq: + * StringScanner.new(string, fixed_anchor: false) -> string_scanner + * + * Returns a new `StringScanner` object whose [stored string][1] + * is the given `string`; + * sets the [fixed-anchor property][10]: + * + * ```rb + * scanner = StringScanner.new('foobarbaz') + * scanner.string # => "foobarbaz" + * scanner.fixed_anchor? # => false + * put_situation(scanner) + * # Situation: + * # pos: 0 + * # charpos: 0 + * # rest: "foobarbaz" + * # rest_size: 9 + * ``` * - * +dup+ argument is obsolete and not used now. 
*/ static VALUE strscan_initialize(int argc, VALUE *argv, VALUE self) @@ -254,7 +312,7 @@ strscan_initialize(int argc, VALUE *argv, VALUE self) p->fixed_anchor_p = false; } StringValue(str); - p->str = str; + RB_OBJ_WRITE(self, &p->str, str); return self; } @@ -266,11 +324,14 @@ check_strscan(VALUE obj) } /* + * :markup: markdown + * :include: strscan/link_refs.txt + * * call-seq: - * dup - * clone + * dup -> shallow_copy * - * Duplicates a StringScanner object. + * Returns a shallow copy of `self`; + * the [stored string][1] in the copy is the same string as in `self`. */ static VALUE strscan_init_copy(VALUE vself, VALUE vorig) @@ -281,7 +342,7 @@ strscan_init_copy(VALUE vself, VALUE vorig) orig = check_strscan(vorig); if (self != orig) { self->flags = orig->flags; - self->str = orig->str; + RB_OBJ_WRITE(vself, &self->str, orig->str); self->prev = orig->prev; self->curr = orig->curr; if (rb_reg_region_copy(&self->regs, &orig->regs)) @@ -297,10 +358,13 @@ strscan_init_copy(VALUE vself, VALUE vorig) ======================================================================= */ /* - * call-seq: StringScanner.must_C_version + * call-seq: + * StringScanner.must_C_version -> self * - * This method is defined for backward compatibility. + * Returns +self+; defined for backward compatibility. */ + + /* :nodoc: */ static VALUE strscan_s_mustc(VALUE self) { @@ -308,7 +372,30 @@ strscan_s_mustc(VALUE self) } /* - * Reset the scan pointer (index 0) and clear matching data. 
+ * :markup: markdown + * :include: strscan/link_refs.txt + * + * call-seq: + * reset -> self + * + * Sets both [byte position][2] and [character position][7] to zero, + * and clears [match values][9]; + * returns +self+: + * + * ```rb + * scanner = StringScanner.new('foobarbaz') + * scanner.exist?(/bar/) # => 6 + * scanner.reset # => #<StringScanner 0/9 @ "fooba..."> + * put_situation(scanner) + * # Situation: + * # pos: 0 + * # charpos: 0 + * # rest: "foobarbaz" + * # rest_size: 9 + * # => nil + * match_values_cleared?(scanner) # => true + * ``` + * */ static VALUE strscan_reset(VALUE self) @@ -322,11 +409,12 @@ strscan_reset(VALUE self) } /* - * call-seq: - * terminate - * clear + * :markup: markdown + * :call-seq: + * terminate -> self * - * Sets the scan pointer to the end of the string and clear matching data. + * :include: strscan/link_refs.txt + * :include: strscan/methods/terminate.md */ static VALUE strscan_terminate(VALUE self) @@ -340,18 +428,21 @@ strscan_terminate(VALUE self) } /* - * Equivalent to #terminate. - * This method is obsolete; use #terminate instead. - */ -static VALUE -strscan_clear(VALUE self) -{ - rb_warning("StringScanner#clear is obsolete; use #terminate instead"); - return strscan_terminate(self); -} - -/* - * Returns the string being scanned. + * :markup: markdown + * :include: strscan/link_refs.txt + * + * call-seq: + * string -> stored_string + * + * Returns the [stored string][1]: + * + * ```rb + * scanner = StringScanner.new('foobar') + * scanner.string # => "foobar" + * scanner.concat('baz') + * scanner.string # => "foobarbaz" + * ``` + * */ static VALUE strscan_get_string(VALUE self) @@ -363,10 +454,39 @@ strscan_get_string(VALUE self) } /* - * call-seq: string=(str) + * :markup: markdown + * :include: strscan/link_refs.txt + * + * call-seq: + * string = other_string -> other_string + * + * Replaces the [stored string][1] with the given `other_string`: + * + * - Sets both [positions][11] to zero. 
+ * - Clears [match values][9]. + * - Returns `other_string`. + * + * ```rb + * scanner = StringScanner.new('foobar') + * scanner.scan(/foo/) + * put_situation(scanner) + * # Situation: + * # pos: 3 + * # charpos: 3 + * # rest: "bar" + * # rest_size: 3 + * match_values_cleared?(scanner) # => false + * + * scanner.string = 'baz' # => "baz" + * put_situation(scanner) + * # Situation: + * # pos: 0 + * # charpos: 0 + * # rest: "baz" + * # rest_size: 3 + * match_values_cleared?(scanner) # => true + * ``` * - * Changes the string being scanned to +str+ and resets the scanner. - * Returns +str+. */ static VALUE strscan_set_string(VALUE self, VALUE str) @@ -374,25 +494,40 @@ strscan_set_string(VALUE self, VALUE str) struct strscanner *p = check_strscan(self); StringValue(str); - p->str = str; + RB_OBJ_WRITE(self, &p->str, str); p->curr = 0; CLEAR_MATCH_STATUS(p); return str; } /* - * call-seq: - * concat(str) - * <<(str) + * :markup: markdown + * :include: strscan/link_refs.txt * - * Appends +str+ to the string being scanned. - * This method does not affect scan pointer. + * call-seq: + * concat(more_string) -> self + * + * - Appends the given `more_string` + * to the [stored string][1]. + * - Returns `self`. + * - Does not affect the [positions][11] + * or [match values][9]. + * + * + * ```rb + * scanner = StringScanner.new('foo') + * scanner.string # => "foo" + * scanner.terminate + * scanner.concat('barbaz') # => #<StringScanner 3/9 "foo" @ "barba..."> + * scanner.string # => "foobarbaz" + * put_situation(scanner) + * # Situation: + * # pos: 3 + * # charpos: 3 + * # rest: "barbaz" + * # rest_size: 6 + * ``` * - * s = StringScanner.new("Fri Dec 12 1975 14:39") - * s.scan(/Fri /) - * s << " +1000 GMT" - * s.string # -> "Fri Dec 12 1975 14:39 +1000 GMT" - * s.scan(/Dec/) # -> "Dec" */ static VALUE strscan_concat(VALUE self, VALUE str) @@ -406,18 +541,12 @@ strscan_concat(VALUE self, VALUE str) } /* - * Returns the byte position of the scan pointer. 
In the 'reset' position, this - * value is zero. In the 'terminated' position (i.e. the string is exhausted), - * this value is the bytesize of the string. + * :markup: markdown + * :call-seq: + * pos -> byte_position * - * In short, it's a 0-based index into bytes of the string. - * - * s = StringScanner.new('test string') - * s.pos # -> 0 - * s.scan_until /str/ # -> "test str" - * s.pos # -> 8 - * s.terminate # -> #<StringScanner fin> - * s.pos # -> 11 + * :include: strscan/link_refs.txt + * :include: strscan/methods/get_pos.md */ static VALUE strscan_get_pos(VALUE self) @@ -425,43 +554,35 @@ strscan_get_pos(VALUE self) struct strscanner *p; GET_SCANNER(self, p); - return INT2FIX(p->curr); + return LONG2NUM(p->curr); } /* - * Returns the character position of the scan pointer. In the 'reset' position, this - * value is zero. In the 'terminated' position (i.e. the string is exhausted), - * this value is the size of the string. - * - * In short, it's a 0-based index into the string. + * :markup: markdown + * :call-seq: + * charpos -> character_position * - * s = StringScanner.new("abcädeföghi") - * s.charpos # -> 0 - * s.scan_until(/ä/) # -> "abcä" - * s.pos # -> 5 - * s.charpos # -> 4 + * :include: strscan/link_refs.txt + * :include: strscan/methods/get_charpos.md */ static VALUE strscan_get_charpos(VALUE self) { struct strscanner *p; - VALUE substr; GET_SCANNER(self, p); - substr = rb_funcall(p->str, id_byteslice, 2, INT2FIX(0), LONG2NUM(p->curr)); - - return rb_str_length(substr); + return LONG2NUM(rb_enc_strlen(S_PBEG(p), CURPTR(p), rb_enc_get(p->str))); } /* - * call-seq: pos=(n) - * - * Sets the byte position of the scan pointer. 
+ * :markup: markdown + * :call-seq: + * pos = n -> n + * pointer = n -> n * - * s = StringScanner.new('test string') - * s.pos = 7 # -> 7 - * s.rest # -> "ring" + * :include: strscan/link_refs.txt + * :include: strscan/methods/set_pos.md */ static VALUE strscan_set_pos(VALUE self, VALUE v) @@ -470,7 +591,7 @@ strscan_set_pos(VALUE self, VALUE v) long i; GET_SCANNER(self, p); - i = NUM2INT(v); + i = NUM2LONG(v); if (i < 0) i += S_LEN(p); if (i < 0) rb_raise(rb_eRangeError, "index out of range"); if (i > S_LEN(p)) rb_raise(rb_eRangeError, "index out of range"); @@ -491,19 +612,20 @@ match_target(struct strscanner *p) } static inline void -set_registers(struct strscanner *p, size_t length) +set_registers(struct strscanner *p, size_t pos, size_t length) { const int at = 0; OnigRegion *regs = &(p->regs); onig_region_clear(regs); if (onig_region_set(regs, at, 0, 0)) return; if (p->fixed_anchor_p) { - regs->beg[at] = p->curr; - regs->end[at] = p->curr + length; + regs->beg[at] = pos + p->curr; + regs->end[at] = pos + p->curr + length; } else { - regs->end[at] = length; + regs->beg[at] = pos; + regs->end[at] = pos + length; } } @@ -542,19 +664,82 @@ adjust_register_position(struct strscanner *p, long position) } } -static VALUE -strscan_do_scan(VALUE self, VALUE pattern, int succptr, int getstr, int headonly) +/* rb_reg_onig_match is available in Ruby 3.3 and later. 
*/ +#ifndef HAVE_RB_REG_ONIG_MATCH +static OnigPosition +rb_reg_onig_match(VALUE re, VALUE str, + OnigPosition (*match)(regex_t *reg, VALUE str, struct re_registers *regs, void *args), + void *args, struct re_registers *regs) { - struct strscanner *p; + OnigPosition result; + regex_t *reg = rb_reg_prepare_re(re, str); + + bool tmpreg = reg != RREGEXP_PTR(re); + if (!tmpreg) RREGEXP(re)->usecnt++; - if (headonly) { - if (!RB_TYPE_P(pattern, T_REGEXP)) { - StringValue(pattern); + result = match(reg, str, regs, args); + + if (!tmpreg) RREGEXP(re)->usecnt--; + if (tmpreg) { + if (RREGEXP(re)->usecnt) { + onig_free(reg); + } + else { + onig_free(RREGEXP_PTR(re)); + RREGEXP_PTR(re) = reg; } } - else { - Check_Type(pattern, T_REGEXP); + + if (result < 0) { + if (result != ONIG_MISMATCH) { + rb_raise(ScanError, "regexp buffer overflow"); + } + } + + return result; +} +#endif + +static OnigPosition +strscan_match(regex_t *reg, VALUE str, struct re_registers *regs, void *args_ptr) +{ + struct strscanner *p = (struct strscanner *)args_ptr; + + return onig_match(reg, + match_target(p), + (UChar* )(CURPTR(p) + S_RESTLEN(p)), + (UChar* )CURPTR(p), + regs, + ONIG_OPTION_NONE); +} + +static OnigPosition +strscan_search(regex_t *reg, VALUE str, struct re_registers *regs, void *args_ptr) +{ + struct strscanner *p = (struct strscanner *)args_ptr; + + return onig_search(reg, + match_target(p), + (UChar *)(CURPTR(p) + S_RESTLEN(p)), + (UChar *)CURPTR(p), + (UChar *)(CURPTR(p) + S_RESTLEN(p)), + regs, + ONIG_OPTION_NONE); +} + +static void +strscan_enc_check(VALUE str1, VALUE str2) +{ + if (RB_ENCODING_GET(str1) != RB_ENCODING_GET(str2)) { + rb_enc_check(str1, str2); } +} + +static VALUE +strscan_do_scan(VALUE self, VALUE pattern, int succptr, int getstr, int headonly) +{ + struct strscanner *p; + GET_SCANNER(self, p); CLEAR_MATCH_STATUS(p); @@ -563,59 +748,42 @@ strscan_do_scan(VALUE self, VALUE pattern, int succptr, int getstr, int headonly } if (RB_TYPE_P(pattern, T_REGEXP)) { - 
regex_t *rb_reg_prepare_re(VALUE re, VALUE str); - regex_t *re; - long ret; - int tmpreg; - - p->regex = pattern; - re = rb_reg_prepare_re(pattern, p->str); - tmpreg = re != RREGEXP_PTR(pattern); - if (!tmpreg) RREGEXP(pattern)->usecnt++; - - if (headonly) { - ret = onig_match(re, - match_target(p), - (UChar* )(CURPTR(p) + S_RESTLEN(p)), - (UChar* )CURPTR(p), - &(p->regs), - ONIG_OPTION_NONE); - } - else { - ret = onig_search(re, - match_target(p), - (UChar* )(CURPTR(p) + S_RESTLEN(p)), - (UChar* )CURPTR(p), - (UChar* )(CURPTR(p) + S_RESTLEN(p)), - &(p->regs), - ONIG_OPTION_NONE); - } - if (!tmpreg) RREGEXP(pattern)->usecnt--; - if (tmpreg) { - if (RREGEXP(pattern)->usecnt) { - onig_free(re); - } - else { - onig_free(RREGEXP_PTR(pattern)); - RREGEXP_PTR(pattern) = re; - } - } - - if (ret == -2) rb_raise(ScanError, "regexp buffer overflow"); - if (ret < 0) { - /* not matched */ + OnigPosition ret; + RB_OBJ_WRITE(self, &p->regex, pattern); + ret = rb_reg_onig_match(p->regex, + p->str, + headonly ? 
strscan_match : strscan_search, + (void *)p, + &(p->regs)); + + if (ret == ONIG_MISMATCH) { return Qnil; } } else { - rb_enc_check(p->str, pattern); + StringValue(pattern); if (S_RESTLEN(p) < RSTRING_LEN(pattern)) { + strscan_enc_check(p->str, pattern); return Qnil; } - if (memcmp(CURPTR(p), RSTRING_PTR(pattern), RSTRING_LEN(pattern)) != 0) { - return Qnil; + + if (headonly) { + strscan_enc_check(p->str, pattern); + + if (memcmp(CURPTR(p), RSTRING_PTR(pattern), RSTRING_LEN(pattern)) != 0) { + return Qnil; + } + set_registers(p, 0, RSTRING_LEN(pattern)); + } + else { + rb_encoding *enc = rb_enc_check(p->str, pattern); + long pos = rb_memsearch(RSTRING_PTR(pattern), RSTRING_LEN(pattern), + CURPTR(p), S_RESTLEN(p), enc); + if (pos == -1) { + return Qnil; + } + set_registers(p, pos, RSTRING_LEN(pattern)); } - set_registers(p, RSTRING_LEN(pattern)); } MATCHED(p); @@ -636,20 +804,12 @@ strscan_do_scan(VALUE self, VALUE pattern, int succptr, int getstr, int headonly } /* - * call-seq: scan(pattern) => String - * - * Tries to match with +pattern+ at the current position. If there's a match, - * the scanner advances the "scan pointer" and returns the matched string. - * Otherwise, the scanner returns +nil+. - * - * s = StringScanner.new('test string') - * p s.scan(/\w+/) # -> "test" - * p s.scan(/\w+/) # -> nil - * p s.scan(/\s+/) # -> " " - * p s.scan("str") # -> "str" - * p s.scan(/\w+/) # -> "ing" - * p s.scan(/./) # -> nil + * :markup: markdown + * :call-seq: + * scan(pattern) -> substring or nil * + * :include: strscan/link_refs.txt + * :include: strscan/methods/scan.md */ static VALUE strscan_scan(VALUE self, VALUE re) @@ -658,16 +818,60 @@ strscan_scan(VALUE self, VALUE re) } /* - * call-seq: match?(pattern) + * :markup: markdown + * :include: strscan/link_refs.txt * - * Tests whether the given +pattern+ is matched from the current scan pointer. - * Returns the length of the match, or +nil+. The scan pointer is not advanced. 
+ * call-seq: + * match?(pattern) -> match_size or nil + * + * Attempts to [match][17] the given `pattern` + * at the beginning of the [target substring][3]; + * does not modify the [positions][11]. + * + * If the match succeeds: + * + * - Sets [match values][9]. + * - Returns the size in bytes of the matched substring. + * + * + * ```rb + * scanner = StringScanner.new('foobarbaz') + * scanner.pos = 3 + * scanner.match?(/bar/) # => 3 + * put_match_values(scanner) + * # Basic match values: + * # matched?: true + * # matched_size: 3 + * # pre_match: "foo" + * # matched : "bar" + * # post_match: "baz" + * # Captured match values: + * # size: 1 + * # captures: [] + * # named_captures: {} + * # values_at: ["bar", nil] + * # []: + * # [0]: "bar" + * # [1]: nil + * put_situation(scanner) + * # Situation: + * # pos: 3 + * # charpos: 3 + * # rest: "barbaz" + * # rest_size: 6 + * ``` + * + * If the match fails: + * + * - Clears match values. + * - Returns `nil`. + * - Does not increment positions. + * + * ```rb + * scanner.match?(/nope/) # => nil + * match_values_cleared?(scanner) # => true + * ``` * - * s = StringScanner.new('test string') - * p s.match?(/\w+/) # -> 4 - * p s.match?(/\w+/) # -> 4 - * p s.match?("test") # -> 4 - * p s.match?(/\s+/) # -> nil */ static VALUE strscan_match_p(VALUE self, VALUE re) @@ -676,22 +880,12 @@ strscan_match_p(VALUE self, VALUE re) } /* - * call-seq: skip(pattern) - * - * Attempts to skip over the given +pattern+ beginning with the scan pointer. - * If it matches, the scan pointer is advanced to the end of the match, and the - * length of the match is returned. Otherwise, +nil+ is returned. - * - * It's similar to #scan, but without returning the matched string. 
- * - * s = StringScanner.new('test string') - * p s.skip(/\w+/) # -> 4 - * p s.skip(/\w+/) # -> nil - * p s.skip(/\s+/) # -> 1 - * p s.skip("st") # -> 2 - * p s.skip(/\w+/) # -> 4 - * p s.skip(/./) # -> nil + * :markup: markdown + * call-seq: + * skip(pattern) -> match_size or nil * + * :include: strscan/link_refs.txt + * :include: strscan/methods/skip.md */ static VALUE strscan_skip(VALUE self, VALUE re) @@ -700,19 +894,59 @@ strscan_skip(VALUE self, VALUE re) } /* - * call-seq: check(pattern) - * - * This returns the value that #scan would return, without advancing the scan - * pointer. The match register is affected, though. + * :markup: markdown + * :include: strscan/link_refs.txt * - * s = StringScanner.new("Fri Dec 12 1975 14:39") - * s.check /Fri/ # -> "Fri" - * s.pos # -> 0 - * s.matched # -> "Fri" - * s.check /12/ # -> nil - * s.matched # -> nil + * call-seq: + * check(pattern) -> matched_substring or nil + * + * Attempts to [match][17] the given `pattern` + * at the beginning of the [target substring][3]; + * does not modify the [positions][11]. + * + * If the match succeeds: + * + * - Returns the matched substring. + * - Sets all [match values][9]. + * + * ```rb + * scanner = StringScanner.new('foobarbaz') + * scanner.pos = 3 + * scanner.check('bar') # => "bar" + * put_match_values(scanner) + * # Basic match values: + * # matched?: true + * # matched_size: 3 + * # pre_match: "foo" + * # matched : "bar" + * # post_match: "baz" + * # Captured match values: + * # size: 1 + * # captures: [] + * # named_captures: {} + * # values_at: ["bar", nil] + * # []: + * # [0]: "bar" + * # [1]: nil + * # => 0..1 + * put_situation(scanner) + * # Situation: + * # pos: 3 + * # charpos: 3 + * # rest: "barbaz" + * # rest_size: 6 + * ``` + * + * If the match fails: + * + * - Returns `nil`. + * - Clears all [match values][9]. 
+ * + * ```rb + * scanner.check(/nope/) # => nil + * match_values_cleared?(scanner) # => true + * ``` * - * Mnemonic: it "checks" to see whether a #scan will return a value. */ static VALUE strscan_check(VALUE self, VALUE re) @@ -721,15 +955,24 @@ strscan_check(VALUE self, VALUE re) } /* - * call-seq: scan_full(pattern, advance_pointer_p, return_string_p) + * call-seq: + * scan_full(pattern, advance_pointer_p, return_string_p) -> matched_substring or length or nil + * + * Equivalent to one of the following: + * + * - +advance_pointer_p+ +true+: * - * Tests whether the given +pattern+ is matched from the current scan pointer. - * Advances the scan pointer if +advance_pointer_p+ is true. - * Returns the matched string if +return_string_p+ is true. - * The match register is affected. + * - +return_string_p+ +true+: StringScanner#scan(pattern). + * - +return_string_p+ +false+: StringScanner#skip(pattern). + * + * - +advance_pointer_p+ +false+: + * + * - +return_string_p+ +true+: StringScanner#check(pattern). + * - +return_string_p+ +false+: StringScanner#match?(pattern). * - * "full" means "#scan with full parameters". */ + + /* :nodoc: */ static VALUE strscan_scan_full(VALUE self, VALUE re, VALUE s, VALUE f) { @@ -737,16 +980,12 @@ strscan_scan_full(VALUE self, VALUE re, VALUE s, VALUE f) } /* - * call-seq: scan_until(pattern) + * :markup: markdown + * :call-seq: + * scan_until(pattern) -> substring or nil * - * Scans the string _until_ the +pattern+ is matched. Returns the substring up - * to and including the end of the match, advancing the scan pointer to that - * location. If there is no match, +nil+ is returned. 
- * - * s = StringScanner.new("Fri Dec 12 1975 14:39") - * s.scan_until(/1/) # -> "Fri Dec 1" - * s.pre_match # -> "Fri Dec " - * s.scan_until(/XYZ/) # -> nil + * :include: strscan/link_refs.txt + * :include: strscan/methods/scan_until.md */ static VALUE strscan_scan_until(VALUE self, VALUE re) @@ -755,17 +994,61 @@ strscan_scan_until(VALUE self, VALUE re) } /* - * call-seq: exist?(pattern) + * :markup: markdown + * :include: strscan/link_refs.txt * - * Looks _ahead_ to see if the +pattern+ exists _anywhere_ in the string, - * without advancing the scan pointer. This predicates whether a #scan_until - * will return a value. + * call-seq: + * exist?(pattern) -> byte_offset or nil + * + * Attempts to [match][17] the given `pattern` + * anywhere (at any [position][2]) + * in the [target substring][3]; + * does not modify the [positions][11]. + * + * If the match succeeds: + * + * - Returns a byte offset: + * the distance in bytes between the current [position][2] + * and the end of the matched substring. + * - Sets all [match values][9]. + * + * ```rb + * scanner = StringScanner.new('foobarbazbatbam') + * scanner.pos = 6 + * scanner.exist?(/bat/) # => 6 + * put_match_values(scanner) + * # Basic match values: + * # matched?: true + * # matched_size: 3 + * # pre_match: "foobarbaz" + * # matched : "bat" + * # post_match: "bam" + * # Captured match values: + * # size: 1 + * # captures: [] + * # named_captures: {} + * # values_at: ["bat", nil] + * # []: + * # [0]: "bat" + * # [1]: nil + * put_situation(scanner) + * # Situation: + * # pos: 6 + * # charpos: 6 + * # rest: "bazbatbam" + * # rest_size: 9 + * ``` + * + * If the match fails: + * + * - Returns `nil`. + * - Clears all [match values][9]. + * + * ```rb + * scanner.exist?(/nope/) # => nil + * match_values_cleared?(scanner) # => true + * ``` * - * s = StringScanner.new('test string') - * s.exist? /s/ # -> 3 - * s.scan /test/ # -> "test" - * s.exist? /s/ # -> 2 - * s.exist? 
/e/ # -> nil */ static VALUE strscan_exist_p(VALUE self, VALUE re) @@ -774,20 +1057,12 @@ strscan_exist_p(VALUE self, VALUE re) } /* - * call-seq: skip_until(pattern) - * - * Advances the scan pointer until +pattern+ is matched and consumed. Returns - * the number of bytes advanced, or +nil+ if no match was found. - * - * Look ahead to match +pattern+, and advance the scan pointer to the _end_ - * of the match. Return the number of characters advanced, or +nil+ if the - * match was unsuccessful. - * - * It's similar to #scan_until, but without returning the intervening string. + * :markup: markdown + * :call-seq: + * skip_until(pattern) -> matched_substring_size or nil * - * s = StringScanner.new("Fri Dec 12 1975 14:39") - * s.skip_until /12/ # -> 10 - * s # + * :include: strscan/link_refs.txt + * :include: strscan/methods/skip_until.md */ static VALUE strscan_skip_until(VALUE self, VALUE re) @@ -796,17 +1071,61 @@ strscan_skip_until(VALUE self, VALUE re) } /* - * call-seq: check_until(pattern) + * :markup: markdown + * :include: strscan/link_refs.txt * - * This returns the value that #scan_until would return, without advancing the - * scan pointer. The match register is affected, though. - * - * s = StringScanner.new("Fri Dec 12 1975 14:39") - * s.check_until /12/ # -> "Fri Dec 12" - * s.pos # -> 0 - * s.matched # -> 12 + * call-seq: + * check_until(pattern) -> substring or nil + * + * Attempts to [match][17] the given `pattern` + * anywhere (at any [position][2]) + * in the [target substring][3]; + * does not modify the [positions][11]. + * + * If the match succeeds: + * + * - Sets all [match values][9]. + * - Returns the matched substring, + * which extends from the current [position][2] + * to the end of the matched substring. 
+ * + * ```rb + * scanner = StringScanner.new('foobarbazbatbam') + * scanner.pos = 6 + * scanner.check_until(/bat/) # => "bazbat" + * put_match_values(scanner) + * # Basic match values: + * # matched?: true + * # matched_size: 3 + * # pre_match: "foobarbaz" + * # matched : "bat" + * # post_match: "bam" + * # Captured match values: + * # size: 1 + * # captures: [] + * # named_captures: {} + * # values_at: ["bat", nil] + * # []: + * # [0]: "bat" + * # [1]: nil + * put_situation(scanner) + * # Situation: + * # pos: 6 + * # charpos: 6 + * # rest: "bazbatbam" + * # rest_size: 9 + * ``` + * + * If the match fails: + * + * - Clears all [match values][9]. + * - Returns `nil`. + * + * ```rb + * scanner.check_until(/nope/) # => nil + * match_values_cleared?(scanner) # => true + * ``` * - * Mnemonic: it "checks" to see whether a #scan_until will return a value. */ static VALUE strscan_check_until(VALUE self, VALUE re) @@ -815,14 +1134,24 @@ strscan_check_until(VALUE self, VALUE re) } /* - * call-seq: search_full(pattern, advance_pointer_p, return_string_p) + * call-seq: + * search_full(pattern, advance_pointer_p, return_string_p) -> matched_substring or position_delta or nil + * + * Equivalent to one of the following: + * + * - +advance_pointer_p+ +true+: + * + * - +return_string_p+ +true+: StringScanner#scan_until(pattern). + * - +return_string_p+ +false+: StringScanner#skip_until(pattern). + * + * - +advance_pointer_p+ +false+: + * + * - +return_string_p+ +true+: StringScanner#check_until(pattern). + * - +return_string_p+ +false+: StringScanner#exist?(pattern). * - * Scans the string _until_ the +pattern+ is matched. - * Advances the scan pointer if +advance_pointer_p+, otherwise not. - * Returns the matched string if +return_string_p+ is true, otherwise - * returns the number of bytes advanced. - * This method does affect the match register. 
*/ + + /* :nodoc: */ static VALUE strscan_search_full(VALUE self, VALUE re, VALUE s, VALUE f) { @@ -842,17 +1171,12 @@ adjust_registers_to_matched(struct strscanner *p) } /* - * Scans one character and returns it. - * This method is multibyte character sensitive. + * :markup: markdown + * :call-seq: + * getch -> character or nil * - * s = StringScanner.new("ab") - * s.getch # => "a" - * s.getch # => "b" - * s.getch # => nil - * - * s = StringScanner.new("\244\242".force_encoding("euc-jp")) - * s.getch # => "\x{A4A2}" # Japanese hira-kana "A" in EUC-JP - * s.getch # => nil + * :include: strscan/link_refs.txt + * :include: strscan/methods/getch.md */ static VALUE strscan_getch(VALUE self) @@ -877,59 +1201,95 @@ strscan_getch(VALUE self) } /* - * Scans one byte and returns it. + * call-seq: + * scan_byte -> integer_byte or nil + * + * Scans one byte and returns it as an integer. * This method is not multibyte character sensitive. * See also: #getch. * - * s = StringScanner.new('ab') - * s.get_byte # => "a" - * s.get_byte # => "b" - * s.get_byte # => nil - * - * s = StringScanner.new("\244\242".force_encoding("euc-jp")) - * s.get_byte # => "\xA4" - * s.get_byte # => "\xA2" - * s.get_byte # => nil */ static VALUE -strscan_get_byte(VALUE self) +strscan_scan_byte(VALUE self) { struct strscanner *p; + VALUE byte; GET_SCANNER(self, p); CLEAR_MATCH_STATUS(p); if (EOS_P(p)) return Qnil; + byte = INT2FIX((unsigned char)*CURPTR(p)); p->prev = p->curr; p->curr++; MATCHED(p); adjust_registers_to_matched(p); - return extract_range(p, - adjust_register_position(p, p->regs.beg[0]), - adjust_register_position(p, p->regs.end[0])); + return byte; +} + +/* + * Peeks at the current byte and returns it as an integer. + * + * s = StringScanner.new('ab') + * s.peek_byte # => 97 + */ +static VALUE +strscan_peek_byte(VALUE self) +{ + struct strscanner *p; + + GET_SCANNER(self, p); + if (EOS_P(p)) + return Qnil; + + return INT2FIX((unsigned char)*CURPTR(p)); } /* - * Equivalent to #get_byte. 
- * This method is obsolete; use #get_byte instead. + * :markup: markdown + * :call-seq: + * get_byte -> byte_as_character or nil + * + * :include: strscan/link_refs.txt + * :include: strscan/methods/get_byte.md */ static VALUE -strscan_getbyte(VALUE self) +strscan_get_byte(VALUE self) { - rb_warning("StringScanner#getbyte is obsolete; use #get_byte instead"); - return strscan_get_byte(self); + struct strscanner *p; + + GET_SCANNER(self, p); + CLEAR_MATCH_STATUS(p); + if (EOS_P(p)) + return Qnil; + + p->prev = p->curr; + p->curr++; + MATCHED(p); + adjust_registers_to_matched(p); + return extract_range(p, + adjust_register_position(p, p->regs.beg[0]), + adjust_register_position(p, p->regs.end[0])); } /* - * call-seq: peek(len) + * :markup: markdown + * :include: strscan/link_refs.txt * - * Extracts a string corresponding to <tt>string[pos,len]</tt>, without - * advancing the scan pointer. + * call-seq: + * peek(length) -> substring * - * s = StringScanner.new('test string') - * s.peek(7) # => "test st" - * s.peek(7) # => "test st" + * Returns the substring `string[pos, length]`; + * does not update [match values][9] or [positions][11]: + * + * ```rb + * scanner = StringScanner.new('foobarbaz') + * scanner.pos = 3 + * scanner.peek(3) # => "bar" + * scanner.terminate + * scanner.peek(3) # => "" + * ``` * */ static VALUE @@ -948,27 +1308,161 @@ strscan_peek(VALUE self, VALUE vlen) return extract_beg_len(p, p->curr, len); } -/* - * Equivalent to #peek. - * This method is obsolete; use #peek instead. 
- */ static VALUE -strscan_peep(VALUE self, VALUE vlen) +strscan_parse_integer(struct strscanner *p, int base, long len) +{ + VALUE buffer_v, integer; + + char *buffer = RB_ALLOCV_N(char, buffer_v, len + 1); + + MEMCPY(buffer, CURPTR(p), char, len); + buffer[len] = '\0'; + integer = rb_cstr2inum(buffer, base); + RB_ALLOCV_END(buffer_v); + p->curr += len; + + MATCHED(p); + adjust_registers_to_matched(p); + + return integer; +} + +static inline bool +strscan_ascii_compat_fastpath(VALUE str) +{ + int encindex = ENCODING_GET_INLINED(str); + /* The overwhelming majority of strings are in one of these 3 encodings. */ + return encindex == utf8_encindex || encindex == binary_encindex || encindex == usascii_encindex; +} + +static inline void +strscan_must_ascii_compat(VALUE str) { - rb_warning("StringScanner#peep is obsolete; use #peek instead"); - return strscan_peek(self, vlen); + /* The overwhelming majority of strings are in one of these 3 encodings. */ + if (RB_LIKELY(strscan_ascii_compat_fastpath(str))) { + return; + } + + rb_must_asciicompat(str); +} + +/* :nodoc: */ +static VALUE +strscan_scan_base10_integer(VALUE self) +{ + char *ptr; + long len = 0, remaining_len; + struct strscanner *p; + + GET_SCANNER(self, p); + CLEAR_MATCH_STATUS(p); + + strscan_must_ascii_compat(p->str); + + ptr = CURPTR(p); + + remaining_len = S_RESTLEN(p); + + if (remaining_len <= 0) { + return Qnil; + } + + if (ptr[len] == '-' || ptr[len] == '+') { + len++; + } + + if (!rb_isdigit(ptr[len])) { + return Qnil; + } + + p->prev = p->curr; + + while (len < remaining_len && rb_isdigit(ptr[len])) { + len++; + } + + return strscan_parse_integer(p, 10, len); +} + +/* :nodoc: */ +static VALUE +strscan_scan_base16_integer(VALUE self) +{ + char *ptr; + long len = 0, remaining_len; + struct strscanner *p; + + GET_SCANNER(self, p); + CLEAR_MATCH_STATUS(p); + + strscan_must_ascii_compat(p->str); + + ptr = CURPTR(p); + + remaining_len = S_RESTLEN(p); + + if (remaining_len <= 0) { + return Qnil; + } + + if 
(ptr[len] == '-' || ptr[len] == '+') { + len++; + } + + if ((remaining_len >= (len + 3)) && ptr[len] == '0' && ptr[len + 1] == 'x' && rb_isxdigit(ptr[len + 2])) { + len += 2; + } + + if (len >= remaining_len || !rb_isxdigit(ptr[len])) { + return Qnil; + } + + p->prev = p->curr; + + while (len < remaining_len && rb_isxdigit(ptr[len])) { + len++; + } + + return strscan_parse_integer(p, 16, len); } /* - * Sets the scan pointer to the previous position. Only one previous position is - * remembered, and it changes with each scanning operation. + * :markup: markdown + * :include: strscan/link_refs.txt + * + * call-seq: + * unscan -> self + * + * Sets the [position][2] to its value previous to the recent successful + * [match][17] attempt: + * + * ```rb + * scanner = StringScanner.new('foobarbaz') + * scanner.scan(/foo/) + * put_situation(scanner) + * # Situation: + * # pos: 3 + * # charpos: 3 + * # rest: "barbaz" + * # rest_size: 6 + * scanner.unscan + * # => #<StringScanner 0/9 @ "fooba..."> + * put_situation(scanner) + * # Situation: + * # pos: 0 + * # charpos: 0 + * # rest: "foobarbaz" + * # rest_size: 9 + * ``` + * + * Raises an exception if match values are clear: + * + * ```rb + * scanner.scan(/nope/) # => nil + * match_values_cleared?(scanner) # => true + * scanner.unscan # Raises StringScanner::Error. + * ``` * - * s = StringScanner.new('test string') - * s.scan(/\w+/) # => "test" - * s.unscan - * s.scan(/../) # => "te" - * s.scan(/\d/) # => nil - * s.unscan # ScanError: unscan failed: previous match record not exist */ static VALUE strscan_unscan(VALUE self) @@ -984,16 +1478,37 @@ strscan_unscan(VALUE self) } /* - * Returns +true+ iff the scan pointer is at the beginning of the line. - * - * s = StringScanner.new("test\ntest\n") - * s.bol? # => true - * s.scan(/te/) - * s.bol? # => false - * s.scan(/st\n/) - * s.bol? # => true - * s.terminate - * s.bol? 
# => true + * + * :markup: markdown + * :include: strscan/link_refs.txt + * + * call-seq: + * beginning_of_line? -> true or false + * + * Returns whether the [position][2] is at the beginning of a line; + * that is, at the beginning of the [stored string][1] + * or immediately after a newline: + * + * scanner = StringScanner.new(MULTILINE_TEXT) + * scanner.string + * # => "Go placidly amid the noise and haste,\nand remember what peace there may be in silence.\n" + * scanner.pos # => 0 + * scanner.beginning_of_line? # => true + * + * scanner.scan_until(/,/) # => "Go placidly amid the noise and haste," + * scanner.beginning_of_line? # => false + * + * scanner.scan(/\n/) # => "\n" + * scanner.beginning_of_line? # => true + * + * scanner.terminate + * scanner.beginning_of_line? # => true + * + * scanner.concat('x') + * scanner.terminate + * scanner.beginning_of_line? # => false + * + * StringScanner#bol? is an alias for StringScanner#beginning_of_line?. */ static VALUE strscan_bol_p(VALUE self) @@ -1007,14 +1522,24 @@ strscan_bol_p(VALUE self) } /* - * Returns +true+ if the scan pointer is at the end of the string. + * :markup: markdown + * :include: strscan/link_refs.txt + * + * call-seq: + * eos? -> true or false + * + * Returns whether the [position][2] + * is at the end of the [stored string][1]: + * + * ```rb + * scanner = StringScanner.new('foobarbaz') + * scanner.eos? # => false + * scanner.pos = 3 + * scanner.eos? # => false + * scanner.terminate + * scanner.eos? # => true + * ``` * - * s = StringScanner.new('test string') - * p s.eos? # => false - * s.scan(/test/) - * p s.eos? # => false - * s.terminate - * p s.eos? # => true */ static VALUE strscan_eos_p(VALUE self) @@ -1026,24 +1551,18 @@ strscan_eos_p(VALUE self) } /* - * Equivalent to #eos?. - * This method is obsolete, use #eos? instead. - */ -static VALUE -strscan_empty_p(VALUE self) -{ - rb_warning("StringScanner#empty? is obsolete; use #eos? 
instead"); - return strscan_eos_p(self); -} - -/* - * Returns true iff there is more data in the string. See #eos?. - * This method is obsolete; use #eos? instead. + * call-seq: + * rest? + * + * Returns true if and only if there is more data in the string. See #eos?. * * s = StringScanner.new('test string') - * s.eos? # These two - * s.rest? # are opposites. + * # These two are opposites + * s.eos? # => false + * s.rest? # => true */ + + /* :nodoc: */ static VALUE strscan_rest_p(VALUE self) { @@ -1054,13 +1573,26 @@ strscan_rest_p(VALUE self) } /* - * Returns +true+ iff the last match was successful. + * :markup: markdown + * :include: strscan/link_refs.txt + * + * call-seq: + * matched? -> true or false + * + * Returns `true` if the most recent [match attempt][17] was successful, + * `false` otherwise; + * see [Basic Matched Values][18]: + * + * ```rb + * scanner = StringScanner.new('foobarbaz') + * scanner.matched? # => false + * scanner.pos = 3 + * scanner.exist?(/baz/) # => 6 + * scanner.matched? # => true + * scanner.exist?(/nope/) # => nil + * scanner.matched? # => false + * ``` * - * s = StringScanner.new('test string') - * s.match?(/\w+/) # => 4 - * s.matched? # => true - * s.match?(/\d+/) # => nil - * s.matched? # => false */ static VALUE strscan_matched_p(VALUE self) @@ -1072,11 +1604,27 @@ strscan_matched_p(VALUE self) } /* - * Returns the last matched string. 
+ * :markup: markdown + * :include: strscan/link_refs.txt + * + * call-seq: + * matched -> matched_substring or nil + * + * Returns the matched substring from the most recent [match][17] attempt + * if it was successful, + * or `nil` otherwise; + * see [Basic Matched Values][18]: + * + * ```rb + * scanner = StringScanner.new('foobarbaz') + * scanner.matched # => nil + * scanner.pos = 3 + * scanner.match?(/bar/) # => 3 + * scanner.matched # => "bar" + * scanner.match?(/nope/) # => nil + * scanner.matched # => nil + * ``` * - * s = StringScanner.new('test string') - * s.match?(/\w+/) # -> 4 - * s.matched # -> "test" */ static VALUE strscan_matched(VALUE self) @@ -1091,15 +1639,29 @@ strscan_matched(VALUE self) } /* - * Returns the size of the most recent match in bytes, or +nil+ if there - * was no recent match. This is different than <tt>matched.size</tt>, - * which will return the size in characters. + * :markup: markdown + * :include: strscan/link_refs.txt + * + * call-seq: + * matched_size -> substring_size or nil + * + * Returns the size (in bytes) of the matched substring + * from the most recent [match attempt][17] if it was successful, + * or `nil` otherwise; + * see [Basic Matched Values][18]: + * + * ```rb + * scanner = StringScanner.new('foobarbaz') + * scanner.matched_size # => nil + * + * scanner.pos = 3 + * scanner.exist?(/baz/) # => 6 + * scanner.matched_size # => 3 + * + * scanner.exist?(/nope/) # => nil + * scanner.matched_size # => nil + * ``` * - * s = StringScanner.new('test string') - * s.check /\w+/ # -> "test" - * s.matched_size # -> 4 - * s.check /\d+/ # -> nil - * s.matched_size # -> nil */ static VALUE strscan_matched_size(VALUE self) @@ -1114,46 +1676,89 @@ strscan_matched_size(VALUE self) static int name_to_backref_number(struct re_registers *regs, VALUE regexp, const char* name, const char* name_end, rb_encoding *enc) { - int num; - - num = onig_name_to_backref_number(RREGEXP_PTR(regexp), - (const unsigned char* )name, (const unsigned char* 
)name_end, regs); - if (num >= 1) { - return num; - } - else { - rb_enc_raise(enc, rb_eIndexError, "undefined group name reference: %.*s", - rb_long2int(name_end - name), name); + if (RTEST(regexp)) { + int num = onig_name_to_backref_number(RREGEXP_PTR(regexp), + (const unsigned char* )name, + (const unsigned char* )name_end, + regs); + if (num >= 1) { + return num; + } } - - UNREACHABLE; + rb_enc_raise(enc, rb_eIndexError, "undefined group name reference: %.*s", + rb_long2int(name_end - name), name); } /* - * call-seq: [](n) - * - * Returns the n-th subgroup in the most recent match. - * - * s = StringScanner.new("Fri Dec 12 1975 14:39") - * s.scan(/(\w+) (\w+) (\d+) /) # -> "Fri Dec 12 " - * s[0] # -> "Fri Dec 12 " - * s[1] # -> "Fri" - * s[2] # -> "Dec" - * s[3] # -> "12" - * s.post_match # -> "1975 14:39" - * s.pre_match # -> "" - * - * s.reset - * s.scan(/(?<wday>\w+) (?<month>\w+) (?<day>\d+) /) # -> "Fri Dec 12 " - * s[0] # -> "Fri Dec 12 " - * s[1] # -> "Fri" - * s[2] # -> "Dec" - * s[3] # -> "12" - * s[:wday] # -> "Fri" - * s[:month] # -> "Dec" - * s[:day] # -> "12" - * s.post_match # -> "1975 14:39" - * s.pre_match # -> "" + * + * :markup: markdown + * :include: strscan/link_refs.txt + * + * call-seq: + * [](specifier) -> substring or nil + * + * Returns a captured substring or `nil`; + * see [Captured Match Values][13]. + * + * When there are captures: + * + * ```rb + * scanner = StringScanner.new('Fri Dec 12 1975 14:39') + * scanner.scan(/(?<wday>\w+) (?<month>\w+) (?<day>\d+) /) + * ``` + * + * - `specifier` zero: returns the entire matched substring: + * + * ```rb + * scanner[0] # => "Fri Dec 12 " + * scanner.pre_match # => "" + * scanner.post_match # => "1975 14:39" + * ``` + * + * - `specifier` positive integer. returns the `n`th capture, or `nil` if out of range: + * + * ```rb + * scanner[1] # => "Fri" + * scanner[2] # => "Dec" + * scanner[3] # => "12" + * scanner[4] # => nil + * ``` + * + * - `specifier` negative integer. 
counts backward from the last subgroup: + * + * ```rb + * scanner[-1] # => "12" + * scanner[-4] # => "Fri Dec 12 " + * scanner[-5] # => nil + * ``` + * + * - `specifier` symbol or string. returns the named subgroup, or `nil` if no such: + * + * ```rb + * scanner[:wday] # => "Fri" + * scanner['wday'] # => "Fri" + * scanner[:month] # => "Dec" + * scanner[:day] # => "12" + * scanner[:nope] # => nil + * ``` + * + * When there are no captures, only `[0]` returns non-`nil`: + * + * ```rb + * scanner = StringScanner.new('foobarbaz') + * scanner.exist?(/bar/) + * scanner[0] # => "bar" + * scanner[1] # => nil + * ``` + * + * For a failed match, even `[0]` returns `nil`: + * + * ```rb + * scanner.scan(/nope/) # => nil + * scanner[0] # => nil + * scanner[1] # => nil + * ``` + * */ static VALUE strscan_aref(VALUE self, VALUE idx) @@ -1170,7 +1775,6 @@ strscan_aref(VALUE self, VALUE idx) idx = rb_sym2str(idx); /* fall through */ case T_STRING: - if (!RTEST(p->regex)) return Qnil; RSTRING_GETMEM(idx, name, i); i = name_to_backref_number(&(p->regs), p->regex, name, name + i, rb_enc_get(idx)); break; @@ -1190,14 +1794,28 @@ strscan_aref(VALUE self, VALUE idx) } /* - * call-seq: size + * :markup: markdown + * :include: strscan/link_refs.txt + * + * call-seq: + * size -> captures_count * - * Returns the amount of subgroups in the most recent match. - * The full match counts as a subgroup. 
+ * Returns the count of captures if the most recent match attempt succeeded, `nil` otherwise; + * see [Captured Match Values][13]: + * + * ```rb + * scanner = StringScanner.new('Fri Dec 12 1975 14:39') + * scanner.size # => nil + * + * pattern = /(?<wday>\w+) (?<month>\w+) (?<day>\d+) / + * scanner.match?(pattern) + * scanner.values_at(*0..scanner.size) # => ["Fri Dec 12 ", "Fri", "Dec", "12", nil] + * scanner.size # => 4 + * + * scanner.match?(/nope/) # => nil + * scanner.size # => nil + * ``` * - * s = StringScanner.new("Fri Dec 12 1975 14:39") - * s.scan(/(\w+) (\w+) (\d+) /) # -> "Fri Dec 12 " - * s.size # -> 4 */ static VALUE strscan_size(VALUE self) @@ -1210,16 +1828,30 @@ strscan_size(VALUE self) } /* - * call-seq: captures + * :markup: markdown + * :include: strscan/link_refs.txt + * + * call-seq: + * captures -> substring_array or nil + * + * Returns the array of [captured match values][13] at indexes `(1..)` + * if the most recent match attempt succeeded, or `nil` otherwise: * - * Returns the subgroups in the most recent match (not including the full match). - * If nothing was priorly matched, it returns nil. 
+ * ```rb + * scanner = StringScanner.new('Fri Dec 12 1975 14:39') + * scanner.captures # => nil + * + * scanner.exist?(/(?<wday>\w+) (?<month>\w+) (?<day>\d+) /) + * scanner.captures # => ["Fri", "Dec", "12"] + * scanner.values_at(*0..4) # => ["Fri Dec 12 ", "Fri", "Dec", "12", nil] + * + * scanner.exist?(/Fri/) + * scanner.captures # => [] + * + * scanner.scan(/nope/) + * scanner.captures # => nil + * ``` * - * s = StringScanner.new("Fri Dec 12 1975 14:39") - * s.scan(/(\w+) (\w+) (\d+) /) # -> "Fri Dec 12 " - * s.captures # -> ["Fri", "Dec", "12"] - * s.scan(/(\w+) (\w+) (\d+) /) # -> nil - * s.captures # -> nil */ static VALUE strscan_captures(VALUE self) @@ -1235,9 +1867,13 @@ strscan_captures(VALUE self) new_ary = rb_ary_new2(num_regs); for (i = 1; i < num_regs; i++) { - VALUE str = extract_range(p, - adjust_register_position(p, p->regs.beg[i]), - adjust_register_position(p, p->regs.end[i])); + VALUE str; + if (p->regs.beg[i] == -1) + str = Qnil; + else + str = extract_range(p, + adjust_register_position(p, p->regs.beg[i]), + adjust_register_position(p, p->regs.end[i])); rb_ary_push(new_ary, str); } @@ -1245,17 +1881,25 @@ strscan_captures(VALUE self) } /* - * call-seq: - * scanner.values_at( i1, i2, ... iN ) -> an_array + * :markup: markdown + * :include: strscan/link_refs.txt + * + * call-seq: + * values_at(*specifiers) -> array_of_captures or nil + * + * Returns an array of captured substrings, or `nil` if none. * - * Returns the subgroups in the most recent match at the given indices. - * If nothing was priorly matched, it returns nil. + * For each `specifier`, the returned substring is `[specifier]`; + * see #[]. 
+ * + * ```rb + * scanner = StringScanner.new('Fri Dec 12 1975 14:39') + * pattern = /(?<wday>\w+) (?<month>\w+) (?<day>\d+) / + * scanner.match?(pattern) + * scanner.values_at(*0..3) # => ["Fri Dec 12 ", "Fri", "Dec", "12"] + * scanner.values_at(*%i[wday month day]) # => ["Fri", "Dec", "12"] + * ``` * - * s = StringScanner.new("Fri Dec 12 1975 14:39") - * s.scan(/(\w+) (\w+) (\d+) /) # -> "Fri Dec 12 " - * s.values_at 0, -1, 5, 2 # -> ["Fri Dec 12 ", "12", nil, "Dec"] - * s.scan(/(\w+) (\w+) (\d+) /) # -> nil - * s.values_at 0, -1, 5, 2 # -> nil */ static VALUE @@ -1277,13 +1921,29 @@ strscan_values_at(int argc, VALUE *argv, VALUE self) } /* - * Returns the <i><b>pre</b>-match</i> (in the regular expression sense) of the last scan. + * :markup: markdown + * :include: strscan/link_refs.txt + * + * call-seq: + * pre_match -> substring + * + * Returns the substring that precedes the matched substring + * from the most recent match attempt if it was successful, + * or `nil` otherwise; + * see [Basic Match Values][18]: + * + * ```rb + * scanner = StringScanner.new('foobarbaz') + * scanner.pre_match # => nil + * + * scanner.pos = 3 + * scanner.exist?(/baz/) # => 6 + * scanner.pre_match # => "foobar" # Substring of entire string, not just target string. + * + * scanner.exist?(/nope/) # => nil + * scanner.pre_match # => nil + * ``` * - * s = StringScanner.new('test string') - * s.scan(/\w+/) # -> "test" - * s.scan(/\s+/) # -> " " - * s.pre_match # -> "test" - * s.post_match # -> "string" */ static VALUE strscan_pre_match(VALUE self) @@ -1298,13 +1958,29 @@ strscan_pre_match(VALUE self) } /* - * Returns the <i><b>post</b>-match</i> (in the regular expression sense) of the last scan. 
+ * :markup: markdown + * :include: strscan/link_refs.txt + * + * call-seq: + * post_match -> substring + * + * Returns the substring that follows the matched substring + * from the most recent match attempt if it was successful, + * or `nil` otherwise; + * see [Basic Match Values][18]: + * + * ```rb + * scanner = StringScanner.new('foobarbaz') + * scanner.post_match # => nil + * + * scanner.pos = 3 + * scanner.match?(/bar/) # => 3 + * scanner.post_match # => "baz" + * + * scanner.match?(/nope/) # => nil + * scanner.post_match # => nil + * ``` * - * s = StringScanner.new('test string') - * s.scan(/\w+/) # -> "test" - * s.scan(/\s+/) # -> " " - * s.pre_match # -> "test" - * s.post_match # -> "string" */ static VALUE strscan_post_match(VALUE self) @@ -1319,8 +1995,24 @@ strscan_post_match(VALUE self) } /* - * Returns the "rest" of the string (i.e. everything after the scan pointer). - * If there is no more data (eos? = true), it returns <tt>""</tt>. + * :markup: markdown + * :include: strscan/link_refs.txt + * + * call-seq: + * rest -> target_substring + * + * Returns the 'rest' of the [stored string][1] (all after the current [position][2]), + * which is the [target substring][3]: + * + * ```rb + * scanner = StringScanner.new('foobarbaz') + * scanner.rest # => "foobarbaz" + * scanner.pos = 3 + * scanner.rest # => "barbaz" + * scanner.terminate + * scanner.rest # => "" + * ``` + * */ static VALUE strscan_rest(VALUE self) @@ -1335,7 +2027,26 @@ strscan_rest(VALUE self) } /* - * <tt>s.rest_size</tt> is equivalent to <tt>s.rest.size</tt>. 
+ * :markup: markdown + * :include: strscan/link_refs.txt + * + * call-seq: + * rest_size -> integer + * + * Returns the size (in bytes) of the #rest of the [stored string][1]: + * + * ```rb + * scanner = StringScanner.new('foobarbaz') + * scanner.rest # => "foobarbaz" + * scanner.rest_size # => 9 + * scanner.pos = 3 + * scanner.rest # => "barbaz" + * scanner.rest_size # => 6 + * scanner.terminate + * scanner.rest # => "" + * scanner.rest_size # => 0 + * ``` + * */ static VALUE strscan_rest_size(VALUE self) @@ -1351,29 +2062,42 @@ strscan_rest_size(VALUE self) return INT2FIX(i); } -/* - * <tt>s.restsize</tt> is equivalent to <tt>s.rest_size</tt>. - * This method is obsolete; use #rest_size instead. - */ -static VALUE -strscan_restsize(VALUE self) -{ - rb_warning("StringScanner#restsize is obsolete; use #rest_size instead"); - return strscan_rest_size(self); -} - #define INSPECT_LENGTH 5 /* - * Returns a string that represents the StringScanner object, showing: - * - the current position - * - the size of the string - * - the characters surrounding the scan pointer - * - * s = StringScanner.new("Fri Dec 12 1975 14:39") - * s.inspect # -> '#<StringScanner 0/21 @ "Fri D...">' - * s.scan_until /12/ # -> "Fri Dec 12" - * s.inspect # -> '#<StringScanner 10/21 "...ec 12" @ " 1975...">' + * :markup: markdown + * :include: strscan/link_refs.txt + * + * call-seq: + * inspect -> string + * + * Returns a string representation of `self` that may show: + * + * 1. The current [position][2]. + * 2. The size (in bytes) of the [stored string][1]. + * 3. The substring preceding the current position. + * 4. The substring following the current position (which is also the [target substring][3]). 
+ * + * ```rb + * scanner = StringScanner.new("Fri Dec 12 1975 14:39") + * scanner.pos = 11 + * scanner.inspect # => "#<StringScanner 11/21 \"...c 12 \" @ \"1975 ...\">" + * ``` + * + * If at beginning-of-string, item 3 above (preceding substring) is omitted: + * + * ```rb + * scanner.reset + * scanner.inspect # => "#<StringScanner 0/21 @ \"Fri D...\">" + * ``` + * + * If at end-of-string, all items above are omitted: + * + * ```rb + * scanner.terminate + * scanner.inspect # => "#<StringScanner fin>" + * ``` + * */ static VALUE strscan_inspect(VALUE self) @@ -1445,13 +2169,13 @@ inspect2(struct strscanner *p) } /* - * call-seq: - * scanner.fixed_anchor? -> true or false + * :markup: markdown + * :include: strscan/link_refs.txt * - * Whether +scanner+ uses fixed anchor mode or not. + * call-seq: + * fixed_anchor? -> true or false * - * If fixed anchor mode is used, +\A+ always matches the beginning of - * the string. Otherwise, +\A+ always matches the current position. + * Returns whether the [fixed-anchor property][10] is set. */ static VALUE strscan_fixed_anchor_p(VALUE self) @@ -1461,112 +2185,96 @@ strscan_fixed_anchor_p(VALUE self) return p->fixed_anchor_p ? 
Qtrue : Qfalse; } +typedef struct { + VALUE self; + VALUE captures; +} named_captures_data; + +static int +named_captures_iter(const OnigUChar *name, + const OnigUChar *name_end, + int back_num, + int *back_refs, + OnigRegex regex, + void *arg) +{ + named_captures_data *data = arg; + + VALUE key = rb_str_new((const char *)name, name_end - name); + VALUE value = RUBY_Qnil; + int i; + for (i = 0; i < back_num; i++) { + VALUE v = strscan_aref(data->self, INT2NUM(back_refs[i])); + if (!RB_NIL_P(v)) { + value = v; + } + } + rb_hash_aset(data->captures, key, value); + return 0; +} + +/* + * :markup: markdown + * :include: strscan/link_refs.txt + * + * call-seq: + * named_captures -> hash + * + * Returns a hash of named captures for the most recent regexp match, + * or an empty hash if there are no named captures; + * see [Captured Match Values][13]: + * + * ```rb + * scanner = StringScanner.new('Fri Dec 12 1975 14:39') + * scanner.named_captures # => {} + * + * pattern = /(?<wday>\w+) (?<month>\w+) (?<day>\d+) / + * scanner.match?(pattern) + * scanner.named_captures # => {"wday"=>"Fri", "month"=>"Dec", "day"=>"12"} + * + * scanner.string = 'nope' + * scanner.match?(pattern) + * scanner.named_captures # => {"wday"=>nil, "month"=>nil, "day"=>nil} + * + * scanner.match?(/nosuch/) + * scanner.named_captures # => {} + * ``` + * + */ +static VALUE +strscan_named_captures(VALUE self) +{ + struct strscanner *p; + named_captures_data data; + GET_SCANNER(self, p); + data.self = self; + data.captures = rb_hash_new(); + if (!RB_NIL_P(p->regex)) { + onig_foreach_name(RREGEXP_PTR(p->regex), named_captures_iter, &data); + } + + return data.captures; +} + /* ======================================================================= Ruby Interface ======================================================================= */ /* + * Document-class: StringScanner::Error + * + * The error class for StringScanner. + * See StringScanner#unscan. 
+ */ + +/* * Document-class: StringScanner * - * StringScanner provides for lexical scanning operations on a String. Here is - * an example of its usage: - * - * s = StringScanner.new('This is an example string') - * s.eos? # -> false - * - * p s.scan(/\w+/) # -> "This" - * p s.scan(/\w+/) # -> nil - * p s.scan(/\s+/) # -> " " - * p s.scan(/\s+/) # -> nil - * p s.scan(/\w+/) # -> "is" - * s.eos? # -> false - * - * p s.scan(/\s+/) # -> " " - * p s.scan(/\w+/) # -> "an" - * p s.scan(/\s+/) # -> " " - * p s.scan(/\w+/) # -> "example" - * p s.scan(/\s+/) # -> " " - * p s.scan(/\w+/) # -> "string" - * s.eos? # -> true - * - * p s.scan(/\s+/) # -> nil - * p s.scan(/\w+/) # -> nil - * - * Scanning a string means remembering the position of a <i>scan pointer</i>, - * which is just an index. The point of scanning is to move forward a bit at - * a time, so matches are sought after the scan pointer; usually immediately - * after it. - * - * Given the string "test string", here are the pertinent scan pointer - * positions: - * - * t e s t s t r i n g - * 0 1 2 ... 1 - * 0 - * - * When you #scan for a pattern (a regular expression), the match must occur - * at the character after the scan pointer. If you use #scan_until, then the - * match can occur anywhere after the scan pointer. In both cases, the scan - * pointer moves <i>just beyond</i> the last character of the match, ready to - * scan again from the next character onwards. This is demonstrated by the - * example above. - * - * == Method Categories - * - * There are other methods besides the plain scanners. You can look ahead in - * the string without actually scanning. You can access the most recent match. - * You can modify the string being scanned, reset or terminate the scanner, - * find out or change the position of the scan pointer, skip ahead, and so on. 
- * - * === Advancing the Scan Pointer - * - * - #getch - * - #get_byte - * - #scan - * - #scan_until - * - #skip - * - #skip_until - * - * === Looking Ahead - * - * - #check - * - #check_until - * - #exist? - * - #match? - * - #peek - * - * === Finding Where we Are - * - * - #beginning_of_line? (#bol?) - * - #eos? - * - #rest? - * - #rest_size - * - #pos - * - * === Setting Where we Are - * - * - #reset - * - #terminate - * - #pos= - * - * === Match Data - * - * - #matched - * - #matched? - * - #matched_size - * - [] - * - #pre_match - * - #post_match - * - * === Miscellaneous - * - * - << - * - #concat - * - #string - * - #string= - * - #unscan - * - * There are aliases to several of the methods. + * :markup: markdown + * + * :include: strscan/link_refs.txt + * :include: strscan/strscan.md + * */ void Init_strscan(void) @@ -1579,12 +2287,15 @@ Init_strscan(void) ID id_scanerr = rb_intern("ScanError"); VALUE tmp; - id_byteslice = rb_intern("byteslice"); + usascii_encindex = rb_usascii_encindex(); + utf8_encindex = rb_utf8_encindex(); + binary_encindex = rb_ascii8bit_encindex(); StringScanner = rb_define_class("StringScanner", rb_cObject); ScanError = rb_define_class_under(StringScanner, "Error", rb_eStandardError); if (!rb_const_defined(rb_cObject, id_scanerr)) { rb_const_set(rb_cObject, id_scanerr, ScanError); + rb_deprecate_constant(rb_cObject, "ScanError"); } tmp = rb_str_new2(STRSCAN_VERSION); rb_obj_freeze(tmp); @@ -1592,6 +2303,7 @@ Init_strscan(void) tmp = rb_str_new2("$Id$"); rb_obj_freeze(tmp); rb_const_set(StringScanner, rb_intern("Id"), tmp); + rb_deprecate_constant(StringScanner, "Id"); rb_define_alloc_func(StringScanner, strscan_s_allocate); rb_define_private_method(StringScanner, "initialize", strscan_initialize, -1); @@ -1599,7 +2311,6 @@ Init_strscan(void) rb_define_singleton_method(StringScanner, "must_C_version", strscan_s_mustc, 0); rb_define_method(StringScanner, "reset", strscan_reset, 0); rb_define_method(StringScanner, "terminate", 
strscan_terminate, 0); - rb_define_method(StringScanner, "clear", strscan_clear, 0); rb_define_method(StringScanner, "string", strscan_get_string, 0); rb_define_method(StringScanner, "string=", strscan_set_string, 1); rb_define_method(StringScanner, "concat", strscan_concat, 1); @@ -1624,16 +2335,18 @@ Init_strscan(void) rb_define_method(StringScanner, "getch", strscan_getch, 0); rb_define_method(StringScanner, "get_byte", strscan_get_byte, 0); - rb_define_method(StringScanner, "getbyte", strscan_getbyte, 0); + rb_define_method(StringScanner, "scan_byte", strscan_scan_byte, 0); rb_define_method(StringScanner, "peek", strscan_peek, 1); - rb_define_method(StringScanner, "peep", strscan_peep, 1); + rb_define_method(StringScanner, "peek_byte", strscan_peek_byte, 0); + + rb_define_private_method(StringScanner, "scan_base10_integer", strscan_scan_base10_integer, 0); + rb_define_private_method(StringScanner, "scan_base16_integer", strscan_scan_base16_integer, 0); rb_define_method(StringScanner, "unscan", strscan_unscan, 0); rb_define_method(StringScanner, "beginning_of_line?", strscan_bol_p, 0); rb_alias(StringScanner, rb_intern("bol?"), rb_intern("beginning_of_line?")); rb_define_method(StringScanner, "eos?", strscan_eos_p, 0); - rb_define_method(StringScanner, "empty?", strscan_empty_p, 0); rb_define_method(StringScanner, "rest?", strscan_rest_p, 0); rb_define_method(StringScanner, "matched?", strscan_matched_p, 0); @@ -1648,9 +2361,10 @@ Init_strscan(void) rb_define_method(StringScanner, "rest", strscan_rest, 0); rb_define_method(StringScanner, "rest_size", strscan_rest_size, 0); - rb_define_method(StringScanner, "restsize", strscan_restsize, 0); rb_define_method(StringScanner, "inspect", strscan_inspect, 0); rb_define_method(StringScanner, "fixed_anchor?", strscan_fixed_anchor_p, 0); + + rb_define_method(StringScanner, "named_captures", strscan_named_captures, 0); } diff --git a/ext/strscan/strscan.gemspec b/ext/strscan/strscan.gemspec index fa9b895a9c..a51285fa7e 
100644 --- a/ext/strscan/strscan.gemspec +++ b/ext/strscan/strscan.gemspec @@ -16,17 +16,34 @@ Gem::Specification.new do |s| s.summary = "Provides lexical scanning operations on a String." s.description = "Provides lexical scanning operations on a String." - s.require_path = %w{lib} - s.files = %w{ext/strscan/extconf.rb ext/strscan/strscan.c} - s.extensions = %w{ext/strscan/extconf.rb} + files = %w[ + COPYING + LICENSE.txt + lib/strscan.rb + lib/strscan/strscan.rb + lib/strscan/truffleruby.rb + ] + + s.require_paths = %w{lib} + + if RUBY_ENGINE == "jruby" + files << "lib/strscan.jar" + s.platform = "java" + else + files << "ext/strscan/extconf.rb" + files << "ext/strscan/strscan.c" + s.rdoc_options << "-idoc" + s.extra_rdoc_files = [ + ".rdoc_options", + *Dir.glob("doc/strscan/**/*") + ] + s.extensions = %w{ext/strscan/extconf.rb} + end + s.files = files s.required_ruby_version = ">= 2.4.0" - s.authors = ["Minero Aoki", "Sutou Kouhei"] - s.email = [nil, "kou@cozmixng.org"] + s.authors = ["Minero Aoki", "Sutou Kouhei", "Charles Oliver Nutter"] + s.email = [nil, "kou@cozmixng.org", "headius@headius.com"] s.homepage = "https://github.com/ruby/strscan" s.licenses = ["Ruby", "BSD-2-Clause"] - - s.add_development_dependency "rake-compiler" - s.add_development_dependency "benchmark-driver" - s.add_development_dependency "test-unit" end |
