diff options
Diffstat (limited to 'ext/strscan/strscan.c')
-rw-r--r-- | ext/strscan/strscan.c | 251 |
1 files changed, 197 insertions, 54 deletions
diff --git a/ext/strscan/strscan.c b/ext/strscan/strscan.c index 42027cd8d2..70a3ce5260 100644 --- a/ext/strscan/strscan.c +++ b/ext/strscan/strscan.c @@ -22,7 +22,7 @@ extern size_t onig_region_memsize(const struct re_registers *regs); #include <stdbool.h> -#define STRSCAN_VERSION "3.0.0" +#define STRSCAN_VERSION "3.1.1" /* ======================================================================= Data Type Definitions @@ -435,11 +435,11 @@ strscan_get_pos(VALUE self) * * In short, it's a 0-based index into the string. * - * s = StringScanner.new("abcädeföghi") - * s.charpos # -> 0 - * s.scan_until(/ä/) # -> "abcä" - * s.pos # -> 5 - * s.charpos # -> 4 + * s = StringScanner.new("abc\u00e4def\u00f6ghi") + * s.charpos # -> 0 + * s.scan_until(/\u00e4/) # -> "abc\u00E4" + * s.pos # -> 5 + * s.charpos # -> 4 */ static VALUE strscan_get_charpos(VALUE self) @@ -539,6 +539,68 @@ adjust_register_position(struct strscanner *p, long position) } } +/* rb_reg_onig_match is available in Ruby 3.3 and later. */ +#ifndef HAVE_RB_REG_ONIG_MATCH +static OnigPosition +rb_reg_onig_match(VALUE re, VALUE str, + OnigPosition (*match)(regex_t *reg, VALUE str, struct re_registers *regs, void *args), + void *args, struct re_registers *regs) +{ + regex_t *reg = rb_reg_prepare_re(re, str); + + bool tmpreg = reg != RREGEXP_PTR(re); + if (!tmpreg) RREGEXP(re)->usecnt++; + + OnigPosition result = match(reg, str, regs, args); + + if (!tmpreg) RREGEXP(re)->usecnt--; + if (tmpreg) { + if (RREGEXP(re)->usecnt) { + onig_free(reg); + } + else { + onig_free(RREGEXP_PTR(re)); + RREGEXP_PTR(re) = reg; + } + } + + if (result < 0) { + if (result != ONIG_MISMATCH) { + rb_raise(ScanError, "regexp buffer overflow"); + } + } + + return result; +} +#endif + +static OnigPosition +strscan_match(regex_t *reg, VALUE str, struct re_registers *regs, void *args_ptr) +{ + struct strscanner *p = (struct strscanner *)args_ptr; + + return onig_match(reg, + match_target(p), + (UChar* )(CURPTR(p) + S_RESTLEN(p)), + (UChar* )CURPTR(p), + regs, + ONIG_OPTION_NONE); +} + +static OnigPosition +strscan_search(regex_t *reg, VALUE str, struct re_registers *regs, void *args_ptr) +{ + struct strscanner *p = (struct strscanner *)args_ptr; + + return onig_search(reg, + match_target(p), + (UChar *)(CURPTR(p) + S_RESTLEN(p)), + (UChar *)CURPTR(p), + (UChar *)(CURPTR(p) + S_RESTLEN(p)), + regs, + ONIG_OPTION_NONE); +} + static VALUE strscan_do_scan(VALUE self, VALUE pattern, int succptr, int getstr, int headonly) { @@ -560,47 +622,14 @@ strscan_do_scan(VALUE self, VALUE pattern, int succptr, int getstr, int headonly } if (RB_TYPE_P(pattern, T_REGEXP)) { - regex_t *rb_reg_prepare_re(VALUE re, VALUE str); - regex_t *re; - long ret; - int tmpreg; - p->regex = pattern; - re = rb_reg_prepare_re(pattern, p->str); - tmpreg = re != RREGEXP_PTR(pattern); - if (!tmpreg) RREGEXP(pattern)->usecnt++; - - if (headonly) { - ret = onig_match(re, - match_target(p), - (UChar* )(CURPTR(p) + S_RESTLEN(p)), - (UChar* )CURPTR(p), - &(p->regs), - ONIG_OPTION_NONE); - } - else { - ret = onig_search(re, - match_target(p), - (UChar* )(CURPTR(p) + S_RESTLEN(p)), - (UChar* )CURPTR(p), - (UChar* )(CURPTR(p) + S_RESTLEN(p)), - &(p->regs), - ONIG_OPTION_NONE); - } - if (!tmpreg) RREGEXP(pattern)->usecnt--; - if (tmpreg) { - if (RREGEXP(pattern)->usecnt) { - onig_free(re); - } - else { - onig_free(RREGEXP_PTR(pattern)); - RREGEXP_PTR(pattern) = re; - } - } + OnigPosition ret = rb_reg_onig_match(pattern, + p->str, + headonly ? strscan_match : strscan_search, + (void *)p, + &(p->regs)); - if (ret == -2) rb_raise(ScanError, "regexp buffer overflow"); - if (ret < 0) { - /* not matched */ + if (ret == ONIG_MISMATCH) { return Qnil; } } @@ -874,6 +903,57 @@ strscan_getch(VALUE self) } /* + * Scans one byte and returns it as an integer. + * This method is not multibyte character sensitive. + * See also: #getch. + * + * s = StringScanner.new('ab') + * s.scan_byte # => 97 + * s.scan_byte # => 98 + * s.scan_byte # => nil + * + * s = StringScanner.new("\244\242".force_encoding("euc-jp")) + * s.scan_byte # => 0xA4 + * s.scan_byte # => 0xA2 + * s.scan_byte # => nil + */ +static VALUE +strscan_scan_byte(VALUE self) +{ + struct strscanner *p; + + GET_SCANNER(self, p); + CLEAR_MATCH_STATUS(p); + if (EOS_P(p)) + return Qnil; + + VALUE byte = INT2FIX((unsigned char)*CURPTR(p)); + p->prev = p->curr; + p->curr++; + MATCHED(p); + adjust_registers_to_matched(p); + return byte; +} + +/* + * Peeks at the current byte and returns it as an integer. + * + * s = StringScanner.new('ab') + * s.peek_byte # => 97 + */ +static VALUE +strscan_peek_byte(VALUE self) +{ + struct strscanner *p; + + GET_SCANNER(self, p); + if (EOS_P(p)) + return Qnil; + + return INT2FIX((unsigned char)*CURPTR(p)); +} + +/* * Scans one byte and returns it. * This method is not multibyte character sensitive. * See also: #getch. @@ -1038,8 +1118,9 @@ strscan_empty_p(VALUE self) * This method is obsolete; use #eos? instead. * * s = StringScanner.new('test string') - * s.eos? # These two - * s.rest? # are opposites. + * # These two are opposites + * s.eos? # => false + * s.rest? # => true */ static VALUE strscan_rest_p(VALUE self) @@ -1213,10 +1294,10 @@ strscan_size(VALUE self) * If nothing was priorly matched, it returns nil. * * s = StringScanner.new("Fri Dec 12 1975 14:39") - * s.scan(/(\w+) (\w+) (\d+) /) # -> "Fri Dec 12 " - * s.captures # -> ["Fri", "Dec", "12"] - * s.scan(/(\w+) (\w+) (\d+) /) # -> nil - * s.captures # -> nil + * s.scan(/(\w+) (\w+) (\d+) (1980)?/) # -> "Fri Dec 12 " + * s.captures # -> ["Fri", "Dec", "12", nil] + * s.scan(/(\w+) (\w+) (\d+) (1980)?/) # -> nil + * s.captures # -> nil */ static VALUE strscan_captures(VALUE self) @@ -1232,9 +1313,13 @@ strscan_captures(VALUE self) new_ary = rb_ary_new2(num_regs); for (i = 1; i < num_regs; i++) { - VALUE str = extract_range(p, - adjust_register_position(p, p->regs.beg[i]), - adjust_register_position(p, p->regs.end[i])); + VALUE str; + if (p->regs.beg[i] == -1) + str = Qnil; + else + str = extract_range(p, + adjust_register_position(p, p->regs.beg[i]), + adjust_register_position(p, p->regs.end[i])); rb_ary_push(new_ary, str); } @@ -1458,6 +1543,56 @@ strscan_fixed_anchor_p(VALUE self) return p->fixed_anchor_p ? Qtrue : Qfalse; } +typedef struct { + VALUE self; + VALUE captures; +} named_captures_data; + +static int +named_captures_iter(const OnigUChar *name, + const OnigUChar *name_end, + int back_num, + int *back_refs, + OnigRegex regex, + void *arg) +{ + named_captures_data *data = arg; + + VALUE key = rb_str_new((const char *)name, name_end - name); + VALUE value = RUBY_Qnil; + int i; + for (i = 0; i < back_num; i++) { + value = strscan_aref(data->self, INT2NUM(back_refs[i])); + } + rb_hash_aset(data->captures, key, value); + return 0; +} + +/* + * call-seq: + * scanner.named_captures -> hash + * + * Returns a hash of string variables matching the regular expression. + * + * scan = StringScanner.new('foobarbaz') + * scan.match?(/(?<f>foo)(?<r>bar)(?<z>baz)/) + * scan.named_captures # -> {"f"=>"foo", "r"=>"bar", "z"=>"baz"} + */ +static VALUE +strscan_named_captures(VALUE self) +{ + struct strscanner *p; + GET_SCANNER(self, p); + named_captures_data data; + data.self = self; + data.captures = rb_hash_new(); + if (!RB_NIL_P(p->regex)) { + onig_foreach_name(RREGEXP_PTR(p->regex), named_captures_iter, &data); + } + + return data.captures; +} + /* ======================================================================= Ruby Interface ======================================================================= */ @@ -1468,6 +1603,8 @@ strscan_fixed_anchor_p(VALUE self) * StringScanner provides for lexical scanning operations on a String. Here is * an example of its usage: * + * require 'strscan' + * * s = StringScanner.new('This is an example string') * s.eos? # -> false * @@ -1519,6 +1656,7 @@ strscan_fixed_anchor_p(VALUE self) * * - #getch * - #get_byte + * - #scan_byte * - #scan * - #scan_until * - #skip @@ -1531,6 +1669,7 @@ strscan_fixed_anchor_p(VALUE self) * - #exist? * - #match? * - #peek + * - #peek_byte * * === Finding Where we Are * @@ -1622,7 +1761,9 @@ Init_strscan(void) rb_define_method(StringScanner, "getch", strscan_getch, 0); rb_define_method(StringScanner, "get_byte", strscan_get_byte, 0); rb_define_method(StringScanner, "getbyte", strscan_getbyte, 0); + rb_define_method(StringScanner, "scan_byte", strscan_scan_byte, 0); rb_define_method(StringScanner, "peek", strscan_peek, 1); + rb_define_method(StringScanner, "peek_byte", strscan_peek_byte, 0); rb_define_method(StringScanner, "peep", strscan_peep, 1); rb_define_method(StringScanner, "unscan", strscan_unscan, 0); @@ -1650,4 +1791,6 @@ Init_strscan(void) rb_define_method(StringScanner, "inspect", strscan_inspect, 0); rb_define_method(StringScanner, "fixed_anchor?", strscan_fixed_anchor_p, 0); + + rb_define_method(StringScanner, "named_captures", strscan_named_captures, 0); } |