summaryrefslogtreecommitdiff
path: root/ext/strscan/strscan.c
diff options
context:
space:
mode:
Diffstat (limited to 'ext/strscan/strscan.c')
-rw-r--r--ext/strscan/strscan.c251
1 files changed, 197 insertions, 54 deletions
diff --git a/ext/strscan/strscan.c b/ext/strscan/strscan.c
index 42027cd8d2..70a3ce5260 100644
--- a/ext/strscan/strscan.c
+++ b/ext/strscan/strscan.c
@@ -22,7 +22,7 @@ extern size_t onig_region_memsize(const struct re_registers *regs);
#include <stdbool.h>
-#define STRSCAN_VERSION "3.0.0"
+#define STRSCAN_VERSION "3.1.1"
/* =======================================================================
Data Type Definitions
@@ -435,11 +435,11 @@ strscan_get_pos(VALUE self)
*
* In short, it's a 0-based index into the string.
*
- * s = StringScanner.new("abcädeföghi")
- * s.charpos # -> 0
- * s.scan_until(/ä/) # -> "abcä"
- * s.pos # -> 5
- * s.charpos # -> 4
+ * s = StringScanner.new("abc\u00e4def\u00f6ghi")
+ * s.charpos # -> 0
+ * s.scan_until(/\u00e4/) # -> "abc\u00E4"
+ * s.pos # -> 5
+ * s.charpos # -> 4
*/
static VALUE
strscan_get_charpos(VALUE self)
@@ -539,6 +539,68 @@ adjust_register_position(struct strscanner *p, long position)
}
}
+/* rb_reg_onig_match is available in Ruby 3.3 and later. */
+#ifndef HAVE_RB_REG_ONIG_MATCH
+static OnigPosition
+rb_reg_onig_match(VALUE re, VALUE str,
+ OnigPosition (*match)(regex_t *reg, VALUE str, struct re_registers *regs, void *args),
+ void *args, struct re_registers *regs)
+{
+ regex_t *reg = rb_reg_prepare_re(re, str);
+
+ bool tmpreg = reg != RREGEXP_PTR(re);
+ if (!tmpreg) RREGEXP(re)->usecnt++;
+
+ OnigPosition result = match(reg, str, regs, args);
+
+ if (!tmpreg) RREGEXP(re)->usecnt--;
+ if (tmpreg) {
+ if (RREGEXP(re)->usecnt) {
+ onig_free(reg);
+ }
+ else {
+ onig_free(RREGEXP_PTR(re));
+ RREGEXP_PTR(re) = reg;
+ }
+ }
+
+ if (result < 0) {
+ if (result != ONIG_MISMATCH) {
+ rb_raise(ScanError, "regexp buffer overflow");
+ }
+ }
+
+ return result;
+}
+#endif
+
+static OnigPosition
+strscan_match(regex_t *reg, VALUE str, struct re_registers *regs, void *args_ptr)
+{
+ struct strscanner *p = (struct strscanner *)args_ptr;
+
+ return onig_match(reg,
+ match_target(p),
+ (UChar* )(CURPTR(p) + S_RESTLEN(p)),
+ (UChar* )CURPTR(p),
+ regs,
+ ONIG_OPTION_NONE);
+}
+
+static OnigPosition
+strscan_search(regex_t *reg, VALUE str, struct re_registers *regs, void *args_ptr)
+{
+ struct strscanner *p = (struct strscanner *)args_ptr;
+
+ return onig_search(reg,
+ match_target(p),
+ (UChar *)(CURPTR(p) + S_RESTLEN(p)),
+ (UChar *)CURPTR(p),
+ (UChar *)(CURPTR(p) + S_RESTLEN(p)),
+ regs,
+ ONIG_OPTION_NONE);
+}
+
static VALUE
strscan_do_scan(VALUE self, VALUE pattern, int succptr, int getstr, int headonly)
{
@@ -560,47 +622,14 @@ strscan_do_scan(VALUE self, VALUE pattern, int succptr, int getstr, int headonly
}
if (RB_TYPE_P(pattern, T_REGEXP)) {
- regex_t *rb_reg_prepare_re(VALUE re, VALUE str);
- regex_t *re;
- long ret;
- int tmpreg;
-
p->regex = pattern;
- re = rb_reg_prepare_re(pattern, p->str);
- tmpreg = re != RREGEXP_PTR(pattern);
- if (!tmpreg) RREGEXP(pattern)->usecnt++;
-
- if (headonly) {
- ret = onig_match(re,
- match_target(p),
- (UChar* )(CURPTR(p) + S_RESTLEN(p)),
- (UChar* )CURPTR(p),
- &(p->regs),
- ONIG_OPTION_NONE);
- }
- else {
- ret = onig_search(re,
- match_target(p),
- (UChar* )(CURPTR(p) + S_RESTLEN(p)),
- (UChar* )CURPTR(p),
- (UChar* )(CURPTR(p) + S_RESTLEN(p)),
- &(p->regs),
- ONIG_OPTION_NONE);
- }
- if (!tmpreg) RREGEXP(pattern)->usecnt--;
- if (tmpreg) {
- if (RREGEXP(pattern)->usecnt) {
- onig_free(re);
- }
- else {
- onig_free(RREGEXP_PTR(pattern));
- RREGEXP_PTR(pattern) = re;
- }
- }
+ OnigPosition ret = rb_reg_onig_match(pattern,
+ p->str,
+ headonly ? strscan_match : strscan_search,
+ (void *)p,
+ &(p->regs));
- if (ret == -2) rb_raise(ScanError, "regexp buffer overflow");
- if (ret < 0) {
- /* not matched */
+ if (ret == ONIG_MISMATCH) {
return Qnil;
}
}
@@ -874,6 +903,57 @@ strscan_getch(VALUE self)
}
/*
+ * Scans one byte and returns it as an integer.
+ * This method is not multibyte character sensitive.
+ * See also: #getch.
+ *
+ * s = StringScanner.new('ab')
+ * s.scan_byte # => 97
+ * s.scan_byte # => 98
+ * s.scan_byte # => nil
+ *
+ * s = StringScanner.new("\244\242".force_encoding("euc-jp"))
+ * s.scan_byte # => 0xA4
+ * s.scan_byte # => 0xA2
+ * s.scan_byte # => nil
+ */
+static VALUE
+strscan_scan_byte(VALUE self)
+{
+ struct strscanner *p;
+
+ GET_SCANNER(self, p);
+ CLEAR_MATCH_STATUS(p);
+ if (EOS_P(p))
+ return Qnil;
+
+ VALUE byte = INT2FIX((unsigned char)*CURPTR(p));
+ p->prev = p->curr;
+ p->curr++;
+ MATCHED(p);
+ adjust_registers_to_matched(p);
+ return byte;
+}
+
+/*
+ * Peeks at the current byte and returns it as an integer.
+ *
+ * s = StringScanner.new('ab')
+ * s.peek_byte # => 97
+ */
+static VALUE
+strscan_peek_byte(VALUE self)
+{
+ struct strscanner *p;
+
+ GET_SCANNER(self, p);
+ if (EOS_P(p))
+ return Qnil;
+
+ return INT2FIX((unsigned char)*CURPTR(p));
+}
+
+/*
* Scans one byte and returns it.
* This method is not multibyte character sensitive.
* See also: #getch.
@@ -1038,8 +1118,9 @@ strscan_empty_p(VALUE self)
* This method is obsolete; use #eos? instead.
*
* s = StringScanner.new('test string')
- * s.eos? # These two
- * s.rest? # are opposites.
+ * # These two are opposites
+ * s.eos? # => false
+ * s.rest? # => true
*/
static VALUE
strscan_rest_p(VALUE self)
@@ -1213,10 +1294,10 @@ strscan_size(VALUE self)
* If nothing was priorly matched, it returns nil.
*
* s = StringScanner.new("Fri Dec 12 1975 14:39")
- * s.scan(/(\w+) (\w+) (\d+) /) # -> "Fri Dec 12 "
- * s.captures # -> ["Fri", "Dec", "12"]
- * s.scan(/(\w+) (\w+) (\d+) /) # -> nil
- * s.captures # -> nil
+ * s.scan(/(\w+) (\w+) (\d+) (1980)?/) # -> "Fri Dec 12 "
+ * s.captures # -> ["Fri", "Dec", "12", nil]
+ * s.scan(/(\w+) (\w+) (\d+) (1980)?/) # -> nil
+ * s.captures # -> nil
*/
static VALUE
strscan_captures(VALUE self)
@@ -1232,9 +1313,13 @@ strscan_captures(VALUE self)
new_ary = rb_ary_new2(num_regs);
for (i = 1; i < num_regs; i++) {
- VALUE str = extract_range(p,
- adjust_register_position(p, p->regs.beg[i]),
- adjust_register_position(p, p->regs.end[i]));
+ VALUE str;
+ if (p->regs.beg[i] == -1)
+ str = Qnil;
+ else
+ str = extract_range(p,
+ adjust_register_position(p, p->regs.beg[i]),
+ adjust_register_position(p, p->regs.end[i]));
rb_ary_push(new_ary, str);
}
@@ -1458,6 +1543,56 @@ strscan_fixed_anchor_p(VALUE self)
return p->fixed_anchor_p ? Qtrue : Qfalse;
}
+typedef struct {
+ VALUE self;
+ VALUE captures;
+} named_captures_data;
+
+static int
+named_captures_iter(const OnigUChar *name,
+ const OnigUChar *name_end,
+ int back_num,
+ int *back_refs,
+ OnigRegex regex,
+ void *arg)
+{
+ named_captures_data *data = arg;
+
+ VALUE key = rb_str_new((const char *)name, name_end - name);
+ VALUE value = RUBY_Qnil;
+ int i;
+ for (i = 0; i < back_num; i++) {
+ value = strscan_aref(data->self, INT2NUM(back_refs[i]));
+ }
+ rb_hash_aset(data->captures, key, value);
+ return 0;
+}
+
+/*
+ * call-seq:
+ * scanner.named_captures -> hash
+ *
+ * Returns a hash of string variables matching the regular expression.
+ *
+ * scan = StringScanner.new('foobarbaz')
+ * scan.match?(/(?<f>foo)(?<r>bar)(?<z>baz)/)
+ * scan.named_captures # -> {"f"=>"foo", "r"=>"bar", "z"=>"baz"}
+ */
+static VALUE
+strscan_named_captures(VALUE self)
+{
+ struct strscanner *p;
+ GET_SCANNER(self, p);
+ named_captures_data data;
+ data.self = self;
+ data.captures = rb_hash_new();
+ if (!RB_NIL_P(p->regex)) {
+ onig_foreach_name(RREGEXP_PTR(p->regex), named_captures_iter, &data);
+ }
+
+ return data.captures;
+}
+
/* =======================================================================
Ruby Interface
======================================================================= */
@@ -1468,6 +1603,8 @@ strscan_fixed_anchor_p(VALUE self)
* StringScanner provides for lexical scanning operations on a String. Here is
* an example of its usage:
*
+ * require 'strscan'
+ *
* s = StringScanner.new('This is an example string')
* s.eos? # -> false
*
@@ -1519,6 +1656,7 @@ strscan_fixed_anchor_p(VALUE self)
*
* - #getch
* - #get_byte
+ * - #scan_byte
* - #scan
* - #scan_until
* - #skip
@@ -1531,6 +1669,7 @@ strscan_fixed_anchor_p(VALUE self)
* - #exist?
* - #match?
* - #peek
+ * - #peek_byte
*
* === Finding Where we Are
*
@@ -1622,7 +1761,9 @@ Init_strscan(void)
rb_define_method(StringScanner, "getch", strscan_getch, 0);
rb_define_method(StringScanner, "get_byte", strscan_get_byte, 0);
rb_define_method(StringScanner, "getbyte", strscan_getbyte, 0);
+ rb_define_method(StringScanner, "scan_byte", strscan_scan_byte, 0);
rb_define_method(StringScanner, "peek", strscan_peek, 1);
+ rb_define_method(StringScanner, "peek_byte", strscan_peek_byte, 0);
rb_define_method(StringScanner, "peep", strscan_peep, 1);
rb_define_method(StringScanner, "unscan", strscan_unscan, 0);
@@ -1650,4 +1791,6 @@ Init_strscan(void)
rb_define_method(StringScanner, "inspect", strscan_inspect, 0);
rb_define_method(StringScanner, "fixed_anchor?", strscan_fixed_anchor_p, 0);
+
+ rb_define_method(StringScanner, "named_captures", strscan_named_captures, 0);
}