summaryrefslogtreecommitdiff
path: root/parse.y
diff options
context:
space:
mode:
authoryui-knk <spiketeika@gmail.com>2022-09-23 22:40:02 +0900
committerYuichiro Kaneko <spiketeika@gmail.com>2022-11-21 09:01:34 +0900
commitd8601621edcf29e3323b90dcf04b774edd9fb45e (patch)
treefd9b08ca7008ce5ad7f8acafd83accff2347697f /parse.y
parentbbc4cf5f76c0e17e645ba81ae0b3625c8cbfc4c4 (diff)
Enhance keep_tokens option for RubyVM::AbstractSyntaxTree parsing methods
Implementations of the Language Server Protocol (LSP) sometimes need token information. For example, both `m(1)` and `m(1, )` have the same AST structure other than node locations, so it is impossible to check for the existence of `,` from the AST. However, in the latter case, it might be better to suggest a list of variables for the second argument. Token information is important for such cases. This commit adds these methods. * Add `keep_tokens` option for `RubyVM::AbstractSyntaxTree.parse`, `.parse_file` and `.of` * Add `RubyVM::AbstractSyntaxTree::Node#tokens` which returns tokens for the node including tokens for descendant nodes. * Add `RubyVM::AbstractSyntaxTree::Node#all_tokens` which returns all tokens for the input script regardless of the receiver node. [Feature #19070] The impacts on memory usage and performance are shown below: Memory usage: ``` $ cat test.rb root = RubyVM::AbstractSyntaxTree.parse_file(File.expand_path('../test/ruby/test_keyword.rb', __FILE__), keep_tokens: true) $ /usr/bin/time -f %Mkb /usr/local/bin/ruby -v ruby 3.2.0dev (2022-11-19T09:41:54Z 19070-keep_tokens d3af1b8057) [x86_64-linux] 11408kb # keep_tokens :false $ /usr/bin/time -f %Mkb /usr/local/bin/ruby test.rb 17508kb # keep_tokens :true $ /usr/bin/time -f %Mkb /usr/local/bin/ruby test.rb 30960kb ``` Performance: ``` $ cat ../ast_keep_tokens.yml prelude: | src = <<~SRC module M class C def m1(a, b) 1 + a + b end end end SRC benchmark: without_keep_tokens: | RubyVM::AbstractSyntaxTree.parse(src, keep_tokens: false) with_keep_tokens: | RubyVM::AbstractSyntaxTree.parse(src, keep_tokens: true) $ make benchmark COMPARE_RUBY="./ruby" ARGS=../ast_keep_tokens.yml /home/kaneko.y/.rbenv/shims/ruby --disable=gems -rrubygems -I../benchmark/lib ../benchmark/benchmark-driver/exe/benchmark-driver \ --executables="compare-ruby::./ruby -I.ext/common --disable-gem" \ --executables="built-ruby::./miniruby -I../lib -I. 
-I.ext/common ../tool/runruby.rb --extout=.ext -- --disable-gems --disable-gem" \ --output=markdown --output-compare -v ../ast_keep_tokens.yml compare-ruby: ruby 3.2.0dev (2022-11-19T09:41:54Z 19070-keep_tokens d3af1b8057) [x86_64-linux] built-ruby: ruby 3.2.0dev (2022-11-19T09:41:54Z 19070-keep_tokens d3af1b8057) [x86_64-linux] warming up.. | |compare-ruby|built-ruby| |:--------------------|-----------:|---------:| |without_keep_tokens | 21.659k| 21.303k| | | 1.02x| -| |with_keep_tokens | 6.220k| 5.691k| | | 1.09x| -| ```
Notes
Notes: Merged: https://github.com/ruby/ruby/pull/6770
Diffstat (limited to 'parse.y')
-rw-r--r--parse.y480
1 files changed, 419 insertions, 61 deletions
diff --git a/parse.y b/parse.y
index 846136bd03..57cfe7db62 100644
--- a/parse.y
+++ b/parse.y
@@ -124,7 +124,13 @@ RBIMPL_WARNING_POP()
#define RUBY_SET_YYLLOC_FROM_STRTERM_HEREDOC(Current) \
rb_parser_set_location_from_strterm_heredoc(p, &p->lex.strterm->u.heredoc, &(Current))
-#define RUBY_SET_YYLLOC_OF_NONE(Current) \
+#define RUBY_SET_YYLLOC_OF_DELAYED_TOKEN(Current) \
+ rb_parser_set_location_of_delayed_token(p, &(Current))
+#define RUBY_SET_YYLLOC_OF_HEREDOC_END(Current) \
+ rb_parser_set_location_of_heredoc_end(p, &(Current))
+#define RUBY_SET_YYLLOC_OF_DUMMY_END(Current) \
+ rb_parser_set_location_of_dummy_end(p, &(Current))
+#define RUBY_SET_YYLLOC_OF_NONE(Current) \
rb_parser_set_location_of_none(p, &(Current))
#define RUBY_SET_YYLLOC(Current) \
rb_parser_set_location(p, &(Current))
@@ -272,12 +278,12 @@ struct parser_params {
rb_imemo_tmpbuf_t *heap;
YYSTYPE *lval;
+ YYLTYPE *yylloc;
struct {
rb_strterm_t *strterm;
VALUE (*gets)(struct parser_params*,VALUE);
VALUE input;
- VALUE prevline;
VALUE lastline;
VALUE nextline;
const char *pbeg;
@@ -320,6 +326,14 @@ struct parser_params {
VALUE debug_buffer;
VALUE debug_output;
+ struct {
+ VALUE token;
+ int beg_line;
+ int beg_col;
+ int end_line;
+ int end_col;
+ } delayed;
+
ID cur_arg;
rb_ast_t *ast;
@@ -351,6 +365,7 @@ struct parser_params {
unsigned int do_split: 1;
unsigned int keep_script_lines: 1;
unsigned int error_tolerant: 1;
+ unsigned int keep_tokens: 1;
NODE *eval_tree_begin;
NODE *eval_tree;
@@ -359,15 +374,13 @@ struct parser_params {
const struct rb_iseq_struct *parent_iseq;
/* store specific keyword locations to generate dummy end token */
VALUE end_expect_token_locations;
+ /* id for terms */
+ int token_id;
+ /* Array for term tokens */
+ VALUE tokens;
#else
/* Ripper only */
- struct {
- VALUE token;
- int line;
- int col;
- } delayed;
-
VALUE value;
VALUE result;
VALUE parsing_thread;
@@ -447,6 +460,177 @@ peek_end_expect_token_locations(struct parser_params *p)
if(NIL_P(p->end_expect_token_locations)) return Qnil;
return rb_ary_last(0, 0, p->end_expect_token_locations);
}
+
+static ID
+parser_token2id(enum yytokentype tok)
+{
+ switch ((int) tok) {
+#define TOKEN2ID(tok) case tok: return rb_intern(#tok);
+#define TOKEN2ID2(tok, name) case tok: return rb_intern(name);
+ TOKEN2ID2(' ', "words_sep")
+ TOKEN2ID2('!', "!")
+ TOKEN2ID2('%', "%");
+ TOKEN2ID2('&', "&");
+ TOKEN2ID2('*', "*");
+ TOKEN2ID2('+', "+");
+ TOKEN2ID2('-', "-");
+ TOKEN2ID2('/', "/");
+ TOKEN2ID2('<', "<");
+ TOKEN2ID2('=', "=");
+ TOKEN2ID2('>', ">");
+ TOKEN2ID2('?', "?");
+ TOKEN2ID2('^', "^");
+ TOKEN2ID2('|', "|");
+ TOKEN2ID2('~', "~");
+ TOKEN2ID2(':', ":");
+ TOKEN2ID2(',', ",");
+ TOKEN2ID2('.', ".");
+ TOKEN2ID2(';', ";");
+ TOKEN2ID2('`', "`");
+ TOKEN2ID2('\n', "nl");
+ TOKEN2ID2('{', "{");
+ TOKEN2ID2('}', "}");
+ TOKEN2ID2('[', "[");
+ TOKEN2ID2(']', "]");
+ TOKEN2ID2('(', "(");
+ TOKEN2ID2(')', ")");
+ TOKEN2ID(keyword_class);
+ TOKEN2ID(keyword_module);
+ TOKEN2ID(keyword_def);
+ TOKEN2ID(keyword_undef);
+ TOKEN2ID(keyword_begin);
+ TOKEN2ID(keyword_rescue);
+ TOKEN2ID(keyword_ensure);
+ TOKEN2ID(keyword_end);
+ TOKEN2ID(keyword_if);
+ TOKEN2ID(keyword_unless);
+ TOKEN2ID(keyword_then);
+ TOKEN2ID(keyword_elsif);
+ TOKEN2ID(keyword_else);
+ TOKEN2ID(keyword_case);
+ TOKEN2ID(keyword_when);
+ TOKEN2ID(keyword_while);
+ TOKEN2ID(keyword_until);
+ TOKEN2ID(keyword_for);
+ TOKEN2ID(keyword_break);
+ TOKEN2ID(keyword_next);
+ TOKEN2ID(keyword_redo);
+ TOKEN2ID(keyword_retry);
+ TOKEN2ID(keyword_in);
+ TOKEN2ID(keyword_do);
+ TOKEN2ID(keyword_do_cond);
+ TOKEN2ID(keyword_do_block);
+ TOKEN2ID(keyword_do_LAMBDA);
+ TOKEN2ID(keyword_return);
+ TOKEN2ID(keyword_yield);
+ TOKEN2ID(keyword_super);
+ TOKEN2ID(keyword_self);
+ TOKEN2ID(keyword_nil);
+ TOKEN2ID(keyword_true);
+ TOKEN2ID(keyword_false);
+ TOKEN2ID(keyword_and);
+ TOKEN2ID(keyword_or);
+ TOKEN2ID(keyword_not);
+ TOKEN2ID(modifier_if);
+ TOKEN2ID(modifier_unless);
+ TOKEN2ID(modifier_while);
+ TOKEN2ID(modifier_until);
+ TOKEN2ID(modifier_rescue);
+ TOKEN2ID(keyword_alias);
+ TOKEN2ID(keyword_defined);
+ TOKEN2ID(keyword_BEGIN);
+ TOKEN2ID(keyword_END);
+ TOKEN2ID(keyword__LINE__);
+ TOKEN2ID(keyword__FILE__);
+ TOKEN2ID(keyword__ENCODING__);
+ TOKEN2ID(tIDENTIFIER);
+ TOKEN2ID(tFID);
+ TOKEN2ID(tGVAR);
+ TOKEN2ID(tIVAR);
+ TOKEN2ID(tCONSTANT);
+ TOKEN2ID(tCVAR);
+ TOKEN2ID(tLABEL);
+ TOKEN2ID(tINTEGER);
+ TOKEN2ID(tFLOAT);
+ TOKEN2ID(tRATIONAL);
+ TOKEN2ID(tIMAGINARY);
+ TOKEN2ID(tCHAR);
+ TOKEN2ID(tNTH_REF);
+ TOKEN2ID(tBACK_REF);
+ TOKEN2ID(tSTRING_CONTENT);
+ TOKEN2ID(tREGEXP_END);
+ TOKEN2ID(tDUMNY_END);
+ TOKEN2ID(tSP);
+ TOKEN2ID(tUPLUS);
+ TOKEN2ID(tUMINUS);
+ TOKEN2ID(tPOW);
+ TOKEN2ID(tCMP);
+ TOKEN2ID(tEQ);
+ TOKEN2ID(tEQQ);
+ TOKEN2ID(tNEQ);
+ TOKEN2ID(tGEQ);
+ TOKEN2ID(tLEQ);
+ TOKEN2ID(tANDOP);
+ TOKEN2ID(tOROP);
+ TOKEN2ID(tMATCH);
+ TOKEN2ID(tNMATCH);
+ TOKEN2ID(tDOT2);
+ TOKEN2ID(tDOT3);
+ TOKEN2ID(tBDOT2);
+ TOKEN2ID(tBDOT3);
+ TOKEN2ID(tAREF);
+ TOKEN2ID(tASET);
+ TOKEN2ID(tLSHFT);
+ TOKEN2ID(tRSHFT);
+ TOKEN2ID(tANDDOT);
+ TOKEN2ID(tCOLON2);
+ TOKEN2ID(tCOLON3);
+ TOKEN2ID(tOP_ASGN);
+ TOKEN2ID(tASSOC);
+ TOKEN2ID(tLPAREN);
+ TOKEN2ID(tLPAREN_ARG);
+ TOKEN2ID(tRPAREN);
+ TOKEN2ID(tLBRACK);
+ TOKEN2ID(tLBRACE);
+ TOKEN2ID(tLBRACE_ARG);
+ TOKEN2ID(tSTAR);
+ TOKEN2ID(tDSTAR);
+ TOKEN2ID(tAMPER);
+ TOKEN2ID(tLAMBDA);
+ TOKEN2ID(tSYMBEG);
+ TOKEN2ID(tSTRING_BEG);
+ TOKEN2ID(tXSTRING_BEG);
+ TOKEN2ID(tREGEXP_BEG);
+ TOKEN2ID(tWORDS_BEG);
+ TOKEN2ID(tQWORDS_BEG);
+ TOKEN2ID(tSYMBOLS_BEG);
+ TOKEN2ID(tQSYMBOLS_BEG);
+ TOKEN2ID(tSTRING_END);
+ TOKEN2ID(tSTRING_DEND);
+ TOKEN2ID(tSTRING_DBEG);
+ TOKEN2ID(tSTRING_DVAR);
+ TOKEN2ID(tLAMBEG);
+ TOKEN2ID(tLABEL_END);
+ TOKEN2ID(tIGNORED_NL);
+ TOKEN2ID(tCOMMENT);
+ TOKEN2ID(tEMBDOC_BEG);
+ TOKEN2ID(tEMBDOC);
+ TOKEN2ID(tEMBDOC_END);
+ TOKEN2ID(tHEREDOC_BEG);
+ TOKEN2ID(tHEREDOC_END);
+ TOKEN2ID(k__END__);
+ TOKEN2ID(tLOWEST);
+ TOKEN2ID(tUMINUS_NUM);
+ TOKEN2ID(tLAST_TOKEN);
+#undef TOKEN2ID
+#undef TOKEN2ID2
+ }
+
+ rb_bug("parser_token2id: unknown token %d", tok);
+
+ UNREACHABLE_RETURN(0);
+}
+
#endif
RBIMPL_ATTR_NONNULL((1, 2, 3))
@@ -457,6 +641,9 @@ static int parser_yyerror0(struct parser_params*, const char*);
#define yyerror1(loc, msg) parser_yyerror(p, (loc), (msg))
#define yyerror(yylloc, p, msg) parser_yyerror(p, yylloc, msg)
#define token_flush(ptr) ((ptr)->lex.ptok = (ptr)->lex.pcur)
+#define lex_goto_eol(p) ((p)->lex.pcur = (p)->lex.pend)
+#define lex_eol_p(p) ((p)->lex.pcur >= (p)->lex.pend)
+#define lex_eol_n_p(p,n) ((p)->lex.pcur+(n) >= (p)->lex.pend)
static void token_info_setup(token_info *ptinfo, const char *ptr, const rb_code_location_t *loc);
static void token_info_push(struct parser_params*, const char *token, const rb_code_location_t *loc);
@@ -707,6 +894,9 @@ VALUE rb_parser_lex_state_name(enum lex_state_e state);
void rb_parser_show_bitstack(struct parser_params *, stack_type, const char *, int);
PRINTF_ARGS(void rb_parser_fatal(struct parser_params *p, const char *fmt, ...), 2, 3);
YYLTYPE *rb_parser_set_location_from_strterm_heredoc(struct parser_params *p, rb_strterm_heredoc_t *here, YYLTYPE *yylloc);
+YYLTYPE *rb_parser_set_location_of_delayed_token(struct parser_params *p, YYLTYPE *yylloc);
+YYLTYPE *rb_parser_set_location_of_heredoc_end(struct parser_params *p, YYLTYPE *yylloc);
+YYLTYPE *rb_parser_set_location_of_dummy_end(struct parser_params *p, YYLTYPE *yylloc);
YYLTYPE *rb_parser_set_location_of_none(struct parser_params *p, YYLTYPE *yylloc);
YYLTYPE *rb_parser_set_location(struct parser_params *p, YYLTYPE *yylloc);
RUBY_SYMBOL_EXPORT_END
@@ -1057,6 +1247,8 @@ endless_method_name(struct parser_params *p, NODE *defn, const YYLTYPE *loc)
token_info_drop(p, "def", loc->beg_pos);
}
+#define debug_token_line(p, name, line) if (p->debug) rb_parser_printf(p, name ":%d (%d: %ld|%ld|%ld)\n", line, p->ruby_sourceline, p->lex.ptok - p->lex.pbeg, p->lex.pcur - p->lex.ptok, p->lex.pend - p->lex.pcur)
+
#ifndef RIPPER
# define Qnone 0
# define Qnull 0
@@ -1356,6 +1548,9 @@ static int looking_at_eol_p(struct parser_params *p);
%token tSTRING_DEND "'}'"
%token tSTRING_DBEG tSTRING_DVAR tLAMBEG tLABEL_END
+%token tIGNORED_NL tCOMMENT tEMBDOC_BEG tEMBDOC tEMBDOC_END
+%token tHEREDOC_BEG tHEREDOC_END k__END__
+
/*
* precedence table
*/
@@ -3447,7 +3642,7 @@ k_if : keyword_if
token_info_push(p, "if", &@$);
if (p->token_info && p->token_info->nonspc &&
p->token_info->next && !strcmp(p->token_info->next->token, "else")) {
- const char *tok = p->lex.ptok;
+ const char *tok = p->lex.ptok - rb_strlen_lit("if");
const char *beg = p->lex.pbeg + p->token_info->next->beg.column;
beg += rb_strlen_lit("else");
while (beg < tok && ISSPACE(*beg)) beg++;
@@ -5906,7 +6101,11 @@ trailer : opt_nl
;
term : ';' {yyerrok;token_flush(p);}
- | '\n' {token_flush(p);}
+ | '\n'
+ {
+ @$.end_pos = @$.beg_pos;
+ token_flush(p);
+ }
;
terms : term
@@ -5967,12 +6166,91 @@ ripper_yylval_id(struct parser_params *p, ID x)
#endif
#define set_yylval_noname() set_yylval_id(keyword_nil)
+#define has_delayed_token(p) (!NIL_P(p->delayed.token))
#ifndef RIPPER
#define literal_flush(p, ptr) ((p)->lex.ptok = (ptr))
-#define dispatch_scan_event(p, t) ((void)0)
-#define dispatch_delayed_token(p, t) ((void)0)
-#define has_delayed_token(p) (0)
+#define dispatch_scan_event(p, t) parser_dispatch_scan_event(p, t, __LINE__)
+
+static bool
+parser_has_token(struct parser_params *p)
+{
+ if (p->keep_tokens && (p->lex.pcur < p->lex.ptok)) rb_bug("lex.pcur < lex.ptok. (line: %d) %ld|%ld|%ld", p->ruby_sourceline, p->lex.ptok - p->lex.pbeg, p->lex.pcur - p->lex.ptok, p->lex.pend - p->lex.pcur);
+ return p->lex.pcur > p->lex.ptok;
+}
+
+static VALUE
+code_loc_to_ary(const rb_code_location_t *loc)
+{
+ VALUE ary = rb_ary_new_from_args(4,
+ INT2NUM(loc->beg_pos.lineno), INT2NUM(loc->beg_pos.column),
+ INT2NUM(loc->end_pos.lineno), INT2NUM(loc->end_pos.column));
+ rb_obj_freeze(ary);
+
+ return ary;
+}
+
+static void
+parser_append_tokens(struct parser_params *p, VALUE str, enum yytokentype t, int line)
+{
+ VALUE ary;
+ int token_id;
+
+ ary = rb_ary_new2(4);
+ token_id = p->token_id;
+ rb_ary_push(ary, INT2FIX(token_id));
+ rb_ary_push(ary, ID2SYM(parser_token2id(t)));
+ rb_ary_push(ary, str);
+ rb_ary_push(ary, code_loc_to_ary(p->yylloc));
+ rb_obj_freeze(ary);
+ rb_ary_push(p->tokens, ary);
+ p->token_id++;
+
+ if (p->debug) {
+ rb_parser_printf(p, "Append tokens (line: %d) %"PRIsVALUE"\n", line, ary);
+ }
+}
+
+static void
+parser_dispatch_scan_event(struct parser_params *p, enum yytokentype t, int line)
+{
+ debug_token_line(p, "parser_dispatch_scan_event", line);
+
+ if (!parser_has_token(p)) return;
+
+ RUBY_SET_YYLLOC(*p->yylloc);
+
+ if (p->keep_tokens) {
+ VALUE str = STR_NEW(p->lex.ptok, p->lex.pcur - p->lex.ptok);
+ parser_append_tokens(p, str, t, line);
+ }
+
+ token_flush(p);
+}
+
+#define dispatch_delayed_token(p, t) parser_dispatch_delayed_token(p, t, __LINE__)
+static void
+parser_dispatch_delayed_token(struct parser_params *p, enum yytokentype t, int line)
+{
+ int saved_line = p->ruby_sourceline;
+ const char *saved_tokp = p->lex.ptok;
+
+ debug_token_line(p, "parser_dispatch_delayed_token", line);
+
+ if (!has_delayed_token(p)) return;
+
+ RUBY_SET_YYLLOC_OF_DELAYED_TOKEN(*p->yylloc);
+
+ if (p->keep_tokens) {
+ p->ruby_sourceline = p->delayed.beg_line;
+ p->lex.ptok = p->lex.pbeg + p->delayed.beg_col;
+ parser_append_tokens(p, p->delayed.token, t, line);
+ p->ruby_sourceline = saved_line;
+ p->lex.ptok = saved_tokp;
+ }
+
+ p->delayed.token = Qnil;
+}
#else
#define literal_flush(p, ptr) ((void)(ptr))
@@ -5997,6 +6275,7 @@ ripper_scan_event_val(struct parser_params *p, enum yytokentype t)
{
VALUE str = STR_NEW(p->lex.ptok, p->lex.pcur - p->lex.ptok);
VALUE rval = ripper_dispatch1(p, ripper_token2eventid(t), str);
+ RUBY_SET_YYLLOC(*p->yylloc);
token_flush(p);
return rval;
}
@@ -6016,15 +6295,14 @@ ripper_dispatch_delayed_token(struct parser_params *p, enum yytokentype t)
const char *saved_tokp = p->lex.ptok;
if (NIL_P(p->delayed.token)) return;
- p->ruby_sourceline = p->delayed.line;
- p->lex.ptok = p->lex.pbeg + p->delayed.col;
+ p->ruby_sourceline = p->delayed.beg_line;
+ p->lex.ptok = p->lex.pbeg + p->delayed.beg_col;
add_mark_object(p, yylval_rval = ripper_dispatch1(p, ripper_token2eventid(t), p->delayed.token));
p->delayed.token = Qnil;
p->ruby_sourceline = saved_line;
p->lex.ptok = saved_tokp;
}
#define dispatch_delayed_token(p, t) ripper_dispatch_delayed_token(p, t)
-#define has_delayed_token(p) (!NIL_P(p->delayed.token))
#endif /* RIPPER */
static inline int
@@ -6495,7 +6773,6 @@ yycompile0(VALUE arg)
p->lex.strterm = 0;
p->lex.pcur = p->lex.pbeg = p->lex.pend = 0;
- p->lex.prevline = p->lex.lastline = p->lex.nextline = 0;
if (n || p->error_p) {
VALUE mesg = p->error_buffer;
if (!mesg) {
@@ -6512,6 +6789,7 @@ yycompile0(VALUE arg)
}
else {
VALUE opt = p->compile_option;
+ VALUE tokens = p->tokens;
NODE *prelude;
NODE *body = parser_append_options(p, tree->nd_body);
if (!opt) opt = rb_obj_hide(rb_ident_hash_new());
@@ -6519,6 +6797,10 @@ yycompile0(VALUE arg)
prelude = block_append(p, p->eval_tree_begin, body);
tree->nd_body = prelude;
RB_OBJ_WRITE(p->ast, &p->ast->body.compile_option, opt);
+ if (p->keep_tokens) {
+ rb_obj_freeze(tokens);
+ rb_ast_set_tokens(p->ast, tokens);
+ }
}
p->ast->body.root = tree;
if (!p->ast->body.script_lines) p->ast->body.script_lines = INT2FIX(p->line_count);
@@ -6709,32 +6991,31 @@ parser_str_new(const char *ptr, long len, rb_encoding *enc, int func, rb_encodin
return str;
}
-#define lex_goto_eol(p) ((p)->lex.pcur = (p)->lex.pend)
-#define lex_eol_p(p) ((p)->lex.pcur >= (p)->lex.pend)
-#define lex_eol_n_p(p,n) ((p)->lex.pcur+(n) >= (p)->lex.pend)
#define peek(p,c) peek_n(p, (c), 0)
#define peek_n(p,c,n) (!lex_eol_n_p(p, n) && (c) == (unsigned char)(p)->lex.pcur[n])
#define peekc(p) peekc_n(p, 0)
#define peekc_n(p,n) (lex_eol_n_p(p, n) ? -1 : (unsigned char)(p)->lex.pcur[n])
-#ifdef RIPPER
static void
-add_delayed_token(struct parser_params *p, const char *tok, const char *end)
+add_delayed_token(struct parser_params *p, const char *tok, const char *end, int line)
{
+#ifndef RIPPER
+ debug_token_line(p, "add_delayed_token", line);
+#endif
+
if (tok < end) {
if (!has_delayed_token(p)) {
p->delayed.token = rb_str_buf_new(end - tok);
rb_enc_associate(p->delayed.token, p->enc);
- p->delayed.line = p->ruby_sourceline;
- p->delayed.col = rb_long2int(tok - p->lex.pbeg);
+ p->delayed.beg_line = p->ruby_sourceline;
+ p->delayed.beg_col = rb_long2int(tok - p->lex.pbeg);
}
rb_str_buf_cat(p->delayed.token, tok, end - tok);
+ p->delayed.end_line = p->ruby_sourceline;
+ p->delayed.end_col = rb_long2int(end - p->lex.pbeg);
p->lex.ptok = end;
}
}
-#else
-#define add_delayed_token(p, tok, end) ((void)(tok), (void)(end))
-#endif
static int
nextline(struct parser_params *p, int set_encoding)
@@ -6767,7 +7048,7 @@ nextline(struct parser_params *p, int set_encoding)
/* after here-document without terminator */
goto end_of_input;
}
- add_delayed_token(p, p->lex.ptok, p->lex.pend);
+ add_delayed_token(p, p->lex.ptok, p->lex.pend, __LINE__);
if (p->heredoc_end > 0) {
p->ruby_sourceline = p->heredoc_end;
p->heredoc_end = 0;
@@ -6776,7 +7057,6 @@ nextline(struct parser_params *p, int set_encoding)
p->lex.pbeg = p->lex.pcur = RSTRING_PTR(v);
p->lex.pend = p->lex.pcur + RSTRING_LEN(v);
token_flush(p);
- p->lex.prevline = p->lex.lastline;
p->lex.lastline = v;
return 0;
}
@@ -6929,20 +7209,22 @@ tokadd_codepoint(struct parser_params *p, rb_encoding **encp,
{
size_t numlen;
int codepoint = scan_hex(p->lex.pcur, wide ? p->lex.pend - p->lex.pcur : 4, &numlen);
- literal_flush(p, p->lex.pcur);
p->lex.pcur += numlen;
if (p->lex.strterm == NULL ||
(p->lex.strterm->flags & STRTERM_HEREDOC) ||
(p->lex.strterm->u.literal.u1.func != str_regexp)) {
if (wide ? (numlen == 0 || numlen > 6) : (numlen < 4)) {
+ literal_flush(p, p->lex.pcur);
yyerror0("invalid Unicode escape");
return wide && numlen > 0;
}
if (codepoint > 0x10ffff) {
+ literal_flush(p, p->lex.pcur);
yyerror0("invalid Unicode codepoint (too large)");
return wide;
}
if ((codepoint & 0xfffff800) == 0xd800) {
+ literal_flush(p, p->lex.pcur);
yyerror0("invalid Unicode codepoint");
return wide;
}
@@ -7363,7 +7645,6 @@ tokadd_string(struct parser_params *p,
}
}
else if (c == '\\') {
- literal_flush(p, p->lex.pcur - 1);
c = nextc(p);
switch (c) {
case '\n':
@@ -7511,7 +7792,21 @@ flush_string_content(struct parser_params *p, rb_encoding *enc)
yylval.val = content;
}
#else
-#define flush_string_content(p, enc) ((void)(enc))
+static void
+flush_string_content(struct parser_params *p, rb_encoding *enc)
+{
+ if (has_delayed_token(p)) {
+ ptrdiff_t len = p->lex.pcur - p->lex.ptok;
+ if (len > 0) {
+ rb_enc_str_buf_cat(p->delayed.token, p->lex.ptok, len, enc);
+ p->delayed.end_line = p->ruby_sourceline;
+ p->delayed.end_col = rb_long2int(p->lex.pcur - p->lex.pbeg);
+ }
+ dispatch_delayed_token(p, tSTRING_CONTENT);
+ p->lex.ptok = p->lex.pcur;
+ }
+ dispatch_scan_event(p, tSTRING_CONTENT);
+}
#endif
RUBY_FUNC_EXPORTED const unsigned int ruby_global_name_punct_bits[(0x7e - 0x20 + 31) / 32];
@@ -7630,14 +7925,14 @@ parse_string(struct parser_params *p, rb_strterm_literal_t *quote)
if (func & STR_FUNC_QWORDS) {
quote->u1.func |= STR_FUNC_TERM;
pushback(p, c); /* dispatch the term at tSTRING_END */
- add_delayed_token(p, p->lex.ptok, p->lex.pcur);
+ add_delayed_token(p, p->lex.ptok, p->lex.pcur, __LINE__);
return ' ';
}
return parser_string_term(p, func);
}
if (space) {
pushback(p, c);
- add_delayed_token(p, p->lex.ptok, p->lex.pcur);
+ add_delayed_token(p, p->lex.ptok, p->lex.pcur, __LINE__);
return ' ';
}
newtok(p);
@@ -7997,12 +8292,29 @@ dispatch_heredoc_end(struct parser_params *p)
dispatch_delayed_token(p, tSTRING_CONTENT);
str = STR_NEW(p->lex.ptok, p->lex.pend - p->lex.ptok);
ripper_dispatch1(p, ripper_token2eventid(tHEREDOC_END), str);
+ RUBY_SET_YYLLOC_FROM_STRTERM_HEREDOC(*p->yylloc);
lex_goto_eol(p);
token_flush(p);
}
#else
-#define dispatch_heredoc_end(p) ((void)0)
+#define dispatch_heredoc_end(p) parser_dispatch_heredoc_end(p, __LINE__)
+static void
+parser_dispatch_heredoc_end(struct parser_params *p, int line)
+{
+ if (has_delayed_token(p))
+ dispatch_delayed_token(p, tSTRING_CONTENT);
+
+ if (p->keep_tokens) {
+ VALUE str = STR_NEW(p->lex.ptok, p->lex.pend - p->lex.ptok);
+ RUBY_SET_YYLLOC_OF_HEREDOC_END(*p->yylloc);
+ parser_append_tokens(p, str, tHEREDOC_END, line);
+ }
+
+ RUBY_SET_YYLLOC_FROM_STRTERM_HEREDOC(*p->yylloc);
+ lex_goto_eol(p);
+ token_flush(p);
+}
#endif
static enum yytokentype
@@ -9430,6 +9742,16 @@ parse_ident(struct parser_params *p, int c, int cmd_state)
return result;
}
+static void
+warn_cr(struct parser_params *p)
+{
+ if (!p->cr_seen) {
+ p->cr_seen = TRUE;
+ /* carried over with p->lex.nextline for nextc() */
+ rb_warn0("encountered \\r in middle of line, treated as a mere space");
+ }
+}
+
static enum yytokentype
parser_yylex(struct parser_params *p)
{
@@ -9443,6 +9765,7 @@ parser_yylex(struct parser_params *p)
if (p->lex.strterm) {
if (p->lex.strterm->flags & STRTERM_HEREDOC) {
+ token_flush(p);
return here_document(p, &p->lex.strterm->u.heredoc);
}
else {
@@ -9453,11 +9776,11 @@ parser_yylex(struct parser_params *p)
cmd_state = p->command_start;
p->command_start = FALSE;
p->token_seen = TRUE;
- retry:
- last_state = p->lex.state;
#ifndef RIPPER
token_flush(p);
#endif
+ retry:
+ last_state = p->lex.state;
switch (c = nextc(p)) {
case '\0': /* NUL */
case '\004': /* ^D */
@@ -9467,26 +9790,27 @@ parser_yylex(struct parser_params *p)
#ifndef RIPPER
if (!NIL_P(p->end_expect_token_locations) && RARRAY_LEN(p->end_expect_token_locations) > 0) {
pop_end_expect_token_locations(p);
+ RUBY_SET_YYLLOC_OF_DUMMY_END(*p->yylloc);
return tDUMNY_END;
}
#endif
+ /* Set location for end-of-input because dispatch_scan_event is not called. */
+ RUBY_SET_YYLLOC(*p->yylloc);
return 0;
/* white spaces */
case '\r':
- if (!p->cr_seen) {
- p->cr_seen = TRUE;
- /* carried over with p->lex.nextline for nextc() */
- rb_warn0("encountered \\r in middle of line, treated as a mere space");
- }
+ warn_cr(p);
/* fall through */
case ' ': case '\t': case '\f':
case '\13': /* '\v' */
space_seen = 1;
-#ifdef RIPPER
while ((c = nextc(p))) {
switch (c) {
- case ' ': case '\t': case '\f': case '\r':
+ case '\r':
+ warn_cr(p);
+ /* fall through */
+ case ' ': case '\t': case '\f':
case '\13': /* '\v' */
break;
default:
@@ -9496,6 +9820,8 @@ parser_yylex(struct parser_params *p)
outofloop:
pushback(p, c);
dispatch_scan_event(p, tSP);
+#ifndef RIPPER
+ token_flush(p);
#endif
goto retry;
@@ -9533,7 +9859,10 @@ parser_yylex(struct parser_params *p)
break;
case '#':
pushback(p, c);
- if (space_seen) dispatch_scan_event(p, tSP);
+ if (space_seen) {
+ dispatch_scan_event(p, tSP);
+ token_flush(p);
+ }
goto retry;
case '&':
case '.': {
@@ -9548,18 +9877,10 @@ parser_yylex(struct parser_params *p)
p->ruby_sourceline--;
p->lex.nextline = p->lex.lastline;
case -1: /* EOF no decrement*/
-#ifndef RIPPER
- if (p->lex.prevline && !p->eofp) p->lex.lastline = p->lex.prevline;
- p->lex.pbeg = RSTRING_PTR(p->lex.lastline);
- p->lex.pend = p->lex.pcur = p->lex.pbeg + RSTRING_LEN(p->lex.lastline);
- pushback(p, 1); /* always pushback */
- p->lex.ptok = p->lex.pcur;
-#else
lex_goto_eol(p);
if (c != -1) {
p->lex.ptok = p->lex.pcur;
}
-#endif
goto normal_newline;
}
}
@@ -10157,12 +10478,9 @@ yylex(YYSTYPE *lval, YYLTYPE *yylloc, struct parser_params *p)
p->lval = lval;
lval->val = Qundef;
- t = parser_yylex(p);
+ p->yylloc = yylloc;
- if (p->lex.strterm && (p->lex.strterm->flags & STRTERM_HEREDOC))
- RUBY_SET_YYLLOC_FROM_STRTERM_HEREDOC(*yylloc);
- else
- RUBY_SET_YYLLOC(*yylloc);
+ t = parser_yylex(p);
if (has_delayed_token(p))
dispatch_delayed_token(p, t);
@@ -11054,6 +11372,34 @@ rb_parser_set_location_from_strterm_heredoc(struct parser_params *p, rb_strterm_
}
YYLTYPE *
+rb_parser_set_location_of_delayed_token(struct parser_params *p, YYLTYPE *yylloc)
+{
+ yylloc->beg_pos.lineno = p->delayed.beg_line;
+ yylloc->beg_pos.column = p->delayed.beg_col;
+ yylloc->end_pos.lineno = p->delayed.end_line;
+ yylloc->end_pos.column = p->delayed.end_col;
+
+ return yylloc;
+}
+
+YYLTYPE *
+rb_parser_set_location_of_heredoc_end(struct parser_params *p, YYLTYPE *yylloc)
+{
+ int sourceline = p->ruby_sourceline;
+ int beg_pos = (int)(p->lex.ptok - p->lex.pbeg);
+ int end_pos = (int)(p->lex.pend - p->lex.pbeg);
+ return rb_parser_set_pos(yylloc, sourceline, beg_pos, end_pos);
+}
+
+YYLTYPE *
+rb_parser_set_location_of_dummy_end(struct parser_params *p, YYLTYPE *yylloc)
+{
+ yylloc->end_pos = yylloc->beg_pos;
+
+ return yylloc;
+}
+
+YYLTYPE *
rb_parser_set_location_of_none(struct parser_params *p, YYLTYPE *yylloc)
{
int sourceline = p->ruby_sourceline;
@@ -13329,13 +13675,15 @@ parser_initialize(struct parser_params *p)
p->ruby_sourcefile_string = Qnil;
p->lex.lpar_beg = -1; /* make lambda_beginning_p() == FALSE at first */
p->node_id = 0;
-#ifdef RIPPER
p->delayed.token = Qnil;
+#ifdef RIPPER
p->result = Qnil;
p->parsing_thread = Qnil;
#else
p->error_buffer = Qfalse;
p->end_expect_token_locations = Qnil;
+ p->token_id = 0;
+ p->tokens = Qnil;
#endif
p->debug_buffer = Qnil;
p->debug_output = rb_ractor_stdout();
@@ -13353,20 +13701,20 @@ parser_mark(void *ptr)
struct parser_params *p = (struct parser_params*)ptr;
rb_gc_mark(p->lex.input);
- rb_gc_mark(p->lex.prevline);
rb_gc_mark(p->lex.lastline);
rb_gc_mark(p->lex.nextline);
rb_gc_mark(p->ruby_sourcefile_string);
rb_gc_mark((VALUE)p->lex.strterm);
rb_gc_mark((VALUE)p->ast);
rb_gc_mark(p->case_labels);
+ rb_gc_mark(p->delayed.token);
#ifndef RIPPER
rb_gc_mark(p->debug_lines);
rb_gc_mark(p->compile_option);
rb_gc_mark(p->error_buffer);
rb_gc_mark(p->end_expect_token_locations);
+ rb_gc_mark(p->tokens);
#else
- rb_gc_mark(p->delayed.token);
rb_gc_mark(p->value);
rb_gc_mark(p->result);
rb_gc_mark(p->parsing_thread);
@@ -13480,6 +13828,16 @@ rb_parser_error_tolerant(VALUE vparser)
p->end_expect_token_locations = rb_ary_new();
}
+void
+rb_parser_keep_tokens(VALUE vparser)
+{
+ struct parser_params *p;
+
+ TypedData_Get_Struct(vparser, struct parser_params, &parser_data_type, p);
+ p->keep_tokens = 1;
+ p->tokens = rb_ary_new();
+}
+
#endif
#ifdef RIPPER