diff options
author | Yuichiro Kaneko <spiketeika@gmail.com> | 2023-05-12 18:25:10 +0900 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-05-12 18:25:10 +0900 |
commit | a1b01e7701f9fc370f8dff777aad6d39a2c5a3e3 (patch) | |
tree | 69bb0c08c139f1c7c5abe9422649f11581f85529 /tool/lrama/lib/lrama | |
parent | d314fe42f987fcfaad67f102ec418ee4ca32ee99 (diff) |
Use Lrama LALR parser generator instead of Bison [tag: v3_3_0_preview1]
https://bugs.ruby-lang.org/issues/19637
Co-authored-by: Nobuyoshi Nakada <nobu@ruby-lang.org>
Notes
Notes:
Merged: https://github.com/ruby/ruby/pull/7798
Merged-By: yui-knk <spiketeika@gmail.com>
Diffstat (limited to 'tool/lrama/lib/lrama')
-rw-r--r-- | tool/lrama/lib/lrama/bitmap.rb | 29 | ||||
-rw-r--r-- | tool/lrama/lib/lrama/command.rb | 144 | ||||
-rw-r--r-- | tool/lrama/lib/lrama/context.rb | 511 | ||||
-rw-r--r-- | tool/lrama/lib/lrama/digraph.rb | 53 | ||||
-rw-r--r-- | tool/lrama/lib/lrama/grammar.rb | 850 | ||||
-rw-r--r-- | tool/lrama/lib/lrama/lexer.rb | 349 | ||||
-rw-r--r-- | tool/lrama/lib/lrama/output.rb | 370 | ||||
-rw-r--r-- | tool/lrama/lib/lrama/parser.rb | 321 | ||||
-rw-r--r-- | tool/lrama/lib/lrama/report.rb | 47 | ||||
-rw-r--r-- | tool/lrama/lib/lrama/states.rb | 832 | ||||
-rw-r--r-- | tool/lrama/lib/lrama/states_reporter.rb | 310 | ||||
-rw-r--r-- | tool/lrama/lib/lrama/version.rb | 3 | ||||
-rw-r--r-- | tool/lrama/lib/lrama/warning.rb | 25 |
13 files changed, 3844 insertions, 0 deletions
diff --git a/tool/lrama/lib/lrama/bitmap.rb b/tool/lrama/lib/lrama/bitmap.rb new file mode 100644 index 0000000000..8349a23c34 --- /dev/null +++ b/tool/lrama/lib/lrama/bitmap.rb @@ -0,0 +1,29 @@ +module Lrama + module Bitmap + def self.from_array(ary) + bit = 0 + + ary.each do |int| + bit |= (1 << int) + end + + bit + end + + def self.to_array(int) + a = [] + i = 0 + + while int > 0 do + if int & 1 == 1 + a << i + end + + i += 1 + int >>= 1 + end + + a + end + end +end diff --git a/tool/lrama/lib/lrama/command.rb b/tool/lrama/lib/lrama/command.rb new file mode 100644 index 0000000000..71369de8ef --- /dev/null +++ b/tool/lrama/lib/lrama/command.rb @@ -0,0 +1,144 @@ +require 'optparse' + +module Lrama + class Command + def run(argv) + opt = OptionParser.new + + # opt.on('-h') {|v| p v } + opt.on('-V', '--version') {|v| puts Lrama::VERSION ; exit 0 } + + # Tuning the Parser + skeleton = "bison/yacc.c" + + opt.on('-S', '--skeleton=FILE') {|v| skeleton = v } + opt.on('-t') { } # Do nothing + + # Output Files: + header = false + header_file = nil + report = [] + report_file = nil + outfile = "y.tab.c" + + opt.on('-h', '--header=[FILE]') {|v| header = true; header_file = v } + opt.on('-d') { header = true } + opt.on('-r', '--report=THINGS') {|v| report = v.split(',') } + opt.on('--report-file=FILE') {|v| report_file = v } + opt.on('-v') { } # Do nothing + opt.on('-o', '--output=FILE') {|v| outfile = v } + + # Hidden + trace = [] + opt.on('--trace=THINGS') {|v| trace = v.split(',') } + + # Error Recovery + error_recovery = false + opt.on('-e') {|v| error_recovery = true } + + opt.parse!(argv) + + trace_opts = validate_trace(trace) + report_opts = validate_report(report) + + grammar_file = argv.shift + + if !report.empty? && report_file.nil? 
&& grammar_file + report_file = File.dirname(grammar_file) + "/" + File.basename(grammar_file, ".*") + ".output" + end + + if !header_file && header + case + when outfile + header_file = File.dirname(outfile) + "/" + File.basename(outfile, ".*") + ".h" + when grammar_file + header_file = File.dirname(grammar_file) + "/" + File.basename(grammar_file, ".*") + ".h" + end + end + + if !grammar_file + puts "File should be specified\n" + exit 1 + end + + Report::Duration.enable if trace_opts[:time] + + warning = Lrama::Warning.new + y = File.read(grammar_file) + grammar = Lrama::Parser.new(y).parse + states = Lrama::States.new(grammar, warning, trace_state: (trace_opts[:automaton] || trace_opts[:closure])) + states.compute + context = Lrama::Context.new(states) + + if report_file + reporter = Lrama::StatesReporter.new(states) + File.open(report_file, "w+") do |f| + reporter.report(f, **report_opts) + end + end + + File.open(outfile, "w+") do |f| + Lrama::Output.new( + out: f, + output_file_path: outfile, + template_name: skeleton, + grammar_file_path: grammar_file, + header_file_path: header_file, + context: context, + grammar: grammar, + ).render + end + + if warning.has_error? + exit 1 + end + end + + private + + def validate_report(report) + bison_list = %w[states itemsets lookaheads solved counterexamples cex all none] + others = %w[verbose] + list = bison_list + others + not_supported = %w[counterexamples cex none] + h = { grammar: true } + + report.each do |r| + if list.include?(r) && !not_supported.include?(r) + h[r.to_sym] = true + else + raise "Invalid report option \"#{r}\"." 
+ end + end + + if h[:all] + (bison_list - not_supported).each do |r| + h[r.to_sym] = true + end + + h.delete(:all) + end + + return h + end + + def validate_trace(trace) + list = %w[ + none locations scan parse automaton bitsets + closure grammar resource sets muscles tools + m4-early m4 skeleton time ielr cex all + ] + h = {} + + trace.each do |t| + if list.include?(t) + h[t.to_sym] = true + else + raise "Invalid trace option \"#{t}\"." + end + end + + return h + end + end +end diff --git a/tool/lrama/lib/lrama/context.rb b/tool/lrama/lib/lrama/context.rb new file mode 100644 index 0000000000..9086470011 --- /dev/null +++ b/tool/lrama/lib/lrama/context.rb @@ -0,0 +1,511 @@ +require "lrama/report" + +module Lrama + # This is passed to a template + class Context + include Report::Duration + + ErrorActionNumber = -Float::INFINITY + BaseMin = -Float::INFINITY + + # TODO: It might be better to pass `states` to Output directly? + attr_reader :states + + def initialize(states) + @states = states + @yydefact = nil + @yydefgoto = nil + # Array of array + @_actions = [] + + compute_tables + end + + # enum yytokentype + def yytokentype + @states.terms.reject do |term| + 0 < term.token_id && term.token_id < 128 + end.map do |term| + [term.id.s_value, term.token_id, term.display_name] + end.unshift(["YYEMPTY", -2, nil]) + end + + # enum yysymbol_kind_t + def yysymbol_kind_t + @states.symbols.map do |sym| + [sym.enum_name, sym.number, sym.comment] + end.unshift(["YYSYMBOL_YYEMPTY", -2, nil]) + end + + # State number of final (accepted) state + def yyfinal + @states.states.find do |state| + state.items.find do |item| + item.rule.lhs.id.s_value == "$accept" && item.end_of_rule? 
+ end + end.id + end + + def yylast + @yylast + end + + # Number of terms + def yyntokens + @states.terms.count + end + + # Number of nterms + def yynnts + @states.nterms.count + end + + # Number of rules + def yynrules + @states.rules.count + end + + # Number of states + def yynstates + @states.states.count + end + + # Last token number + def yymaxutok + @states.terms.map(&:token_id).max + end + + # YYTRANSLATE + # + # yytranslate is a mapping from token id to symbol number + def yytranslate + # 2 is YYSYMBOL_YYUNDEF + a = Array.new(yymaxutok, 2) + + @states.terms.each do |term| + a[term.token_id] = term.number + end + + return a + end + + # Mapping from rule number to line number of the rule is defined. + # Dummy rule is appended as the first element whose value is 0 + # because 0 means error in yydefact. + def yyrline + a = [0] + + @states.rules.each do |rule| + a << rule.lineno + end + + return a + end + + # Mapping from symbol number to its name + def yytname + @states.symbols.sort_by(&:number).map do |sym| + sym.display_name + end + end + + def yypact_ninf + @yypact_ninf + end + + def yytable_ninf + @yytable_ninf + end + + def yypact + @base[0...yynstates] + end + + def yydefact + @yydefact + end + + def yypgoto + @base[yynstates..-1] + end + + def yydefgoto + @yydefgoto + end + + def yytable + @table + end + + def yycheck + @check + end + + def yystos + @states.states.map do |state| + state.accessing_symbol.number + end + end + + # Mapping from rule number to symbol number of LHS. + # Dummy rule is appended as the first element whose value is 0 + # because 0 means error in yydefact. + def yyr1 + a = [0] + + @states.rules.each do |rule| + a << rule.lhs.number + end + + return a + end + + # Mapping from rule number to lenght of RHS. + # Dummy rule is appended as the first element whose value is 0 + # because 0 means error in yydefact. 
+ def yyr2 + a = [0] + + @states.rules.each do |rule| + a << rule.rhs.count + end + + return a + end + + private + + # Compute these + # + # See also: "src/tables.c" of Bison. + # + # * yydefact + # * yydefgoto + # * yypact and yypgoto + # * yytable + # * yycheck + # * yypact_ninf + # * yytable_ninf + def compute_tables + report_duration(:compute_yydefact) { compute_yydefact } + report_duration(:compute_yydefgoto) { compute_yydefgoto } + report_duration(:sort_actions) { sort_actions } + # debug_sorted_actions + report_duration(:compute_packed_table) { compute_packed_table } + end + + def vectors_count + @states.states.count + @states.nterms.count + end + + # In compressed table, rule 0 is appended as an error case + # and reduce is represented as minus number. + def rule_id_to_action_number(rule_id) + (rule_id + 1) * -1 + end + + # Symbol number is assinged to term first then nterm. + # This method calculates sequence_number for nterm. + def nterm_number_to_sequence_number(nterm_number) + nterm_number - @states.terms.count + end + + # Vector is states + nterms + def nterm_number_to_vector_number(nterm_number) + @states.states.count + (nterm_number - @states.terms.count) + end + + def compute_yydefact + # Default action (shift/reduce/error) for each state. + # Index is state id, value is `rule id + 1` of a default reduction. + @yydefact = Array.new(@states.states.count, 0) + + @states.states.each do |state| + # Action number means + # + # * number = 0, default action + # * number = -Float::INFINITY, error by %nonassoc + # * number > 0, shift then move to state "number" + # * number < 0, reduce by "-number" rule. Rule "number" is already added by 1. + actions = Array.new(@states.terms.count, 0) + + if state.reduces.map(&:selected_look_ahead).any? {|la| !la.empty? } + # Iterate reduces with reverse order so that first rule is used. 
+ state.reduces.reverse.each do |reduce| + reduce.look_ahead.each do |term| + actions[term.number] = rule_id_to_action_number(reduce.rule.id) + end + end + end + + # Shift is selected when S/R conflict exists. + state.selected_term_transitions.each do |shift, next_state| + actions[shift.next_sym.number] = next_state.id + end + + state.resolved_conflicts.select do |conflict| + conflict.which == :error + end.each do |conflict| + actions[conflict.symbol.number] = ErrorActionNumber + end + + # If default_reduction_rule, replase default_reduction_rule in + # actions with zero. + if state.default_reduction_rule + actions.map! do |e| + if e == rule_id_to_action_number(state.default_reduction_rule.id) + 0 + else + e + end + end + end + + # If no default_reduction_rule, default behavior is an + # error then replase ErrorActionNumber with zero. + if !state.default_reduction_rule + actions.map! do |e| + if e == ErrorActionNumber + 0 + else + e + end + end + end + + s = actions.each_with_index.map do |n, i| + [i, n] + end.select do |i, n| + # Remove default_reduction_rule entries + n != 0 + end + + if s.count != 0 + # Entry of @_actions is an array of + # + # * State id + # * Array of tuple, [from, to] where from is term number and to is action. + # * The number of "Array of tuple" used by sort_actions + # * "width" used by sort_actions + @_actions << [state.id, s, s.count, s.last[0] - s.first[0] + 1] + end + + @yydefact[state.id] = state.default_reduction_rule ? state.default_reduction_rule.id + 1 : 0 + end + end + + def compute_yydefgoto + # Default GOTO (nterm transition) for each nterm. + # Index is sequence number of nterm, value is state id + # of a default nterm transition destination. 
+ @yydefgoto = Array.new(@states.nterms.count, 0) + h = {} + # Mapping from nterm to next_states + nterm_to_next_states = {} + terms_count = @states.terms.count + + @states.states.each do |state| + state.nterm_transitions.each do |shift, next_state| + key = shift.next_sym + nterm_to_next_states[key] ||= [] + nterm_to_next_states[key] << [state, next_state] # [from_state, to_state] + end + end + + @states.nterms.each do |nterm| + if !(states = nterm_to_next_states[nterm]) + default_goto = 0 + not_default_gotos = [] + else + default_state = states.map(&:last).group_by {|s| s }.max_by {|_, v| v.count }.first + default_goto = default_state.id + not_default_gotos = [] + states.each do |from_state, to_state| + next if to_state.id == default_goto + not_default_gotos << [from_state.id, to_state.id] + end + end + + k = nterm_number_to_sequence_number(nterm.number) + @yydefgoto[k] = default_goto + + if not_default_gotos.count != 0 + v = nterm_number_to_vector_number(nterm.number) + + # Entry of @_actions is an array of + # + # * Nterm number as vector number + # * Array of tuple, [from, to] where from is state number and to is state number. + # * The number of "Array of tuple" used by sort_actions + # * "width" used by sort_actions + @_actions << [v, not_default_gotos, not_default_gotos.count, not_default_gotos.last[0] - not_default_gotos.first[0] + 1] + end + end + end + + def sort_actions + # This is not same with #sort_actions + # + # @sorted_actions = @_actions.sort_by do |_, _, count, width| + # [-width, -count] + # end + + @sorted_actions = [] + + @_actions.each do |action| + if @sorted_actions.empty? 
+ @sorted_actions << action + next + end + + j = @sorted_actions.count - 1 + state_id, froms_and_tos, count, width = action + + while (j >= 0) do + case + when @sorted_actions[j][3] < width + j -= 1 + when @sorted_actions[j][3] == width && @sorted_actions[j][2] < count + j -= 1 + else + break + end + end + + @sorted_actions.insert(j + 1, action) + end + end + + def debug_sorted_actions + ary = Array.new + @sorted_actions.each do |state_id, froms_and_tos, count, width| + ary[state_id] = [state_id, froms_and_tos, count, width] + end + + print sprintf("table_print:\n\n") + + print sprintf("order [\n") + vectors_count.times do |i| + print sprintf("%d, ", @sorted_actions[i] ? @sorted_actions[i][0] : 0) + print "\n" if i % 10 == 9 + end + print sprintf("]\n\n") + + + print sprintf("width [\n") + vectors_count.times do |i| + print sprintf("%d, ", ary[i] ? ary[i][3] : 0) + print "\n" if i % 10 == 9 + end + print sprintf("]\n\n") + + + print sprintf("tally [\n") + vectors_count.times do |i| + print sprintf("%d, ", ary[i] ? ary[i][2] : 0) + print "\n" if i % 10 == 9 + end + print sprintf("]\n\n") + end + + def compute_packed_table + # yypact and yypgoto + @base = Array.new(vectors_count, BaseMin) + # yytable + @table = [] + # yycheck + @check = [] + # Key is froms_and_tos, value is index position + pushed = {} + userd_res = {} + lowzero = 0 + high = 0 + + @sorted_actions.each do |state_id, froms_and_tos, _, _| + if (res = pushed[froms_and_tos]) + @base[state_id] = res + next + end + + res = lowzero - froms_and_tos.first[0] + + while true do + ok = true + + froms_and_tos.each do |from, to| + loc = res + from + + if @table[loc] + # If the cell of table is set, can not use the cell. 
+ ok = false + break + end + end + + if ok && userd_res[res] + ok = false + end + + if ok + break + else + res += 1 + end + end + + loc = 0 + + froms_and_tos.each do |from, to| + loc = res + from + + @table[loc] = to + @check[loc] = from + end + + while (@table[lowzero]) do + lowzero += 1 + end + + high = loc if high < loc + + @base[state_id] = res + pushed[froms_and_tos] = res + userd_res[res] = true + end + + @yylast = high + + # replace_ninf + @yypact_ninf = (@base.select {|i| i != BaseMin } + [0]).min - 1 + @base.map! do |i| + case i + when BaseMin + @yypact_ninf + else + i + end + end + + @yytable_ninf = (@table.compact.select {|i| i != ErrorActionNumber } + [0]).min - 1 + @table.map! do |i| + case i + when nil + 0 + when ErrorActionNumber + @yytable_ninf + else + i + end + end + + @check.map! do |i| + case i + when nil + -1 + else + i + end + end + end + end +end diff --git a/tool/lrama/lib/lrama/digraph.rb b/tool/lrama/lib/lrama/digraph.rb new file mode 100644 index 0000000000..28f26781b1 --- /dev/null +++ b/tool/lrama/lib/lrama/digraph.rb @@ -0,0 +1,53 @@ +module Lrama + # Algorithm Digraph of https://dl.acm.org/doi/pdf/10.1145/69622.357187 (P. 
625) + class Digraph + def initialize(sets, relation, base_function) + # X in the paper + @sets = sets + # R in the paper + @relation = relation + # F' in the paper + @base_function = base_function + # S in the paper + @stack = [] + # N in the paper + @h = Hash.new(0) + # F in the paper + @result = {} + end + + def compute + @sets.each do |x| + next if @h[x] != 0 + traverse(x) + end + + return @result + end + + private + + def traverse(x) + @stack.push(x) + d = @stack.count + @h[x] = d + @result[x] = @base_function[x] # F x = F' x + + @relation[x] && @relation[x].each do |y| + traverse(y) if @h[y] == 0 + @h[x] = [@h[x], @h[y]].min + @result[x] |= @result[y] # F x = F x + F y + end + + if @h[x] == d + while true do + z = @stack.pop + @h[z] = Float::INFINITY + @result[z] = @result[x] # F (Top of S) = F x + + break if z == x + end + end + end + end +end diff --git a/tool/lrama/lib/lrama/grammar.rb b/tool/lrama/lib/lrama/grammar.rb new file mode 100644 index 0000000000..8ca75cb60b --- /dev/null +++ b/tool/lrama/lib/lrama/grammar.rb @@ -0,0 +1,850 @@ +require "forwardable" +require "lrama/lexer" + +module Lrama + Rule = Struct.new(:id, :lhs, :rhs, :code, :nullable, :precedence_sym, :lineno, keyword_init: true) do + # TODO: Change this to display_name + def to_s + l = lhs.id.s_value + r = rhs.empty? ? "ε" : rhs.map {|r| r.id.s_value }.join(", ") + + "#{l} -> #{r}" + end + + # Used by #user_actions + def as_comment + l = lhs.id.s_value + r = rhs.empty? ? "%empty" : rhs.map {|r| r.display_name }.join(" ") + + "#{l}: #{r}" + end + + def precedence + precedence_sym && precedence_sym.precedence + end + + def initial_rule? 
+ id == 0 + end + + def translated_code + if code + code.translated_code + else + nil + end + end + end + + # Symbol is both of nterm and term + # `number` is both for nterm and term + # `token_id` is tokentype for term, internal sequence number for nterm + # + # TODO: Add validation for ASCII code range for Token::Char + Symbol = Struct.new(:id, :alias_name, :number, :tag, :term, :token_id, :nullable, :precedence, :printer, keyword_init: true) do + attr_writer :eof_symbol, :error_symbol, :undef_symbol, :accept_symbol + + def term? + term + end + + def nterm? + !term + end + + def eof_symbol? + !!@eof_symbol + end + + def error_symbol? + !!@error_symbol + end + + def undef_symbol? + !!@undef_symbol + end + + def accept_symbol? + !!@accept_symbol + end + + def display_name + if alias_name + alias_name + else + id.s_value + end + end + + # name for yysymbol_kind_t + # + # See: b4_symbol_kind_base + def enum_name + case + when accept_symbol? + name = "YYACCEPT" + when eof_symbol? + name = "YYEOF" + when term? && id.type == Token::Char + if alias_name + name = number.to_s + alias_name + else + name = number.to_s + id.s_value + end + when term? && id.type == Token::Ident + name = id.s_value + when nterm? && (id.s_value.include?("$") || id.s_value.include?("@")) + name = number.to_s + id.s_value + when nterm? + name = id.s_value + else + raise "Unexpected #{self}" + end + + "YYSYMBOL_" + name.gsub(/[^a-zA-Z_0-9]+/, "_") + end + + # comment for yysymbol_kind_t + def comment + case + when accept_symbol? + # YYSYMBOL_YYACCEPT + id.s_value + when eof_symbol? + # YYEOF + alias_name + when (term? 
&& 0 < token_id && token_id < 128) + # YYSYMBOL_3_backslash_, YYSYMBOL_14_ + alias_name || id.s_value + when id.s_value.include?("$") || id.s_value.include?("@") + # YYSYMBOL_21_1 + id.s_value + else + # YYSYMBOL_keyword_class, YYSYMBOL_strings_1 + alias_name || id.s_value + end + end + end + + Type = Struct.new(:id, :tag, keyword_init: true) + + Code = Struct.new(:type, :token_code, keyword_init: true) do + extend Forwardable + + def_delegators "token_code", :s_value, :line, :column, :references + + # $$, $n, @$, @n is translated to C code + def translated_code + case type + when :user_code + translated_user_code + when :initial_action + translated_initial_action_code + end + end + + # * ($1) error + # * ($$) *yyvaluep + # * (@1) error + # * (@$) *yylocationp + def translated_printer_code(tag) + t_code = s_value.dup + + references.reverse.each do |ref| + first_column = ref.first_column + last_column = ref.last_column + + case + when ref.number == "$" && ref.type == :dollar # $$ + # Omit "<>" + member = tag.s_value[1..-2] + str = "((*yyvaluep).#{member})" + when ref.number == "$" && ref.type == :at # @$ + str = "(*yylocationp)" + when ref.type == :dollar # $n + raise "$#{ref.number} can not be used in %printer." + when ref.type == :at # @n + raise "@#{ref.number} can not be used in %printer." + else + raise "Unexpected. 
#{code}, #{ref}" + end + + t_code[first_column..last_column] = str + end + + return t_code + end + + + private + + # * ($1) yyvsp[i] + # * ($$) yyval + # * (@1) yylsp[i] + # * (@$) yyloc + def translated_user_code + t_code = s_value.dup + + references.reverse.each do |ref| + first_column = ref.first_column + last_column = ref.last_column + + case + when ref.number == "$" && ref.type == :dollar # $$ + # Omit "<>" + member = ref.tag.s_value[1..-2] + str = "(yyval.#{member})" + when ref.number == "$" && ref.type == :at # @$ + str = "(yyloc)" + when ref.type == :dollar # $n + i = -ref.position_in_rhs + ref.number + # Omit "<>" + member = ref.tag.s_value[1..-2] + str = "(yyvsp[#{i}].#{member})" + when ref.type == :at # @n + i = -ref.position_in_rhs + ref.number + str = "(yylsp[#{i}])" + else + raise "Unexpected. #{code}, #{ref}" + end + + t_code[first_column..last_column] = str + end + + return t_code + end + + # * ($1) error + # * ($$) yylval + # * (@1) error + # * (@$) yylloc + def translated_initial_action_code + t_code = s_value.dup + + references.reverse.each do |ref| + first_column = ref.first_column + last_column = ref.last_column + + case + when ref.number == "$" && ref.type == :dollar # $$ + str = "yylval" + when ref.number == "$" && ref.type == :at # @$ + str = "yylloc" + when ref.type == :dollar # $n + raise "$#{ref.number} can not be used in initial_action." + when ref.type == :at # @n + raise "@#{ref.number} can not be used in initial_action." + else + raise "Unexpected. 
#{code}, #{ref}" + end + + t_code[first_column..last_column] = str + end + + return t_code + end + end + + # type: :dollar or :at + # ex_tag: "$<tag>1" (Optional) + Reference = Struct.new(:type, :number, :ex_tag, :first_column, :last_column, :referring_symbol, :position_in_rhs, keyword_init: true) do + def tag + if ex_tag + ex_tag + else + referring_symbol.tag + end + end + end + + Precedence = Struct.new(:type, :precedence, keyword_init: true) do + include Comparable + + def <=>(other) + self.precedence <=> other.precedence + end + end + + Printer = Struct.new(:ident_or_tags, :code, :lineno, keyword_init: true) do + def translated_code(member) + code.translated_printer_code(member) + end + end + + Union = Struct.new(:code, :lineno, keyword_init: true) do + def braces_less_code + # Remove braces + code.s_value[1..-2] + end + end + + Token = Lrama::Lexer::Token + + # Grammar is the result of parsing an input grammar file + class Grammar + # Grammar file information not used by States but by Output + Aux = Struct.new(:prologue_first_lineno, :prologue, :epilogue_first_lineno, :epilogue, keyword_init: true) + + attr_reader :eof_symbol, :error_symbol, :undef_symbol, :accept_symbol, :aux + attr_accessor :union, :expect, + :printers, + :lex_param, :parse_param, :initial_action, + :symbols, :types, + :rules, :_rules, + :sym_to_rules + + def initialize + @printers = [] + @symbols = [] + @types = [] + @_rules = [] + @rules = [] + @sym_to_rules = {} + @empty_symbol = nil + @eof_symbol = nil + @error_symbol = nil + @undef_symbol = nil + @accept_symbol = nil + @aux = Aux.new + + append_special_symbols + end + + def add_printer(ident_or_tags:, code:, lineno:) + @printers << Printer.new(ident_or_tags: ident_or_tags, code: code, lineno: lineno) + end + + def add_term(id:, alias_name: nil, tag: nil, token_id: nil, replace: false) + if token_id && (sym = @symbols.find {|s| s.token_id == token_id }) + if replace + sym.id = id + sym.alias_name = alias_name + sym.tag = tag + end + + 
return sym + end + + if sym = @symbols.find {|s| s.id == id } + return sym + end + + sym = Symbol.new( + id: id, alias_name: alias_name, number: nil, tag: tag, + term: true, token_id: token_id, nullable: false + ) + @symbols << sym + @terms = nil + + return sym + end + + def add_nterm(id:, alias_name: nil, tag: nil) + return if @symbols.find {|s| s.id == id } + + sym = Symbol.new( + id: id, alias_name: alias_name, number: nil, tag: tag, + term: false, token_id: nil, nullable: nil, + ) + @symbols << sym + @nterms = nil + + return sym + end + + def add_type(id:, tag:) + @types << Type.new(id: id, tag: tag) + end + + def add_nonassoc(sym, precedence) + set_precedence(sym, Precedence.new(type: :nonassoc, precedence: precedence)) + end + + def add_left(sym, precedence) + set_precedence(sym, Precedence.new(type: :left, precedence: precedence)) + end + + def add_right(sym, precedence) + set_precedence(sym, Precedence.new(type: :right, precedence: precedence)) + end + + def set_precedence(sym, precedence) + raise "" if sym.nterm? + sym.precedence = precedence + end + + def set_union(code, lineno) + @union = Union.new(code: code, lineno: lineno) + end + + def add_rule(lhs:, rhs:, lineno:) + @_rules << [lhs, rhs, lineno] + end + + def build_references(token_code) + token_code.references.map! 
do |type, number, tag, first_column, last_column| + Reference.new(type: type, number: number, ex_tag: tag, first_column: first_column, last_column: last_column) + end + + token_code + end + + def build_code(type, token_code) + build_references(token_code) + Code.new(type: type, token_code: token_code) + end + + def prologue_first_lineno=(prologue_first_lineno) + @aux.prologue_first_lineno = prologue_first_lineno + end + + def prologue=(prologue) + @aux.prologue = prologue + end + + def epilogue_first_lineno=(epilogue_first_lineno) + @aux.epilogue_first_lineno = epilogue_first_lineno + end + + def epilogue=(epilogue) + @aux.epilogue = epilogue + end + + def prepare + normalize_rules + collect_symbols + replace_token_with_symbol + fill_symbol_number + fill_default_precedence + fill_sym_to_rules + fill_nterm_type + fill_symbol_printer + @symbols.sort_by!(&:number) + end + + # TODO: More validation methods + def validate! + validate_symbol_number_uniqueness! + end + + def compute_nullable + @rules.each do |rule| + case + when rule.rhs.empty? + rule.nullable = true + when rule.rhs.any?(&:term) + rule.nullable = false + else + # noop + end + end + + while true do + rs = @rules.select {|e| e.nullable.nil? } + nts = nterms.select {|e| e.nullable.nil? } + rule_count_1 = rs.count + nterm_count_1 = nts.count + + rs.each do |rule| + if rule.rhs.all?(&:nullable) + rule.nullable = true + end + end + + nts.each do |nterm| + find_rules_by_symbol!(nterm).each do |rule| + if rule.nullable + nterm.nullable = true + end + end + end + + rule_count_2 = @rules.count {|e| e.nullable.nil? } + nterm_count_2 = nterms.count {|e| e.nullable.nil? } + + if (rule_count_1 == rule_count_2) && (nterm_count_1 == nterm_count_2) + break + end + end + + rules.select {|r| r.nullable.nil? }.each do |rule| + rule.nullable = false + end + + nterms.select {|r| r.nullable.nil? 
}.each do |nterm| + nterm.nullable = false + end + end + + def find_symbol_by_s_value(s_value) + @symbols.find do |sym| + sym.id.s_value == s_value + end + end + + def find_symbol_by_s_value!(s_value) + find_symbol_by_s_value(s_value) || (raise "Symbol not found: #{s_value}") + end + + def find_symbol_by_id(id) + @symbols.find do |sym| + # TODO: validate uniqueness of Token#s_value and Symbol#alias_name + sym.id == id || sym.alias_name == id.s_value + end + end + + def find_symbol_by_id!(id) + find_symbol_by_id(id) || (raise "Symbol not found: #{id}") + end + + def find_symbol_by_number!(number) + sym = @symbols[number] + + raise "Symbol not found: #{number}" unless sym + raise "[BUG] Symbol number mismatch. #{number}, #{sym}" if sym.number != number + + sym + end + + def find_rules_by_symbol!(sym) + find_rules_by_symbol(sym) || (raise "Rules for #{sym} not found") + end + + def find_rules_by_symbol(sym) + @sym_to_rules[sym.number] + end + + def terms_count + terms.count + end + + def terms + @terms ||= @symbols.select(&:term?) + end + + def nterms_count + nterms.count + end + + def nterms + @nterms ||= @symbols.select(&:nterm?) 
+ end + + private + + def find_nterm_by_id!(id) + nterms.find do |nterm| + nterm.id == id + end || (raise "Nterm not found: #{id}") + end + + + def append_special_symbols + # YYEMPTY (token_id: -2, number: -2) is added when a template is evaluated + # term = add_term(id: Token.new(Token::Ident, "YYEMPTY"), token_id: -2) + # term.number = -2 + # @empty_symbol = term + + # YYEOF + term = add_term(id: Token.new(type: Token::Ident, s_value: "YYEOF"), alias_name: "\"end of file\"", token_id: 0) + term.number = 0 + term.eof_symbol = true + @eof_symbol = term + + # YYerror + term = add_term(id: Token.new(type: Token::Ident, s_value: "YYerror"), alias_name: "error") + term.number = 1 + term.error_symbol = true + @error_symbol = term + + # YYUNDEF + term = add_term(id: Token.new(type: Token::Ident, s_value: "YYUNDEF"), alias_name: "\"invalid token\"") + term.number = 2 + term.undef_symbol = true + @undef_symbol = term + + # $accept + term = add_nterm(id: Token.new(type: Token::Ident, s_value: "$accept")) + term.accept_symbol = true + @accept_symbol = term + end + + # 1. Add $accept rule to the top of rules + # 2. Extract precedence and last action + # 3. Extract action in the middle of RHS into new Empty rule + # 4. Append id and extract action then create Rule + # + # Bison 3.8.2 uses different orders for symbol number and rule number + # when a rule has actions in the middle of a rule. + # + # For example, + # + # `program: $@1 top_compstmt` + # + # Rules are ordered like below, + # + # 1 $@1: ε + # 2 program: $@1 top_compstmt + # + # Symbols are ordered like below, + # + # 164 program + # 165 $@1 + # + def normalize_rules + # 1. Add $accept rule to the top of rules + accept = find_symbol_by_s_value!("$accept") + eof = find_symbol_by_number!(0) + lineno = @_rules.first ? 
@_rules.first[2] : 0 + @rules << Rule.new(id: @rules.count, lhs: accept, rhs: [@_rules.first[0], eof], code: nil, lineno: lineno) + + extracted_action_number = 1 # @n as nterm + + @_rules.each do |lhs, rhs, lineno| + a = [] + rhs1 = [] + code = nil + precedence_sym = nil + + # 2. Extract precedence and last action + rhs.reverse.each do |r| + case + when r.is_a?(Symbol) # precedence_sym + precedence_sym = r + when (r.type == Token::User_code) && precedence_sym.nil? && code.nil? && rhs1.empty? + code = r + else + rhs1 << r + end + end + rhs1.reverse! + + # Bison n'th component is 1-origin + (rhs1 + [code]).compact.each.with_index(1) do |token, i| + if token.type == Token::User_code + token.references.each do |ref| + # Need to keep position_in_rhs for actions in the middle of RHS + ref.position_in_rhs = i - 1 + next if ref.type == :at + # $$, $n, @$, @n can be used in any actions + number = ref.number + + if number == "$" + # TODO: Should be postponed after middle actions are extracted? + ref.referring_symbol = lhs + else + raise "Can not refer following component. #{number} >= #{i}. #{token}" if number >= i + rhs1[number - 1].referred = true + ref.referring_symbol = rhs1[number - 1] + end + end + end + end + + rhs2 = rhs1.map do |token| + if token.type == Token::User_code + prefix = token.referred ? "@" : "$@" + new_token = Token.new(type: Token::Ident, s_value: prefix + extracted_action_number.to_s) + extracted_action_number += 1 + a << [new_token, token] + new_token + else + token + end + end + + # Extract actions in the middle of RHS + # into new rules. + a.each do |new_token, code| + @rules << Rule.new(id: @rules.count, lhs: new_token, rhs: [], code: Code.new(type: :user_code, token_code: code), lineno: code.line) + end + + c = code ? 
Code.new(type: :user_code, token_code: code) : nil + @rules << Rule.new(id: @rules.count, lhs: lhs, rhs: rhs2, code: c, precedence_sym: precedence_sym, lineno: lineno) + + add_nterm(id: lhs) + a.each do |new_token, _| + add_nterm(id: new_token) + end + end + end + + # Collect symbols from rules + def collect_symbols + @rules.flat_map(&:rhs).each do |s| + case s + when Token + if s.type == Token::Char + add_term(id: s) + end + when Symbol + # skip + else + raise "Unknown class: #{s}" + end + end + end + + # Fill #number and #token_id + def fill_symbol_number + # TODO: why start from 256 + token_id = 256 + + # YYEMPTY = -2 + # YYEOF = 0 + # YYerror = 1 + # YYUNDEF = 2 + number = 3 + + nterm_token_id = 0 + used_numbers = {} + + @symbols.map(&:number).each do |n| + used_numbers[n] = true + end + + (@symbols.select(&:term?) + @symbols.select(&:nterm?)).each do |sym| + while used_numbers[number] do + number += 1 + end + + if sym.number.nil? + sym.number = number + number += 1 + end + + # If id is Token::Char, it uses ASCII code + if sym.term? && sym.token_id.nil? + if sym.id.type == Token::Char + # Igonre ' on the both sides + case sym.id.s_value[1..-2] + when "\\b" + sym.token_id = 8 + when "\\f" + sym.token_id = 12 + when "\\n" + sym.token_id = 10 + when "\\r" + sym.token_id = 13 + when "\\t" + sym.token_id = 9 + when "\\v" + sym.token_id = 11 + when "\"" + sym.token_id = 34 + when "\'" + sym.token_id = 39 + when "\\\\" + sym.token_id = 92 + when /\A\\(\d+)\z/ + sym.token_id = Integer($1, 8) + when /\A(.)\z/ + sym.token_id = $1.bytes.first + else + raise "Unknown Char s_value #{sym}" + end + else + sym.token_id = token_id + token_id += 1 + end + end + + if sym.nterm? && sym.token_id.nil? + sym.token_id = nterm_token_id + nterm_token_id += 1 + end + end + end + + def replace_token_with_symbol + @rules.each do |rule| + rule.lhs = token_to_symbol(rule.lhs) + + rule.rhs.map! 
do |t| + token_to_symbol(t) + end + + if rule.code + rule.code.references.each do |ref| + next if ref.type == :at + + if ref.referring_symbol.type != Token::User_code + ref.referring_symbol = token_to_symbol(ref.referring_symbol) + end + end + end + end + end + + def token_to_symbol(token) + case token + when Token + find_symbol_by_id!(token) + when Symbol + token + else + raise "Unknown class: #{token}" + end + end + + # Rule inherits precedence from the last term in RHS. + # + # https://www.gnu.org/software/bison/manual/html_node/How-Precedence.html + def fill_default_precedence + @rules.each do |rule| + # Explicitly specified precedence has the highest priority + next if rule.precedence_sym + + precedence_sym = nil + rule.rhs.each do |sym| + precedence_sym = sym if sym.term? + end + + rule.precedence_sym = precedence_sym + end + end + + def fill_sym_to_rules + @rules.each do |rule| + key = rule.lhs.number + @sym_to_rules[key] ||= [] + @sym_to_rules[key] << rule + end + end + + # Fill nterm's tag defined by %type decl + def fill_nterm_type + @types.each do |type| + nterm = find_nterm_by_id!(type.id) + nterm.tag = type.tag + end + end + + def fill_symbol_printer + @symbols.each do |sym| + @printers.each do |printer| + printer.ident_or_tags.each do |ident_or_tag| + case ident_or_tag.type + when Token::Ident + sym.printer = printer if sym.id == ident_or_tag + when Token::Tag + sym.printer = printer if sym.tag == ident_or_tag + else + raise "Unknown token type. #{printer}" + end + end + end + end + end + + def validate_symbol_number_uniqueness! + invalid = @symbols.group_by(&:number).select do |number, syms| + syms.count > 1 + end + + return if invalid.empty? + + raise "Symbol number is dupulicated. 
#{invalid}" + end + end +end diff --git a/tool/lrama/lib/lrama/lexer.rb b/tool/lrama/lib/lrama/lexer.rb new file mode 100644 index 0000000000..5c9583327b --- /dev/null +++ b/tool/lrama/lib/lrama/lexer.rb @@ -0,0 +1,349 @@ +require "strscan" +require "lrama/report" + +module Lrama + # Lexer for parse.y + class Lexer + include Lrama::Report::Duration + + # s_value is semantic value + Token = Struct.new(:type, :s_value, keyword_init: true) do + Type = Struct.new(:id, :name, keyword_init: true) + + attr_accessor :line, :column, :referred + # For User_code + attr_accessor :references + + def to_s + "#{super} line: #{line}, column: #{column}" + end + + @i = 0 + @types = [] + + def self.define_type(name) + type = Type.new(id: @i, name: name.to_s) + const_set(name, type) + @types << type + @i += 1 + end + + # Token types + define_type(:P_expect) # %expect + define_type(:P_define) # %define + define_type(:P_printer) # %printer + define_type(:P_lex_param) # %lex-param + define_type(:P_parse_param) # %parse-param + define_type(:P_initial_action) # %initial-action + define_type(:P_union) # %union + define_type(:P_token) # %token + define_type(:P_type) # %type + define_type(:P_nonassoc) # %nonassoc + define_type(:P_left) # %left + define_type(:P_right) # %right + define_type(:P_prec) # %prec + define_type(:User_code) # { ... 
} + define_type(:Tag) # <int> + define_type(:Number) # 0 + define_type(:Ident_Colon) # k_if:, k_if : (spaces can be there) + define_type(:Ident) # api.pure, tNUMBER + define_type(:Semicolon) # ; + define_type(:Bar) # | + define_type(:String) # "str" + define_type(:Char) # '+' + end + + # States + # + # See: https://www.gnu.org/software/bison/manual/html_node/Grammar-Outline.html + Initial = 0 + Prologue = 1 + BisonDeclarations = 2 + GrammarRules = 3 + Epilogue = 4 + + # Token types + + attr_reader :prologue, :bison_declarations, :grammar_rules, :epilogue, + :bison_declarations_tokens, :grammar_rules_tokens + + def initialize(text) + @text = text + @state = Initial + # Array of texts + @prologue = [] + @bison_declarations = [] + @grammar_rules = [] + @epilogue = [] + + # + @bison_declarations_tokens = [] + @grammar_rules_tokens = [] + + @debug = false + + report_duration(:lex) do + lex_text + lex_bison_declarations_tokens + lex_grammar_rules_tokens + end + end + + private + + def create_token(type, s_value, line, column) + t = Token.new(type: type, s_value: s_value) + t.line = line + t.column = column + + return t + end + + # TODO: Remove this + def lex_text + @text.each_line.with_index(1) do |string, lineno| + case @state + when Initial + # Skip until "%{" + if string == "%{\n" + @state = Prologue + @prologue << ["", lineno] + next + end + when Prologue + # Between "%{" and "%}" + if string == "%}\n" + @state = BisonDeclarations + @prologue << ["", lineno] + next + end + + @prologue << [string, lineno] + when BisonDeclarations + if string == "%%\n" + @state = GrammarRules + next + end + + @bison_declarations << [string, lineno] + when GrammarRules + # Between "%%" and "%%" + if string == "%%\n" + @state = Epilogue + next + end + + @grammar_rules << [string, lineno] + when Epilogue + @epilogue << [string, lineno] + else + raise "Unknown state: #{@state}" + end + end + end + + # See: + # * https://www.gnu.org/software/bison/manual/html_node/Decl-Summary.html + # * 
https://www.gnu.org/software/bison/manual/html_node/Symbol-Decls.html + # * https://www.gnu.org/software/bison/manual/html_node/Empty-Rules.html + def lex_common(lines, tokens) + line = lines.first[1] + column = 0 + ss = StringScanner.new(lines.map(&:first).join) + + while !ss.eos? do + case + when ss.scan(/\n/) + line += 1 + column = ss.pos + when ss.scan(/\s+/) + # skip + when ss.scan(/;/) + tokens << create_token(Token::Semicolon, ss[0], line, ss.pos - column) + when ss.scan(/\|/) + tokens << create_token(Token::Bar, ss[0], line, ss.pos - column) + when ss.scan(/(\d+)/) + tokens << create_token(Token::Number, Integer(ss[0]), line, ss.pos - column) + when ss.scan(/(<[a-zA-Z0-9_]+>)/) + tokens << create_token(Token::Tag, ss[0], line, ss.pos - column) + when ss.scan(/([a-zA-Z_.][-a-zA-Z0-9_.]*)\s*:/) + tokens << create_token(Token::Ident_Colon, ss[1], line, ss.pos - column) + when ss.scan(/([a-zA-Z_.][-a-zA-Z0-9_.]*)/) + tokens << create_token(Token::Ident, ss[0], line, ss.pos - column) + when ss.scan(/%expect/) + tokens << create_token(Token::P_expect, ss[0], line, ss.pos - column) + when ss.scan(/%define/) + tokens << create_token(Token::P_define, ss[0], line, ss.pos - column) + when ss.scan(/%printer/) + tokens << create_token(Token::P_printer, ss[0], line, ss.pos - column) + when ss.scan(/%lex-param/) + tokens << create_token(Token::P_lex_param, ss[0], line, ss.pos - column) + when ss.scan(/%parse-param/) + tokens << create_token(Token::P_parse_param, ss[0], line, ss.pos - column) + when ss.scan(/%initial-action/) + tokens << create_token(Token::P_initial_action, ss[0], line, ss.pos - column) + when ss.scan(/%union/) + tokens << create_token(Token::P_union, ss[0], line, ss.pos - column) + when ss.scan(/%token/) + tokens << create_token(Token::P_token, ss[0], line, ss.pos - column) + when ss.scan(/%type/) + tokens << create_token(Token::P_type, ss[0], line, ss.pos - column) + when ss.scan(/%nonassoc/) + tokens << create_token(Token::P_nonassoc, ss[0], line, 
ss.pos - column) + when ss.scan(/%left/) + tokens << create_token(Token::P_left, ss[0], line, ss.pos - column) + when ss.scan(/%right/) + tokens << create_token(Token::P_right, ss[0], line, ss.pos - column) + when ss.scan(/%prec/) + tokens << create_token(Token::P_prec, ss[0], line, ss.pos - column) + when ss.scan(/{/) + token, line = lex_user_code(ss, line, ss.pos - column, lines) + tokens << token + when ss.scan(/"/) + string, line = lex_string(ss, "\"", line, lines) + token = create_token(Token::String, string, line, ss.pos - column) + tokens << token + when ss.scan(/\/\*/) + # TODO: Need to keep comment? + line = lex_comment(ss, line, lines, "") + when ss.scan(/'(.)'/) + tokens << create_token(Token::Char, ss[0], line, ss.pos - column) + when ss.scan(/'\\(.)'/) # '\\', '\t' + tokens << create_token(Token::Char, ss[0], line, ss.pos - column) + when ss.scan(/'\\(\d+)'/) # '\13' + tokens << create_token(Token::Char, ss[0], line, ss.pos - column) + when ss.scan(/%empty/) + # skip + else + l = line - lines.first[1] + split = ss.string.split("\n") + col = ss.pos - split[0...l].join("\n").length + raise "Parse error (unknow token): #{split[l]} \"#{ss.string[ss.pos]}\" (#{line}: #{col})" + end + end + end + + def lex_bison_declarations_tokens + lex_common(@bison_declarations, @bison_declarations_tokens) + end + + def lex_user_code(ss, line, column, lines) + first_line = line + first_column = column + debug("Enter lex_user_code: #{line}") + brace_count = 1 + str = "{" + # Array of [type, $n, tag, first column, last column] + # TODO: Is it better to keep string, like "$$", and use gsub? + references = [] + + while !ss.eos? do + case + when ss.scan(/\n/) + line += 1 + when ss.scan(/"/) + string, line = lex_string(ss, "\"", line, lines) + str << string + next + when ss.scan(/'/) + string, line = lex_string(ss, "'", line, lines) + str << string + next + when ss.scan(/\$(<[a-zA-Z0-9_]+>)?\$/) # $$, $<long>$ + tag = ss[1] ? 
create_token(Token::Tag, ss[1], line, str.length) : nil + references << [:dollar, "$", tag, str.length, str.length + ss[0].length - 1] + when ss.scan(/\$(<[a-zA-Z0-9_]+>)?(\d+)/) # $1, $2, $<long>1 + tag = ss[1] ? create_token(Token::Tag, ss[1], line, str.length) : nil + references << [:dollar, Integer(ss[2]), tag, str.length, str.length + ss[0].length - 1] + when ss.scan(/@\$/) # @$ + references << [:at, "$", nil, str.length, str.length + ss[0].length - 1] + when ss.scan(/@(\d)+/) # @1 + references << [:at, Integer(ss[1]), nil, str.length, str.length + ss[0].length - 1] + when ss.scan(/{/) + brace_count += 1 + when ss.scan(/}/) + brace_count -= 1 + + debug("Return lex_user_code: #{line}") + if brace_count == 0 + str << ss[0] + user_code = Token.new(type: Token::User_code, s_value: str.freeze) + user_code.line = first_line + user_code.column = first_column + user_code.references = references + return [user_code, line] + end + when ss.scan(/\/\*/) + str << ss[0] + line = lex_comment(ss, line, lines, str) + else + # noop, just consume char + str << ss.getch + next + end + + str << ss[0] + end + + # Reach to end of input but brace does not match + l = line - lines.first[1] + raise "Parse error (brace mismatch): #{ss.string.split("\n")[l]} \"#{ss.string[ss.pos]}\" (#{line}: #{ss.pos})" + end + + def lex_string(ss, terminator, line, lines) + debug("Enter lex_string: #{line}") + + str = terminator.dup + + while (c = ss.getch) do + str << c + + case c + when "\n" + line += 1 + when terminator + debug("Return lex_string: #{line}") + return [str, line] + else + # noop + end + end + + # Reach to end of input but quote does not match + l = line - lines.first[1] + raise "Parse error (quote mismatch): #{ss.string.split("\n")[l]} \"#{ss.string[ss.pos]}\" (#{line}: #{ss.pos})" + end + + # TODO: Need to handle // style comment + # + # /* */ style comment + def lex_comment(ss, line, lines, str) + while !ss.eos? 
do + case + when ss.scan(/\n/) + line += 1 + when ss.scan(/\*\//) + return line + else + str << ss.getch + next + end + + str << ss[0] + end + + # Reach to end of input but quote does not match + l = line - lines.first[1] + raise "Parse error (comment mismatch): #{ss.string.split("\n")[l]} \"#{ss.string[ss.pos]}\" (#{line}: #{ss.pos})" + end + + def lex_grammar_rules_tokens + lex_common(@grammar_rules, @grammar_rules_tokens) + end + + def debug(msg) + return unless @debug + puts "#{msg}\n" + end + end +end diff --git a/tool/lrama/lib/lrama/output.rb b/tool/lrama/lib/lrama/output.rb new file mode 100644 index 0000000000..eaefbd04dc --- /dev/null +++ b/tool/lrama/lib/lrama/output.rb @@ -0,0 +1,370 @@ +require "erb" +require "forwardable" +require "lrama/report" + +module Lrama + class Output + extend Forwardable + include Report::Duration + + attr_reader :grammar_file_path, :context, :grammar + + def_delegators "@context", :yyfinal, :yylast, :yyntokens, :yynnts, :yynrules, :yynstates, + :yymaxutok, :yypact_ninf, :yytable_ninf + + def_delegators "@grammar", :eof_symbol, :error_symbol, :undef_symbol, :accept_symbol + + def initialize(out:, output_file_path:, template_name:, grammar_file_path:, header_out: nil, header_file_path: nil, context:, grammar:) + @out = out + @output_file_path = output_file_path + @template_name = template_name + @grammar_file_path = grammar_file_path + @header_out = header_out + @header_file_path = header_file_path + @context = context + @grammar = grammar + end + + if ERB.instance_method(:initialize).parameters.last.first == :key + def self.erb(input) + ERB.new(input, trim_mode: '-') + end + else + def self.erb(input) + ERB.new(input, nil, '-') + end + end + + def eval_template(file, path) + erb = self.class.erb(File.read(file)) + erb.filename = file + tmp = erb.result_with_hash(context: @context, output: self) + replace_special_variables(tmp, path) + end + + def render + report_duration(:render) do + tmp = eval_template(template_file, 
@output_file_path) + @out << tmp + + if @header_file_path + tmp = eval_template(header_template_file, @header_file_path) + + if @header_out + @header_out << tmp + else + File.open(@header_file_path, "w+") do |f| + f << tmp + end + end + end + end + end + + # A part of b4_token_enums + def token_enums + str = "" + + @context.yytokentype.each do |s_value, token_id, display_name| + s = sprintf("%s = %d%s", s_value, token_id, token_id == yymaxutok ? "" : ",") + + if display_name + str << sprintf(" %-30s /* %s */\n", s, display_name) + else + str << sprintf(" %s\n", s) + end + end + + str + end + + # b4_symbol_enum + def symbol_enum + str = "" + + last_sym_number = @context.yysymbol_kind_t.last[1] + @context.yysymbol_kind_t.each do |s_value, sym_number, display_name| + s = sprintf("%s = %d%s", s_value, sym_number, (sym_number == last_sym_number) ? "" : ",") + + if display_name + str << sprintf(" %-40s /* %s */\n", s, display_name) + else + str << sprintf(" %s\n", s) + end + end + + str + end + + def yytranslate + int_array_to_string(@context.yytranslate) + end + + def yyrline + int_array_to_string(@context.yyrline) + end + + def yytname + string_array_to_string(@context.yytname) + " YY_NULLPTR" + end + + # b4_int_type_for + def int_type_for(ary) + min = ary.min + max = ary.max + + case + when (-127 <= min && min <= 127) && (-127 <= max && max <= 127) + "yytype_int8" + when (0 <= min && min <= 255) && (0 <= max && max <= 255) + "yytype_uint8" + when (-32767 <= min && min <= 32767) && (-32767 <= max && max <= 32767) + "yytype_int16" + when (0 <= min && min <= 65535) && (0 <= max && max <= 65535) + "yytype_uint16" + else + "int" + end + end + + def symbol_actions_for_printer + str = "" + + @grammar.symbols.each do |sym| + next unless sym.printer + + str << <<-STR + case #{sym.enum_name}: /* #{sym.comment} */ +#line #{sym.printer.lineno} "#{@grammar_file_path}" + #{sym.printer.translated_code(sym.tag)} +#line [@oline@] [@ofile@] + break; + + STR + end + + str + end + + # 
b4_user_initial_action + def user_initial_action(comment = "") + return "" unless @grammar.initial_action + + <<-STR + #{comment} +#line #{@grammar.initial_action.line} "#{@grammar_file_path}" + #{@grammar.initial_action.translated_code} + STR + end + + # b4_user_actions + def user_actions + str = "" + + @context.states.rules.each do |rule| + next unless rule.code + + rule = rule + code = rule.code + spaces = " " * (code.column - 1) + + str << <<-STR + case #{rule.id + 1}: /* #{rule.as_comment} */ +#line #{code.line} "#{@grammar_file_path}" +#{spaces}#{rule.translated_code} +#line [@oline@] [@ofile@] + break; + + STR + end + + str << <<-STR + +#line [@oline@] [@ofile@] + STR + + str + end + + def omit_braces_and_blanks(param) + param[1..-2].strip + end + + # b4_parse_param + def parse_param + if @grammar.parse_param + omit_braces_and_blanks(@grammar.parse_param) + else + "" + end + end + + def lex_param + if @grammar.lex_param + omit_braces_and_blanks(@grammar.lex_param) + else + "" + end + end + + # b4_user_formals + def user_formals + if @grammar.parse_param + ", #{parse_param}" + else + "" + end + end + + # b4_user_args + def user_args + if @grammar.parse_param + ", #{parse_param_name}" + else + "" + end + end + + def extract_param_name(param) + /\A(.)+([a-zA-Z0-9_]+)\z/.match(param)[2] + end + + def parse_param_name + if @grammar.parse_param + extract_param_name(parse_param) + else + "" + end + end + + def lex_param_name + if @grammar.lex_param + extract_param_name(lex_param) + else + "" + end + end + + # b4_parse_param_use + def parse_param_use(val, loc) + str = <<-STR + YY_USE (#{val}); + YY_USE (#{loc}); + STR + + if @grammar.parse_param + str << " YY_USE (#{parse_param_name});" + end + + str + end + + # b4_yylex_formals + def yylex_formals + ary = ["&yylval", "&yylloc"] + + if @grammar.lex_param + ary << lex_param_name + end + + "(#{ary.join(', ')})" + end + + # b4_table_value_equals + def table_value_equals(table, value, literal, symbol) + if literal < 
table.min || table.max < literal + "0" + else + "((#{value}) == #{symbol})" + end + end + + # b4_yyerror_args + def yyerror_args + ary = ["&yylloc"] + + if @grammar.parse_param + ary << parse_param_name + end + + "#{ary.join(', ')}" + end + + def template_basename + File.basename(template_file) + end + + def aux + @grammar.aux + end + + def int_array_to_string(ary) + last = ary.count - 1 + + s = ary.each_with_index.each_slice(10).map do |slice| + str = " " + + slice.each do |e, i| + str << sprintf("%6d%s", e, (i == last) ? "" : ",") + end + + str + end + + s.join("\n") + end + + def spec_mapped_header_file + @header_file_path + end + + def b4_cpp_guard__b4_spec_mapped_header_file + if @header_file_path + "YY_YY_" + @header_file_path.gsub(/[^a-zA-Z_0-9]+/, "_").upcase + "_INCLUDED" + else + "" + end + end + + private + + def template_file + File.join(template_dir, @template_name) + end + + def header_template_file + File.join(template_dir, "bison/yacc.h") + end + + def template_dir + File.expand_path("../../../template", __FILE__) + end + + def string_array_to_string(ary) + str = "" + tmp = " " + + ary.each do |s| + s = s.gsub('\\', '\\\\\\\\') + s = s.gsub('"', '\\"') + + if (tmp + s + " \"\",").length > 75 + str << tmp << "\n" + tmp = " \"#{s}\"," + else + tmp << " \"#{s}\"," + end + end + + str << tmp + end + + def replace_special_variables(str, ofile) + str.each_line.with_index(1).map do |line, i| + line.gsub!("[@oline@]", (i + 1).to_s) + line.gsub!("[@ofile@]", "\"#{ofile}\"") + line + end.join + end + end +end diff --git a/tool/lrama/lib/lrama/parser.rb b/tool/lrama/lib/lrama/parser.rb new file mode 100644 index 0000000000..e90a94c637 --- /dev/null +++ b/tool/lrama/lib/lrama/parser.rb @@ -0,0 +1,321 @@ +require "lrama/report" + +module Lrama + # Parser for parse.y, generates a grammar + class Parser + include Lrama::Report::Duration + + T = Lrama::Lexer::Token + + class TokenScanner + def initialize(tokens) + @tokens = tokens + @index = 0 + end + + def 
current_token + @tokens[@index] + end + + def current_type + current_token && current_token.type + end + + def next + token = current_token + @index += 1 + return token + end + + def consume(*token_types) + if token_types.include?(current_type) + token = current_token + self.next + return token + end + + return nil + end + + def consume!(*token_types) + consume(*token_types) || (raise "#{token_types} is expected but #{current_type}. #{current_token}") + end + + def consume_multi(*token_types) + a = [] + + while token_types.include?(current_type) + a << current_token + self.next + end + + raise "No token is consumed. #{token_types}" if a.empty? + + return a + end + + def eots? + current_token.nil? + end + end + + def initialize(text) + @text = text + end + + def parse + report_duration(:parse) do + lexer = Lexer.new(@text) + grammar = Grammar.new + process_prologue(grammar, lexer) + parse_bison_declarations(TokenScanner.new(lexer.bison_declarations_tokens), grammar) + parse_grammar_rules(TokenScanner.new(lexer.grammar_rules_tokens), grammar) + process_epilogue(grammar, lexer) + grammar.prepare + grammar.compute_nullable + grammar.validate! + + grammar + end + end + + private + + def process_prologue(grammar, lexer) + grammar.prologue_first_lineno = lexer.prologue.first[1] if lexer.prologue.first + grammar.prologue = lexer.prologue.map(&:first).join + end + + def process_epilogue(grammar, lexer) + grammar.epilogue_first_lineno = lexer.epilogue.first[1] if lexer.epilogue.first + grammar.epilogue = lexer.epilogue.map(&:first).join + end + + def parse_bison_declarations(ts, grammar) + precedence_number = 0 + + while !ts.eots? 
do + case ts.current_type + when T::P_expect + ts.next + grammar.expect = ts.consume!(T::Number).s_value + when T::P_define + ts.next + # Ignore + ts.consume_multi(T::Ident) + when T::P_printer + lineno = ts.current_token.line + ts.next + code = ts.consume!(T::User_code) + code = grammar.build_code(:printer, code) + ident_or_tags = ts.consume_multi(T::Ident, T::Tag) + grammar.add_printer(ident_or_tags: ident_or_tags, code: code, lineno: lineno) + when T::P_lex_param + ts.next + code = ts.consume!(T::User_code) + code = grammar.build_code(:lex_param, code) + grammar.lex_param = code.token_code.s_value + when T::P_parse_param + ts.next + code = ts.consume!(T::User_code) + code = grammar.build_code(:parse_param, code) + grammar.parse_param = code.token_code.s_value + when T::P_initial_action + ts.next + code = ts.consume!(T::User_code) + code = grammar.build_code(:initial_action, code) + ts.consume(T::Semicolon) + grammar.initial_action = code + when T::P_union + lineno = ts.current_token.line + ts.next + code = ts.consume!(T::User_code) + code = grammar.build_code(:union, code) + ts.consume(T::Semicolon) + grammar.set_union(code, lineno) + when T::P_token + # %token tag? (ident number? string?)+ + # + # * ident can be char, e.g. 
'\\', '\t', '\13' + # * number is a token_id for term + # + # These are valid token declaration (from CRuby parse.y) + # + # %token END_OF_INPUT 0 "end-of-input" + # %token <id> '\\' "backslash" + # %token tSP "escaped space" + # %token tUPLUS 132 "unary+" + # %token tCOLON3 ":: at EXPR_BEG" + # %token tSTRING_DBEG tSTRING_DVAR tLAMBEG tLABEL_END + # + # + # See: https://www.gnu.org/software/bison/manual/html_node/Symbol-Decls.html + ts.next + opt_tag = ts.consume(T::Tag) + + while (id = ts.consume(T::Ident, T::Char)) do + opt_number = ts.consume(T::Number) + opt_string = ts.consume(T::String) + # Can replace 0 (EOF) + grammar.add_term( + id: id, + alias_name: opt_string && opt_string.s_value, + token_id: opt_number && opt_number.s_value, + tag: opt_tag, + replace: true, + ) + end + when T::P_type + # %type tag? (ident|char|string)+ + # + # See: https://www.gnu.org/software/bison/manual/html_node/Symbol-Decls.html + ts.next + opt_tag = ts.consume(T::Tag) + + while (id = ts.consume(T::Ident, T::Char, T::String)) do + grammar.add_type( + id: id, + tag: opt_tag + ) + end + when T::P_nonassoc + # %nonassoc (ident|char|string)+ + ts.next + while (id = ts.consume(T::Ident, T::Char, T::String)) do + sym = grammar.add_term(id: id) + grammar.add_nonassoc(sym, precedence_number) + end + precedence_number += 1 + when T::P_left + # %left (ident|char|string)+ + ts.next + while (id = ts.consume(T::Ident, T::Char, T::String)) do + sym = grammar.add_term(id: id) + grammar.add_left(sym, precedence_number) + end + precedence_number += 1 + when T::P_right + # %right (ident|char|string)+ + ts.next + while (id = ts.consume(T::Ident, T::Char, T::String)) do + sym = grammar.add_term(id: id) + grammar.add_right(sym, precedence_number) + end + precedence_number += 1 + when nil + # end of input + raise "Reach to end of input within declarations" + else + raise "Unexpected token: #{ts.current_token}" + end + end + end + + def parse_grammar_rules(ts, grammar) + while !ts.eots? 
do + parse_grammar_rule(ts, grammar) + end + end + + # TODO: Take care of %prec of rule. + # If %prec exists, user code before %prec + # is NOT an action. For example "{ code 3 }" is NOT an action. + # + # keyword_class { code 2 } tSTRING '!' keyword_end { code 3 } %prec "=" + def parse_grammar_rule(ts, grammar) + # LHS + lhs = ts.consume!(T::Ident_Colon) # class: + lhs.type = T::Ident + + rhs = parse_grammar_rule_rhs(ts, grammar) + + grammar.add_rule(lhs: lhs, rhs: rhs, lineno: rhs.first ? rhs.first.line : lhs.line) + + while true do + case ts.current_type + when T::Bar + # | + bar_lineno = ts.current_token.line + ts.next + rhs = parse_grammar_rule_rhs(ts, grammar) + grammar.add_rule(lhs: lhs, rhs: rhs, lineno: rhs.first ? rhs.first.line : bar_lineno) + when T::Semicolon + # ; + ts.next + break + when T::Ident_Colon + # Next lhs can be here because ";" is optional. + # Do not consume next token. + break + when nil + # end of input can be here when ";" is omitted + break + else + raise "Unexpected token: #{ts.current_token}" + end + end + end + + def parse_grammar_rule_rhs(ts, grammar) + a = [] + prec_seen = false + code_after_prec = false + + while true do + # TODO: Srting can be here + case ts.current_type + when T::Ident + # keyword_class + + raise "Ident after %prec" if prec_seen + a << ts.current_token + ts.next + when T::Char + # '!' 
+ + raise "Char after %prec" if prec_seen + a << ts.current_token + ts.next + when T::P_prec + # %prec tPLUS + # + # See: https://www.gnu.org/software/bison/manual/html_node/Contextual-Precedence.html + + ts.next + prec_seen = true + precedence_id = ts.consume!(T::Ident, T::String, T::Char) + precedence_sym = grammar.find_symbol_by_id!(precedence_id) + a << precedence_sym + when T::User_code + # { code } in the middle of rhs + + if prec_seen + raise "Multiple User_code after %prec" if code_after_prec + code_after_prec = true + end + + code = ts.current_token + grammar.build_references(code) + a << code + ts.next + when T::Bar + # | + break + when T::Semicolon + # ; + break + when T::Ident_Colon + # Next lhs can be here because ";" is optional. + break + when nil + # end of input can be here when ";" is omitted + break + else + raise "Unexpected token: #{ts.current_token}" + end + end + + return a + end + end +end diff --git a/tool/lrama/lib/lrama/report.rb b/tool/lrama/lib/lrama/report.rb new file mode 100644 index 0000000000..7016a45171 --- /dev/null +++ b/tool/lrama/lib/lrama/report.rb @@ -0,0 +1,47 @@ +module Lrama + class Report + module Profile + # 1. Wrap target method with Profile.report_profile like below: + # + # Lrama::Report::Profile.report_profile { method } + # + # 2. Run lrama command, for example + # + # $ ./exe/lrama --trace=time spec/fixtures/integration/ruby_3_2_0/parse.tmp.y + # + # 3. Generate html file + # + # $ stackprof --d3-flamegraph tmp/stackprof-cpu-myapp.dump > tmp/flamegraph.html + # + def self.report_profile + require "stackprof" + + StackProf.run(mode: :cpu, raw: true, out: 'tmp/stackprof-cpu-myapp.dump') do + yield + end + end + end + + module Duration + def self.enable + @_report_duration_enabled = true + end + + def self.enabled? + !!@_report_duration_enabled + end + + def report_duration(method_name) + time1 = Time.now.to_f + result = yield + time2 = Time.now.to_f + + if Duration.enabled? 
+ puts sprintf("%s %10.5f s", method_name, time2 - time1) + end + + return result + end + end + end +end diff --git a/tool/lrama/lib/lrama/states.rb b/tool/lrama/lib/lrama/states.rb new file mode 100644 index 0000000000..f907db30cc --- /dev/null +++ b/tool/lrama/lib/lrama/states.rb @@ -0,0 +1,832 @@ +require "forwardable" +require "lrama/report" + +module Lrama + class State + class Reduce + # https://www.gnu.org/software/bison/manual/html_node/Default-Reductions.html + attr_reader :item, :look_ahead, :not_selected_symbols + attr_accessor :default_reduction + + def initialize(item) + @item = item + @look_ahead = nil + @not_selected_symbols = [] + end + + def rule + @item.rule + end + + def look_ahead=(look_ahead) + @look_ahead = look_ahead.freeze + end + + def add_not_selected_symbol(sym) + @not_selected_symbols << sym + end + + def selected_look_ahead + if @look_ahead + @look_ahead - @not_selected_symbols + else + [] + end + end + end + + class Shift + attr_reader :next_sym, :next_items + attr_accessor :not_selected + + def initialize(next_sym, next_items) + @next_sym = next_sym + @next_items = next_items + end + end + + # * symbol: A symbol under discussion + # * reduce: A reduce under discussion + # * which: For which a conflict is resolved. :shift, :reduce or :error (for nonassociative) + ResolvedConflict = Struct.new(:symbol, :reduce, :which, :same_prec, keyword_init: true) do + def report_message + s = symbol.display_name + r = reduce.rule.precedence_sym.display_name + case + when which == :shift && same_prec + msg = "resolved as #{which} (%right #{s})" + when which == :shift + msg = "resolved as #{which} (#{r} < #{s})" + when which == :reduce && same_prec + msg = "resolved as #{which} (%left #{s})" + when which == :reduce + msg = "resolved as #{which} (#{s} < #{r})" + when which == :error + msg = "resolved as an #{which} (%nonassoc #{s})" + else + raise "Unknown direction. #{self}" + end + + "Conflict between rule #{reduce.rule.id} and token #{s} #{msg}." 
+ end + end + + Conflict = Struct.new(:symbols, :reduce, :type, keyword_init: true) + + attr_reader :id, :accessing_symbol, :kernels, :conflicts, :resolved_conflicts, + :default_reduction_rule, :closure, :items + attr_accessor :shifts, :reduces + + def initialize(id, accessing_symbol, kernels) + @id = id + @accessing_symbol = accessing_symbol + @kernels = kernels.freeze + @items = @kernels + # Manage relationships between items to state + # to resolve next state + @items_to_state = {} + @conflicts = [] + @resolved_conflicts = [] + @default_reduction_rule = nil + end + + def closure=(closure) + @closure = closure + @items = @kernels + @closure + end + + def non_default_reduces + reduces.select do |reduce| + reduce.rule != @default_reduction_rule + end + end + + def compute_shifts_reduces + _shifts = {} + reduces = [] + items.each do |item| + # TODO: Consider what should be pushed + if item.end_of_rule? + reduces << Reduce.new(item) + else + key = item.next_sym + _shifts[key] ||= [] + _shifts[key] << item.new_by_next_position + end + end + + # It seems Bison 3.8.2 iterates transitions order by symbol number + shifts = _shifts.sort_by do |next_sym, new_items| + next_sym.number + end.map do |next_sym, new_items| + Shift.new(next_sym, new_items.flatten) + end + self.shifts = shifts.freeze + self.reduces = reduces.freeze + end + + def set_items_to_state(items, next_state) + @items_to_state[items] = next_state + end + + # + def set_look_ahead(rule, look_ahead) + reduce = reduces.find do |r| + r.rule == rule + end + + reduce.look_ahead = look_ahead + end + + # Returns array of [nterm, next_state] + def nterm_transitions + return @nterm_transitions if @nterm_transitions + + @nterm_transitions = [] + + shifts.each do |shift| + next if shift.next_sym.term? 
+ + @nterm_transitions << [shift, @items_to_state[shift.next_items]] + end + + @nterm_transitions + end + + # Returns array of [term, next_state] + def term_transitions + return @term_transitions if @term_transitions + + @term_transitions = [] + + shifts.each do |shift| + next if shift.next_sym.nterm? + + @term_transitions << [shift, @items_to_state[shift.next_items]] + end + + @term_transitions + end + + def selected_term_transitions + term_transitions.select do |shift, next_state| + !shift.not_selected + end + end + + # Move to next state by sym + def transition(sym) + result = nil + + if sym.term? + term_transitions.each do |shift, next_state| + term = shift.next_sym + result = next_state if term == sym + end + else + nterm_transitions.each do |shift, next_state| + nterm = shift.next_sym + result = next_state if nterm == sym + end + end + + raise "Can not transit by #{sym} #{self}" if result.nil? + + result + end + + def find_reduce_by_item!(item) + reduces.find do |r| + r.item == item + end || (raise "reduce is not found. #{item}, #{state}") + end + + def default_reduction_rule=(default_reduction_rule) + @default_reduction_rule = default_reduction_rule + + reduces.each do |r| + if r.rule == default_reduction_rule + r.default_reduction = true + end + end + end + + def sr_conflicts + @conflicts.select do |conflict| + conflict.type == :shift_reduce + end + end + + def rr_conflicts + @conflicts.select do |conflict| + conflict.type == :reduce_reduce + end + end + end + + # States is passed to a template file + # + # "Efficient Computation of LALR(1) Look-Ahead Sets" + # https://dl.acm.org/doi/pdf/10.1145/69622.357187 + class States + extend Forwardable + include Lrama::Report::Duration + + def_delegators "@grammar", :symbols, :terms, :nterms, :rules, + :accept_symbol, :eof_symbol, :find_symbol_by_s_value! 
+ + # TODO: Validate position is not over rule rhs + Item = Struct.new(:rule, :position, keyword_init: true) do + # Optimization for States#setup_state + def hash + [rule.id, position].hash + end + + def rule_id + rule.id + end + + def next_sym + rule.rhs[position] + end + + def end_of_rule? + rule.rhs.count == position + end + + def new_by_next_position + Item.new(rule: rule, position: position + 1) + end + + def previous_sym + rule.rhs[position - 1] + end + + def display_name + r = rule.rhs.map(&:display_name).insert(position, "•").join(" ") + "#{r} (rule #{rule.id})" + end + + # Right after position + def display_rest + r = rule.rhs[position..-1].map(&:display_name).join(" ") + ". #{r} (rule #{rule.id})" + end + end + + attr_reader :states, :reads_relation, :includes_relation, :lookback_relation + + def initialize(grammar, warning, trace_state: false) + @grammar = grammar + @warning = warning + @trace_state = trace_state + + @states = [] + + # `DR(p, A) = {t ∈ T | p -(A)-> r -(t)-> }` + # where p is state, A is nterm, t is term. + # + # `@direct_read_sets` is a hash whose + # key is [state.id, nterm.token_id], + # value is bitmap of term. + @direct_read_sets = {} + + # Reads relation on nonterminal transitions (pair of state and nterm) + # `(p, A) reads (r, C) iff p -(A)-> r -(C)-> and C =>* ε` + # where p, r are state, A, C are nterm. + # + # `@reads_relation` is a hash whose + # key is [state.id, nterm.token_id], + # value is array of [state.id, nterm.token_id]. + @reads_relation = {} + + # `@read_sets` is a hash whose + # key is [state.id, nterm.token_id], + # value is bitmap of term. + @read_sets = {} + + # `(p, A) includes (p', B) iff B -> βAγ, γ =>* ε, p' -(β)-> p` + # where p, p' are state, A, B are nterm, β, γ is sequence of symbol. + # + # `@includes_relation` is a hash whose + # key is [state.id, nterm.token_id], + # value is array of [state.id, nterm.token_id]. 
+ @includes_relation = {} + + # `(q, A -> ω) lookback (p, A) iff p -(ω)-> q` + # where p, q are state, A -> ω is rule, A is nterm, ω is sequence of symbol. + # + # `@lookback_relation` is a hash whose + # key is [state.id, rule.id], + # value is array of [state.id, nterm.token_id]. + @lookback_relation = {} + + # `@follow_sets` is a hash whose + # key is [state.id, rule.id], + # value is bitmap of term. + @follow_sets = {} + + # `LA(q, A -> ω) = ∪{Follow(p, A) | (q, A -> ω) lookback (p, A)` + # + # `@la` is a hash whose + # key is [state.id, rule.id], + # value is bitmap of term. + @la = {} + end + + def compute + # Look Ahead Sets + report_duration(:compute_lr0_states) { compute_lr0_states } + report_duration(:compute_direct_read_sets) { compute_direct_read_sets } + report_duration(:compute_reads_relation) { compute_reads_relation } + report_duration(:compute_read_sets) { compute_read_sets } + report_duration(:compute_includes_relation) { compute_includes_relation } + report_duration(:compute_lookback_relation) { compute_lookback_relation } + report_duration(:compute_follow_sets) { compute_follow_sets } + report_duration(:compute_look_ahead_sets) { compute_look_ahead_sets } + + # Conflicts + report_duration(:compute_conflicts) { compute_conflicts } + + report_duration(:compute_default_reduction) { compute_default_reduction } + + check_conflicts + end + + def reporter + StatesReporter.new(self) + end + + def states_count + @states.count + end + + def direct_read_sets + h = {} + + @direct_read_sets.each do |k, v| + h[k] = bitmap_to_terms(v) + end + + return h + end + + def read_sets + h = {} + + @read_sets.each do |k, v| + h[k] = bitmap_to_terms(v) + end + + return h + end + + def follow_sets + h = {} + + @follow_sets.each do |k, v| + h[k] = bitmap_to_terms(v) + end + + return h + end + + def la + h = {} + + @la.each do |k, v| + h[k] = bitmap_to_terms(v) + end + + return h + end + + private + + def sr_conflicts + @states.flat_map(&:sr_conflicts) + end + + def 
rr_conflicts + @states.flat_map(&:rr_conflicts) + end + + def initial_attrs + h = {} + + attrs.each do |attr| + h[attr.id] = false + end + + h + end + + def trace_state + if @trace_state + yield STDERR + end + end + + def create_state(accessing_symbol, kernels, states_creted) + # A item can appear in some states, + # so need to use `kernels` (not `kernels.first`) as a key. + # + # For example... + # + # %% + # program: '+' strings_1 + # | '-' strings_2 + # ; + # + # strings_1: string_1 + # ; + # + # strings_2: string_1 + # | string_2 + # ; + # + # string_1: string + # ; + # + # string_2: string '+' + # ; + # + # string: tSTRING + # ; + # %% + # + # For these grammar, there are 2 states + # + # State A + # string_1: string • + # + # State B + # string_1: string • + # string_2: string • '+' + # + return [states_creted[kernels], false] if states_creted[kernels] + + state = State.new(@states.count, accessing_symbol, kernels) + @states << state + states_creted[kernels] = state + + return [state, true] + end + + def setup_state(state) + # closure + closure = [] + visited = {} + queued = {} + items = state.kernels.dup + + items.each do |item| + queued[item] = true + end + + while (item = items.shift) do + visited[item] = true + + if (sym = item.next_sym) && sym.nterm? 
+ @grammar.find_rules_by_symbol!(sym).each do |rule| + i = Item.new(rule: rule, position: 0) + next if queued[i] + closure << i + items << i + queued[i] = true + end + end + end + + state.closure = closure.sort_by {|i| i.rule.id } + + # Trace + trace_state do |out| + out << "Closure: input\n" + state.kernels.each do |item| + out << " #{item.display_rest}\n" + end + out << "\n\n" + out << "Closure: output\n" + state.items.each do |item| + out << " #{item.display_rest}\n" + end + out << "\n\n" + end + + # shift & reduce + state.compute_shifts_reduces + end + + def enqueue_state(states, state) + # Trace + previous = state.kernels.first.previous_sym + trace_state do |out| + out << sprintf("state_list_append (state = %d, symbol = %d (%s))", + @states.count, previous.number, previous.display_name) + end + + states << state + end + + def compute_lr0_states + # State queue + states = [] + states_creted = {} + + state, _ = create_state(symbols.first, [Item.new(rule: @grammar.rules.first, position: 0)], states_creted) + enqueue_state(states, state) + + while (state = states.shift) do + # Trace + # + # Bison 3.8.2 renders "(reached by "end-of-input")" for State 0 but + # I think it is not correct... 
+ previous = state.kernels.first.previous_sym + trace_state do |out| + out << "Processing state #{state.id} (reached by #{previous.display_name})\n" + end + + setup_state(state) + + state.shifts.each do |shift| + new_state, created = create_state(shift.next_sym, shift.next_items, states_creted) + state.set_items_to_state(shift.next_items, new_state) + enqueue_state(states, new_state) if created + end + end + end + + def nterm_transitions + a = [] + + @states.each do |state| + state.nterm_transitions.each do |shift, next_state| + nterm = shift.next_sym + a << [state, nterm, next_state] + end + end + + a + end + + def compute_direct_read_sets + @states.each do |state| + state.nterm_transitions.each do |shift, next_state| + nterm = shift.next_sym + + ary = next_state.term_transitions.map do |shift, _| + shift.next_sym.number + end + + key = [state.id, nterm.token_id] + @direct_read_sets[key] = Bitmap.from_array(ary) + end + end + end + + def compute_reads_relation + @states.each do |state| + state.nterm_transitions.each do |shift, next_state| + nterm = shift.next_sym + next_state.nterm_transitions.each do |shift2, _next_state2| + nterm2 = shift2.next_sym + if nterm2.nullable + key = [state.id, nterm.token_id] + @reads_relation[key] ||= [] + @reads_relation[key] << [next_state.id, nterm2.token_id] + end + end + end + end + end + + def compute_read_sets + sets = nterm_transitions.map do |state, nterm, next_state| + [state.id, nterm.token_id] + end + + @read_sets = Digraph.new(sets, @reads_relation, @direct_read_sets).compute + end + + # Execute transition of state by symbols + # then return final state. 
+ def transition(state, symbols) + symbols.each do |sym| + state = state.transition(sym) + end + + state + end + + def compute_includes_relation + @states.each do |state| + state.nterm_transitions.each do |shift, next_state| + nterm = shift.next_sym + @grammar.find_rules_by_symbol!(nterm).each do |rule| + i = rule.rhs.count - 1 + + while (i > -1) do + sym = rule.rhs[i] + + break if sym.term? + state2 = transition(state, rule.rhs[0...i]) + # p' = state, B = nterm, p = state2, A = sym + key = [state2.id, sym.token_id] + # TODO: need to omit if state == state2 ? + @includes_relation[key] ||= [] + @includes_relation[key] << [state.id, nterm.token_id] + break if !sym.nullable + i -= 1 + end + end + end + end + end + + def compute_lookback_relation + @states.each do |state| + state.nterm_transitions.each do |shift, next_state| + nterm = shift.next_sym + @grammar.find_rules_by_symbol!(nterm).each do |rule| + state2 = transition(state, rule.rhs) + # p = state, A = nterm, q = state2, A -> ω = rule + key = [state2.id, rule.id] + @lookback_relation[key] ||= [] + @lookback_relation[key] << [state.id, nterm.token_id] + end + end + end + end + + def compute_follow_sets + sets = nterm_transitions.map do |state, nterm, next_state| + [state.id, nterm.token_id] + end + + @follow_sets = Digraph.new(sets, @includes_relation, @read_sets).compute + end + + def compute_look_ahead_sets + @states.each do |state| + rules.each do |rule| + ary = @lookback_relation[[state.id, rule.id]] + next if !ary + + ary.each do |state2_id, nterm_token_id| + # q = state, A -> ω = rule, p = state2, A = nterm + follows = @follow_sets[[state2_id, nterm_token_id]] + + next if follows == 0 + + key = [state.id, rule.id] + @la[key] ||= 0 + look_ahead = @la[key] | follows + @la[key] |= look_ahead + + # No risk of conflict when + # * the state only has single reduce + # * the state only has nterm_transitions (GOTO) + next if state.reduces.count == 1 && state.term_transitions.count == 0 + + 
state.set_look_ahead(rule, bitmap_to_terms(look_ahead)) + end + end + end + end + + def bitmap_to_terms(bit) + ary = Bitmap.to_array(bit) + ary.map do |i| + @grammar.find_symbol_by_number!(i) + end + end + + def compute_conflicts + compute_shift_reduce_conflicts + compute_reduce_reduce_conflicts + end + + def compute_shift_reduce_conflicts + states.each do |state| + state.shifts.each do |shift| + state.reduces.each do |reduce| + sym = shift.next_sym + + next unless reduce.look_ahead + next if !reduce.look_ahead.include?(sym) + + # Shift/Reduce conflict + shift_prec = sym.precedence + reduce_prec = reduce.item.rule.precedence + + # Can resolve only when both have prec + unless shift_prec && reduce_prec + state.conflicts << State::Conflict.new(symbols: [sym], reduce: reduce, type: :shift_reduce) + next + end + + case + when shift_prec < reduce_prec + # Reduce is selected + state.resolved_conflicts << State::ResolvedConflict.new(symbol: sym, reduce: reduce, which: :reduce) + shift.not_selected = true + next + when shift_prec > reduce_prec + # Shift is selected + state.resolved_conflicts << State::ResolvedConflict.new(symbol: sym, reduce: reduce, which: :shift) + reduce.add_not_selected_symbol(sym) + next + end + + # shift_prec == reduce_prec, then check associativity + case sym.precedence.type + when :right + # Shift is selected + state.resolved_conflicts << State::ResolvedConflict.new(symbol: sym, reduce: reduce, which: :shift, same_prec: true) + reduce.add_not_selected_symbol(sym) + next + when :left + # Reduce is selected + state.resolved_conflicts << State::ResolvedConflict.new(symbol: sym, reduce: reduce, which: :reduce, same_prec: true) + shift.not_selected = true + next + when :nonassoc + # Can not resolve + # + # nonassoc creates "run-time" error, precedence creates "compile-time" error. + # Then omit both the shift and reduce. 
+ # + # https://www.gnu.org/software/bison/manual/html_node/Using-Precedence.html + state.resolved_conflicts << State::ResolvedConflict.new(symbol: sym, reduce: reduce, which: :error) + shift.not_selected = true + reduce.add_not_selected_symbol(sym) + else + raise "Unknown precedence type. #{sym}" + end + end + end + end + end + + def compute_reduce_reduce_conflicts + states.each do |state| + a = [] + + state.reduces.each do |reduce| + next if reduce.look_ahead.nil? + + intersection = a & reduce.look_ahead + a += reduce.look_ahead + + if !intersection.empty? + state.conflicts << State::Conflict.new(symbols: intersection.dup, reduce: reduce, type: :reduce_reduce) + end + end + end + end + + def compute_default_reduction + states.each do |state| + next if state.reduces.empty? + # Do not set, if conflict exist + next if !state.conflicts.empty? + # Do not set, if shift with `error` exists. + next if state.shifts.map(&:next_sym).include?(@grammar.error_symbol) + + state.default_reduction_rule = state.reduces.map do |r| + [r.rule, r.rule.id, (r.look_ahead || []).count] + end.sort_by do |rule, rule_id, count| + [-count, rule_id] + end.first.first + end + end + + def check_conflicts + sr_count = sr_conflicts.count + rr_count = rr_conflicts.count + + if @grammar.expect + + expected_sr_conflicts = @grammar.expect + expected_rr_conflicts = 0 + + if expected_sr_conflicts != sr_count + @warning.error("shift/reduce conflicts: #{sr_count} found, #{expected_sr_conflicts} expected") + end + + if expected_rr_conflicts != rr_count + @warning.error("reduce/reduce conflicts: #{rr_count} found, #{expected_rr_conflicts} expected") + end + else + if sr_count != 0 + @warning.warn("shift/reduce conflicts: #{sr_count} found") + end + + if rr_count != 0 + @warning.warn("reduce/reduce conflicts: #{rr_count} found") + end + end + end + end +end diff --git a/tool/lrama/lib/lrama/states_reporter.rb b/tool/lrama/lib/lrama/states_reporter.rb new file mode 100644 index 0000000000..25893e61be --- 
/dev/null +++ b/tool/lrama/lib/lrama/states_reporter.rb @@ -0,0 +1,310 @@ +module Lrama + class StatesReporter + include Lrama::Report::Duration + + def initialize(states) + @states = states + end + + def report(io, **options) + report_duration(:report) do + _report(io, **options) + end + end + + private + + def _report(io, grammar: false, states: false, itemsets: false, lookaheads: false, solved: false, verbose: false) + # TODO: Unused terms + # TODO: Unused rules + + report_conflicts(io) + report_grammar(io) if grammar + report_states(io, itemsets, lookaheads, solved, verbose) + end + + def report_conflicts(io) + has_conflict = false + + @states.states.each do |state| + messages = [] + cs = state.conflicts.group_by(&:type) + if cs[:shift_reduce] + messages << "#{cs[:shift_reduce].count} shift/reduce" + end + + if cs[:reduce_reduce] + messages << "#{cs[:reduce_reduce].count} reduce/reduce" + end + + if !messages.empty? + has_conflict = true + io << "State #{state.id} conflicts: #{messages.join(', ')}\n" + end + end + + if has_conflict + io << "\n\n" + end + end + + def report_grammar(io) + io << "Grammar\n" + last_lhs = nil + + @states.rules.each do |rule| + if rule.rhs.empty? + r = "ε" + else + r = rule.rhs.map(&:display_name).join(" ") + end + + if rule.lhs == last_lhs + io << sprintf("%5d %s| %s\n", rule.id, " " * rule.lhs.display_name.length, r) + else + io << "\n" + io << sprintf("%5d %s: %s\n", rule.id, rule.lhs.display_name, r) + end + + last_lhs = rule.lhs + end + io << "\n\n" + end + + def report_states(io, itemsets, lookaheads, solved, verbose) + @states.states.each do |state| + # Report State + io << "State #{state.id}\n\n" + + # Report item + last_lhs = nil + list = itemsets ? state.items : state.kernels + list.sort_by {|i| [i.rule_id, i.position] }.each do |item| + rule = item.rule + position = item.position + if rule.rhs.empty? 
+ r = "ε •" + else + r = rule.rhs.map(&:display_name).insert(position, "•").join(" ") + end + if rule.lhs == last_lhs + l = " " * rule.lhs.id.s_value.length + "|" + else + l = rule.lhs.id.s_value + ":" + end + la = "" + if lookaheads && item.end_of_rule? + reduce = state.find_reduce_by_item!(item) + look_ahead = reduce.selected_look_ahead + if !look_ahead.empty? + la = " [#{look_ahead.map(&:display_name).join(", ")}]" + end + end + last_lhs = rule.lhs + + io << sprintf("%5i %s %s%s\n", rule.id, l, r, la) + end + io << "\n" + + + # Report shifts + tmp = state.term_transitions.select do |shift, _| + !shift.not_selected + end.map do |shift, next_state| + [shift.next_sym, next_state.id] + end + max_len = tmp.map(&:first).map(&:display_name).map(&:length).max + tmp.each do |term, state_id| + io << " #{term.display_name.ljust(max_len)} shift, and go to state #{state_id}\n" + end + io << "\n" if !tmp.empty? + + + # Report error caused by %nonassoc + nl = false + tmp = state.resolved_conflicts.select do |resolved| + resolved.which == :error + end.map do |error| + error.symbol.display_name + end + max_len = tmp.map(&:length).max + tmp.each do |name| + nl = true + io << " #{name.ljust(max_len)} error (nonassociative)\n" + end + io << "\n" if !tmp.empty? + + + # Report reduces + nl = false + max_len = state.non_default_reduces.flat_map(&:look_ahead).compact.map(&:display_name).map(&:length).max || 0 + max_len = [max_len, "$default".length].max if state.default_reduction_rule + ary = [] + + state.non_default_reduces.each do |reduce| + reduce.look_ahead.each do |term| + ary << [term, reduce] + end + end + + ary.sort_by do |term, reduce| + term.number + end.each do |term, reduce| + rule = reduce.item.rule + io << " #{term.display_name.ljust(max_len)} reduce using rule #{rule.id} (#{rule.lhs.display_name})\n" + nl = true + end + + if r = state.default_reduction_rule + nl = true + s = "$default".ljust(max_len) + + if r.initial_rule? 
+ io << " #{s} accept\n" + else + io << " #{s} reduce using rule #{r.id} (#{r.lhs.display_name})\n" + end + end + io << "\n" if nl + + + # Report nonterminal transitions + tmp = [] + max_len = 0 + state.nterm_transitions.each do |shift, next_state| + nterm = shift.next_sym + tmp << [nterm, next_state.id] + max_len = [max_len, nterm.id.s_value.length].max + end + tmp.uniq! + tmp.sort_by! do |nterm, state_id| + nterm.number + end + tmp.each do |nterm, state_id| + io << " #{nterm.id.s_value.ljust(max_len)} go to state #{state_id}\n" + end + io << "\n" if !tmp.empty? + + + if solved + # Report conflict resolutions + state.resolved_conflicts.each do |resolved| + io << " #{resolved.report_message}\n" + end + io << "\n" if !state.resolved_conflicts.empty? + end + + + if verbose + # Report direct_read_sets + io << " [Direct Read sets]\n" + direct_read_sets = @states.direct_read_sets + @states.nterms.each do |nterm| + terms = direct_read_sets[[state.id, nterm.token_id]] + next if !terms + next if terms.empty? + + str = terms.map {|sym| sym.id.s_value }.join(", ") + io << " read #{nterm.id.s_value} shift #{str}\n" + end + io << "\n" + + + # Reprot reads_relation + io << " [Reads Relation]\n" + @states.nterms.each do |nterm| + a = @states.reads_relation[[state.id, nterm.token_id]] + next if !a + + a.each do |state_id2, nterm_id2| + n = @states.nterms.find {|n| n.token_id == nterm_id2 } + io << " (State #{state_id2}, #{n.id.s_value})\n" + end + end + io << "\n" + + + # Reprot read_sets + io << " [Read sets]\n" + read_sets = @states.read_sets + @states.nterms.each do |nterm| + terms = read_sets[[state.id, nterm.token_id]] + next if !terms + next if terms.empty? 
+ + terms.each do |sym| + io << " #{sym.id.s_value}\n" + end + end + io << "\n" + + + # Reprot includes_relation + io << " [Includes Relation]\n" + @states.nterms.each do |nterm| + a = @states.includes_relation[[state.id, nterm.token_id]] + next if !a + + a.each do |state_id2, nterm_id2| + n = @states.nterms.find {|n| n.token_id == nterm_id2 } + io << " (State #{state.id}, #{nterm.id.s_value}) -> (State #{state_id2}, #{n.id.s_value})\n" + end + end + io << "\n" + + + # Report lookback_relation + io << " [Lookback Relation]\n" + @states.rules.each do |rule| + a = @states.lookback_relation[[state.id, rule.id]] + next if !a + + a.each do |state_id2, nterm_id2| + n = @states.nterms.find {|n| n.token_id == nterm_id2 } + io << " (Rule: #{rule.to_s}) -> (State #{state_id2}, #{n.id.s_value})\n" + end + end + io << "\n" + + + # Reprot follow_sets + io << " [Follow sets]\n" + follow_sets = @states.follow_sets + @states.nterms.each do |nterm| + terms = follow_sets[[state.id, nterm.token_id]] + + next if !terms + + terms.each do |sym| + io << " #{nterm.id.s_value} -> #{sym.id.s_value}\n" + end + end + io << "\n" + + + # Report LA + io << " [Look-Ahead Sets]\n" + tmp = [] + max_len = 0 + @states.rules.each do |rule| + syms = @states.la[[state.id, rule.id]] + next if !syms + + tmp << [rule, syms] + max_len = ([max_len] + syms.map {|s| s.id.s_value.length }).max + end + tmp.each do |rule, syms| + syms.each do |sym| + io << " #{sym.id.s_value.ljust(max_len)} reduce using rule #{rule.id} (#{rule.lhs.id.s_value})\n" + end + end + io << "\n" if !tmp.empty? 
+ end + + + # End of Report State + io << "\n" + end + end + end +end diff --git a/tool/lrama/lib/lrama/version.rb b/tool/lrama/lib/lrama/version.rb new file mode 100644 index 0000000000..2da384bf73 --- /dev/null +++ b/tool/lrama/lib/lrama/version.rb @@ -0,0 +1,3 @@ +module Lrama + VERSION = "0.4.0".freeze +end diff --git a/tool/lrama/lib/lrama/warning.rb b/tool/lrama/lib/lrama/warning.rb new file mode 100644 index 0000000000..3c99791ebf --- /dev/null +++ b/tool/lrama/lib/lrama/warning.rb @@ -0,0 +1,25 @@ +module Lrama + class Warning + attr_reader :errors, :warns + + def initialize(out = STDERR) + @out = out + @errors = [] + @warns = [] + end + + def error(message) + @out << message << "\n" + @errors << message + end + + def warn(message) + @out << message << "\n" + @warns << message + end + + def has_error? + !@errors.empty? + end + end +end |