From a1b01e7701f9fc370f8dff777aad6d39a2c5a3e3 Mon Sep 17 00:00:00 2001 From: Yuichiro Kaneko Date: Fri, 12 May 2023 18:25:10 +0900 Subject: Use Lrama LALR parser generator instead of Bison https://bugs.ruby-lang.org/issues/19637 Co-authored-by: Nobuyoshi Nakada --- tool/lrama/lib/lrama/grammar.rb | 850 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 850 insertions(+) create mode 100644 tool/lrama/lib/lrama/grammar.rb (limited to 'tool/lrama/lib/lrama/grammar.rb') diff --git a/tool/lrama/lib/lrama/grammar.rb b/tool/lrama/lib/lrama/grammar.rb new file mode 100644 index 0000000000..8ca75cb60b --- /dev/null +++ b/tool/lrama/lib/lrama/grammar.rb @@ -0,0 +1,850 @@ +require "forwardable" +require "lrama/lexer" + +module Lrama + Rule = Struct.new(:id, :lhs, :rhs, :code, :nullable, :precedence_sym, :lineno, keyword_init: true) do + # TODO: Change this to display_name + def to_s + l = lhs.id.s_value + r = rhs.empty? ? "ε" : rhs.map {|r| r.id.s_value }.join(", ") + + "#{l} -> #{r}" + end + + # Used by #user_actions + def as_comment + l = lhs.id.s_value + r = rhs.empty? ? "%empty" : rhs.map {|r| r.display_name }.join(" ") + + "#{l}: #{r}" + end + + def precedence + precedence_sym && precedence_sym.precedence + end + + def initial_rule? + id == 0 + end + + def translated_code + if code + code.translated_code + else + nil + end + end + end + + # Symbol is both of nterm and term + # `number` is both for nterm and term + # `token_id` is tokentype for term, internal sequence number for nterm + # + # TODO: Add validation for ASCII code range for Token::Char + Symbol = Struct.new(:id, :alias_name, :number, :tag, :term, :token_id, :nullable, :precedence, :printer, keyword_init: true) do + attr_writer :eof_symbol, :error_symbol, :undef_symbol, :accept_symbol + + def term? + term + end + + def nterm? + !term + end + + def eof_symbol? + !!@eof_symbol + end + + def error_symbol? + !!@error_symbol + end + + def undef_symbol? + !!@undef_symbol + end + + def accept_symbol? + !!@accept_symbol + end + + def display_name + if alias_name + alias_name + else + id.s_value + end + end + + # name for yysymbol_kind_t + # + # See: b4_symbol_kind_base + def enum_name + case + when accept_symbol? + name = "YYACCEPT" + when eof_symbol? + name = "YYEOF" + when term? && id.type == Token::Char + if alias_name + name = number.to_s + alias_name + else + name = number.to_s + id.s_value + end + when term? && id.type == Token::Ident + name = id.s_value + when nterm? && (id.s_value.include?("$") || id.s_value.include?("@")) + name = number.to_s + id.s_value + when nterm? + name = id.s_value + else + raise "Unexpected #{self}" + end + + "YYSYMBOL_" + name.gsub(/[^a-zA-Z_0-9]+/, "_") + end + + # comment for yysymbol_kind_t + def comment + case + when accept_symbol? + # YYSYMBOL_YYACCEPT + id.s_value + when eof_symbol? + # YYEOF + alias_name + when (term? && 0 < token_id && token_id < 128) + # YYSYMBOL_3_backslash_, YYSYMBOL_14_ + alias_name || id.s_value + when id.s_value.include?("$") || id.s_value.include?("@") + # YYSYMBOL_21_1 + id.s_value + else + # YYSYMBOL_keyword_class, YYSYMBOL_strings_1 + alias_name || id.s_value + end + end + end + + Type = Struct.new(:id, :tag, keyword_init: true) + + Code = Struct.new(:type, :token_code, keyword_init: true) do + extend Forwardable + + def_delegators "token_code", :s_value, :line, :column, :references + + # $$, $n, @$, @n is translated to C code + def translated_code + case type + when :user_code + translated_user_code + when :initial_action + translated_initial_action_code + end + end + + # * ($1) error + # * ($$) *yyvaluep + # * (@1) error + # * (@$) *yylocationp + def translated_printer_code(tag) + t_code = s_value.dup + + references.reverse.each do |ref| + first_column = ref.first_column + last_column = ref.last_column + + case + when ref.number == "$" && ref.type == :dollar # $$ + # Omit "<>" + member = tag.s_value[1..-2] + str = "((*yyvaluep).#{member})" + when ref.number == "$" && ref.type == :at # @$ + str = "(*yylocationp)" + when ref.type == :dollar # $n + raise "$#{ref.number} can not be used in %printer." + when ref.type == :at # @n + raise "@#{ref.number} can not be used in %printer." + else + raise "Unexpected. #{code}, #{ref}" + end + + t_code[first_column..last_column] = str + end + + return t_code + end + + + private + + # * ($1) yyvsp[i] + # * ($$) yyval + # * (@1) yylsp[i] + # * (@$) yyloc + def translated_user_code + t_code = s_value.dup + + references.reverse.each do |ref| + first_column = ref.first_column + last_column = ref.last_column + + case + when ref.number == "$" && ref.type == :dollar # $$ + # Omit "<>" + member = ref.tag.s_value[1..-2] + str = "(yyval.#{member})" + when ref.number == "$" && ref.type == :at # @$ + str = "(yyloc)" + when ref.type == :dollar # $n + i = -ref.position_in_rhs + ref.number + # Omit "<>" + member = ref.tag.s_value[1..-2] + str = "(yyvsp[#{i}].#{member})" + when ref.type == :at # @n + i = -ref.position_in_rhs + ref.number + str = "(yylsp[#{i}])" + else + raise "Unexpected. #{code}, #{ref}" + end + + t_code[first_column..last_column] = str + end + + return t_code + end + + # * ($1) error + # * ($$) yylval + # * (@1) error + # * (@$) yylloc + def translated_initial_action_code + t_code = s_value.dup + + references.reverse.each do |ref| + first_column = ref.first_column + last_column = ref.last_column + + case + when ref.number == "$" && ref.type == :dollar # $$ + str = "yylval" + when ref.number == "$" && ref.type == :at # @$ + str = "yylloc" + when ref.type == :dollar # $n + raise "$#{ref.number} can not be used in initial_action." + when ref.type == :at # @n + raise "@#{ref.number} can not be used in initial_action." + else + raise "Unexpected. #{code}, #{ref}" + end + + t_code[first_column..last_column] = str + end + + return t_code + end + end + + # type: :dollar or :at + # ex_tag: "$1" (Optional) + Reference = Struct.new(:type, :number, :ex_tag, :first_column, :last_column, :referring_symbol, :position_in_rhs, keyword_init: true) do + def tag + if ex_tag + ex_tag + else + referring_symbol.tag + end + end + end + + Precedence = Struct.new(:type, :precedence, keyword_init: true) do + include Comparable + + def <=>(other) + self.precedence <=> other.precedence + end + end + + Printer = Struct.new(:ident_or_tags, :code, :lineno, keyword_init: true) do + def translated_code(member) + code.translated_printer_code(member) + end + end + + Union = Struct.new(:code, :lineno, keyword_init: true) do + def braces_less_code + # Remove braces + code.s_value[1..-2] + end + end + + Token = Lrama::Lexer::Token + + # Grammar is the result of parsing an input grammar file + class Grammar + # Grammar file information not used by States but by Output + Aux = Struct.new(:prologue_first_lineno, :prologue, :epilogue_first_lineno, :epilogue, keyword_init: true) + + attr_reader :eof_symbol, :error_symbol, :undef_symbol, :accept_symbol, :aux + attr_accessor :union, :expect, + :printers, + :lex_param, :parse_param, :initial_action, + :symbols, :types, + :rules, :_rules, + :sym_to_rules + + def initialize + @printers = [] + @symbols = [] + @types = [] + @_rules = [] + @rules = [] + @sym_to_rules = {} + @empty_symbol = nil + @eof_symbol = nil + @error_symbol = nil + @undef_symbol = nil + @accept_symbol = nil + @aux = Aux.new + + append_special_symbols + end + + def add_printer(ident_or_tags:, code:, lineno:) + @printers << Printer.new(ident_or_tags: ident_or_tags, code: code, lineno: lineno) + end + + def add_term(id:, alias_name: nil, tag: nil, token_id: nil, replace: false) + if token_id && (sym = @symbols.find {|s| s.token_id == token_id }) + if replace + sym.id = id + sym.alias_name = alias_name + sym.tag = tag + end + + return sym + end + + if sym = @symbols.find {|s| s.id == id } + return sym + end + + sym = Symbol.new( + id: id, alias_name: alias_name, number: nil, tag: tag, + term: true, token_id: token_id, nullable: false + ) + @symbols << sym + @terms = nil + + return sym + end + + def add_nterm(id:, alias_name: nil, tag: nil) + return if @symbols.find {|s| s.id == id } + + sym = Symbol.new( + id: id, alias_name: alias_name, number: nil, tag: tag, + term: false, token_id: nil, nullable: nil, + ) + @symbols << sym + @nterms = nil + + return sym + end + + def add_type(id:, tag:) + @types << Type.new(id: id, tag: tag) + end + + def add_nonassoc(sym, precedence) + set_precedence(sym, Precedence.new(type: :nonassoc, precedence: precedence)) + end + + def add_left(sym, precedence) + set_precedence(sym, Precedence.new(type: :left, precedence: precedence)) + end + + def add_right(sym, precedence) + set_precedence(sym, Precedence.new(type: :right, precedence: precedence)) + end + + def set_precedence(sym, precedence) + raise "" if sym.nterm? + sym.precedence = precedence + end + + def set_union(code, lineno) + @union = Union.new(code: code, lineno: lineno) + end + + def add_rule(lhs:, rhs:, lineno:) + @_rules << [lhs, rhs, lineno] + end + + def build_references(token_code) + token_code.references.map! do |type, number, tag, first_column, last_column| + Reference.new(type: type, number: number, ex_tag: tag, first_column: first_column, last_column: last_column) + end + + token_code + end + + def build_code(type, token_code) + build_references(token_code) + Code.new(type: type, token_code: token_code) + end + + def prologue_first_lineno=(prologue_first_lineno) + @aux.prologue_first_lineno = prologue_first_lineno + end + + def prologue=(prologue) + @aux.prologue = prologue + end + + def epilogue_first_lineno=(epilogue_first_lineno) + @aux.epilogue_first_lineno = epilogue_first_lineno + end + + def epilogue=(epilogue) + @aux.epilogue = epilogue + end + + def prepare + normalize_rules + collect_symbols + replace_token_with_symbol + fill_symbol_number + fill_default_precedence + fill_sym_to_rules + fill_nterm_type + fill_symbol_printer + @symbols.sort_by!(&:number) + end + + # TODO: More validation methods + def validate! + validate_symbol_number_uniqueness! + end + + def compute_nullable + @rules.each do |rule| + case + when rule.rhs.empty? + rule.nullable = true + when rule.rhs.any?(&:term) + rule.nullable = false + else + # noop + end + end + + while true do + rs = @rules.select {|e| e.nullable.nil? } + nts = nterms.select {|e| e.nullable.nil? } + rule_count_1 = rs.count + nterm_count_1 = nts.count + + rs.each do |rule| + if rule.rhs.all?(&:nullable) + rule.nullable = true + end + end + + nts.each do |nterm| + find_rules_by_symbol!(nterm).each do |rule| + if rule.nullable + nterm.nullable = true + end + end + end + + rule_count_2 = @rules.count {|e| e.nullable.nil? } + nterm_count_2 = nterms.count {|e| e.nullable.nil? } + + if (rule_count_1 == rule_count_2) && (nterm_count_1 == nterm_count_2) + break + end + end + + rules.select {|r| r.nullable.nil? }.each do |rule| + rule.nullable = false + end + + nterms.select {|r| r.nullable.nil? }.each do |nterm| + nterm.nullable = false + end + end + + def find_symbol_by_s_value(s_value) + @symbols.find do |sym| + sym.id.s_value == s_value + end + end + + def find_symbol_by_s_value!(s_value) + find_symbol_by_s_value(s_value) || (raise "Symbol not found: #{s_value}") + end + + def find_symbol_by_id(id) + @symbols.find do |sym| + # TODO: validate uniqueness of Token#s_value and Symbol#alias_name + sym.id == id || sym.alias_name == id.s_value + end + end + + def find_symbol_by_id!(id) + find_symbol_by_id(id) || (raise "Symbol not found: #{id}") + end + + def find_symbol_by_number!(number) + sym = @symbols[number] + + raise "Symbol not found: #{number}" unless sym + raise "[BUG] Symbol number mismatch. #{number}, #{sym}" if sym.number != number + + sym + end + + def find_rules_by_symbol!(sym) + find_rules_by_symbol(sym) || (raise "Rules for #{sym} not found") + end + + def find_rules_by_symbol(sym) + @sym_to_rules[sym.number] + end + + def terms_count + terms.count + end + + def terms + @terms ||= @symbols.select(&:term?) + end + + def nterms_count + nterms.count + end + + def nterms + @nterms ||= @symbols.select(&:nterm?) + end + + private + + def find_nterm_by_id!(id) + nterms.find do |nterm| + nterm.id == id + end || (raise "Nterm not found: #{id}") + end + + + def append_special_symbols + # YYEMPTY (token_id: -2, number: -2) is added when a template is evaluated + # term = add_term(id: Token.new(Token::Ident, "YYEMPTY"), token_id: -2) + # term.number = -2 + # @empty_symbol = term + + # YYEOF + term = add_term(id: Token.new(type: Token::Ident, s_value: "YYEOF"), alias_name: "\"end of file\"", token_id: 0) + term.number = 0 + term.eof_symbol = true + @eof_symbol = term + + # YYerror + term = add_term(id: Token.new(type: Token::Ident, s_value: "YYerror"), alias_name: "error") + term.number = 1 + term.error_symbol = true + @error_symbol = term + + # YYUNDEF + term = add_term(id: Token.new(type: Token::Ident, s_value: "YYUNDEF"), alias_name: "\"invalid token\"") + term.number = 2 + term.undef_symbol = true + @undef_symbol = term + + # $accept + term = add_nterm(id: Token.new(type: Token::Ident, s_value: "$accept")) + term.accept_symbol = true + @accept_symbol = term + end + + # 1. Add $accept rule to the top of rules + # 2. Extract precedence and last action + # 3. Extract action in the middle of RHS into new Empty rule + # 4. Append id and extract action then create Rule + # + # Bison 3.8.2 uses different orders for symbol number and rule number + # when a rule has actions in the middle of a rule. + # + # For example, + # + # `program: $@1 top_compstmt` + # + # Rules are ordered like below, + # + # 1 $@1: ε + # 2 program: $@1 top_compstmt + # + # Symbols are ordered like below, + # + # 164 program + # 165 $@1 + # + def normalize_rules + # 1. Add $accept rule to the top of rules + accept = find_symbol_by_s_value!("$accept") + eof = find_symbol_by_number!(0) + lineno = @_rules.first ? @_rules.first[2] : 0 + @rules << Rule.new(id: @rules.count, lhs: accept, rhs: [@_rules.first[0], eof], code: nil, lineno: lineno) + + extracted_action_number = 1 # @n as nterm + + @_rules.each do |lhs, rhs, lineno| + a = [] + rhs1 = [] + code = nil + precedence_sym = nil + + # 2. Extract precedence and last action + rhs.reverse.each do |r| + case + when r.is_a?(Symbol) # precedence_sym + precedence_sym = r + when (r.type == Token::User_code) && precedence_sym.nil? && code.nil? && rhs1.empty? + code = r + else + rhs1 << r + end + end + rhs1.reverse! + + # Bison n'th component is 1-origin + (rhs1 + [code]).compact.each.with_index(1) do |token, i| + if token.type == Token::User_code + token.references.each do |ref| + # Need to keep position_in_rhs for actions in the middle of RHS + ref.position_in_rhs = i - 1 + next if ref.type == :at + # $$, $n, @$, @n can be used in any actions + number = ref.number + + if number == "$" + # TODO: Should be postponed after middle actions are extracted? + ref.referring_symbol = lhs + else + raise "Can not refer following component. #{number} >= #{i}. #{token}" if number >= i + rhs1[number - 1].referred = true + ref.referring_symbol = rhs1[number - 1] + end + end + end + end + + rhs2 = rhs1.map do |token| + if token.type == Token::User_code + prefix = token.referred ? "@" : "$@" + new_token = Token.new(type: Token::Ident, s_value: prefix + extracted_action_number.to_s) + extracted_action_number += 1 + a << [new_token, token] + new_token + else + token + end + end + + # Extract actions in the middle of RHS + # into new rules. + a.each do |new_token, code| + @rules << Rule.new(id: @rules.count, lhs: new_token, rhs: [], code: Code.new(type: :user_code, token_code: code), lineno: code.line) + end + + c = code ? Code.new(type: :user_code, token_code: code) : nil + @rules << Rule.new(id: @rules.count, lhs: lhs, rhs: rhs2, code: c, precedence_sym: precedence_sym, lineno: lineno) + + add_nterm(id: lhs) + a.each do |new_token, _| + add_nterm(id: new_token) + end + end + end + + # Collect symbols from rules + def collect_symbols + @rules.flat_map(&:rhs).each do |s| + case s + when Token + if s.type == Token::Char + add_term(id: s) + end + when Symbol + # skip + else + raise "Unknown class: #{s}" + end + end + end + + # Fill #number and #token_id + def fill_symbol_number + # TODO: why start from 256 + token_id = 256 + + # YYEMPTY = -2 + # YYEOF = 0 + # YYerror = 1 + # YYUNDEF = 2 + number = 3 + + nterm_token_id = 0 + used_numbers = {} + + @symbols.map(&:number).each do |n| + used_numbers[n] = true + end + + (@symbols.select(&:term?) + @symbols.select(&:nterm?)).each do |sym| + while used_numbers[number] do + number += 1 + end + + if sym.number.nil? + sym.number = number + number += 1 + end + + # If id is Token::Char, it uses ASCII code + if sym.term? && sym.token_id.nil? + if sym.id.type == Token::Char + # Igonre ' on the both sides + case sym.id.s_value[1..-2] + when "\\b" + sym.token_id = 8 + when "\\f" + sym.token_id = 12 + when "\\n" + sym.token_id = 10 + when "\\r" + sym.token_id = 13 + when "\\t" + sym.token_id = 9 + when "\\v" + sym.token_id = 11 + when "\"" + sym.token_id = 34 + when "\'" + sym.token_id = 39 + when "\\\\" + sym.token_id = 92 + when /\A\\(\d+)\z/ + sym.token_id = Integer($1, 8) + when /\A(.)\z/ + sym.token_id = $1.bytes.first + else + raise "Unknown Char s_value #{sym}" + end + else + sym.token_id = token_id + token_id += 1 + end + end + + if sym.nterm? && sym.token_id.nil? + sym.token_id = nterm_token_id + nterm_token_id += 1 + end + end + end + + def replace_token_with_symbol + @rules.each do |rule| + rule.lhs = token_to_symbol(rule.lhs) + + rule.rhs.map! do |t| + token_to_symbol(t) + end + + if rule.code + rule.code.references.each do |ref| + next if ref.type == :at + + if ref.referring_symbol.type != Token::User_code + ref.referring_symbol = token_to_symbol(ref.referring_symbol) + end + end + end + end + end + + def token_to_symbol(token) + case token + when Token + find_symbol_by_id!(token) + when Symbol + token + else + raise "Unknown class: #{token}" + end + end + + # Rule inherits precedence from the last term in RHS. + # + # https://www.gnu.org/software/bison/manual/html_node/How-Precedence.html + def fill_default_precedence + @rules.each do |rule| + # Explicitly specified precedence has the highest priority + next if rule.precedence_sym + + precedence_sym = nil + rule.rhs.each do |sym| + precedence_sym = sym if sym.term? + end + + rule.precedence_sym = precedence_sym + end + end + + def fill_sym_to_rules + @rules.each do |rule| + key = rule.lhs.number + @sym_to_rules[key] ||= [] + @sym_to_rules[key] << rule + end + end + + # Fill nterm's tag defined by %type decl + def fill_nterm_type + @types.each do |type| + nterm = find_nterm_by_id!(type.id) + nterm.tag = type.tag + end + end + + def fill_symbol_printer + @symbols.each do |sym| + @printers.each do |printer| + printer.ident_or_tags.each do |ident_or_tag| + case ident_or_tag.type + when Token::Ident + sym.printer = printer if sym.id == ident_or_tag + when Token::Tag + sym.printer = printer if sym.tag == ident_or_tag + else + raise "Unknown token type. #{printer}" + end + end + end + end + end + + def validate_symbol_number_uniqueness! + invalid = @symbols.group_by(&:number).select do |number, syms| + syms.count > 1 + end + + return if invalid.empty? + + raise "Symbol number is dupulicated. #{invalid}" + end + end +end -- cgit v1.2.3