diff options
Diffstat (limited to 'tool/lrama/lib/lrama/grammar')
27 files changed, 1380 insertions, 0 deletions
diff --git a/tool/lrama/lib/lrama/grammar/auxiliary.rb b/tool/lrama/lib/lrama/grammar/auxiliary.rb new file mode 100644 index 0000000000..933574b0f6 --- /dev/null +++ b/tool/lrama/lib/lrama/grammar/auxiliary.rb @@ -0,0 +1,7 @@ +module Lrama + class Grammar + # Grammar file information not used by States but by Output + class Auxiliary < Struct.new(:prologue_first_lineno, :prologue, :epilogue_first_lineno, :epilogue, keyword_init: true) + end + end +end diff --git a/tool/lrama/lib/lrama/grammar/binding.rb b/tool/lrama/lib/lrama/grammar/binding.rb new file mode 100644 index 0000000000..e5ea3fb037 --- /dev/null +++ b/tool/lrama/lib/lrama/grammar/binding.rb @@ -0,0 +1,24 @@ +module Lrama + class Grammar + class Binding + attr_reader :actual_args, :count + + def initialize(parameterizing_rule, actual_args) + @parameters = parameterizing_rule.parameters + @actual_args = actual_args + @parameter_to_arg = @parameters.zip(actual_args).map do |param, arg| + [param.s_value, arg] + end.to_h + end + + def resolve_symbol(symbol) + if symbol.is_a?(Lexer::Token::InstantiateRule) + resolved_args = symbol.args.map { |arg| resolve_symbol(arg) } + Lrama::Lexer::Token::InstantiateRule.new(s_value: symbol.s_value, location: symbol.location, args: resolved_args, lhs_tag: symbol.lhs_tag) + else + @parameter_to_arg[symbol.s_value] || symbol + end + end + end + end +end diff --git a/tool/lrama/lib/lrama/grammar/code.rb b/tool/lrama/lib/lrama/grammar/code.rb new file mode 100644 index 0000000000..3bad599dae --- /dev/null +++ b/tool/lrama/lib/lrama/grammar/code.rb @@ -0,0 +1,51 @@ +require "forwardable" +require "lrama/grammar/code/destructor_code" +require "lrama/grammar/code/initial_action_code" +require "lrama/grammar/code/no_reference_code" +require "lrama/grammar/code/printer_code" +require "lrama/grammar/code/rule_action" + +module Lrama + class Grammar + class Code + extend Forwardable + + def_delegators "token_code", :s_value, :line, :column, :references + + attr_reader :type, :token_code + + def initialize(type:, token_code:) + @type = type + @token_code = token_code + end + + def ==(other) + self.class == other.class && + self.type == other.type && + self.token_code == other.token_code + end + + # $$, $n, @$, @n are translated to C code + def translated_code + t_code = s_value.dup + + references.reverse_each do |ref| + first_column = ref.first_column + last_column = ref.last_column + + str = reference_to_c(ref) + + t_code[first_column...last_column] = str + end + + return t_code + end + + private + + def reference_to_c(ref) + raise NotImplementedError.new("#reference_to_c is not implemented") + end + end + end +end diff --git a/tool/lrama/lib/lrama/grammar/code/destructor_code.rb b/tool/lrama/lib/lrama/grammar/code/destructor_code.rb new file mode 100644 index 0000000000..70360eb90f --- /dev/null +++ b/tool/lrama/lib/lrama/grammar/code/destructor_code.rb @@ -0,0 +1,40 @@ +module Lrama + class Grammar + class Code + class DestructorCode < Code + def initialize(type:, token_code:, tag:) + super(type: type, token_code: token_code) + @tag = tag + end + + private + + # * ($$) *yyvaluep + # * (@$) *yylocationp + # * ($:$) error + # * ($1) error + # * (@1) error + # * ($:1) error + def reference_to_c(ref) + case + when ref.type == :dollar && ref.name == "$" # $$ + member = @tag.member + "((*yyvaluep).#{member})" + when ref.type == :at && ref.name == "$" # @$ + "(*yylocationp)" + when ref.type == :index && ref.name == "$" # $:$ + raise "$:#{ref.value} can not be used in #{type}." + when ref.type == :dollar # $n + raise "$#{ref.value} can not be used in #{type}." + when ref.type == :at # @n + raise "@#{ref.value} can not be used in #{type}." + when ref.type == :index # $:n + raise "$:#{ref.value} can not be used in #{type}." + else + raise "Unexpected. #{self}, #{ref}" + end + end + end + end + end +end diff --git a/tool/lrama/lib/lrama/grammar/code/initial_action_code.rb b/tool/lrama/lib/lrama/grammar/code/initial_action_code.rb new file mode 100644 index 0000000000..a694f193cb --- /dev/null +++ b/tool/lrama/lib/lrama/grammar/code/initial_action_code.rb @@ -0,0 +1,34 @@ +module Lrama + class Grammar + class Code + class InitialActionCode < Code + private + + # * ($$) yylval + # * (@$) yylloc + # * ($:$) error + # * ($1) error + # * (@1) error + # * ($:1) error + def reference_to_c(ref) + case + when ref.type == :dollar && ref.name == "$" # $$ + "yylval" + when ref.type == :at && ref.name == "$" # @$ + "yylloc" + when ref.type == :index && ref.name == "$" # $:$ + raise "$:#{ref.value} can not be used in initial_action." + when ref.type == :dollar # $n + raise "$#{ref.value} can not be used in initial_action." + when ref.type == :at # @n + raise "@#{ref.value} can not be used in initial_action." + when ref.type == :index # $:n + raise "$:#{ref.value} can not be used in initial_action." + else + raise "Unexpected. #{self}, #{ref}" + end + end + end + end + end +end diff --git a/tool/lrama/lib/lrama/grammar/code/no_reference_code.rb b/tool/lrama/lib/lrama/grammar/code/no_reference_code.rb new file mode 100644 index 0000000000..6e614cc64a --- /dev/null +++ b/tool/lrama/lib/lrama/grammar/code/no_reference_code.rb @@ -0,0 +1,28 @@ +module Lrama + class Grammar + class Code + class NoReferenceCode < Code + private + + # * ($$) error + # * (@$) error + # * ($:$) error + # * ($1) error + # * (@1) error + # * ($:1) error + def reference_to_c(ref) + case + when ref.type == :dollar # $$, $n + raise "$#{ref.value} can not be used in #{type}." + when ref.type == :at # @$, @n + raise "@#{ref.value} can not be used in #{type}." + when ref.type == :index # $:$, $:n + raise "$:#{ref.value} can not be used in #{type}." + else + raise "Unexpected. #{self}, #{ref}" + end + end + end + end + end +end diff --git a/tool/lrama/lib/lrama/grammar/code/printer_code.rb b/tool/lrama/lib/lrama/grammar/code/printer_code.rb new file mode 100644 index 0000000000..ffccd89395 --- /dev/null +++ b/tool/lrama/lib/lrama/grammar/code/printer_code.rb @@ -0,0 +1,40 @@ +module Lrama + class Grammar + class Code + class PrinterCode < Code + def initialize(type:, token_code:, tag:) + super(type: type, token_code: token_code) + @tag = tag + end + + private + + # * ($$) *yyvaluep + # * (@$) *yylocationp + # * ($:$) error + # * ($1) error + # * (@1) error + # * ($:1) error + def reference_to_c(ref) + case + when ref.type == :dollar && ref.name == "$" # $$ + member = @tag.member + "((*yyvaluep).#{member})" + when ref.type == :at && ref.name == "$" # @$ + "(*yylocationp)" + when ref.type == :index && ref.name == "$" # $:$ + raise "$:#{ref.value} can not be used in #{type}." + when ref.type == :dollar # $n + raise "$#{ref.value} can not be used in #{type}." + when ref.type == :at # @n + raise "@#{ref.value} can not be used in #{type}." + when ref.type == :index # $:n + raise "$:#{ref.value} can not be used in #{type}." + else + raise "Unexpected. #{self}, #{ref}" + end + end + end + end + end +end diff --git a/tool/lrama/lib/lrama/grammar/code/rule_action.rb b/tool/lrama/lib/lrama/grammar/code/rule_action.rb new file mode 100644 index 0000000000..d3c0eab64a --- /dev/null +++ b/tool/lrama/lib/lrama/grammar/code/rule_action.rb @@ -0,0 +1,88 @@ +module Lrama + class Grammar + class Code + class RuleAction < Code + def initialize(type:, token_code:, rule:) + super(type: type, token_code: token_code) + @rule = rule + end + + private + + # * ($$) yyval + # * (@$) yyloc + # * ($:$) error + # * ($1) yyvsp[i] + # * (@1) yylsp[i] + # * ($:1) i - 1 + # + # + # Consider a rule like + # + # class: keyword_class { $1 } tSTRING { $2 + $3 } keyword_end { $class = $1 + $keyword_end } + # + # For the semantic action of original rule: + # + # "Rule" class: keyword_class { $1 } tSTRING { $2 + $3 } keyword_end { $class = $1 + $keyword_end } + # "Position in grammar" $1 $2 $3 $4 $5 + # "Index for yyvsp" -4 -3 -2 -1 0 + # "$:n" $:1 $:2 $:3 $:4 $:5 + # "index of $:n" -5 -4 -3 -2 -1 + # + # + # For the first midrule action: + # + # "Rule" class: keyword_class { $1 } tSTRING { $2 + $3 } keyword_end { $class = $1 + $keyword_end } + # "Position in grammar" $1 + # "Index for yyvsp" 0 + # "$:n" $:1 + def reference_to_c(ref) + case + when ref.type == :dollar && ref.name == "$" # $$ + tag = ref.ex_tag || lhs.tag + raise_tag_not_found_error(ref) unless tag + "(yyval.#{tag.member})" + when ref.type == :at && ref.name == "$" # @$ + "(yyloc)" + when ref.type == :index && ref.name == "$" # $:$ + raise "$:$ is not supported" + when ref.type == :dollar # $n + i = -position_in_rhs + ref.index + tag = ref.ex_tag || rhs[ref.index - 1].tag + raise_tag_not_found_error(ref) unless tag + "(yyvsp[#{i}].#{tag.member})" + when ref.type == :at # @n + i = -position_in_rhs + ref.index + "(yylsp[#{i}])" + when ref.type == :index # $:n + i = -position_in_rhs + ref.index + "(#{i} - 1)" + else + raise "Unexpected. #{self}, #{ref}" + end + end + + def position_in_rhs + # If rule is not derived rule, User Code is only action at + # the end of rule RHS. In such case, the action is located on + # `@rule.rhs.count`. + @rule.position_in_original_rule_rhs || @rule.rhs.count + end + + # If this is midrule action, RHS is a RHS of the original rule. + def rhs + (@rule.original_rule || @rule).rhs + end + + # Unlike `rhs`, LHS is always a LHS of the rule. + def lhs + @rule.lhs + end + + def raise_tag_not_found_error(ref) + raise "Tag is not specified for '$#{ref.value}' in '#{@rule}'" + end + end + end + end +end diff --git a/tool/lrama/lib/lrama/grammar/counter.rb b/tool/lrama/lib/lrama/grammar/counter.rb new file mode 100644 index 0000000000..c13f4ec3e3 --- /dev/null +++ b/tool/lrama/lib/lrama/grammar/counter.rb @@ -0,0 +1,15 @@ +module Lrama + class Grammar + class Counter + def initialize(number) + @number = number + end + + def increment + n = @number + @number += 1 + n + end + end + end +end diff --git a/tool/lrama/lib/lrama/grammar/destructor.rb b/tool/lrama/lib/lrama/grammar/destructor.rb new file mode 100644 index 0000000000..4b7059e923 --- /dev/null +++ b/tool/lrama/lib/lrama/grammar/destructor.rb @@ -0,0 +1,9 @@ +module Lrama + class Grammar + class Destructor < Struct.new(:ident_or_tags, :token_code, :lineno, keyword_init: true) + def translated_code(tag) + Code::DestructorCode.new(type: :destructor, token_code: token_code, tag: tag).translated_code + end + end + end +end diff --git a/tool/lrama/lib/lrama/grammar/error_token.rb b/tool/lrama/lib/lrama/grammar/error_token.rb new file mode 100644 index 0000000000..8efde7df33 --- /dev/null +++ b/tool/lrama/lib/lrama/grammar/error_token.rb @@ -0,0 +1,9 @@ +module Lrama + class Grammar + class ErrorToken < Struct.new(:ident_or_tags, :token_code, :lineno, keyword_init: true) + def translated_code(tag) + Code::PrinterCode.new(type: :error_token, token_code: token_code, tag: tag).translated_code + end + end + end +end diff --git a/tool/lrama/lib/lrama/grammar/parameterizing_rule.rb b/tool/lrama/lib/lrama/grammar/parameterizing_rule.rb new file mode 100644 index 0000000000..d371805f4b --- /dev/null +++ b/tool/lrama/lib/lrama/grammar/parameterizing_rule.rb @@ -0,0 +1,3 @@ +require_relative 'parameterizing_rule/resolver' +require_relative 'parameterizing_rule/rhs' +require_relative 'parameterizing_rule/rule' diff --git a/tool/lrama/lib/lrama/grammar/parameterizing_rule/resolver.rb b/tool/lrama/lib/lrama/grammar/parameterizing_rule/resolver.rb new file mode 100644 index 0000000000..d8f3ae7897 --- /dev/null +++ b/tool/lrama/lib/lrama/grammar/parameterizing_rule/resolver.rb @@ -0,0 +1,56 @@ +module Lrama + class Grammar + class ParameterizingRule + class Resolver + attr_accessor :rules, :created_lhs_list + + def initialize + @rules = [] + @created_lhs_list = [] + end + + def add_parameterizing_rule(rule) + @rules << rule + end + + def find_rule(token) + select_rules(@rules, token).last + end + + def find_inline(token) + @rules.select { |rule| rule.name == token.s_value && rule.is_inline }.last + end + + def created_lhs(lhs_s_value) + @created_lhs_list.reverse.find { |created_lhs| created_lhs.s_value == lhs_s_value } + end + + private + + def select_rules(rules, token) + rules = select_not_inline_rules(rules) + rules = select_rules_by_name(rules, token.rule_name) + rules = rules.select { |rule| rule.required_parameters_count == token.args_count } + if rules.empty? + raise "Invalid number of arguments. `#{token.rule_name}`" + else + rules + end + end + + def select_not_inline_rules(rules) + rules.select { |rule| !rule.is_inline } + end + + def select_rules_by_name(rules, rule_name) + rules = rules.select { |rule| rule.name == rule_name } + if rules.empty? + raise "Parameterizing rule does not exist. `#{rule_name}`" + else + rules + end + end + end + end + end +end diff --git a/tool/lrama/lib/lrama/grammar/parameterizing_rule/rhs.rb b/tool/lrama/lib/lrama/grammar/parameterizing_rule/rhs.rb new file mode 100644 index 0000000000..3eb92f8ef4 --- /dev/null +++ b/tool/lrama/lib/lrama/grammar/parameterizing_rule/rhs.rb @@ -0,0 +1,37 @@ +module Lrama + class Grammar + class ParameterizingRule + class Rhs + attr_accessor :symbols, :user_code, :precedence_sym + + def initialize + @symbols = [] + @user_code = nil + @precedence_sym = nil + end + + def resolve_user_code(bindings) + return unless user_code + + var_to_arg = {} + symbols.each do |sym| + resolved_sym = bindings.resolve_symbol(sym) + if resolved_sym != sym + var_to_arg[sym.s_value] = resolved_sym.s_value + end + end + + var_to_arg.each do |var, arg| + user_code.references.each do |ref| + if ref.name == var + ref.name = arg + end + end + end + + return user_code + end + end + end + end +end diff --git a/tool/lrama/lib/lrama/grammar/parameterizing_rule/rule.rb b/tool/lrama/lib/lrama/grammar/parameterizing_rule/rule.rb new file mode 100644 index 0000000000..38f0fca4ea --- /dev/null +++ b/tool/lrama/lib/lrama/grammar/parameterizing_rule/rule.rb @@ -0,0 +1,18 @@ +module Lrama + class Grammar + class ParameterizingRule + class Rule + attr_reader :name, :parameters, :rhs_list, :required_parameters_count, :tag, :is_inline + + def initialize(name, parameters, rhs_list, tag: nil, is_inline: false) + @name = name + @parameters = parameters + @rhs_list = rhs_list + @tag = tag + @is_inline = is_inline + @required_parameters_count = parameters.count + end + end + end + end +end diff --git a/tool/lrama/lib/lrama/grammar/percent_code.rb b/tool/lrama/lib/lrama/grammar/percent_code.rb new file mode 100644 index 0000000000..8cbc5aef2c --- /dev/null +++ b/tool/lrama/lib/lrama/grammar/percent_code.rb @@ -0,0 +1,12 @@ +module Lrama + class Grammar + class PercentCode + attr_reader :name, :code + + def initialize(name, code) + @name = name + @code = code + end + end + end +end diff --git a/tool/lrama/lib/lrama/grammar/precedence.rb b/tool/lrama/lib/lrama/grammar/precedence.rb new file mode 100644 index 0000000000..fed739b3c0 --- /dev/null +++ b/tool/lrama/lib/lrama/grammar/precedence.rb @@ -0,0 +1,11 @@ +module Lrama + class Grammar + class Precedence < Struct.new(:type, :precedence, keyword_init: true) + include Comparable + + def <=>(other) + self.precedence <=> other.precedence + end + end + end +end diff --git a/tool/lrama/lib/lrama/grammar/printer.rb b/tool/lrama/lib/lrama/grammar/printer.rb new file mode 100644 index 0000000000..8984a96e1a --- /dev/null +++ b/tool/lrama/lib/lrama/grammar/printer.rb @@ -0,0 +1,9 @@ +module Lrama + class Grammar + class Printer < Struct.new(:ident_or_tags, :token_code, :lineno, keyword_init: true) + def translated_code(tag) + Code::PrinterCode.new(type: :printer, token_code: token_code, tag: tag).translated_code + end + end + end +end diff --git a/tool/lrama/lib/lrama/grammar/reference.rb b/tool/lrama/lib/lrama/grammar/reference.rb new file mode 100644 index 0000000000..c56e7673a6 --- /dev/null +++ b/tool/lrama/lib/lrama/grammar/reference.rb @@ -0,0 +1,14 @@ +module Lrama + class Grammar + # type: :dollar or :at + # name: String (e.g. $$, $foo, $expr.right) + # number: Integer (e.g. $1) + # index: Integer + # ex_tag: "$<tag>1" (Optional) + class Reference < Struct.new(:type, :name, :number, :index, :ex_tag, :first_column, :last_column, keyword_init: true) + def value + name || number + end + end + end +end diff --git a/tool/lrama/lib/lrama/grammar/rule.rb b/tool/lrama/lib/lrama/grammar/rule.rb new file mode 100644 index 0000000000..0e06edc80d --- /dev/null +++ b/tool/lrama/lib/lrama/grammar/rule.rb @@ -0,0 +1,60 @@ +module Lrama + class Grammar + # _rhs holds original RHS element. Use rhs to refer to Symbol. + class Rule < Struct.new(:id, :_lhs, :lhs, :lhs_tag, :_rhs, :rhs, :token_code, :position_in_original_rule_rhs, :nullable, :precedence_sym, :lineno, keyword_init: true) + attr_accessor :original_rule + + def ==(other) + self.class == other.class && + self.lhs == other.lhs && + self.lhs_tag == other.lhs_tag && + self.rhs == other.rhs && + self.token_code == other.token_code && + self.position_in_original_rule_rhs == other.position_in_original_rule_rhs && + self.nullable == other.nullable && + self.precedence_sym == other.precedence_sym && + self.lineno == other.lineno + end + + # TODO: Change this to display_name + def to_s + l = lhs.id.s_value + r = empty_rule? ? "ε" : rhs.map {|r| r.id.s_value }.join(" ") + + "#{l} -> #{r}" + end + + # Used by #user_actions + def as_comment + l = lhs.id.s_value + r = empty_rule? ? "%empty" : rhs.map(&:display_name).join(" ") + + "#{l}: #{r}" + end + + def with_actions + "#{to_s} {#{token_code&.s_value}}" + end + + # opt_nl: ε <-- empty_rule + # | '\n' <-- not empty_rule + def empty_rule? + rhs.empty? + end + + def precedence + precedence_sym&.precedence + end + + def initial_rule? + id == 0 + end + + def translated_code + return nil unless token_code + + Code::RuleAction.new(type: :rule_action, token_code: token_code, rule: self).translated_code + end + end + end +end diff --git a/tool/lrama/lib/lrama/grammar/rule_builder.rb b/tool/lrama/lib/lrama/grammar/rule_builder.rb new file mode 100644 index 0000000000..ccb41e67f8 --- /dev/null +++ b/tool/lrama/lib/lrama/grammar/rule_builder.rb @@ -0,0 +1,268 @@ +module Lrama + class Grammar + class RuleBuilder + attr_accessor :lhs, :line + attr_reader :lhs_tag, :rhs, :user_code, :precedence_sym + + def initialize(rule_counter, midrule_action_counter, position_in_original_rule_rhs = nil, lhs_tag: nil, skip_preprocess_references: false) + @rule_counter = rule_counter + @midrule_action_counter = midrule_action_counter + @position_in_original_rule_rhs = position_in_original_rule_rhs + @skip_preprocess_references = skip_preprocess_references + + @lhs = nil + @lhs_tag = lhs_tag + @rhs = [] + @user_code = nil + @precedence_sym = nil + @line = nil + @rules = [] + @rule_builders_for_parameterizing_rules = [] + @rule_builders_for_derived_rules = [] + @rule_builders_for_inline_rules = [] + @parameterizing_rules = [] + @inline_rules = [] + @midrule_action_rules = [] + end + + def add_rhs(rhs) + if !@line + @line = rhs.line + end + + flush_user_code + + @rhs << rhs + end + + def user_code=(user_code) + if !@line + @line = user_code&.line + end + + flush_user_code + + @user_code = user_code + end + + def precedence_sym=(precedence_sym) + flush_user_code + + @precedence_sym = precedence_sym + end + + def complete_input + freeze_rhs + end + + def setup_rules(parameterizing_rule_resolver) + preprocess_references unless @skip_preprocess_references + if rhs.any? { |token| parameterizing_rule_resolver.find_inline(token) } + resolve_inline(parameterizing_rule_resolver) + else + process_rhs(parameterizing_rule_resolver) + end + build_rules + end + + def rules + @parameterizing_rules + @inline_rules + @midrule_action_rules + @rules + end + + private + + def freeze_rhs + @rhs.freeze + end + + def preprocess_references + numberize_references + end + + def build_rules + tokens = @replaced_rhs + + if tokens + rule = Rule.new( + id: @rule_counter.increment, _lhs: lhs, _rhs: tokens, lhs_tag: lhs_tag, token_code: user_code, + position_in_original_rule_rhs: @position_in_original_rule_rhs, precedence_sym: precedence_sym, lineno: line + ) + @rules = [rule] + @parameterizing_rules = @rule_builders_for_parameterizing_rules.map do |rule_builder| + rule_builder.rules + end.flatten + @midrule_action_rules = @rule_builders_for_derived_rules.map do |rule_builder| + rule_builder.rules + end.flatten + @midrule_action_rules.each do |r| + r.original_rule = rule + end + else + @inline_rules = @rule_builders_for_inline_rules.map do |rule_builder| + rule_builder.rules + end.flatten + end + end + + # rhs is a mixture of variety type of tokens like `Ident`, `InstantiateRule`, `UserCode` and so on. + # `#process_rhs` replaces some kind of tokens to `Ident` so that all `@replaced_rhs` are `Ident` or `Char`. + def process_rhs(parameterizing_rule_resolver) + return if @replaced_rhs + + @replaced_rhs = [] + + rhs.each_with_index do |token, i| + case token + when Lrama::Lexer::Token::Char + @replaced_rhs << token + when Lrama::Lexer::Token::Ident + @replaced_rhs << token + when Lrama::Lexer::Token::InstantiateRule + parameterizing_rule = parameterizing_rule_resolver.find_rule(token) + raise "Unexpected token. #{token}" unless parameterizing_rule + + bindings = Binding.new(parameterizing_rule, token.args) + lhs_s_value = lhs_s_value(token, bindings) + if (created_lhs = parameterizing_rule_resolver.created_lhs(lhs_s_value)) + @replaced_rhs << created_lhs + else + lhs_token = Lrama::Lexer::Token::Ident.new(s_value: lhs_s_value, location: token.location) + @replaced_rhs << lhs_token + parameterizing_rule_resolver.created_lhs_list << lhs_token + parameterizing_rule.rhs_list.each do |r| + rule_builder = RuleBuilder.new(@rule_counter, @midrule_action_counter, lhs_tag: token.lhs_tag || parameterizing_rule.tag) + rule_builder.lhs = lhs_token + r.symbols.each { |sym| rule_builder.add_rhs(bindings.resolve_symbol(sym)) } + rule_builder.line = line + rule_builder.precedence_sym = r.precedence_sym + rule_builder.user_code = r.resolve_user_code(bindings) + rule_builder.complete_input + rule_builder.setup_rules(parameterizing_rule_resolver) + @rule_builders_for_parameterizing_rules << rule_builder + end + end + when Lrama::Lexer::Token::UserCode + prefix = token.referred ? "@" : "$@" + tag = token.tag || lhs_tag + new_token = Lrama::Lexer::Token::Ident.new(s_value: prefix + @midrule_action_counter.increment.to_s) + @replaced_rhs << new_token + + rule_builder = RuleBuilder.new(@rule_counter, @midrule_action_counter, i, lhs_tag: tag, skip_preprocess_references: true) + rule_builder.lhs = new_token + rule_builder.user_code = token + rule_builder.complete_input + rule_builder.setup_rules(parameterizing_rule_resolver) + + @rule_builders_for_derived_rules << rule_builder + else + raise "Unexpected token. #{token}" + end + end + end + + def lhs_s_value(token, bindings) + s_values = token.args.map do |arg| + resolved = bindings.resolve_symbol(arg) + if resolved.is_a?(Lexer::Token::InstantiateRule) + [resolved.s_value, resolved.args.map(&:s_value)] + else + resolved.s_value + end + end + "#{token.rule_name}_#{s_values.join('_')}" + end + + def resolve_inline(parameterizing_rule_resolver) + rhs.each_with_index do |token, i| + if inline_rule = parameterizing_rule_resolver.find_inline(token) + inline_rule.rhs_list.each_with_index do |inline_rhs| + rule_builder = RuleBuilder.new(@rule_counter, @midrule_action_counter, lhs_tag: lhs_tag, skip_preprocess_references: true) + resolve_inline_rhs(rule_builder, inline_rhs, i) + rule_builder.lhs = lhs + rule_builder.line = line + rule_builder.user_code = replace_inline_user_code(inline_rhs, i) + rule_builder.complete_input + rule_builder.setup_rules(parameterizing_rule_resolver) + @rule_builders_for_inline_rules << rule_builder + end + end + end + end + + def resolve_inline_rhs(rule_builder, inline_rhs, index) + rhs.each_with_index do |token, i| + if index == i + inline_rhs.symbols.each { |sym| rule_builder.add_rhs(sym) } + else + rule_builder.add_rhs(token) + end + end + end + + def replace_inline_user_code(inline_rhs, index) + return user_code if inline_rhs.user_code.nil? + return user_code if user_code.nil? + + code = user_code.s_value.gsub(/\$#{index + 1}/, inline_rhs.user_code.s_value) + Lrama::Lexer::Token::UserCode.new(s_value: code, location: user_code.location) + end + + def numberize_references + # Bison n'th component is 1-origin + (rhs + [user_code]).compact.each.with_index(1) do |token, i| + next unless token.is_a?(Lrama::Lexer::Token::UserCode) + + token.references.each do |ref| + ref_name = ref.name + + if ref_name + if ref_name == '$' + ref.name = '$' + else + candidates = ([lhs] + rhs).each_with_index.select {|token, _i| token.referred_by?(ref_name) } + + if candidates.size >= 2 + token.invalid_ref(ref, "Referring symbol `#{ref_name}` is duplicated.") + end + + unless (referring_symbol = candidates.first) + token.invalid_ref(ref, "Referring symbol `#{ref_name}` is not found.") + end + + if referring_symbol[1] == 0 # Refers to LHS + ref.name = '$' + else + ref.number = referring_symbol[1] + end + end + end + + if ref.number + # TODO: When Inlining is implemented, for example, if `$1` is expanded to multiple RHS tokens, + # `$2` needs to access `$2 + n` to actually access it. So, after the Inlining implementation, + # it needs resolves from number to index. + ref.index = ref.number + end + + # TODO: Need to check index of @ too? + next if ref.type == :at + + if ref.index + # TODO: Prohibit $0 even so Bison allows it? + # See: https://www.gnu.org/software/bison/manual/html_node/Actions.html + token.invalid_ref(ref, "Can not refer following component. #{ref.index} >= #{i}.") if ref.index >= i + rhs[ref.index - 1].referred = true + end + end + end + end + + def flush_user_code + if (c = @user_code) + @rhs << c + @user_code = nil + end + end + end + end +end diff --git a/tool/lrama/lib/lrama/grammar/stdlib.y b/tool/lrama/lib/lrama/grammar/stdlib.y new file mode 100644 index 0000000000..d6e89c908c --- /dev/null +++ b/tool/lrama/lib/lrama/grammar/stdlib.y @@ -0,0 +1,122 @@ +/********************************************************************** + + stdlib.y + + This is lrama's standard library. It provides a number of + parameterizing rule definitions, such as options and lists, + that should be useful in a number of situations. + +**********************************************************************/ + +// ------------------------------------------------------------------- +// Options + +/* + * program: option(number) + * + * => + * + * program: option_number + * option_number: %empty + * option_number: number + */ +%rule option(X): /* empty */ + | X + ; + +// ------------------------------------------------------------------- +// Sequences + +/* + * program: preceded(opening, X) + * + * => + * + * program: preceded_opening_X + * preceded_opening_X: opening X + */ +%rule preceded(opening, X): opening X { $$ = $2; } + ; + +/* + * program: terminated(X, closing) + * + * => + * + * program: terminated_X_closing + * terminated_X_closing: X closing + */ +%rule terminated(X, closing): X closing { $$ = $1; } + ; + +/* + * program: delimited(opening, X, closing) + * + * => + * + * program: delimited_opening_X_closing + * delimited_opening_X_closing: opening X closing + */ +%rule delimited(opening, X, closing): opening X closing { $$ = $2; } + ; + +// ------------------------------------------------------------------- +// Lists + +/* + * program: list(number) + * + * => + * + * program: list_number + * list_number: %empty + * list_number: list_number number + */ +%rule list(X): /* empty */ + | list(X) X + ; + +/* + * program: nonempty_list(number) + * + * => + * + * program: nonempty_list_number + * nonempty_list_number: number + * nonempty_list_number: nonempty_list_number number + */ +%rule nonempty_list(X): X + | nonempty_list(X) X + ; + +/* + * program: separated_nonempty_list(comma, number) + * + * => + * + * program: separated_nonempty_list_comma_number + * separated_nonempty_list_comma_number: number + * separated_nonempty_list_comma_number: separated_nonempty_list_comma_number comma number + */ +%rule separated_nonempty_list(separator, X): X + | separated_nonempty_list(separator, X) separator X + ; + +/* + * program: separated_list(comma, number) + * + * => + * + * program: separated_list_comma_number + * separated_list_comma_number: option_separated_nonempty_list_comma_number + * option_separated_nonempty_list_comma_number: %empty + * option_separated_nonempty_list_comma_number: separated_nonempty_list_comma_number + * separated_nonempty_list_comma_number: number + * separated_nonempty_list_comma_number: comma separated_nonempty_list_comma_number number + */ +%rule separated_list(separator, X): option(separated_nonempty_list(separator, X)) + ; + +%% + +%union{}; diff --git a/tool/lrama/lib/lrama/grammar/symbol.rb b/tool/lrama/lib/lrama/grammar/symbol.rb new file mode 100644 index 0000000000..deb67ad9a8 --- /dev/null +++ b/tool/lrama/lib/lrama/grammar/symbol.rb @@ -0,0 +1,103 @@ +# Symbol is both of nterm and term +# `number` is both for nterm and term +# `token_id` is tokentype for term, internal sequence number for nterm +# +# TODO: Add validation for ASCII code range for Token::Char + +module Lrama + class Grammar + class Symbol + attr_accessor :id, :alias_name, :tag, :number, :token_id, :nullable, :precedence, + :printer, :destructor, :error_token, :first_set, :first_set_bitmap + attr_reader :term + attr_writer :eof_symbol, :error_symbol, :undef_symbol, :accept_symbol + + def initialize(id:, term:, alias_name: nil, number: nil, tag: nil, token_id: nil, nullable: nil, precedence: nil, printer: nil, destructor: nil) + @id = id + @alias_name = alias_name + @number = number + @tag = tag + @term = term + @token_id = token_id + @nullable = nullable + @precedence = precedence + @printer = printer + @destructor = destructor + end + + def term? + term + end + + def nterm? + !term + end + + def eof_symbol? + !!@eof_symbol + end + + def error_symbol? + !!@error_symbol + end + + def undef_symbol? + !!@undef_symbol + end + + def accept_symbol? + !!@accept_symbol + end + + def display_name + alias_name || id.s_value + end + + # name for yysymbol_kind_t + # + # See: b4_symbol_kind_base + # @type var name: String + def enum_name + case + when accept_symbol? + name = "YYACCEPT" + when eof_symbol? + name = "YYEOF" + when term? && id.is_a?(Lrama::Lexer::Token::Char) + name = number.to_s + display_name + when term? && id.is_a?(Lrama::Lexer::Token::Ident) + name = id.s_value + when nterm? && (id.s_value.include?("$") || id.s_value.include?("@")) + name = number.to_s + id.s_value + when nterm? + name = id.s_value + else + raise "Unexpected #{self}" + end + + "YYSYMBOL_" + name.gsub(/\W+/, "_") + end + + # comment for yysymbol_kind_t + def comment + case + when accept_symbol? + # YYSYMBOL_YYACCEPT + id.s_value + when eof_symbol? + # YYEOF + alias_name + when (term? && 0 < token_id && token_id < 128) + # YYSYMBOL_3_backslash_, YYSYMBOL_14_ + alias_name || id.s_value + when id.s_value.include?("$") || id.s_value.include?("@") + # YYSYMBOL_21_1 + id.s_value + else + # YYSYMBOL_keyword_class, YYSYMBOL_strings_1 + alias_name || id.s_value + end + end + end + end +end diff --git a/tool/lrama/lib/lrama/grammar/symbols.rb b/tool/lrama/lib/lrama/grammar/symbols.rb new file mode 100644 index 0000000000..cc9b4ec559 --- /dev/null +++ b/tool/lrama/lib/lrama/grammar/symbols.rb @@ -0,0 +1 @@ +require_relative "symbols/resolver" diff --git a/tool/lrama/lib/lrama/grammar/symbols/resolver.rb b/tool/lrama/lib/lrama/grammar/symbols/resolver.rb new file mode 100644 index 0000000000..1788ed63fa --- /dev/null +++ b/tool/lrama/lib/lrama/grammar/symbols/resolver.rb @@ -0,0 +1,293 @@ +module Lrama + class Grammar + class Symbols + class Resolver + attr_reader :terms, :nterms + + def initialize + @terms = [] + @nterms = [] + end + + def symbols + @symbols ||= (@terms + @nterms) + end + + def sort_by_number! + symbols.sort_by!(&:number) + end + + def add_term(id:, alias_name: nil, tag: nil, token_id: nil, replace: false) + if token_id && (sym = find_symbol_by_token_id(token_id)) + if replace + sym.id = id + sym.alias_name = alias_name + sym.tag = tag + end + + return sym + end + + if (sym = find_symbol_by_id(id)) + return sym + end + + @symbols = nil + term = Symbol.new( + id: id, alias_name: alias_name, number: nil, tag: tag, + term: true, token_id: token_id, nullable: false + ) + @terms << term + term + end + + def add_nterm(id:, alias_name: nil, tag: nil) + return if find_symbol_by_id(id) + + @symbols = nil + nterm = Symbol.new( + id: id, alias_name: alias_name, number: nil, tag: tag, + term: false, token_id: nil, nullable: nil, + ) + @nterms << nterm + nterm + end + + def find_symbol_by_s_value(s_value) + symbols.find { |s| s.id.s_value == s_value } + end + + def find_symbol_by_s_value!(s_value) + find_symbol_by_s_value(s_value) || (raise "Symbol not found. value: `#{s_value}`") + end + + def find_symbol_by_id(id) + symbols.find do |s| + s.id == id || s.alias_name == id.s_value + end + end + + def find_symbol_by_id!(id) + find_symbol_by_id(id) || (raise "Symbol not found. #{id}") + end + + def find_symbol_by_token_id(token_id) + symbols.find {|s| s.token_id == token_id } + end + + def find_symbol_by_number!(number) + sym = symbols[number] + + raise "Symbol not found. number: `#{number}`" unless sym + raise "[BUG] Symbol number mismatch. #{number}, #{sym}" if sym.number != number + + sym + end + + def fill_symbol_number + # YYEMPTY = -2 + # YYEOF = 0 + # YYerror = 1 + # YYUNDEF = 2 + @number = 3 + fill_terms_number + fill_nterms_number + end + + def fill_nterm_type(types) + types.each do |type| + nterm = find_nterm_by_id!(type.id) + nterm.tag = type.tag + end + end + + def fill_printer(printers) + symbols.each do |sym| + printers.each do |printer| + printer.ident_or_tags.each do |ident_or_tag| + case ident_or_tag + when Lrama::Lexer::Token::Ident + sym.printer = printer if sym.id == ident_or_tag + when Lrama::Lexer::Token::Tag + sym.printer = printer if sym.tag == ident_or_tag + else + raise "Unknown token type. #{printer}" + end + end + end + end + end + + def fill_destructor(destructors) + symbols.each do |sym| + destructors.each do |destructor| + destructor.ident_or_tags.each do |ident_or_tag| + case ident_or_tag + when Lrama::Lexer::Token::Ident + sym.destructor = destructor if sym.id == ident_or_tag + when Lrama::Lexer::Token::Tag + sym.destructor = destructor if sym.tag == ident_or_tag + else + raise "Unknown token type. #{destructor}" + end + end + end + end + end + + def fill_error_token(error_tokens) + symbols.each do |sym| + error_tokens.each do |token| + token.ident_or_tags.each do |ident_or_tag| + case ident_or_tag + when Lrama::Lexer::Token::Ident + sym.error_token = token if sym.id == ident_or_tag + when Lrama::Lexer::Token::Tag + sym.error_token = token if sym.tag == ident_or_tag + else + raise "Unknown token type. #{token}" + end + end + end + end + end + + def token_to_symbol(token) + case token + when Lrama::Lexer::Token + find_symbol_by_id!(token) + else + raise "Unknown class: #{token}" + end + end + + def validate! + validate_number_uniqueness! + validate_alias_name_uniqueness! + end + + private + + def find_nterm_by_id!(id) + @nterms.find do |s| + s.id == id + end || (raise "Symbol not found. #{id}") + end + + def fill_terms_number + # Character literal in grammar file has + # token id corresponding to ASCII code by default, + # so start token_id from 256. + token_id = 256 + + @terms.each do |sym| + while used_numbers[@number] do + @number += 1 + end + + if sym.number.nil? + sym.number = @number + used_numbers[@number] = true + @number += 1 + end + + # If id is Token::Char, it uses ASCII code + if sym.token_id.nil? + if sym.id.is_a?(Lrama::Lexer::Token::Char) + # Ignore ' on the both sides + case sym.id.s_value[1..-2] + when "\\b" + sym.token_id = 8 + when "\\f" + sym.token_id = 12 + when "\\n" + sym.token_id = 10 + when "\\r" + sym.token_id = 13 + when "\\t" + sym.token_id = 9 + when "\\v" + sym.token_id = 11 + when "\"" + sym.token_id = 34 + when "'" + sym.token_id = 39 + when "\\\\" + sym.token_id = 92 + when /\A\\(\d+)\z/ + unless (id = Integer($1, 8)).nil? + sym.token_id = id + else + raise "Unknown Char s_value #{sym}" + end + when /\A(.)\z/ + unless (id = $1&.bytes&.first).nil? + sym.token_id = id + else + raise "Unknown Char s_value #{sym}" + end + else + raise "Unknown Char s_value #{sym}" + end + else + sym.token_id = token_id + token_id += 1 + end + end + end + end + + def fill_nterms_number + token_id = 0 + + @nterms.each do |sym| + while used_numbers[@number] do + @number += 1 + end + + if sym.number.nil? + sym.number = @number + used_numbers[@number] = true + @number += 1 + end + + if sym.token_id.nil? + sym.token_id = token_id + token_id += 1 + end + end + end + + def used_numbers + return @used_numbers if defined?(@used_numbers) + + @used_numbers = {} + symbols.map(&:number).each do |n| + @used_numbers[n] = true + end + @used_numbers + end + + def validate_number_uniqueness! + invalid = symbols.group_by(&:number).select do |number, syms| + syms.count > 1 + end + + return if invalid.empty? + + raise "Symbol number is duplicated. #{invalid}" + end + + def validate_alias_name_uniqueness! + invalid = symbols.select(&:alias_name).group_by(&:alias_name).select do |alias_name, syms| + syms.count > 1 + end + + return if invalid.empty? + + raise "Symbol alias name is duplicated. #{invalid}" + end + end + end + end +end diff --git a/tool/lrama/lib/lrama/grammar/type.rb b/tool/lrama/lib/lrama/grammar/type.rb new file mode 100644 index 0000000000..6b4b0961a1 --- /dev/null +++ b/tool/lrama/lib/lrama/grammar/type.rb @@ -0,0 +1,18 @@ +module Lrama + class Grammar + class Type + attr_reader :id, :tag + + def initialize(id:, tag:) + @id = id + @tag = tag + end + + def ==(other) + self.class == other.class && + self.id == other.id && + self.tag == other.tag + end + end + end +end diff --git a/tool/lrama/lib/lrama/grammar/union.rb b/tool/lrama/lib/lrama/grammar/union.rb new file mode 100644 index 0000000000..854bffb5c1 --- /dev/null +++ b/tool/lrama/lib/lrama/grammar/union.rb @@ -0,0 +1,10 @@ +module Lrama + class Grammar + class Union < Struct.new(:code, :lineno, keyword_init: true) + def braces_less_code + # Braces is already removed by lexer + code.s_value + end + end + end +end |