require "strscan"
require "lrama/lexer/grammar_file"
require "lrama/lexer/location"
require "lrama/lexer/token"

module Lrama
  # Scanner that turns the text of a grammar file into `[type, value]` token
  # pairs, one pair per call to {#next_token}.
  #
  # The lexer has two modes, selected through {#status}:
  #
  # * `:initial`       - ordinary grammar tokens (see {#lex_token})
  # * `:c_declaration` - a run of raw user code that ends at {#end_symbol}
  #                      (see {#lex_c_code})
  #
  # Line/column bookkeeping is kept in `@line` (current line, starting at 1),
  # `@head` (scanner position at which the current line starts) and
  # `@head_line` / `@head_column` (where the token being lexed began).
  class Lexer
    attr_reader :head_line, :head_column, :line
    attr_accessor :status, :end_symbol

    # Regexp source fragments (already escaped where needed) for punctuation
    # tokens. Order matters: they are joined into one alternation, so longer
    # spellings such as `%{` must precede their prefixes.
    SYMBOLS = ['%{', '%}', '%%', '{', '}', '\[', '\]', '\(', '\)', '\,', ':', '\|', ';'].freeze

    # Directive keywords. Order matters for the same reason as SYMBOLS
    # (e.g. `%precedence` before `%prec`, `%after-shift-error-token` before
    # `%after-shift`).
    PERCENT_TOKENS = %w(
      %union
      %token
      %type
      %left
      %right
      %nonassoc
      %expect
      %define
      %require
      %printer
      %destructor
      %lex-param
      %parse-param
      %initial-action
      %precedence
      %prec
      %error-token
      %before-reduce
      %after-reduce
      %after-shift-error-token
      %after-shift
      %after-pop-stack
      %empty
      %code
      %rule
      %no-stdlib
      %inline
    ).freeze

    # Alternations compiled once instead of on every #lex_token call.
    SYMBOLS_PATTERN = /#{SYMBOLS.join('|')}/
    PERCENT_TOKENS_PATTERN = /#{PERCENT_TOKENS.join('|')}/
    private_constant :SYMBOLS_PATTERN, :PERCENT_TOKENS_PATTERN

    # @param grammar_file [#text] object whose `text` is the input to scan; it
    #   is also stored and passed on to every Location this lexer builds.
    def initialize(grammar_file)
      @grammar_file = grammar_file
      @scanner = StringScanner.new(grammar_file.text)
      @head_column = @head = @scanner.pos
      @head_line = @line = 1
      @status = :initial
      @end_symbol = nil
    end

    # Lexes the next token according to the current {#status}.
    #
    # @return [Array, nil] a `[type, value]` pair; nil at end of input in
    #   `:initial` mode, or when the status is neither `:initial` nor
    #   `:c_declaration`.
    def next_token
      case @status
      when :initial
        lex_token
      when :c_declaration
        lex_c_code
      end
    end

    # @return [Integer] distance between the current scanner position and the
    #   position at which the current line started.
    def column
      @scanner.pos - @head
    end

    # @return [Location] span from the recorded token start
    #   (`head_line`/`head_column`) to the current scanner position.
    def location
      Location.new(
        grammar_file: @grammar_file,
        first_line: @head_line,
        first_column: @head_column,
        last_line: line,
        last_column: column
      )
    end

    # Skips blanks and comments, then lexes one grammar token.
    #
    # @return [Array, nil] `[type, value]`, or nil at end of input. Punctuation
    #   and directive tokens use the matched text for both elements.
    # @raise [ParseError] when the input at the current position matches no
    #   known token.
    def lex_token
      skip_blanks_and_comments
      reset_first_position

      case
      when @scanner.eos?
        nil
      when @scanner.scan(SYMBOLS_PATTERN)
        [@scanner.matched, @scanner.matched]
      when @scanner.scan(PERCENT_TOKENS_PATTERN)
        [@scanner.matched, @scanner.matched]
      when @scanner.scan(/[\?\+\*]/)
        [@scanner.matched, @scanner.matched]
      when @scanner.scan(/<\w+>/)
        [:TAG, Lrama::Lexer::Token::Tag.new(s_value: @scanner.matched, location: location)]
      when @scanner.scan(/'.'/)
        [:CHARACTER, Lrama::Lexer::Token::Char.new(s_value: @scanner.matched, location: location)]
      when @scanner.scan(/'\\\\'|'\\b'|'\\t'|'\\f'|'\\r'|'\\n'|'\\v'|'\\13'/)
        [:CHARACTER, Lrama::Lexer::Token::Char.new(s_value: @scanner.matched, location: location)]
      when @scanner.scan(/".*?"/)
        [:STRING, @scanner.matched]
      when @scanner.scan(/\d+/)
        [:INTEGER, Integer(@scanner.matched)]
      when @scanner.scan(/([a-zA-Z_.][-a-zA-Z0-9_.]*)/)
        token = Lrama::Lexer::Token::Ident.new(s_value: @scanner.matched, location: location)
        # Look ahead without consuming: an identifier followed by an optional
        # `[alias]` and then `:` is reported as IDENT_COLON.
        followed_by_colon = @scanner.check(/\s*(\[\s*[a-zA-Z_.][-a-zA-Z0-9_.]*\s*\])?\s*:/)
        [followed_by_colon ? :IDENT_COLON : :IDENTIFIER, token]
      else
        raise ParseError, "Unexpected token: #{@scanner.peek(10).chomp}."
      end
    end

    # Collects raw user code up to (but not including) the end symbol.
    #
    # `end_symbol` is interpolated unescaped into a regexp for the terminator
    # check, so it is treated as a pattern rather than a literal. When it is
    # `'}'`, braces are counted and only a closing brace at nesting depth 0
    # terminates the code; that brace is left unconsumed in the input.
    #
    # @return [Array] `[:C_DECLARATION, Token::UserCode]`
    # @raise [ParseError] when the input ends before the end symbol is seen.
    def lex_c_code
      nested = 0
      code = +''
      reset_first_position

      until @scanner.eos?
        case
        when @scanner.scan(/{/)
          code << @scanner.matched
          nested += 1
        when @scanner.scan(/}/)
          if nested == 0 && @end_symbol == '}'
            # Put the brace back so the caller sees it as the next token.
            @scanner.unscan
            return [:C_DECLARATION, Lrama::Lexer::Token::UserCode.new(s_value: code, location: location)]
          else
            code << @scanner.matched
            nested -= 1
          end
        when @scanner.check(/#{@end_symbol}/)
          return [:C_DECLARATION, Lrama::Lexer::Token::UserCode.new(s_value: code, location: location)]
        when @scanner.scan(/\n/)
          code << @scanner.matched
          newline
        when @scanner.scan(/".*?"/)
          # Quoted text is copied whole so braces inside it are not counted.
          code << @scanner.matched
          @line += @scanner.matched.count("\n")
        when @scanner.scan(/'.*?'/)
          code << @scanner.matched
        when @scanner.scan(/[^\"'\{\}\n]+/)
          code << @scanner.matched
        when @scanner.scan(/#{Regexp.escape(@end_symbol)}/)
          code << @scanner.matched
        else
          code << @scanner.getch
        end
      end

      raise ParseError, "Unexpected code: #{code}."
    end

    private

    # Consumes newlines, blanks, `/* ... */` comments and `// ...` comments
    # that precede the next token, keeping the line bookkeeping up to date.
    def skip_blanks_and_comments
      until @scanner.eos?
        if @scanner.scan(/\n/)
          newline
        elsif @scanner.scan(/[^\S\n]+/)
          # Whitespace other than "\n". Newlines are deliberately excluded so
          # that each one goes through the branch above and is counted.
        elsif @scanner.scan(%r{/\*})
          lex_comment
        elsif @scanner.scan(%r{//.*(?<newline>\n)?})
          # The trailing newline is optional: a line comment may end the input.
          newline if @scanner[:newline]
        else
          break
        end
      end
    end

    # Consumes the remainder of a `/* ... */` comment whose opener has already
    # been scanned. Stops silently at end of input if no terminator is found.
    def lex_comment
      until @scanner.eos?
        if @scanner.scan(/\n/)
          newline
        elsif @scanner.scan(%r{\*/})
          return
        else
          @scanner.getch
        end
      end
    end

    # Records the current line and column as the start of the next token.
    def reset_first_position
      @head_line = line
      @head_column = column
    end

    # Advances the line counter and remembers where the new line starts.
    def newline
      @line += 1
      @head = @scanner.pos
    end
  end
end