diff options
Diffstat (limited to 'lib/rdoc/encoding.rb')
| -rw-r--r-- | lib/rdoc/encoding.rb | 136 |
1 files changed, 136 insertions, 0 deletions
# coding: US-ASCII
# frozen_string_literal: true

##
# This module is a wrapper around File IO and Encoding that helps RDoc load
# files and convert them to the correct encoding.

module RDoc::Encoding

  HEADER_REGEXP = /^
    (?:
      \A\#!.*\n
      |
      ^\#\s+frozen[-_]string[-_]literal[=:].+\n
      |
      ^\#[^\n]+\b(?:en)?coding[=:]\s*(?<name>[^\s;]+).*\n
      |
      <\?xml[^?]*encoding=(?<quote>["'])(?<name>.*?)\k<quote>.*\n
    )+
  /xi # :nodoc:

  # True when running on Ruby 2.3 or newer. The version is compared
  # numerically: comparing the version strings directly is lexicographic, so
  # "10.0.0" would sort before "2.3.0".
  RUBY_2_3_OR_NEWER =
    (RUBY_VERSION.split('.').map(&:to_i) <=> [2, 3, 0]) >= 0 # :nodoc:
  private_constant :RUBY_2_3_OR_NEWER

  ##
  # Reads the contents of +filename+ and handles any encoding directives in
  # the file.
  #
  # The content will be converted to the +encoding+ (Encoding.default_external
  # when +encoding+ is nil).  If the file cannot be converted a warning will
  # be printed and nil will be returned.
  #
  # If +force_transcode+ is true the document will be transcoded and any
  # unknown character in the target encoding will be replaced with '?'
  #
  # Returns nil when +filename+ is a directory or does not exist, or when the
  # file names an encoding Ruby does not know.

  def self.read_file filename, encoding, force_transcode = false
    content = File.binread filename
    content.gsub!("\r\n", "\n") if RUBY_PLATFORM =~ /mswin|mingw/

    # Strip a UTF-8 byte order mark.  Compared as raw bytes so the check does
    # not depend on the source encoding of this file.
    bom = "\xEF\xBB\xBF".b
    utf8 = content.start_with? bom
    content = content.byteslice(bom.bytesize..-1) if utf8

    enc = RDoc::Encoding.detect_encoding content
    content = RDoc::Encoding.change_encoding content, enc if enc

    begin
      encoding ||= Encoding.default_external
      orig_encoding = content.encoding

      if not orig_encoding.ascii_compatible? then
        content = content.encode encoding
      elsif utf8 then
        content = RDoc::Encoding.change_encoding content, Encoding::UTF_8
        content = content.encode encoding
      else
        # assume the content is in our output encoding
        content = RDoc::Encoding.change_encoding content, encoding
      end

      unless content.valid_encoding? then
        # revert and try to transcode
        content = RDoc::Encoding.change_encoding content, orig_encoding
        content = content.encode encoding
      end

      unless content.valid_encoding? then
        warn "unable to convert #{filename} to #{encoding}, skipping"
        content = nil
      end
    rescue Encoding::InvalidByteSequenceError,
           Encoding::UndefinedConversionError => e
      if force_transcode then
        content = RDoc::Encoding.change_encoding content, orig_encoding
        content = content.encode(encoding,
                                 :invalid => :replace,
                                 :undef   => :replace,
                                 :replace => '?')
        return content
      else
        warn "unable to convert #{e.message} for #{filename}, skipping"
        return nil
      end
    end

    content
  rescue ArgumentError => e
    # Only the "unknown encoding name" error is handled here; any other
    # ArgumentError is re-raised.
    unknown = e.message[/unknown encoding name - (.*)/, 1]
    raise unless unknown
    warn "unknown encoding name \"#{unknown}\" for #{filename}, skipping"
    nil
  rescue Errno::EISDIR, Errno::ENOENT
    nil
  end

  ##
  # Returns +string+ without its frozen_string_literal magic comment when
  # that comment is the first line, or the first line after a shebang.  A
  # shebang line is kept.  Otherwise +string+ is returned unchanged.

  def self.remove_frozen_string_literal string
    string.sub(/\A((?:#!.*\n)?)# +frozen[-_]string[-_]literal[=:].+\n/i, '\1')
  end

  ##
  # Detects the encoding of +string+ based on the magic comment.
  #
  # Returns nil when no encoding is named.  Encoding.find raises
  # ArgumentError when the named encoding is unknown.

  def self.detect_encoding string
    result = HEADER_REGEXP.match string
    name = result && result[:name]

    name ? Encoding.find(name) : nil
  end

  ##
  # Removes magic comments and shebang.
  #
  # The matched header lines are blanked rather than deleted: their newlines
  # are kept, so the number of lines in +string+ does not change.

  def self.remove_magic_comment string
    string.sub HEADER_REGEXP do |s|
      s.gsub(/[^\n]/, '')
    end
  end

  ##
  # Changes encoding based on +encoding+ without converting and returns new
  # string

  def self.change_encoding text, encoding
    if text.kind_of? RDoc::Comment
      text.encode! encoding
    elsif RUBY_2_3_OR_NEWER
      String.new text, encoding: encoding
    else
      # TODO: Remove this condition after Ruby 2.2 EOL
      text.force_encoding encoding
    end
  end

end
