1 files changed, 136 insertions, 0 deletions
diff --git a/lib/rdoc/encoding.rb b/lib/rdoc/encoding.rb
new file mode 100644
index 0000000000..cf60badd24
--- /dev/null
+++ b/lib/rdoc/encoding.rb
@@ -0,0 +1,136 @@
+# coding: US-ASCII
+# frozen_string_literal: true
+
+##
+# This class is a wrapper around File IO and Encoding that helps RDoc load
+# files and convert them to the correct encoding.
+
+module RDoc::Encoding
+
+  HEADER_REGEXP = /^
+    (?:
+      \A\#!.*\n
+      |
+      ^\#\s+frozen[-_]string[-_]literal[=:].+\n
+      |
+      ^\#[^\n]+\b(?:en)?coding[=:]\s*(?<name>[^\s;]+).*\n
+      |
+      <\?xml[^?]*encoding=(?<quote>["'])(?<name>.*?)\k<quote>.*\n
+    )+
+  /xi # :nodoc:
+
+  ##
+  # Reads the contents of +filename+ and handles any encoding directives in
+  # the file.
+  #
+  # The content will be converted to the +encoding+.  If the file cannot be
+  # converted a warning will be printed and nil will be returned.
+  #
+  # If +force_transcode+ is true the document will be transcoded and any
+  # unknown character in the target encoding will be replaced with '?'
+
+  def self.read_file filename, encoding, force_transcode = false
+    content = File.open filename, "rb" do |f| f.read end
+    content.gsub!("\r\n", "\n") if RUBY_PLATFORM =~ /mswin|mingw/
+
+    utf8 = content.sub!(/\A\xef\xbb\xbf/, '')
+
+    enc = RDoc::Encoding.detect_encoding content
+    content = RDoc::Encoding.change_encoding content, enc if enc
+
+    begin
+      encoding ||= Encoding.default_external
+      orig_encoding = content.encoding
+
+      if not orig_encoding.ascii_compatible? then
+        content = content.encode encoding
+      elsif utf8 then
+        content = RDoc::Encoding.change_encoding content, Encoding::UTF_8
+        content = content.encode encoding
+      else
+        # assume the content is in our output encoding
+        content = RDoc::Encoding.change_encoding content, encoding
+      end
+
+      unless content.valid_encoding? then
+        # revert and try to transcode
+        content = RDoc::Encoding.change_encoding content, orig_encoding
+        content = content.encode encoding
+      end
+
+      unless content.valid_encoding? then
+        warn "unable to convert #{filename} to #{encoding}, skipping"
+        content = nil
+      end
+    rescue Encoding::InvalidByteSequenceError,
+           Encoding::UndefinedConversionError => e
+      if force_transcode then
+        content = RDoc::Encoding.change_encoding content, orig_encoding
+        content = content.encode(encoding,
+                                 :invalid => :replace,
+                                 :undef => :replace,
+                                 :replace => '?')
+        return content
+      else
+        warn "unable to convert #{e.message} for #{filename}, skipping"
+        return nil
+      end
+    end
+
+    content
+  rescue ArgumentError => e
+    raise unless e.message =~ /unknown encoding name - (.*)/
+    warn "unknown encoding name \"#{$1}\" for #{filename}, skipping"
+    nil
+  rescue Errno::EISDIR, Errno::ENOENT
+    nil
+  end
+
+  def self.remove_frozen_string_literal string
+    string =~ /\A(?:#!.*\n)?(.*\n)/
+    first_line = $1
+
+    if first_line =~ /\A# +frozen[-_]string[-_]literal[=:].+$/i
+      string = string.sub first_line, ''
+    end
+
+    string
+  end
+
+  ##
+  # Detects the encoding of +string+ based on the magic comment
+
+  def self.detect_encoding string
+    result = HEADER_REGEXP.match string
+    name = result && result[:name]
+
+    name ? Encoding.find(name) : nil
+  end
+
+  ##
+  # Removes magic comments and shebang
+
+  def self.remove_magic_comment string
+    string.sub HEADER_REGEXP do |s|
+      s.gsub(/[^\n]/, '')
+    end
+  end
+
+  ##
+  # Changes encoding based on +encoding+ without converting and returns new
+  # string
+
+  def self.change_encoding text, encoding
+    if text.kind_of? RDoc::Comment
+      text.encode! encoding
+    else
+      # TODO: Remove this condition after Ruby 2.2 EOL
+      if RUBY_VERSION < '2.3.0'
+        text.force_encoding encoding
+      else
+        String.new text, encoding: encoding
+      end
+    end
+  end
+
+end