summaryrefslogtreecommitdiff
path: root/lib/rdoc/encoding.rb
blob: 67e190f7820e090079fe936a873bd00fc39c1db3 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# coding: US-ASCII
# frozen_string_literal: true

##
# This class is a wrapper around File IO and Encoding that helps RDoc load
# files and convert them to the correct encoding.

module RDoc::Encoding

  HEADER_REGEXP = /^
    (?:
      \A\#!.*\n
      |
      ^\#\s+frozen[-_]string[-_]literal[=:].+\n
      |
      ^\#[^\n]+\b(?:en)?coding[=:]\s*(?<name>[^\s;]+).*\n
      |
      <\?xml[^?]*encoding=(?<quote>["'])(?<name>.*?)\k<quote>.*\n
    )+
  /xi # :nodoc:

  ##
  # Reads the contents of +filename+ and handles any encoding directives in
  # the file.
  #
  # The content will be converted to the +encoding+.  If the file cannot be
  # converted a warning will be printed and nil will be returned.
  #
  # If +force_transcode+ is true the document will be transcoded and any
  # unknown character in the target encoding will be replaced with '?'

  def self.read_file filename, encoding, force_transcode = false
    content = File.open filename, "rb" do |f| f.read end
    content.gsub!("\r\n", "\n") if RUBY_PLATFORM =~ /mswin|mingw/

    utf8 = content.sub!(/\A\xef\xbb\xbf/, '')

    enc = RDoc::Encoding.detect_encoding content
    content = RDoc::Encoding.change_encoding content, enc if enc

    begin
      encoding ||= Encoding.default_external
      orig_encoding = content.encoding

      if not orig_encoding.ascii_compatible? then
        content = content.encode encoding
      elsif utf8 then
        content = RDoc::Encoding.change_encoding content, Encoding::UTF_8
        content = content.encode encoding
      else
        # assume the content is in our output encoding
        content = RDoc::Encoding.change_encoding content, encoding
      end

      unless content.valid_encoding? then
        # revert and try to transcode
        content = RDoc::Encoding.change_encoding content, orig_encoding
        content = content.encode encoding
      end

      unless content.valid_encoding? then
        warn "unable to convert #{filename} to #{encoding}, skipping"
        content = nil
      end
    rescue Encoding::InvalidByteSequenceError,
           Encoding::UndefinedConversionError => e
      if force_transcode then
        content = RDoc::Encoding.change_encoding content, orig_encoding
        content = content.encode(encoding,
                                 :invalid => :replace,
                                 :undef => :replace,
                                 :replace => '?')
        return content
      else
        warn "unable to convert #{e.message} for #{filename}, skipping"
        return nil
      end
    end

    content
  rescue ArgumentError => e
    raise unless e.message =~ /unknown encoding name - (.*)/
    warn "unknown encoding name \"#{$1}\" for #{filename}, skipping"
    nil
  rescue Errno::EISDIR, Errno::ENOENT
    nil
  end

  ##
  # Detects the encoding of +string+ based on the magic comment

  def self.detect_encoding string
    result = HEADER_REGEXP.match string
    name = result && result[:name]

    name ? Encoding.find(name) : nil
  end

  ##
  # Removes magic comments and shebang

  def self.remove_magic_comment string
    string.sub HEADER_REGEXP do |s|
      s.gsub(/[^\n]/, '')
    end
  end

  ##
  # Changes encoding based on +encoding+ without converting and returns new
  # string

  def self.change_encoding text, encoding
    if text.kind_of? RDoc::Comment
      text.encode! encoding
    else
      String.new text, encoding: encoding
    end
  end

end