summary | refs | log | tree | commit | diff
path: root/lib/xsd/charset.rb
diff options
context:
space:
mode:
Diffstat (limited to 'lib/xsd/charset.rb')
-rw-r--r-- lib/xsd/charset.rb 76
1 files changed, 46 insertions, 30 deletions
diff --git a/lib/xsd/charset.rb b/lib/xsd/charset.rb
index ccd22a7744..15d5500fce 100644
--- a/lib/xsd/charset.rb
+++ b/lib/xsd/charset.rb
@@ -1,5 +1,5 @@
# XSD4R - Charset handling library.
-# Copyright (C) 2001, 2003 NAKAMURA, Hiroshi <nahi@ruby-lang.org>.
+# Copyright (C) 2001, 2003, 2005 NAKAMURA, Hiroshi <nahi@ruby-lang.org>.
# This program is copyrighted free software by NAKAMURA, Hiroshi. You can
# redistribute it and/or modify it under the same terms of Ruby's license;
@@ -10,7 +10,7 @@ module XSD
module Charset
- @encoding = $KCODE
+ @internal_encoding = $KCODE
class XSDError < StandardError; end
class CharsetError < XSDError; end
@@ -24,27 +24,40 @@ public
#
EncodingConvertMap = {}
def Charset.init
+ EncodingConvertMap[['UTF8', 'X_ISO8859_1']] =
+ Proc.new { |str| str.unpack('U*').pack('C*') }
+ EncodingConvertMap[['X_ISO8859_1', 'UTF8']] =
+ Proc.new { |str| str.unpack('C*').pack('U*') }
begin
require 'xsd/iconvcharset'
- @encoding = 'UTF8'
- sjtag = (/(mswin|bccwin|mingw|cygwin|emx)/ =~ RUBY_PLATFORM) ? 'cp932' : 'shift_jis'
- EncodingConvertMap[['UTF8', 'EUC' ]] = Proc.new { |str| IconvCharset.safe_iconv("euc-jp", "utf-8", str) }
- EncodingConvertMap[['EUC' , 'UTF8']] = Proc.new { |str| IconvCharset.safe_iconv("utf-8", "euc-jp", str) }
- EncodingConvertMap[['EUC' , 'SJIS']] = Proc.new { |str| IconvCharset.safe_iconv(sjtag, "euc-jp", str) }
- EncodingConvertMap[['UTF8', 'SJIS']] = Proc.new { |str| IconvCharset.safe_iconv(sjtag, "utf-8", str) }
- EncodingConvertMap[['SJIS', 'UTF8']] = Proc.new { |str| IconvCharset.safe_iconv("utf-8", sjtag, str) }
- EncodingConvertMap[['SJIS', 'EUC' ]] = Proc.new { |str| IconvCharset.safe_iconv("euc-jp", sjtag, str) }
+ @internal_encoding = 'UTF8'
+ sjtag = (/(mswin|bccwin|mingw|cygwin|emx)/ =~ RUBY_PLATFORM) ? 'cp932' :
+ 'shift_jis'
+ EncodingConvertMap[['UTF8', 'EUC' ]] =
+ Proc.new { |str| IconvCharset.safe_iconv("euc-jp", "utf-8", str) }
+ EncodingConvertMap[['EUC' , 'UTF8']] =
+ Proc.new { |str| IconvCharset.safe_iconv("utf-8", "euc-jp", str) }
+ EncodingConvertMap[['EUC' , 'SJIS']] =
+ Proc.new { |str| IconvCharset.safe_iconv(sjtag, "euc-jp", str) }
+ EncodingConvertMap[['UTF8', 'SJIS']] =
+ Proc.new { |str| IconvCharset.safe_iconv(sjtag, "utf-8", str) }
+ EncodingConvertMap[['SJIS', 'UTF8']] =
+ Proc.new { |str| IconvCharset.safe_iconv("utf-8", sjtag, str) }
+ EncodingConvertMap[['SJIS', 'EUC' ]] =
+ Proc.new { |str| IconvCharset.safe_iconv("euc-jp", sjtag, str) }
rescue LoadError
begin
require 'nkf'
- EncodingConvertMap[['EUC' , 'SJIS']] = Proc.new { |str| NKF.nkf('-sXm0', str) }
- EncodingConvertMap[['SJIS', 'EUC' ]] = Proc.new { |str| NKF.nkf('-eXm0', str) }
+ EncodingConvertMap[['EUC' , 'SJIS']] =
+ Proc.new { |str| NKF.nkf('-sXm0', str) }
+ EncodingConvertMap[['SJIS', 'EUC' ]] =
+ Proc.new { |str| NKF.nkf('-eXm0', str) }
rescue LoadError
end
begin
require 'uconv'
- @encoding = 'UTF8'
+ @internal_encoding = 'UTF8'
EncodingConvertMap[['UTF8', 'EUC' ]] = Uconv.method(:u8toeuc)
EncodingConvertMap[['UTF8', 'SJIS']] = Uconv.method(:u8tosjis)
EncodingConvertMap[['EUC' , 'UTF8']] = Uconv.method(:euctou8)
@@ -60,6 +73,8 @@ public
'EUC' => 'euc-jp',
'SJIS' => 'shift_jis',
'UTF8' => 'utf-8',
+ 'X_ISO_8859_1' => 'iso-8859-1',
+ 'X_UNKNOWN' => nil,
}
@@ -67,24 +82,24 @@ public
## handlers
#
def Charset.encoding
- @encoding
+ @internal_encoding
end
def Charset.encoding=(encoding)
warn("xsd charset is set to #{encoding}") if $DEBUG
- @encoding = encoding
+ @internal_encoding = encoding
end
- def Charset.encoding_label
- charset_label(@encoding)
+ def Charset.xml_encoding_label
+ charset_label(@internal_encoding)
end
def Charset.encoding_to_xml(str, charset)
- encoding_conv(str, @encoding, charset_str(charset))
+ encoding_conv(str, @internal_encoding, charset_str(charset))
end
def Charset.encoding_from_xml(str, charset)
- encoding_conv(str, charset_str(charset), @encoding)
+ encoding_conv(str, charset_str(charset), @internal_encoding)
end
def Charset.encoding_conv(str, enc_from, enc_to)
@@ -94,7 +109,7 @@ public
converter.call(str)
else
raise CharsetConversionError.new(
- "Converter not found: #{ enc_from } -> #{ enc_to }")
+ "Converter not found: #{enc_from} -> #{enc_to}")
end
end
@@ -104,26 +119,26 @@ public
def Charset.charset_str(label)
if CharsetMap.respond_to?(:key)
- CharsetMap.key(label.downcase)
+ CharsetMap.key(label.downcase) || 'X_UNKNOWN'
else
- CharsetMap.index(label.downcase)
+ CharsetMap.index(label.downcase) || 'X_UNKNOWN'
end
end
# us_ascii = '[\x00-\x7F]'
us_ascii = '[\x9\xa\xd\x20-\x7F]' # XML 1.0 restricted.
- USASCIIRegexp = Regexp.new("\\A#{ us_ascii }*\\z", nil, "NONE")
+ USASCIIRegexp = Regexp.new("\\A#{us_ascii}*\\z", nil, "NONE")
twobytes_euc = '(?:[\x8E\xA1-\xFE][\xA1-\xFE])'
threebytes_euc = '(?:\x8F[\xA1-\xFE][\xA1-\xFE])'
- character_euc = "(?:#{ us_ascii }|#{ twobytes_euc }|#{ threebytes_euc })"
- EUCRegexp = Regexp.new("\\A#{ character_euc }*\\z", nil, "NONE")
+ character_euc = "(?:#{us_ascii}|#{twobytes_euc}|#{threebytes_euc})"
+ EUCRegexp = Regexp.new("\\A#{character_euc}*\\z", nil, "NONE")
# onebyte_sjis = '[\x00-\x7F\xA1-\xDF]'
onebyte_sjis = '[\x9\xa\xd\x20-\x7F\xA1-\xDF]' # XML 1.0 restricted.
twobytes_sjis = '(?:[\x81-\x9F\xE0-\xFC][\x40-\x7E\x80-\xFC])'
- character_sjis = "(?:#{ onebyte_sjis }|#{ twobytes_sjis })"
- SJISRegexp = Regexp.new("\\A#{ character_sjis }*\\z", nil, "NONE")
+ character_sjis = "(?:#{onebyte_sjis}|#{twobytes_sjis})"
+ SJISRegexp = Regexp.new("\\A#{character_sjis}*\\z", nil, "NONE")
# 0xxxxxxx
# 110yyyyy 10xxxxxx
@@ -132,8 +147,9 @@ public
threebytes_utf8 = '(?:[\xE0-\xEF][\x80-\xBF][\x80-\xBF])'
# 11110uuu 10uuuzzz 10yyyyyy 10xxxxxx
fourbytes_utf8 = '(?:[\xF0-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF])'
- character_utf8 = "(?:#{ us_ascii }|#{ twobytes_utf8 }|#{ threebytes_utf8 }|#{ fourbytes_utf8 })"
- UTF8Regexp = Regexp.new("\\A#{ character_utf8 }*\\z", nil, "NONE")
+ character_utf8 =
+ "(?:#{us_ascii}|#{twobytes_utf8}|#{threebytes_utf8}|#{fourbytes_utf8})"
+ UTF8Regexp = Regexp.new("\\A#{character_utf8}*\\z", nil, "NONE")
def Charset.is_us_ascii(str)
USASCIIRegexp =~ str
@@ -162,7 +178,7 @@ public
when 'SJIS'
is_sjis(str)
else
- raise UnknownCharsetError.new("Unknown charset: #{ code }")
+ raise UnknownCharsetError.new("Unknown charset: #{code}")
end
end
end