From eb8ee5e401cf49a31d4dc5ba2c8e74f379bb2408 Mon Sep 17 00:00:00 2001 From: nahi Date: Sun, 28 Sep 2003 09:33:59 +0000 Subject: * forgot to add this file in the previous commit. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@4615 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- lib/xsd/charset.rb | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'lib') diff --git a/lib/xsd/charset.rb b/lib/xsd/charset.rb index 6dda959155..88d4f99043 100644 --- a/lib/xsd/charset.rb +++ b/lib/xsd/charset.rb @@ -117,12 +117,13 @@ public CharsetMap.index(label.downcase) end - # Original regexps: http://www.din.or.jp/~ohzaki/perl.htm - # ascii_euc = '[\x00-\x7F]' - ascii_euc = '[\x9\xa\xd\x20-\x7F]' # XML 1.0 restricted. + # us_ascii = '[\x00-\x7F]' + us_ascii = '[\x9\xa\xd\x20-\x7F]' # XML 1.0 restricted. + USASCIIRegexp = Regexp.new("\\A#{ us_ascii }*\\z", nil, "NONE") + twobytes_euc = '(?:[\x8E\xA1-\xFE][\xA1-\xFE])' threebytes_euc = '(?:\x8F[\xA1-\xFE][\xA1-\xFE])' - character_euc = "(?:#{ ascii_euc }|#{ twobytes_euc }|#{ threebytes_euc })" + character_euc = "(?:#{ us_ascii }|#{ twobytes_euc }|#{ threebytes_euc })" EUCRegexp = Regexp.new("\\A#{ character_euc }*\\z", nil, "NONE") # onebyte_sjis = '[\x00-\x7F\xA1-\xDF]' @@ -132,17 +133,19 @@ public SJISRegexp = Regexp.new("\\A#{ character_sjis }*\\z", nil, "NONE") # 0xxxxxxx - #ascii_utf8 = '[\0-\x7F]' - ascii_utf8 = '[\x9\xA\xD\x20-\x7F]' # XML 1.0 restricted. # 110yyyyy 10xxxxxx twobytes_utf8 = '(?:[\xC0-\xDF][\x80-\xBF])' # 1110zzzz 10yyyyyy 10xxxxxx threebytes_utf8 = '(?:[\xE0-\xEF][\x80-\xBF][\x80-\xBF])' # 11110uuu 10uuuzzz 10yyyyyy 10xxxxxx fourbytes_utf8 = '(?:[\xF0-\xF7][\x80-\xBF][\x80-\xBF][\x80-\xBF])' - character_utf8 = "(?:#{ ascii_utf8 }|#{ twobytes_utf8 }|#{ threebytes_utf8 }|#{ fourbytes_utf8 })" + character_utf8 = "(?:#{ us_ascii }|#{ twobytes_utf8 }|#{ threebytes_utf8 }|#{ fourbytes_utf8 })" UTF8Regexp = Regexp.new("\\A#{ character_utf8 }*\\z", nil, "NONE") + def Charset.is_us_ascii(str) + USASCIIRegexp =~ str + end + def Charset.is_utf8(str) UTF8Regexp =~ str end @@ -158,7 +161,7 @@ public def Charset.is_ces(str, code = $KCODE) case code when 'NONE' - true + is_us_ascii(str) when 'UTF8' is_utf8(str) when 'EUC' -- cgit v1.2.3