summary | refs | log | tree | commit | diff
path: root/enc
diff options
context:
space:
mode:
Diffstat (limited to 'enc')
-rw-r--r--  enc/Makefile.in                    2
-rw-r--r--  enc/ascii.c                        6
-rw-r--r--  enc/encdb.c                        2
-rw-r--r--  enc/jis/props.h.blt                4
-rw-r--r--  enc/jis/props.kwd                  2
-rw-r--r--  enc/jis/props.src                  2
-rw-r--r--  enc/trans/newline.trans           20
-rw-r--r--  enc/unicode/14.0.0/name2ctype.h    6
-rw-r--r--  enc/unicode/case-folding.rb      418
-rw-r--r--  enc/utf_16_32.h                    2
10 files changed, 32 insertions, 432 deletions
diff --git a/enc/Makefile.in b/enc/Makefile.in
index 5e5d39cd76..dd8ca1b528 100644
--- a/enc/Makefile.in
+++ b/enc/Makefile.in
@@ -22,6 +22,7 @@ TRANSSODIR = $(ENCSODIR)/trans
DLEXT = @DLEXT@
OBJEXT = @OBJEXT@
LIBEXT = @LIBEXT@
+EXEEXT = @EXEEXT@
TIMESTAMPDIR = $(EXTOUT)/.timestamp
ENC_TRANS_D = $(TIMESTAMPDIR)/.enc-trans.time
ENC_TRANS_SO_D = $(TIMESTAMPDIR)/.enc-trans.so.time
@@ -35,6 +36,7 @@ RUBY_SO_NAME = @RUBY_SO_NAME@
LIBRUBY = @LIBRUBY@
LIBRUBYARG_SHARED = @LIBRUBYARG_SHARED@
LIBRUBYARG_STATIC = $(LIBRUBYARG_SHARED)
+BUILTRUBY = $(topdir)/miniruby$(EXEEXT)
empty =
AR = @AR@
diff --git a/enc/ascii.c b/enc/ascii.c
index a2fef2f879..ae7db97f25 100644
--- a/enc/ascii.c
+++ b/enc/ascii.c
@@ -33,8 +33,8 @@
# include "encindex.h"
#endif
-#ifndef ENCINDEX_ASCII
-# define ENCINDEX_ASCII 0
+#ifndef ENCINDEX_ASCII_8BIT
+# define ENCINDEX_ASCII_8BIT 0
#endif
OnigEncodingDefine(ascii, ASCII) = {
@@ -55,7 +55,7 @@ OnigEncodingDefine(ascii, ASCII) = {
onigenc_single_byte_left_adjust_char_head,
onigenc_always_true_is_allowed_reverse_match,
onigenc_single_byte_ascii_only_case_map,
- ENCINDEX_ASCII,
+ ENCINDEX_ASCII_8BIT,
ONIGENC_FLAG_NONE,
};
ENC_ALIAS("BINARY", "ASCII-8BIT")
diff --git a/enc/encdb.c b/enc/encdb.c
index a1936df804..8247e9ff6a 100644
--- a/enc/encdb.c
+++ b/enc/encdb.c
@@ -17,7 +17,7 @@
#define ENC_DEFINE(name) rb_encdb_declare(name)
#define ENC_SET_BASE(name, orig) rb_enc_set_base((name), (orig))
#define ENC_SET_DUMMY(name, orig) rb_enc_set_dummy(name)
-#define ENC_DUMMY_UNICODE(name) rb_encdb_set_unicode(rb_enc_set_dummy(ENC_REPLICATE((name), name "BE")))
+#define ENC_DUMMY_UNICODE(name) ENC_DUMMY(name)
void
Init_encdb(void)
diff --git a/enc/jis/props.h.blt b/enc/jis/props.h.blt
index 54aa94f8bc..508a084449 100644
--- a/enc/jis/props.h.blt
+++ b/enc/jis/props.h.blt
@@ -69,7 +69,7 @@ struct enc_property {
unsigned char ctype;
};
-static const struct enc_property *onig_jis_property(/*const char *str, unsigned int len*/);
+static const struct enc_property *onig_jis_property(register const char *str, register size_t len);
#line 43 "enc/jis/props.kwd"
struct enc_property;
@@ -82,7 +82,7 @@ struct enc_property;
#ifndef GPERF_DOWNCASE
#define GPERF_DOWNCASE 1
-static unsigned char gperf_downcase[256] =
+static const unsigned char gperf_downcase[256] =
{
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
diff --git a/enc/jis/props.kwd b/enc/jis/props.kwd
index 659cf0aff4..9606828459 100644
--- a/enc/jis/props.kwd
+++ b/enc/jis/props.kwd
@@ -37,7 +37,7 @@ struct enc_property {
unsigned char ctype;
};
-static const struct enc_property *onig_jis_property(/*!ANSI{*/const char *str, unsigned int len/*}!ANSI*/);
+static const struct enc_property *onig_jis_property(register const char *str, register size_t len);
%}
struct enc_property;
diff --git a/enc/jis/props.src b/enc/jis/props.src
index 659cf0aff4..9606828459 100644
--- a/enc/jis/props.src
+++ b/enc/jis/props.src
@@ -37,7 +37,7 @@ struct enc_property {
unsigned char ctype;
};
-static const struct enc_property *onig_jis_property(/*!ANSI{*/const char *str, unsigned int len/*}!ANSI*/);
+static const struct enc_property *onig_jis_property(register const char *str, register size_t len);
%}
struct enc_property;
diff --git a/enc/trans/newline.trans b/enc/trans/newline.trans
index 9e763407f9..95e082f5bd 100644
--- a/enc/trans/newline.trans
+++ b/enc/trans/newline.trans
@@ -17,10 +17,16 @@
map_cr["0a"] = "0d"
transcode_generate_node(ActionMap.parse(map_cr), "cr_newline")
+
+ map_normalize = {}
+ map_normalize["{00-ff}"] = :func_so
+
+ transcode_generate_node(ActionMap.parse(map_normalize), "lf_newline")
%>
<%= transcode_generated_code %>
+#define lf_newline universal_newline
#define STATE (sp[0])
#define NORMAL 0
#define JUST_AFTER_CR 1
@@ -126,10 +132,24 @@ rb_cr_newline = {
0, 0, 0, 0
};
+static const rb_transcoder
+rb_lf_newline = {
+ "", "lf_newline", lf_newline,
+ TRANSCODE_TABLE_INFO,
+ 1, /* input_unit_length */
+ 1, /* max_input */
+ 2, /* max_output */
+ asciicompat_converter, /* asciicompat_type */
+ 2, universal_newline_init, universal_newline_init, /* state_size, state_init, state_fini */
+ 0, 0, 0, fun_so_universal_newline,
+ universal_newline_finish
+};
+
void
Init_newline(void)
{
rb_register_transcoder(&rb_universal_newline);
rb_register_transcoder(&rb_crlf_newline);
rb_register_transcoder(&rb_cr_newline);
+ rb_register_transcoder(&rb_lf_newline);
}
diff --git a/enc/unicode/14.0.0/name2ctype.h b/enc/unicode/14.0.0/name2ctype.h
index 99a3eeca19..61c16bafc2 100644
--- a/enc/unicode/14.0.0/name2ctype.h
+++ b/enc/unicode/14.0.0/name2ctype.h
@@ -40642,11 +40642,7 @@ struct uniname2ctype_struct {
};
#define uniname2ctype_offset(str) offsetof(struct uniname2ctype_pool_t, uniname2ctype_pool_##str)
-static const struct uniname2ctype_struct *uniname2ctype_p(
-#if !(1+0) /* if ANSI, old style not to conflict with generated prototype */
- const char *, unsigned int
-#endif
-);
+static const struct uniname2ctype_struct *uniname2ctype_p(register const char *str, register size_t len);
#ifndef USE_UNICODE_PROPERTIES
#define TOTAL_KEYWORDS 15
diff --git a/enc/unicode/case-folding.rb b/enc/unicode/case-folding.rb
deleted file mode 100644
index 4a29fdebf7..0000000000
--- a/enc/unicode/case-folding.rb
+++ /dev/null
@@ -1,418 +0,0 @@
-#!/usr/bin/ruby
-require 'stringio'
-
-# Usage (for case folding only):
-# $ wget http://www.unicode.org/Public/UNIDATA/CaseFolding.txt
-# $ ruby case-folding.rb CaseFolding.txt -o casefold.h
-# or (for case folding and case mapping):
-# $ wget http://www.unicode.org/Public/UNIDATA/CaseFolding.txt
-# $ wget http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
-# $ wget http://www.unicode.org/Public/UNIDATA/SpecialCasing.txt
-# $ ruby case-folding.rb -m . -o casefold.h
-# using -d or --debug will include UTF-8 characters in comments for debugging
-
-class CaseFolding
- module Util
- module_function
-
- def hex_seq(v)
- v.map { |i| "0x%04x" % i }.join(", ")
- end
-
- def print_table_1(dest, type, mapping_data, data)
- for k, v in data = data.sort
- sk = (Array === k and k.length > 1) ? "{#{hex_seq(k)}}" : ("0x%04x" % k)
- if type=='CaseUnfold_11' and v.length>1
- # reorder CaseUnfold_11 entries to avoid special treatment for U+03B9/U+03BC/U+A64B
- item = mapping_data.map("%04X" % k[0])
- upper = item.upper if item
- v = v.sort_by { |i| ("%04X"%i) == upper ? 0 : 1 }
- end
- ck = @debug ? ' /* ' + Array(k).pack("U*") + ' */' : ''
- cv = @debug ? ' /* ' + Array(v).map{|c|[c].pack("U*")}.join(", ") + ' */' : ''
- dest.print(" {#{sk}#{ck}, {#{v.length}#{mapping_data.flags(k, type, v)}, {#{hex_seq(v)}#{cv}}}},\n")
- end
- data
- end
-
- def print_table(dest, type, mapping_data, data)
- dest.print("static const #{type}_Type #{type}_Table[] = {\n")
- i = 0
- ret = data.inject([]) do |a, (n, d)|
- dest.print("#define #{n} (*(#{type}_Type (*)[#{d.size}])(#{type}_Table+#{i}))\n")
- i += d.size
- a.concat(print_table_1(dest, type, mapping_data, d))
- end
- dest.print("};\n\n")
- ret
- end
- end
-
- include Util
-
- attr_reader :fold, :fold_locale, :unfold, :unfold_locale, :version
-
- def load(filename)
- pattern = /([0-9A-F]{4,6}); ([CFT]); ([0-9A-F]{4,6})(?: ([0-9A-F]{4,6}))?(?: ([0-9A-F]{4,6}))?;/
-
- @fold = fold = {}
- @unfold = unfold = [{}, {}, {}]
- @debug = false
- @version = nil
- turkic = []
-
- IO.foreach(filename, mode: "rb") do |line|
- @version ||= line[/-([0-9.]+).txt/, 1]
- next unless res = pattern.match(line)
- ch_from = res[1].to_i(16)
-
- if res[2] == 'T'
- # Turkic case folding
- turkic << ch_from
- next
- end
-
- # store folding data
- ch_to = res[3..6].inject([]) do |a, i|
- break a unless i
- a << i.to_i(16)
- end
- fold[ch_from] = ch_to
-
- # store unfolding data
- i = ch_to.length - 1
- (unfold[i][ch_to] ||= []) << ch_from
- end
-
- # move locale dependent data to (un)fold_locale
- @fold_locale = fold_locale = {}
- @unfold_locale = unfold_locale = [{}, {}]
- for ch_from in turkic
- key = fold[ch_from]
- i = key.length - 1
- unfold_locale[i][i == 0 ? key[0] : key] = unfold[i].delete(key)
- fold_locale[ch_from] = fold.delete(ch_from)
- end
- self
- end
-
- def range_check(code)
- "#{code} <= MAX_CODE_VALUE && #{code} >= MIN_CODE_VALUE"
- end
-
- def lookup_hash(key, type, data)
- hash = "onigenc_unicode_#{key}_hash"
- lookup = "onigenc_unicode_#{key}_lookup"
- arity = Array(data[0][0]).size
- gperf = %W"gperf -7 -k#{[*1..(arity*3)].join(',')} -F,-1 -c -j1 -i1 -t -T -E -C -H #{hash} -N #{lookup} -n"
- argname = arity > 1 ? "codes" : "code"
- argdecl = "const OnigCodePoint #{arity > 1 ? "*": ""}#{argname}"
- n = 7
- m = (1 << n) - 1
- min, max = data.map {|c, *|c}.flatten.minmax
- src = IO.popen(gperf, "r+") {|f|
- f << "short\n%%\n"
- data.each_with_index {|(k, _), i|
- k = Array(k)
- ks = k.map {|j| [(j >> n*2) & m, (j >> n) & m, (j) & m]}.flatten.map {|c| "\\x%.2x" % c}.join("")
- f.printf "\"%s\", ::::/*%s*/ %d\n", ks, k.map {|c| "0x%.4x" % c}.join(","), i
- }
- f << "%%\n"
- f.close_write
- f.read
- }
- src.sub!(/^(#{hash})\s*\(.*?\).*?\n\{\n(.*)^\}/m) {
- name = $1
- body = $2
- body.gsub!(/\(unsigned char\)str\[(\d+)\]/, "bits_#{arity > 1 ? 'at' : 'of'}(#{argname}, \\1)")
- "#{name}(#{argdecl})\n{\n#{body}}"
- }
- src.sub!(/const short *\*\n^(#{lookup})\s*\(.*?\).*?\n\{\n(.*)^\}/m) {
- name = $1
- body = $2
- body.sub!(/\benum\s+\{(\n[ \t]+)/, "\\&MIN_CODE_VALUE = 0x#{min.to_s(16)},\\1""MAX_CODE_VALUE = 0x#{max.to_s(16)},\\1")
- body.gsub!(/(#{hash})\s*\(.*?\)/, "\\1(#{argname})")
- body.gsub!(/\{"",-1}/, "-1")
- body.gsub!(/\{"(?:[^"]|\\")+", *::::(.*)\}/, '\1')
- body.sub!(/(\s+if\s)\(len\b.*\)/) do
- "#$1(" <<
- (arity > 1 ? (0...arity).map {|i| range_check("#{argname}[#{i}]")}.join(" &&\n ") : range_check(argname)) <<
- ")"
- end
- v = nil
- body.sub!(/(if\s*\(.*MAX_HASH_VALUE.*\)\n([ \t]*))\{(.*?)\n\2\}/m) {
- pre = $1
- indent = $2
- s = $3
- s.sub!(/const char *\* *(\w+)( *= *wordlist\[\w+\]).\w+/, 'short \1 = wordlist[key]')
- v = $1
- s.sub!(/\bif *\(.*\)/, "if (#{v} >= 0 && code#{arity}_equal(#{argname}, #{key}_Table[#{v}].from))")
- "#{pre}{#{s}\n#{indent}}"
- }
- body.sub!(/\b(return\s+&)([^;]+);/, '\1'"#{key}_Table[#{v}].to;")
- "static const #{type} *\n#{name}(#{argdecl})\n{\n#{body}}"
- }
- src
- end
-
- def display(dest, mapping_data)
- # print the header
- dest.print("/* DO NOT EDIT THIS FILE. */\n")
- dest.print("/* Generated by enc/unicode/case-folding.rb */\n\n")
-
- versions = version.scan(/\d+/)
- dest.print("#if defined ONIG_UNICODE_VERSION_STRING && !( \\\n")
- %w[MAJOR MINOR TEENY].zip(versions) do |n, v|
- dest.print(" ONIG_UNICODE_VERSION_#{n} == #{v} && \\\n")
- end
- dest.print(" 1)\n")
- dest.print("# error ONIG_UNICODE_VERSION_STRING mismatch\n")
- dest.print("#endif\n")
- dest.print("#define ONIG_UNICODE_VERSION_STRING #{version.dump}\n")
- %w[MAJOR MINOR TEENY].zip(versions) do |n, v|
- dest.print("#define ONIG_UNICODE_VERSION_#{n} #{v}\n")
- end
- dest.print("\n")
-
- # print folding data
-
- # CaseFold + CaseFold_Locale
- name = "CaseFold_11"
- data = print_table(dest, name, mapping_data, "CaseFold"=>fold, "CaseFold_Locale"=>fold_locale)
- dest.print lookup_hash(name, "CodePointList3", data)
-
- # print unfolding data
-
- # CaseUnfold_11 + CaseUnfold_11_Locale
- name = "CaseUnfold_11"
- data = print_table(dest, name, mapping_data, name=>unfold[0], "#{name}_Locale"=>unfold_locale[0])
- dest.print lookup_hash(name, "CodePointList3", data)
-
- # CaseUnfold_12 + CaseUnfold_12_Locale
- name = "CaseUnfold_12"
- data = print_table(dest, name, mapping_data, name=>unfold[1], "#{name}_Locale"=>unfold_locale[1])
- dest.print lookup_hash(name, "CodePointList2", data)
-
- # CaseUnfold_13
- name = "CaseUnfold_13"
- data = print_table(dest, name, mapping_data, name=>unfold[2])
- dest.print lookup_hash(name, "CodePointList2", data)
-
- # TitleCase
- dest.print mapping_data.specials_output
- end
-
- def debug!
- @debug = true
- end
-
- def self.load(*args)
- new.load(*args)
- end
-end
-
-class MapItem
- attr_accessor :upper, :lower, :title, :code
-
- def initialize(code, upper, lower, title)
- @code = code
- @upper = upper unless upper == ''
- @lower = lower unless lower == ''
- @title = title unless title == ''
- end
-end
-
-class CaseMapping
- attr_reader :filename, :version
-
- def initialize(mapping_directory)
- @mappings = {}
- @specials = []
- @specials_length = 0
- @version = nil
- IO.foreach(File.join(mapping_directory, 'UnicodeData.txt'), mode: "rb") do |line|
- next if line =~ /^</
- code, _, _, _, _, _, _, _, _, _, _, _, upper, lower, title = line.chomp.split ';'
- unless upper and lower and title and (upper+lower+title)==''
- @mappings[code] = MapItem.new(code, upper, lower, title)
- end
- end
-
- @filename = File.join(mapping_directory, 'SpecialCasing.txt')
- IO.foreach(@filename, mode: "rb") do |line|
- @version ||= line[/-([0-9.]+).txt/, 1]
- line.chomp!
- line, comment = line.split(/ *#/)
- next if not line or line == ''
- code, lower, title, upper, conditions = line.split(/ *; */)
- unless conditions
- item = @mappings[code]
- item.lower = lower
- item.title = title
- item.upper = upper
- end
- end
- end
-
- def map (from)
- @mappings[from]
- end
-
- def flags(from, type, to)
- # types: CaseFold_11, CaseUnfold_11, CaseUnfold_12, CaseUnfold_13
- flags = ""
- from = Array(from).map {|i| "%04X" % i}.join(" ")
- to = Array(to).map {|i| "%04X" % i}.join(" ")
- item = map(from)
- specials = []
- case type
- when 'CaseFold_11'
- flags += '|F'
- if item
- flags += '|U' if to==item.upper
- flags += '|D' if to==item.lower
- unless item.upper == item.title
- if item.code == item.title
- flags += '|IT'
- swap = case item.code
- when '01C5' then '0064 017D'
- when '01C8' then '006C 004A'
- when '01CB' then '006E 004A'
- when '01F2' then '0064 005A'
- else # Greek
- to.split(' ').first + ' 0399'
- end
- specials << swap
- else
- flags += '|ST'
- specials << item.title
- end
- end
- unless item.lower.nil? or item.lower==from or item.lower==to
- specials << item.lower
- flags += '|SL'
- end
- unless item.upper.nil? or item.upper==from or item.upper==to
- specials << item.upper
- flags += '|SU'
- end
- end
- when 'CaseUnfold_11'
- to = to.split(/ /)
- if item
- case to.first
- when item.upper then flags += '|U'
- when item.lower then flags += '|D'
- else
- raise "Unpredicted case 0 in enc/unicode/case_folding.rb. Please contact https://bugs.ruby-lang.org/."
- end
- unless item.upper == item.title
- if item.code == item.title
- flags += '|IT' # was unpredicted case 1
- elsif item.title==to[1]
- flags += '|ST'
- else
- raise "Unpredicted case 2 in enc/unicode/case_folding.rb. Please contact https://bugs.ruby-lang.org/."
- end
- end
- end
- end
- unless specials.empty?
- flags += "|I(#{@specials_length})"
- @specials_length += specials.map { |s| s.split(/ /).length }.reduce(:+)
- @specials << specials
- end
- flags
- end
-
- def debug!
- @debug = true
- end
-
- def specials_output
- "static const OnigCodePoint CaseMappingSpecials[] = {\n" +
- @specials.map do |sps|
- ' ' + sps.map do |sp|
- chars = sp.split(/ /)
- ct = ' /* ' + Array(chars).map{|c|[c.to_i(16)].pack("U*")}.join(", ") + ' */' if @debug
- " L(#{chars.length})|#{chars.map {|c| "0x"+c }.join(', ')}#{ct},"
- end.join + "\n"
- end.join + "};\n"
- end
-
- def self.load(*args)
- new(*args)
- end
-end
-
-class CaseMappingDummy
- def flags(from, type, to)
- ""
- end
-
- def titlecase_output() '' end
- def debug!() end
-end
-
-if $0 == __FILE__
- require 'optparse'
- dest = nil
- mapping_directory = nil
- mapping_data = nil
- debug = false
- fold_1 = false
- ARGV.options do |opt|
- opt.banner << " [INPUT]"
- opt.on("--output-file=FILE", "-o", "output to the FILE instead of STDOUT") {|output|
- dest = (output unless output == '-')
- }
- opt.on('--mapping-data-directory=DIRECTORY', '-m', 'data DIRECTORY of mapping files') { |directory|
- mapping_directory = directory
- }
- opt.on('--debug', '-d') {
- debug = true
- }
- opt.parse!
- abort(opt.to_s) if ARGV.size > 1
- end
- if mapping_directory
- if ARGV[0]
- warn "Either specify directory or individual file, but not both."
- exit
- end
- filename = File.join(mapping_directory, 'CaseFolding.txt')
- mapping_data = CaseMapping.load(mapping_directory)
- end
- filename ||= ARGV[0] || 'CaseFolding.txt'
- data = CaseFolding.load(filename)
- if mapping_data and data.version != mapping_data.version
- abort "Unicode data version mismatch\n" \
- " #{filename} = #{data.version}\n" \
- " #{mapping_data.filename} = #{mapping_data.version}"
- end
- mapping_data ||= CaseMappingDummy.new
-
- if debug
- data.debug!
- mapping_data.debug!
- end
- f = StringIO.new
- begin
- data.display(f, mapping_data)
- rescue Errno::ENOENT => e
- raise unless /gperf/ =~ e.message
- warn e.message
- abort unless dest
- File.utime(nil, nil, dest) # assume existing file is OK
- exit
- else
- s = f.string
- end
- if dest
- open(dest, "wb") do |file|
- file.print(s)
- end
- else
- STDOUT.print(s)
- end
-end
diff --git a/enc/utf_16_32.h b/enc/utf_16_32.h
index 9f9216d8ff..4d669019bf 100644
--- a/enc/utf_16_32.h
+++ b/enc/utf_16_32.h
@@ -1,5 +1,5 @@
#include "regenc.h"
/* dummy for unsupported, stateful encoding */
-#define ENC_DUMMY_UNICODE(name) ENC_REPLICATE(name, name "BE")
+#define ENC_DUMMY_UNICODE(name) ENC_DUMMY(name)
ENC_DUMMY_UNICODE("UTF-16");
ENC_DUMMY_UNICODE("UTF-32");