summary | refs | log | tree | commit | diff
path: root/lib
diff options
context:
space:
mode:
author Yusuke Endoh <mame@ruby-lang.org> 2020-10-21 13:29:19 +0900
committer aycabta <aycabta@gmail.com> 2020-12-05 02:58:58 +0900
commit 76cac4c05a7be61a94a709b8b850118ad0bfa684 (patch)
tree 3131f03a4c77ab2bccc1a66d4e626a823d096379 /lib
parent b3e0db80606614f11412604f1657a135002326e9 (diff)
[ruby/reline] Improve the performance of `get_mbchar_width`
It is about three times faster to use one big regexp instead of sequential matching.

https://github.com/ruby/reline/commit/e36f6c0707
Diffstat (limited to 'lib')
-rw-r--r-- lib/reline/unicode.rb 40
-rw-r--r-- lib/reline/unicode/east_asian_width.rb 26
2 files changed, 38 insertions, 28 deletions
diff --git a/lib/reline/unicode.rb b/lib/reline/unicode.rb
index cd8c27e85b..df2f6719a4 100644
--- a/lib/reline/unicode.rb
+++ b/lib/reline/unicode.rb
@@ -72,20 +72,32 @@ class Reline::Unicode
}.join
end
+ require 'reline/unicode/east_asian_width'
+
+ MBCharWidthRE = /
+ (?<width_2_1>
+ [#{ EscapedChars.map {|c| "\\x%02x" % c.ord }.join }] (?# ^ + char, such as ^M, ^H, ^[, ...)
+ )
+ | (?<width_3>^\u{2E3B}) (?# THREE-EM DASH)
+ | (?<width_0>^\p{M})
+ | (?<width_2_2>
+ #{ EastAsianWidth::TYPE_F }
+ | #{ EastAsianWidth::TYPE_W }
+ )
+ | (?<width_1>
+ #{ EastAsianWidth::TYPE_H }
+ | #{ EastAsianWidth::TYPE_NA }
+ | #{ EastAsianWidth::TYPE_N }
+ )
+ /x
+
def self.get_mbchar_width(mbchar)
- case mbchar.encode(Encoding::UTF_8)
- when *EscapedChars # ^ + char, such as ^M, ^H, ^[, ...
- 2
- when /^\u{2E3B}/ # THREE-EM DASH
- 3
- when /^\p{M}/
- 0
- when EastAsianWidth::TYPE_A
- Reline.ambiguous_width
- when EastAsianWidth::TYPE_F, EastAsianWidth::TYPE_W
- 2
- when EastAsianWidth::TYPE_H, EastAsianWidth::TYPE_NA, EastAsianWidth::TYPE_N
- 1
+ m = mbchar.encode(Encoding::UTF_8).match(MBCharWidthRE)
+ case
+ when m[:width_2_1], m[:width_2_2] then 2
+ when m[:width_3] then 3
+ when m[:width_0] then 0
+ when m[:width_1] then 1
else
nil
end
@@ -591,5 +603,3 @@ class Reline::Unicode
[byte_size, width]
end
end
-
-require 'reline/unicode/east_asian_width'
diff --git a/lib/reline/unicode/east_asian_width.rb b/lib/reline/unicode/east_asian_width.rb
index 7483c78936..89bc9d9435 100644
--- a/lib/reline/unicode/east_asian_width.rb
+++ b/lib/reline/unicode/east_asian_width.rb
@@ -1,16 +1,16 @@
class Reline::Unicode::EastAsianWidth
# This is based on EastAsianWidth.txt
- # http://www.unicode.org/Public/13.0.0/ucd/EastAsianWidth.txt
+ # EastAsianWidth.txt
# Fullwidth
- TYPE_F = /^([#{ %W(
+ TYPE_F = /^[#{ %W(
\u{3000}
\u{FF01}-\u{FF60}
\u{FFE0}-\u{FFE6}
- ).join }])/
+ ).join }]/
# Halfwidth
- TYPE_H = /^([#{ %W(
+ TYPE_H = /^[#{ %W(
\u{20A9}
\u{FF61}-\u{FFBE}
\u{FFC2}-\u{FFC7}
@@ -18,10 +18,10 @@ class Reline::Unicode::EastAsianWidth
\u{FFD2}-\u{FFD7}
\u{FFDA}-\u{FFDC}
\u{FFE8}-\u{FFEE}
- ).join }])/
+ ).join }]/
# Wide
- TYPE_W = /^([#{ %W(
+ TYPE_W = /^[#{ %W(
\u{1100}-\u{115F}
\u{231A}-\u{231B}
\u{2329}-\u{232A}
@@ -136,10 +136,10 @@ class Reline::Unicode::EastAsianWidth
\u{1FAD0}-\u{1FAD6}
\u{20000}-\u{2FFFD}
\u{30000}-\u{3FFFD}
- ).join }])/
+ ).join }]/
# Narrow
- TYPE_NA = /^([#{ %W(
+ TYPE_NA = /^[#{ %W(
\u{0020}-\u{007E}
\u{00A2}-\u{00A3}
\u{00A5}-\u{00A6}
@@ -147,10 +147,10 @@ class Reline::Unicode::EastAsianWidth
\u{00AF}
\u{27E6}-\u{27ED}
\u{2985}-\u{2986}
- ).join }])/
+ ).join }]/
# Ambiguous
- TYPE_A = /^([#{ %W(
+ TYPE_A = /^[#{ %W(
\u{00A1}
\u{00A4}
\u{00A7}-\u{00A8}
@@ -330,10 +330,10 @@ class Reline::Unicode::EastAsianWidth
\u{E0100}-\u{E01EF}
\u{F0000}-\u{FFFFD}
\u{100000}-\u{10FFFD}
- ).join }])/
+ ).join }]/
# Neutral
- TYPE_N = /^([#{ %W(
+ TYPE_N = /^[#{ %W(
\u{0000}-\u{001F}
\u{007F}-\u{00A0}
\u{00A9}
@@ -1160,5 +1160,5 @@ class Reline::Unicode::EastAsianWidth
\u{1FBF0}-\u{1FBF9}
\u{E0001}
\u{E0020}-\u{E007F}
- ).join }])/
+ ).join }]/
end