summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorkosaki <kosaki@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>2011-02-03 16:54:52 +0000
committerkosaki <kosaki@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>2011-02-03 16:54:52 +0000
commitaa36d4df21113a36c2e368e0abb417480a6eebbd (patch)
tree4069acdf280a4c93e9cdd8462ec0ca54da385dc1
parentb0d216c81802670eed20fe24ded03df1c8c7abc1 (diff)
* string.c (count_utf8_lead_bytes_with_word): wrote function
comments. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@30778 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
-rw-r--r--ChangeLog5
-rw-r--r--string.c17
2 files changed, 22 insertions, 0 deletions
diff --git a/ChangeLog b/ChangeLog
index 11bb47c732..93bb47ccef 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+Fri Feb 4 01:50:13 2011 KOSAKI Motohiro <kosaki.motohiro@gmail.com>
+
+ * string.c (count_utf8_lead_bytes_with_word): wrote function
+ comments.
+
Fri Feb 4 00:14:55 2011 Nobuyoshi Nakada <nobu@ruby-lang.org>
* ext/zlib/zlib.c (gzfile_reader_get_unused): no need to dup
diff --git a/string.c b/string.c
index 6628eb0aef..96b5d9cc7f 100644
--- a/string.c
+++ b/string.c
@@ -1038,13 +1038,30 @@ rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
#ifdef NONASCII_MASK
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
+
+/*
+ * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
+ * bit representation. (see http://en.wikipedia.org/wiki/UTF-8)
+ * Therefore, the following pseudo code can detect a UTF-8 leading byte.
+ *
+ * if (!(byte & 0x80))
+ * byte |= 0x40; // turn on bit6
+ * return ((byte>>6) & 1); // bit6 tells whether it is a leading byte or not.
+ *
+ * This function applies the above logic to every byte in the argument
+ * word `s' concurrently, and then gathers the per-byte results.
+ */
static inline VALUE
count_utf8_lead_bytes_with_word(const VALUE *s)
{
VALUE d = *s;
+
+ /* Transform into bit0 represent UTF-8 leading or not. */
d |= ~(d>>1);
d >>= 6;
d &= NONASCII_MASK >> 7;
+
+ /* Gather every bytes. */
d += (d>>8);
d += (d>>16);
#if SIZEOF_VALUE == 8