diff options
author | shyouhei <shyouhei@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2017-04-27 04:21:04 +0000 |
---|---|---|
committer | shyouhei <shyouhei@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2017-04-27 04:21:04 +0000 |
commit | 29ca20de2d998d21c0d41224799182020311ea76 (patch) | |
tree | 1107864822da8ed3c2babd7ca9d8015c8a641654 | |
parent | 42c2df9937854a4c0988e69fbb176006fb511d6b (diff) |
refactor newhash (revision 58463 another try) [fix GH-1600]
* st.c (rb_hash_bulk_insert): new API to bulk insert entries
into a hash. Given arguments are first inserted into the
table at once, then reindexed. This is faster than inserting
things using rb_hash_aset() one by one.
This arrangement (rb_ prefixed function placed in st.c) is
unavoidable because it touches both table internals and the
write barrier at once.
* internal.h: declare the new function.
* hash.c (rb_hash_s_create): use the new function.
* vm.c (core_hash_merge): ditto.
* insns.def (newhash): ditto.
* test/ruby/test_hash.rb: more coverage on hash creation.
* test/ruby/test_literal.rb: ditto.
-----------------------------------------------------------
benchmark results:
minimum results in each 7 measurements.
Execution time (sec)
name before after
loop_whileloop2 0.136 0.137
vm2_bighash* 1.249 0.623
Speedup ratio: compare with the result of `before' (greater is better)
name after
loop_whileloop2 0.996
vm2_bighash* 2.004
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@58492 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
-rw-r--r-- | hash.c | 8 | ||||
-rw-r--r-- | insns.def | 8 | ||||
-rw-r--r-- | internal.h | 3 | ||||
-rw-r--r-- | st.c | 165 | ||||
-rw-r--r-- | test/ruby/test_hash.rb | 45 | ||||
-rw-r--r-- | test/ruby/test_literal.rb | 29 | ||||
-rw-r--r-- | vm.c | 6 |
7 files changed, 245 insertions, 19 deletions
@@ -650,7 +650,6 @@ static VALUE rb_hash_s_create(int argc, VALUE *argv, VALUE klass) { VALUE hash, tmp; - int i; if (argc == 1) { tmp = rb_hash_s_try_convert(Qnil, argv[0]); @@ -704,12 +703,7 @@ rb_hash_s_create(int argc, VALUE *argv, VALUE klass) } hash = hash_alloc(klass); - if (argc > 0) { - RHASH(hash)->ntbl = st_init_table_with_size(&objhash, argc / 2); - } - for (i=0; i<argc; i+=2) { - rb_hash_aset(hash, argv[i], argv[i + 1]); - } + rb_hash_bulk_insert(argc, argv, hash); return hash; } @@ -492,16 +492,12 @@ newhash (...) (VALUE val) // inc += 1 - num; { - rb_num_t i; - RUBY_DTRACE_CREATE_HOOK(HASH, num); val = rb_hash_new(); - for (i = num; i > 0; i -= 2) { - const VALUE v = TOPN(i - 2); - const VALUE k = TOPN(i - 1); - rb_hash_aset(val, k, v); + if (num) { + rb_hash_bulk_insert(num, STACK_ADDR_FROM_TOP(num), val); } POPN(num); } diff --git a/internal.h b/internal.h index 0988f301dd..53f31b7804 100644 --- a/internal.h +++ b/internal.h @@ -1550,6 +1550,9 @@ extern int ruby_enable_coredump; int rb_get_next_signal(void); int rb_sigaltstack_size(void); +/* st.c */ +extern void rb_hash_bulk_insert(long, const VALUE *, VALUE); + /* strftime.c */ #ifdef RUBY_ENCODING_H VALUE rb_strftime_timespec(const char *format, size_t format_len, rb_encoding *enc, @@ -1955,3 +1955,168 @@ st_numhash(st_data_t n) enum {s1 = 11, s2 = 3}; return (st_index_t)((n>>s1|(n<<s2)) ^ (n>>s2)); } + +/* Expand TAB to be suitable for holding SIZ entries in total. + Pre-existing entries remain not deleted inside of TAB, but its bins + are cleared to expect future reconstruction. See rehash below. 
*/ +static void +st_expand_table(st_table *tab, st_index_t siz) +{ + st_table *tmp; + st_index_t n; + + if (siz <= get_allocated_entries(tab)) + return; /* enough room already */ + + tmp = st_init_table_with_size(tab->type, siz); + n = get_allocated_entries(tab); + MEMCPY(tmp->entries, tab->entries, st_table_entry, n); + free(tab->entries); + if (tab->bins != NULL) + free(tab->bins); + if (tmp->bins != NULL) + free(tmp->bins); + tab->entry_power = tmp->entry_power; + tab->bin_power = tmp->bin_power; + tab->size_ind = tmp->size_ind; + tab->entries = tmp->entries; + tab->bins = NULL; + tab->rebuilds_num++; + free(tmp); +} + +/* Rehash using linear search. */ +static void +st_rehash_linear(st_table *tab) +{ + st_index_t i, j; + st_table_entry *p, *q; + if (tab->bins) { + free(tab->bins); + tab->bins = NULL; + } + for (i = tab->entries_start; i < tab->entries_bound; i++) { + p = &tab->entries[i]; + if (DELETED_ENTRY_P(p)) + continue; + for (j = i + 1; j < tab->entries_bound; j++) { + q = &tab->entries[j]; + if (DELETED_ENTRY_P(q)) + continue; + if (PTR_EQUAL(tab, p, q->hash, q->key)) { + st_assert(p < q); + *p = *q; + MARK_ENTRY_DELETED(q); + tab->num_entries--; + update_range_for_deleted(tab, j); + } + } + } +} + +/* Rehash using index */ +static void +st_rehash_indexed(st_table *tab) +{ + st_index_t i; + st_index_t const n = bins_size(tab); + unsigned int const size_ind = get_size_ind(tab); + st_index_t *bins = realloc(tab->bins, n); + st_assert(bins != NULL); + tab->bins = bins; + initialize_bins(tab); + for (i = tab->entries_start; i < tab->entries_bound; i++) { + st_table_entry *p = &tab->entries[i]; + st_index_t ind; +#ifdef QUADRATIC_PROBE + st_index_t d = 1; +#else + st_index_t peterb = p->hash; +#endif + + if (DELETED_ENTRY_P(p)) + continue; + + ind = hash_bin(p->hash, tab); + for(;;) { + st_index_t bin = get_bin(bins, size_ind, ind); + st_table_entry *q = &tab->entries[bin - ENTRY_BASE]; + if (EMPTY_OR_DELETED_BIN_P(bin)) { + /* ok, new room */ + 
set_bin(bins, size_ind, ind, i + ENTRY_BASE); + break; + } + else if (PTR_EQUAL(tab, q, p->hash, p->key)) { + /* duplicated key; delete it */ + st_assert(q < p); + q->record = p->record; + MARK_ENTRY_DELETED(p); + tab->num_entries--; + update_range_for_deleted(tab, bin); + break; + } + else { + /* hash collision; skip it */ +#ifdef QUADRATIC_PROBE + ind = hash_bin(ind + d, tab); + d++; +#else + ind = secondary_hash(ind, tab, &peterb); +#endif + } + } + } +} + +/* Reconstruct TAB's bins according to TAB's entries. This function + permits conflicting keys inside of entries. No errors are reported + then. All but one of them are discarded silently. */ +static void +st_rehash(st_table *tab) +{ + if (tab->bin_power <= MAX_POWER2_FOR_TABLES_WITHOUT_BINS) + st_rehash_linear(tab); + else + st_rehash_indexed(tab); +} + +#ifdef RUBY +/* Mimics ruby's { foo => bar } syntax. This function is placed here + because it touches table internals and write barriers at once. */ +void +rb_hash_bulk_insert(long argc, const VALUE *argv, VALUE hash) +{ + int i; + st_table *tab; + + st_assert(argc % 2); + if (! argc) + return; + if (! RHASH(hash)->ntbl) + rb_hash_tbl_raw(hash); + tab = RHASH(hash)->ntbl; + + /* make room */ + st_expand_table(tab, tab->num_entries + argc); + + /* push elems */ + for (i = 0; i < argc; /* */) { + VALUE key = argv[i++]; + VALUE val = argv[i++]; + st_data_t k = (rb_obj_class(key) == rb_cString) ? 
+ rb_str_new_frozen(key) : key; + st_table_entry e; + e.hash = do_hash(k, tab); + e.key = k; + e.record = val; + + tab->entries[tab->entries_bound++] = e; + tab->num_entries++; + RB_OBJ_WRITTEN(hash, Qundef, k); + RB_OBJ_WRITTEN(hash, Qundef, val); + } + + /* reindex */ + st_rehash(tab); +} +#endif diff --git a/test/ruby/test_hash.rb b/test/ruby/test_hash.rb index 040d25e144..798bd2fe60 100644 --- a/test/ruby/test_hash.rb +++ b/test/ruby/test_hash.rb @@ -133,6 +133,50 @@ class TestHash < Test::Unit::TestCase assert_equal(100, h['a']) assert_equal(200, h['b']) assert_nil(h['c']) + + h = @cls["a", 100, "b", 200] + assert_equal(100, h['a']) + assert_equal(200, h['b']) + assert_nil(h['c']) + + h = @cls[[["a", 100], ["b", 200]]] + assert_equal(100, h['a']) + assert_equal(200, h['b']) + assert_nil(h['c']) + + h = @cls[[["a", 100], ["b"], ["c", 300]]] + assert_equal(100, h['a']) + assert_equal(nil, h['b']) + assert_equal(300, h['c']) + + h = @cls[[["a", 100], "b", ["c", 300]]] + assert_equal(100, h['a']) + assert_equal(nil, h['b']) + assert_equal(300, h['c']) + end + + def test_s_AREF_duplicated_key + alist = [["a", 100], ["b", 200], ["a", 300], ["a", 400]] + h = @cls[alist] + assert_equal(2, h.size) + assert_equal(400, h['a']) + assert_equal(200, h['b']) + assert_nil(h['c']) + assert_equal(nil, h.key('300')) + end + + def test_s_AREF_frozen_key_id + key = "a".freeze + h = @cls[key, 100] + assert_equal(100, h['a']) + assert_same(key, *h.keys) + end + + def test_s_AREF_key_tampering + key = "a".dup + h = @cls[key, 100] + key.upcase! 
+ assert_equal(100, h['a']) end def test_s_new @@ -145,7 +189,6 @@ class TestHash < Test::Unit::TestCase assert_instance_of(@cls, h) assert_equal('default', h.default) assert_equal('default', h['spurious']) - end def test_try_convert diff --git a/test/ruby/test_literal.rb b/test/ruby/test_literal.rb index 28290fb19d..4a447d59fc 100644 --- a/test/ruby/test_literal.rb +++ b/test/ruby/test_literal.rb @@ -394,6 +394,35 @@ class TestRubyLiteral < Test::Unit::TestCase end; end + def test_hash_duplicated_key + h = EnvUtil.suppress_warning do + eval <<~end + # This is a syntax that renders warning at very early stage. + # eval used to delay warning, to be suppressible by EnvUtil. + {"a" => 100, "b" => 200, "a" => 300, "a" => 400} + end + end + assert_equal(2, h.size) + assert_equal(400, h['a']) + assert_equal(200, h['b']) + assert_nil(h['c']) + assert_equal(nil, h.key('300')) + end + + def test_hash_frozen_key_id + key = "a".freeze + h = {key => 100} + assert_equal(100, h['a']) + assert_same(key, *h.keys) + end + + def test_hash_key_tampering + key = "a" + h = {key => 100} + key.upcase! + assert_equal(100, h['a']) + end + def test_range assert_instance_of Range, (1..2) assert_equal(1..2, 1..2) @@ -2653,13 +2653,9 @@ static VALUE core_hash_merge_kwd(int argc, VALUE *argv); static VALUE core_hash_merge(VALUE hash, long argc, const VALUE *argv) { - long i; - Check_Type(hash, T_HASH); VM_ASSERT(argc % 2 == 0); - for (i=0; i<argc; i+=2) { - rb_hash_aset(hash, argv[i], argv[i+1]); - } + rb_hash_bulk_insert(argc, argv, hash); return hash; } |