diff options
author | Alan Wu <XrXr@users.noreply.github.com> | 2019-07-30 21:36:05 -0400 |
---|---|---|
committer | 卜部昌平 <shyouhei@ruby-lang.org> | 2019-10-24 18:03:42 +0900 |
commit | 89e7997622038f82115f34dbb4ea382e02bed163 (patch) | |
tree | 993a5f6fb17418381e835be1fd51093dc620148a /mjit_compile.c | |
parent | 38e931fa2ceac6d922f3eabedb8f35f211de0bdb (diff) |
Combine call info and cache to speed up method invocation
To perform a regular method call, the VM needs two structs,
`rb_call_info` and `rb_call_cache`. At the moment, we allocate these two
structures in separate buffers. In the worst case, the CPU needs to read
4 cache lines to complete a method call. Putting the two structures
together reduces the maximum number of cache line reads to 2.
Combining the structures also saves 8 bytes per call site as the current
layout uses two separate pointers for the call info and the call cache.
This saves about 2 MiB on Discourse.
This change improves the Optcarrot benchmark at least 3%. For more
details, see attached bugs.ruby-lang.org ticket.
Complications:
- A new instruction attribute `comptime_sp_inc` is introduced to
calculate SP increase at compile time without using call caches. At
compile time, a `TS_CALLDATA` operand points to a call info struct, but
at runtime, the same operand points to a call data struct. Instructions
that explicitly define `sp_inc` also need to define `comptime_sp_inc`.
- MJIT code for copying call cache becomes slightly more complicated.
- This changes the bytecode format, which might break existing tools.
[Misc #16258]
Notes
Notes:
Merged: https://github.com/ruby/ruby/pull/2564
Diffstat (limited to 'mjit_compile.c')
-rw-r--r-- | mjit_compile.c | 20 |
1 files changed, 18 insertions, 2 deletions
diff --git a/mjit_compile.c b/mjit_compile.c index b6ed984b20..27ea836ef4 100644 --- a/mjit_compile.c +++ b/mjit_compile.c @@ -25,6 +25,21 @@ #define NOT_COMPILED_STACK_SIZE -1 #define ALREADY_COMPILED_P(status, pos) (status->stack_size_for_pos[pos] != NOT_COMPILED_STACK_SIZE) +static size_t +call_data_index(CALL_DATA cd, const struct rb_iseq_constant_body *body) +{ + const struct rb_kwarg_call_data *kw_calls = (const struct rb_kwarg_call_data *)&body->call_data[body->ci_size]; + const struct rb_kwarg_call_data *kw_cd = (const struct rb_kwarg_call_data *)cd; + + VM_ASSERT(cd >= body->call_data && kw_cd < (kw_calls + body->ci_kw_size)); + if (kw_cd < kw_calls) { + return cd - body->call_data; + } + else { + return kw_cd - kw_calls + body->ci_size; + } +} + // For propagating information needed for lazily pushing a frame. struct inlined_call_context { int orig_argc; // ci->orig_argc @@ -383,8 +398,9 @@ precompile_inlinable_iseqs(FILE *f, const rb_iseq_t *iseq, struct compile_status #endif if (insn == BIN(opt_send_without_block)) { // `compile_inlined_cancel_handler` supports only `opt_send_without_block` - CALL_INFO ci = (CALL_INFO)body->iseq_encoded[pos + 1]; - CALL_CACHE cc_copy = status->cc_entries + ((CALL_CACHE)body->iseq_encoded[pos + 2] - body->cc_entries); // use copy to avoid race condition + CALL_DATA cd = (CALL_DATA)body->iseq_encoded[pos + 1]; + CALL_INFO ci = &cd->ci; + CALL_CACHE cc_copy = status->cc_entries + call_data_index(cd, body); // use copy to avoid race condition const rb_iseq_t *child_iseq; if (has_valid_method_type(cc_copy) && |