From 3cf7d1b57e3622430065f6a6ce8cbd5548d3d894 Mon Sep 17 00:00:00 2001 From: drbrain Date: Tue, 10 Jul 2012 18:28:40 +0000 Subject: * ext/zlib/zlib.c: Added streaming support to inflate processing. This allows zlib streams to be processed without huge memory growth. [Feature #6612] * NEWS: ditto * ext/zlib/zlib.c (zstream_expand_buffer): Uses rb_yield when a block is given for streaming support. Refactored to use zstream_expand_buffer_into to remove duplicate code. * ext/zlib/zlib.c (zstream_expand_buffer_protect): Added wrapper function to pass jump state back through GVL-free section to allow zstream clean-up before terminating the ruby call. * ext/zlib/zlib.c (zstream_expand_buffer_without_gvl): Acquire GVL to yield processed chunk of output stream. * ext/zlib/zlib.c (zstream_detach_buffer): When a block is given, returns Qnil mid-stream and yields the output buffer at the end of the stream. * test/zlib/test_zlib.rb: Updated tests git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@36356 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- ChangeLog | 19 +++++ NEWS | 2 + ext/zlib/zlib.c | 184 +++++++++++++++++++++++++++++++++++-------------- test/zlib/test_zlib.rb | 153 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 307 insertions(+), 51 deletions(-) diff --git a/ChangeLog b/ChangeLog index 763393e710..c920b56377 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,22 @@ +Wed Jul 11 03:26:47 2012 Eric Hodel + + * ext/zlib/zlib.c: Added streaming support to inflate processing. + This allows zlib streams to be processed without huge memory growth. + [Feature #6612] + * NEWS: ditto + * ext/zlib/zlib.c (zstream_expand_buffer): Uses rb_yield when a block + is given for streaming support. Refactored to use + zstream_expand_buffer_into to remove duplicate code. + * ext/zlib/zlib.c (zstream_expand_buffer_protect): Added wrapper + function to pass jump state back through GVL-free section to allow + zstream clean-up before terminating the ruby call. + * ext/zlib/zlib.c (zstream_expand_buffer_without_gvl): Acquire GVL to + yield processed chunk of output stream. + * ext/zlib/zlib.c (zstream_detach_buffer): When a block is given, + returns Qnil mid-stream and yields the output buffer at the end of + the stream. + * test/zlib/test_zlib.rb: Updated tests + Tue Jul 10 22:57:03 2012 Nobuyoshi Nakada * include/ruby/thread.h: new header file for thread stuff. diff --git a/NEWS b/NEWS index b1c74ac05e..7c356b9352 100644 --- a/NEWS +++ b/NEWS @@ -123,6 +123,8 @@ with all sufficient information, see the ChangeLog file. accessible from other users. * zlib + * Added streaming support for Zlib::Inflate and Zlib::Deflate. This allows + processing of a stream without the use of large amounts of memory. * Added support for the new deflate strategies Zlib::RLE and Zlib::FIXED. * Zlib streams are now processed without the GVL. This allows gzip, zlib and deflate streams to be processed in parallel. diff --git a/ext/zlib/zlib.c b/ext/zlib/zlib.c index 3fb1be91ae..e2b4f44c90 100644 --- a/ext/zlib/zlib.c +++ b/ext/zlib/zlib.c @@ -545,12 +545,17 @@ struct zstream { #define ZSTREAM_FLAG_IN_STREAM 0x2 #define ZSTREAM_FLAG_FINISHED 0x4 #define ZSTREAM_FLAG_CLOSING 0x8 -#define ZSTREAM_FLAG_UNUSED 0x10 +#define ZSTREAM_FLAG_GZFILE 0x10 /* disallows yield from expand_buffer for + gzip*/ +#define ZSTREAM_FLAG_UNUSED 0x20 #define ZSTREAM_READY(z) ((z)->flags |= ZSTREAM_FLAG_READY) #define ZSTREAM_IS_READY(z) ((z)->flags & ZSTREAM_FLAG_READY) #define ZSTREAM_IS_FINISHED(z) ((z)->flags & ZSTREAM_FLAG_FINISHED) #define ZSTREAM_IS_CLOSING(z) ((z)->flags & ZSTREAM_FLAG_CLOSING) +#define ZSTREAM_IS_GZFILE(z) ((z)->flags & ZSTREAM_FLAG_GZFILE) + +#define ZSTREAM_EXPAND_BUFFER_OK 0 /* I think that more better value should be found, but I gave up finding it. B) */ @@ -569,8 +574,10 @@ static const struct zstream_funcs inflate_funcs = { struct zstream_run_args { struct zstream * z; - int flush; - int interrupt; + int flush; /* stream flush value for inflate() or deflate() */ + int interrupt; /* stop processing the stream and return to ruby */ + int jump_state; /* for buffer expansion block break or exception */ + int stream_output; /* for streaming zlib processing */ }; static voidpf @@ -615,33 +622,50 @@ zstream_init(struct zstream *z, const struct zstream_funcs *func) static void zstream_expand_buffer(struct zstream *z) { - long inc; - if (NIL_P(z->buf)) { - /* I uses rb_str_new here not rb_str_buf_new because - rb_str_buf_new makes a zero-length string. */ - z->buf = rb_str_new(0, ZSTREAM_INITIAL_BUFSIZE); - z->buf_filled = 0; - z->stream.next_out = (Bytef*)RSTRING_PTR(z->buf); - z->stream.avail_out = ZSTREAM_INITIAL_BUFSIZE; - RBASIC(z->buf)->klass = 0; + zstream_expand_buffer_into(z, ZSTREAM_INITIAL_BUFSIZE); return; } - if (RSTRING_LEN(z->buf) - z->buf_filled >= ZSTREAM_AVAIL_OUT_STEP_MAX) { - /* to keep other threads from freezing */ - z->stream.avail_out = ZSTREAM_AVAIL_OUT_STEP_MAX; + if (!ZSTREAM_IS_GZFILE(z) && rb_block_given_p()) { + if (z->buf_filled >= ZSTREAM_AVAIL_OUT_STEP_MAX) { + int state = 0; + VALUE self = (VALUE)z->stream.opaque; + + rb_str_resize(z->buf, z->buf_filled); + RBASIC(z->buf)->klass = rb_cString; + OBJ_INFECT(z->buf, self); + + rb_protect(rb_yield, z->buf, &state); + + z->buf = Qnil; + zstream_expand_buffer_into(z, ZSTREAM_AVAIL_OUT_STEP_MAX); + + if (state) + rb_jump_tag(state); + + return; + } + else { + zstream_expand_buffer_into(z, + ZSTREAM_AVAIL_OUT_STEP_MAX - z->buf_filled); + } } else { - inc = z->buf_filled / 2; - if (inc < ZSTREAM_AVAIL_OUT_STEP_MIN) { - inc = ZSTREAM_AVAIL_OUT_STEP_MIN; + if (RSTRING_LEN(z->buf) - z->buf_filled >= ZSTREAM_AVAIL_OUT_STEP_MAX) { + z->stream.avail_out = ZSTREAM_AVAIL_OUT_STEP_MAX; } - rb_str_resize(z->buf, z->buf_filled + inc); - z->stream.avail_out = (inc < ZSTREAM_AVAIL_OUT_STEP_MAX) ? - (int)inc : ZSTREAM_AVAIL_OUT_STEP_MAX; + else { + long inc = z->buf_filled / 2; + if (inc < ZSTREAM_AVAIL_OUT_STEP_MIN) { + inc = ZSTREAM_AVAIL_OUT_STEP_MIN; + } + rb_str_resize(z->buf, z->buf_filled + inc); + z->stream.avail_out = (inc < ZSTREAM_AVAIL_OUT_STEP_MAX) ? + (int)inc : ZSTREAM_AVAIL_OUT_STEP_MAX; + } + z->stream.next_out = (Bytef*)RSTRING_PTR(z->buf) + z->buf_filled; } - z->stream.next_out = (Bytef*)RSTRING_PTR(z->buf) + z->buf_filled; } static void @@ -663,6 +687,17 @@ zstream_expand_buffer_into(struct zstream *z, unsigned long size) } } +static void * +zstream_expand_buffer_protect(void *ptr) +{ + struct zstream *z = (struct zstream *)ptr; + int state = 0; + + rb_protect((VALUE (*)(VALUE))zstream_expand_buffer, (VALUE)z, &state); + + return (void *)state; +} + static int zstream_expand_buffer_without_gvl(struct zstream *z) { @@ -682,9 +717,6 @@ zstream_expand_buffer_without_gvl(struct zstream *z) new_str = ruby_xrealloc(RSTRING(z->buf)->as.heap.ptr, len + 1); - if (!new_str) - return 0; - /* from rb_str_resize */ RSTRING(z->buf)->as.heap.ptr = new_str; RSTRING(z->buf)->as.heap.ptr[len] = '\0'; /* sentinel */ @@ -696,7 +728,7 @@ zstream_expand_buffer_without_gvl(struct zstream *z) } z->stream.next_out = (Bytef*)RSTRING_PTR(z->buf) + z->buf_filled; - return 1; + return ZSTREAM_EXPAND_BUFFER_OK; } static void @@ -737,6 +769,13 @@ zstream_detach_buffer(struct zstream *z) { VALUE dst, self = (VALUE)z->stream.opaque; + if (!ZSTREAM_IS_FINISHED(z) && !ZSTREAM_IS_GZFILE(z) && + rb_block_given_p()) { + /* prevent tiny yields mid-stream, save for next + * zstream_expand_buffer() or stream end */ + return Qnil; + } + if (NIL_P(z->buf)) { dst = rb_str_new(0, 0); } @@ -752,6 +791,12 @@ zstream_detach_buffer(struct zstream *z) z->buf_filled = 0; z->stream.next_out = 0; z->stream.avail_out = 0; + + if (!ZSTREAM_IS_GZFILE(z) && rb_block_given_p()) { + rb_yield(dst); + dst = Qnil; + } + return dst; } @@ -921,7 +966,7 @@ static void * zstream_run_func(void *ptr) { struct zstream_run_args *args = (struct zstream_run_args *)ptr; - int err, flush = args->flush; + int err, state, flush = args->flush; struct zstream *z = args->z; uInt n; @@ -945,8 +990,16 @@ zstream_run_func(void *ptr) break; } - if (!zstream_expand_buffer_without_gvl(z)) { - err = Z_MEM_ERROR; /* realloc failed */ + if (args->stream_output) { + state = (int)rb_thread_call_with_gvl(zstream_expand_buffer_protect, + (void *)z); + } else { + state = zstream_expand_buffer_without_gvl(z); + } + + if (state) { + err = Z_OK; /* buffer expanded but stream processing was stopped */ + args->jump_state = state; break; } } @@ -975,6 +1028,8 @@ zstream_run(struct zstream *z, Bytef *src, long len, int flush) args.z = z; args.flush = flush; args.interrupt = 0; + args.jump_state = 0; + args.stream_output = !ZSTREAM_IS_GZFILE(z) && rb_block_given_p(); if (NIL_P(z->input) && len == 0) { z->stream.next_in = (Bytef*)""; @@ -1026,6 +1081,9 @@ loop: zstream_append_input(z, z->stream.next_in, z->stream.avail_in); guard = Qnil; /* prevent tail call to make guard effective */ } + + if (args.jump_state) + rb_jump_tag(args.jump_state); } static VALUE @@ -1208,8 +1266,13 @@ rb_zstream_reset(VALUE obj) } /* - * Finishes the stream and flushes output buffer. See Zlib::Deflate#finish and - * Zlib::Inflate#finish for details of this behavior. + * call-seq: + * finish -> String + * finish { |chunk| ... } -> nil + * + * Finishes the stream and flushes output buffer. If a block is given each + * chunk is yielded to the block until the input buffer has been flushed to + * the output buffer. */ static VALUE rb_zstream_finish(VALUE obj) @@ -1222,7 +1285,13 @@ rb_zstream_finish(VALUE obj) } /* - * Flushes input buffer and returns all data in that buffer. + * call-seq: + * flush_next_out -> String + * flush_next_out { |chunk| ... } -> nil + * + * Flushes output buffer and returns all data in that buffer. If a block is + * given each chunk is yielded to the block until the current output buffer + * has been flushed. */ static VALUE rb_zstream_flush_next_in(VALUE obj) @@ -1504,13 +1573,13 @@ deflate_run(VALUE args) /* * Document-method: Zlib::Deflate.deflate * - * call-seq: Zlib.deflate(string[, level]) - * Zlib::Deflate.deflate(string[, level]) + * call-seq: + * Zlib.deflate(string[, level]) + * Zlib::Deflate.deflate(string[, level]) * * Compresses the given +string+. Valid values of level are - * NO_COMPRESSION, BEST_SPEED, - * BEST_COMPRESSION, DEFAULT_COMPRESSION, and an - * integer from 0 to 9 (the default is 6). + * Zlib::NO_COMPRESSION, Zlib::BEST_SPEED, Zlib::BEST_COMPRESSION, + * Zlib::DEFAULT_COMPRESSION, or an integer from 0 to 9 (the default is 6). * * This method is almost equivalent to the following code: * @@ -1564,17 +1633,19 @@ do_deflate(struct zstream *z, VALUE src, int flush) } /* - * Document-method: Zlib#deflate + * Document-method: Zlib::Deflate#deflate * * call-seq: - * deflate(string, flush = Zlib::NO_FLUSH) + * z.deflate(string, flush = Zlib::NO_FLUSH) -> String + * z.deflate(string, flush = Zlib::NO_FLUSH) { |chunk| ... } -> nil * * Inputs +string+ into the deflate stream and returns the output from the * stream. On calling this method, both the input and the output buffers of - * the stream are flushed. + * the stream are flushed. If +string+ is nil, this method finishes the + * stream, just like Zlib::ZStream#finish. * - * If +string+ is nil, this method finishes the stream, just like - * Zlib::ZStream#finish. + * If a block is given consecutive deflated chunks from the +string+ are + * yielded to the block and +nil+ is returned. * * The +flush+ parameter specifies the flush mode. The following constants * may be used: @@ -1621,10 +1692,13 @@ rb_deflate_addstr(VALUE obj, VALUE src) * Document-method: Zlib::Deflate#flush * * call-seq: - * flush(flush = Zlib::SYNC_FLUSH) + * flush(flush = Zlib::SYNC_FLUSH) -> String + * flush(flush = Zlib::SYNC_FLUSH) { |chunk| ... } -> nil * * This method is equivalent to deflate('', flush). This method is - * just provided to improve the readability of your Ruby program. + * just provided to improve the readability of your Ruby program. If a block + * is given chunks of deflate output are yielded to the block until the buffer + * is flushed. * * See Zlib::Deflate#deflate for detail on the +flush+ constants NO_FLUSH, * SYNC_FLUSH, FULL_FLUSH and FINISH. @@ -1812,9 +1886,11 @@ inflate_run(VALUE args) } /* - * Document-method: Zlib::Inflate.inflate + * Document-method: Zlib::inflate * - * call-seq: Zlib::Inflate.inflate(string) + * call-seq: + * Zlib.inflate(string) + * Zlib::Inflate.inflate(string) * * Decompresses +string+. Raises a Zlib::NeedDict exception if a preset * dictionary is needed for decompression. @@ -1890,12 +1966,17 @@ rb_inflate_add_dictionary(VALUE obj, VALUE dictionary) { /* * Document-method: Zlib::Inflate#inflate * - * call-seq: inflate(string) + * call-seq: + * inflate(deflate_string) -> String + * inflate(deflate_string) { |chunk| ... } -> nil + * + * Inputs +deflate_string+ into the inflate stream and returns the output from + * the stream. Calling this method, both the input and the output buffer of + * the stream are flushed. If string is +nil+, this method finishes the + * stream, just like Zlib::ZStream#finish. * - * Inputs +string+ into the inflate stream and returns the output from the - * stream. Calling this method, both the input and the output buffer of the - * stream are flushed. If string is +nil+, this method finishes the stream, - * just like Zlib::ZStream#finish. + * If a block is given consecutive inflated chunks from the +deflate_string+ + * are yielded to the block and +nil+ is returned. * * Raises a Zlib::NeedDict exception if a preset dictionary is needed to * decompress. Set the dictionary by Zlib::Inflate#set_dictionary and then @@ -2169,6 +2250,7 @@ gzfile_new(klass, funcs, endfunc) obj = Data_Make_Struct(klass, struct gzfile, gzfile_mark, gzfile_free, gz); zstream_init(&gz->z, funcs); + gz->z.flags |= ZSTREAM_FLAG_GZFILE; gz->io = Qnil; gz->level = 0; gz->mtime = 0; diff --git a/test/zlib/test_zlib.rb b/test/zlib/test_zlib.rb index 61e24e4d44..aa3486359c 100644 --- a/test/zlib/test_zlib.rb +++ b/test/zlib/test_zlib.rb @@ -39,6 +39,62 @@ if defined? Zlib assert_raise(Zlib::StreamError) { Zlib::Deflate.deflate("foo", 10000) } end + def test_deflate_chunked + original = '' + chunks = [] + r = Random.new 0 + + z = Zlib::Deflate.new + + 2.times do + input = r.bytes(20000) + original << input + z.deflate(input) do |chunk| + chunks << chunk + end + end + + assert_equal [16384, 16384], + chunks.map { |chunk| chunk.length } + + final = z.finish + + assert_equal 7253, final.length + + chunks << final + all = chunks.join + + inflated = Zlib.inflate all + + assert_equal original, inflated + end + + def test_deflate_chunked_break + chunks = [] + r = Random.new 0 + + z = Zlib::Deflate.new + + input = r.bytes(20000) + z.deflate(input) do |chunk| + chunks << chunk + break + end + + assert_equal [16384], chunks.map { |chunk| chunk.length } + + final = z.finish + + assert_equal 3632, final.length + + all = chunks.join + all << final + + original = Zlib.inflate all + + assert_equal input, original + end + def test_addstr z = Zlib::Deflate.new z << "foo" @@ -202,6 +258,38 @@ if defined? Zlib assert_equal "foofoofoo", out end + def test_finish_chunked + # zeros = Zlib::Deflate.deflate("0" * 100_000) + zeros = "x\234\355\3011\001\000\000\000\302\240J\353\237\316\032\036@" \ + "\001\000\000\000\000\000\000\000\000\000\000\000\000\000\000" \ + "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000" \ + "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000" \ + "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000" \ + "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000" \ + "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000" \ + "\000\000\000\000\000\000\000\257\006\351\247BH" + + chunks = [] + + z = Zlib::Inflate.new + + z.inflate(zeros) do |chunk| + chunks << chunk + break + end + + z.finish do |chunk| + chunks << chunk + end + + assert_equal [16384, 16384, 16384, 16384, 16384, 16384, 1696], + chunks.map { |chunk| chunk.size } + + assert chunks.all? { |chunk| + chunk =~ /\A0+\z/ + } + end + def test_inflate s = Zlib::Deflate.deflate("foo") z = Zlib::Inflate.new @@ -231,6 +319,58 @@ if defined? Zlib assert_equal "\0", inflated end + def test_inflate_chunked + # s = Zlib::Deflate.deflate("0" * 100_000) + zeros = "x\234\355\3011\001\000\000\000\302\240J\353\237\316\032\036@" \ + "\001\000\000\000\000\000\000\000\000\000\000\000\000\000\000" \ + "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000" \ + "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000" \ + "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000" \ + "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000" \ + "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000" \ + "\000\000\000\000\000\000\000\257\006\351\247BH" + + chunks = [] + + z = Zlib::Inflate.new + + z.inflate(zeros) do |chunk| + chunks << chunk + end + + assert_equal [16384, 16384, 16384, 16384, 16384, 16384, 1696], + chunks.map { |chunk| chunk.size } + + assert chunks.all? { |chunk| + chunk =~ /\A0+\z/ + } + end + + def test_inflate_chunked_break + # zeros = Zlib::Deflate.deflate("0" * 100_000) + zeros = "x\234\355\3011\001\000\000\000\302\240J\353\237\316\032\036@" \ + "\001\000\000\000\000\000\000\000\000\000\000\000\000\000\000" \ + "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000" \ + "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000" \ + "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000" \ + "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000" \ + "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000" \ + "\000\000\000\000\000\000\000\257\006\351\247BH" + + chunks = [] + + z = Zlib::Inflate.new + + z.inflate(zeros) do |chunk| + chunks << chunk + break + end + + out = z.inflate nil + + assert_equal 100_000 - chunks.first.length, out.length + end + def test_inflate_dictionary dictionary = "foo" @@ -896,5 +1036,18 @@ if defined? Zlib def test_deflate TestZlibDeflate.new(__name__).test_deflate end + + def test_deflate_stream + r = Random.new 0 + + deflated = '' + + Zlib.deflate(r.bytes(20000)) do |chunk| + deflated << chunk + end + + assert_equal 20016, deflated.length + end + end end -- cgit v1.2.3