diff --git a/benchmark/string_codepoints.yml b/benchmark/string_codepoints.yml new file mode 100644 index 00000000000000..6a07db7ce1fc82 --- /dev/null +++ b/benchmark/string_codepoints.yml @@ -0,0 +1,9 @@ +prelude: | + mixed_ascii64 = ("a" * 63 + "\u{100}") * 2048 + mixed_ascii256 = ("a" * 255 + "\u{100}") * 512 + utf8_2byte = "\u{100}" * 65536 + +benchmark: + codepoints_mixed_ascii64: mixed_ascii64.codepoints + codepoints_mixed_ascii256: mixed_ascii256.codepoints + codepoints_utf8_2byte: utf8_2byte.codepoints diff --git a/hash.c b/hash.c index fbad278bee8170..7fae2f284def65 100644 --- a/hash.c +++ b/hash.c @@ -5109,6 +5109,14 @@ rb_hash_bulk_insert(long argc, const VALUE *argv, VALUE hash) } } +VALUE +rb_hash_new_with_bulk_insert(long argc, const VALUE *argv) +{ + VALUE val = rb_hash_new_with_size(argc / 2); + rb_hash_bulk_insert(argc, argv, val); + return val; +} + static char **origenviron; #ifdef _WIN32 #define GET_ENVIRON(e) ((e) = rb_w32_get_environ()) diff --git a/insns.def b/insns.def index e32caef2dc8c66..26a44299db8064 100644 --- a/insns.def +++ b/insns.def @@ -429,9 +429,7 @@ toregexp // attr bool leaf = false; // attr rb_snum_t sp_inc = 1 - (rb_snum_t)cnt; { - const VALUE ary = rb_ary_tmp_new_from_values(0, cnt, STACK_ADDR_FROM_TOP(cnt)); - val = rb_reg_new_ary(ary, (int)opt); - rb_ary_clear(ary); + val = rb_reg_new_from_values(cnt, STACK_ADDR_FROM_TOP(cnt), (int)opt); } /* intern str to Symbol and push it. */ @@ -591,8 +589,7 @@ newhash RUBY_DTRACE_CREATE_HOOK(HASH, num); if (num) { - val = rb_hash_new_with_size(num / 2); - rb_hash_bulk_insert(num, STACK_ADDR_FROM_TOP(num), val); + val = rb_hash_new_with_bulk_insert(num, STACK_ADDR_FROM_TOP(num)); } else { val = rb_hash_new(); diff --git a/internal/hash.h b/internal/hash.h index 6671cd496d173f..baf5af9abd09e3 100644 --- a/internal/hash.h +++ b/internal/hash.h @@ -112,6 +112,7 @@ int rb_hash_stlike_foreach(VALUE hash, st_foreach_callback_func *func, st_data_t RUBY_SYMBOL_EXPORT_END VALUE rb_hash_new_with_size(st_index_t size); +VALUE rb_hash_new_with_bulk_insert(long argc, const VALUE *argv); VALUE rb_hash_resurrect(VALUE hash); int rb_hash_stlike_lookup(VALUE hash, st_data_t key, st_data_t *pval); VALUE rb_hash_keys(VALUE hash); diff --git a/internal/re.h b/internal/re.h index da165e4756969d..3ad364a1a69a1f 100644 --- a/internal/re.h +++ b/internal/re.h @@ -69,6 +69,7 @@ VALUE rb_backref_set_string(VALUE string, long pos, long len); void rb_match_unbusy(VALUE); int rb_match_count(VALUE match); VALUE rb_reg_new_ary(VALUE ary, int options); +VALUE rb_reg_new_from_values(long cnt, const VALUE *elements, int opt); VALUE rb_reg_last_defined(VALUE match); #define ARG_REG_OPTION_MASK \ diff --git a/pathname_builtin.rb b/pathname_builtin.rb index 63426812204082..ac436d49469273 100644 --- a/pathname_builtin.rb +++ b/pathname_builtin.rb @@ -1463,9 +1463,31 @@ def blockdev?() FileTest.blockdev?(@path) end # The returned value is OS-dependent; on Windows, almost always `false`. def chardev?() FileTest.chardev?(@path) end - # Tests the file is empty. + # :markup: markdown + # + # call-seq: + # empty? -> true or false + # + # Returns whether the entry represented by `self` exists and is empty: + # + # ```ruby + # dir_pn = Pathname('example_dir') + # dir_pn.empty? # => false # Dir does not exist. + # dir_pn.mkdir + # dir_pn.empty? # => true # Dir exists and is empty. + # + # file_pn = Pathname('example_dir/example.txt') + # file_pn.empty? # => false # File does not exist. + # file_pn.write('') + # file_pn.empty? # => true # File exists and is empty. + # dir_pn.empty? # => false # Dir exists and is not empty. + # file_pn.write('foo') + # file_pn.empty? # => false # File exists and is not empty. + # + # file_pn.delete + # dir_pn.delete + # ``` # - # See Dir#empty? and FileTest.empty?. def empty? if FileTest.directory?(@path) Dir.empty?(@path) @@ -1474,13 +1496,53 @@ def empty? end end - # See FileTest.executable?. + # :markup: markdown + # + # call-seq: + # executable? -> true or false + # + # Returns whether the entry represented by `self` is executable; + # calls FileTest.executable? with argument `self.to_s`: + # + # ```ruby + # Pathname('bin/gem').executable? # => true + # Pathname('README.md').executable? # => false + # ``` + # def executable?() FileTest.executable?(@path) end - # See FileTest.executable_real?. + # :markup: markdown + # + # call-seq: + # executable_real? -> true or false + # + # Returns whether the entry represented by `self` is executable + # by the real user and group id of the current process; + # calls FileTest.executable_real? with argument `self.to_s`: + # + # ```ruby + # pn = Pathname('example') + # pn.write('') + # pn.executable_real? # => false + # pn.chmod(0100) + # pn.executable_real? # => true + # ``` + # def executable_real?() FileTest.executable_real?(@path) end - # See FileTest.exist?. + # :markup: markdown + # + # call-seq: + # exist? -> true or false + # + # Returns whether the entry represented by `self` exists: + # + # ```ruby + # Pathname('.').exist? # => true + # Pathname('README.md').exist? # => true + # Pathname('nosuch').exist? # => false + # ``` + # def exist?() FileTest.exist?(@path) end # See FileTest.grpowned?. diff --git a/re.c b/re.c index b778fa08f331e6..ec337cd21cf2f6 100644 --- a/re.c +++ b/re.c @@ -3528,6 +3528,15 @@ rb_reg_new_ary(VALUE ary, int opt) return rb_reg_new_str(rb_reg_preprocess_dregexp(ary, opt), opt); } +VALUE +rb_reg_new_from_values(long cnt, const VALUE *elements, int opt) +{ + const VALUE ary = rb_ary_tmp_new_from_values(0, cnt, elements); + VALUE val = rb_reg_new_ary(ary, (int)opt); + rb_ary_clear(ary); + return val; +} + VALUE rb_enc_reg_new(const char *s, long len, rb_encoding *enc, int options) { diff --git a/string.c b/string.c index eb249662db74eb..a59340adfd7842 100644 --- a/string.c +++ b/string.c @@ -9772,6 +9772,7 @@ rb_str_enumerate_codepoints(VALUE str, VALUE ary) unsigned int c; const char *ptr, *end; rb_encoding *enc; + int enc_asciicompat; if (single_byte_optimizable(str)) return rb_str_enumerate_bytes(str, ary); @@ -9780,9 +9781,15 @@ rb_str_enumerate_codepoints(VALUE str, VALUE ary) ptr = RSTRING_PTR(str); end = RSTRING_END(str); enc = STR_ENC_GET(str); + enc_asciicompat = rb_enc_asciicompat(enc); while (ptr < end) { - c = rb_enc_codepoint_len(ptr, end, &n, enc); + /* Fast path: ASCII byte in an ASCII-compatible encoding is its own codepoint; + * skip rb_enc_codepoint_len and return the byte directly. + */ + n = 1; + c = (enc_asciicompat && ISASCII(*ptr)) ? + (unsigned char)*ptr : rb_enc_codepoint_len(ptr, end, &n, enc); ENUM_ELEM(ary, UINT2NUM(c)); ptr += n; } diff --git a/zjit/bindgen/src/main.rs b/zjit/bindgen/src/main.rs index 573eb37a72f59d..2cde74facd66c1 100644 --- a/zjit/bindgen/src/main.rs +++ b/zjit/bindgen/src/main.rs @@ -123,6 +123,7 @@ fn main() { .allowlist_function("rb_hash_aset") .allowlist_function("rb_hash_aref") .allowlist_function("rb_hash_bulk_insert") + .allowlist_function("rb_hash_new_with_bulk_insert") .allowlist_function("rb_hash_stlike_lookup") .allowlist_function("rb_ary_new_capa") .allowlist_function("rb_ary_store") @@ -215,6 +216,7 @@ fn main() { .allowlist_function("rb_reg_match_last") .allowlist_function("rb_reg_nth_match") .allowlist_function("rb_reg_new_ary") + .allowlist_function("rb_reg_new_from_values") .allowlist_var("ARG_ENCODING_FIXED") .allowlist_var("ARG_ENCODING_NONE") .allowlist_var("ONIG_OPTION_IGNORECASE") diff --git a/zjit/src/codegen.rs b/zjit/src/codegen.rs index f1ef17d794e7af..4eee7693150c20 100644 --- a/zjit/src/codegen.rs +++ b/zjit/src/codegen.rs @@ -2111,19 +2111,17 @@ fn gen_new_hash( elements: Vec, state: &FrameState, ) -> lir::Opnd { - gen_prepare_non_leaf_call(jit, asm, state); - - let cap: c_long = elements.len().try_into().expect("Unable to fit length of elements into c_long"); - let new_hash = asm_ccall!(asm, rb_hash_new_with_size, lir::Opnd::Imm(cap)); + if elements.is_empty() { + gen_prepare_leaf_call_with_gc(asm, state); + asm_ccall!(asm, rb_hash_new,) + } else { + gen_prepare_non_leaf_call(jit, asm, state); - if !elements.is_empty() { let argv = gen_push_opnds(asm, &elements); - asm_ccall!(asm, rb_hash_bulk_insert, elements.len().into(), argv, new_hash); - + let hash = asm_ccall!(asm, rb_hash_new_with_bulk_insert, elements.len().into(), argv); gen_pop_opnds(asm, &elements); + hash } - - new_hash } /// Compile a new range instruction @@ -3401,11 +3399,7 @@ fn gen_toregexp(jit: &mut JITState, asm: &mut Assembler, opt: usize, values: Vec gen_prepare_non_leaf_call(jit, asm, state); let first_opnd_ptr = gen_push_opnds(asm, &values); - - let tmp_ary = asm_ccall!(asm, rb_ary_tmp_new_from_values, Opnd::Imm(0), values.len().into(), first_opnd_ptr); - let result = asm_ccall!(asm, rb_reg_new_ary, tmp_ary, opt.into()); - asm_ccall!(asm, rb_ary_clear, tmp_ary); - + let result = asm_ccall!(asm, rb_reg_new_from_values, values.len().into(), first_opnd_ptr, opt.into()); gen_pop_opnds(asm, &values); result diff --git a/zjit/src/cruby_bindings.inc.rs b/zjit/src/cruby_bindings.inc.rs index c61e61edd1bdec..5a7c3de606c5f1 100644 --- a/zjit/src/cruby_bindings.inc.rs +++ b/zjit/src/cruby_bindings.inc.rs @@ -2060,6 +2060,11 @@ unsafe extern "C" { pub fn rb_class_allocate_instance(klass: VALUE) -> VALUE; pub fn rb_obj_equal(obj1: VALUE, obj2: VALUE) -> VALUE; pub fn rb_reg_new_ary(ary: VALUE, options: ::std::os::raw::c_int) -> VALUE; + pub fn rb_reg_new_from_values( + cnt: ::std::os::raw::c_long, + elements: *const VALUE, + opt: ::std::os::raw::c_int, + ) -> VALUE; pub fn rb_ary_tmp_new_from_values( arg1: VALUE, arg2: ::std::os::raw::c_long, @@ -2132,6 +2137,7 @@ unsafe extern "C" { arg: st_data_t, ) -> ::std::os::raw::c_int; pub fn rb_hash_new_with_size(size: st_index_t) -> VALUE; + pub fn rb_hash_new_with_bulk_insert(argc: ::std::os::raw::c_long, argv: *const VALUE) -> VALUE; pub fn rb_hash_resurrect(hash: VALUE) -> VALUE; pub fn rb_hash_stlike_lookup( hash: VALUE,