diff --git a/compiler/rustc_codegen_cranelift/src/driver/jit.rs b/compiler/rustc_codegen_cranelift/src/driver/jit.rs index 3118105a4e2d7..62a3e09691234 100644 --- a/compiler/rustc_codegen_cranelift/src/driver/jit.rs +++ b/compiler/rustc_codegen_cranelift/src/driver/jit.rs @@ -325,7 +325,7 @@ fn dep_symbol_lookup_fn( Linkage::NotLinked | Linkage::IncludedFromDylib => {} Linkage::Static => { let name = crate_info.crate_name[&cnum]; - let mut err = sess.struct_err(&format!("Can't load static lib {}", name)); + let mut err = sess.struct_err(format!("Can't load static lib {}", name)); err.note("rustc_codegen_cranelift can only load dylibs in JIT mode."); err.emit(); } diff --git a/compiler/rustc_codegen_cranelift/src/intrinsics/mod.rs b/compiler/rustc_codegen_cranelift/src/intrinsics/mod.rs index 1e83c30bd677a..38c120fd4ba86 100644 --- a/compiler/rustc_codegen_cranelift/src/intrinsics/mod.rs +++ b/compiler/rustc_codegen_cranelift/src/intrinsics/mod.rs @@ -567,6 +567,130 @@ fn codegen_regular_intrinsic_call<'tcx>( // FIXME use emit_small_memset fx.bcx.call_memset(fx.target_config, dst_ptr, val, count); } + + sym::swap_nonoverlapping_single => { + intrinsic_args!(fx, args => (x_ptr, y_ptr); intrinsic); + let pointee_ty = x_ptr.layout().ty.builtin_deref(true).unwrap().ty; + let pointee_layout = fx.layout_of(pointee_ty); + + // ZSTs swap is noop. + if pointee_layout.size != Size::ZERO { + // Probably, it would be better to have dedicated method for this in + // `cranelift_frontend::FunctionBuilder` + // with optimizations based on size and alignment of values. + + let x_ptr_val = x_ptr.load_scalar(fx); + let y_ptr_val = y_ptr.load_scalar(fx); + + let tmp_place = CPlace::new_stack_slot(fx, pointee_layout); + let tmp_ptr_val = tmp_place.to_ptr().get_addr(fx); + + let size_bytes = pointee_layout.size.bytes(); + let align_bytes: u8 = pointee_layout.align.abi.bytes().try_into().unwrap(); + fx.bcx.emit_small_memory_copy( + fx.target_config, + tmp_ptr_val, + x_ptr_val, + size_bytes, + align_bytes, + align_bytes, + true, + MemFlags::trusted(), + ); + fx.bcx.emit_small_memory_copy( + fx.target_config, + x_ptr_val, + y_ptr_val, + size_bytes, + align_bytes, + align_bytes, + true, + MemFlags::trusted(), + ); + fx.bcx.emit_small_memory_copy( + fx.target_config, + y_ptr_val, + tmp_ptr_val, + size_bytes, + align_bytes, + align_bytes, + true, + MemFlags::trusted(), + ); + } + } + + sym::swap_nonoverlapping_many => { + intrinsic_args!(fx, args => (x_ptr, y_ptr, count); intrinsic); + let pointee_ty = x_ptr.layout().ty.builtin_deref(true).unwrap().ty; + let pointee_layout = fx.layout_of(pointee_ty); + + // ZSTs swap is noop. 
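+            // Swap element-by-element: each iteration copies `x[i]` into a stack
+            // temporary, then `y[i]` into `x[i]`, then the temporary into `y[i]`.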
+ if pointee_layout.size != Size::ZERO { + let x_ptr_val = x_ptr.load_scalar(fx); + let y_ptr_val = y_ptr.load_scalar(fx); + + let count = count.load_scalar(fx); + + let tmp_place = CPlace::new_stack_slot(fx, pointee_layout); + let tmp_ptr_val = tmp_place.to_ptr().get_addr(fx); + + let elem_size_bytes = pointee_layout.size.bytes(); + let align_bytes: u8 = pointee_layout.align.abi.bytes().try_into().unwrap(); + + let loop_header = fx.bcx.create_block(); + let loop_body = fx.bcx.create_block(); + let loop_done = fx.bcx.create_block(); + + let index = fx.bcx.append_block_param(loop_header, fx.pointer_type); + let zero = fx.bcx.ins().iconst(fx.pointer_type, 0); + fx.bcx.ins().jump(loop_header, &[zero]); + + fx.bcx.switch_to_block(loop_header); + let is_done = fx.bcx.ins().icmp(IntCC::Equal, index, count); + fx.bcx.ins().brif(is_done, loop_done, &[], loop_body, &[]); + + fx.bcx.switch_to_block(loop_body); + let curr_x_ptr_val = fx.bcx.ins().iadd(x_ptr_val, index); + let curr_y_ptr_val = fx.bcx.ins().iadd(y_ptr_val, index); + fx.bcx.emit_small_memory_copy( + fx.target_config, + tmp_ptr_val, + curr_x_ptr_val, + elem_size_bytes, + align_bytes, + align_bytes, + true, + MemFlags::trusted(), + ); + fx.bcx.emit_small_memory_copy( + fx.target_config, + curr_x_ptr_val, + curr_y_ptr_val, + elem_size_bytes, + align_bytes, + align_bytes, + true, + MemFlags::trusted(), + ); + fx.bcx.emit_small_memory_copy( + fx.target_config, + curr_y_ptr_val, + tmp_ptr_val, + elem_size_bytes, + align_bytes, + align_bytes, + true, + MemFlags::trusted(), + ); + let next_index = fx.bcx.ins().iadd_imm(index, 1); + fx.bcx.ins().jump(loop_header, &[next_index]); + + fx.bcx.switch_to_block(loop_done); + fx.bcx.ins().nop(); + } + } + sym::ctlz | sym::ctlz_nonzero => { intrinsic_args!(fx, args => (arg); intrinsic); let val = arg.load_scalar(fx); diff --git a/compiler/rustc_codegen_gcc/src/builder.rs b/compiler/rustc_codegen_gcc/src/builder.rs index f9ea0f004564b..3eec43d3745f8 100644 --- a/compiler/rustc_codegen_gcc/src/builder.rs +++ b/compiler/rustc_codegen_gcc/src/builder.rs @@ -1070,6 +1070,55 @@ impl<'a, 'gcc, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'gcc, 'tcx> { self.block.add_eval(None, self.context.new_call(None, memset, &[ptr, fill_byte, size])); } + fn make_memory_loop( + &mut self, + loop_name: &str, + start_ptrs: [Self::Value; VAR_COUNT], + steps: [Size; VAR_COUNT], + iterations: Self::Value, + body_visitor: BodyPtrsVisitor, + ) where + BodyPtrsVisitor: FnOnce(&mut Self, &[Self::Value; VAR_COUNT]), + { + assert!(VAR_COUNT > 0, "VAR_COUNT must be bigger than zero."); + + for step in steps { + assert_ne!(step.bytes(), 0, "We are iterating over memory, ZSTs unexpected."); + } + + let header_bb = self.append_sibling_block(&format!("{}_header", loop_name)); + let body_bb = self.append_sibling_block(&format!("{}_body", loop_name)); + let next_bb = self.append_sibling_block(&format!("{}_next", loop_name)); + + let zero = self.const_usize(0); + let additions: [Self::Value; VAR_COUNT] = steps.map(|st| self.const_usize(st.bytes())); + + let loop_i = self.llbb().get_function().new_local(None, self.type_size_t(), "loop_i"); + self.assign(loop_i, zero); + let loop_i_val = loop_i.to_rvalue(); + + self.br(header_bb); + + self.switch_to_block(header_bb); + let keep_going = self.icmp(IntPredicate::IntNE, loop_i_val, iterations); + self.cond_br(keep_going, body_bb, next_bb); + + self.switch_to_block(body_bb); + let current_ptrs: [Self::Value; VAR_COUNT] = core::array::from_fn( + |i|{ + let start = self.pointercast(start_ptrs[i], 
self.type_i8p()); + let offset = self.unchecked_umul(additions[i], loop_i_val); + self.inbounds_gep(self.type_i8(), start, &[offset]) + } + ); + body_visitor(self, ¤t_ptrs); + let next_i = self.unchecked_uadd(loop_i_val, self.const_usize(1)); + self.assign(loop_i, next_i); + self.br(header_bb); + + self.switch_to_block(next_bb); + } + fn select(&mut self, cond: RValue<'gcc>, then_val: RValue<'gcc>, mut else_val: RValue<'gcc>) -> RValue<'gcc> { let func = self.current_func(); let variable = func.new_local(None, then_val.get_type(), "selectVar"); diff --git a/compiler/rustc_codegen_llvm/src/builder.rs b/compiler/rustc_codegen_llvm/src/builder.rs index b4aa001547c4c..adf96ca8cd240 100644 --- a/compiler/rustc_codegen_llvm/src/builder.rs +++ b/compiler/rustc_codegen_llvm/src/builder.rs @@ -935,6 +935,54 @@ impl<'a, 'll, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> { } } + fn make_memory_loop( + &mut self, + loop_name: &str, + start_ptrs: [Self::Value; VAR_COUNT], + steps: [Size; VAR_COUNT], + iterations: Self::Value, + body_visitor: BodyPtrsVisitor, + ) where + BodyPtrsVisitor: FnOnce(&mut Self, &[Self::Value; VAR_COUNT]), + { + const { + assert!(VAR_COUNT > 0, "VAR_COUNT must be bigger than zero."); + } + for step in steps { + assert_ne!(step.bytes(), 0, "We are iterating over memory, ZSTs unexpected."); + } + + let zero = self.const_usize(0); + let additions: [Self::Value; VAR_COUNT] = steps.map(|st| self.const_usize(st.bytes())); + + let header_bb = self.append_sibling_block(&format!("{}_header", loop_name)); + let body_bb = self.append_sibling_block(&format!("{}_body", loop_name)); + let next_bb = self.append_sibling_block(&format!("{}_next", loop_name)); + self.br(header_bb); + + let mut header_bx = Builder::build(self.cx, header_bb); + // Use integer for iteration instead of pointers because LLVM canonicalize loop into indexed anyway. + let loop_i = header_bx.phi(self.type_isize(), &[zero], &[self.llbb()]); + let keep_going = header_bx.icmp(IntPredicate::IntNE, loop_i, iterations); + header_bx.cond_br(keep_going, body_bb, next_bb); + + let mut body_bx = Builder::build(self.cx, body_bb); + let current_ptrs: [Self::Value; VAR_COUNT] = std::array::from_fn(|i| { + let start = start_ptrs[i]; + // FIXME: Remove pointercast after dropping supporting of LLVM 14. 
+ let start = self.pointercast(start, self.type_i8p()); + let addition = additions[i]; + let offset = body_bx.unchecked_umul(loop_i, addition); + body_bx.inbounds_gep(body_bx.type_i8(), start, &[offset]) + }); + body_visitor(&mut body_bx, ¤t_ptrs); + let next_i = body_bx.unchecked_uadd(loop_i, body_bx.const_usize(1)); + header_bx.add_incoming_to_phi(loop_i, next_i, body_bb); + body_bx.br(header_bb); + + *self = Builder::build(self.cx, next_bb); + } + fn select( &mut self, cond: &'ll Value, diff --git a/compiler/rustc_codegen_llvm/src/lib.rs b/compiler/rustc_codegen_llvm/src/lib.rs index 24968e00cc8e5..35ac7b33d3ff7 100644 --- a/compiler/rustc_codegen_llvm/src/lib.rs +++ b/compiler/rustc_codegen_llvm/src/lib.rs @@ -7,6 +7,7 @@ #![doc(html_root_url = "https://doc.rust-lang.org/nightly/nightly-rustc/")] #![feature(extern_types)] #![feature(hash_raw_entry)] +#![feature(inline_const)] #![feature(iter_intersperse)] #![feature(let_chains)] #![feature(never_type)] diff --git a/compiler/rustc_codegen_ssa/src/mir/intrinsic.rs b/compiler/rustc_codegen_ssa/src/mir/intrinsic.rs index 9ac2424e76be0..bf9610e5ee6ce 100644 --- a/compiler/rustc_codegen_ssa/src/mir/intrinsic.rs +++ b/compiler/rustc_codegen_ssa/src/mir/intrinsic.rs @@ -9,12 +9,12 @@ use crate::meth; use crate::traits::*; use crate::MemFlags; -use rustc_middle::ty::{self, Ty, TyCtxt}; -use rustc_span::{sym, Span}; -use rustc_target::abi::{ - call::{FnAbi, PassMode}, - WrappingRange, -}; +use rustc_middle::ty; +use rustc_middle::ty::{Ty, TyCtxt}; +use rustc_span::sym; +use rustc_span::Span; +use rustc_target::abi::call::{FnAbi, PassMode}; +use rustc_target::abi::WrappingRange; fn copy_intrinsic<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>( bx: &mut Bx, @@ -37,6 +37,442 @@ fn copy_intrinsic<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>( } } +mod swap_intrinsic { + use crate::traits::*; + use crate::MemFlags; + + use rustc_middle::mir::interpret::PointerArithmetic; + use rustc_middle::ty::Ty; + use rustc_span::Span; + use rustc_target::abi::{Align, Size}; + use rustc_target::spec::HasTargetSpec; + + // Note: We deliberately interpret our values as some ranges of bytes + // for performance like did earlier in the old `core::mem::swap` implementation + // and use immediate values instead of PlaceRefs. + pub(super) fn single<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>( + bx: &mut Bx, + x_ptr: Bx::Value, + y_ptr: Bx::Value, + ty: Ty<'tcx>, + span: Span, + ) { + let layout = bx.layout_of(ty); + if layout.is_unsized() { + span_bug!(span, "swap_nonoverlapping_single must be called only for sized types"); + } + if layout.is_zst() { + // no-op + return; + } + let should_use_2_temp_vals = { + // Primitive integer or something equal to it by size. + (layout.size <= bx.cx().pointer_size() && layout.size.bytes().is_power_of_two()) + // SPIR-V doesn't allow reinterpretation of values as chunks of arbitrary ints + // so we need to read and copy them full. + // For small values we use double read-double write. + || (layout.size <= bx.cx().pointer_size() && bx.cx().target_spec().arch == "spirv") + }; + if should_use_2_temp_vals { + let ty = bx.backend_type(layout); + let align = layout.align.abi; + swap_using_2_temps(bx, x_ptr, y_ptr, ty, align); + return; + } + + // If need to swap large value, + // it probably better to do single memcpy from one elem + // to another after saving the old value. + let should_use_single_temp_val = { + // Most likely some `Simd` type from portable simd or manual simd. 
+ // There is no difference with double read in release build + // but it reduces amount of code generated in debug build. + (layout.align.abi.bytes() == layout.size.bytes() && layout.size > bx.cx().pointer_size()) + // Probably aggregate with some SIMD type field. + // E.g. `Option`. + // Need to think how to do it better. + || layout.align.abi > bx.data_layout().pointer_align.abi + // SPIRV doesn't allow partial reads/writes and value reinterpretations + // so our best chance to reduce stack usage is to use single alloca. + || bx.cx().target_spec().arch == "spirv" + }; + if should_use_single_temp_val { + let ty = bx.backend_type(layout); + swap_using_single_temp(bx, x_ptr, y_ptr, ty, layout.size, layout.align.abi); + return; + } + + // Both LLVM and GCC seem to benefit from same splitting loops + // so place this code here to prevent duplication. + // https://godbolt.org/z/arzvePb8T + + if bx.cx().target_spec().arch == "x86_64" { + swap_unaligned_x86_64_single(bx, layout, x_ptr, y_ptr); + return; + } + + // Swap using aligned integers as chunks. + assert!(layout.align.abi.bytes() <= bx.pointer_size().bytes()); + assert_eq!(bx.data_layout().pointer_align.abi.bytes(), bx.pointer_size().bytes()); + let chunk_size = std::cmp::min(layout.align.abi.bytes(), bx.pointer_size().bytes()); + let chunk_size = Size::from_bytes(chunk_size); + make_swaps_loop( + bx, + x_ptr, + y_ptr, + ToSwap::Bytes(layout.size), + ChunkInfo::IntChunk(chunk_size), + NumOfTemps::Two, + Align::from_bytes(chunk_size.bytes()).unwrap(), + ); + } + + // `x86_64` allows optimization using unaligned accesses + // because unaligned reads/writes are fast on x86_64. + // https://lemire.me/blog/2012/05/31/data-alignment-for-speed-myth-or-reality/ + // We manually swap last `x % ZMM_BYTES` bytes in a way that would always vectorize + // them AVX and/or SSE because both GCC and LLVM generate fails to use smaller SIMD registers + // if they had used larger ones. + fn swap_unaligned_x86_64_single<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>( + bx: &mut Bx, + layout: Bx::LayoutOfResult, + x_ptr: Bx::Value, + y_ptr: Bx::Value, + ) { + const ZMM_BYTES: u64 = 512 / 8; + const YMM_BYTES: u64 = 256 / 8; + const XMM_BYTES: u64 = 128 / 8; + + let min_align = Align::from_bytes(1).expect("One is always valid align."); + let ptr_size = bx.cx().pointer_size(); + // Need to do pointercasts because `rustc_codegen_gcc` ignores passed type + // in `inbounds_gep`. + let x_ptr = bx.pointercast(x_ptr, bx.type_i8p()); + let y_ptr = bx.pointercast(y_ptr, bx.type_i8p()); + + let mut total_offset = Size::ZERO; + // Make a loop that is vectorized using largest vectors. + // It would use largest available vectors, not necessary ZMM. + if layout.size.bytes() >= ZMM_BYTES { + let to_swap = Size::from_bytes(layout.size.bytes() / ZMM_BYTES * ZMM_BYTES); + make_swaps_loop( + bx, + x_ptr, + y_ptr, + ToSwap::Bytes(to_swap), + ChunkInfo::IntChunk(ptr_size), + NumOfTemps::Two, + min_align, + ); + total_offset += to_swap; + } + // This loop contents are based on knowledge from this: https://godbolt.org/z/Mr4rWfoad + // And this: https://godbolt.org/z/YzcWofG5Y + // Both LLVM and GCC fail to use SIMD registers for swapping tails without this. 
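+        // For each register width that still fits, load x's chunk into pointer-sized
+        // temporaries, memcpy y's chunk over x's, then store the temporaries into y.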
+ for (num_temps, chunk_size) in [(4, YMM_BYTES), (2, XMM_BYTES)] { + let chunk_size = Size::from_bytes(chunk_size); + assert_eq!( + ptr_size * num_temps, + chunk_size, + "Invalid assumption about pointer size or register size", + ); + if layout.size < total_offset + chunk_size { + continue; + } + + let x_tmps_and_offsets: Vec<_> = (0..num_temps) + .map(|i| { + let curr_off = total_offset + i * ptr_size; + let curr_off = bx.const_usize(curr_off.bytes()); + let x_gep = bx.inbounds_gep(bx.type_i8(), x_ptr, &[curr_off]); + // FIXME: Remove pointercast after stopping support of LLVM 14. + let x_gep = bx.pointercast(x_gep, bx.type_ptr_to(bx.type_isize())); + (bx.load(bx.type_isize(), x_gep, min_align), curr_off) + }) + .collect(); + + let chunk_size_val = bx.const_usize(chunk_size.bytes()); + let chunk_offset = bx.const_usize(total_offset.bytes()); + let x_chunk_gep = bx.inbounds_gep(bx.type_i8(), x_ptr, &[chunk_offset]); + let y_chunk_gep = bx.inbounds_gep(bx.type_i8(), y_ptr, &[chunk_offset]); + // FIXME(AngelicosPhosphoros): Use memcpy.inline here. + bx.memcpy( + x_chunk_gep, + min_align, + y_chunk_gep, + min_align, + chunk_size_val, + MemFlags::UNALIGNED, + ); + for (x_tmp, curr_off) in x_tmps_and_offsets { + let y_gep = bx.inbounds_gep(bx.type_i8(), y_ptr, &[curr_off]); + // FIXME: Remove pointercast after stopping support of LLVM 14. + let y_gep = bx.pointercast(y_gep, bx.type_ptr_to(bx.type_isize())); + bx.store(x_tmp, y_gep, min_align); + } + + total_offset += chunk_size; + } + + // I decided to use swaps by pow2 ints here based + // on this codegen example: https://godbolt.org/z/rWYqMGnWh + // This loops implements it using minimal amount of instructions + // and registers involved. + let mut current_size = bx.pointer_size(); + while total_offset < layout.size { + // In each loop iteration, remaining amount of unswapped bytes + // is less than in previous iteration. + + assert_ne!(current_size, Size::ZERO, "We must had finished swapping when it was 1"); + + let next_size = Size::from_bytes(current_size.bytes() / 2); + if total_offset + current_size > layout.size { + current_size = next_size; + continue; + } + + let tail_offset = bx.const_usize(total_offset.bytes()); + let x_tail_ptr = bx.inbounds_gep(bx.type_i8(), x_ptr, &[tail_offset]); + let y_tail_ptr = bx.inbounds_gep(bx.type_i8(), y_ptr, &[tail_offset]); + + let chunt_ty = choose_int_by_size(bx, current_size); + swap_using_2_temps(bx, x_tail_ptr, y_tail_ptr, chunt_ty, min_align); + + total_offset += current_size; + current_size = next_size; + } + } + + // We cannot use some of optimizations available for [`single`] + // because we don't know how many bytes exactly we need to swap. + pub(super) fn many<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>( + bx: &mut Bx, + x_ptr: Bx::Value, + y_ptr: Bx::Value, + count: Bx::Value, + ty: Ty<'tcx>, + span: Span, + ) { + let layout = bx.layout_of(ty); + if layout.is_unsized() { + span_bug!(span, "swap_nonoverlapping_many must be called only for sized types"); + } + if layout.is_zst() { + // no-op + return; + } + + let must_not_split_values = { + // Unusual type, maybe some manual SIMD optimization. + layout.align.abi > bx.data_layout().pointer_align.abi && layout.align.abi.bytes() == layout.size.bytes() + // Probably aggregate with some SIMD type field. + // E.g. `Option`. + // Need to think how to do it better. 
+ || layout.align.abi > bx.data_layout().pointer_align.abi + // SPIR-V doesn't allow reinterpretation of values as chunks of arbitrary ints + // so we need to read and copy them by element full. + || bx.cx().target_spec().arch == "spirv" + }; + + if must_not_split_values { + let back_ty = bx.backend_type(layout); + let num_of_temps = + if layout.size > bx.pointer_size() { NumOfTemps::Single } else { NumOfTemps::Two }; + make_swaps_loop( + bx, + x_ptr, + y_ptr, + ToSwap::Iterations(count), + ChunkInfo::RealTyChunk(back_ty, layout.size), + num_of_temps, + layout.align.abi, + ); + return; + } + + let chunk_size = if bx.cx().target_spec().arch == "x86_64" { + // x86_64 allows unaligned reads/writes + // and it is relatively fast + // so try largest chunk available. + const INT_SIZES: [u64; 4] = [1, 2, 4, 8]; + INT_SIZES + .into_iter() + .map(Size::from_bytes) + .take_while(|x| *x <= layout.size) + .filter(|x| layout.size.bytes() % x.bytes() == 0) + .last() + .unwrap() + } else { + // Fallback to integer with size equal to alignment + Size::from_bytes(layout.align.abi.bytes()) + }; + + let chunks_per_elem = layout.size.bytes() / chunk_size.bytes(); + assert_ne!(chunks_per_elem, 0); + let iterations = if chunks_per_elem == 1 { + count + } else { + let chunks_per_elem = bx.const_usize(chunks_per_elem); + bx.unchecked_umul(count, chunks_per_elem) + }; + + make_swaps_loop( + bx, + x_ptr, + y_ptr, + ToSwap::Iterations(iterations), + ChunkInfo::IntChunk(chunk_size), + NumOfTemps::Two, + // It iterates either by chunks equal to alignment + // or multiply of alignment so it would always be correct. + layout.align.abi, + ); + } + + fn choose_int_by_size<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>( + bx: &mut Bx, + size: Size, + ) -> Bx::Type { + match size.bits() { + 8 => bx.type_i8(), + 16 => bx.type_i16(), + 32 => bx.type_i32(), + 64 => bx.type_i64(), + 128 => bx.type_i128(), + _ => unreachable!("Unexpected target int {:?}.", size), + } + } + + #[derive(Clone, Copy)] + enum ToSwap { + /// Size of region to swap. Useful when we know exact value. + Bytes(Size), + /// Number of chunks to swap. For runtime value. + Iterations(BxValue), + } + + #[derive(Clone, Copy)] + enum ChunkInfo { + /// When we want to use it directly + RealTyChunk(BxType, Size), + /// When we want to split value by integer chunk. + IntChunk(Size), + } + + #[derive(Copy, Clone, Eq, PartialEq)] + enum NumOfTemps { + Single, + Two, + } + + fn make_swaps_loop<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>( + bx: &mut Bx, + x_ptr: Bx::Value, + y_ptr: Bx::Value, + to_swap: ToSwap, + chunk_info: ChunkInfo, + num_of_temps: NumOfTemps, + access_align: Align, + ) { + let (ChunkInfo::IntChunk(chunk_size) | ChunkInfo::RealTyChunk(_, chunk_size)) = chunk_info; + + assert_ne!(chunk_size, Size::ZERO); + + if let ToSwap::Bytes(total_bytes) = to_swap { + assert!( + total_bytes > chunk_size, + "No need to generate loop when simple swap is enough." + ); + assert_eq!( + total_bytes.bytes() % chunk_size.bytes(), + 0, + "Cannot split size of swap into chunks." 
+ ); + } + + assert_eq!( + chunk_size.bytes() % access_align.bytes(), + 0, + "Ensure that access align doesn't shift", + ); + + let chunk_ty = match chunk_info { + ChunkInfo::RealTyChunk(ty, _) => ty, + ChunkInfo::IntChunk(size) => choose_int_by_size(bx, size), + }; + + let iterations = match to_swap { + ToSwap::Bytes(s) => { + let iterations_val = s.bytes() / chunk_size.bytes(); + bx.const_usize(iterations_val) + } + ToSwap::Iterations(it) => it, + }; + + // Need to do pointercasts because `rustc_codegen_gcc` ignores passed type + // in `inbounds_gep`. + let x_ptr = bx.pointercast(x_ptr, bx.type_i8p()); + let y_ptr = bx.pointercast(y_ptr, bx.type_i8p()); + bx.make_memory_loop( + "swap_loop", + [x_ptr, y_ptr], + [chunk_size; 2], + iterations, + |body_bx, &[curr_x_ptr, curr_y_ptr]| match num_of_temps { + NumOfTemps::Single => swap_using_single_temp( + body_bx, + curr_x_ptr, + curr_y_ptr, + chunk_ty, + chunk_size, + access_align, + ), + NumOfTemps::Two => { + swap_using_2_temps(body_bx, curr_x_ptr, curr_y_ptr, chunk_ty, access_align) + } + }, + ); + } + + fn swap_using_2_temps<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>( + bx: &mut Bx, + x_ptr: Bx::Value, + y_ptr: Bx::Value, + tmp_ty: Bx::Type, + access_align: Align, + ) { + // FIXME: Remove pointercast when stop support of LLVM 14. + let tmp_ptr_ty = bx.type_ptr_to(tmp_ty); + let x_ptr = bx.pointercast(x_ptr, tmp_ptr_ty); + let y_ptr = bx.pointercast(y_ptr, tmp_ptr_ty); + + let tmp_x = bx.load(tmp_ty, x_ptr, access_align); + let tmp_y = bx.load(tmp_ty, y_ptr, access_align); + bx.store(tmp_y, x_ptr, access_align); + bx.store(tmp_x, y_ptr, access_align); + } + + fn swap_using_single_temp<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>( + bx: &mut Bx, + x_ptr: Bx::Value, + y_ptr: Bx::Value, + tmp_ty: Bx::Type, + tmp_size: Size, + access_align: Align, + ) { + // FIXME: Remove pointercast when stop support of LLVM 14. + let tmp_ptr_ty = bx.type_ptr_to(tmp_ty); + let x_ptr = bx.pointercast(x_ptr, tmp_ptr_ty); + let y_ptr = bx.pointercast(y_ptr, tmp_ptr_ty); + + let num_bytes = bx.const_usize(tmp_size.bytes()); + let tmp_x = bx.load(tmp_ty, x_ptr, access_align); + // FIXME(AngelicosPhosphoros): Use memcpy.inline here. + bx.memcpy(x_ptr, access_align, y_ptr, access_align, num_bytes, MemFlags::empty()); + bx.store(tmp_x, y_ptr, access_align); + } +} + fn memset_intrinsic<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>( bx: &mut Bx, volatile: bool, @@ -154,6 +590,27 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> { ); return; } + sym::swap_nonoverlapping_single => { + swap_intrinsic::single( + bx, + args[0].immediate(), + args[1].immediate(), + substs.type_at(0), + span, + ); + return; + } + sym::swap_nonoverlapping_many => { + swap_intrinsic::many( + bx, + args[0].immediate(), + args[1].immediate(), + args[2].immediate(), + substs.type_at(0), + span, + ); + return; + } sym::write_bytes => { memset_intrinsic( bx, diff --git a/compiler/rustc_codegen_ssa/src/traits/builder.rs b/compiler/rustc_codegen_ssa/src/traits/builder.rs index 853c6934c2c24..a8fea579875bc 100644 --- a/compiler/rustc_codegen_ssa/src/traits/builder.rs +++ b/compiler/rustc_codegen_ssa/src/traits/builder.rs @@ -257,6 +257,21 @@ pub trait BuilderMethods<'a, 'tcx>: flags: MemFlags, ); + /// Loop that iterates over some memory using offsets steps. + /// Interprets pointers as u8 pointers. + /// `BodyPtrsVisitor` allow access to body and current iteration pointers. + /// Steps MUST not be zeros. + /// `steps[i]*iterations` MUST not overflow targets `usize`. 
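+    /// The `visitor` closure is invoked once while the loop body is being built; the
+    /// pointers it receives evaluate at run time to `start_ptrs[i] + steps[i] * iteration`.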
+ fn make_memory_loop( + &mut self, + loop_name: &str, + start_ptrs: [Self::Value; VAR_COUNT], + steps: [Size; VAR_COUNT], + iterations: Self::Value, + visitor: BodyPtrsVisitor, + ) where + BodyPtrsVisitor: FnOnce(&mut Self, &[Self::Value; VAR_COUNT]); + fn select( &mut self, cond: Self::Value, diff --git a/compiler/rustc_const_eval/src/interpret/intrinsics.rs b/compiler/rustc_const_eval/src/interpret/intrinsics.rs index 7192bbc00d556..3ae610e2954e2 100644 --- a/compiler/rustc_const_eval/src/interpret/intrinsics.rs +++ b/compiler/rustc_const_eval/src/interpret/intrinsics.rs @@ -291,6 +291,24 @@ impl<'mir, 'tcx: 'mir, M: Machine<'mir, 'tcx>> InterpCx<'mir, 'tcx, M> { sym::write_bytes => { self.write_bytes_intrinsic(&args[0], &args[1], &args[2])?; } + sym::swap_nonoverlapping_single => { + let layout = self.layout_of(substs.type_at(0))?; + self.mem_swap_nonoverlapping( + self.read_pointer(&args[0])?, + self.read_pointer(&args[1])?, + 1, + layout, + )?; + } + sym::swap_nonoverlapping_many => { + let layout = self.layout_of(substs.type_at(0))?; + self.mem_swap_nonoverlapping( + self.read_pointer(&args[0])?, + self.read_pointer(&args[1])?, + self.read_target_usize(&args[2])?, + layout, + )?; + } sym::arith_offset => { let ptr = self.read_pointer(&args[0])?; let offset_count = self.read_target_isize(&args[1])?; diff --git a/compiler/rustc_const_eval/src/interpret/memory.rs b/compiler/rustc_const_eval/src/interpret/memory.rs index 1125d8d1f0e08..eb33c1f4fdb93 100644 --- a/compiler/rustc_const_eval/src/interpret/memory.rs +++ b/compiler/rustc_const_eval/src/interpret/memory.rs @@ -21,10 +21,10 @@ use rustc_target::abi::{Align, HasDataLayout, Size}; use crate::const_eval::CheckAlignment; use crate::fluent_generated as fluent; +use super::alloc_range; use super::{ - alloc_range, AllocBytes, AllocId, AllocMap, AllocRange, Allocation, CheckInAllocMsg, - GlobalAlloc, InterpCx, InterpResult, Machine, MayLeak, Pointer, PointerArithmetic, Provenance, - Scalar, + AllocBytes, AllocId, AllocMap, AllocRange, Allocation, CheckInAllocMsg, GlobalAlloc, InterpCx, + InterpResult, Machine, MayLeak, Pointer, PointerArithmetic, Provenance, Scalar, }; #[derive(Debug, PartialEq, Copy, Clone)] @@ -1222,6 +1222,59 @@ impl<'mir, 'tcx: 'mir, M: Machine<'mir, 'tcx>> InterpCx<'mir, 'tcx, M> { Ok(()) } + + pub fn mem_swap_nonoverlapping( + &mut self, + x_ptr: Pointer>, + y_ptr: Pointer>, + count: u64, + layout: ty::layout::TyAndLayout<'tcx>, + ) -> InterpResult<'tcx> { + let elem_size = layout.size; + let align = layout.align.abi; + + if count > i64::MAX as u64 { + throw_ub_format!("`count` argument to `swap_nonoverlapping_many` is too large."); + } + + let first_ptr_acc = self.get_ptr_access(x_ptr, elem_size * count, align)?; + let second_ptr_acc = self.get_ptr_access(y_ptr, elem_size * count, align)?; + + let Some((x_alloc_id, x_offset, _)) = first_ptr_acc else { + assert_eq!(elem_size, Size::ZERO); + // Called on ZST so it is noop. 
+                return Ok(())
+            };
+            let Some((y_alloc_id, y_offset, _)) = second_ptr_acc else {
+                unreachable!("If the right operand is a ZST, the left one must be too")
+            };
+
+            if x_alloc_id == y_alloc_id {
+                if (x_offset..x_offset + elem_size * count).contains(&y_offset)
+                    || (y_offset..y_offset + elem_size * count).contains(&x_offset)
+                {
+                    throw_ub_format!("swap was called on overlapping memory.");
+                }
+            }
+
+            if count == 0 {
+                return Ok(());
+            }
+
+            let tmp_stack_alloc = self.allocate(layout, MemoryKind::Stack)?;
+
+            for i in 0..i64::try_from(count).unwrap() {
+                let curr_x_ptr = self.ptr_offset_inbounds(x_ptr, layout.ty, i)?;
+                let curr_y_ptr = self.ptr_offset_inbounds(y_ptr, layout.ty, i)?;
+
+                self.mem_copy(curr_x_ptr, align, tmp_stack_alloc.ptr, align, elem_size, true)?;
+                self.mem_copy(curr_y_ptr, align, curr_x_ptr, align, elem_size, true)?;
+                self.mem_copy(tmp_stack_alloc.ptr, align, curr_y_ptr, align, elem_size, true)?;
+            }
+            self.deallocate_ptr(tmp_stack_alloc.ptr, Some((elem_size, align)), MemoryKind::Stack)?;
+
+            Ok(())
+        }
 }
 
 /// Machine pointer introspection.
diff --git a/compiler/rustc_hir_analysis/src/check/intrinsic.rs b/compiler/rustc_hir_analysis/src/check/intrinsic.rs
index 36c468e778986..b93bc57e2002e 100644
--- a/compiler/rustc_hir_analysis/src/check/intrinsic.rs
+++ b/compiler/rustc_hir_analysis/src/check/intrinsic.rs
@@ -278,6 +278,23 @@ pub fn check_intrinsic_type(tcx: TyCtxt<'_>, it: &hir::ForeignItem<'_>) {
             ],
             tcx.mk_unit(),
         ),
+        sym::swap_nonoverlapping_single => (
+            1,
+            vec![
+                tcx.mk_ptr(ty::TypeAndMut { ty: param(0), mutbl: hir::Mutability::Mut }),
+                tcx.mk_ptr(ty::TypeAndMut { ty: param(0), mutbl: hir::Mutability::Mut }),
+            ],
+            tcx.mk_unit(),
+        ),
+        sym::swap_nonoverlapping_many => (
+            1,
+            vec![
+                tcx.mk_ptr(ty::TypeAndMut { ty: param(0), mutbl: hir::Mutability::Mut }),
+                tcx.mk_ptr(ty::TypeAndMut { ty: param(0), mutbl: hir::Mutability::Mut }),
+                tcx.types.usize,
+            ],
+            tcx.mk_unit(),
+        ),
         sym::sqrtf32 => (0, vec![tcx.types.f32], tcx.types.f32),
         sym::sqrtf64 => (0, vec![tcx.types.f64], tcx.types.f64),
         sym::powif32 => (0, vec![tcx.types.f32, tcx.types.i32], tcx.types.f32),
diff --git a/compiler/rustc_span/src/symbol.rs b/compiler/rustc_span/src/symbol.rs
index c5ce2575fff06..aacee6a44ce1f 100644
--- a/compiler/rustc_span/src/symbol.rs
+++ b/compiler/rustc_span/src/symbol.rs
@@ -1480,6 +1480,8 @@ symbols! {
         sub_assign,
         sub_with_overflow,
         suggestion,
+        swap_nonoverlapping_many,
+        swap_nonoverlapping_single,
         sym,
         sync,
         t32,
diff --git a/library/core/src/intrinsics.rs b/library/core/src/intrinsics.rs
index 9b8612485ac1e..7eb145e99e73c 100644
--- a/library/core/src/intrinsics.rs
+++ b/library/core/src/intrinsics.rs
@@ -2768,6 +2768,67 @@ pub const unsafe fn copy<T>(src: *const T, dst: *mut T, count: usize) {
     }
 }
 
+#[cfg(not(bootstrap))]
+extern "rust-intrinsic" {
+    /// This is an implementation detail of [`crate::mem::swap`] and should
+    /// not be used anywhere else.
+    ///
+    /// Swaps two values, using as little extra memory as the target allows.
+    /// It exists to move target/backend-specific optimizations out of library code and
+    /// to make MIR-level optimizations simpler to implement.
+    ///
+    /// The operation is "untyped" in the sense that data may be uninitialized or otherwise violate the
+    /// requirements of `T`. The initialization state is preserved exactly.
+    ///
+    /// # Safety
+    ///
+    /// Behavior is undefined if any of the following conditions are violated:
+    ///
+    /// * Both `x` and `y` must be valid for both reads and writes of `size_of::<T>()` bytes.
+    ///
+    /// * Both `x` and `y` must be properly aligned.
+    ///
+    /// * The region of memory beginning at `x` with a size of `size_of::<T>()`
+    ///   bytes must *not* overlap with the region of memory beginning at `y`
+    ///   with the same size.
+    ///
+    /// Note that even if the effectively copied size (`size_of::<T>()`) is `0`,
+    /// the pointers must be non-null and properly aligned.
+    #[rustc_nounwind]
+    #[rustc_const_unstable(feature = "const_swap", issue = "83163")]
+    pub fn swap_nonoverlapping_single<T>(x: *mut T, y: *mut T);
+
+    /// This is an implementation detail of [`crate::ptr::swap_nonoverlapping`] and should
+    /// not be used anywhere else.
+    ///
+    /// Swaps two ranges of `count` values starting at `x` and `y`, using as little extra
+    /// memory as the target allows.
+    /// It exists to move target/backend-specific optimizations out of library code and
+    /// to make MIR-level optimizations simpler to implement.
+    ///
+    /// The operation is "untyped" in the sense that data may be uninitialized or otherwise violate the
+    /// requirements of `T`. The initialization state is preserved exactly.
+    ///
+    /// # Safety
+    ///
+    /// Behavior is undefined if any of the following conditions are violated:
+    ///
+    /// * Both `x` and `y` must be valid for both reads and writes of `count *
+    ///   size_of::<T>()` bytes.
+    ///
+    /// * Both `x` and `y` must be properly aligned.
+    ///
+    /// * The region of memory beginning at `x` with a size of `count *
+    ///   size_of::<T>()` bytes must *not* overlap with the region of memory
+    ///   beginning at `y` with the same size.
+    ///
+    /// Note that even if the effectively copied size (`count * size_of::<T>()`) is `0`,
+    /// the pointers must be non-null and properly aligned.
+    #[rustc_nounwind]
+    #[rustc_const_unstable(feature = "const_swap", issue = "83163")]
+    pub fn swap_nonoverlapping_many<T>(x: *mut T, y: *mut T, count: usize);
+}
+
 /// Sets `count * size_of::<T>()` bytes of memory starting at `dst` to
 /// `val`.
 ///
diff --git a/library/core/src/mem/mod.rs b/library/core/src/mem/mod.rs
index 39c9a04eea92b..9b374745cd3c5 100644
--- a/library/core/src/mem/mod.rs
+++ b/library/core/src/mem/mod.rs
@@ -724,40 +724,52 @@ pub unsafe fn uninitialized<T>() -> T {
 #[stable(feature = "rust1", since = "1.0.0")]
 #[rustc_const_unstable(feature = "const_swap", issue = "83163")]
 pub const fn swap<T>(x: &mut T, y: &mut T) {
-    // NOTE(eddyb) SPIR-V's Logical addressing model doesn't allow for arbitrary
-    // reinterpretation of values as (chunkable) byte arrays, and the loop in the
-    // block optimization in `swap_slice` is hard to rewrite back
-    // into the (unoptimized) direct swapping implementation, so we disable it.
-    // FIXME(eddyb) the block optimization also prevents MIR optimizations from
-    // understanding `mem::replace`, `Option::take`, etc. - a better overall
-    // solution might be to make `ptr::swap_nonoverlapping` into an intrinsic, which
-    // a backend can choose to implement using the block optimization, or not.
-    #[cfg(not(any(target_arch = "spirv")))]
+    #[cfg(bootstrap)]
     {
-        // For types that are larger multiples of their alignment, the simple way
-        // tends to copy the whole thing to stack rather than doing it one part
-        // at a time, so instead treat them as one-element slices and piggy-back
-        // the slice optimizations that will split up the swaps.
-        if size_of::<T>() / align_of::<T>() > 4 {
-            // SAFETY: exclusive references always point to one non-overlapping
-            // element and are non-null and properly aligned.
- return unsafe { ptr::swap_nonoverlapping(x, y, 1) }; + // NOTE(eddyb) SPIR-V's Logical addressing model doesn't allow for arbitrary + // reinterpretation of values as (chunkable) byte arrays, and the loop in the + // block optimization in `swap_slice` is hard to rewrite back + // into the (unoptimized) direct swapping implementation, so we disable it. + // FIXME(eddyb) the block optimization also prevents MIR optimizations from + // understanding `mem::replace`, `Option::take`, etc. - a better overall + // solution might be to make `ptr::swap_nonoverlapping` into an intrinsic, which + // a backend can choose to implement using the block optimization, or not. + #[cfg(not(any(target_arch = "spirv")))] + { + // For types that are larger multiples of their alignment, the simple way + // tends to copy the whole thing to stack rather than doing it one part + // at a time, so instead treat them as one-element slices and piggy-back + // the slice optimizations that will split up the swaps. + if size_of::() / align_of::() > 4 { + // SAFETY: exclusive references always point to one non-overlapping + // element and are non-null and properly aligned. + return unsafe { ptr::swap_nonoverlapping(x, y, 1) }; + } } - } - // If a scalar consists of just a small number of alignment units, let - // the codegen just swap those pieces directly, as it's likely just a - // few instructions and anything else is probably overcomplicated. - // - // Most importantly, this covers primitives and simd types that tend to - // have size=align where doing anything else can be a pessimization. - // (This will also be used for ZSTs, though any solution works for them.) - swap_simple(x, y); + // If a scalar consists of just a small number of alignment units, let + // the codegen just swap those pieces directly, as it's likely just a + // few instructions and anything else is probably overcomplicated. + // + // Most importantly, this covers primitives and simd types that tend to + // have size=align where doing anything else can be a pessimization. + // (This will also be used for ZSTs, though any solution works for them.) + swap_simple(x, y); + } + #[cfg(not(bootstrap))] + // SAFETY: since `x` and `y` are mutable references, + // 1. `x` and `y` are initialized. + // 2. `x` and `y` cannot overlap. + // 3. `x` and `y` are aligned. + unsafe { + core::intrinsics::swap_nonoverlapping_single(x, y); + } } /// Same as [`swap`] semantically, but always uses the simple implementation. /// /// Used elsewhere in `mem` and `ptr` at the bottom layer of calls. +#[cfg(bootstrap)] #[rustc_const_unstable(feature = "const_swap", issue = "83163")] #[inline] pub(crate) const fn swap_simple(x: &mut T, y: &mut T) { diff --git a/library/core/src/ptr/mod.rs b/library/core/src/ptr/mod.rs index acc9ca29d41a1..a6de56da986ae 100644 --- a/library/core/src/ptr/mod.rs +++ b/library/core/src/ptr/mod.rs @@ -908,25 +908,6 @@ pub const unsafe fn swap(x: *mut T, y: *mut T) { #[stable(feature = "swap_nonoverlapping", since = "1.27.0")] #[rustc_const_unstable(feature = "const_swap", issue = "83163")] pub const unsafe fn swap_nonoverlapping(x: *mut T, y: *mut T, count: usize) { - #[allow(unused)] - macro_rules! 
attempt_swap_as_chunks { - ($ChunkTy:ty) => { - if mem::align_of::() >= mem::align_of::<$ChunkTy>() - && mem::size_of::() % mem::size_of::<$ChunkTy>() == 0 - { - let x: *mut $ChunkTy = x.cast(); - let y: *mut $ChunkTy = y.cast(); - let count = count * (mem::size_of::() / mem::size_of::<$ChunkTy>()); - // SAFETY: these are the same bytes that the caller promised were - // ok, just typed as `MaybeUninit`s instead of as `T`s. - // The `if` condition above ensures that we're not violating - // alignment requirements, and that the division is exact so - // that we don't lose any bytes off the end. - return unsafe { swap_nonoverlapping_simple_untyped(x, y, count) }; - } - }; - } - // SAFETY: the caller must guarantee that `x` and `y` are // valid for writes and properly aligned. unsafe { @@ -940,19 +921,16 @@ pub const unsafe fn swap_nonoverlapping(x: *mut T, y: *mut T, count: usize) { ); } - // Split up the slice into small power-of-two-sized chunks that LLVM is able - // to vectorize (unless it's a special type with more-than-pointer alignment, - // because we don't want to pessimize things like slices of SIMD vectors.) - if mem::align_of::() <= mem::size_of::() - && (!mem::size_of::().is_power_of_two() - || mem::size_of::() > mem::size_of::() * 2) - { - attempt_swap_as_chunks!(usize); - attempt_swap_as_chunks!(u8); + #[cfg(bootstrap)] + // SAFETY: Same preconditions as this function + unsafe { + swap_nonoverlapping_simple_untyped(x, y, count) } - + #[cfg(not(bootstrap))] // SAFETY: Same preconditions as this function - unsafe { swap_nonoverlapping_simple_untyped(x, y, count) } + unsafe { + intrinsics::swap_nonoverlapping_many(x, y, count) + } } /// Same behaviour and safety conditions as [`swap_nonoverlapping`] @@ -960,6 +938,7 @@ pub const unsafe fn swap_nonoverlapping(x: *mut T, y: *mut T, count: usize) { /// LLVM can vectorize this (at least it can for the power-of-two-sized types /// `swap_nonoverlapping` tries to use) so no need to manually SIMD it. 
#[inline] +#[cfg(bootstrap)] #[rustc_const_unstable(feature = "const_swap", issue = "83163")] const unsafe fn swap_nonoverlapping_simple_untyped(x: *mut T, y: *mut T, count: usize) { let x = x.cast::>(); diff --git a/library/core/tests/mem.rs b/library/core/tests/mem.rs index 5c2e18745ea21..251ebb274c520 100644 --- a/library/core/tests/mem.rs +++ b/library/core/tests/mem.rs @@ -103,6 +103,92 @@ fn test_swap() { assert_eq!(y, 31337); } +#[test] +fn test_many() { + // This tests if chunking works properly + fn swap_sized(a: T, b: T) { + let mut x: [T; SIZE] = [a; SIZE]; + let mut y: [T; SIZE] = [b; SIZE]; + swap::<[T; SIZE]>(&mut x, &mut y); + assert_eq!(x, [b; SIZE]); + assert_eq!(y, [a; SIZE]); + } + + fn swap_t(a: T, b: T) { + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + } + + swap_t::(7, 0xFF); + swap_t::(0xFAFA, 0x9898); + swap_t::(0xF0F0_F0F0, 0x0E0E_0E0E); + swap_t::(7, 8); + + #[derive(Eq, PartialEq, Debug)] + #[repr(align(32))] + struct LargeAlign([u8; 32]); + + let mut x = LargeAlign([9; 32]); + let mut y = LargeAlign([20; 32]); + swap(&mut x, &mut y); + assert_eq!(x, LargeAlign([20; 32])); + assert_eq!(y, LargeAlign([9; 32])); + + #[derive(Eq, PartialEq, Debug)] + #[repr(align(32))] + struct LargeAlignAndSize([u8; 96]); + + let mut x = LargeAlignAndSize([9; 96]); + let mut y = LargeAlignAndSize([20; 96]); + swap(&mut x, &mut y); + assert_eq!(x, LargeAlignAndSize([20; 96])); + assert_eq!(y, LargeAlignAndSize([9; 96])); + + #[derive(Eq, PartialEq, Debug)] + struct WithPadding { + a: u16, + b: u64, + } + + let mut x = WithPadding { a: 7, b: 27 }; + let mut y = WithPadding { a: 77, b: u64::MAX }; + swap(&mut x, &mut y); + assert_eq!(x, WithPadding { a: 77, b: u64::MAX }); + assert_eq!(y, WithPadding { a: 7, b: 27 }); +} + #[test] fn test_replace() { let mut x = Some("test".to_string()); diff --git a/library/core/tests/ptr.rs b/library/core/tests/ptr.rs index c02cd99cc4477..80837f040039b 100644 --- a/library/core/tests/ptr.rs +++ b/library/core/tests/ptr.rs @@ -1088,6 +1088,28 @@ fn swap_copy_untyped() { assert_eq!(y, 5); } +#[test] +fn test_swap_unaligned_on_x86_64() { + #[derive(Copy, Clone, Eq, PartialEq, Debug)] + struct AlignedTo2([u16; 4]); + + assert!( + mem::size_of::() >= mem::size_of::() + && mem::align_of::() == 2 + && mem::align_of::() < mem::align_of::() + ); + + let buff0: &mut [_] = &mut [AlignedTo2([1, 2, 3, 4]); 20]; + let buff1: &mut [_] = &mut [AlignedTo2([5, 6, 7, 8]); 20]; + let len = 20; + + unsafe { + swap_nonoverlapping(buff0.as_mut_ptr(), buff1.as_mut_ptr(), read_volatile(&len)); + } + assert_eq!(buff0, &[AlignedTo2([5, 6, 7, 8]); 20]); + assert_eq!(buff1, &[AlignedTo2([1, 2, 3, 4]); 20]); +} + #[test] fn test_const_copy() { const { diff --git a/tests/codegen/swap-large-types.rs b/tests/codegen/swap-large-types.rs index 4a68403578d1e..69caa87514f29 100644 --- 
a/tests/codegen/swap-large-types.rs +++ b/tests/codegen/swap-large-types.rs @@ -16,7 +16,7 @@ type KeccakBuffer = [[u64; 5]; 5]; // CHECK-LABEL: @swap_basic #[no_mangle] pub fn swap_basic(x: &mut KeccakBuffer, y: &mut KeccakBuffer) { -// CHECK: alloca [5 x [5 x i64]] + // CHECK: alloca [5 x [5 x i64]] // SAFETY: exclusive references are always valid to read/write, // are non-overlapping, and nothing here panics so it's drop-safe. @@ -33,9 +33,14 @@ pub fn swap_basic(x: &mut KeccakBuffer, y: &mut KeccakBuffer) { // CHECK-LABEL: @swap_std #[no_mangle] pub fn swap_std(x: &mut KeccakBuffer, y: &mut KeccakBuffer) { -// CHECK-NOT: alloca -// CHECK: load <{{[0-9]+}} x i64> -// CHECK: store <{{[0-9]+}} x i64> + // CHECK-NOT: alloca + // CHECK-NOT: br + // CHECK: load <{{[0-9]+}} x i64>{{.*}}align 8 + // CHECK-NOT: alloca + // CHECK-NOT: br + // CHECK: store <{{[0-9]+}} x i64>{{.*}}align 8 + // CHECK-NOT: alloca + // CHECK-NOT: br swap(x, y) } @@ -45,9 +50,11 @@ pub fn swap_std(x: &mut KeccakBuffer, y: &mut KeccakBuffer) { // CHECK-LABEL: @swap_slice #[no_mangle] pub fn swap_slice(x: &mut [KeccakBuffer], y: &mut [KeccakBuffer]) { -// CHECK-NOT: alloca -// CHECK: load <{{[0-9]+}} x i64> -// CHECK: store <{{[0-9]+}} x i64> + // CHECK-NOT: alloca + // CHECK: load <{{[0-9]+}} x i{{8|16|32|64}}>{{.*}}align 8 + // CHECK-NOT: alloca + // CHECK: store <{{[0-9]+}} x i{{8|16|32|64}}>{{.*}}align 8 + // CHECK-NOT: alloca if x.len() == y.len() { x.swap_with_slice(y); } @@ -60,32 +67,24 @@ type OneKilobyteBuffer = [u8; 1024]; // CHECK-LABEL: @swap_1kb_slices #[no_mangle] pub fn swap_1kb_slices(x: &mut [OneKilobyteBuffer], y: &mut [OneKilobyteBuffer]) { -// CHECK-NOT: alloca -// CHECK: load <{{[0-9]+}} x i8> -// CHECK: store <{{[0-9]+}} x i8> + // CHECK-NOT: alloca + // CHECK: load <{{[0-9]+}} x i{{8|16|32|64}}> + // CHECK: store <{{[0-9]+}} x i{{8|16|32|64}}> + // CHECK-NOT: alloca if x.len() == y.len() { x.swap_with_slice(y); } } -// This verifies that the 2×read + 2×write optimizes to just 3 memcpys -// for an unusual type like this. It's not clear whether we should do anything -// smarter in Rust for these, so for now it's fine to leave these up to the backend. -// That's not as bad as it might seem, as for example, LLVM will lower the -// memcpys below to VMOVAPS on YMMs if one enables the AVX target feature. -// Eventually we'll be able to pass `align_of::` to a const generic and -// thus pick a smarter chunk size ourselves without huge code duplication. 
- #[repr(align(64))] pub struct BigButHighlyAligned([u8; 64 * 3]); // CHECK-LABEL: @swap_big_aligned #[no_mangle] pub fn swap_big_aligned(x: &mut BigButHighlyAligned, y: &mut BigButHighlyAligned) { -// CHECK-NOT: call void @llvm.memcpy -// CHECK: call void @llvm.memcpy.{{.+}}({{i8\*|ptr}} noundef nonnull align 64 dereferenceable(192) -// CHECK: call void @llvm.memcpy.{{.+}}({{i8\*|ptr}} noundef nonnull align 64 dereferenceable(192) -// CHECK: call void @llvm.memcpy.{{.+}}({{i8\*|ptr}} noundef nonnull align 64 dereferenceable(192) -// CHECK-NOT: call void @llvm.memcpy + // CHECK-NOT: alloca + // CHECK-NOT: call void @llvm.memcpy + // CHECK: call void @llvm.memcpy.{{.+}}({{i8\*|ptr}} noundef nonnull align 64 dereferenceable(192) + // CHECK-NOT: call void @llvm.memcpy swap(x, y) } diff --git a/tests/codegen/swap-simd-types.rs b/tests/codegen/swap-simd-types.rs index 3472a42b0e65e..5d702144b3973 100644 --- a/tests/codegen/swap-simd-types.rs +++ b/tests/codegen/swap-simd-types.rs @@ -1,4 +1,4 @@ -// compile-flags: -O -C target-feature=+avx +// compile-flags: -Copt-level=3 -C target-feature=+avx // only-x86_64 // ignore-debug: the debug assertions get in the way @@ -35,7 +35,7 @@ pub fn swap_m256_slice(x: &mut [__m256], y: &mut [__m256]) { #[no_mangle] pub fn swap_bytes32(x: &mut [u8; 32], y: &mut [u8; 32]) { // CHECK-NOT: alloca -// CHECK: load <32 x i8>{{.+}}align 1 -// CHECK: store <32 x i8>{{.+}}align 1 +// CHECK: load <4 x i64>{{.+}}align 1 +// CHECK: store <4 x i64>{{.+}}align 1 swap(x, y) } diff --git a/tests/codegen/swap-small-types.rs b/tests/codegen/swap-small-types.rs index 419645a3fc6bc..03dd4f7a5dddd 100644 --- a/tests/codegen/swap-small-types.rs +++ b/tests/codegen/swap-small-types.rs @@ -1,8 +1,9 @@ -// compile-flags: -O -Z merge-functions=disabled +// compile-flags: -Copt-level=3 -Z merge-functions=disabled // only-x86_64 // ignore-debug: the debug assertions get in the way #![crate_type = "lib"] +#![feature(portable_simd)] use std::mem::swap; @@ -26,10 +27,61 @@ pub fn swap_rgb48_manually(x: &mut RGB48, y: &mut RGB48) { #[no_mangle] pub fn swap_rgb48(x: &mut RGB48, y: &mut RGB48) { // CHECK-NOT: alloca - // CHECK: load <3 x i16> - // CHECK: load <3 x i16> - // CHECK: store <3 x i16> - // CHECK: store <3 x i16> + // CHECK-NOT: br + // CHECK: load i32 + + // CHECK-NOT: alloca + // CHECK-NOT: br + // CHECK: store i32 + + // CHECK-NOT: alloca + // CHECK-NOT: br + // CHECK: load i16 + + // CHECK-NOT: alloca + // CHECK-NOT: br + // CHECK: store i16 + + // CHECK-NOT: alloca + // CHECK-NOT: br + // CHECK: ret void + swap(x, y) +} + +// CHECK-LABEL: @swap_vecs +#[no_mangle] +pub fn swap_vecs(x: &mut Vec, y: &mut Vec) { + // CHECK-NOT: alloca + // CHECK-NOT: br + + // CHECK: load <{{[0-9]+}} x i64> + // CHECK-NOT: alloca + // CHECK-NOT: br + + // CHECK: store <{{[0-9]+}} x i64> + // CHECK-NOT: alloca + // CHECK-NOT: br + + // CHECK: load i64 + // CHECK-NOT: alloca + // CHECK-NOT: br + + // CHECK: store i64 + // CHECK-NOT: alloca + // CHECK-NOT: br + + // CHECK: ret void + swap(x, y) +} + +// CHECK-LABEL: @swap_slices +#[no_mangle] +pub fn swap_slices<'a>(x: &mut &'a [u32], y: &mut &'a [u32]) { + // CHECK-NOT: alloca + // CHECK-NOT: br + // CHECK: load <{{[0-9]+}} x i64> + // CHECK: store <{{[0-9]+}} x i64> + // CHECK: ret void swap(x, y) } @@ -40,23 +92,23 @@ type RGB24 = [u8; 3]; // CHECK-LABEL: @swap_rgb24_slices #[no_mangle] pub fn swap_rgb24_slices(x: &mut [RGB24], y: &mut [RGB24]) { -// CHECK-NOT: alloca -// CHECK: load <{{[0-9]+}} x i8> -// CHECK: store <{{[0-9]+}} x i8> + // 
CHECK-NOT: alloca + // CHECK: load <{{[0-9]+}} x i8> + // CHECK: store <{{[0-9]+}} x i8> if x.len() == y.len() { x.swap_with_slice(y); } } -// This one has a power-of-two size, so we iterate over it directly +// This one has a power-of-two size, so we iterate over it using ints. type RGBA32 = [u8; 4]; // CHECK-LABEL: @swap_rgba32_slices #[no_mangle] pub fn swap_rgba32_slices(x: &mut [RGBA32], y: &mut [RGBA32]) { -// CHECK-NOT: alloca -// CHECK: load <{{[0-9]+}} x i32> -// CHECK: store <{{[0-9]+}} x i32> + // CHECK-NOT: alloca + // CHECK: load <{{[0-9]+}} x i32> + // CHECK: store <{{[0-9]+}} x i32> if x.len() == y.len() { x.swap_with_slice(y); } @@ -69,10 +121,38 @@ const _: () = assert!(!std::mem::size_of::().is_power_of_two()); // CHECK-LABEL: @swap_string_slices #[no_mangle] pub fn swap_string_slices(x: &mut [String], y: &mut [String]) { -// CHECK-NOT: alloca -// CHECK: load <{{[0-9]+}} x i64> -// CHECK: store <{{[0-9]+}} x i64> + // CHECK-NOT: alloca + // CHECK: load <{{[0-9]+}} x i64> + // CHECK: store <{{[0-9]+}} x i64> if x.len() == y.len() { x.swap_with_slice(y); } } + +#[repr(C, packed)] +pub struct Packed { + pub first: bool, + pub second: u64, +} + +// CHECK-LABEL: @swap_packed_structs +#[no_mangle] +pub fn swap_packed_structs(x: &mut Packed, y: &mut Packed) { + // CHECK-NOT: alloca + // CHECK: ret void + swap(x, y) +} + +// CHECK-LABEL: @swap_simd_type +#[no_mangle] +pub fn swap_simd_type(x: &mut std::simd::f32x4, y: &mut std::simd::f32x4){ + // CHECK-NOT: alloca + // CHECK-NOT: br + // CHECK: load <4 x float> + + // CHECK-NOT: alloca + // CHECK-NOT: br + // CHECK: store <4 x float> + // CHECK: ret void + swap(x, y) +} diff --git a/tests/ui/consts/missing_span_in_backtrace.stderr b/tests/ui/consts/missing_span_in_backtrace.stderr index fcfb9fbb3f8c0..1535777326566 100644 --- a/tests/ui/consts/missing_span_in_backtrace.stderr +++ b/tests/ui/consts/missing_span_in_backtrace.stderr @@ -3,12 +3,6 @@ error[E0080]: evaluation of constant value failed | = note: unable to copy parts of a pointer from memory at ALLOC_ID | -note: inside `std::ptr::read::>>` - --> $SRC_DIR/core/src/ptr/mod.rs:LL:COL -note: inside `mem::swap_simple::>>` - --> $SRC_DIR/core/src/mem/mod.rs:LL:COL -note: inside `ptr::swap_nonoverlapping_simple_untyped::>` - --> $SRC_DIR/core/src/ptr/mod.rs:LL:COL note: inside `swap_nonoverlapping::>` --> $SRC_DIR/core/src/ptr/mod.rs:LL:COL note: inside `X` diff --git a/tests/ui/intrinsics/swap_nonoverlapping_single.rs b/tests/ui/intrinsics/swap_nonoverlapping_single.rs new file mode 100644 index 0000000000000..64afeceeb7ca8 --- /dev/null +++ b/tests/ui/intrinsics/swap_nonoverlapping_single.rs @@ -0,0 +1,132 @@ +#![feature(core_intrinsics, const_mut_refs, const_swap)] +#![crate_type = "rlib"] + +//! This module tests if `swap_nonoverlapping_single` works properly in const contexts. 
+ +use std::intrinsics::swap_nonoverlapping_single; + +pub const OK_A: () = { + let mut a = 0i32; + let mut b = 5i32; + unsafe { + swap_nonoverlapping_single(&mut a, &mut b); + } + assert!(a == 5, "Must NOT fail."); + assert!(b == 0, "Must NOT fail."); +}; + +pub const ERR_A0: () = { + let mut a = 0i32; + let mut b = 5i32; + unsafe { + swap_nonoverlapping_single(&mut a, &mut b); + } + + assert!(a != 5, "Must fail."); //~ ERROR evaluation of constant value failed +}; + +pub const ERR_A1: () = { + let mut a = 0i32; + let mut b = 5i32; + unsafe { + swap_nonoverlapping_single(&mut a, &mut b); + } + + assert!(b != 0, "Must fail."); //~ ERROR evaluation of constant value failed +}; + +// This must NOT fail. +pub const B: () = { + let mut a = 0i32; + let mut b = 5i32; + unsafe { + swap_nonoverlapping_single(&mut a, &mut b); + } + unsafe { + swap_nonoverlapping_single(&mut a, &mut b); + } + assert!(a == 0, "Must NOT fail."); + assert!(b == 5, "Must NOT fail."); +}; + +pub const ERR_B0: () = { + let mut a = 0i32; + let mut b = 5i32; + unsafe { + swap_nonoverlapping_single(&mut a, &mut b); + } + unsafe { + swap_nonoverlapping_single(&mut a, &mut b); + } + + assert!(a != 0, "Must fail."); //~ ERROR evaluation of constant value failed +}; + +pub const ERR_B1: () = { + let mut a = 0i32; + let mut b = 5i32; + unsafe { + swap_nonoverlapping_single(&mut a, &mut b); + } + unsafe { + swap_nonoverlapping_single(&mut a, &mut b); + } + + assert!(b != 5, "Must fail."); //~ ERROR evaluation of constant value failed +}; + +// This must NOT fail. +pub const NON_OVERLAPPING_PTRS: () = { + let mut chunk = [0_i32, 1, 2, 3]; + + let ptr = chunk.as_mut_ptr(); + let ptr2 = unsafe { ptr.add(2) }; + let x: &mut [i32; 2] = unsafe { &mut *ptr.cast() }; + let y: &mut [i32; 2] = unsafe { &mut *ptr2.cast() }; + unsafe { + swap_nonoverlapping_single(x, y); + } + + assert!(matches!(chunk, [2, 3, 0, 1]), "Must NOT fail."); +}; + +pub const OVERLAPPING_PTRS_0: () = { + let mut chunk = [0_i32, 1, 2, 3]; + + let ptr = chunk.as_mut_ptr(); + let ptr2 = unsafe { ptr.add(1) }; + let x: &mut [i32; 2] = unsafe { &mut *ptr.cast() }; + let y: &mut [i32; 2] = unsafe { &mut *ptr2.cast() }; + + unsafe { + swap_nonoverlapping_single(x, y); //~ ERROR evaluation of constant value failed + } +}; + +pub const OVERLAPPING_PTRS_1: () = { + let mut val = 7; + + let ptr: *mut _ = &mut val; + let x: &mut i32 = unsafe { &mut *ptr }; + let y: &mut i32 = unsafe { &mut *ptr }; + + unsafe { + swap_nonoverlapping_single(x, y); //~ ERROR evaluation of constant value failed + } +}; + +pub const OK_STRUCT: () = { + struct Adt { + fl: bool, + val: usize, + } + let mut a = Adt { fl: false, val: 10 }; + let mut b = Adt { fl: true, val: 77 }; + + unsafe { + swap_nonoverlapping_single(&mut a, &mut b); + } + + assert!(matches!(a, Adt { fl: true, val: 77 }), "Must NOT fail."); + assert!(matches!(b, Adt { fl: false, val: 10 }), "Must NOT fail."); +}; diff --git a/tests/ui/intrinsics/swap_nonoverlapping_single.stderr b/tests/ui/intrinsics/swap_nonoverlapping_single.stderr new file mode 100644 index 0000000000000..d9a77927ea13a --- /dev/null +++ b/tests/ui/intrinsics/swap_nonoverlapping_single.stderr @@ -0,0 +1,47 @@ +error[E0080]: evaluation of constant value failed + --> $DIR/swap_nonoverlapping_single.rs:25:5 + | +LL | assert!(a != 5, "Must fail."); + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ the evaluated program panicked at 'Must fail.', $DIR/swap_nonoverlapping_single.rs:25:5 + | + = note: this error originates in the macro `$crate::panic::panic_2015` which comes from the 
expansion of the macro `panic` (in Nightly builds, run with -Z macro-backtrace for more info) + +error[E0080]: evaluation of constant value failed + --> $DIR/swap_nonoverlapping_single.rs:35:5 + | +LL | assert!(b != 0, "Must fail."); + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ the evaluated program panicked at 'Must fail.', $DIR/swap_nonoverlapping_single.rs:35:5 + | + = note: this error originates in the macro `$crate::panic::panic_2015` which comes from the expansion of the macro `panic` (in Nightly builds, run with -Z macro-backtrace for more info) + +error[E0080]: evaluation of constant value failed + --> $DIR/swap_nonoverlapping_single.rs:62:5 + | +LL | assert!(a != 0, "Must fail."); + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ the evaluated program panicked at 'Must fail.', $DIR/swap_nonoverlapping_single.rs:62:5 + | + = note: this error originates in the macro `$crate::panic::panic_2015` which comes from the expansion of the macro `panic` (in Nightly builds, run with -Z macro-backtrace for more info) + +error[E0080]: evaluation of constant value failed + --> $DIR/swap_nonoverlapping_single.rs:75:5 + | +LL | assert!(b != 5, "Must fail."); + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ the evaluated program panicked at 'Must fail.', $DIR/swap_nonoverlapping_single.rs:75:5 + | + = note: this error originates in the macro `$crate::panic::panic_2015` which comes from the expansion of the macro `panic` (in Nightly builds, run with -Z macro-backtrace for more info) + +error[E0080]: evaluation of constant value failed + --> $DIR/swap_nonoverlapping_single.rs:102:9 + | +LL | swap_nonoverlapping_single(x, y); + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ swap was called on overlapping memory. + +error[E0080]: evaluation of constant value failed + --> $DIR/swap_nonoverlapping_single.rs:114:9 + | +LL | swap_nonoverlapping_single(x, y); + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ swap was called on overlapping memory. + +error: aborting due to 6 previous errors + +For more information about this error, try `rustc --explain E0080`.
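
For reference, and not part of the patch itself: the loop that `make_swaps_loop` emits with `NumOfTemps::Two` and pointer-sized integer chunks corresponds roughly to the following plain-Rust sketch. The helper name `swap_chunks` and the unaligned accesses are illustrative assumptions (mirroring the x86_64 path); the aligned path uses aligned loads and stores instead.

unsafe fn swap_chunks(x: *mut u8, y: *mut u8, bytes: usize) {
    // One pointer-sized integer per chunk; `bytes` must be a multiple of it.
    const CHUNK: usize = core::mem::size_of::<usize>();
    debug_assert_eq!(bytes % CHUNK, 0);
    let mut offset = 0;
    while offset < bytes {
        // Two temporaries per chunk: load both sides, then store them swapped.
        let tmp_x = x.add(offset).cast::<usize>().read_unaligned();
        let tmp_y = y.add(offset).cast::<usize>().read_unaligned();
        x.add(offset).cast::<usize>().write_unaligned(tmp_y);
        y.add(offset).cast::<usize>().write_unaligned(tmp_x);
        offset += CHUNK;
    }
}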