diff --git a/compiler/rustc_codegen_cranelift/src/driver/jit.rs b/compiler/rustc_codegen_cranelift/src/driver/jit.rs index 3118105a4e2d7..62a3e09691234 100644 --- a/compiler/rustc_codegen_cranelift/src/driver/jit.rs +++ b/compiler/rustc_codegen_cranelift/src/driver/jit.rs @@ -325,7 +325,7 @@ fn dep_symbol_lookup_fn( Linkage::NotLinked | Linkage::IncludedFromDylib => {} Linkage::Static => { let name = crate_info.crate_name[&cnum]; - let mut err = sess.struct_err(&format!("Can't load static lib {}", name)); + let mut err = sess.struct_err(format!("Can't load static lib {}", name)); err.note("rustc_codegen_cranelift can only load dylibs in JIT mode."); err.emit(); } diff --git a/compiler/rustc_codegen_cranelift/src/intrinsics/mod.rs b/compiler/rustc_codegen_cranelift/src/intrinsics/mod.rs index 1e83c30bd677a..38c120fd4ba86 100644 --- a/compiler/rustc_codegen_cranelift/src/intrinsics/mod.rs +++ b/compiler/rustc_codegen_cranelift/src/intrinsics/mod.rs @@ -567,6 +567,130 @@ fn codegen_regular_intrinsic_call<'tcx>( // FIXME use emit_small_memset fx.bcx.call_memset(fx.target_config, dst_ptr, val, count); } + + sym::swap_nonoverlapping_single => { + intrinsic_args!(fx, args => (x_ptr, y_ptr); intrinsic); + let pointee_ty = x_ptr.layout().ty.builtin_deref(true).unwrap().ty; + let pointee_layout = fx.layout_of(pointee_ty); + + // ZSTs swap is noop. + if pointee_layout.size != Size::ZERO { + // Probably, it would be better to have dedicated method for this in + // `cranelift_frontend::FunctionBuilder` + // with optimizations based on size and alignment of values. + + let x_ptr_val = x_ptr.load_scalar(fx); + let y_ptr_val = y_ptr.load_scalar(fx); + + let tmp_place = CPlace::new_stack_slot(fx, pointee_layout); + let tmp_ptr_val = tmp_place.to_ptr().get_addr(fx); + + let size_bytes = pointee_layout.size.bytes(); + let align_bytes: u8 = pointee_layout.align.abi.bytes().try_into().unwrap(); + fx.bcx.emit_small_memory_copy( + fx.target_config, + tmp_ptr_val, + x_ptr_val, + size_bytes, + align_bytes, + align_bytes, + true, + MemFlags::trusted(), + ); + fx.bcx.emit_small_memory_copy( + fx.target_config, + x_ptr_val, + y_ptr_val, + size_bytes, + align_bytes, + align_bytes, + true, + MemFlags::trusted(), + ); + fx.bcx.emit_small_memory_copy( + fx.target_config, + y_ptr_val, + tmp_ptr_val, + size_bytes, + align_bytes, + align_bytes, + true, + MemFlags::trusted(), + ); + } + } + + sym::swap_nonoverlapping_many => { + intrinsic_args!(fx, args => (x_ptr, y_ptr, count); intrinsic); + let pointee_ty = x_ptr.layout().ty.builtin_deref(true).unwrap().ty; + let pointee_layout = fx.layout_of(pointee_ty); + + // ZSTs swap is noop. 
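+            // Swap element-by-element: each iteration copies `x[i]` into a stack
+            // temporary, then `y[i]` into `x[i]`, then the temporary into `y[i]`.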
+ if pointee_layout.size != Size::ZERO { + let x_ptr_val = x_ptr.load_scalar(fx); + let y_ptr_val = y_ptr.load_scalar(fx); + + let count = count.load_scalar(fx); + + let tmp_place = CPlace::new_stack_slot(fx, pointee_layout); + let tmp_ptr_val = tmp_place.to_ptr().get_addr(fx); + + let elem_size_bytes = pointee_layout.size.bytes(); + let align_bytes: u8 = pointee_layout.align.abi.bytes().try_into().unwrap(); + + let loop_header = fx.bcx.create_block(); + let loop_body = fx.bcx.create_block(); + let loop_done = fx.bcx.create_block(); + + let index = fx.bcx.append_block_param(loop_header, fx.pointer_type); + let zero = fx.bcx.ins().iconst(fx.pointer_type, 0); + fx.bcx.ins().jump(loop_header, &[zero]); + + fx.bcx.switch_to_block(loop_header); + let is_done = fx.bcx.ins().icmp(IntCC::Equal, index, count); + fx.bcx.ins().brif(is_done, loop_done, &[], loop_body, &[]); + + fx.bcx.switch_to_block(loop_body); + let curr_x_ptr_val = fx.bcx.ins().iadd(x_ptr_val, index); + let curr_y_ptr_val = fx.bcx.ins().iadd(y_ptr_val, index); + fx.bcx.emit_small_memory_copy( + fx.target_config, + tmp_ptr_val, + curr_x_ptr_val, + elem_size_bytes, + align_bytes, + align_bytes, + true, + MemFlags::trusted(), + ); + fx.bcx.emit_small_memory_copy( + fx.target_config, + curr_x_ptr_val, + curr_y_ptr_val, + elem_size_bytes, + align_bytes, + align_bytes, + true, + MemFlags::trusted(), + ); + fx.bcx.emit_small_memory_copy( + fx.target_config, + curr_y_ptr_val, + tmp_ptr_val, + elem_size_bytes, + align_bytes, + align_bytes, + true, + MemFlags::trusted(), + ); + let next_index = fx.bcx.ins().iadd_imm(index, 1); + fx.bcx.ins().jump(loop_header, &[next_index]); + + fx.bcx.switch_to_block(loop_done); + fx.bcx.ins().nop(); + } + } + sym::ctlz | sym::ctlz_nonzero => { intrinsic_args!(fx, args => (arg); intrinsic); let val = arg.load_scalar(fx); diff --git a/compiler/rustc_codegen_gcc/src/builder.rs b/compiler/rustc_codegen_gcc/src/builder.rs index f9ea0f004564b..3eec43d3745f8 100644 --- a/compiler/rustc_codegen_gcc/src/builder.rs +++ b/compiler/rustc_codegen_gcc/src/builder.rs @@ -1070,6 +1070,55 @@ impl<'a, 'gcc, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'gcc, 'tcx> { self.block.add_eval(None, self.context.new_call(None, memset, &[ptr, fill_byte, size])); } + fn make_memory_loop( + &mut self, + loop_name: &str, + start_ptrs: [Self::Value; VAR_COUNT], + steps: [Size; VAR_COUNT], + iterations: Self::Value, + body_visitor: BodyPtrsVisitor, + ) where + BodyPtrsVisitor: FnOnce(&mut Self, &[Self::Value; VAR_COUNT]), + { + assert!(VAR_COUNT > 0, "VAR_COUNT must be bigger than zero."); + + for step in steps { + assert_ne!(step.bytes(), 0, "We are iterating over memory, ZSTs unexpected."); + } + + let header_bb = self.append_sibling_block(&format!("{}_header", loop_name)); + let body_bb = self.append_sibling_block(&format!("{}_body", loop_name)); + let next_bb = self.append_sibling_block(&format!("{}_next", loop_name)); + + let zero = self.const_usize(0); + let additions: [Self::Value; VAR_COUNT] = steps.map(|st| self.const_usize(st.bytes())); + + let loop_i = self.llbb().get_function().new_local(None, self.type_size_t(), "loop_i"); + self.assign(loop_i, zero); + let loop_i_val = loop_i.to_rvalue(); + + self.br(header_bb); + + self.switch_to_block(header_bb); + let keep_going = self.icmp(IntPredicate::IntNE, loop_i_val, iterations); + self.cond_br(keep_going, body_bb, next_bb); + + self.switch_to_block(body_bb); + let current_ptrs: [Self::Value; VAR_COUNT] = core::array::from_fn( + |i|{ + let start = self.pointercast(start_ptrs[i], 
self.type_i8p()); + let offset = self.unchecked_umul(additions[i], loop_i_val); + self.inbounds_gep(self.type_i8(), start, &[offset]) + } + ); + body_visitor(self, ¤t_ptrs); + let next_i = self.unchecked_uadd(loop_i_val, self.const_usize(1)); + self.assign(loop_i, next_i); + self.br(header_bb); + + self.switch_to_block(next_bb); + } + fn select(&mut self, cond: RValue<'gcc>, then_val: RValue<'gcc>, mut else_val: RValue<'gcc>) -> RValue<'gcc> { let func = self.current_func(); let variable = func.new_local(None, then_val.get_type(), "selectVar"); diff --git a/compiler/rustc_codegen_llvm/src/builder.rs b/compiler/rustc_codegen_llvm/src/builder.rs index b4aa001547c4c..adf96ca8cd240 100644 --- a/compiler/rustc_codegen_llvm/src/builder.rs +++ b/compiler/rustc_codegen_llvm/src/builder.rs @@ -935,6 +935,54 @@ impl<'a, 'll, 'tcx> BuilderMethods<'a, 'tcx> for Builder<'a, 'll, 'tcx> { } } + fn make_memory_loop( + &mut self, + loop_name: &str, + start_ptrs: [Self::Value; VAR_COUNT], + steps: [Size; VAR_COUNT], + iterations: Self::Value, + body_visitor: BodyPtrsVisitor, + ) where + BodyPtrsVisitor: FnOnce(&mut Self, &[Self::Value; VAR_COUNT]), + { + const { + assert!(VAR_COUNT > 0, "VAR_COUNT must be bigger than zero."); + } + for step in steps { + assert_ne!(step.bytes(), 0, "We are iterating over memory, ZSTs unexpected."); + } + + let zero = self.const_usize(0); + let additions: [Self::Value; VAR_COUNT] = steps.map(|st| self.const_usize(st.bytes())); + + let header_bb = self.append_sibling_block(&format!("{}_header", loop_name)); + let body_bb = self.append_sibling_block(&format!("{}_body", loop_name)); + let next_bb = self.append_sibling_block(&format!("{}_next", loop_name)); + self.br(header_bb); + + let mut header_bx = Builder::build(self.cx, header_bb); + // Use integer for iteration instead of pointers because LLVM canonicalize loop into indexed anyway. + let loop_i = header_bx.phi(self.type_isize(), &[zero], &[self.llbb()]); + let keep_going = header_bx.icmp(IntPredicate::IntNE, loop_i, iterations); + header_bx.cond_br(keep_going, body_bb, next_bb); + + let mut body_bx = Builder::build(self.cx, body_bb); + let current_ptrs: [Self::Value; VAR_COUNT] = std::array::from_fn(|i| { + let start = start_ptrs[i]; + // FIXME: Remove pointercast after dropping supporting of LLVM 14. 
+ let start = self.pointercast(start, self.type_i8p()); + let addition = additions[i]; + let offset = body_bx.unchecked_umul(loop_i, addition); + body_bx.inbounds_gep(body_bx.type_i8(), start, &[offset]) + }); + body_visitor(&mut body_bx, ¤t_ptrs); + let next_i = body_bx.unchecked_uadd(loop_i, body_bx.const_usize(1)); + header_bx.add_incoming_to_phi(loop_i, next_i, body_bb); + body_bx.br(header_bb); + + *self = Builder::build(self.cx, next_bb); + } + fn select( &mut self, cond: &'ll Value, diff --git a/compiler/rustc_codegen_llvm/src/lib.rs b/compiler/rustc_codegen_llvm/src/lib.rs index 24968e00cc8e5..35ac7b33d3ff7 100644 --- a/compiler/rustc_codegen_llvm/src/lib.rs +++ b/compiler/rustc_codegen_llvm/src/lib.rs @@ -7,6 +7,7 @@ #![doc(html_root_url = "https://doc.rust-lang.org/nightly/nightly-rustc/")] #![feature(extern_types)] #![feature(hash_raw_entry)] +#![feature(inline_const)] #![feature(iter_intersperse)] #![feature(let_chains)] #![feature(never_type)] diff --git a/compiler/rustc_codegen_ssa/src/mir/intrinsic.rs b/compiler/rustc_codegen_ssa/src/mir/intrinsic.rs index 9ac2424e76be0..bf9610e5ee6ce 100644 --- a/compiler/rustc_codegen_ssa/src/mir/intrinsic.rs +++ b/compiler/rustc_codegen_ssa/src/mir/intrinsic.rs @@ -9,12 +9,12 @@ use crate::meth; use crate::traits::*; use crate::MemFlags; -use rustc_middle::ty::{self, Ty, TyCtxt}; -use rustc_span::{sym, Span}; -use rustc_target::abi::{ - call::{FnAbi, PassMode}, - WrappingRange, -}; +use rustc_middle::ty; +use rustc_middle::ty::{Ty, TyCtxt}; +use rustc_span::sym; +use rustc_span::Span; +use rustc_target::abi::call::{FnAbi, PassMode}; +use rustc_target::abi::WrappingRange; fn copy_intrinsic<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>( bx: &mut Bx, @@ -37,6 +37,442 @@ fn copy_intrinsic<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>( } } +mod swap_intrinsic { + use crate::traits::*; + use crate::MemFlags; + + use rustc_middle::mir::interpret::PointerArithmetic; + use rustc_middle::ty::Ty; + use rustc_span::Span; + use rustc_target::abi::{Align, Size}; + use rustc_target::spec::HasTargetSpec; + + // Note: We deliberately interpret our values as some ranges of bytes + // for performance like did earlier in the old `core::mem::swap` implementation + // and use immediate values instead of PlaceRefs. + pub(super) fn single<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>( + bx: &mut Bx, + x_ptr: Bx::Value, + y_ptr: Bx::Value, + ty: Ty<'tcx>, + span: Span, + ) { + let layout = bx.layout_of(ty); + if layout.is_unsized() { + span_bug!(span, "swap_nonoverlapping_single must be called only for sized types"); + } + if layout.is_zst() { + // no-op + return; + } + let should_use_2_temp_vals = { + // Primitive integer or something equal to it by size. + (layout.size <= bx.cx().pointer_size() && layout.size.bytes().is_power_of_two()) + // SPIR-V doesn't allow reinterpretation of values as chunks of arbitrary ints + // so we need to read and copy them full. + // For small values we use double read-double write. + || (layout.size <= bx.cx().pointer_size() && bx.cx().target_spec().arch == "spirv") + }; + if should_use_2_temp_vals { + let ty = bx.backend_type(layout); + let align = layout.align.abi; + swap_using_2_temps(bx, x_ptr, y_ptr, ty, align); + return; + } + + // If need to swap large value, + // it probably better to do single memcpy from one elem + // to another after saving the old value. + let should_use_single_temp_val = { + // Most likely some `Simd` type from portable simd or manual simd. 
+ // There is no difference with double read in release build + // but it reduces amount of code generated in debug build. + (layout.align.abi.bytes() == layout.size.bytes() && layout.size > bx.cx().pointer_size()) + // Probably aggregate with some SIMD type field. + // E.g. `Option`. + // Need to think how to do it better. + || layout.align.abi > bx.data_layout().pointer_align.abi + // SPIRV doesn't allow partial reads/writes and value reinterpretations + // so our best chance to reduce stack usage is to use single alloca. + || bx.cx().target_spec().arch == "spirv" + }; + if should_use_single_temp_val { + let ty = bx.backend_type(layout); + swap_using_single_temp(bx, x_ptr, y_ptr, ty, layout.size, layout.align.abi); + return; + } + + // Both LLVM and GCC seem to benefit from same splitting loops + // so place this code here to prevent duplication. + // https://godbolt.org/z/arzvePb8T + + if bx.cx().target_spec().arch == "x86_64" { + swap_unaligned_x86_64_single(bx, layout, x_ptr, y_ptr); + return; + } + + // Swap using aligned integers as chunks. + assert!(layout.align.abi.bytes() <= bx.pointer_size().bytes()); + assert_eq!(bx.data_layout().pointer_align.abi.bytes(), bx.pointer_size().bytes()); + let chunk_size = std::cmp::min(layout.align.abi.bytes(), bx.pointer_size().bytes()); + let chunk_size = Size::from_bytes(chunk_size); + make_swaps_loop( + bx, + x_ptr, + y_ptr, + ToSwap::Bytes(layout.size), + ChunkInfo::IntChunk(chunk_size), + NumOfTemps::Two, + Align::from_bytes(chunk_size.bytes()).unwrap(), + ); + } + + // `x86_64` allows optimization using unaligned accesses + // because unaligned reads/writes are fast on x86_64. + // https://lemire.me/blog/2012/05/31/data-alignment-for-speed-myth-or-reality/ + // We manually swap last `x % ZMM_BYTES` bytes in a way that would always vectorize + // them AVX and/or SSE because both GCC and LLVM generate fails to use smaller SIMD registers + // if they had used larger ones. + fn swap_unaligned_x86_64_single<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>( + bx: &mut Bx, + layout: Bx::LayoutOfResult, + x_ptr: Bx::Value, + y_ptr: Bx::Value, + ) { + const ZMM_BYTES: u64 = 512 / 8; + const YMM_BYTES: u64 = 256 / 8; + const XMM_BYTES: u64 = 128 / 8; + + let min_align = Align::from_bytes(1).expect("One is always valid align."); + let ptr_size = bx.cx().pointer_size(); + // Need to do pointercasts because `rustc_codegen_gcc` ignores passed type + // in `inbounds_gep`. + let x_ptr = bx.pointercast(x_ptr, bx.type_i8p()); + let y_ptr = bx.pointercast(y_ptr, bx.type_i8p()); + + let mut total_offset = Size::ZERO; + // Make a loop that is vectorized using largest vectors. + // It would use largest available vectors, not necessary ZMM. + if layout.size.bytes() >= ZMM_BYTES { + let to_swap = Size::from_bytes(layout.size.bytes() / ZMM_BYTES * ZMM_BYTES); + make_swaps_loop( + bx, + x_ptr, + y_ptr, + ToSwap::Bytes(to_swap), + ChunkInfo::IntChunk(ptr_size), + NumOfTemps::Two, + min_align, + ); + total_offset += to_swap; + } + // This loop contents are based on knowledge from this: https://godbolt.org/z/Mr4rWfoad + // And this: https://godbolt.org/z/YzcWofG5Y + // Both LLVM and GCC fail to use SIMD registers for swapping tails without this. 
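+        // For each register width that still fits, load x's chunk into pointer-sized
+        // temporaries, memcpy y's chunk over x's, then store the temporaries into y.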
+ for (num_temps, chunk_size) in [(4, YMM_BYTES), (2, XMM_BYTES)] { + let chunk_size = Size::from_bytes(chunk_size); + assert_eq!( + ptr_size * num_temps, + chunk_size, + "Invalid assumption about pointer size or register size", + ); + if layout.size < total_offset + chunk_size { + continue; + } + + let x_tmps_and_offsets: Vec<_> = (0..num_temps) + .map(|i| { + let curr_off = total_offset + i * ptr_size; + let curr_off = bx.const_usize(curr_off.bytes()); + let x_gep = bx.inbounds_gep(bx.type_i8(), x_ptr, &[curr_off]); + // FIXME: Remove pointercast after stopping support of LLVM 14. + let x_gep = bx.pointercast(x_gep, bx.type_ptr_to(bx.type_isize())); + (bx.load(bx.type_isize(), x_gep, min_align), curr_off) + }) + .collect(); + + let chunk_size_val = bx.const_usize(chunk_size.bytes()); + let chunk_offset = bx.const_usize(total_offset.bytes()); + let x_chunk_gep = bx.inbounds_gep(bx.type_i8(), x_ptr, &[chunk_offset]); + let y_chunk_gep = bx.inbounds_gep(bx.type_i8(), y_ptr, &[chunk_offset]); + // FIXME(AngelicosPhosphoros): Use memcpy.inline here. + bx.memcpy( + x_chunk_gep, + min_align, + y_chunk_gep, + min_align, + chunk_size_val, + MemFlags::UNALIGNED, + ); + for (x_tmp, curr_off) in x_tmps_and_offsets { + let y_gep = bx.inbounds_gep(bx.type_i8(), y_ptr, &[curr_off]); + // FIXME: Remove pointercast after stopping support of LLVM 14. + let y_gep = bx.pointercast(y_gep, bx.type_ptr_to(bx.type_isize())); + bx.store(x_tmp, y_gep, min_align); + } + + total_offset += chunk_size; + } + + // I decided to use swaps by pow2 ints here based + // on this codegen example: https://godbolt.org/z/rWYqMGnWh + // This loops implements it using minimal amount of instructions + // and registers involved. + let mut current_size = bx.pointer_size(); + while total_offset < layout.size { + // In each loop iteration, remaining amount of unswapped bytes + // is less than in previous iteration. + + assert_ne!(current_size, Size::ZERO, "We must had finished swapping when it was 1"); + + let next_size = Size::from_bytes(current_size.bytes() / 2); + if total_offset + current_size > layout.size { + current_size = next_size; + continue; + } + + let tail_offset = bx.const_usize(total_offset.bytes()); + let x_tail_ptr = bx.inbounds_gep(bx.type_i8(), x_ptr, &[tail_offset]); + let y_tail_ptr = bx.inbounds_gep(bx.type_i8(), y_ptr, &[tail_offset]); + + let chunt_ty = choose_int_by_size(bx, current_size); + swap_using_2_temps(bx, x_tail_ptr, y_tail_ptr, chunt_ty, min_align); + + total_offset += current_size; + current_size = next_size; + } + } + + // We cannot use some of optimizations available for [`single`] + // because we don't know how many bytes exactly we need to swap. + pub(super) fn many<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>( + bx: &mut Bx, + x_ptr: Bx::Value, + y_ptr: Bx::Value, + count: Bx::Value, + ty: Ty<'tcx>, + span: Span, + ) { + let layout = bx.layout_of(ty); + if layout.is_unsized() { + span_bug!(span, "swap_nonoverlapping_many must be called only for sized types"); + } + if layout.is_zst() { + // no-op + return; + } + + let must_not_split_values = { + // Unusual type, maybe some manual SIMD optimization. + layout.align.abi > bx.data_layout().pointer_align.abi && layout.align.abi.bytes() == layout.size.bytes() + // Probably aggregate with some SIMD type field. + // E.g. `Option`. + // Need to think how to do it better. 
+ || layout.align.abi > bx.data_layout().pointer_align.abi + // SPIR-V doesn't allow reinterpretation of values as chunks of arbitrary ints + // so we need to read and copy them by element full. + || bx.cx().target_spec().arch == "spirv" + }; + + if must_not_split_values { + let back_ty = bx.backend_type(layout); + let num_of_temps = + if layout.size > bx.pointer_size() { NumOfTemps::Single } else { NumOfTemps::Two }; + make_swaps_loop( + bx, + x_ptr, + y_ptr, + ToSwap::Iterations(count), + ChunkInfo::RealTyChunk(back_ty, layout.size), + num_of_temps, + layout.align.abi, + ); + return; + } + + let chunk_size = if bx.cx().target_spec().arch == "x86_64" { + // x86_64 allows unaligned reads/writes + // and it is relatively fast + // so try largest chunk available. + const INT_SIZES: [u64; 4] = [1, 2, 4, 8]; + INT_SIZES + .into_iter() + .map(Size::from_bytes) + .take_while(|x| *x <= layout.size) + .filter(|x| layout.size.bytes() % x.bytes() == 0) + .last() + .unwrap() + } else { + // Fallback to integer with size equal to alignment + Size::from_bytes(layout.align.abi.bytes()) + }; + + let chunks_per_elem = layout.size.bytes() / chunk_size.bytes(); + assert_ne!(chunks_per_elem, 0); + let iterations = if chunks_per_elem == 1 { + count + } else { + let chunks_per_elem = bx.const_usize(chunks_per_elem); + bx.unchecked_umul(count, chunks_per_elem) + }; + + make_swaps_loop( + bx, + x_ptr, + y_ptr, + ToSwap::Iterations(iterations), + ChunkInfo::IntChunk(chunk_size), + NumOfTemps::Two, + // It iterates either by chunks equal to alignment + // or multiply of alignment so it would always be correct. + layout.align.abi, + ); + } + + fn choose_int_by_size<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>( + bx: &mut Bx, + size: Size, + ) -> Bx::Type { + match size.bits() { + 8 => bx.type_i8(), + 16 => bx.type_i16(), + 32 => bx.type_i32(), + 64 => bx.type_i64(), + 128 => bx.type_i128(), + _ => unreachable!("Unexpected target int {:?}.", size), + } + } + + #[derive(Clone, Copy)] + enum ToSwap { + /// Size of region to swap. Useful when we know exact value. + Bytes(Size), + /// Number of chunks to swap. For runtime value. + Iterations(BxValue), + } + + #[derive(Clone, Copy)] + enum ChunkInfo { + /// When we want to use it directly + RealTyChunk(BxType, Size), + /// When we want to split value by integer chunk. + IntChunk(Size), + } + + #[derive(Copy, Clone, Eq, PartialEq)] + enum NumOfTemps { + Single, + Two, + } + + fn make_swaps_loop<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>( + bx: &mut Bx, + x_ptr: Bx::Value, + y_ptr: Bx::Value, + to_swap: ToSwap, + chunk_info: ChunkInfo, + num_of_temps: NumOfTemps, + access_align: Align, + ) { + let (ChunkInfo::IntChunk(chunk_size) | ChunkInfo::RealTyChunk(_, chunk_size)) = chunk_info; + + assert_ne!(chunk_size, Size::ZERO); + + if let ToSwap::Bytes(total_bytes) = to_swap { + assert!( + total_bytes > chunk_size, + "No need to generate loop when simple swap is enough." + ); + assert_eq!( + total_bytes.bytes() % chunk_size.bytes(), + 0, + "Cannot split size of swap into chunks." 
+ ); + } + + assert_eq!( + chunk_size.bytes() % access_align.bytes(), + 0, + "Ensure that access align doesn't shift", + ); + + let chunk_ty = match chunk_info { + ChunkInfo::RealTyChunk(ty, _) => ty, + ChunkInfo::IntChunk(size) => choose_int_by_size(bx, size), + }; + + let iterations = match to_swap { + ToSwap::Bytes(s) => { + let iterations_val = s.bytes() / chunk_size.bytes(); + bx.const_usize(iterations_val) + } + ToSwap::Iterations(it) => it, + }; + + // Need to do pointercasts because `rustc_codegen_gcc` ignores passed type + // in `inbounds_gep`. + let x_ptr = bx.pointercast(x_ptr, bx.type_i8p()); + let y_ptr = bx.pointercast(y_ptr, bx.type_i8p()); + bx.make_memory_loop( + "swap_loop", + [x_ptr, y_ptr], + [chunk_size; 2], + iterations, + |body_bx, &[curr_x_ptr, curr_y_ptr]| match num_of_temps { + NumOfTemps::Single => swap_using_single_temp( + body_bx, + curr_x_ptr, + curr_y_ptr, + chunk_ty, + chunk_size, + access_align, + ), + NumOfTemps::Two => { + swap_using_2_temps(body_bx, curr_x_ptr, curr_y_ptr, chunk_ty, access_align) + } + }, + ); + } + + fn swap_using_2_temps<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>( + bx: &mut Bx, + x_ptr: Bx::Value, + y_ptr: Bx::Value, + tmp_ty: Bx::Type, + access_align: Align, + ) { + // FIXME: Remove pointercast when stop support of LLVM 14. + let tmp_ptr_ty = bx.type_ptr_to(tmp_ty); + let x_ptr = bx.pointercast(x_ptr, tmp_ptr_ty); + let y_ptr = bx.pointercast(y_ptr, tmp_ptr_ty); + + let tmp_x = bx.load(tmp_ty, x_ptr, access_align); + let tmp_y = bx.load(tmp_ty, y_ptr, access_align); + bx.store(tmp_y, x_ptr, access_align); + bx.store(tmp_x, y_ptr, access_align); + } + + fn swap_using_single_temp<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>( + bx: &mut Bx, + x_ptr: Bx::Value, + y_ptr: Bx::Value, + tmp_ty: Bx::Type, + tmp_size: Size, + access_align: Align, + ) { + // FIXME: Remove pointercast when stop support of LLVM 14. + let tmp_ptr_ty = bx.type_ptr_to(tmp_ty); + let x_ptr = bx.pointercast(x_ptr, tmp_ptr_ty); + let y_ptr = bx.pointercast(y_ptr, tmp_ptr_ty); + + let num_bytes = bx.const_usize(tmp_size.bytes()); + let tmp_x = bx.load(tmp_ty, x_ptr, access_align); + // FIXME(AngelicosPhosphoros): Use memcpy.inline here. + bx.memcpy(x_ptr, access_align, y_ptr, access_align, num_bytes, MemFlags::empty()); + bx.store(tmp_x, y_ptr, access_align); + } +} + fn memset_intrinsic<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>>( bx: &mut Bx, volatile: bool, @@ -154,6 +590,27 @@ impl<'a, 'tcx, Bx: BuilderMethods<'a, 'tcx>> FunctionCx<'a, 'tcx, Bx> { ); return; } + sym::swap_nonoverlapping_single => { + swap_intrinsic::single( + bx, + args[0].immediate(), + args[1].immediate(), + substs.type_at(0), + span, + ); + return; + } + sym::swap_nonoverlapping_many => { + swap_intrinsic::many( + bx, + args[0].immediate(), + args[1].immediate(), + args[2].immediate(), + substs.type_at(0), + span, + ); + return; + } sym::write_bytes => { memset_intrinsic( bx, diff --git a/compiler/rustc_codegen_ssa/src/traits/builder.rs b/compiler/rustc_codegen_ssa/src/traits/builder.rs index 853c6934c2c24..a8fea579875bc 100644 --- a/compiler/rustc_codegen_ssa/src/traits/builder.rs +++ b/compiler/rustc_codegen_ssa/src/traits/builder.rs @@ -257,6 +257,21 @@ pub trait BuilderMethods<'a, 'tcx>: flags: MemFlags, ); + /// Loop that iterates over some memory using offsets steps. + /// Interprets pointers as u8 pointers. + /// `BodyPtrsVisitor` allow access to body and current iteration pointers. + /// Steps MUST not be zeros. + /// `steps[i]*iterations` MUST not overflow targets `usize`. 
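+    /// The `visitor` closure is invoked once while the loop body is being built; the
+    /// pointers it receives evaluate at run time to `start_ptrs[i] + steps[i] * iteration`.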
+ fn make_memory_loop( + &mut self, + loop_name: &str, + start_ptrs: [Self::Value; VAR_COUNT], + steps: [Size; VAR_COUNT], + iterations: Self::Value, + visitor: BodyPtrsVisitor, + ) where + BodyPtrsVisitor: FnOnce(&mut Self, &[Self::Value; VAR_COUNT]); + fn select( &mut self, cond: Self::Value, diff --git a/compiler/rustc_const_eval/src/interpret/intrinsics.rs b/compiler/rustc_const_eval/src/interpret/intrinsics.rs index 7192bbc00d556..3ae610e2954e2 100644 --- a/compiler/rustc_const_eval/src/interpret/intrinsics.rs +++ b/compiler/rustc_const_eval/src/interpret/intrinsics.rs @@ -291,6 +291,24 @@ impl<'mir, 'tcx: 'mir, M: Machine<'mir, 'tcx>> InterpCx<'mir, 'tcx, M> { sym::write_bytes => { self.write_bytes_intrinsic(&args[0], &args[1], &args[2])?; } + sym::swap_nonoverlapping_single => { + let layout = self.layout_of(substs.type_at(0))?; + self.mem_swap_nonoverlapping( + self.read_pointer(&args[0])?, + self.read_pointer(&args[1])?, + 1, + layout, + )?; + } + sym::swap_nonoverlapping_many => { + let layout = self.layout_of(substs.type_at(0))?; + self.mem_swap_nonoverlapping( + self.read_pointer(&args[0])?, + self.read_pointer(&args[1])?, + self.read_target_usize(&args[2])?, + layout, + )?; + } sym::arith_offset => { let ptr = self.read_pointer(&args[0])?; let offset_count = self.read_target_isize(&args[1])?; diff --git a/compiler/rustc_const_eval/src/interpret/memory.rs b/compiler/rustc_const_eval/src/interpret/memory.rs index 1125d8d1f0e08..eb33c1f4fdb93 100644 --- a/compiler/rustc_const_eval/src/interpret/memory.rs +++ b/compiler/rustc_const_eval/src/interpret/memory.rs @@ -21,10 +21,10 @@ use rustc_target::abi::{Align, HasDataLayout, Size}; use crate::const_eval::CheckAlignment; use crate::fluent_generated as fluent; +use super::alloc_range; use super::{ - alloc_range, AllocBytes, AllocId, AllocMap, AllocRange, Allocation, CheckInAllocMsg, - GlobalAlloc, InterpCx, InterpResult, Machine, MayLeak, Pointer, PointerArithmetic, Provenance, - Scalar, + AllocBytes, AllocId, AllocMap, AllocRange, Allocation, CheckInAllocMsg, GlobalAlloc, InterpCx, + InterpResult, Machine, MayLeak, Pointer, PointerArithmetic, Provenance, Scalar, }; #[derive(Debug, PartialEq, Copy, Clone)] @@ -1222,6 +1222,59 @@ impl<'mir, 'tcx: 'mir, M: Machine<'mir, 'tcx>> InterpCx<'mir, 'tcx, M> { Ok(()) } + + pub fn mem_swap_nonoverlapping( + &mut self, + x_ptr: Pointer>, + y_ptr: Pointer>, + count: u64, + layout: ty::layout::TyAndLayout<'tcx>, + ) -> InterpResult<'tcx> { + let elem_size = layout.size; + let align = layout.align.abi; + + if count > i64::MAX as u64 { + throw_ub_format!("`count` argument to `swap_nonoverlapping_many` is too large."); + } + + let first_ptr_acc = self.get_ptr_access(x_ptr, elem_size * count, align)?; + let second_ptr_acc = self.get_ptr_access(y_ptr, elem_size * count, align)?; + + let Some((x_alloc_id, x_offset, _)) = first_ptr_acc else { + assert_eq!(elem_size, Size::ZERO); + // Called on ZST so it is noop. 
+                return Ok(())
+            };
+            let Some((y_alloc_id, y_offset, _)) = second_ptr_acc else {
+                unreachable!("If the right operand is a ZST, the left one must be too")
+            };
+
+            if x_alloc_id == y_alloc_id {
+                if (x_offset..x_offset + elem_size * count).contains(&y_offset)
+                    || (y_offset..y_offset + elem_size * count).contains(&x_offset)
+                {
+                    throw_ub_format!("swap was called on overlapping memory.");
+                }
+            }
+
+            if count == 0 {
+                return Ok(());
+            }
+
+            let tmp_stack_alloc = self.allocate(layout, MemoryKind::Stack)?;
+
+            for i in 0..i64::try_from(count).unwrap() {
+                let curr_x_ptr = self.ptr_offset_inbounds(x_ptr, layout.ty, i)?;
+                let curr_y_ptr = self.ptr_offset_inbounds(y_ptr, layout.ty, i)?;
+
+                self.mem_copy(curr_x_ptr, align, tmp_stack_alloc.ptr, align, elem_size, true)?;
+                self.mem_copy(curr_y_ptr, align, curr_x_ptr, align, elem_size, true)?;
+                self.mem_copy(tmp_stack_alloc.ptr, align, curr_y_ptr, align, elem_size, true)?;
+            }
+            self.deallocate_ptr(tmp_stack_alloc.ptr, Some((elem_size, align)), MemoryKind::Stack)?;
+
+            Ok(())
+        }
 }
 
 /// Machine pointer introspection.
diff --git a/compiler/rustc_hir_analysis/src/check/intrinsic.rs b/compiler/rustc_hir_analysis/src/check/intrinsic.rs
index 36c468e778986..b93bc57e2002e 100644
--- a/compiler/rustc_hir_analysis/src/check/intrinsic.rs
+++ b/compiler/rustc_hir_analysis/src/check/intrinsic.rs
@@ -278,6 +278,23 @@ pub fn check_intrinsic_type(tcx: TyCtxt<'_>, it: &hir::ForeignItem<'_>) {
             ],
             tcx.mk_unit(),
         ),
+        sym::swap_nonoverlapping_single => (
+            1,
+            vec![
+                tcx.mk_ptr(ty::TypeAndMut { ty: param(0), mutbl: hir::Mutability::Mut }),
+                tcx.mk_ptr(ty::TypeAndMut { ty: param(0), mutbl: hir::Mutability::Mut }),
+            ],
+            tcx.mk_unit(),
+        ),
+        sym::swap_nonoverlapping_many => (
+            1,
+            vec![
+                tcx.mk_ptr(ty::TypeAndMut { ty: param(0), mutbl: hir::Mutability::Mut }),
+                tcx.mk_ptr(ty::TypeAndMut { ty: param(0), mutbl: hir::Mutability::Mut }),
+                tcx.types.usize,
+            ],
+            tcx.mk_unit(),
+        ),
         sym::sqrtf32 => (0, vec![tcx.types.f32], tcx.types.f32),
         sym::sqrtf64 => (0, vec![tcx.types.f64], tcx.types.f64),
         sym::powif32 => (0, vec![tcx.types.f32, tcx.types.i32], tcx.types.f32),
diff --git a/compiler/rustc_span/src/symbol.rs b/compiler/rustc_span/src/symbol.rs
index c5ce2575fff06..aacee6a44ce1f 100644
--- a/compiler/rustc_span/src/symbol.rs
+++ b/compiler/rustc_span/src/symbol.rs
@@ -1480,6 +1480,8 @@ symbols! {
         sub_assign,
         sub_with_overflow,
         suggestion,
+        swap_nonoverlapping_many,
+        swap_nonoverlapping_single,
         sym,
         sync,
         t32,
diff --git a/library/core/src/intrinsics.rs b/library/core/src/intrinsics.rs
index 9b8612485ac1e..7eb145e99e73c 100644
--- a/library/core/src/intrinsics.rs
+++ b/library/core/src/intrinsics.rs
@@ -2768,6 +2768,67 @@ pub const unsafe fn copy<T>(src: *const T, dst: *mut T, count: usize) {
     }
 }
 
+#[cfg(not(bootstrap))]
+extern "rust-intrinsic" {
+    /// This is an implementation detail of [`crate::mem::swap`] and should
+    /// not be used anywhere else.
+    ///
+    /// Swaps two values, using as little extra memory as the target allows.
+    /// It exists to move target/backend-specific optimizations out of library code and
+    /// to make MIR-level optimizations simpler to implement.
+    ///
+    /// The operation is "untyped" in the sense that data may be uninitialized or otherwise violate the
+    /// requirements of `T`. The initialization state is preserved exactly.
+    ///
+    /// # Safety
+    ///
+    /// Behavior is undefined if any of the following conditions are violated:
+    ///
+    /// * Both `x` and `y` must be valid for both reads and writes of `size_of::<T>()` bytes.
+    ///
+    /// * Both `x` and `y` must be properly aligned.
+    ///
+    /// * The region of memory beginning at `x` with a size of `size_of::<T>()`
+    ///   bytes must *not* overlap with the region of memory beginning at `y`
+    ///   with the same size.
+    ///
+    /// Note that even if the effectively copied size (`size_of::<T>()`) is `0`,
+    /// the pointers must be non-null and properly aligned.
+    #[rustc_nounwind]
+    #[rustc_const_unstable(feature = "const_swap", issue = "83163")]
+    pub fn swap_nonoverlapping_single<T>(x: *mut T, y: *mut T);
+
+    /// This is an implementation detail of [`crate::ptr::swap_nonoverlapping`] and should
+    /// not be used anywhere else.
+    ///
+    /// Swaps two ranges of `count` values starting at `x` and `y`, using as little extra
+    /// memory as the target allows.
+    /// It exists to move target/backend-specific optimizations out of library code and
+    /// to make MIR-level optimizations simpler to implement.
+    ///
+    /// The operation is "untyped" in the sense that data may be uninitialized or otherwise violate the
+    /// requirements of `T`. The initialization state is preserved exactly.
+    ///
+    /// # Safety
+    ///
+    /// Behavior is undefined if any of the following conditions are violated:
+    ///
+    /// * Both `x` and `y` must be valid for both reads and writes of `count *
+    ///   size_of::<T>()` bytes.
+    ///
+    /// * Both `x` and `y` must be properly aligned.
+    ///
+    /// * The region of memory beginning at `x` with a size of `count *
+    ///   size_of::<T>()` bytes must *not* overlap with the region of memory
+    ///   beginning at `y` with the same size.
+    ///
+    /// Note that even if the effectively copied size (`count * size_of::<T>()`) is `0`,
+    /// the pointers must be non-null and properly aligned.
+    #[rustc_nounwind]
+    #[rustc_const_unstable(feature = "const_swap", issue = "83163")]
+    pub fn swap_nonoverlapping_many<T>(x: *mut T, y: *mut T, count: usize);
+}
+
 /// Sets `count * size_of::<T>()` bytes of memory starting at `dst` to
 /// `val`.
 ///
diff --git a/library/core/src/mem/mod.rs b/library/core/src/mem/mod.rs
index 39c9a04eea92b..9b374745cd3c5 100644
--- a/library/core/src/mem/mod.rs
+++ b/library/core/src/mem/mod.rs
@@ -724,40 +724,52 @@ pub unsafe fn uninitialized<T>() -> T {
 #[stable(feature = "rust1", since = "1.0.0")]
 #[rustc_const_unstable(feature = "const_swap", issue = "83163")]
 pub const fn swap<T>(x: &mut T, y: &mut T) {
-    // NOTE(eddyb) SPIR-V's Logical addressing model doesn't allow for arbitrary
-    // reinterpretation of values as (chunkable) byte arrays, and the loop in the
-    // block optimization in `swap_slice` is hard to rewrite back
-    // into the (unoptimized) direct swapping implementation, so we disable it.
-    // FIXME(eddyb) the block optimization also prevents MIR optimizations from
-    // understanding `mem::replace`, `Option::take`, etc. - a better overall
-    // solution might be to make `ptr::swap_nonoverlapping` into an intrinsic, which
-    // a backend can choose to implement using the block optimization, or not.
-    #[cfg(not(any(target_arch = "spirv")))]
+    #[cfg(bootstrap)]
     {
-        // For types that are larger multiples of their alignment, the simple way
-        // tends to copy the whole thing to stack rather than doing it one part
-        // at a time, so instead treat them as one-element slices and piggy-back
-        // the slice optimizations that will split up the swaps.
-        if size_of::<T>() / align_of::<T>() > 4 {
-            // SAFETY: exclusive references always point to one non-overlapping
-            // element and are non-null and properly aligned.
- return unsafe { ptr::swap_nonoverlapping(x, y, 1) }; + // NOTE(eddyb) SPIR-V's Logical addressing model doesn't allow for arbitrary + // reinterpretation of values as (chunkable) byte arrays, and the loop in the + // block optimization in `swap_slice` is hard to rewrite back + // into the (unoptimized) direct swapping implementation, so we disable it. + // FIXME(eddyb) the block optimization also prevents MIR optimizations from + // understanding `mem::replace`, `Option::take`, etc. - a better overall + // solution might be to make `ptr::swap_nonoverlapping` into an intrinsic, which + // a backend can choose to implement using the block optimization, or not. + #[cfg(not(any(target_arch = "spirv")))] + { + // For types that are larger multiples of their alignment, the simple way + // tends to copy the whole thing to stack rather than doing it one part + // at a time, so instead treat them as one-element slices and piggy-back + // the slice optimizations that will split up the swaps. + if size_of::() / align_of::() > 4 { + // SAFETY: exclusive references always point to one non-overlapping + // element and are non-null and properly aligned. + return unsafe { ptr::swap_nonoverlapping(x, y, 1) }; + } } - } - // If a scalar consists of just a small number of alignment units, let - // the codegen just swap those pieces directly, as it's likely just a - // few instructions and anything else is probably overcomplicated. - // - // Most importantly, this covers primitives and simd types that tend to - // have size=align where doing anything else can be a pessimization. - // (This will also be used for ZSTs, though any solution works for them.) - swap_simple(x, y); + // If a scalar consists of just a small number of alignment units, let + // the codegen just swap those pieces directly, as it's likely just a + // few instructions and anything else is probably overcomplicated. + // + // Most importantly, this covers primitives and simd types that tend to + // have size=align where doing anything else can be a pessimization. + // (This will also be used for ZSTs, though any solution works for them.) + swap_simple(x, y); + } + #[cfg(not(bootstrap))] + // SAFETY: since `x` and `y` are mutable references, + // 1. `x` and `y` are initialized. + // 2. `x` and `y` cannot overlap. + // 3. `x` and `y` are aligned. + unsafe { + core::intrinsics::swap_nonoverlapping_single(x, y); + } } /// Same as [`swap`] semantically, but always uses the simple implementation. /// /// Used elsewhere in `mem` and `ptr` at the bottom layer of calls. +#[cfg(bootstrap)] #[rustc_const_unstable(feature = "const_swap", issue = "83163")] #[inline] pub(crate) const fn swap_simple(x: &mut T, y: &mut T) { diff --git a/library/core/src/ptr/mod.rs b/library/core/src/ptr/mod.rs index acc9ca29d41a1..a6de56da986ae 100644 --- a/library/core/src/ptr/mod.rs +++ b/library/core/src/ptr/mod.rs @@ -908,25 +908,6 @@ pub const unsafe fn swap(x: *mut T, y: *mut T) { #[stable(feature = "swap_nonoverlapping", since = "1.27.0")] #[rustc_const_unstable(feature = "const_swap", issue = "83163")] pub const unsafe fn swap_nonoverlapping(x: *mut T, y: *mut T, count: usize) { - #[allow(unused)] - macro_rules! 
attempt_swap_as_chunks { - ($ChunkTy:ty) => { - if mem::align_of::() >= mem::align_of::<$ChunkTy>() - && mem::size_of::() % mem::size_of::<$ChunkTy>() == 0 - { - let x: *mut $ChunkTy = x.cast(); - let y: *mut $ChunkTy = y.cast(); - let count = count * (mem::size_of::() / mem::size_of::<$ChunkTy>()); - // SAFETY: these are the same bytes that the caller promised were - // ok, just typed as `MaybeUninit`s instead of as `T`s. - // The `if` condition above ensures that we're not violating - // alignment requirements, and that the division is exact so - // that we don't lose any bytes off the end. - return unsafe { swap_nonoverlapping_simple_untyped(x, y, count) }; - } - }; - } - // SAFETY: the caller must guarantee that `x` and `y` are // valid for writes and properly aligned. unsafe { @@ -940,19 +921,16 @@ pub const unsafe fn swap_nonoverlapping(x: *mut T, y: *mut T, count: usize) { ); } - // Split up the slice into small power-of-two-sized chunks that LLVM is able - // to vectorize (unless it's a special type with more-than-pointer alignment, - // because we don't want to pessimize things like slices of SIMD vectors.) - if mem::align_of::() <= mem::size_of::() - && (!mem::size_of::().is_power_of_two() - || mem::size_of::() > mem::size_of::() * 2) - { - attempt_swap_as_chunks!(usize); - attempt_swap_as_chunks!(u8); + #[cfg(bootstrap)] + // SAFETY: Same preconditions as this function + unsafe { + swap_nonoverlapping_simple_untyped(x, y, count) } - + #[cfg(not(bootstrap))] // SAFETY: Same preconditions as this function - unsafe { swap_nonoverlapping_simple_untyped(x, y, count) } + unsafe { + intrinsics::swap_nonoverlapping_many(x, y, count) + } } /// Same behaviour and safety conditions as [`swap_nonoverlapping`] @@ -960,6 +938,7 @@ pub const unsafe fn swap_nonoverlapping(x: *mut T, y: *mut T, count: usize) { /// LLVM can vectorize this (at least it can for the power-of-two-sized types /// `swap_nonoverlapping` tries to use) so no need to manually SIMD it. 
#[inline] +#[cfg(bootstrap)] #[rustc_const_unstable(feature = "const_swap", issue = "83163")] const unsafe fn swap_nonoverlapping_simple_untyped(x: *mut T, y: *mut T, count: usize) { let x = x.cast::>(); diff --git a/library/core/tests/mem.rs b/library/core/tests/mem.rs index 5c2e18745ea21..251ebb274c520 100644 --- a/library/core/tests/mem.rs +++ b/library/core/tests/mem.rs @@ -103,6 +103,92 @@ fn test_swap() { assert_eq!(y, 31337); } +#[test] +fn test_many() { + // This tests if chunking works properly + fn swap_sized(a: T, b: T) { + let mut x: [T; SIZE] = [a; SIZE]; + let mut y: [T; SIZE] = [b; SIZE]; + swap::<[T; SIZE]>(&mut x, &mut y); + assert_eq!(x, [b; SIZE]); + assert_eq!(y, [a; SIZE]); + } + + fn swap_t(a: T, b: T) { + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + swap_sized::(a, b); + } + + swap_t::(7, 0xFF); + swap_t::(0xFAFA, 0x9898); + swap_t::(0xF0F0_F0F0, 0x0E0E_0E0E); + swap_t::(7, 8); + + #[derive(Eq, PartialEq, Debug)] + #[repr(align(32))] + struct LargeAlign([u8; 32]); + + let mut x = LargeAlign([9; 32]); + let mut y = LargeAlign([20; 32]); + swap(&mut x, &mut y); + assert_eq!(x, LargeAlign([20; 32])); + assert_eq!(y, LargeAlign([9; 32])); + + #[derive(Eq, PartialEq, Debug)] + #[repr(align(32))] + struct LargeAlignAndSize([u8; 96]); + + let mut x = LargeAlignAndSize([9; 96]); + let mut y = LargeAlignAndSize([20; 96]); + swap(&mut x, &mut y); + assert_eq!(x, LargeAlignAndSize([20; 96])); + assert_eq!(y, LargeAlignAndSize([9; 96])); + + #[derive(Eq, PartialEq, Debug)] + struct WithPadding { + a: u16, + b: u64, + } + + let mut x = WithPadding { a: 7, b: 27 }; + let mut y = WithPadding { a: 77, b: u64::MAX }; + swap(&mut x, &mut y); + assert_eq!(x, WithPadding { a: 77, b: u64::MAX }); + assert_eq!(y, WithPadding { a: 7, b: 27 }); +} + #[test] fn test_replace() { let mut x = Some("test".to_string()); diff --git a/library/core/tests/ptr.rs b/library/core/tests/ptr.rs index c02cd99cc4477..80837f040039b 100644 --- a/library/core/tests/ptr.rs +++ b/library/core/tests/ptr.rs @@ -1088,6 +1088,28 @@ fn swap_copy_untyped() { assert_eq!(y, 5); } +#[test] +fn test_swap_unaligned_on_x86_64() { + #[derive(Copy, Clone, Eq, PartialEq, Debug)] + struct AlignedTo2([u16; 4]); + + assert!( + mem::size_of::() >= mem::size_of::() + && mem::align_of::() == 2 + && mem::align_of::() < mem::align_of::() + ); + + let buff0: &mut [_] = &mut [AlignedTo2([1, 2, 3, 4]); 20]; + let buff1: &mut [_] = &mut [AlignedTo2([5, 6, 7, 8]); 20]; + let len = 20; + + unsafe { + swap_nonoverlapping(buff0.as_mut_ptr(), buff1.as_mut_ptr(), read_volatile(&len)); + } + assert_eq!(buff0, &[AlignedTo2([5, 6, 7, 8]); 20]); + assert_eq!(buff1, &[AlignedTo2([1, 2, 3, 4]); 20]); +} + #[test] fn test_const_copy() { const { diff --git a/tests/codegen/swap-large-types.rs b/tests/codegen/swap-large-types.rs index 4a68403578d1e..69caa87514f29 100644 --- 
a/tests/codegen/swap-large-types.rs +++ b/tests/codegen/swap-large-types.rs @@ -16,7 +16,7 @@ type KeccakBuffer = [[u64; 5]; 5]; // CHECK-LABEL: @swap_basic #[no_mangle] pub fn swap_basic(x: &mut KeccakBuffer, y: &mut KeccakBuffer) { -// CHECK: alloca [5 x [5 x i64]] + // CHECK: alloca [5 x [5 x i64]] // SAFETY: exclusive references are always valid to read/write, // are non-overlapping, and nothing here panics so it's drop-safe. @@ -33,9 +33,14 @@ pub fn swap_basic(x: &mut KeccakBuffer, y: &mut KeccakBuffer) { // CHECK-LABEL: @swap_std #[no_mangle] pub fn swap_std(x: &mut KeccakBuffer, y: &mut KeccakBuffer) { -// CHECK-NOT: alloca -// CHECK: load <{{[0-9]+}} x i64> -// CHECK: store <{{[0-9]+}} x i64> + // CHECK-NOT: alloca + // CHECK-NOT: br + // CHECK: load <{{[0-9]+}} x i64>{{.*}}align 8 + // CHECK-NOT: alloca + // CHECK-NOT: br + // CHECK: store <{{[0-9]+}} x i64>{{.*}}align 8 + // CHECK-NOT: alloca + // CHECK-NOT: br swap(x, y) } @@ -45,9 +50,11 @@ pub fn swap_std(x: &mut KeccakBuffer, y: &mut KeccakBuffer) { // CHECK-LABEL: @swap_slice #[no_mangle] pub fn swap_slice(x: &mut [KeccakBuffer], y: &mut [KeccakBuffer]) { -// CHECK-NOT: alloca -// CHECK: load <{{[0-9]+}} x i64> -// CHECK: store <{{[0-9]+}} x i64> + // CHECK-NOT: alloca + // CHECK: load <{{[0-9]+}} x i{{8|16|32|64}}>{{.*}}align 8 + // CHECK-NOT: alloca + // CHECK: store <{{[0-9]+}} x i{{8|16|32|64}}>{{.*}}align 8 + // CHECK-NOT: alloca if x.len() == y.len() { x.swap_with_slice(y); } @@ -60,32 +67,24 @@ type OneKilobyteBuffer = [u8; 1024]; // CHECK-LABEL: @swap_1kb_slices #[no_mangle] pub fn swap_1kb_slices(x: &mut [OneKilobyteBuffer], y: &mut [OneKilobyteBuffer]) { -// CHECK-NOT: alloca -// CHECK: load <{{[0-9]+}} x i8> -// CHECK: store <{{[0-9]+}} x i8> + // CHECK-NOT: alloca + // CHECK: load <{{[0-9]+}} x i{{8|16|32|64}}> + // CHECK: store <{{[0-9]+}} x i{{8|16|32|64}}> + // CHECK-NOT: alloca if x.len() == y.len() { x.swap_with_slice(y); } } -// This verifies that the 2×read + 2×write optimizes to just 3 memcpys -// for an unusual type like this. It's not clear whether we should do anything -// smarter in Rust for these, so for now it's fine to leave these up to the backend. -// That's not as bad as it might seem, as for example, LLVM will lower the -// memcpys below to VMOVAPS on YMMs if one enables the AVX target feature. -// Eventually we'll be able to pass `align_of::` to a const generic and -// thus pick a smarter chunk size ourselves without huge code duplication. 
- #[repr(align(64))] pub struct BigButHighlyAligned([u8; 64 * 3]); // CHECK-LABEL: @swap_big_aligned #[no_mangle] pub fn swap_big_aligned(x: &mut BigButHighlyAligned, y: &mut BigButHighlyAligned) { -// CHECK-NOT: call void @llvm.memcpy -// CHECK: call void @llvm.memcpy.{{.+}}({{i8\*|ptr}} noundef nonnull align 64 dereferenceable(192) -// CHECK: call void @llvm.memcpy.{{.+}}({{i8\*|ptr}} noundef nonnull align 64 dereferenceable(192) -// CHECK: call void @llvm.memcpy.{{.+}}({{i8\*|ptr}} noundef nonnull align 64 dereferenceable(192) -// CHECK-NOT: call void @llvm.memcpy + // CHECK-NOT: alloca + // CHECK-NOT: call void @llvm.memcpy + // CHECK: call void @llvm.memcpy.{{.+}}({{i8\*|ptr}} noundef nonnull align 64 dereferenceable(192) + // CHECK-NOT: call void @llvm.memcpy swap(x, y) } diff --git a/tests/codegen/swap-simd-types.rs b/tests/codegen/swap-simd-types.rs index 3472a42b0e65e..5d702144b3973 100644 --- a/tests/codegen/swap-simd-types.rs +++ b/tests/codegen/swap-simd-types.rs @@ -1,4 +1,4 @@ -// compile-flags: -O -C target-feature=+avx +// compile-flags: -Copt-level=3 -C target-feature=+avx // only-x86_64 // ignore-debug: the debug assertions get in the way @@ -35,7 +35,7 @@ pub fn swap_m256_slice(x: &mut [__m256], y: &mut [__m256]) { #[no_mangle] pub fn swap_bytes32(x: &mut [u8; 32], y: &mut [u8; 32]) { // CHECK-NOT: alloca -// CHECK: load <32 x i8>{{.+}}align 1 -// CHECK: store <32 x i8>{{.+}}align 1 +// CHECK: load <4 x i64>{{.+}}align 1 +// CHECK: store <4 x i64>{{.+}}align 1 swap(x, y) } diff --git a/tests/codegen/swap-small-types.rs b/tests/codegen/swap-small-types.rs index 419645a3fc6bc..03dd4f7a5dddd 100644 --- a/tests/codegen/swap-small-types.rs +++ b/tests/codegen/swap-small-types.rs @@ -1,8 +1,9 @@ -// compile-flags: -O -Z merge-functions=disabled +// compile-flags: -Copt-level=3 -Z merge-functions=disabled // only-x86_64 // ignore-debug: the debug assertions get in the way #![crate_type = "lib"] +#![feature(portable_simd)] use std::mem::swap; @@ -26,10 +27,61 @@ pub fn swap_rgb48_manually(x: &mut RGB48, y: &mut RGB48) { #[no_mangle] pub fn swap_rgb48(x: &mut RGB48, y: &mut RGB48) { // CHECK-NOT: alloca - // CHECK: load <3 x i16> - // CHECK: load <3 x i16> - // CHECK: store <3 x i16> - // CHECK: store <3 x i16> + // CHECK-NOT: br + // CHECK: load i32 + + // CHECK-NOT: alloca + // CHECK-NOT: br + // CHECK: store i32 + + // CHECK-NOT: alloca + // CHECK-NOT: br + // CHECK: load i16 + + // CHECK-NOT: alloca + // CHECK-NOT: br + // CHECK: store i16 + + // CHECK-NOT: alloca + // CHECK-NOT: br + // CHECK: ret void + swap(x, y) +} + +// CHECK-LABEL: @swap_vecs +#[no_mangle] +pub fn swap_vecs(x: &mut Vec, y: &mut Vec) { + // CHECK-NOT: alloca + // CHECK-NOT: br + + // CHECK: load <{{[0-9]+}} x i64> + // CHECK-NOT: alloca + // CHECK-NOT: br + + // CHECK: store <{{[0-9]+}} x i64> + // CHECK-NOT: alloca + // CHECK-NOT: br + + // CHECK: load i64 + // CHECK-NOT: alloca + // CHECK-NOT: br + + // CHECK: store i64 + // CHECK-NOT: alloca + // CHECK-NOT: br + + // CHECK: ret void + swap(x, y) +} + +// CHECK-LABEL: @swap_slices +#[no_mangle] +pub fn swap_slices<'a>(x: &mut &'a [u32], y: &mut &'a [u32]) { + // CHECK-NOT: alloca + // CHECK-NOT: br + // CHECK: load <{{[0-9]+}} x i64> + // CHECK: store <{{[0-9]+}} x i64> + // CHECK: ret void swap(x, y) } @@ -40,23 +92,23 @@ type RGB24 = [u8; 3]; // CHECK-LABEL: @swap_rgb24_slices #[no_mangle] pub fn swap_rgb24_slices(x: &mut [RGB24], y: &mut [RGB24]) { -// CHECK-NOT: alloca -// CHECK: load <{{[0-9]+}} x i8> -// CHECK: store <{{[0-9]+}} x i8> + // 
CHECK-NOT: alloca + // CHECK: load <{{[0-9]+}} x i8> + // CHECK: store <{{[0-9]+}} x i8> if x.len() == y.len() { x.swap_with_slice(y); } } -// This one has a power-of-two size, so we iterate over it directly +// This one has a power-of-two size, so we iterate over it using ints. type RGBA32 = [u8; 4]; // CHECK-LABEL: @swap_rgba32_slices #[no_mangle] pub fn swap_rgba32_slices(x: &mut [RGBA32], y: &mut [RGBA32]) { -// CHECK-NOT: alloca -// CHECK: load <{{[0-9]+}} x i32> -// CHECK: store <{{[0-9]+}} x i32> + // CHECK-NOT: alloca + // CHECK: load <{{[0-9]+}} x i32> + // CHECK: store <{{[0-9]+}} x i32> if x.len() == y.len() { x.swap_with_slice(y); } @@ -69,10 +121,38 @@ const _: () = assert!(!std::mem::size_of::().is_power_of_two()); // CHECK-LABEL: @swap_string_slices #[no_mangle] pub fn swap_string_slices(x: &mut [String], y: &mut [String]) { -// CHECK-NOT: alloca -// CHECK: load <{{[0-9]+}} x i64> -// CHECK: store <{{[0-9]+}} x i64> + // CHECK-NOT: alloca + // CHECK: load <{{[0-9]+}} x i64> + // CHECK: store <{{[0-9]+}} x i64> if x.len() == y.len() { x.swap_with_slice(y); } } + +#[repr(C, packed)] +pub struct Packed { + pub first: bool, + pub second: u64, +} + +// CHECK-LABEL: @swap_packed_structs +#[no_mangle] +pub fn swap_packed_structs(x: &mut Packed, y: &mut Packed) { + // CHECK-NOT: alloca + // CHECK: ret void + swap(x, y) +} + +// CHECK-LABEL: @swap_simd_type +#[no_mangle] +pub fn swap_simd_type(x: &mut std::simd::f32x4, y: &mut std::simd::f32x4){ + // CHECK-NOT: alloca + // CHECK-NOT: br + // CHECK: load <4 x float> + + // CHECK-NOT: alloca + // CHECK-NOT: br + // CHECK: store <4 x float> + // CHECK: ret void + swap(x, y) +} diff --git a/tests/ui/consts/missing_span_in_backtrace.stderr b/tests/ui/consts/missing_span_in_backtrace.stderr index fcfb9fbb3f8c0..1535777326566 100644 --- a/tests/ui/consts/missing_span_in_backtrace.stderr +++ b/tests/ui/consts/missing_span_in_backtrace.stderr @@ -3,12 +3,6 @@ error[E0080]: evaluation of constant value failed | = note: unable to copy parts of a pointer from memory at ALLOC_ID | -note: inside `std::ptr::read::>>` - --> $SRC_DIR/core/src/ptr/mod.rs:LL:COL -note: inside `mem::swap_simple::>>` - --> $SRC_DIR/core/src/mem/mod.rs:LL:COL -note: inside `ptr::swap_nonoverlapping_simple_untyped::>` - --> $SRC_DIR/core/src/ptr/mod.rs:LL:COL note: inside `swap_nonoverlapping::>` --> $SRC_DIR/core/src/ptr/mod.rs:LL:COL note: inside `X` diff --git a/tests/ui/intrinsics/swap_nonoverlapping_single.rs b/tests/ui/intrinsics/swap_nonoverlapping_single.rs new file mode 100644 index 0000000000000..64afeceeb7ca8 --- /dev/null +++ b/tests/ui/intrinsics/swap_nonoverlapping_single.rs @@ -0,0 +1,132 @@ +#![feature(core_intrinsics, const_mut_refs, const_swap)] +#![crate_type = "rlib"] + +//! This module tests if `swap_nonoverlapping_single` works properly in const contexts. 
+ +use std::intrinsics::swap_nonoverlapping_single; + +pub const OK_A: () = { + let mut a = 0i32; + let mut b = 5i32; + unsafe { + swap_nonoverlapping_single(&mut a, &mut b); + } + assert!(a == 5, "Must NOT fail."); + assert!(b == 0, "Must NOT fail."); +}; + +pub const ERR_A0: () = { + let mut a = 0i32; + let mut b = 5i32; + unsafe { + swap_nonoverlapping_single(&mut a, &mut b); + } + + assert!(a != 5, "Must fail."); //~ ERROR evaluation of constant value failed +}; + +pub const ERR_A1: () = { + let mut a = 0i32; + let mut b = 5i32; + unsafe { + swap_nonoverlapping_single(&mut a, &mut b); + } + + assert!(b != 0, "Must fail."); //~ ERROR evaluation of constant value failed +}; + +// This must NOT fail. +pub const B: () = { + let mut a = 0i32; + let mut b = 5i32; + unsafe { + swap_nonoverlapping_single(&mut a, &mut b); + } + unsafe { + swap_nonoverlapping_single(&mut a, &mut b); + } + assert!(a == 0, "Must NOT fail."); + assert!(b == 5, "Must NOT fail."); +}; + +pub const ERR_B0: () = { + let mut a = 0i32; + let mut b = 5i32; + unsafe { + swap_nonoverlapping_single(&mut a, &mut b); + } + unsafe { + swap_nonoverlapping_single(&mut a, &mut b); + } + + assert!(a != 0, "Must fail."); //~ ERROR evaluation of constant value failed +}; + +pub const ERR_B1: () = { + let mut a = 0i32; + let mut b = 5i32; + unsafe { + swap_nonoverlapping_single(&mut a, &mut b); + } + unsafe { + swap_nonoverlapping_single(&mut a, &mut b); + } + + assert!(b != 5, "Must fail."); //~ ERROR evaluation of constant value failed +}; + +// This must NOT fail. +pub const NON_OVERLAPPING_PTRS: () = { + let mut chunk = [0_i32, 1, 2, 3]; + + let ptr = chunk.as_mut_ptr(); + let ptr2 = unsafe { ptr.add(2) }; + let x: &mut [i32; 2] = unsafe { &mut *ptr.cast() }; + let y: &mut [i32; 2] = unsafe { &mut *ptr2.cast() }; + unsafe { + swap_nonoverlapping_single(x, y); + } + + assert!(matches!(chunk, [2, 3, 0, 1]), "Must NOT fail."); +}; + +pub const OVERLAPPING_PTRS_0: () = { + let mut chunk = [0_i32, 1, 2, 3]; + + let ptr = chunk.as_mut_ptr(); + let ptr2 = unsafe { ptr.add(1) }; + let x: &mut [i32; 2] = unsafe { &mut *ptr.cast() }; + let y: &mut [i32; 2] = unsafe { &mut *ptr2.cast() }; + + unsafe { + swap_nonoverlapping_single(x, y); //~ ERROR evaluation of constant value failed + } +}; + +pub const OVERLAPPING_PTRS_1: () = { + let mut val = 7; + + let ptr: *mut _ = &mut val; + let x: &mut i32 = unsafe { &mut *ptr }; + let y: &mut i32 = unsafe { &mut *ptr }; + + unsafe { + swap_nonoverlapping_single(x, y); //~ ERROR evaluation of constant value failed + } +}; + +pub const OK_STRUCT: () = { + struct Adt { + fl: bool, + val: usize, + } + let mut a = Adt { fl: false, val: 10 }; + let mut b = Adt { fl: true, val: 77 }; + + unsafe { + swap_nonoverlapping_single(&mut a, &mut b); + } + + assert!(matches!(a, Adt { fl: true, val: 77 }), "Must NOT fail."); + assert!(matches!(b, Adt { fl: false, val: 10 }), "Must NOT fail."); +}; diff --git a/tests/ui/intrinsics/swap_nonoverlapping_single.stderr b/tests/ui/intrinsics/swap_nonoverlapping_single.stderr new file mode 100644 index 0000000000000..d9a77927ea13a --- /dev/null +++ b/tests/ui/intrinsics/swap_nonoverlapping_single.stderr @@ -0,0 +1,47 @@ +error[E0080]: evaluation of constant value failed + --> $DIR/swap_nonoverlapping_single.rs:25:5 + | +LL | assert!(a != 5, "Must fail."); + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ the evaluated program panicked at 'Must fail.', $DIR/swap_nonoverlapping_single.rs:25:5 + | + = note: this error originates in the macro `$crate::panic::panic_2015` which comes from the 
expansion of the macro `panic` (in Nightly builds, run with -Z macro-backtrace for more info) + +error[E0080]: evaluation of constant value failed + --> $DIR/swap_nonoverlapping_single.rs:35:5 + | +LL | assert!(b != 0, "Must fail."); + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ the evaluated program panicked at 'Must fail.', $DIR/swap_nonoverlapping_single.rs:35:5 + | + = note: this error originates in the macro `$crate::panic::panic_2015` which comes from the expansion of the macro `panic` (in Nightly builds, run with -Z macro-backtrace for more info) + +error[E0080]: evaluation of constant value failed + --> $DIR/swap_nonoverlapping_single.rs:62:5 + | +LL | assert!(a != 0, "Must fail."); + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ the evaluated program panicked at 'Must fail.', $DIR/swap_nonoverlapping_single.rs:62:5 + | + = note: this error originates in the macro `$crate::panic::panic_2015` which comes from the expansion of the macro `panic` (in Nightly builds, run with -Z macro-backtrace for more info) + +error[E0080]: evaluation of constant value failed + --> $DIR/swap_nonoverlapping_single.rs:75:5 + | +LL | assert!(b != 5, "Must fail."); + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ the evaluated program panicked at 'Must fail.', $DIR/swap_nonoverlapping_single.rs:75:5 + | + = note: this error originates in the macro `$crate::panic::panic_2015` which comes from the expansion of the macro `panic` (in Nightly builds, run with -Z macro-backtrace for more info) + +error[E0080]: evaluation of constant value failed + --> $DIR/swap_nonoverlapping_single.rs:102:9 + | +LL | swap_nonoverlapping_single(x, y); + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ swap was called on overlapping memory. + +error[E0080]: evaluation of constant value failed + --> $DIR/swap_nonoverlapping_single.rs:114:9 + | +LL | swap_nonoverlapping_single(x, y); + | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ swap was called on overlapping memory. + +error: aborting due to 6 previous errors + +For more information about this error, try `rustc --explain E0080`.
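
For reference, and not part of the patch itself: the loop that `make_swaps_loop` emits with `NumOfTemps::Two` and pointer-sized integer chunks corresponds roughly to the following plain-Rust sketch. The helper name `swap_chunks` and the unaligned accesses are illustrative assumptions (mirroring the x86_64 path); the aligned path uses aligned loads and stores instead.

unsafe fn swap_chunks(x: *mut u8, y: *mut u8, bytes: usize) {
    // One pointer-sized integer per chunk; `bytes` must be a multiple of it.
    const CHUNK: usize = core::mem::size_of::<usize>();
    debug_assert_eq!(bytes % CHUNK, 0);
    let mut offset = 0;
    while offset < bytes {
        // Two temporaries per chunk: load both sides, then store them swapped.
        let tmp_x = x.add(offset).cast::<usize>().read_unaligned();
        let tmp_y = y.add(offset).cast::<usize>().read_unaligned();
        x.add(offset).cast::<usize>().write_unaligned(tmp_y);
        y.add(offset).cast::<usize>().write_unaligned(tmp_x);
        offset += CHUNK;
    }
}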