From c8342feef25facca923532e4ff4fe29cf81b52f5 Mon Sep 17 00:00:00 2001
From: Ralf Jung
Date: Sat, 17 Feb 2024 09:43:49 +0100
Subject: [PATCH 1/2] put the idx arguments of simd_insert and simd_extract into const blocks

---
 .../core_arch/src/aarch64/neon/generated.rs | 400 +++++------
 crates/core_arch/src/aarch64/neon/mod.rs | 24 +-
 .../src/arm_shared/neon/generated.rs | 108 +--
 crates/core_arch/src/arm_shared/neon/mod.rs | 104 +--
 crates/core_arch/src/macros.rs | 17 +
 crates/core_arch/src/wasm32/simd128.rs | 32 +-
 crates/core_arch/src/x86/avx.rs | 8 +-
 crates/core_arch/src/x86/avx2.rs | 10 +-
 crates/core_arch/src/x86/avx512f.rs | 638 +++++++++---------
 crates/core_arch/src/x86/sse.rs | 4 +-
 crates/core_arch/src/x86/sse2.rs | 40 +-
 crates/core_arch/src/x86/sse41.rs | 10 +-
 crates/core_arch/src/x86_64/avx.rs | 2 +-
 crates/core_arch/src/x86_64/avx2.rs | 2 +-
 crates/core_arch/src/x86_64/avx512f.rs | 8 +-
 crates/core_arch/src/x86_64/sse2.rs | 4 +-
 crates/core_arch/src/x86_64/sse41.rs | 4 +-
 crates/stdarch-gen/neon.spec | 150 ++--
 18 files changed, 791 insertions(+), 774 deletions(-)

diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs
index 196c6f34e4..ac960c657e 100644
--- a/crates/core_arch/src/aarch64/neon/generated.rs
+++ b/crates/core_arch/src/aarch64/neon/generated.rs
@@ -177,7 +177,7 @@ pub unsafe fn vabdq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t {
 #[cfg_attr(test, assert_instr(fabd))]
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub unsafe fn vabds_f32(a: f32, b: f32) -> f32 {
-    simd_extract(vabd_f32(vdup_n_f32(a), vdup_n_f32(b)), 0)
+    simd_extract!(vabd_f32(vdup_n_f32(a), vdup_n_f32(b)), 0)
 }
 
 /// Floating-point absolute difference
@@ -188,7 +188,7 @@ pub unsafe fn vabds_f32(a: f32, b: f32) -> f32 {
 #[cfg_attr(test, assert_instr(fabd))]
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub unsafe fn vabdd_f64(a: f64, b: f64) -> f64 {
-    simd_extract(vabd_f64(vdup_n_f64(a), vdup_n_f64(b)), 0)
+    simd_extract!(vabd_f64(vdup_n_f64(a), vdup_n_f64(b)), 0)
 }
 
 /// Unsigned Absolute difference Long
@@ -390,7 +390,7 @@ pub unsafe fn vceqd_u64(a: u64, b: u64) -> u64 {
 #[cfg_attr(test, assert_instr(fcmp))]
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub unsafe fn vceqs_f32(a: f32, b: f32) -> u32 {
-    simd_extract(vceq_f32(vdup_n_f32(a), vdup_n_f32(b)), 0)
+    simd_extract!(vceq_f32(vdup_n_f32(a), vdup_n_f32(b)), 0)
 }
 
 /// Floating-point compare equal
@@ -401,7 +401,7 @@ pub unsafe fn vceqs_f32(a: f32, b: f32) -> u32 {
 #[cfg_attr(test, assert_instr(fcmp))]
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub unsafe fn vceqd_f64(a: f64, b: f64) -> u64 {
-    simd_extract(vceq_f64(vdup_n_f64(a), vdup_n_f64(b)), 0)
+    simd_extract!(vceq_f64(vdup_n_f64(a), vdup_n_f64(b)), 0)
 }
 
 /// Signed compare bitwise equal to zero
@@ -722,7 +722,7 @@ pub unsafe fn vceqzd_u64(a: u64) -> u64 {
 #[cfg_attr(test, assert_instr(fcmp))]
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub unsafe fn vceqzs_f32(a: f32) -> u32 {
-    simd_extract(vceqz_f32(vdup_n_f32(a)), 0)
+    simd_extract!(vceqz_f32(vdup_n_f32(a)), 0)
 }
 
 /// Floating-point compare bitwise equal to zero
@@ -733,7 +733,7 @@ pub unsafe fn vceqzs_f32(a: f32) -> u32 {
 #[cfg_attr(test, assert_instr(fcmp))]
 #[stable(feature = "neon_intrinsics", since = "1.59.0")]
 pub unsafe fn vceqzd_f64(a: f64) -> u64 {
-    simd_extract(vceqz_f64(vdup_n_f64(a)), 0)
+    simd_extract!(vceqz_f64(vdup_n_f64(a)), 0)
 }
 
 /// Signed compare bitwise Test bits nonzero
@@ -876,7 +876,7 @@ pub unsafe
fn vuqaddd_s64(a: i64, b: u64) -> i64 { #[cfg_attr(test, assert_instr(suqadd))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vuqaddb_s8(a: i8, b: u8) -> i8 { - simd_extract(vuqadd_s8(vdup_n_s8(a), vdup_n_u8(b)), 0) + simd_extract!(vuqadd_s8(vdup_n_s8(a), vdup_n_u8(b)), 0) } /// Signed saturating accumulate of unsigned value @@ -887,7 +887,7 @@ pub unsafe fn vuqaddb_s8(a: i8, b: u8) -> i8 { #[cfg_attr(test, assert_instr(suqadd))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vuqaddh_s16(a: i16, b: u16) -> i16 { - simd_extract(vuqadd_s16(vdup_n_s16(a), vdup_n_u16(b)), 0) + simd_extract!(vuqadd_s16(vdup_n_s16(a), vdup_n_u16(b)), 0) } /// Floating-point absolute value @@ -1008,7 +1008,7 @@ pub unsafe fn vcgtd_u64(a: u64, b: u64) -> u64 { #[cfg_attr(test, assert_instr(fcmp))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vcgts_f32(a: f32, b: f32) -> u32 { - simd_extract(vcgt_f32(vdup_n_f32(a), vdup_n_f32(b)), 0) + simd_extract!(vcgt_f32(vdup_n_f32(a), vdup_n_f32(b)), 0) } /// Floating-point compare greater than @@ -1019,7 +1019,7 @@ pub unsafe fn vcgts_f32(a: f32, b: f32) -> u32 { #[cfg_attr(test, assert_instr(fcmp))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vcgtd_f64(a: f64, b: f64) -> u64 { - simd_extract(vcgt_f64(vdup_n_f64(a), vdup_n_f64(b)), 0) + simd_extract!(vcgt_f64(vdup_n_f64(a), vdup_n_f64(b)), 0) } /// Compare signed less than @@ -1118,7 +1118,7 @@ pub unsafe fn vcltd_u64(a: u64, b: u64) -> u64 { #[cfg_attr(test, assert_instr(fcmp))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vclts_f32(a: f32, b: f32) -> u32 { - simd_extract(vclt_f32(vdup_n_f32(a), vdup_n_f32(b)), 0) + simd_extract!(vclt_f32(vdup_n_f32(a), vdup_n_f32(b)), 0) } /// Floating-point compare less than @@ -1129,7 +1129,7 @@ pub unsafe fn vclts_f32(a: f32, b: f32) -> u32 { #[cfg_attr(test, assert_instr(fcmp))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vcltd_f64(a: f64, b: f64) -> u64 { - simd_extract(vclt_f64(vdup_n_f64(a), vdup_n_f64(b)), 0) + simd_extract!(vclt_f64(vdup_n_f64(a), vdup_n_f64(b)), 0) } /// Compare signed less than or equal @@ -1184,7 +1184,7 @@ pub unsafe fn vcged_u64(a: u64, b: u64) -> u64 { #[cfg_attr(test, assert_instr(fcmp))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vcges_f32(a: f32, b: f32) -> u32 { - simd_extract(vcge_f32(vdup_n_f32(a), vdup_n_f32(b)), 0) + simd_extract!(vcge_f32(vdup_n_f32(a), vdup_n_f32(b)), 0) } /// Floating-point compare greater than or equal @@ -1195,7 +1195,7 @@ pub unsafe fn vcges_f32(a: f32, b: f32) -> u32 { #[cfg_attr(test, assert_instr(fcmp))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vcged_f64(a: f64, b: f64) -> u64 { - simd_extract(vcge_f64(vdup_n_f64(a), vdup_n_f64(b)), 0) + simd_extract!(vcge_f64(vdup_n_f64(a), vdup_n_f64(b)), 0) } /// Compare unsigned less than or equal @@ -1272,7 +1272,7 @@ pub unsafe fn vcled_u64(a: u64, b: u64) -> u64 { #[cfg_attr(test, assert_instr(fcmp))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vcles_f32(a: f32, b: f32) -> u32 { - simd_extract(vcle_f32(vdup_n_f32(a), vdup_n_f32(b)), 0) + simd_extract!(vcle_f32(vdup_n_f32(a), vdup_n_f32(b)), 0) } /// Floating-point compare less than or equal @@ -1283,7 +1283,7 @@ pub unsafe fn vcles_f32(a: f32, b: f32) -> u32 { #[cfg_attr(test, assert_instr(fcmp))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vcled_f64(a: f64, b: f64) -> u64 
{ - simd_extract(vcle_f64(vdup_n_f64(a), vdup_n_f64(b)), 0) + simd_extract!(vcle_f64(vdup_n_f64(a), vdup_n_f64(b)), 0) } /// Compare signed greater than or equal @@ -1515,7 +1515,7 @@ pub unsafe fn vcgezd_s64(a: i64) -> u64 { #[cfg_attr(test, assert_instr(fcmp))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vcgezs_f32(a: f32) -> u32 { - simd_extract(vcgez_f32(vdup_n_f32(a)), 0) + simd_extract!(vcgez_f32(vdup_n_f32(a)), 0) } /// Floating-point compare greater than or equal to zero @@ -1526,7 +1526,7 @@ pub unsafe fn vcgezs_f32(a: f32) -> u32 { #[cfg_attr(test, assert_instr(fcmp))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vcgezd_f64(a: f64) -> u64 { - simd_extract(vcgez_f64(vdup_n_f64(a)), 0) + simd_extract!(vcgez_f64(vdup_n_f64(a)), 0) } /// Compare signed greater than zero @@ -1692,7 +1692,7 @@ pub unsafe fn vcgtzd_s64(a: i64) -> u64 { #[cfg_attr(test, assert_instr(fcmp))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vcgtzs_f32(a: f32) -> u32 { - simd_extract(vcgtz_f32(vdup_n_f32(a)), 0) + simd_extract!(vcgtz_f32(vdup_n_f32(a)), 0) } /// Floating-point compare greater than zero @@ -1703,7 +1703,7 @@ pub unsafe fn vcgtzs_f32(a: f32) -> u32 { #[cfg_attr(test, assert_instr(fcmp))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vcgtzd_f64(a: f64) -> u64 { - simd_extract(vcgtz_f64(vdup_n_f64(a)), 0) + simd_extract!(vcgtz_f64(vdup_n_f64(a)), 0) } /// Compare signed less than or equal to zero @@ -1869,7 +1869,7 @@ pub unsafe fn vclezd_s64(a: i64) -> u64 { #[cfg_attr(test, assert_instr(fcmp))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vclezs_f32(a: f32) -> u32 { - simd_extract(vclez_f32(vdup_n_f32(a)), 0) + simd_extract!(vclez_f32(vdup_n_f32(a)), 0) } /// Floating-point compare less than or equal to zero @@ -1880,7 +1880,7 @@ pub unsafe fn vclezs_f32(a: f32) -> u32 { #[cfg_attr(test, assert_instr(fcmp))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vclezd_f64(a: f64) -> u64 { - simd_extract(vclez_f64(vdup_n_f64(a)), 0) + simd_extract!(vclez_f64(vdup_n_f64(a)), 0) } /// Compare signed less than zero @@ -2046,7 +2046,7 @@ pub unsafe fn vcltzd_s64(a: i64) -> u64 { #[cfg_attr(test, assert_instr(fcmp))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vcltzs_f32(a: f32) -> u32 { - simd_extract(vcltz_f32(vdup_n_f32(a)), 0) + simd_extract!(vcltz_f32(vdup_n_f32(a)), 0) } /// Floating-point compare less than zero @@ -2057,7 +2057,7 @@ pub unsafe fn vcltzs_f32(a: f32) -> u32 { #[cfg_attr(test, assert_instr(fcmp))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vcltzd_f64(a: f64) -> u64 { - simd_extract(vcltz_f64(vdup_n_f64(a)), 0) + simd_extract!(vcltz_f64(vdup_n_f64(a)), 0) } /// Floating-point absolute compare greater than @@ -3394,7 +3394,7 @@ pub unsafe fn vcvtx_f32_f64(a: float64x2_t) -> float32x2_t { #[cfg_attr(test, assert_instr(fcvtxn))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vcvtxd_f32_f64(a: f64) -> f32 { - simd_extract(vcvtx_f32_f64(vdupq_n_f64(a)), 0) + simd_extract!(vcvtx_f32_f64(vdupq_n_f64(a)), 0) } /// Floating-point convert to lower precision narrow, rounding to odd @@ -4704,7 +4704,7 @@ pub unsafe fn vdup_lane_f64(a: float64x1_t) -> float64x1_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vdup_laneq_p64(a: poly64x2_t) -> poly64x1_t { static_assert_uimm_bits!(N, 1); - transmute::(simd_extract(a, N as u32)) + 
transmute::(simd_extract!(a, N as u32)) } /// Set all vector lanes to the same value @@ -4717,7 +4717,7 @@ pub unsafe fn vdup_laneq_p64(a: poly64x2_t) -> poly64x1_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vdup_laneq_f64(a: float64x2_t) -> float64x1_t { static_assert_uimm_bits!(N, 1); - transmute::(simd_extract(a, N as u32)) + transmute::(simd_extract!(a, N as u32)) } /// Set all vector lanes to the same value @@ -4730,7 +4730,7 @@ pub unsafe fn vdup_laneq_f64(a: float64x2_t) -> float64x1_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vdupb_lane_s8(a: int8x8_t) -> i8 { static_assert_uimm_bits!(N, 3); - simd_extract(a, N as u32) + simd_extract!(a, N as u32) } /// Set all vector lanes to the same value @@ -4743,7 +4743,7 @@ pub unsafe fn vdupb_lane_s8(a: int8x8_t) -> i8 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vdupb_laneq_s8(a: int8x16_t) -> i8 { static_assert_uimm_bits!(N, 4); - simd_extract(a, N as u32) + simd_extract!(a, N as u32) } /// Set all vector lanes to the same value @@ -4756,7 +4756,7 @@ pub unsafe fn vdupb_laneq_s8(a: int8x16_t) -> i8 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vduph_lane_s16(a: int16x4_t) -> i16 { static_assert_uimm_bits!(N, 2); - simd_extract(a, N as u32) + simd_extract!(a, N as u32) } /// Set all vector lanes to the same value @@ -4769,7 +4769,7 @@ pub unsafe fn vduph_lane_s16(a: int16x4_t) -> i16 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vduph_laneq_s16(a: int16x8_t) -> i16 { static_assert_uimm_bits!(N, 3); - simd_extract(a, N as u32) + simd_extract!(a, N as u32) } /// Set all vector lanes to the same value @@ -4782,7 +4782,7 @@ pub unsafe fn vduph_laneq_s16(a: int16x8_t) -> i16 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vdups_lane_s32(a: int32x2_t) -> i32 { static_assert_uimm_bits!(N, 1); - simd_extract(a, N as u32) + simd_extract!(a, N as u32) } /// Set all vector lanes to the same value @@ -4795,7 +4795,7 @@ pub unsafe fn vdups_lane_s32(a: int32x2_t) -> i32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vdups_laneq_s32(a: int32x4_t) -> i32 { static_assert_uimm_bits!(N, 2); - simd_extract(a, N as u32) + simd_extract!(a, N as u32) } /// Set all vector lanes to the same value @@ -4808,7 +4808,7 @@ pub unsafe fn vdups_laneq_s32(a: int32x4_t) -> i32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vdupd_lane_s64(a: int64x1_t) -> i64 { static_assert!(N == 0); - simd_extract(a, N as u32) + simd_extract!(a, N as u32) } /// Set all vector lanes to the same value @@ -4821,7 +4821,7 @@ pub unsafe fn vdupd_lane_s64(a: int64x1_t) -> i64 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vdupd_laneq_s64(a: int64x2_t) -> i64 { static_assert_uimm_bits!(N, 1); - simd_extract(a, N as u32) + simd_extract!(a, N as u32) } /// Set all vector lanes to the same value @@ -4834,7 +4834,7 @@ pub unsafe fn vdupd_laneq_s64(a: int64x2_t) -> i64 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vdupb_lane_u8(a: uint8x8_t) -> u8 { static_assert_uimm_bits!(N, 3); - simd_extract(a, N as u32) + simd_extract!(a, N as u32) } /// Set all vector lanes to the same value @@ -4847,7 +4847,7 @@ pub unsafe fn vdupb_lane_u8(a: uint8x8_t) -> u8 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vdupb_laneq_u8(a: uint8x16_t) -> u8 { static_assert_uimm_bits!(N, 4); - simd_extract(a, N as u32) + 
simd_extract!(a, N as u32) } /// Set all vector lanes to the same value @@ -4860,7 +4860,7 @@ pub unsafe fn vdupb_laneq_u8(a: uint8x16_t) -> u8 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vduph_lane_u16(a: uint16x4_t) -> u16 { static_assert_uimm_bits!(N, 2); - simd_extract(a, N as u32) + simd_extract!(a, N as u32) } /// Set all vector lanes to the same value @@ -4873,7 +4873,7 @@ pub unsafe fn vduph_lane_u16(a: uint16x4_t) -> u16 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vduph_laneq_u16(a: uint16x8_t) -> u16 { static_assert_uimm_bits!(N, 3); - simd_extract(a, N as u32) + simd_extract!(a, N as u32) } /// Set all vector lanes to the same value @@ -4886,7 +4886,7 @@ pub unsafe fn vduph_laneq_u16(a: uint16x8_t) -> u16 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vdups_lane_u32(a: uint32x2_t) -> u32 { static_assert_uimm_bits!(N, 1); - simd_extract(a, N as u32) + simd_extract!(a, N as u32) } /// Set all vector lanes to the same value @@ -4899,7 +4899,7 @@ pub unsafe fn vdups_lane_u32(a: uint32x2_t) -> u32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vdups_laneq_u32(a: uint32x4_t) -> u32 { static_assert_uimm_bits!(N, 2); - simd_extract(a, N as u32) + simd_extract!(a, N as u32) } /// Set all vector lanes to the same value @@ -4912,7 +4912,7 @@ pub unsafe fn vdups_laneq_u32(a: uint32x4_t) -> u32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vdupd_lane_u64(a: uint64x1_t) -> u64 { static_assert!(N == 0); - simd_extract(a, N as u32) + simd_extract!(a, N as u32) } /// Set all vector lanes to the same value @@ -4925,7 +4925,7 @@ pub unsafe fn vdupd_lane_u64(a: uint64x1_t) -> u64 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vdupd_laneq_u64(a: uint64x2_t) -> u64 { static_assert_uimm_bits!(N, 1); - simd_extract(a, N as u32) + simd_extract!(a, N as u32) } /// Set all vector lanes to the same value @@ -4938,7 +4938,7 @@ pub unsafe fn vdupd_laneq_u64(a: uint64x2_t) -> u64 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vdupb_lane_p8(a: poly8x8_t) -> p8 { static_assert_uimm_bits!(N, 3); - simd_extract(a, N as u32) + simd_extract!(a, N as u32) } /// Set all vector lanes to the same value @@ -4951,7 +4951,7 @@ pub unsafe fn vdupb_lane_p8(a: poly8x8_t) -> p8 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vdupb_laneq_p8(a: poly8x16_t) -> p8 { static_assert_uimm_bits!(N, 4); - simd_extract(a, N as u32) + simd_extract!(a, N as u32) } /// Set all vector lanes to the same value @@ -4964,7 +4964,7 @@ pub unsafe fn vdupb_laneq_p8(a: poly8x16_t) -> p8 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vduph_lane_p16(a: poly16x4_t) -> p16 { static_assert_uimm_bits!(N, 2); - simd_extract(a, N as u32) + simd_extract!(a, N as u32) } /// Set all vector lanes to the same value @@ -4977,7 +4977,7 @@ pub unsafe fn vduph_lane_p16(a: poly16x4_t) -> p16 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vduph_laneq_p16(a: poly16x8_t) -> p16 { static_assert_uimm_bits!(N, 3); - simd_extract(a, N as u32) + simd_extract!(a, N as u32) } /// Set all vector lanes to the same value @@ -4990,7 +4990,7 @@ pub unsafe fn vduph_laneq_p16(a: poly16x8_t) -> p16 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vdups_lane_f32(a: float32x2_t) -> f32 { static_assert_uimm_bits!(N, 1); - simd_extract(a, N as u32) + simd_extract!(a, N as u32) } /// Set all vector lanes 
to the same value @@ -5003,7 +5003,7 @@ pub unsafe fn vdups_lane_f32(a: float32x2_t) -> f32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vdups_laneq_f32(a: float32x4_t) -> f32 { static_assert_uimm_bits!(N, 2); - simd_extract(a, N as u32) + simd_extract!(a, N as u32) } /// Set all vector lanes to the same value @@ -5016,7 +5016,7 @@ pub unsafe fn vdups_laneq_f32(a: float32x4_t) -> f32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vdupd_lane_f64(a: float64x1_t) -> f64 { static_assert!(N == 0); - simd_extract(a, N as u32) + simd_extract!(a, N as u32) } /// Set all vector lanes to the same value @@ -5029,7 +5029,7 @@ pub unsafe fn vdupd_lane_f64(a: float64x1_t) -> f64 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vdupd_laneq_f64(a: float64x2_t) -> f64 { static_assert_uimm_bits!(N, 1); - simd_extract(a, N as u32) + simd_extract!(a, N as u32) } /// Extract vector from pair of vectors @@ -5729,7 +5729,7 @@ pub unsafe fn vqnegq_s64(a: int64x2_t) -> int64x2_t { #[cfg_attr(test, assert_instr(sqneg))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqnegb_s8(a: i8) -> i8 { - simd_extract(vqneg_s8(vdup_n_s8(a)), 0) + simd_extract!(vqneg_s8(vdup_n_s8(a)), 0) } /// Signed saturating negate @@ -5740,7 +5740,7 @@ pub unsafe fn vqnegb_s8(a: i8) -> i8 { #[cfg_attr(test, assert_instr(sqneg))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqnegh_s16(a: i16) -> i16 { - simd_extract(vqneg_s16(vdup_n_s16(a)), 0) + simd_extract!(vqneg_s16(vdup_n_s16(a)), 0) } /// Signed saturating negate @@ -5751,7 +5751,7 @@ pub unsafe fn vqnegh_s16(a: i16) -> i16 { #[cfg_attr(test, assert_instr(sqneg))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqnegs_s32(a: i32) -> i32 { - simd_extract(vqneg_s32(vdup_n_s32(a)), 0) + simd_extract!(vqneg_s32(vdup_n_s32(a)), 0) } /// Signed saturating negate @@ -5762,7 +5762,7 @@ pub unsafe fn vqnegs_s32(a: i32) -> i32 { #[cfg_attr(test, assert_instr(sqneg))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqnegd_s64(a: i64) -> i64 { - simd_extract(vqneg_s64(vdup_n_s64(a)), 0) + simd_extract!(vqneg_s64(vdup_n_s64(a)), 0) } /// Saturating subtract @@ -5775,7 +5775,7 @@ pub unsafe fn vqnegd_s64(a: i64) -> i64 { pub unsafe fn vqsubb_s8(a: i8, b: i8) -> i8 { let a: int8x8_t = vdup_n_s8(a); let b: int8x8_t = vdup_n_s8(b); - simd_extract(vqsub_s8(a, b), 0) + simd_extract!(vqsub_s8(a, b), 0) } /// Saturating subtract @@ -5788,7 +5788,7 @@ pub unsafe fn vqsubb_s8(a: i8, b: i8) -> i8 { pub unsafe fn vqsubh_s16(a: i16, b: i16) -> i16 { let a: int16x4_t = vdup_n_s16(a); let b: int16x4_t = vdup_n_s16(b); - simd_extract(vqsub_s16(a, b), 0) + simd_extract!(vqsub_s16(a, b), 0) } /// Saturating subtract @@ -5801,7 +5801,7 @@ pub unsafe fn vqsubh_s16(a: i16, b: i16) -> i16 { pub unsafe fn vqsubb_u8(a: u8, b: u8) -> u8 { let a: uint8x8_t = vdup_n_u8(a); let b: uint8x8_t = vdup_n_u8(b); - simd_extract(vqsub_u8(a, b), 0) + simd_extract!(vqsub_u8(a, b), 0) } /// Saturating subtract @@ -5814,7 +5814,7 @@ pub unsafe fn vqsubb_u8(a: u8, b: u8) -> u8 { pub unsafe fn vqsubh_u16(a: u16, b: u16) -> u16 { let a: uint16x4_t = vdup_n_u16(a); let b: uint16x4_t = vdup_n_u16(b); - simd_extract(vqsub_u16(a, b), 0) + simd_extract!(vqsub_u16(a, b), 0) } /// Saturating subtract @@ -6399,7 +6399,7 @@ pub unsafe fn vrndiq_f64(a: float64x2_t) -> float64x2_t { pub unsafe fn vqaddb_s8(a: i8, b: i8) -> i8 { let a: int8x8_t = vdup_n_s8(a); let b: int8x8_t = 
vdup_n_s8(b); - simd_extract(vqadd_s8(a, b), 0) + simd_extract!(vqadd_s8(a, b), 0) } /// Saturating add @@ -6412,7 +6412,7 @@ pub unsafe fn vqaddb_s8(a: i8, b: i8) -> i8 { pub unsafe fn vqaddh_s16(a: i16, b: i16) -> i16 { let a: int16x4_t = vdup_n_s16(a); let b: int16x4_t = vdup_n_s16(b); - simd_extract(vqadd_s16(a, b), 0) + simd_extract!(vqadd_s16(a, b), 0) } /// Saturating add @@ -6425,7 +6425,7 @@ pub unsafe fn vqaddh_s16(a: i16, b: i16) -> i16 { pub unsafe fn vqaddb_u8(a: u8, b: u8) -> u8 { let a: uint8x8_t = vdup_n_u8(a); let b: uint8x8_t = vdup_n_u8(b); - simd_extract(vqadd_u8(a, b), 0) + simd_extract!(vqadd_u8(a, b), 0) } /// Saturating add @@ -6438,7 +6438,7 @@ pub unsafe fn vqaddb_u8(a: u8, b: u8) -> u8 { pub unsafe fn vqaddh_u16(a: u16, b: u16) -> u16 { let a: uint16x4_t = vdup_n_u16(a); let b: uint16x4_t = vdup_n_u16(b); - simd_extract(vqadd_u16(a, b), 0) + simd_extract!(vqadd_u16(a, b), 0) } /// Saturating add @@ -7535,7 +7535,7 @@ pub unsafe fn vld4q_lane_f64(a: *const f64, b: float64x2x4_t) - #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vst1_lane_f64(a: *mut f64, b: float64x1_t) { static_assert!(LANE == 0); - *a = simd_extract(b, LANE as u32); + *a = simd_extract!(b, LANE as u32); } /// Store multiple single-element structures from one, two, three, or four registers @@ -7548,7 +7548,7 @@ pub unsafe fn vst1_lane_f64(a: *mut f64, b: float64x1_t) { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vst1q_lane_f64(a: *mut f64, b: float64x2_t) { static_assert_uimm_bits!(LANE, 1); - *a = simd_extract(b, LANE as u32); + *a = simd_extract!(b, LANE as u32); } /// Store multiple single-element structures to one, two, three, or four registers @@ -8475,7 +8475,7 @@ pub unsafe fn vmulq_n_f64(a: float64x2_t, b: f64) -> float64x2_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vmul_lane_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t { static_assert!(LANE == 0); - simd_mul(a, transmute::(simd_extract(b, LANE as u32))) + simd_mul(a, transmute::(simd_extract!(b, LANE as u32))) } /// Floating-point multiply @@ -8488,7 +8488,7 @@ pub unsafe fn vmul_lane_f64(a: float64x1_t, b: float64x1_t) -> #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vmul_laneq_f64(a: float64x1_t, b: float64x2_t) -> float64x1_t { static_assert_uimm_bits!(LANE, 1); - simd_mul(a, transmute::(simd_extract(b, LANE as u32))) + simd_mul(a, transmute::(simd_extract!(b, LANE as u32))) } /// Floating-point multiply @@ -8527,7 +8527,7 @@ pub unsafe fn vmulq_laneq_f64(a: float64x2_t, b: float64x2_t) - #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vmuls_lane_f32(a: f32, b: float32x2_t) -> f32 { static_assert_uimm_bits!(LANE, 1); - let b: f32 = simd_extract(b, LANE as u32); + let b: f32 = simd_extract!(b, LANE as u32); a * b } @@ -8541,7 +8541,7 @@ pub unsafe fn vmuls_lane_f32(a: f32, b: float32x2_t) -> f32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vmuls_laneq_f32(a: f32, b: float32x4_t) -> f32 { static_assert_uimm_bits!(LANE, 2); - let b: f32 = simd_extract(b, LANE as u32); + let b: f32 = simd_extract!(b, LANE as u32); a * b } @@ -8555,7 +8555,7 @@ pub unsafe fn vmuls_laneq_f32(a: f32, b: float32x4_t) -> f32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vmuld_lane_f64(a: f64, b: float64x1_t) -> f64 { static_assert!(LANE == 0); - let b: f64 = simd_extract(b, LANE as u32); + let b: f64 = simd_extract!(b, LANE as u32); a * b } @@ -8569,7 +8569,7 @@ pub unsafe 
fn vmuld_lane_f64(a: f64, b: float64x1_t) -> f64 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vmuld_laneq_f64(a: f64, b: float64x2_t) -> f64 { static_assert_uimm_bits!(LANE, 1); - let b: f64 = simd_extract(b, LANE as u32); + let b: f64 = simd_extract!(b, LANE as u32); a * b } @@ -8688,7 +8688,7 @@ pub unsafe fn vmull_high_p8(a: poly8x16_t, b: poly8x16_t) -> poly16x8_t { #[cfg_attr(test, assert_instr(pmull))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vmull_high_p64(a: poly64x2_t, b: poly64x2_t) -> p128 { - vmull_p64(simd_extract(a, 1), simd_extract(b, 1)) + vmull_p64(simd_extract!(a, 1), simd_extract!(b, 1)) } /// Multiply long @@ -8913,7 +8913,7 @@ pub unsafe fn vmulxq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vmulx_lane_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t { static_assert!(LANE == 0); - vmulx_f64(a, transmute::(simd_extract(b, LANE as u32))) + vmulx_f64(a, transmute::(simd_extract!(b, LANE as u32))) } /// Floating-point multiply extended @@ -8926,7 +8926,7 @@ pub unsafe fn vmulx_lane_f64(a: float64x1_t, b: float64x1_t) -> #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vmulx_laneq_f64(a: float64x1_t, b: float64x2_t) -> float64x1_t { static_assert_uimm_bits!(LANE, 1); - vmulx_f64(a, transmute::(simd_extract(b, LANE as u32))) + vmulx_f64(a, transmute::(simd_extract!(b, LANE as u32))) } /// Floating-point multiply extended @@ -9049,7 +9049,7 @@ pub unsafe fn vmulxd_f64(a: f64, b: f64) -> f64 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vmulxs_lane_f32(a: f32, b: float32x2_t) -> f32 { static_assert_uimm_bits!(LANE, 1); - vmulxs_f32(a, simd_extract(b, LANE as u32)) + vmulxs_f32(a, simd_extract!(b, LANE as u32)) } /// Floating-point multiply extended @@ -9062,7 +9062,7 @@ pub unsafe fn vmulxs_lane_f32(a: f32, b: float32x2_t) -> f32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vmulxs_laneq_f32(a: f32, b: float32x4_t) -> f32 { static_assert_uimm_bits!(LANE, 2); - vmulxs_f32(a, simd_extract(b, LANE as u32)) + vmulxs_f32(a, simd_extract!(b, LANE as u32)) } /// Floating-point multiply extended @@ -9075,7 +9075,7 @@ pub unsafe fn vmulxs_laneq_f32(a: f32, b: float32x4_t) -> f32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vmulxd_lane_f64(a: f64, b: float64x1_t) -> f64 { static_assert!(LANE == 0); - vmulxd_f64(a, simd_extract(b, LANE as u32)) + vmulxd_f64(a, simd_extract!(b, LANE as u32)) } /// Floating-point multiply extended @@ -9088,7 +9088,7 @@ pub unsafe fn vmulxd_lane_f64(a: f64, b: float64x1_t) -> f64 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vmulxd_laneq_f64(a: f64, b: float64x2_t) -> f64 { static_assert_uimm_bits!(LANE, 1); - vmulxd_f64(a, simd_extract(b, LANE as u32)) + vmulxd_f64(a, simd_extract!(b, LANE as u32)) } /// Floating-point fused Multiply-Add to accumulator(vector) @@ -9155,7 +9155,7 @@ pub unsafe fn vfmaq_n_f64(a: float64x2_t, b: float64x2_t, c: f64) -> float64x2_t #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vfma_lane_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t { static_assert_uimm_bits!(LANE, 1); - vfma_f32(a, b, vdup_n_f32(simd_extract(c, LANE as u32))) + vfma_f32(a, b, vdup_n_f32(simd_extract!(c, LANE as u32))) } /// Floating-point fused multiply-add to accumulator @@ -9168,7 +9168,7 @@ pub unsafe fn vfma_lane_f32(a: float32x2_t, b: 
float32x2_t, c: #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vfma_laneq_f32(a: float32x2_t, b: float32x2_t, c: float32x4_t) -> float32x2_t { static_assert_uimm_bits!(LANE, 2); - vfma_f32(a, b, vdup_n_f32(simd_extract(c, LANE as u32))) + vfma_f32(a, b, vdup_n_f32(simd_extract!(c, LANE as u32))) } /// Floating-point fused multiply-add to accumulator @@ -9181,7 +9181,7 @@ pub unsafe fn vfma_laneq_f32(a: float32x2_t, b: float32x2_t, c: #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vfmaq_lane_f32(a: float32x4_t, b: float32x4_t, c: float32x2_t) -> float32x4_t { static_assert_uimm_bits!(LANE, 1); - vfmaq_f32(a, b, vdupq_n_f32(simd_extract(c, LANE as u32))) + vfmaq_f32(a, b, vdupq_n_f32(simd_extract!(c, LANE as u32))) } /// Floating-point fused multiply-add to accumulator @@ -9194,7 +9194,7 @@ pub unsafe fn vfmaq_lane_f32(a: float32x4_t, b: float32x4_t, c: #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vfmaq_laneq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t { static_assert_uimm_bits!(LANE, 2); - vfmaq_f32(a, b, vdupq_n_f32(simd_extract(c, LANE as u32))) + vfmaq_f32(a, b, vdupq_n_f32(simd_extract!(c, LANE as u32))) } /// Floating-point fused multiply-add to accumulator @@ -9207,7 +9207,7 @@ pub unsafe fn vfmaq_laneq_f32(a: float32x4_t, b: float32x4_t, c #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vfma_lane_f64(a: float64x1_t, b: float64x1_t, c: float64x1_t) -> float64x1_t { static_assert!(LANE == 0); - vfma_f64(a, b, vdup_n_f64(simd_extract(c, LANE as u32))) + vfma_f64(a, b, vdup_n_f64(simd_extract!(c, LANE as u32))) } /// Floating-point fused multiply-add to accumulator @@ -9220,7 +9220,7 @@ pub unsafe fn vfma_lane_f64(a: float64x1_t, b: float64x1_t, c: #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vfma_laneq_f64(a: float64x1_t, b: float64x1_t, c: float64x2_t) -> float64x1_t { static_assert_uimm_bits!(LANE, 1); - vfma_f64(a, b, vdup_n_f64(simd_extract(c, LANE as u32))) + vfma_f64(a, b, vdup_n_f64(simd_extract!(c, LANE as u32))) } /// Floating-point fused multiply-add to accumulator @@ -9233,7 +9233,7 @@ pub unsafe fn vfma_laneq_f64(a: float64x1_t, b: float64x1_t, c: #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vfmaq_lane_f64(a: float64x2_t, b: float64x2_t, c: float64x1_t) -> float64x2_t { static_assert!(LANE == 0); - vfmaq_f64(a, b, vdupq_n_f64(simd_extract(c, LANE as u32))) + vfmaq_f64(a, b, vdupq_n_f64(simd_extract!(c, LANE as u32))) } /// Floating-point fused multiply-add to accumulator @@ -9246,7 +9246,7 @@ pub unsafe fn vfmaq_lane_f64(a: float64x2_t, b: float64x2_t, c: #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vfmaq_laneq_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t { static_assert_uimm_bits!(LANE, 1); - vfmaq_f64(a, b, vdupq_n_f64(simd_extract(c, LANE as u32))) + vfmaq_f64(a, b, vdupq_n_f64(simd_extract!(c, LANE as u32))) } /// Floating-point fused multiply-add to accumulator @@ -9264,7 +9264,7 @@ pub unsafe fn vfmas_lane_f32(a: f32, b: f32, c: float32x2_t) -> fn vfmas_lane_f32_(a: f32, b: f32, c: f32) -> f32; } static_assert_uimm_bits!(LANE, 1); - let c: f32 = simd_extract(c, LANE as u32); + let c: f32 = simd_extract!(c, LANE as u32); vfmas_lane_f32_(b, c, a) } @@ -9283,7 +9283,7 @@ pub unsafe fn vfmas_laneq_f32(a: f32, b: f32, c: float32x4_t) - fn vfmas_laneq_f32_(a: f32, b: f32, c: f32) -> f32; } static_assert_uimm_bits!(LANE, 2); - let c: f32 = simd_extract(c, LANE 
as u32); + let c: f32 = simd_extract!(c, LANE as u32); vfmas_laneq_f32_(b, c, a) } @@ -9302,7 +9302,7 @@ pub unsafe fn vfmad_lane_f64(a: f64, b: f64, c: float64x1_t) -> fn vfmad_lane_f64_(a: f64, b: f64, c: f64) -> f64; } static_assert!(LANE == 0); - let c: f64 = simd_extract(c, LANE as u32); + let c: f64 = simd_extract!(c, LANE as u32); vfmad_lane_f64_(b, c, a) } @@ -9321,7 +9321,7 @@ pub unsafe fn vfmad_laneq_f64(a: f64, b: f64, c: float64x2_t) - fn vfmad_laneq_f64_(a: f64, b: f64, c: f64) -> f64; } static_assert_uimm_bits!(LANE, 1); - let c: f64 = simd_extract(c, LANE as u32); + let c: f64 = simd_extract!(c, LANE as u32); vfmad_laneq_f64_(b, c, a) } @@ -9381,7 +9381,7 @@ pub unsafe fn vfmsq_n_f64(a: float64x2_t, b: float64x2_t, c: f64) -> float64x2_t #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vfms_lane_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t { static_assert_uimm_bits!(LANE, 1); - vfms_f32(a, b, vdup_n_f32(simd_extract(c, LANE as u32))) + vfms_f32(a, b, vdup_n_f32(simd_extract!(c, LANE as u32))) } /// Floating-point fused multiply-subtract to accumulator @@ -9394,7 +9394,7 @@ pub unsafe fn vfms_lane_f32(a: float32x2_t, b: float32x2_t, c: #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vfms_laneq_f32(a: float32x2_t, b: float32x2_t, c: float32x4_t) -> float32x2_t { static_assert_uimm_bits!(LANE, 2); - vfms_f32(a, b, vdup_n_f32(simd_extract(c, LANE as u32))) + vfms_f32(a, b, vdup_n_f32(simd_extract!(c, LANE as u32))) } /// Floating-point fused multiply-subtract to accumulator @@ -9407,7 +9407,7 @@ pub unsafe fn vfms_laneq_f32(a: float32x2_t, b: float32x2_t, c: #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vfmsq_lane_f32(a: float32x4_t, b: float32x4_t, c: float32x2_t) -> float32x4_t { static_assert_uimm_bits!(LANE, 1); - vfmsq_f32(a, b, vdupq_n_f32(simd_extract(c, LANE as u32))) + vfmsq_f32(a, b, vdupq_n_f32(simd_extract!(c, LANE as u32))) } /// Floating-point fused multiply-subtract to accumulator @@ -9420,7 +9420,7 @@ pub unsafe fn vfmsq_lane_f32(a: float32x4_t, b: float32x4_t, c: #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vfmsq_laneq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t { static_assert_uimm_bits!(LANE, 2); - vfmsq_f32(a, b, vdupq_n_f32(simd_extract(c, LANE as u32))) + vfmsq_f32(a, b, vdupq_n_f32(simd_extract!(c, LANE as u32))) } /// Floating-point fused multiply-subtract to accumulator @@ -9433,7 +9433,7 @@ pub unsafe fn vfmsq_laneq_f32(a: float32x4_t, b: float32x4_t, c #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vfms_lane_f64(a: float64x1_t, b: float64x1_t, c: float64x1_t) -> float64x1_t { static_assert!(LANE == 0); - vfms_f64(a, b, vdup_n_f64(simd_extract(c, LANE as u32))) + vfms_f64(a, b, vdup_n_f64(simd_extract!(c, LANE as u32))) } /// Floating-point fused multiply-subtract to accumulator @@ -9446,7 +9446,7 @@ pub unsafe fn vfms_lane_f64(a: float64x1_t, b: float64x1_t, c: #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vfms_laneq_f64(a: float64x1_t, b: float64x1_t, c: float64x2_t) -> float64x1_t { static_assert_uimm_bits!(LANE, 1); - vfms_f64(a, b, vdup_n_f64(simd_extract(c, LANE as u32))) + vfms_f64(a, b, vdup_n_f64(simd_extract!(c, LANE as u32))) } /// Floating-point fused multiply-subtract to accumulator @@ -9459,7 +9459,7 @@ pub unsafe fn vfms_laneq_f64(a: float64x1_t, b: float64x1_t, c: #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn 
vfmsq_lane_f64(a: float64x2_t, b: float64x2_t, c: float64x1_t) -> float64x2_t { static_assert!(LANE == 0); - vfmsq_f64(a, b, vdupq_n_f64(simd_extract(c, LANE as u32))) + vfmsq_f64(a, b, vdupq_n_f64(simd_extract!(c, LANE as u32))) } /// Floating-point fused multiply-subtract to accumulator @@ -9472,7 +9472,7 @@ pub unsafe fn vfmsq_lane_f64(a: float64x2_t, b: float64x2_t, c: #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vfmsq_laneq_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t { static_assert_uimm_bits!(LANE, 1); - vfmsq_f64(a, b, vdupq_n_f64(simd_extract(c, LANE as u32))) + vfmsq_f64(a, b, vdupq_n_f64(simd_extract!(c, LANE as u32))) } /// Floating-point fused multiply-subtract to accumulator @@ -11119,8 +11119,8 @@ pub unsafe fn vpaddq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t { #[cfg_attr(test, assert_instr(nop))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vpadds_f32(a: float32x2_t) -> f32 { - let a1: f32 = simd_extract(a, 0); - let a2: f32 = simd_extract(a, 1); + let a1: f32 = simd_extract!(a, 0); + let a2: f32 = simd_extract!(a, 1); a1 + a2 } @@ -11132,8 +11132,8 @@ pub unsafe fn vpadds_f32(a: float32x2_t) -> f32 { #[cfg_attr(test, assert_instr(nop))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vpaddd_f64(a: float64x2_t) -> f64 { - let a1: f64 = simd_extract(a, 0); - let a2: f64 = simd_extract(a, 1); + let a1: f64 = simd_extract!(a, 0); + let a2: f64 = simd_extract!(a, 1); a1 + a2 } @@ -11259,7 +11259,7 @@ pub unsafe fn vpminqd_f64(a: float64x2_t) -> f64 { pub unsafe fn vqdmullh_s16(a: i16, b: i16) -> i32 { let a: int16x4_t = vdup_n_s16(a); let b: int16x4_t = vdup_n_s16(b); - simd_extract(vqdmull_s16(a, b), 0) + simd_extract!(vqdmull_s16(a, b), 0) } /// Signed saturating doubling multiply long @@ -11368,7 +11368,7 @@ pub unsafe fn vqdmull_laneq_s32(a: int32x2_t, b: int32x4_t) -> int #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqdmullh_lane_s16(a: i16, b: int16x4_t) -> i32 { static_assert_uimm_bits!(N, 2); - let b: i16 = simd_extract(b, N as u32); + let b: i16 = simd_extract!(b, N as u32); vqdmullh_s16(a, b) } @@ -11382,7 +11382,7 @@ pub unsafe fn vqdmullh_lane_s16(a: i16, b: int16x4_t) -> i32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqdmullh_laneq_s16(a: i16, b: int16x8_t) -> i32 { static_assert_uimm_bits!(N, 3); - let b: i16 = simd_extract(b, N as u32); + let b: i16 = simd_extract!(b, N as u32); vqdmullh_s16(a, b) } @@ -11396,7 +11396,7 @@ pub unsafe fn vqdmullh_laneq_s16(a: i16, b: int16x8_t) -> i32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqdmulls_lane_s32(a: i32, b: int32x2_t) -> i64 { static_assert_uimm_bits!(N, 1); - let b: i32 = simd_extract(b, N as u32); + let b: i32 = simd_extract!(b, N as u32); vqdmulls_s32(a, b) } @@ -11410,7 +11410,7 @@ pub unsafe fn vqdmulls_lane_s32(a: i32, b: int32x2_t) -> i64 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqdmulls_laneq_s32(a: i32, b: int32x4_t) -> i64 { static_assert_uimm_bits!(N, 2); - let b: i32 = simd_extract(b, N as u32); + let b: i32 = simd_extract!(b, N as u32); vqdmulls_s32(a, b) } @@ -11605,7 +11605,7 @@ pub unsafe fn vqdmlal_high_laneq_s32(a: int64x2_t, b: int32x4_t, c #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqdmlalh_s16(a: i32, b: i16, c: i16) -> i32 { let x: int32x4_t = vqdmull_s16(vdup_n_s16(b), vdup_n_s16(c)); - vqadds_s32(a, simd_extract(x, 0)) + vqadds_s32(a, 
simd_extract!(x, 0)) } /// Signed saturating doubling multiply-add long @@ -11630,7 +11630,7 @@ pub unsafe fn vqdmlals_s32(a: i64, b: i32, c: i32) -> i64 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqdmlalh_lane_s16(a: i32, b: i16, c: int16x4_t) -> i32 { static_assert_uimm_bits!(LANE, 2); - vqdmlalh_s16(a, b, simd_extract(c, LANE as u32)) + vqdmlalh_s16(a, b, simd_extract!(c, LANE as u32)) } /// Signed saturating doubling multiply-add long @@ -11643,7 +11643,7 @@ pub unsafe fn vqdmlalh_lane_s16(a: i32, b: i16, c: int16x4_t) - #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqdmlalh_laneq_s16(a: i32, b: i16, c: int16x8_t) -> i32 { static_assert_uimm_bits!(LANE, 3); - vqdmlalh_s16(a, b, simd_extract(c, LANE as u32)) + vqdmlalh_s16(a, b, simd_extract!(c, LANE as u32)) } /// Signed saturating doubling multiply-add long @@ -11656,7 +11656,7 @@ pub unsafe fn vqdmlalh_laneq_s16(a: i32, b: i16, c: int16x8_t) #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqdmlals_lane_s32(a: i64, b: i32, c: int32x2_t) -> i64 { static_assert_uimm_bits!(LANE, 1); - vqdmlals_s32(a, b, simd_extract(c, LANE as u32)) + vqdmlals_s32(a, b, simd_extract!(c, LANE as u32)) } /// Signed saturating doubling multiply-add long @@ -11669,7 +11669,7 @@ pub unsafe fn vqdmlals_lane_s32(a: i64, b: i32, c: int32x2_t) - #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqdmlals_laneq_s32(a: i64, b: i32, c: int32x4_t) -> i64 { static_assert_uimm_bits!(LANE, 2); - vqdmlals_s32(a, b, simd_extract(c, LANE as u32)) + vqdmlals_s32(a, b, simd_extract!(c, LANE as u32)) } /// Signed saturating doubling multiply-subtract long @@ -11803,7 +11803,7 @@ pub unsafe fn vqdmlsl_high_laneq_s32(a: int64x2_t, b: int32x4_t, c #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqdmlslh_s16(a: i32, b: i16, c: i16) -> i32 { let x: int32x4_t = vqdmull_s16(vdup_n_s16(b), vdup_n_s16(c)); - vqsubs_s32(a, simd_extract(x, 0)) + vqsubs_s32(a, simd_extract!(x, 0)) } /// Signed saturating doubling multiply-subtract long @@ -11828,7 +11828,7 @@ pub unsafe fn vqdmlsls_s32(a: i64, b: i32, c: i32) -> i64 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqdmlslh_lane_s16(a: i32, b: i16, c: int16x4_t) -> i32 { static_assert_uimm_bits!(LANE, 2); - vqdmlslh_s16(a, b, simd_extract(c, LANE as u32)) + vqdmlslh_s16(a, b, simd_extract!(c, LANE as u32)) } /// Signed saturating doubling multiply-subtract long @@ -11841,7 +11841,7 @@ pub unsafe fn vqdmlslh_lane_s16(a: i32, b: i16, c: int16x4_t) - #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqdmlslh_laneq_s16(a: i32, b: i16, c: int16x8_t) -> i32 { static_assert_uimm_bits!(LANE, 3); - vqdmlslh_s16(a, b, simd_extract(c, LANE as u32)) + vqdmlslh_s16(a, b, simd_extract!(c, LANE as u32)) } /// Signed saturating doubling multiply-subtract long @@ -11854,7 +11854,7 @@ pub unsafe fn vqdmlslh_laneq_s16(a: i32, b: i16, c: int16x8_t) #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqdmlsls_lane_s32(a: i64, b: i32, c: int32x2_t) -> i64 { static_assert_uimm_bits!(LANE, 1); - vqdmlsls_s32(a, b, simd_extract(c, LANE as u32)) + vqdmlsls_s32(a, b, simd_extract!(c, LANE as u32)) } /// Signed saturating doubling multiply-subtract long @@ -11867,7 +11867,7 @@ pub unsafe fn vqdmlsls_lane_s32(a: i64, b: i32, c: int32x2_t) - #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqdmlsls_laneq_s32(a: i64, b: i32, c: int32x4_t) -> i64 { 
static_assert_uimm_bits!(LANE, 2); - vqdmlsls_s32(a, b, simd_extract(c, LANE as u32)) + vqdmlsls_s32(a, b, simd_extract!(c, LANE as u32)) } /// Signed saturating doubling multiply returning high half @@ -11880,7 +11880,7 @@ pub unsafe fn vqdmlsls_laneq_s32(a: i64, b: i32, c: int32x4_t) pub unsafe fn vqdmulhh_s16(a: i16, b: i16) -> i16 { let a: int16x4_t = vdup_n_s16(a); let b: int16x4_t = vdup_n_s16(b); - simd_extract(vqdmulh_s16(a, b), 0) + simd_extract!(vqdmulh_s16(a, b), 0) } /// Signed saturating doubling multiply returning high half @@ -11893,7 +11893,7 @@ pub unsafe fn vqdmulhh_s16(a: i16, b: i16) -> i16 { pub unsafe fn vqdmulhs_s32(a: i32, b: i32) -> i32 { let a: int32x2_t = vdup_n_s32(a); let b: int32x2_t = vdup_n_s32(b); - simd_extract(vqdmulh_s32(a, b), 0) + simd_extract!(vqdmulh_s32(a, b), 0) } /// Signed saturating doubling multiply returning high half @@ -11906,7 +11906,7 @@ pub unsafe fn vqdmulhs_s32(a: i32, b: i32) -> i32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqdmulhh_lane_s16(a: i16, b: int16x4_t) -> i16 { static_assert_uimm_bits!(N, 2); - let b: i16 = simd_extract(b, N as u32); + let b: i16 = simd_extract!(b, N as u32); vqdmulhh_s16(a, b) } @@ -11920,7 +11920,7 @@ pub unsafe fn vqdmulhh_lane_s16(a: i16, b: int16x4_t) -> i16 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqdmulhh_laneq_s16(a: i16, b: int16x8_t) -> i16 { static_assert_uimm_bits!(N, 3); - let b: i16 = simd_extract(b, N as u32); + let b: i16 = simd_extract!(b, N as u32); vqdmulhh_s16(a, b) } @@ -11934,7 +11934,7 @@ pub unsafe fn vqdmulhh_laneq_s16(a: i16, b: int16x8_t) -> i16 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqdmulhs_lane_s32(a: i32, b: int32x2_t) -> i32 { static_assert_uimm_bits!(N, 1); - let b: i32 = simd_extract(b, N as u32); + let b: i32 = simd_extract!(b, N as u32); vqdmulhs_s32(a, b) } @@ -11948,7 +11948,7 @@ pub unsafe fn vqdmulhs_lane_s32(a: i32, b: int32x2_t) -> i32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqdmulhs_laneq_s32(a: i32, b: int32x4_t) -> i32 { static_assert_uimm_bits!(N, 2); - let b: i32 = simd_extract(b, N as u32); + let b: i32 = simd_extract!(b, N as u32); vqdmulhs_s32(a, b) } @@ -11962,7 +11962,7 @@ pub unsafe fn vqdmulhs_laneq_s32(a: i32, b: int32x4_t) -> i32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqdmulh_lane_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t { static_assert_uimm_bits!(LANE, 2); - vqdmulh_s16(a, vdup_n_s16(simd_extract(b, LANE as u32))) + vqdmulh_s16(a, vdup_n_s16(simd_extract!(b, LANE as u32))) } /// Vector saturating doubling multiply high by scalar @@ -11975,7 +11975,7 @@ pub unsafe fn vqdmulh_lane_s16(a: int16x4_t, b: int16x4_t) -> i #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqdmulhq_lane_s16(a: int16x8_t, b: int16x4_t) -> int16x8_t { static_assert_uimm_bits!(LANE, 2); - vqdmulhq_s16(a, vdupq_n_s16(simd_extract(b, LANE as u32))) + vqdmulhq_s16(a, vdupq_n_s16(simd_extract!(b, LANE as u32))) } /// Vector saturating doubling multiply high by scalar @@ -11988,7 +11988,7 @@ pub unsafe fn vqdmulhq_lane_s16(a: int16x8_t, b: int16x4_t) -> #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqdmulh_lane_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t { static_assert_uimm_bits!(LANE, 1); - vqdmulh_s32(a, vdup_n_s32(simd_extract(b, LANE as u32))) + vqdmulh_s32(a, vdup_n_s32(simd_extract!(b, LANE as u32))) } /// Vector saturating doubling multiply high by scalar @@ -12001,7 
+12001,7 @@ pub unsafe fn vqdmulh_lane_s32(a: int32x2_t, b: int32x2_t) -> i #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqdmulhq_lane_s32(a: int32x4_t, b: int32x2_t) -> int32x4_t { static_assert_uimm_bits!(LANE, 1); - vqdmulhq_s32(a, vdupq_n_s32(simd_extract(b, LANE as u32))) + vqdmulhq_s32(a, vdupq_n_s32(simd_extract!(b, LANE as u32))) } /// Saturating extract narrow @@ -12012,7 +12012,7 @@ pub unsafe fn vqdmulhq_lane_s32(a: int32x4_t, b: int32x2_t) -> #[cfg_attr(test, assert_instr(sqxtn))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqmovnh_s16(a: i16) -> i8 { - simd_extract(vqmovn_s16(vdupq_n_s16(a)), 0) + simd_extract!(vqmovn_s16(vdupq_n_s16(a)), 0) } /// Saturating extract narrow @@ -12023,7 +12023,7 @@ pub unsafe fn vqmovnh_s16(a: i16) -> i8 { #[cfg_attr(test, assert_instr(sqxtn))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqmovns_s32(a: i32) -> i16 { - simd_extract(vqmovn_s32(vdupq_n_s32(a)), 0) + simd_extract!(vqmovn_s32(vdupq_n_s32(a)), 0) } /// Saturating extract narrow @@ -12034,7 +12034,7 @@ pub unsafe fn vqmovns_s32(a: i32) -> i16 { #[cfg_attr(test, assert_instr(uqxtn))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqmovnh_u16(a: u16) -> u8 { - simd_extract(vqmovn_u16(vdupq_n_u16(a)), 0) + simd_extract!(vqmovn_u16(vdupq_n_u16(a)), 0) } /// Saturating extract narrow @@ -12045,7 +12045,7 @@ pub unsafe fn vqmovnh_u16(a: u16) -> u8 { #[cfg_attr(test, assert_instr(uqxtn))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqmovns_u32(a: u32) -> u16 { - simd_extract(vqmovn_u32(vdupq_n_u32(a)), 0) + simd_extract!(vqmovn_u32(vdupq_n_u32(a)), 0) } /// Saturating extract narrow @@ -12154,7 +12154,7 @@ pub unsafe fn vqmovn_high_u64(a: uint32x2_t, b: uint64x2_t) -> uint32x4_t { #[cfg_attr(test, assert_instr(sqxtun))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqmovunh_s16(a: i16) -> u8 { - simd_extract(vqmovun_s16(vdupq_n_s16(a)), 0) + simd_extract!(vqmovun_s16(vdupq_n_s16(a)), 0) } /// Signed saturating extract unsigned narrow @@ -12165,7 +12165,7 @@ pub unsafe fn vqmovunh_s16(a: i16) -> u8 { #[cfg_attr(test, assert_instr(sqxtun))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqmovuns_s32(a: i32) -> u16 { - simd_extract(vqmovun_s32(vdupq_n_s32(a)), 0) + simd_extract!(vqmovun_s32(vdupq_n_s32(a)), 0) } /// Signed saturating extract unsigned narrow @@ -12176,7 +12176,7 @@ pub unsafe fn vqmovuns_s32(a: i32) -> u16 { #[cfg_attr(test, assert_instr(sqxtun))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqmovund_s64(a: i64) -> u32 { - simd_extract(vqmovun_s64(vdupq_n_s64(a)), 0) + simd_extract!(vqmovun_s64(vdupq_n_s64(a)), 0) } /// Signed saturating extract unsigned narrow @@ -12220,7 +12220,7 @@ pub unsafe fn vqmovun_high_s64(a: uint32x2_t, b: int64x2_t) -> uint32x4_t { #[cfg_attr(test, assert_instr(sqrdmulh))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqrdmulhh_s16(a: i16, b: i16) -> i16 { - simd_extract(vqrdmulh_s16(vdup_n_s16(a), vdup_n_s16(b)), 0) + simd_extract!(vqrdmulh_s16(vdup_n_s16(a), vdup_n_s16(b)), 0) } /// Signed saturating rounding doubling multiply returning high half @@ -12231,7 +12231,7 @@ pub unsafe fn vqrdmulhh_s16(a: i16, b: i16) -> i16 { #[cfg_attr(test, assert_instr(sqrdmulh))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqrdmulhs_s32(a: i32, b: i32) -> i32 { - simd_extract(vqrdmulh_s32(vdup_n_s32(a), 
vdup_n_s32(b)), 0) + simd_extract!(vqrdmulh_s32(vdup_n_s32(a), vdup_n_s32(b)), 0) } /// Signed saturating rounding doubling multiply returning high half @@ -12244,7 +12244,7 @@ pub unsafe fn vqrdmulhs_s32(a: i32, b: i32) -> i32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqrdmulhh_lane_s16(a: i16, b: int16x4_t) -> i16 { static_assert_uimm_bits!(LANE, 2); - vqrdmulhh_s16(a, simd_extract(b, LANE as u32)) + vqrdmulhh_s16(a, simd_extract!(b, LANE as u32)) } /// Signed saturating rounding doubling multiply returning high half @@ -12257,7 +12257,7 @@ pub unsafe fn vqrdmulhh_lane_s16(a: i16, b: int16x4_t) -> i16 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqrdmulhh_laneq_s16(a: i16, b: int16x8_t) -> i16 { static_assert_uimm_bits!(LANE, 3); - vqrdmulhh_s16(a, simd_extract(b, LANE as u32)) + vqrdmulhh_s16(a, simd_extract!(b, LANE as u32)) } /// Signed saturating rounding doubling multiply returning high half @@ -12270,7 +12270,7 @@ pub unsafe fn vqrdmulhh_laneq_s16(a: i16, b: int16x8_t) -> i16 #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqrdmulhs_lane_s32(a: i32, b: int32x2_t) -> i32 { static_assert_uimm_bits!(LANE, 1); - vqrdmulhs_s32(a, simd_extract(b, LANE as u32)) + vqrdmulhs_s32(a, simd_extract!(b, LANE as u32)) } /// Signed saturating rounding doubling multiply returning high half @@ -12283,7 +12283,7 @@ pub unsafe fn vqrdmulhs_lane_s32(a: i32, b: int32x2_t) -> i32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqrdmulhs_laneq_s32(a: i32, b: int32x4_t) -> i32 { static_assert_uimm_bits!(LANE, 2); - vqrdmulhs_s32(a, simd_extract(b, LANE as u32)) + vqrdmulhs_s32(a, simd_extract!(b, LANE as u32)) } /// Signed saturating rounding doubling multiply accumulate returning high half @@ -12361,7 +12361,7 @@ pub unsafe fn vqrdmlahh_s16(a: i16, b: i16, c: i16) -> i16 { let a: int16x4_t = vdup_n_s16(a); let b: int16x4_t = vdup_n_s16(b); let c: int16x4_t = vdup_n_s16(c); - simd_extract(vqrdmlah_s16(a, b, c), 0) + simd_extract!(vqrdmlah_s16(a, b, c), 0) } /// Signed saturating rounding doubling multiply accumulate returning high half @@ -12375,7 +12375,7 @@ pub unsafe fn vqrdmlahs_s32(a: i32, b: i32, c: i32) -> i32 { let a: int32x2_t = vdup_n_s32(a); let b: int32x2_t = vdup_n_s32(b); let c: int32x2_t = vdup_n_s32(c); - simd_extract(vqrdmlah_s32(a, b, c), 0) + simd_extract!(vqrdmlah_s32(a, b, c), 0) } /// Signed saturating rounding doubling multiply accumulate returning high half @@ -12500,7 +12500,7 @@ pub unsafe fn vqrdmlahq_laneq_s32(a: int32x4_t, b: int32x4_t, c #[stable(feature = "rdm_intrinsics", since = "1.62.0")] pub unsafe fn vqrdmlahh_lane_s16(a: i16, b: i16, c: int16x4_t) -> i16 { static_assert_uimm_bits!(LANE, 2); - vqrdmlahh_s16(a, b, simd_extract(c, LANE as u32)) + vqrdmlahh_s16(a, b, simd_extract!(c, LANE as u32)) } /// Signed saturating rounding doubling multiply accumulate returning high half @@ -12513,7 +12513,7 @@ pub unsafe fn vqrdmlahh_lane_s16(a: i16, b: i16, c: int16x4_t) #[stable(feature = "rdm_intrinsics", since = "1.62.0")] pub unsafe fn vqrdmlahh_laneq_s16(a: i16, b: i16, c: int16x8_t) -> i16 { static_assert_uimm_bits!(LANE, 3); - vqrdmlahh_s16(a, b, simd_extract(c, LANE as u32)) + vqrdmlahh_s16(a, b, simd_extract!(c, LANE as u32)) } /// Signed saturating rounding doubling multiply accumulate returning high half @@ -12526,7 +12526,7 @@ pub unsafe fn vqrdmlahh_laneq_s16(a: i16, b: i16, c: int16x8_t) #[stable(feature = "rdm_intrinsics", since = "1.62.0")] pub unsafe fn 
vqrdmlahs_lane_s32(a: i32, b: i32, c: int32x2_t) -> i32 { static_assert_uimm_bits!(LANE, 1); - vqrdmlahs_s32(a, b, simd_extract(c, LANE as u32)) + vqrdmlahs_s32(a, b, simd_extract!(c, LANE as u32)) } /// Signed saturating rounding doubling multiply accumulate returning high half @@ -12539,7 +12539,7 @@ pub unsafe fn vqrdmlahs_lane_s32(a: i32, b: i32, c: int32x2_t) #[stable(feature = "rdm_intrinsics", since = "1.62.0")] pub unsafe fn vqrdmlahs_laneq_s32(a: i32, b: i32, c: int32x4_t) -> i32 { static_assert_uimm_bits!(LANE, 2); - vqrdmlahs_s32(a, b, simd_extract(c, LANE as u32)) + vqrdmlahs_s32(a, b, simd_extract!(c, LANE as u32)) } /// Signed saturating rounding doubling multiply subtract returning high half @@ -12617,7 +12617,7 @@ pub unsafe fn vqrdmlshh_s16(a: i16, b: i16, c: i16) -> i16 { let a: int16x4_t = vdup_n_s16(a); let b: int16x4_t = vdup_n_s16(b); let c: int16x4_t = vdup_n_s16(c); - simd_extract(vqrdmlsh_s16(a, b, c), 0) + simd_extract!(vqrdmlsh_s16(a, b, c), 0) } /// Signed saturating rounding doubling multiply subtract returning high half @@ -12631,7 +12631,7 @@ pub unsafe fn vqrdmlshs_s32(a: i32, b: i32, c: i32) -> i32 { let a: int32x2_t = vdup_n_s32(a); let b: int32x2_t = vdup_n_s32(b); let c: int32x2_t = vdup_n_s32(c); - simd_extract(vqrdmlsh_s32(a, b, c), 0) + simd_extract!(vqrdmlsh_s32(a, b, c), 0) } /// Signed saturating rounding doubling multiply subtract returning high half @@ -12756,7 +12756,7 @@ pub unsafe fn vqrdmlshq_laneq_s32(a: int32x4_t, b: int32x4_t, c #[stable(feature = "rdm_intrinsics", since = "1.62.0")] pub unsafe fn vqrdmlshh_lane_s16(a: i16, b: i16, c: int16x4_t) -> i16 { static_assert_uimm_bits!(LANE, 2); - vqrdmlshh_s16(a, b, simd_extract(c, LANE as u32)) + vqrdmlshh_s16(a, b, simd_extract!(c, LANE as u32)) } /// Signed saturating rounding doubling multiply subtract returning high half @@ -12769,7 +12769,7 @@ pub unsafe fn vqrdmlshh_lane_s16(a: i16, b: i16, c: int16x4_t) #[stable(feature = "rdm_intrinsics", since = "1.62.0")] pub unsafe fn vqrdmlshh_laneq_s16(a: i16, b: i16, c: int16x8_t) -> i16 { static_assert_uimm_bits!(LANE, 3); - vqrdmlshh_s16(a, b, simd_extract(c, LANE as u32)) + vqrdmlshh_s16(a, b, simd_extract!(c, LANE as u32)) } /// Signed saturating rounding doubling multiply subtract returning high half @@ -12782,7 +12782,7 @@ pub unsafe fn vqrdmlshh_laneq_s16(a: i16, b: i16, c: int16x8_t) #[stable(feature = "rdm_intrinsics", since = "1.62.0")] pub unsafe fn vqrdmlshs_lane_s32(a: i32, b: i32, c: int32x2_t) -> i32 { static_assert_uimm_bits!(LANE, 1); - vqrdmlshs_s32(a, b, simd_extract(c, LANE as u32)) + vqrdmlshs_s32(a, b, simd_extract!(c, LANE as u32)) } /// Signed saturating rounding doubling multiply subtract returning high half @@ -12795,7 +12795,7 @@ pub unsafe fn vqrdmlshs_lane_s32(a: i32, b: i32, c: int32x2_t) #[stable(feature = "rdm_intrinsics", since = "1.62.0")] pub unsafe fn vqrdmlshs_laneq_s32(a: i32, b: i32, c: int32x4_t) -> i32 { static_assert_uimm_bits!(LANE, 2); - vqrdmlshs_s32(a, b, simd_extract(c, LANE as u32)) + vqrdmlshs_s32(a, b, simd_extract!(c, LANE as u32)) } /// Signed saturating rounding shift left @@ -12840,7 +12840,7 @@ pub unsafe fn vqrshld_s64(a: i64, b: i64) -> i64 { pub unsafe fn vqrshlb_s8(a: i8, b: i8) -> i8 { let a: int8x8_t = vdup_n_s8(a); let b: int8x8_t = vdup_n_s8(b); - simd_extract(vqrshl_s8(a, b), 0) + simd_extract!(vqrshl_s8(a, b), 0) } /// Signed saturating rounding shift left @@ -12853,7 +12853,7 @@ pub unsafe fn vqrshlb_s8(a: i8, b: i8) -> i8 { pub unsafe fn vqrshlh_s16(a: i16, b: i16) -> i16 { let a: 
int16x4_t = vdup_n_s16(a); let b: int16x4_t = vdup_n_s16(b); - simd_extract(vqrshl_s16(a, b), 0) + simd_extract!(vqrshl_s16(a, b), 0) } /// Unsigned signed saturating rounding shift left @@ -12898,7 +12898,7 @@ pub unsafe fn vqrshld_u64(a: u64, b: i64) -> u64 { pub unsafe fn vqrshlb_u8(a: u8, b: i8) -> u8 { let a: uint8x8_t = vdup_n_u8(a); let b: int8x8_t = vdup_n_s8(b); - simd_extract(vqrshl_u8(a, b), 0) + simd_extract!(vqrshl_u8(a, b), 0) } /// Unsigned signed saturating rounding shift left @@ -12911,7 +12911,7 @@ pub unsafe fn vqrshlb_u8(a: u8, b: i8) -> u8 { pub unsafe fn vqrshlh_u16(a: u16, b: i16) -> u16 { let a: uint16x4_t = vdup_n_u16(a); let b: int16x4_t = vdup_n_s16(b); - simd_extract(vqrshl_u16(a, b), 0) + simd_extract!(vqrshl_u16(a, b), 0) } /// Signed saturating rounded shift right narrow @@ -12925,7 +12925,7 @@ pub unsafe fn vqrshlh_u16(a: u16, b: i16) -> u16 { pub unsafe fn vqrshrnh_n_s16(a: i16) -> i8 { static_assert!(N >= 1 && N <= 8); let a: int16x8_t = vdupq_n_s16(a); - simd_extract(vqrshrn_n_s16::(a), 0) + simd_extract!(vqrshrn_n_s16::(a), 0) } /// Signed saturating rounded shift right narrow @@ -12939,7 +12939,7 @@ pub unsafe fn vqrshrnh_n_s16(a: i16) -> i8 { pub unsafe fn vqrshrns_n_s32(a: i32) -> i16 { static_assert!(N >= 1 && N <= 16); let a: int32x4_t = vdupq_n_s32(a); - simd_extract(vqrshrn_n_s32::(a), 0) + simd_extract!(vqrshrn_n_s32::(a), 0) } /// Signed saturating rounded shift right narrow @@ -12953,7 +12953,7 @@ pub unsafe fn vqrshrns_n_s32(a: i32) -> i16 { pub unsafe fn vqrshrnd_n_s64(a: i64) -> i32 { static_assert!(N >= 1 && N <= 32); let a: int64x2_t = vdupq_n_s64(a); - simd_extract(vqrshrn_n_s64::(a), 0) + simd_extract!(vqrshrn_n_s64::(a), 0) } /// Signed saturating rounded shift right narrow @@ -13006,7 +13006,7 @@ pub unsafe fn vqrshrn_high_n_s64(a: int32x2_t, b: int64x2_t) -> in pub unsafe fn vqrshrnh_n_u16(a: u16) -> u8 { static_assert!(N >= 1 && N <= 8); let a: uint16x8_t = vdupq_n_u16(a); - simd_extract(vqrshrn_n_u16::(a), 0) + simd_extract!(vqrshrn_n_u16::(a), 0) } /// Unsigned saturating rounded shift right narrow @@ -13020,7 +13020,7 @@ pub unsafe fn vqrshrnh_n_u16(a: u16) -> u8 { pub unsafe fn vqrshrns_n_u32(a: u32) -> u16 { static_assert!(N >= 1 && N <= 16); let a: uint32x4_t = vdupq_n_u32(a); - simd_extract(vqrshrn_n_u32::(a), 0) + simd_extract!(vqrshrn_n_u32::(a), 0) } /// Unsigned saturating rounded shift right narrow @@ -13034,7 +13034,7 @@ pub unsafe fn vqrshrns_n_u32(a: u32) -> u16 { pub unsafe fn vqrshrnd_n_u64(a: u64) -> u32 { static_assert!(N >= 1 && N <= 32); let a: uint64x2_t = vdupq_n_u64(a); - simd_extract(vqrshrn_n_u64::(a), 0) + simd_extract!(vqrshrn_n_u64::(a), 0) } /// Unsigned saturating rounded shift right narrow @@ -13087,7 +13087,7 @@ pub unsafe fn vqrshrn_high_n_u64(a: uint32x2_t, b: uint64x2_t) -> pub unsafe fn vqrshrunh_n_s16(a: i16) -> u8 { static_assert!(N >= 1 && N <= 8); let a: int16x8_t = vdupq_n_s16(a); - simd_extract(vqrshrun_n_s16::(a), 0) + simd_extract!(vqrshrun_n_s16::(a), 0) } /// Signed saturating rounded shift right unsigned narrow @@ -13101,7 +13101,7 @@ pub unsafe fn vqrshrunh_n_s16(a: i16) -> u8 { pub unsafe fn vqrshruns_n_s32(a: i32) -> u16 { static_assert!(N >= 1 && N <= 16); let a: int32x4_t = vdupq_n_s32(a); - simd_extract(vqrshrun_n_s32::(a), 0) + simd_extract!(vqrshrun_n_s32::(a), 0) } /// Signed saturating rounded shift right unsigned narrow @@ -13115,7 +13115,7 @@ pub unsafe fn vqrshruns_n_s32(a: i32) -> u16 { pub unsafe fn vqrshrund_n_s64(a: i64) -> u32 { static_assert!(N >= 1 && N <= 32); let 
a: int64x2_t = vdupq_n_s64(a); - simd_extract(vqrshrun_n_s64::(a), 0) + simd_extract!(vqrshrun_n_s64::(a), 0) } /// Signed saturating rounded shift right unsigned narrow @@ -13182,7 +13182,7 @@ pub unsafe fn vqshld_s64(a: i64, b: i64) -> i64 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqshlb_s8(a: i8, b: i8) -> i8 { let c: int8x8_t = vqshl_s8(vdup_n_s8(a), vdup_n_s8(b)); - simd_extract(c, 0) + simd_extract!(c, 0) } /// Signed saturating shift left @@ -13194,7 +13194,7 @@ pub unsafe fn vqshlb_s8(a: i8, b: i8) -> i8 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqshlh_s16(a: i16, b: i16) -> i16 { let c: int16x4_t = vqshl_s16(vdup_n_s16(a), vdup_n_s16(b)); - simd_extract(c, 0) + simd_extract!(c, 0) } /// Signed saturating shift left @@ -13206,7 +13206,7 @@ pub unsafe fn vqshlh_s16(a: i16, b: i16) -> i16 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqshls_s32(a: i32, b: i32) -> i32 { let c: int32x2_t = vqshl_s32(vdup_n_s32(a), vdup_n_s32(b)); - simd_extract(c, 0) + simd_extract!(c, 0) } /// Unsigned saturating shift left @@ -13234,7 +13234,7 @@ pub unsafe fn vqshld_u64(a: u64, b: i64) -> u64 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqshlb_u8(a: u8, b: i8) -> u8 { let c: uint8x8_t = vqshl_u8(vdup_n_u8(a), vdup_n_s8(b)); - simd_extract(c, 0) + simd_extract!(c, 0) } /// Unsigned saturating shift left @@ -13246,7 +13246,7 @@ pub unsafe fn vqshlb_u8(a: u8, b: i8) -> u8 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqshlh_u16(a: u16, b: i16) -> u16 { let c: uint16x4_t = vqshl_u16(vdup_n_u16(a), vdup_n_s16(b)); - simd_extract(c, 0) + simd_extract!(c, 0) } /// Unsigned saturating shift left @@ -13258,7 +13258,7 @@ pub unsafe fn vqshlh_u16(a: u16, b: i16) -> u16 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqshls_u32(a: u32, b: i32) -> u32 { let c: uint32x2_t = vqshl_u32(vdup_n_u32(a), vdup_n_s32(b)); - simd_extract(c, 0) + simd_extract!(c, 0) } /// Signed saturating shift left @@ -13271,7 +13271,7 @@ pub unsafe fn vqshls_u32(a: u32, b: i32) -> u32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqshlb_n_s8(a: i8) -> i8 { static_assert_uimm_bits!(N, 3); - simd_extract(vqshl_n_s8::(vdup_n_s8(a)), 0) + simd_extract!(vqshl_n_s8::(vdup_n_s8(a)), 0) } /// Signed saturating shift left @@ -13284,7 +13284,7 @@ pub unsafe fn vqshlb_n_s8(a: i8) -> i8 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqshlh_n_s16(a: i16) -> i16 { static_assert_uimm_bits!(N, 4); - simd_extract(vqshl_n_s16::(vdup_n_s16(a)), 0) + simd_extract!(vqshl_n_s16::(vdup_n_s16(a)), 0) } /// Signed saturating shift left @@ -13297,7 +13297,7 @@ pub unsafe fn vqshlh_n_s16(a: i16) -> i16 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqshls_n_s32(a: i32) -> i32 { static_assert_uimm_bits!(N, 5); - simd_extract(vqshl_n_s32::(vdup_n_s32(a)), 0) + simd_extract!(vqshl_n_s32::(vdup_n_s32(a)), 0) } /// Signed saturating shift left @@ -13310,7 +13310,7 @@ pub unsafe fn vqshls_n_s32(a: i32) -> i32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqshld_n_s64(a: i64) -> i64 { static_assert_uimm_bits!(N, 6); - simd_extract(vqshl_n_s64::(vdup_n_s64(a)), 0) + simd_extract!(vqshl_n_s64::(vdup_n_s64(a)), 0) } /// Unsigned saturating shift left @@ -13323,7 +13323,7 @@ pub unsafe fn vqshld_n_s64(a: i64) -> i64 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqshlb_n_u8(a: 
u8) -> u8 { static_assert_uimm_bits!(N, 3); - simd_extract(vqshl_n_u8::(vdup_n_u8(a)), 0) + simd_extract!(vqshl_n_u8::(vdup_n_u8(a)), 0) } /// Unsigned saturating shift left @@ -13336,7 +13336,7 @@ pub unsafe fn vqshlb_n_u8(a: u8) -> u8 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqshlh_n_u16(a: u16) -> u16 { static_assert_uimm_bits!(N, 4); - simd_extract(vqshl_n_u16::(vdup_n_u16(a)), 0) + simd_extract!(vqshl_n_u16::(vdup_n_u16(a)), 0) } /// Unsigned saturating shift left @@ -13349,7 +13349,7 @@ pub unsafe fn vqshlh_n_u16(a: u16) -> u16 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqshls_n_u32(a: u32) -> u32 { static_assert_uimm_bits!(N, 5); - simd_extract(vqshl_n_u32::(vdup_n_u32(a)), 0) + simd_extract!(vqshl_n_u32::(vdup_n_u32(a)), 0) } /// Unsigned saturating shift left @@ -13362,7 +13362,7 @@ pub unsafe fn vqshls_n_u32(a: u32) -> u32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqshld_n_u64(a: u64) -> u64 { static_assert_uimm_bits!(N, 6); - simd_extract(vqshl_n_u64::(vdup_n_u64(a)), 0) + simd_extract!(vqshl_n_u64::(vdup_n_u64(a)), 0) } /// Signed saturating shift left unsigned @@ -13375,7 +13375,7 @@ pub unsafe fn vqshld_n_u64(a: u64) -> u64 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqshlub_n_s8(a: i8) -> u8 { static_assert_uimm_bits!(N, 3); - simd_extract(vqshlu_n_s8::(vdup_n_s8(a)), 0) + simd_extract!(vqshlu_n_s8::(vdup_n_s8(a)), 0) } /// Signed saturating shift left unsigned @@ -13388,7 +13388,7 @@ pub unsafe fn vqshlub_n_s8(a: i8) -> u8 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqshluh_n_s16(a: i16) -> u16 { static_assert_uimm_bits!(N, 4); - simd_extract(vqshlu_n_s16::(vdup_n_s16(a)), 0) + simd_extract!(vqshlu_n_s16::(vdup_n_s16(a)), 0) } /// Signed saturating shift left unsigned @@ -13401,7 +13401,7 @@ pub unsafe fn vqshluh_n_s16(a: i16) -> u16 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqshlus_n_s32(a: i32) -> u32 { static_assert_uimm_bits!(N, 5); - simd_extract(vqshlu_n_s32::(vdup_n_s32(a)), 0) + simd_extract!(vqshlu_n_s32::(vdup_n_s32(a)), 0) } /// Signed saturating shift left unsigned @@ -13414,7 +13414,7 @@ pub unsafe fn vqshlus_n_s32(a: i32) -> u32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqshlud_n_s64(a: i64) -> u64 { static_assert_uimm_bits!(N, 6); - simd_extract(vqshlu_n_s64::(vdup_n_s64(a)), 0) + simd_extract!(vqshlu_n_s64::(vdup_n_s64(a)), 0) } /// Signed saturating shift right narrow @@ -13445,7 +13445,7 @@ pub unsafe fn vqshrnd_n_s64(a: i64) -> i32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqshrnh_n_s16(a: i16) -> i8 { static_assert!(N >= 1 && N <= 8); - simd_extract(vqshrn_n_s16::(vdupq_n_s16(a)), 0) + simd_extract!(vqshrn_n_s16::(vdupq_n_s16(a)), 0) } /// Signed saturating shift right narrow @@ -13458,7 +13458,7 @@ pub unsafe fn vqshrnh_n_s16(a: i16) -> i8 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqshrns_n_s32(a: i32) -> i16 { static_assert!(N >= 1 && N <= 16); - simd_extract(vqshrn_n_s32::(vdupq_n_s32(a)), 0) + simd_extract!(vqshrn_n_s32::(vdupq_n_s32(a)), 0) } /// Signed saturating shift right narrow @@ -13528,7 +13528,7 @@ pub unsafe fn vqshrnd_n_u64(a: u64) -> u32 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqshrnh_n_u16(a: u16) -> u8 { static_assert!(N >= 1 && N <= 8); - simd_extract(vqshrn_n_u16::(vdupq_n_u16(a)), 0) + simd_extract!(vqshrn_n_u16::(vdupq_n_u16(a)), 
0) } /// Unsigned saturating shift right narrow @@ -13541,7 +13541,7 @@ pub unsafe fn vqshrnh_n_u16(a: u16) -> u8 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqshrns_n_u32(a: u32) -> u16 { static_assert!(N >= 1 && N <= 16); - simd_extract(vqshrn_n_u32::(vdupq_n_u32(a)), 0) + simd_extract!(vqshrn_n_u32::(vdupq_n_u32(a)), 0) } /// Unsigned saturating shift right narrow @@ -13593,7 +13593,7 @@ pub unsafe fn vqshrn_high_n_u64(a: uint32x2_t, b: uint64x2_t) -> u #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqshrunh_n_s16(a: i16) -> u8 { static_assert!(N >= 1 && N <= 8); - simd_extract(vqshrun_n_s16::(vdupq_n_s16(a)), 0) + simd_extract!(vqshrun_n_s16::(vdupq_n_s16(a)), 0) } /// Signed saturating shift right unsigned narrow @@ -13606,7 +13606,7 @@ pub unsafe fn vqshrunh_n_s16(a: i16) -> u8 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqshruns_n_s32(a: i32) -> u16 { static_assert!(N >= 1 && N <= 16); - simd_extract(vqshrun_n_s32::(vdupq_n_s32(a)), 0) + simd_extract!(vqshrun_n_s32::(vdupq_n_s32(a)), 0) } /// Signed saturating shift right unsigned narrow @@ -13619,7 +13619,7 @@ pub unsafe fn vqshruns_n_s32(a: i32) -> u16 { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqshrund_n_s64(a: i64) -> u32 { static_assert!(N >= 1 && N <= 32); - simd_extract(vqshrun_n_s64::(vdupq_n_s64(a)), 0) + simd_extract!(vqshrun_n_s64::(vdupq_n_s64(a)), 0) } /// Signed saturating shift right unsigned narrow @@ -13669,7 +13669,7 @@ pub unsafe fn vqshrun_high_n_s64(a: uint32x2_t, b: int64x2_t) -> u #[cfg_attr(test, assert_instr(usqadd))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vsqaddb_u8(a: u8, b: i8) -> u8 { - simd_extract(vsqadd_u8(vdup_n_u8(a), vdup_n_s8(b)), 0) + simd_extract!(vsqadd_u8(vdup_n_u8(a), vdup_n_s8(b)), 0) } /// Unsigned saturating accumulate of signed value @@ -13680,7 +13680,7 @@ pub unsafe fn vsqaddb_u8(a: u8, b: i8) -> u8 { #[cfg_attr(test, assert_instr(usqadd))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vsqaddh_u16(a: u16, b: i16) -> u16 { - simd_extract(vsqadd_u16(vdup_n_u16(a), vdup_n_s16(b)), 0) + simd_extract!(vsqadd_u16(vdup_n_u16(a), vdup_n_s16(b)), 0) } /// Unsigned saturating accumulate of signed value @@ -14975,7 +14975,7 @@ pub unsafe fn vrsubhn_high_u64(a: uint32x2_t, b: uint64x2_t, c: uint64x2_t) -> u #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vset_lane_f64(a: f64, b: float64x1_t) -> float64x1_t { static_assert!(LANE == 0); - simd_insert(b, LANE as u32, a) + simd_insert!(b, LANE as u32, a) } /// Insert vector element from another vector element @@ -14988,7 +14988,7 @@ pub unsafe fn vset_lane_f64(a: f64, b: float64x1_t) -> float64x #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vsetq_lane_f64(a: f64, b: float64x2_t) -> float64x2_t { static_assert_uimm_bits!(LANE, 1); - simd_insert(b, LANE as u32, a) + simd_insert!(b, LANE as u32, a) } /// Signed Shift left @@ -15396,7 +15396,7 @@ pub unsafe fn vrnd32x_f64(a: float64x1_t) -> float64x1_t { #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.frint32x.f64")] fn vrnd32x_f64_(a: f64) -> f64; } - transmute(vrnd32x_f64_(simd_extract(a, 0))) + transmute(vrnd32x_f64_(simd_extract!(a, 0))) } /// Floating-point round to 32-bit integer toward zero @@ -15460,7 +15460,7 @@ pub unsafe fn vrnd32z_f64(a: float64x1_t) -> float64x1_t { #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.frint32z.f64")] fn vrnd32z_f64_(a: f64) 
-> f64; } - transmute(vrnd32z_f64_(simd_extract(a, 0))) + transmute(vrnd32z_f64_(simd_extract!(a, 0))) } /// Floating-point round to 64-bit integer, using current rounding mode @@ -15524,7 +15524,7 @@ pub unsafe fn vrnd64x_f64(a: float64x1_t) -> float64x1_t { #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.frint64x.f64")] fn vrnd64x_f64_(a: f64) -> f64; } - transmute(vrnd64x_f64_(simd_extract(a, 0))) + transmute(vrnd64x_f64_(simd_extract!(a, 0))) } /// Floating-point round to 64-bit integer toward zero @@ -15588,7 +15588,7 @@ pub unsafe fn vrnd64z_f64(a: float64x1_t) -> float64x1_t { #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.frint64z.f64")] fn vrnd64z_f64_(a: f64) -> f64; } - transmute(vrnd64z_f64_(simd_extract(a, 0))) + transmute(vrnd64z_f64_(simd_extract!(a, 0))) } /// Transpose vectors @@ -17170,7 +17170,7 @@ pub unsafe fn vqabsq_s64(a: int64x2_t) -> int64x2_t { #[cfg_attr(test, assert_instr(sqabs))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqabsb_s8(a: i8) -> i8 { - simd_extract(vqabs_s8(vdup_n_s8(a)), 0) + simd_extract!(vqabs_s8(vdup_n_s8(a)), 0) } /// Signed saturating absolute value @@ -17181,7 +17181,7 @@ pub unsafe fn vqabsb_s8(a: i8) -> i8 { #[cfg_attr(test, assert_instr(sqabs))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vqabsh_s16(a: i16) -> i16 { - simd_extract(vqabs_s16(vdup_n_s16(a)), 0) + simd_extract!(vqabs_s16(vdup_n_s16(a)), 0) } /// Signed saturating absolute value diff --git a/crates/core_arch/src/aarch64/neon/mod.rs b/crates/core_arch/src/aarch64/neon/mod.rs index 567eeb37cc..7556f2915e 100644 --- a/crates/core_arch/src/aarch64/neon/mod.rs +++ b/crates/core_arch/src/aarch64/neon/mod.rs @@ -436,7 +436,7 @@ pub unsafe fn vcopy_laneq_s64( ) -> int64x1_t { static_assert!(LANE1 == 0); static_assert_uimm_bits!(LANE2, 1); - transmute::(simd_extract(b, LANE2 as u32)) + transmute::(simd_extract!(b, LANE2 as u32)) } /// Duplicate vector element to vector or scalar @@ -451,7 +451,7 @@ pub unsafe fn vcopy_laneq_u64( ) -> uint64x1_t { static_assert!(LANE1 == 0); static_assert_uimm_bits!(LANE2, 1); - transmute::(simd_extract(b, LANE2 as u32)) + transmute::(simd_extract!(b, LANE2 as u32)) } /// Duplicate vector element to vector or scalar @@ -466,7 +466,7 @@ pub unsafe fn vcopy_laneq_p64( ) -> poly64x1_t { static_assert!(LANE1 == 0); static_assert_uimm_bits!(LANE2, 1); - transmute::(simd_extract(b, LANE2 as u32)) + transmute::(simd_extract!(b, LANE2 as u32)) } /// Duplicate vector element to vector or scalar @@ -481,7 +481,7 @@ pub unsafe fn vcopy_laneq_f64( ) -> float64x1_t { static_assert!(LANE1 == 0); static_assert_uimm_bits!(LANE2, 1); - transmute::(simd_extract(b, LANE2 as u32)) + transmute::(simd_extract!(b, LANE2 as u32)) } /// Load multiple single-element structures to one, two, three, or four registers. @@ -749,7 +749,7 @@ pub unsafe fn vld1q_dup_f64(ptr: *const f64) -> float64x2_t { #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vld1_lane_f64(ptr: *const f64, src: float64x1_t) -> float64x1_t { static_assert!(LANE == 0); - simd_insert(src, LANE as u32, *ptr) + simd_insert!(src, LANE as u32, *ptr) } /// Load one single-element structure to one lane of one register. 
@@ -760,7 +760,7 @@ pub unsafe fn vld1_lane_f64(ptr: *const f64, src: float64x1_t) #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vld1q_lane_f64(ptr: *const f64, src: float64x2_t) -> float64x2_t { static_assert_uimm_bits!(LANE, 1); - simd_insert(src, LANE as u32, *ptr) + simd_insert!(src, LANE as u32, *ptr) } /// Store multiple single-element structures from one, two, three, or four registers. @@ -2038,7 +2038,7 @@ pub unsafe fn vmovq_n_f64(value: f64) -> float64x2_t { #[cfg_attr(test, assert_instr(mov))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vget_high_f64(a: float64x2_t) -> float64x1_t { - float64x1_t(simd_extract(a, 1)) + float64x1_t(simd_extract!(a, 1)) } /// Duplicate vector element to vector or scalar @@ -2047,7 +2047,7 @@ pub unsafe fn vget_high_f64(a: float64x2_t) -> float64x1_t { #[cfg_attr(test, assert_instr(ext))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vget_high_p64(a: poly64x2_t) -> poly64x1_t { - transmute(u64x1::new(simd_extract(a, 1))) + transmute(u64x1::new(simd_extract!(a, 1))) } /// Duplicate vector element to vector or scalar @@ -2056,7 +2056,7 @@ pub unsafe fn vget_high_p64(a: poly64x2_t) -> poly64x1_t { #[cfg_attr(test, assert_instr(nop))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vget_low_f64(a: float64x2_t) -> float64x1_t { - float64x1_t(simd_extract(a, 0)) + float64x1_t(simd_extract!(a, 0)) } /// Duplicate vector element to vector or scalar @@ -2065,7 +2065,7 @@ pub unsafe fn vget_low_f64(a: float64x2_t) -> float64x1_t { #[cfg_attr(test, assert_instr(nop))] #[stable(feature = "neon_intrinsics", since = "1.59.0")] pub unsafe fn vget_low_p64(a: poly64x2_t) -> poly64x1_t { - transmute(u64x1::new(simd_extract(a, 0))) + transmute(u64x1::new(simd_extract!(a, 0))) } /// Duplicate vector element to vector or scalar @@ -2076,7 +2076,7 @@ pub unsafe fn vget_low_p64(a: poly64x2_t) -> poly64x1_t { #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, IMM5 = 0))] pub unsafe fn vget_lane_f64(v: float64x1_t) -> f64 { static_assert!(IMM5 == 0); - simd_extract(v, IMM5 as u32) + simd_extract!(v, IMM5 as u32) } /// Duplicate vector element to vector or scalar @@ -2087,7 +2087,7 @@ pub unsafe fn vget_lane_f64(v: float64x1_t) -> f64 { #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, IMM5 = 0))] pub unsafe fn vgetq_lane_f64(v: float64x2_t) -> f64 { static_assert_uimm_bits!(IMM5, 1); - simd_extract(v, IMM5 as u32) + simd_extract!(v, IMM5 as u32) } /// Vector combine diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs index 0c0fd53e18..631c302db9 100644 --- a/crates/core_arch/src/arm_shared/neon/generated.rs +++ b/crates/core_arch/src/arm_shared/neon/generated.rs @@ -4141,7 +4141,7 @@ pub unsafe fn vdup_lane_u64(a: uint64x1_t) -> uint64x1_t { #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vdup_laneq_s64(a: int64x2_t) -> int64x1_t { static_assert_uimm_bits!(N, 1); - transmute::(simd_extract(a, N as u32)) + transmute::(simd_extract!(a, N as u32)) } /// Set all vector lanes to the same value @@ -4157,7 +4157,7 @@ pub unsafe fn vdup_laneq_s64(a: int64x2_t) -> int64x1_t { #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vdup_laneq_u64(a: uint64x2_t) -> uint64x1_t { static_assert_uimm_bits!(N, 1); - transmute::(simd_extract(a, N as u32)) + 
transmute::(simd_extract!(a, N as u32)) } /// Extract vector from pair of vectors @@ -13117,7 +13117,7 @@ vld4q_lane_f32_(b.0, b.1, b.2, b.3, LANE as i64, a as _) #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vst1_lane_s8(a: *mut i8, b: int8x8_t) { static_assert_uimm_bits!(LANE, 3); - *a = simd_extract(b, LANE as u32); + *a = simd_extract!(b, LANE as u32); } /// Store multiple single-element structures from one, two, three, or four registers @@ -13133,7 +13133,7 @@ pub unsafe fn vst1_lane_s8(a: *mut i8, b: int8x8_t) { #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vst1_lane_s16(a: *mut i16, b: int16x4_t) { static_assert_uimm_bits!(LANE, 2); - *a = simd_extract(b, LANE as u32); + *a = simd_extract!(b, LANE as u32); } /// Store multiple single-element structures from one, two, three, or four registers @@ -13149,7 +13149,7 @@ pub unsafe fn vst1_lane_s16(a: *mut i16, b: int16x4_t) { #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vst1_lane_s32(a: *mut i32, b: int32x2_t) { static_assert_uimm_bits!(LANE, 1); - *a = simd_extract(b, LANE as u32); + *a = simd_extract!(b, LANE as u32); } /// Store multiple single-element structures from one, two, three, or four registers @@ -13165,7 +13165,7 @@ pub unsafe fn vst1_lane_s32(a: *mut i32, b: int32x2_t) { #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vst1_lane_s64(a: *mut i64, b: int64x1_t) { static_assert!(LANE == 0); - *a = simd_extract(b, LANE as u32); + *a = simd_extract!(b, LANE as u32); } /// Store multiple single-element structures from one, two, three, or four registers @@ -13181,7 +13181,7 @@ pub unsafe fn vst1_lane_s64(a: *mut i64, b: int64x1_t) { #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vst1q_lane_s8(a: *mut i8, b: int8x16_t) { static_assert_uimm_bits!(LANE, 4); - *a = simd_extract(b, LANE as u32); + *a = simd_extract!(b, LANE as u32); } /// Store multiple single-element structures from one, two, three, or four registers @@ -13197,7 +13197,7 @@ pub unsafe fn vst1q_lane_s8(a: *mut i8, b: int8x16_t) { #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vst1q_lane_s16(a: *mut i16, b: int16x8_t) { static_assert_uimm_bits!(LANE, 3); - *a = simd_extract(b, LANE as u32); + *a = simd_extract!(b, LANE as u32); } /// Store multiple single-element structures from one, two, three, or four registers @@ -13213,7 +13213,7 @@ pub unsafe fn vst1q_lane_s16(a: *mut i16, b: int16x8_t) { #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vst1q_lane_s32(a: *mut i32, b: int32x4_t) { static_assert_uimm_bits!(LANE, 2); - *a = simd_extract(b, LANE as u32); + *a = simd_extract!(b, LANE as u32); } /// Store multiple single-element structures from one, two, three, or four registers @@ -13229,7 +13229,7 @@ pub unsafe fn vst1q_lane_s32(a: *mut i32, b: int32x4_t) { #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vst1q_lane_s64(a: *mut i64, b: int64x2_t) { static_assert_uimm_bits!(LANE, 1); - *a = simd_extract(b, LANE as u32); + *a = simd_extract!(b, LANE as u32); } /// Store multiple single-element structures from one, two, three, or 
four registers @@ -13245,7 +13245,7 @@ pub unsafe fn vst1q_lane_s64(a: *mut i64, b: int64x2_t) { #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vst1_lane_u8(a: *mut u8, b: uint8x8_t) { static_assert_uimm_bits!(LANE, 3); - *a = simd_extract(b, LANE as u32); + *a = simd_extract!(b, LANE as u32); } /// Store multiple single-element structures from one, two, three, or four registers @@ -13261,7 +13261,7 @@ pub unsafe fn vst1_lane_u8(a: *mut u8, b: uint8x8_t) { #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vst1_lane_u16(a: *mut u16, b: uint16x4_t) { static_assert_uimm_bits!(LANE, 2); - *a = simd_extract(b, LANE as u32); + *a = simd_extract!(b, LANE as u32); } /// Store multiple single-element structures from one, two, three, or four registers @@ -13277,7 +13277,7 @@ pub unsafe fn vst1_lane_u16(a: *mut u16, b: uint16x4_t) { #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vst1_lane_u32(a: *mut u32, b: uint32x2_t) { static_assert_uimm_bits!(LANE, 1); - *a = simd_extract(b, LANE as u32); + *a = simd_extract!(b, LANE as u32); } /// Store multiple single-element structures from one, two, three, or four registers @@ -13293,7 +13293,7 @@ pub unsafe fn vst1_lane_u32(a: *mut u32, b: uint32x2_t) { #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vst1_lane_u64(a: *mut u64, b: uint64x1_t) { static_assert!(LANE == 0); - *a = simd_extract(b, LANE as u32); + *a = simd_extract!(b, LANE as u32); } /// Store multiple single-element structures from one, two, three, or four registers @@ -13309,7 +13309,7 @@ pub unsafe fn vst1_lane_u64(a: *mut u64, b: uint64x1_t) { #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vst1q_lane_u8(a: *mut u8, b: uint8x16_t) { static_assert_uimm_bits!(LANE, 4); - *a = simd_extract(b, LANE as u32); + *a = simd_extract!(b, LANE as u32); } /// Store multiple single-element structures from one, two, three, or four registers @@ -13325,7 +13325,7 @@ pub unsafe fn vst1q_lane_u8(a: *mut u8, b: uint8x16_t) { #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vst1q_lane_u16(a: *mut u16, b: uint16x8_t) { static_assert_uimm_bits!(LANE, 3); - *a = simd_extract(b, LANE as u32); + *a = simd_extract!(b, LANE as u32); } /// Store multiple single-element structures from one, two, three, or four registers @@ -13341,7 +13341,7 @@ pub unsafe fn vst1q_lane_u16(a: *mut u16, b: uint16x8_t) { #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vst1q_lane_u32(a: *mut u32, b: uint32x4_t) { static_assert_uimm_bits!(LANE, 2); - *a = simd_extract(b, LANE as u32); + *a = simd_extract!(b, LANE as u32); } /// Store multiple single-element structures from one, two, three, or four registers @@ -13357,7 +13357,7 @@ pub unsafe fn vst1q_lane_u32(a: *mut u32, b: uint32x4_t) { #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vst1q_lane_u64(a: *mut u64, b: uint64x2_t) { static_assert_uimm_bits!(LANE, 1); - *a = simd_extract(b, LANE as u32); + *a = simd_extract!(b, LANE as u32); } /// Store multiple single-element structures from one, two, three, or four registers @@ -13373,7 +13373,7 @@ pub unsafe fn 
vst1q_lane_u64(a: *mut u64, b: uint64x2_t) { #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vst1_lane_p8(a: *mut p8, b: poly8x8_t) { static_assert_uimm_bits!(LANE, 3); - *a = simd_extract(b, LANE as u32); + *a = simd_extract!(b, LANE as u32); } /// Store multiple single-element structures from one, two, three, or four registers @@ -13389,7 +13389,7 @@ pub unsafe fn vst1_lane_p8(a: *mut p8, b: poly8x8_t) { #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vst1_lane_p16(a: *mut p16, b: poly16x4_t) { static_assert_uimm_bits!(LANE, 2); - *a = simd_extract(b, LANE as u32); + *a = simd_extract!(b, LANE as u32); } /// Store multiple single-element structures from one, two, three, or four registers @@ -13405,7 +13405,7 @@ pub unsafe fn vst1_lane_p16(a: *mut p16, b: poly16x4_t) { #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vst1q_lane_p8(a: *mut p8, b: poly8x16_t) { static_assert_uimm_bits!(LANE, 4); - *a = simd_extract(b, LANE as u32); + *a = simd_extract!(b, LANE as u32); } /// Store multiple single-element structures from one, two, three, or four registers @@ -13421,7 +13421,7 @@ pub unsafe fn vst1q_lane_p8(a: *mut p8, b: poly8x16_t) { #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vst1q_lane_p16(a: *mut p16, b: poly16x8_t) { static_assert_uimm_bits!(LANE, 3); - *a = simd_extract(b, LANE as u32); + *a = simd_extract!(b, LANE as u32); } /// Store multiple single-element structures from one, two, three, or four registers @@ -13437,7 +13437,7 @@ pub unsafe fn vst1q_lane_p16(a: *mut p16, b: poly16x8_t) { #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vst1_lane_p64(a: *mut p64, b: poly64x1_t) { static_assert!(LANE == 0); - *a = simd_extract(b, LANE as u32); + *a = simd_extract!(b, LANE as u32); } /// Store multiple single-element structures from one, two, three, or four registers @@ -13453,7 +13453,7 @@ pub unsafe fn vst1_lane_p64(a: *mut p64, b: poly64x1_t) { #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vst1q_lane_p64(a: *mut p64, b: poly64x2_t) { static_assert_uimm_bits!(LANE, 1); - *a = simd_extract(b, LANE as u32); + *a = simd_extract!(b, LANE as u32); } /// Store multiple single-element structures from one, two, three, or four registers @@ -13469,7 +13469,7 @@ pub unsafe fn vst1q_lane_p64(a: *mut p64, b: poly64x2_t) { #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vst1_lane_f32(a: *mut f32, b: float32x2_t) { static_assert_uimm_bits!(LANE, 1); - *a = simd_extract(b, LANE as u32); + *a = simd_extract!(b, LANE as u32); } /// Store multiple single-element structures from one, two, three, or four registers @@ -13485,7 +13485,7 @@ pub unsafe fn vst1_lane_f32(a: *mut f32, b: float32x2_t) { #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vst1q_lane_f32(a: *mut f32, b: float32x4_t) { static_assert_uimm_bits!(LANE, 2); - *a = simd_extract(b, LANE as u32); + *a = simd_extract!(b, LANE as u32); } /// Store multiple single-element structures from one, two, three, or four registers @@ -21151,7 +21151,7 @@ pub unsafe fn vqdmulhq_n_s32(a: int32x4_t, b: i32) -> int32x4_t 
{ #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vqdmulhq_laneq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t { static_assert_uimm_bits!(LANE, 3); - vqdmulhq_s16(a, vdupq_n_s16(simd_extract(b, LANE as u32))) + vqdmulhq_s16(a, vdupq_n_s16(simd_extract!(b, LANE as u32))) } /// Vector saturating doubling multiply high by scalar @@ -21167,7 +21167,7 @@ pub unsafe fn vqdmulhq_laneq_s16(a: int16x8_t, b: int16x8_t) -> #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vqdmulh_laneq_s16(a: int16x4_t, b: int16x8_t) -> int16x4_t { static_assert_uimm_bits!(LANE, 3); - vqdmulh_s16(a, vdup_n_s16(simd_extract(b, LANE as u32))) + vqdmulh_s16(a, vdup_n_s16(simd_extract!(b, LANE as u32))) } /// Vector saturating doubling multiply high by scalar @@ -21183,7 +21183,7 @@ pub unsafe fn vqdmulh_laneq_s16(a: int16x4_t, b: int16x8_t) -> #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vqdmulhq_laneq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t { static_assert_uimm_bits!(LANE, 2); - vqdmulhq_s32(a, vdupq_n_s32(simd_extract(b, LANE as u32))) + vqdmulhq_s32(a, vdupq_n_s32(simd_extract!(b, LANE as u32))) } /// Vector saturating doubling multiply high by scalar @@ -21199,7 +21199,7 @@ pub unsafe fn vqdmulhq_laneq_s32(a: int32x4_t, b: int32x4_t) -> #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vqdmulh_laneq_s32(a: int32x2_t, b: int32x4_t) -> int32x2_t { static_assert_uimm_bits!(LANE, 2); - vqdmulh_s32(a, vdup_n_s32(simd_extract(b, LANE as u32))) + vqdmulh_s32(a, vdup_n_s32(simd_extract!(b, LANE as u32))) } /// Signed saturating extract narrow @@ -28751,7 +28751,7 @@ pub unsafe fn vrsubhn_u64(a: uint64x2_t, b: uint64x2_t) -> uint32x2_t { #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vset_lane_s8(a: i8, b: int8x8_t) -> int8x8_t { static_assert_uimm_bits!(LANE, 3); - simd_insert(b, LANE as u32, a) + simd_insert!(b, LANE as u32, a) } /// Insert vector element from another vector element @@ -28767,7 +28767,7 @@ pub unsafe fn vset_lane_s8(a: i8, b: int8x8_t) -> int8x8_t { #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vset_lane_s16(a: i16, b: int16x4_t) -> int16x4_t { static_assert_uimm_bits!(LANE, 2); - simd_insert(b, LANE as u32, a) + simd_insert!(b, LANE as u32, a) } /// Insert vector element from another vector element @@ -28783,7 +28783,7 @@ pub unsafe fn vset_lane_s16(a: i16, b: int16x4_t) -> int16x4_t #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vset_lane_s32(a: i32, b: int32x2_t) -> int32x2_t { static_assert_uimm_bits!(LANE, 1); - simd_insert(b, LANE as u32, a) + simd_insert!(b, LANE as u32, a) } /// Insert vector element from another vector element @@ -28799,7 +28799,7 @@ pub unsafe fn vset_lane_s32(a: i32, b: int32x2_t) -> int32x2_t #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vset_lane_s64(a: i64, b: int64x1_t) -> int64x1_t { static_assert!(LANE == 0); - simd_insert(b, LANE as u32, a) + simd_insert!(b, LANE as u32, a) } /// Insert vector element from another vector element @@ -28815,7 +28815,7 @@ pub unsafe fn vset_lane_s64(a: i64, b: int64x1_t) -> int64x1_t 
#[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vset_lane_u8(a: u8, b: uint8x8_t) -> uint8x8_t { static_assert_uimm_bits!(LANE, 3); - simd_insert(b, LANE as u32, a) + simd_insert!(b, LANE as u32, a) } /// Insert vector element from another vector element @@ -28831,7 +28831,7 @@ pub unsafe fn vset_lane_u8(a: u8, b: uint8x8_t) -> uint8x8_t { #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vset_lane_u16(a: u16, b: uint16x4_t) -> uint16x4_t { static_assert_uimm_bits!(LANE, 2); - simd_insert(b, LANE as u32, a) + simd_insert!(b, LANE as u32, a) } /// Insert vector element from another vector element @@ -28847,7 +28847,7 @@ pub unsafe fn vset_lane_u16(a: u16, b: uint16x4_t) -> uint16x4_ #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vset_lane_u32(a: u32, b: uint32x2_t) -> uint32x2_t { static_assert_uimm_bits!(LANE, 1); - simd_insert(b, LANE as u32, a) + simd_insert!(b, LANE as u32, a) } /// Insert vector element from another vector element @@ -28863,7 +28863,7 @@ pub unsafe fn vset_lane_u32(a: u32, b: uint32x2_t) -> uint32x2_ #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vset_lane_u64(a: u64, b: uint64x1_t) -> uint64x1_t { static_assert!(LANE == 0); - simd_insert(b, LANE as u32, a) + simd_insert!(b, LANE as u32, a) } /// Insert vector element from another vector element @@ -28879,7 +28879,7 @@ pub unsafe fn vset_lane_u64(a: u64, b: uint64x1_t) -> uint64x1_ #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vset_lane_p8(a: p8, b: poly8x8_t) -> poly8x8_t { static_assert_uimm_bits!(LANE, 3); - simd_insert(b, LANE as u32, a) + simd_insert!(b, LANE as u32, a) } /// Insert vector element from another vector element @@ -28895,7 +28895,7 @@ pub unsafe fn vset_lane_p8(a: p8, b: poly8x8_t) -> poly8x8_t { #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vset_lane_p16(a: p16, b: poly16x4_t) -> poly16x4_t { static_assert_uimm_bits!(LANE, 2); - simd_insert(b, LANE as u32, a) + simd_insert!(b, LANE as u32, a) } /// Insert vector element from another vector element @@ -28911,7 +28911,7 @@ pub unsafe fn vset_lane_p16(a: p16, b: poly16x4_t) -> poly16x4_ #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vset_lane_p64(a: p64, b: poly64x1_t) -> poly64x1_t { static_assert!(LANE == 0); - simd_insert(b, LANE as u32, a) + simd_insert!(b, LANE as u32, a) } /// Insert vector element from another vector element @@ -28927,7 +28927,7 @@ pub unsafe fn vset_lane_p64(a: p64, b: poly64x1_t) -> poly64x1_ #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vsetq_lane_s8(a: i8, b: int8x16_t) -> int8x16_t { static_assert_uimm_bits!(LANE, 4); - simd_insert(b, LANE as u32, a) + simd_insert!(b, LANE as u32, a) } /// Insert vector element from another vector element @@ -28943,7 +28943,7 @@ pub unsafe fn vsetq_lane_s8(a: i8, b: int8x16_t) -> int8x16_t { #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vsetq_lane_s16(a: i16, b: int16x8_t) -> int16x8_t { static_assert_uimm_bits!(LANE, 3); - simd_insert(b, LANE as u32, a) + simd_insert!(b, LANE as 
u32, a) } /// Insert vector element from another vector element @@ -28959,7 +28959,7 @@ pub unsafe fn vsetq_lane_s16(a: i16, b: int16x8_t) -> int16x8_t #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vsetq_lane_s32(a: i32, b: int32x4_t) -> int32x4_t { static_assert_uimm_bits!(LANE, 2); - simd_insert(b, LANE as u32, a) + simd_insert!(b, LANE as u32, a) } /// Insert vector element from another vector element @@ -28975,7 +28975,7 @@ pub unsafe fn vsetq_lane_s32(a: i32, b: int32x4_t) -> int32x4_t #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vsetq_lane_s64(a: i64, b: int64x2_t) -> int64x2_t { static_assert_uimm_bits!(LANE, 1); - simd_insert(b, LANE as u32, a) + simd_insert!(b, LANE as u32, a) } /// Insert vector element from another vector element @@ -28991,7 +28991,7 @@ pub unsafe fn vsetq_lane_s64(a: i64, b: int64x2_t) -> int64x2_t #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vsetq_lane_u8(a: u8, b: uint8x16_t) -> uint8x16_t { static_assert_uimm_bits!(LANE, 4); - simd_insert(b, LANE as u32, a) + simd_insert!(b, LANE as u32, a) } /// Insert vector element from another vector element @@ -29007,7 +29007,7 @@ pub unsafe fn vsetq_lane_u8(a: u8, b: uint8x16_t) -> uint8x16_t #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vsetq_lane_u16(a: u16, b: uint16x8_t) -> uint16x8_t { static_assert_uimm_bits!(LANE, 3); - simd_insert(b, LANE as u32, a) + simd_insert!(b, LANE as u32, a) } /// Insert vector element from another vector element @@ -29023,7 +29023,7 @@ pub unsafe fn vsetq_lane_u16(a: u16, b: uint16x8_t) -> uint16x8 #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vsetq_lane_u32(a: u32, b: uint32x4_t) -> uint32x4_t { static_assert_uimm_bits!(LANE, 2); - simd_insert(b, LANE as u32, a) + simd_insert!(b, LANE as u32, a) } /// Insert vector element from another vector element @@ -29039,7 +29039,7 @@ pub unsafe fn vsetq_lane_u32(a: u32, b: uint32x4_t) -> uint32x4 #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vsetq_lane_u64(a: u64, b: uint64x2_t) -> uint64x2_t { static_assert_uimm_bits!(LANE, 1); - simd_insert(b, LANE as u32, a) + simd_insert!(b, LANE as u32, a) } /// Insert vector element from another vector element @@ -29055,7 +29055,7 @@ pub unsafe fn vsetq_lane_u64(a: u64, b: uint64x2_t) -> uint64x2 #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vsetq_lane_p8(a: p8, b: poly8x16_t) -> poly8x16_t { static_assert_uimm_bits!(LANE, 4); - simd_insert(b, LANE as u32, a) + simd_insert!(b, LANE as u32, a) } /// Insert vector element from another vector element @@ -29071,7 +29071,7 @@ pub unsafe fn vsetq_lane_p8(a: p8, b: poly8x16_t) -> poly8x16_t #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vsetq_lane_p16(a: p16, b: poly16x8_t) -> poly16x8_t { static_assert_uimm_bits!(LANE, 3); - simd_insert(b, LANE as u32, a) + simd_insert!(b, LANE as u32, a) } /// Insert vector element from another vector element @@ -29087,7 +29087,7 @@ pub unsafe fn vsetq_lane_p16(a: p16, b: poly16x8_t) -> poly16x8 #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", 
issue = "111800"))] pub unsafe fn vsetq_lane_p64(a: p64, b: poly64x2_t) -> poly64x2_t { static_assert_uimm_bits!(LANE, 1); - simd_insert(b, LANE as u32, a) + simd_insert!(b, LANE as u32, a) } /// Insert vector element from another vector element @@ -29103,7 +29103,7 @@ pub unsafe fn vsetq_lane_p64(a: p64, b: poly64x2_t) -> poly64x2 #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vset_lane_f32(a: f32, b: float32x2_t) -> float32x2_t { static_assert_uimm_bits!(LANE, 1); - simd_insert(b, LANE as u32, a) + simd_insert!(b, LANE as u32, a) } /// Insert vector element from another vector element @@ -29119,7 +29119,7 @@ pub unsafe fn vset_lane_f32(a: f32, b: float32x2_t) -> float32x #[cfg_attr(target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800"))] pub unsafe fn vsetq_lane_f32(a: f32, b: float32x4_t) -> float32x4_t { static_assert_uimm_bits!(LANE, 2); - simd_insert(b, LANE as u32, a) + simd_insert!(b, LANE as u32, a) } /// Signed Shift left diff --git a/crates/core_arch/src/arm_shared/neon/mod.rs b/crates/core_arch/src/arm_shared/neon/mod.rs index 2d12f5e99b..12da187067 100644 --- a/crates/core_arch/src/arm_shared/neon/mod.rs +++ b/crates/core_arch/src/arm_shared/neon/mod.rs @@ -1294,7 +1294,7 @@ extern "unadjusted" { )] pub unsafe fn vld1_lane_s8(ptr: *const i8, src: int8x8_t) -> int8x8_t { static_assert_uimm_bits!(LANE, 3); - simd_insert(src, LANE as u32, *ptr) + simd_insert!(src, LANE as u32, *ptr) } /// Load one single-element structure to one lane of one register. @@ -1314,7 +1314,7 @@ pub unsafe fn vld1_lane_s8(ptr: *const i8, src: int8x8_t) -> in )] pub unsafe fn vld1q_lane_s8(ptr: *const i8, src: int8x16_t) -> int8x16_t { static_assert_uimm_bits!(LANE, 4); - simd_insert(src, LANE as u32, *ptr) + simd_insert!(src, LANE as u32, *ptr) } /// Load one single-element structure to one lane of one register. @@ -1334,7 +1334,7 @@ pub unsafe fn vld1q_lane_s8(ptr: *const i8, src: int8x16_t) -> )] pub unsafe fn vld1_lane_s16(ptr: *const i16, src: int16x4_t) -> int16x4_t { static_assert_uimm_bits!(LANE, 2); - simd_insert(src, LANE as u32, *ptr) + simd_insert!(src, LANE as u32, *ptr) } /// Load one single-element structure to one lane of one register. @@ -1354,7 +1354,7 @@ pub unsafe fn vld1_lane_s16(ptr: *const i16, src: int16x4_t) -> )] pub unsafe fn vld1q_lane_s16(ptr: *const i16, src: int16x8_t) -> int16x8_t { static_assert_uimm_bits!(LANE, 3); - simd_insert(src, LANE as u32, *ptr) + simd_insert!(src, LANE as u32, *ptr) } /// Load one single-element structure to one lane of one register. @@ -1374,7 +1374,7 @@ pub unsafe fn vld1q_lane_s16(ptr: *const i16, src: int16x8_t) - )] pub unsafe fn vld1_lane_s32(ptr: *const i32, src: int32x2_t) -> int32x2_t { static_assert_uimm_bits!(LANE, 1); - simd_insert(src, LANE as u32, *ptr) + simd_insert!(src, LANE as u32, *ptr) } /// Load one single-element structure to one lane of one register. @@ -1394,7 +1394,7 @@ pub unsafe fn vld1_lane_s32(ptr: *const i32, src: int32x2_t) -> )] pub unsafe fn vld1q_lane_s32(ptr: *const i32, src: int32x4_t) -> int32x4_t { static_assert_uimm_bits!(LANE, 2); - simd_insert(src, LANE as u32, *ptr) + simd_insert!(src, LANE as u32, *ptr) } /// Load one single-element structure to one lane of one register. 
@@ -1414,7 +1414,7 @@ pub unsafe fn vld1q_lane_s32(ptr: *const i32, src: int32x4_t) - )] pub unsafe fn vld1_lane_s64(ptr: *const i64, src: int64x1_t) -> int64x1_t { static_assert!(LANE == 0); - simd_insert(src, LANE as u32, *ptr) + simd_insert!(src, LANE as u32, *ptr) } /// Load one single-element structure to one lane of one register. @@ -1434,7 +1434,7 @@ pub unsafe fn vld1_lane_s64(ptr: *const i64, src: int64x1_t) -> )] pub unsafe fn vld1q_lane_s64(ptr: *const i64, src: int64x2_t) -> int64x2_t { static_assert_uimm_bits!(LANE, 1); - simd_insert(src, LANE as u32, *ptr) + simd_insert!(src, LANE as u32, *ptr) } /// Load one single-element structure to one lane of one register. @@ -1454,7 +1454,7 @@ pub unsafe fn vld1q_lane_s64(ptr: *const i64, src: int64x2_t) - )] pub unsafe fn vld1_lane_u8(ptr: *const u8, src: uint8x8_t) -> uint8x8_t { static_assert_uimm_bits!(LANE, 3); - simd_insert(src, LANE as u32, *ptr) + simd_insert!(src, LANE as u32, *ptr) } /// Load one single-element structure to one lane of one register. @@ -1474,7 +1474,7 @@ pub unsafe fn vld1_lane_u8(ptr: *const u8, src: uint8x8_t) -> u )] pub unsafe fn vld1q_lane_u8(ptr: *const u8, src: uint8x16_t) -> uint8x16_t { static_assert_uimm_bits!(LANE, 4); - simd_insert(src, LANE as u32, *ptr) + simd_insert!(src, LANE as u32, *ptr) } /// Load one single-element structure to one lane of one register. @@ -1494,7 +1494,7 @@ pub unsafe fn vld1q_lane_u8(ptr: *const u8, src: uint8x16_t) -> )] pub unsafe fn vld1_lane_u16(ptr: *const u16, src: uint16x4_t) -> uint16x4_t { static_assert_uimm_bits!(LANE, 2); - simd_insert(src, LANE as u32, *ptr) + simd_insert!(src, LANE as u32, *ptr) } /// Load one single-element structure to one lane of one register. @@ -1514,7 +1514,7 @@ pub unsafe fn vld1_lane_u16(ptr: *const u16, src: uint16x4_t) - )] pub unsafe fn vld1q_lane_u16(ptr: *const u16, src: uint16x8_t) -> uint16x8_t { static_assert_uimm_bits!(LANE, 3); - simd_insert(src, LANE as u32, *ptr) + simd_insert!(src, LANE as u32, *ptr) } /// Load one single-element structure to one lane of one register. @@ -1534,7 +1534,7 @@ pub unsafe fn vld1q_lane_u16(ptr: *const u16, src: uint16x8_t) )] pub unsafe fn vld1_lane_u32(ptr: *const u32, src: uint32x2_t) -> uint32x2_t { static_assert_uimm_bits!(LANE, 1); - simd_insert(src, LANE as u32, *ptr) + simd_insert!(src, LANE as u32, *ptr) } /// Load one single-element structure to one lane of one register. @@ -1554,7 +1554,7 @@ pub unsafe fn vld1_lane_u32(ptr: *const u32, src: uint32x2_t) - )] pub unsafe fn vld1q_lane_u32(ptr: *const u32, src: uint32x4_t) -> uint32x4_t { static_assert_uimm_bits!(LANE, 2); - simd_insert(src, LANE as u32, *ptr) + simd_insert!(src, LANE as u32, *ptr) } /// Load one single-element structure to one lane of one register. @@ -1574,7 +1574,7 @@ pub unsafe fn vld1q_lane_u32(ptr: *const u32, src: uint32x4_t) )] pub unsafe fn vld1_lane_u64(ptr: *const u64, src: uint64x1_t) -> uint64x1_t { static_assert!(LANE == 0); - simd_insert(src, LANE as u32, *ptr) + simd_insert!(src, LANE as u32, *ptr) } /// Load one single-element structure to one lane of one register. @@ -1594,7 +1594,7 @@ pub unsafe fn vld1_lane_u64(ptr: *const u64, src: uint64x1_t) - )] pub unsafe fn vld1q_lane_u64(ptr: *const u64, src: uint64x2_t) -> uint64x2_t { static_assert_uimm_bits!(LANE, 1); - simd_insert(src, LANE as u32, *ptr) + simd_insert!(src, LANE as u32, *ptr) } /// Load one single-element structure to one lane of one register. 
@@ -1614,7 +1614,7 @@ pub unsafe fn vld1q_lane_u64(ptr: *const u64, src: uint64x2_t) )] pub unsafe fn vld1_lane_p8(ptr: *const p8, src: poly8x8_t) -> poly8x8_t { static_assert_uimm_bits!(LANE, 3); - simd_insert(src, LANE as u32, *ptr) + simd_insert!(src, LANE as u32, *ptr) } /// Load one single-element structure to one lane of one register. @@ -1634,7 +1634,7 @@ pub unsafe fn vld1_lane_p8(ptr: *const p8, src: poly8x8_t) -> p )] pub unsafe fn vld1q_lane_p8(ptr: *const p8, src: poly8x16_t) -> poly8x16_t { static_assert_uimm_bits!(LANE, 4); - simd_insert(src, LANE as u32, *ptr) + simd_insert!(src, LANE as u32, *ptr) } /// Load one single-element structure to one lane of one register. @@ -1654,7 +1654,7 @@ pub unsafe fn vld1q_lane_p8(ptr: *const p8, src: poly8x16_t) -> )] pub unsafe fn vld1_lane_p16(ptr: *const p16, src: poly16x4_t) -> poly16x4_t { static_assert_uimm_bits!(LANE, 2); - simd_insert(src, LANE as u32, *ptr) + simd_insert!(src, LANE as u32, *ptr) } /// Load one single-element structure to one lane of one register. @@ -1674,7 +1674,7 @@ pub unsafe fn vld1_lane_p16(ptr: *const p16, src: poly16x4_t) - )] pub unsafe fn vld1q_lane_p16(ptr: *const p16, src: poly16x8_t) -> poly16x8_t { static_assert_uimm_bits!(LANE, 3); - simd_insert(src, LANE as u32, *ptr) + simd_insert!(src, LANE as u32, *ptr) } /// Load one single-element structure to one lane of one register. @@ -1696,7 +1696,7 @@ pub unsafe fn vld1q_lane_p16(ptr: *const p16, src: poly16x8_t) )] pub unsafe fn vld1_lane_p64(ptr: *const p64, src: poly64x1_t) -> poly64x1_t { static_assert!(LANE == 0); - simd_insert(src, LANE as u32, *ptr) + simd_insert!(src, LANE as u32, *ptr) } /// Load one single-element structure to one lane of one register. @@ -1718,7 +1718,7 @@ pub unsafe fn vld1_lane_p64(ptr: *const p64, src: poly64x1_t) - )] pub unsafe fn vld1q_lane_p64(ptr: *const p64, src: poly64x2_t) -> poly64x2_t { static_assert_uimm_bits!(LANE, 1); - simd_insert(src, LANE as u32, *ptr) + simd_insert!(src, LANE as u32, *ptr) } /// Load one single-element structure to one lane of one register. @@ -1738,7 +1738,7 @@ pub unsafe fn vld1q_lane_p64(ptr: *const p64, src: poly64x2_t) )] pub unsafe fn vld1_lane_f32(ptr: *const f32, src: float32x2_t) -> float32x2_t { static_assert_uimm_bits!(LANE, 1); - simd_insert(src, LANE as u32, *ptr) + simd_insert!(src, LANE as u32, *ptr) } /// Load one single-element structure to one lane of one register. @@ -1758,7 +1758,7 @@ pub unsafe fn vld1_lane_f32(ptr: *const f32, src: float32x2_t) )] pub unsafe fn vld1q_lane_f32(ptr: *const f32, src: float32x4_t) -> float32x4_t { static_assert_uimm_bits!(LANE, 2); - simd_insert(src, LANE as u32, *ptr) + simd_insert!(src, LANE as u32, *ptr) } /// Load one single-element structure and Replicate to all lanes (of one register). 
@@ -5918,7 +5918,7 @@ pub unsafe fn vpmax_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t { )] pub unsafe fn vgetq_lane_u64(v: uint64x2_t) -> u64 { static_assert_uimm_bits!(IMM5, 1); - simd_extract(v, IMM5 as u32) + simd_extract!(v, IMM5 as u32) } /// Move vector element to general-purpose register @@ -5937,7 +5937,7 @@ pub unsafe fn vgetq_lane_u64(v: uint64x2_t) -> u64 { )] pub unsafe fn vget_lane_u64(v: uint64x1_t) -> u64 { static_assert!(IMM5 == 0); - simd_extract(v, 0) + simd_extract!(v, 0) } /// Move vector element to general-purpose register @@ -5956,7 +5956,7 @@ pub unsafe fn vget_lane_u64(v: uint64x1_t) -> u64 { )] pub unsafe fn vget_lane_u16(v: uint16x4_t) -> u16 { static_assert_uimm_bits!(IMM5, 2); - simd_extract(v, IMM5 as u32) + simd_extract!(v, IMM5 as u32) } /// Move vector element to general-purpose register @@ -5975,7 +5975,7 @@ pub unsafe fn vget_lane_u16(v: uint16x4_t) -> u16 { )] pub unsafe fn vget_lane_s16(v: int16x4_t) -> i16 { static_assert_uimm_bits!(IMM5, 2); - simd_extract(v, IMM5 as u32) + simd_extract!(v, IMM5 as u32) } /// Move vector element to general-purpose register @@ -5994,7 +5994,7 @@ pub unsafe fn vget_lane_s16(v: int16x4_t) -> i16 { )] pub unsafe fn vget_lane_p16(v: poly16x4_t) -> p16 { static_assert_uimm_bits!(IMM5, 2); - simd_extract(v, IMM5 as u32) + simd_extract!(v, IMM5 as u32) } /// Move vector element to general-purpose register @@ -6013,7 +6013,7 @@ pub unsafe fn vget_lane_p16(v: poly16x4_t) -> p16 { )] pub unsafe fn vget_lane_u32(v: uint32x2_t) -> u32 { static_assert_uimm_bits!(IMM5, 1); - simd_extract(v, IMM5 as u32) + simd_extract!(v, IMM5 as u32) } /// Move vector element to general-purpose register @@ -6032,7 +6032,7 @@ pub unsafe fn vget_lane_u32(v: uint32x2_t) -> u32 { )] pub unsafe fn vget_lane_s32(v: int32x2_t) -> i32 { static_assert_uimm_bits!(IMM5, 1); - simd_extract(v, IMM5 as u32) + simd_extract!(v, IMM5 as u32) } /// Duplicate vector element to vector or scalar @@ -6051,7 +6051,7 @@ pub unsafe fn vget_lane_s32(v: int32x2_t) -> i32 { )] pub unsafe fn vget_lane_f32(v: float32x2_t) -> f32 { static_assert_uimm_bits!(IMM5, 1); - simd_extract(v, IMM5 as u32) + simd_extract!(v, IMM5 as u32) } /// Duplicate vector element to vector or scalar @@ -6070,7 +6070,7 @@ pub unsafe fn vget_lane_f32(v: float32x2_t) -> f32 { )] pub unsafe fn vgetq_lane_f32(v: float32x4_t) -> f32 { static_assert_uimm_bits!(IMM5, 2); - simd_extract(v, IMM5 as u32) + simd_extract!(v, IMM5 as u32) } /// Move vector element to general-purpose register @@ -6089,7 +6089,7 @@ pub unsafe fn vgetq_lane_f32(v: float32x4_t) -> f32 { )] pub unsafe fn vget_lane_p64(v: poly64x1_t) -> p64 { static_assert!(IMM5 == 0); - simd_extract(v, IMM5 as u32) + simd_extract!(v, IMM5 as u32) } /// Move vector element to general-purpose register @@ -6108,7 +6108,7 @@ pub unsafe fn vget_lane_p64(v: poly64x1_t) -> p64 { )] pub unsafe fn vgetq_lane_p64(v: poly64x2_t) -> p64 { static_assert_uimm_bits!(IMM5, 1); - simd_extract(v, IMM5 as u32) + simd_extract!(v, IMM5 as u32) } /// Move vector element to general-purpose register @@ -6127,7 +6127,7 @@ pub unsafe fn vgetq_lane_p64(v: poly64x2_t) -> p64 { )] pub unsafe fn vget_lane_s64(v: int64x1_t) -> i64 { static_assert!(IMM5 == 0); - simd_extract(v, IMM5 as u32) + simd_extract!(v, IMM5 as u32) } /// Move vector element to general-purpose register @@ -6146,7 +6146,7 @@ pub unsafe fn vget_lane_s64(v: int64x1_t) -> i64 { )] pub unsafe fn vgetq_lane_s64(v: int64x2_t) -> i64 { static_assert_uimm_bits!(IMM5, 1); - simd_extract(v, IMM5 as u32) + 
simd_extract!(v, IMM5 as u32) } /// Move vector element to general-purpose register @@ -6165,7 +6165,7 @@ pub unsafe fn vgetq_lane_s64(v: int64x2_t) -> i64 { )] pub unsafe fn vgetq_lane_u16(v: uint16x8_t) -> u16 { static_assert_uimm_bits!(IMM5, 3); - simd_extract(v, IMM5 as u32) + simd_extract!(v, IMM5 as u32) } /// Move vector element to general-purpose register @@ -6184,7 +6184,7 @@ pub unsafe fn vgetq_lane_u16(v: uint16x8_t) -> u16 { )] pub unsafe fn vgetq_lane_u32(v: uint32x4_t) -> u32 { static_assert_uimm_bits!(IMM5, 2); - simd_extract(v, IMM5 as u32) + simd_extract!(v, IMM5 as u32) } /// Move vector element to general-purpose register @@ -6203,7 +6203,7 @@ pub unsafe fn vgetq_lane_u32(v: uint32x4_t) -> u32 { )] pub unsafe fn vgetq_lane_s16(v: int16x8_t) -> i16 { static_assert_uimm_bits!(IMM5, 3); - simd_extract(v, IMM5 as u32) + simd_extract!(v, IMM5 as u32) } /// Move vector element to general-purpose register @@ -6222,7 +6222,7 @@ pub unsafe fn vgetq_lane_s16(v: int16x8_t) -> i16 { )] pub unsafe fn vgetq_lane_p16(v: poly16x8_t) -> p16 { static_assert_uimm_bits!(IMM5, 3); - simd_extract(v, IMM5 as u32) + simd_extract!(v, IMM5 as u32) } /// Move vector element to general-purpose register @@ -6241,7 +6241,7 @@ pub unsafe fn vgetq_lane_p16(v: poly16x8_t) -> p16 { )] pub unsafe fn vgetq_lane_s32(v: int32x4_t) -> i32 { static_assert_uimm_bits!(IMM5, 2); - simd_extract(v, IMM5 as u32) + simd_extract!(v, IMM5 as u32) } /// Move vector element to general-purpose register @@ -6260,7 +6260,7 @@ pub unsafe fn vgetq_lane_s32(v: int32x4_t) -> i32 { )] pub unsafe fn vget_lane_u8(v: uint8x8_t) -> u8 { static_assert_uimm_bits!(IMM5, 3); - simd_extract(v, IMM5 as u32) + simd_extract!(v, IMM5 as u32) } /// Move vector element to general-purpose register @@ -6279,7 +6279,7 @@ pub unsafe fn vget_lane_u8(v: uint8x8_t) -> u8 { )] pub unsafe fn vget_lane_s8(v: int8x8_t) -> i8 { static_assert_uimm_bits!(IMM5, 3); - simd_extract(v, IMM5 as u32) + simd_extract!(v, IMM5 as u32) } /// Move vector element to general-purpose register @@ -6298,7 +6298,7 @@ pub unsafe fn vget_lane_s8(v: int8x8_t) -> i8 { )] pub unsafe fn vget_lane_p8(v: poly8x8_t) -> p8 { static_assert_uimm_bits!(IMM5, 3); - simd_extract(v, IMM5 as u32) + simd_extract!(v, IMM5 as u32) } /// Move vector element to general-purpose register @@ -6317,7 +6317,7 @@ pub unsafe fn vget_lane_p8(v: poly8x8_t) -> p8 { )] pub unsafe fn vgetq_lane_u8(v: uint8x16_t) -> u8 { static_assert_uimm_bits!(IMM5, 4); - simd_extract(v, IMM5 as u32) + simd_extract!(v, IMM5 as u32) } /// Move vector element to general-purpose register @@ -6336,7 +6336,7 @@ pub unsafe fn vgetq_lane_u8(v: uint8x16_t) -> u8 { )] pub unsafe fn vgetq_lane_s8(v: int8x16_t) -> i8 { static_assert_uimm_bits!(IMM5, 4); - simd_extract(v, IMM5 as u32) + simd_extract!(v, IMM5 as u32) } /// Move vector element to general-purpose register @@ -6355,7 +6355,7 @@ pub unsafe fn vgetq_lane_s8(v: int8x16_t) -> i8 { )] pub unsafe fn vgetq_lane_p8(v: poly8x16_t) -> p8 { static_assert_uimm_bits!(IMM5, 4); - simd_extract(v, IMM5 as u32) + simd_extract!(v, IMM5 as u32) } /// Duplicate vector element to vector or scalar @@ -6427,7 +6427,7 @@ pub unsafe fn vget_high_s32(a: int32x4_t) -> int32x2_t { unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] pub unsafe fn vget_high_s64(a: int64x2_t) -> int64x1_t { - int64x1_t(simd_extract(a, 1)) + int64x1_t(simd_extract!(a, 1)) } /// Duplicate vector element to vector or scalar @@ -6499,7 +6499,7 @@ pub unsafe fn vget_high_u32(a: uint32x4_t) -> uint32x2_t { 
     unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
 )]
 pub unsafe fn vget_high_u64(a: uint64x2_t) -> uint64x1_t {
-    uint64x1_t(simd_extract(a, 1))
+    uint64x1_t(simd_extract!(a, 1))
 }
 
 /// Duplicate vector element to vector or scalar
@@ -6621,7 +6621,7 @@ pub unsafe fn vget_low_s32(a: int32x4_t) -> int32x2_t {
     unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
 )]
 pub unsafe fn vget_low_s64(a: int64x2_t) -> int64x1_t {
-    int64x1_t(simd_extract(a, 0))
+    int64x1_t(simd_extract!(a, 0))
 }
 
 /// Duplicate vector element to vector or scalar
@@ -6689,7 +6689,7 @@ pub unsafe fn vget_low_u32(a: uint32x4_t) -> uint32x2_t {
     unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800")
 )]
 pub unsafe fn vget_low_u64(a: uint64x2_t) -> uint64x1_t {
-    uint64x1_t(simd_extract(a, 0))
+    uint64x1_t(simd_extract!(a, 0))
 }
 
 /// Duplicate vector element to vector or scalar
diff --git a/crates/core_arch/src/macros.rs b/crates/core_arch/src/macros.rs
index 56d922b0fd..4c3bbc9395 100644
--- a/crates/core_arch/src/macros.rs
+++ b/crates/core_arch/src/macros.rs
@@ -76,3 +76,20 @@ macro_rules! simd_shuffle {
         )
     }};
 }
+
+#[allow(unused)]
+macro_rules! simd_insert {
+    ($x:expr, $idx:expr, $val:expr $(,)?) => {{
+        simd_insert($x, const { $idx }, $val)
+    }};
+}
+
+#[allow(unused)]
+macro_rules! simd_extract {
+    ($x:expr, $idx:expr $(,)?) => {{
+        simd_extract($x, const { $idx })
+    }};
+    ($x:expr, $idx:expr, $ty:ty $(,)?) => {{
+        simd_extract::<_, $ty>($x, const { $idx })
+    }};
+}
diff --git a/crates/core_arch/src/wasm32/simd128.rs b/crates/core_arch/src/wasm32/simd128.rs
index 4819195dc6..f376bdbe63 100644
--- a/crates/core_arch/src/wasm32/simd128.rs
+++ b/crates/core_arch/src/wasm32/simd128.rs
@@ -1088,7 +1088,7 @@ pub use i64x2_shuffle as u64x2_shuffle;
 #[stable(feature = "wasm_simd", since = "1.54.0")]
 pub fn i8x16_extract_lane<const N: usize>(a: v128) -> i8 {
     static_assert!(N < 16);
-    unsafe { simd_extract(a.as_i8x16(), N as u32) }
+    unsafe { simd_extract!(a.as_i8x16(), N as u32) }
 }
 
 /// Extracts a lane from a 128-bit vector interpreted as 16 packed u8 numbers.
@@ -1102,7 +1102,7 @@ pub fn i8x16_extract_lane<const N: usize>(a: v128) -> i8 {
 #[stable(feature = "wasm_simd", since = "1.54.0")]
 pub fn u8x16_extract_lane<const N: usize>(a: v128) -> u8 {
     static_assert!(N < 16);
-    unsafe { simd_extract(a.as_u8x16(), N as u32) }
+    unsafe { simd_extract!(a.as_u8x16(), N as u32) }
 }
 
 /// Replaces a lane from a 128-bit vector interpreted as 16 packed i8 numbers.
@@ -1116,7 +1116,7 @@ pub fn u8x16_extract_lane<const N: usize>(a: v128) -> u8 {
 #[stable(feature = "wasm_simd", since = "1.54.0")]
 pub fn i8x16_replace_lane<const N: usize>(a: v128, val: i8) -> v128 {
     static_assert!(N < 16);
-    unsafe { simd_insert(a.as_i8x16(), N as u32, val).v128() }
+    unsafe { simd_insert!(a.as_i8x16(), N as u32, val).v128() }
 }
 
 /// Replaces a lane from a 128-bit vector interpreted as 16 packed u8 numbers.
@@ -1130,7 +1130,7 @@ pub fn i8x16_replace_lane<const N: usize>(a: v128, val: i8) -> v128 {
 #[stable(feature = "wasm_simd", since = "1.54.0")]
 pub fn u8x16_replace_lane<const N: usize>(a: v128, val: u8) -> v128 {
     static_assert!(N < 16);
-    unsafe { simd_insert(a.as_u8x16(), N as u32, val).v128() }
+    unsafe { simd_insert!(a.as_u8x16(), N as u32, val).v128() }
 }
 
 /// Extracts a lane from a 128-bit vector interpreted as 8 packed i16 numbers.
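The two macros added to crates/core_arch/src/macros.rs above are the core of this change: `simd_insert!` and `simd_extract!` forward to the `simd_insert`/`simd_extract` intrinsics but wrap the index argument in an inline `const { ... }` block, so the index is forced to be a compile-time constant expression at every call site instead of travelling as an ordinary runtime value. Below is a minimal sketch of the same pattern outside of stdarch, assuming a toolchain where inline `const { ... }` blocks are available (they were still feature-gated on stable when this patch was written); `take_lane` is a hypothetical stand-in for the intrinsic and is not part of this patch.

// Sketch of the const-block pattern used by `simd_insert!` / `simd_extract!`.
// `take_lane` stands in for an intrinsic that expects a constant index.
fn take_lane(v: [u32; 4], idx: u32) -> u32 {
    v[idx as usize]
}

macro_rules! take_lane {
    ($v:expr, $idx:expr $(,)?) => {{
        // `const { ... }` forces `$idx` to be evaluated at compile time;
        // a non-constant index fails to compile at the call site instead of
        // reaching the callee as a runtime value.
        take_lane($v, const { $idx })
    }};
}

fn main() {
    let v = [10u32, 20, 30, 40];
    // Any constant expression is accepted, e.g. a const generic such as
    // `LANE as u32` in the NEON intrinsics, or a literal computation:
    let x = take_lane!(v, 1 + 2);
    assert_eq!(x, 40);
}

The three-argument form `simd_extract!(x, idx, ty)` seen in the x86 hunks works the same way, additionally pinning the element type via `simd_extract::<_, $ty>`.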
@@ -1144,7 +1144,7 @@ pub fn u8x16_replace_lane(a: v128, val: u8) -> v128 { #[stable(feature = "wasm_simd", since = "1.54.0")] pub fn i16x8_extract_lane(a: v128) -> i16 { static_assert!(N < 8); - unsafe { simd_extract(a.as_i16x8(), N as u32) } + unsafe { simd_extract!(a.as_i16x8(), N as u32) } } /// Extracts a lane from a 128-bit vector interpreted as 8 packed u16 numbers. @@ -1158,7 +1158,7 @@ pub fn i16x8_extract_lane(a: v128) -> i16 { #[stable(feature = "wasm_simd", since = "1.54.0")] pub fn u16x8_extract_lane(a: v128) -> u16 { static_assert!(N < 8); - unsafe { simd_extract(a.as_u16x8(), N as u32) } + unsafe { simd_extract!(a.as_u16x8(), N as u32) } } /// Replaces a lane from a 128-bit vector interpreted as 8 packed i16 numbers. @@ -1172,7 +1172,7 @@ pub fn u16x8_extract_lane(a: v128) -> u16 { #[stable(feature = "wasm_simd", since = "1.54.0")] pub fn i16x8_replace_lane(a: v128, val: i16) -> v128 { static_assert!(N < 8); - unsafe { simd_insert(a.as_i16x8(), N as u32, val).v128() } + unsafe { simd_insert!(a.as_i16x8(), N as u32, val).v128() } } /// Replaces a lane from a 128-bit vector interpreted as 8 packed u16 numbers. @@ -1186,7 +1186,7 @@ pub fn i16x8_replace_lane(a: v128, val: i16) -> v128 { #[stable(feature = "wasm_simd", since = "1.54.0")] pub fn u16x8_replace_lane(a: v128, val: u16) -> v128 { static_assert!(N < 8); - unsafe { simd_insert(a.as_u16x8(), N as u32, val).v128() } + unsafe { simd_insert!(a.as_u16x8(), N as u32, val).v128() } } /// Extracts a lane from a 128-bit vector interpreted as 4 packed i32 numbers. @@ -1200,7 +1200,7 @@ pub fn u16x8_replace_lane(a: v128, val: u16) -> v128 { #[stable(feature = "wasm_simd", since = "1.54.0")] pub fn i32x4_extract_lane(a: v128) -> i32 { static_assert!(N < 4); - unsafe { simd_extract(a.as_i32x4(), N as u32) } + unsafe { simd_extract!(a.as_i32x4(), N as u32) } } /// Extracts a lane from a 128-bit vector interpreted as 4 packed u32 numbers. @@ -1226,7 +1226,7 @@ pub fn u32x4_extract_lane(a: v128) -> u32 { #[stable(feature = "wasm_simd", since = "1.54.0")] pub fn i32x4_replace_lane(a: v128, val: i32) -> v128 { static_assert!(N < 4); - unsafe { simd_insert(a.as_i32x4(), N as u32, val).v128() } + unsafe { simd_insert!(a.as_i32x4(), N as u32, val).v128() } } /// Replaces a lane from a 128-bit vector interpreted as 4 packed u32 numbers. @@ -1252,7 +1252,7 @@ pub fn u32x4_replace_lane(a: v128, val: u32) -> v128 { #[stable(feature = "wasm_simd", since = "1.54.0")] pub fn i64x2_extract_lane(a: v128) -> i64 { static_assert!(N < 2); - unsafe { simd_extract(a.as_i64x2(), N as u32) } + unsafe { simd_extract!(a.as_i64x2(), N as u32) } } /// Extracts a lane from a 128-bit vector interpreted as 2 packed u64 numbers. @@ -1278,7 +1278,7 @@ pub fn u64x2_extract_lane(a: v128) -> u64 { #[stable(feature = "wasm_simd", since = "1.54.0")] pub fn i64x2_replace_lane(a: v128, val: i64) -> v128 { static_assert!(N < 2); - unsafe { simd_insert(a.as_i64x2(), N as u32, val).v128() } + unsafe { simd_insert!(a.as_i64x2(), N as u32, val).v128() } } /// Replaces a lane from a 128-bit vector interpreted as 2 packed u64 numbers. @@ -1304,7 +1304,7 @@ pub fn u64x2_replace_lane(a: v128, val: u64) -> v128 { #[stable(feature = "wasm_simd", since = "1.54.0")] pub fn f32x4_extract_lane(a: v128) -> f32 { static_assert!(N < 4); - unsafe { simd_extract(a.as_f32x4(), N as u32) } + unsafe { simd_extract!(a.as_f32x4(), N as u32) } } /// Replaces a lane from a 128-bit vector interpreted as 4 packed f32 numbers. 
@@ -1318,7 +1318,7 @@ pub fn f32x4_extract_lane(a: v128) -> f32 { #[stable(feature = "wasm_simd", since = "1.54.0")] pub fn f32x4_replace_lane(a: v128, val: f32) -> v128 { static_assert!(N < 4); - unsafe { simd_insert(a.as_f32x4(), N as u32, val).v128() } + unsafe { simd_insert!(a.as_f32x4(), N as u32, val).v128() } } /// Extracts a lane from a 128-bit vector interpreted as 2 packed f64 numbers. @@ -1332,7 +1332,7 @@ pub fn f32x4_replace_lane(a: v128, val: f32) -> v128 { #[stable(feature = "wasm_simd", since = "1.54.0")] pub fn f64x2_extract_lane(a: v128) -> f64 { static_assert!(N < 2); - unsafe { simd_extract(a.as_f64x2(), N as u32) } + unsafe { simd_extract!(a.as_f64x2(), N as u32) } } /// Replaces a lane from a 128-bit vector interpreted as 2 packed f64 numbers. @@ -1346,7 +1346,7 @@ pub fn f64x2_extract_lane(a: v128) -> f64 { #[stable(feature = "wasm_simd", since = "1.54.0")] pub fn f64x2_replace_lane(a: v128, val: f64) -> v128 { static_assert!(N < 2); - unsafe { simd_insert(a.as_f64x2(), N as u32, val).v128() } + unsafe { simd_insert!(a.as_f64x2(), N as u32, val).v128() } } /// Returns a new vector with lanes selected from the lanes of the first input diff --git a/crates/core_arch/src/x86/avx.rs b/crates/core_arch/src/x86/avx.rs index 9a05ef620e..72eb43a5c1 100644 --- a/crates/core_arch/src/x86/avx.rs +++ b/crates/core_arch/src/x86/avx.rs @@ -1329,7 +1329,7 @@ pub unsafe fn _mm256_insertf128_si256(a: __m256i, b: __m128i) - #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_insert_epi8(a: __m256i, i: i8) -> __m256i { static_assert_uimm_bits!(INDEX, 5); - transmute(simd_insert(a.as_i8x32(), INDEX as u32, i)) + transmute(simd_insert!(a.as_i8x32(), INDEX as u32, i)) } /// Copies `a` to result, and inserts the 16-bit integer `i` into result @@ -1343,7 +1343,7 @@ pub unsafe fn _mm256_insert_epi8(a: __m256i, i: i8) -> __m256i #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_insert_epi16(a: __m256i, i: i16) -> __m256i { static_assert_uimm_bits!(INDEX, 4); - transmute(simd_insert(a.as_i16x16(), INDEX as u32, i)) + transmute(simd_insert!(a.as_i16x16(), INDEX as u32, i)) } /// Copies `a` to result, and inserts the 32-bit integer `i` into result @@ -1357,7 +1357,7 @@ pub unsafe fn _mm256_insert_epi16(a: __m256i, i: i16) -> __m25 #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_insert_epi32(a: __m256i, i: i32) -> __m256i { static_assert_uimm_bits!(INDEX, 3); - transmute(simd_insert(a.as_i32x8(), INDEX as u32, i)) + transmute(simd_insert!(a.as_i32x8(), INDEX as u32, i)) } /// Loads 256-bits (composed of 4 packed double-precision (64-bit) @@ -2914,7 +2914,7 @@ pub unsafe fn _mm256_storeu2_m128i(hiaddr: *mut __m128i, loaddr: *mut __m128i, a //#[cfg_attr(test, assert_instr(movss))] FIXME #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_cvtss_f32(a: __m256) -> f32 { - simd_extract(a, 0) + simd_extract!(a, 0) } // LLVM intrinsics used in the above functions diff --git a/crates/core_arch/src/x86/avx2.rs b/crates/core_arch/src/x86/avx2.rs index c4a117424d..1f7a0b771a 100644 --- a/crates/core_arch/src/x86/avx2.rs +++ b/crates/core_arch/src/x86/avx2.rs @@ -3586,7 +3586,7 @@ pub unsafe fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i { #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_extract_epi8(a: __m256i) -> i32 { static_assert_uimm_bits!(INDEX, 5); - simd_extract::<_, u8>(a.as_u8x32(), INDEX as u32) as i32 + simd_extract!(a.as_u8x32(), INDEX as u32, u8) as i32 } /// Extracts a 16-bit integer 
from `a`, selected with `INDEX`. Returns a 32-bit @@ -3602,7 +3602,7 @@ pub unsafe fn _mm256_extract_epi8(a: __m256i) -> i32 { #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_extract_epi16(a: __m256i) -> i32 { static_assert_uimm_bits!(INDEX, 4); - simd_extract::<_, u16>(a.as_u16x16(), INDEX as u32) as i32 + simd_extract!(a.as_u16x16(), INDEX as u32, u16) as i32 } /// Extracts a 32-bit integer from `a`, selected with `INDEX`. @@ -3615,7 +3615,7 @@ pub unsafe fn _mm256_extract_epi16(a: __m256i) -> i32 { #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_extract_epi32(a: __m256i) -> i32 { static_assert_uimm_bits!(INDEX, 3); - simd_extract(a.as_i32x8(), INDEX as u32) + simd_extract!(a.as_i32x8(), INDEX as u32) } /// Returns the first element of the input vector of `[4 x double]`. @@ -3626,7 +3626,7 @@ pub unsafe fn _mm256_extract_epi32(a: __m256i) -> i32 { //#[cfg_attr(test, assert_instr(movsd))] FIXME #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_cvtsd_f64(a: __m256d) -> f64 { - simd_extract(a, 0) + simd_extract!(a, 0) } /// Returns the first element of the input vector of `[8 x i32]`. @@ -3636,7 +3636,7 @@ pub unsafe fn _mm256_cvtsd_f64(a: __m256d) -> f64 { #[target_feature(enable = "avx2")] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_cvtsi256_si32(a: __m256i) -> i32 { - simd_extract(a.as_i32x8(), 0) + simd_extract!(a.as_i32x8(), 0) } #[allow(improper_ctypes)] diff --git a/crates/core_arch/src/x86/avx512f.rs b/crates/core_arch/src/x86/avx512f.rs index af95505547..3def4a39d6 100644 --- a/crates/core_arch/src/x86/avx512f.rs +++ b/crates/core_arch/src/x86/avx512f.rs @@ -25556,7 +25556,7 @@ pub unsafe fn _mm512_castsi512_pd(a: __m512i) -> __m512d { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(all(test, not(target_os = "windows")), assert_instr(vmovd))] pub unsafe fn _mm512_cvtsi512_si32(a: __m512i) -> i32 { - let extract: i32 = simd_extract(a.as_i32x16(), 0); + let extract: i32 = simd_extract!(a.as_i32x16(), 0); extract } @@ -34622,12 +34622,12 @@ pub unsafe fn _mm512_set_pd( #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovss))] pub unsafe fn _mm_mask_move_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - let extractsrc: f32 = simd_extract(src, 0); + let extractsrc: f32 = simd_extract!(src, 0); let mut mov: f32 = extractsrc; if (k & 0b00000001) != 0 { - mov = simd_extract(b, 0); + mov = simd_extract!(b, 0); } - simd_insert(a, 0, mov) + simd_insert!(a, 0, mov) } /// Move the lower single-precision (32-bit) floating-point element from b to the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -34640,9 +34640,9 @@ pub unsafe fn _mm_mask_move_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) - pub unsafe fn _mm_maskz_move_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { let mut mov: f32 = 0.; if (k & 0b00000001) != 0 { - mov = simd_extract(b, 0); + mov = simd_extract!(b, 0); } - simd_insert(a, 0, mov) + simd_insert!(a, 0, mov) } /// Move the lower double-precision (64-bit) floating-point element from b to the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
@@ -34653,12 +34653,12 @@ pub unsafe fn _mm_maskz_move_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmovsd))] pub unsafe fn _mm_mask_move_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let extractsrc: f64 = simd_extract(src, 0); + let extractsrc: f64 = simd_extract!(src, 0); let mut mov: f64 = extractsrc; if (k & 0b00000001) != 0 { - mov = simd_extract(b, 0); + mov = simd_extract!(b, 0); } - simd_insert(a, 0, mov) + simd_insert!(a, 0, mov) } /// Move the lower double-precision (64-bit) floating-point element from b to the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -34671,9 +34671,9 @@ pub unsafe fn _mm_mask_move_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d pub unsafe fn _mm_maskz_move_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { let mut mov: f64 = 0.; if (k & 0b00000001) != 0 { - mov = simd_extract(b, 0); + mov = simd_extract!(b, 0); } - simd_insert(a, 0, mov) + simd_insert!(a, 0, mov) } /// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -34684,14 +34684,14 @@ pub unsafe fn _mm_maskz_move_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vaddss))] pub unsafe fn _mm_mask_add_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - let extractsrc: f32 = simd_extract(src, 0); + let extractsrc: f32 = simd_extract!(src, 0); let mut add: f32 = extractsrc; if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); - let extractb: f32 = simd_extract(b, 0); + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); add = extracta + extractb; } - simd_insert(a, 0, add) + simd_insert!(a, 0, add) } /// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -34704,11 +34704,11 @@ pub unsafe fn _mm_mask_add_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> pub unsafe fn _mm_maskz_add_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { let mut add: f32 = 0.; if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); - let extractb: f32 = simd_extract(b, 0); + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); add = extracta + extractb; } - simd_insert(a, 0, add) + simd_insert!(a, 0, add) } /// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
@@ -34719,14 +34719,14 @@ pub unsafe fn _mm_maskz_add_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vaddsd))] pub unsafe fn _mm_mask_add_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let extractsrc: f64 = simd_extract(src, 0); + let extractsrc: f64 = simd_extract!(src, 0); let mut add: f64 = extractsrc; if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); - let extractb: f64 = simd_extract(b, 0); + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); add = extracta + extractb; } - simd_insert(a, 0, add) + simd_insert!(a, 0, add) } /// Add the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -34739,11 +34739,11 @@ pub unsafe fn _mm_mask_add_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) pub unsafe fn _mm_maskz_add_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { let mut add: f64 = 0.; if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); - let extractb: f64 = simd_extract(b, 0); + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); add = extracta + extractb; } - simd_insert(a, 0, add) + simd_insert!(a, 0, add) } /// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -34754,14 +34754,14 @@ pub unsafe fn _mm_maskz_add_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsubss))] pub unsafe fn _mm_mask_sub_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - let extractsrc: f32 = simd_extract(src, 0); + let extractsrc: f32 = simd_extract!(src, 0); let mut add: f32 = extractsrc; if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); - let extractb: f32 = simd_extract(b, 0); + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); add = extracta - extractb; } - simd_insert(a, 0, add) + simd_insert!(a, 0, add) } /// Subtract the lower single-precision (32-bit) floating-point element in b from the lower single-precision (32-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
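The masked scalar helpers in these hunks all share one shape: the low lane is recomputed only when bit 0 of `k` is set, otherwise it falls back to `src` (writemask) or to zero (zeromask), and the upper lanes of `a` are carried through unchanged. A rough standalone model of just the low-lane behaviour, with plain `f32` values standing in for `__m128` and hypothetical helper names:

fn mask_add_lane(src: f32, k: u8, a: f32, b: f32) -> f32 {
    if k & 1 != 0 { a + b } else { src } // writemask: keep the value from `src`
}

fn maskz_add_lane(k: u8, a: f32, b: f32) -> f32 {
    if k & 1 != 0 { a + b } else { 0.0 } // zeromask: zero the lane instead
}

fn main() {
    assert_eq!(mask_add_lane(9.0, 0b0, 1.0, 2.0), 9.0);
    assert_eq!(mask_add_lane(9.0, 0b1, 1.0, 2.0), 3.0);
    assert_eq!(maskz_add_lane(0b0, 1.0, 2.0), 0.0);
}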
@@ -34774,11 +34774,11 @@ pub unsafe fn _mm_mask_sub_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> pub unsafe fn _mm_maskz_sub_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { let mut add: f32 = 0.; if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); - let extractb: f32 = simd_extract(b, 0); + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); add = extracta - extractb; } - simd_insert(a, 0, add) + simd_insert!(a, 0, add) } /// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -34789,14 +34789,14 @@ pub unsafe fn _mm_maskz_sub_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vsubsd))] pub unsafe fn _mm_mask_sub_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let extractsrc: f64 = simd_extract(src, 0); + let extractsrc: f64 = simd_extract!(src, 0); let mut add: f64 = extractsrc; if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); - let extractb: f64 = simd_extract(b, 0); + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); add = extracta - extractb; } - simd_insert(a, 0, add) + simd_insert!(a, 0, add) } /// Subtract the lower double-precision (64-bit) floating-point element in b from the lower double-precision (64-bit) floating-point element in a, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -34809,11 +34809,11 @@ pub unsafe fn _mm_mask_sub_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) pub unsafe fn _mm_maskz_sub_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { let mut add: f64 = 0.; if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); - let extractb: f64 = simd_extract(b, 0); + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); add = extracta - extractb; } - simd_insert(a, 0, add) + simd_insert!(a, 0, add) } /// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
@@ -34824,14 +34824,14 @@ pub unsafe fn _mm_maskz_sub_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmulss))] pub unsafe fn _mm_mask_mul_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - let extractsrc: f32 = simd_extract(src, 0); + let extractsrc: f32 = simd_extract!(src, 0); let mut add: f32 = extractsrc; if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); - let extractb: f32 = simd_extract(b, 0); + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); add = extracta * extractb; } - simd_insert(a, 0, add) + simd_insert!(a, 0, add) } /// Multiply the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -34844,11 +34844,11 @@ pub unsafe fn _mm_mask_mul_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> pub unsafe fn _mm_maskz_mul_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { let mut add: f32 = 0.; if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); - let extractb: f32 = simd_extract(b, 0); + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); add = extracta * extractb; } - simd_insert(a, 0, add) + simd_insert!(a, 0, add) } /// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -34859,14 +34859,14 @@ pub unsafe fn _mm_maskz_mul_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vmulsd))] pub unsafe fn _mm_mask_mul_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let extractsrc: f64 = simd_extract(src, 0); + let extractsrc: f64 = simd_extract!(src, 0); let mut add: f64 = extractsrc; if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); - let extractb: f64 = simd_extract(b, 0); + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); add = extracta * extractb; } - simd_insert(a, 0, add) + simd_insert!(a, 0, add) } /// Multiply the lower double-precision (64-bit) floating-point element in a and b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -34879,11 +34879,11 @@ pub unsafe fn _mm_mask_mul_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) pub unsafe fn _mm_maskz_mul_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { let mut add: f64 = 0.; if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); - let extractb: f64 = simd_extract(b, 0); + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); add = extracta * extractb; } - simd_insert(a, 0, add) + simd_insert!(a, 0, add) } /// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. 
@@ -34894,14 +34894,14 @@ pub unsafe fn _mm_maskz_mul_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vdivss))] pub unsafe fn _mm_mask_div_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> __m128 { - let extractsrc: f32 = simd_extract(src, 0); + let extractsrc: f32 = simd_extract!(src, 0); let mut add: f32 = extractsrc; if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); - let extractb: f32 = simd_extract(b, 0); + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); add = extracta / extractb; } - simd_insert(a, 0, add) + simd_insert!(a, 0, add) } /// Divide the lower single-precision (32-bit) floating-point element in a by the lower single-precision (32-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -34914,11 +34914,11 @@ pub unsafe fn _mm_mask_div_ss(src: __m128, k: __mmask8, a: __m128, b: __m128) -> pub unsafe fn _mm_maskz_div_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { let mut add: f32 = 0.; if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); - let extractb: f32 = simd_extract(b, 0); + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); add = extracta / extractb; } - simd_insert(a, 0, add) + simd_insert!(a, 0, add) } /// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -34929,14 +34929,14 @@ pub unsafe fn _mm_maskz_div_ss(k: __mmask8, a: __m128, b: __m128) -> __m128 { #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vdivsd))] pub unsafe fn _mm_mask_div_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) -> __m128d { - let extractsrc: f64 = simd_extract(src, 0); + let extractsrc: f64 = simd_extract!(src, 0); let mut add: f64 = extractsrc; if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); - let extractb: f64 = simd_extract(b, 0); + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); add = extracta / extractb; } - simd_insert(a, 0, add) + simd_insert!(a, 0, add) } /// Divide the lower double-precision (64-bit) floating-point element in a by the lower double-precision (64-bit) floating-point element in b, store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. 
@@ -34949,11 +34949,11 @@ pub unsafe fn _mm_mask_div_sd(src: __m128d, k: __mmask8, a: __m128d, b: __m128d) pub unsafe fn _mm_maskz_div_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128d { let mut add: f64 = 0.; if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); - let extractb: f64 = simd_extract(b, 0); + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); add = extracta / extractb; } - simd_insert(a, 0, add) + simd_insert!(a, 0, add) } /// Compare the lower single-precision (32-bit) floating-point elements in a and b, store the maximum value in the lower element of dst using writemask k (the element is copied from src when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -35904,13 +35904,13 @@ pub unsafe fn _mm_maskz_scalef_sd(k: __mmask8, a: __m128d, b: __m128d) -> __m128 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd213ss))] pub unsafe fn _mm_mask_fmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { - let mut fmadd: f32 = simd_extract(a, 0); + let mut fmadd: f32 = simd_extract!(a, 0); if (k & 0b00000001) != 0 { - let extractb: f32 = simd_extract(b, 0); - let extractc: f32 = simd_extract(c, 0); + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); fmadd = vfmadd132ss(fmadd, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - simd_insert(a, 0, fmadd) + simd_insert!(a, 0, fmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -35923,12 +35923,12 @@ pub unsafe fn _mm_mask_fmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> pub unsafe fn _mm_maskz_fmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { let mut fmadd: f32 = 0.; if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); - let extractb: f32 = simd_extract(b, 0); - let extractc: f32 = simd_extract(c, 0); + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); fmadd = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - simd_insert(a, 0, fmadd) + simd_insert!(a, 0, fmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst. 
@@ -35939,13 +35939,13 @@ pub unsafe fn _mm_maskz_fmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) - #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd213ss))] pub unsafe fn _mm_mask3_fmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { - let mut fmadd: f32 = simd_extract(c, 0); + let mut fmadd: f32 = simd_extract!(c, 0); if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); - let extractb: f32 = simd_extract(b, 0); + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); fmadd = vfmadd132ss(extracta, extractb, fmadd, _MM_FROUND_CUR_DIRECTION); } - simd_insert(c, 0, fmadd) + simd_insert!(c, 0, fmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -35956,13 +35956,13 @@ pub unsafe fn _mm_mask3_fmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) - #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd213sd))] pub unsafe fn _mm_mask_fmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { - let mut fmadd: f64 = simd_extract(a, 0); + let mut fmadd: f64 = simd_extract!(a, 0); if (k & 0b00000001) != 0 { - let extractb: f64 = simd_extract(b, 0); - let extractc: f64 = simd_extract(c, 0); + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); fmadd = vfmadd132sd(fmadd, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - simd_insert(a, 0, fmadd) + simd_insert!(a, 0, fmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -35975,12 +35975,12 @@ pub unsafe fn _mm_mask_fmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) pub unsafe fn _mm_maskz_fmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { let mut fmadd: f64 = 0.; if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); - let extractb: f64 = simd_extract(b, 0); - let extractc: f64 = simd_extract(c, 0); + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); fmadd = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - simd_insert(a, 0, fmadd) + simd_insert!(a, 0, fmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst. 
@@ -35991,13 +35991,13 @@ pub unsafe fn _mm_maskz_fmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmadd213sd))] pub unsafe fn _mm_mask3_fmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { - let mut fmadd: f64 = simd_extract(c, 0); + let mut fmadd: f64 = simd_extract!(c, 0); if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); - let extractb: f64 = simd_extract(b, 0); + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); fmadd = vfmadd132sd(extracta, extractb, fmadd, _MM_FROUND_CUR_DIRECTION); } - simd_insert(c, 0, fmadd) + simd_insert!(c, 0, fmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. @@ -36008,14 +36008,14 @@ pub unsafe fn _mm_mask3_fmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub213ss))] pub unsafe fn _mm_mask_fmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { - let mut fmsub: f32 = simd_extract(a, 0); + let mut fmsub: f32 = simd_extract!(a, 0); if (k & 0b00000001) != 0 { - let extractb: f32 = simd_extract(b, 0); - let extractc: f32 = simd_extract(c, 0); + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); let extractc = -extractc; fmsub = vfmadd132ss(fmsub, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - simd_insert(a, 0, fmsub) + simd_insert!(a, 0, fmsub) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -36028,13 +36028,13 @@ pub unsafe fn _mm_mask_fmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> pub unsafe fn _mm_maskz_fmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { let mut fmsub: f32 = 0.; if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); - let extractb: f32 = simd_extract(b, 0); - let extractc: f32 = simd_extract(c, 0); + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); let extractc = -extractc; fmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - simd_insert(a, 0, fmsub) + simd_insert!(a, 0, fmsub) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst. 
@@ -36045,14 +36045,14 @@ pub unsafe fn _mm_maskz_fmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) - #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub213ss))] pub unsafe fn _mm_mask3_fmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { - let mut fmsub: f32 = simd_extract(c, 0); + let mut fmsub: f32 = simd_extract!(c, 0); if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); - let extractb: f32 = simd_extract(b, 0); + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); let extractc = -fmsub; fmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - simd_insert(c, 0, fmsub) + simd_insert!(c, 0, fmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -36063,14 +36063,14 @@ pub unsafe fn _mm_mask3_fmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) - #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub213sd))] pub unsafe fn _mm_mask_fmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { - let mut fmsub: f64 = simd_extract(a, 0); + let mut fmsub: f64 = simd_extract!(a, 0); if (k & 0b00000001) != 0 { - let extractb: f64 = simd_extract(b, 0); - let extractc: f64 = simd_extract(c, 0); + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); let extractc = -extractc; fmsub = vfmadd132sd(fmsub, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - simd_insert(a, 0, fmsub) + simd_insert!(a, 0, fmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -36083,13 +36083,13 @@ pub unsafe fn _mm_mask_fmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) pub unsafe fn _mm_maskz_fmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { let mut fmsub: f64 = 0.; if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); - let extractb: f64 = simd_extract(b, 0); - let extractc: f64 = simd_extract(c, 0); + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); let extractc = -extractc; fmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - simd_insert(a, 0, fmsub) + simd_insert!(a, 0, fmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst. 
@@ -36100,14 +36100,14 @@ pub unsafe fn _mm_maskz_fmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfmsub213sd))] pub unsafe fn _mm_mask3_fmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { - let mut fmsub: f64 = simd_extract(c, 0); + let mut fmsub: f64 = simd_extract!(c, 0); if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); - let extractb: f64 = simd_extract(b, 0); + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); let extractc = -fmsub; fmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - simd_insert(c, 0, fmsub) + simd_insert!(c, 0, fmsub) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -36118,14 +36118,14 @@ pub unsafe fn _mm_mask3_fmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd213ss))] pub unsafe fn _mm_mask_fnmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { - let mut fnmadd: f32 = simd_extract(a, 0); + let mut fnmadd: f32 = simd_extract!(a, 0); if (k & 0b00000001) != 0 { let extracta = -fnmadd; - let extractb: f32 = simd_extract(b, 0); - let extractc: f32 = simd_extract(c, 0); + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); fnmadd = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - simd_insert(a, 0, fnmadd) + simd_insert!(a, 0, fnmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -36138,13 +36138,13 @@ pub unsafe fn _mm_mask_fnmadd_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) - pub unsafe fn _mm_maskz_fnmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { let mut fnmadd: f32 = 0.; if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); + let extracta: f32 = simd_extract!(a, 0); let extracta = -extracta; - let extractb: f32 = simd_extract(b, 0); - let extractc: f32 = simd_extract(c, 0); + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); fnmadd = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - simd_insert(a, 0, fnmadd) + simd_insert!(a, 0, fnmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst. 
@@ -36155,14 +36155,14 @@ pub unsafe fn _mm_maskz_fnmadd_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd213ss))] pub unsafe fn _mm_mask3_fnmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { - let mut fnmadd: f32 = simd_extract(c, 0); + let mut fnmadd: f32 = simd_extract!(c, 0); if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); + let extracta: f32 = simd_extract!(a, 0); let extracta = -extracta; - let extractb: f32 = simd_extract(b, 0); + let extractb: f32 = simd_extract!(b, 0); fnmadd = vfmadd132ss(extracta, extractb, fnmadd, _MM_FROUND_CUR_DIRECTION); } - simd_insert(c, 0, fnmadd) + simd_insert!(c, 0, fnmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -36173,14 +36173,14 @@ pub unsafe fn _mm_mask3_fnmadd_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd213sd))] pub unsafe fn _mm_mask_fnmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { - let mut fnmadd: f64 = simd_extract(a, 0); + let mut fnmadd: f64 = simd_extract!(a, 0); if (k & 0b00000001) != 0 { let extracta = -fnmadd; - let extractb: f64 = simd_extract(b, 0); - let extractc: f64 = simd_extract(c, 0); + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); fnmadd = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - simd_insert(a, 0, fnmadd) + simd_insert!(a, 0, fnmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -36193,13 +36193,13 @@ pub unsafe fn _mm_mask_fnmadd_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d pub unsafe fn _mm_maskz_fnmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { let mut fnmadd: f64 = 0.; if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); + let extracta: f64 = simd_extract!(a, 0); let extracta = -extracta; - let extractb: f64 = simd_extract(b, 0); - let extractc: f64 = simd_extract(c, 0); + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); fnmadd = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - simd_insert(a, 0, fnmadd) + simd_insert!(a, 0, fnmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst. 
@@ -36210,14 +36210,14 @@ pub unsafe fn _mm_maskz_fnmadd_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmadd213sd))] pub unsafe fn _mm_mask3_fnmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { - let mut fnmadd: f64 = simd_extract(c, 0); + let mut fnmadd: f64 = simd_extract!(c, 0); if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); + let extracta: f64 = simd_extract!(a, 0); let extracta = -extracta; - let extractb: f64 = simd_extract(b, 0); + let extractb: f64 = simd_extract!(b, 0); fnmadd = vfmadd132sd(extracta, extractb, fnmadd, _MM_FROUND_CUR_DIRECTION); } - simd_insert(c, 0, fnmadd) + simd_insert!(c, 0, fnmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -36228,15 +36228,15 @@ pub unsafe fn _mm_mask3_fnmadd_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub213ss))] pub unsafe fn _mm_mask_fnmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) -> __m128 { - let mut fnmsub: f32 = simd_extract(a, 0); + let mut fnmsub: f32 = simd_extract!(a, 0); if (k & 0b00000001) != 0 { let extracta = -fnmsub; - let extractb: f32 = simd_extract(b, 0); - let extractc: f32 = simd_extract(c, 0); + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); let extractc = -extractc; fnmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - simd_insert(a, 0, fnmsub) + simd_insert!(a, 0, fnmsub) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst. @@ -36249,14 +36249,14 @@ pub unsafe fn _mm_mask_fnmsub_ss(a: __m128, k: __mmask8, b: __m128, c: __m128) - pub unsafe fn _mm_maskz_fnmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) -> __m128 { let mut fnmsub: f32 = 0.; if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); + let extracta: f32 = simd_extract!(a, 0); let extracta = -extracta; - let extractb: f32 = simd_extract(b, 0); - let extractc: f32 = simd_extract(c, 0); + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); let extractc = -extractc; fnmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - simd_insert(a, 0, fnmsub) + simd_insert!(a, 0, fnmsub) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst. 
@@ -36267,15 +36267,15 @@ pub unsafe fn _mm_maskz_fnmsub_ss(k: __mmask8, a: __m128, b: __m128, c: __m128) #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub213ss))] pub unsafe fn _mm_mask3_fnmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) -> __m128 { - let mut fnmsub: f32 = simd_extract(c, 0); + let mut fnmsub: f32 = simd_extract!(c, 0); if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); + let extracta: f32 = simd_extract!(a, 0); let extracta = -extracta; - let extractb: f32 = simd_extract(b, 0); + let extractb: f32 = simd_extract!(b, 0); let extractc = -fnmsub; fnmsub = vfmadd132ss(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - simd_insert(c, 0, fnmsub) + simd_insert!(c, 0, fnmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -36286,15 +36286,15 @@ pub unsafe fn _mm_mask3_fnmsub_ss(a: __m128, b: __m128, c: __m128, k: __mmask8) #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub213sd))] pub unsafe fn _mm_mask_fnmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d) -> __m128d { - let mut fnmsub: f64 = simd_extract(a, 0); + let mut fnmsub: f64 = simd_extract!(a, 0); if (k & 0b00000001) != 0 { let extracta = -fnmsub; - let extractb: f64 = simd_extract(b, 0); - let extractc: f64 = simd_extract(c, 0); + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); let extractc = -extractc; fnmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - simd_insert(a, 0, fnmsub) + simd_insert!(a, 0, fnmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst. @@ -36307,14 +36307,14 @@ pub unsafe fn _mm_mask_fnmsub_sd(a: __m128d, k: __mmask8, b: __m128d, c: __m128d pub unsafe fn _mm_maskz_fnmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128d) -> __m128d { let mut fnmsub: f64 = 0.; if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); + let extracta: f64 = simd_extract!(a, 0); let extracta = -extracta; - let extractb: f64 = simd_extract(b, 0); - let extractc: f64 = simd_extract(c, 0); + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); let extractc = -extractc; fnmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - simd_insert(a, 0, fnmsub) + simd_insert!(a, 0, fnmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst. 
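The fmsub/fnmadd/fnmsub hunks above express every variant through vfmadd132ss / vfmadd132sd by negating operands before the call. The identities behind that, checked here with a plain (non-fused) multiply-add as a stand-in, ignoring rounding-mode details:

// fmadd(a, b, c)  =  a*b + c            (the only primitive actually invoked)
// fmsub(a, b, c)  =  a*b - c          = fmadd(a, b, -c)
// fnmadd(a, b, c) = -(a*b) + c        = fmadd(-a, b, c)
// fnmsub(a, b, c) = -(a*b) - c        = fmadd(-a, b, -c)
fn fmadd(a: f64, b: f64, c: f64) -> f64 {
    a * b + c
}

fn main() {
    let (a, b, c) = (2.0, 3.0, 5.0);
    assert_eq!(fmadd(a, b, -c), a * b - c);     // fmsub
    assert_eq!(fmadd(-a, b, c), -(a * b) + c);  // fnmadd
    assert_eq!(fmadd(-a, b, -c), -(a * b) - c); // fnmsub
}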
@@ -36325,15 +36325,15 @@ pub unsafe fn _mm_maskz_fnmsub_sd(k: __mmask8, a: __m128d, b: __m128d, c: __m128 #[unstable(feature = "stdarch_x86_avx512", issue = "111137")] #[cfg_attr(test, assert_instr(vfnmsub213sd))] pub unsafe fn _mm_mask3_fnmsub_sd(a: __m128d, b: __m128d, c: __m128d, k: __mmask8) -> __m128d { - let mut fnmsub: f64 = simd_extract(c, 0); + let mut fnmsub: f64 = simd_extract!(c, 0); if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); + let extracta: f64 = simd_extract!(a, 0); let extracta = -extracta; - let extractb: f64 = simd_extract(b, 0); + let extractb: f64 = simd_extract!(b, 0); let extractc = -fnmsub; fnmsub = vfmadd132sd(extracta, extractb, extractc, _MM_FROUND_CUR_DIRECTION); } - simd_insert(c, 0, fnmsub) + simd_insert!(c, 0, fnmsub) } /// Add the lower single-precision (32-bit) floating-point element in a and b, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -38090,11 +38090,11 @@ pub unsafe fn _mm_maskz_scalef_round_sd( #[rustc_legacy_const_generics(3)] pub unsafe fn _mm_fmadd_round_ss(a: __m128, b: __m128, c: __m128) -> __m128 { static_assert_rounding!(ROUNDING); - let extracta: f32 = simd_extract(a, 0); - let extractb: f32 = simd_extract(b, 0); - let extractc: f32 = simd_extract(c, 0); + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); let r = vfmadd132ss(extracta, extractb, extractc, ROUNDING); - simd_insert(a, 0, r) + simd_insert!(a, 0, r) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -38119,13 +38119,13 @@ pub unsafe fn _mm_mask_fmadd_round_ss( c: __m128, ) -> __m128 { static_assert_rounding!(ROUNDING); - let mut fmadd: f32 = simd_extract(a, 0); + let mut fmadd: f32 = simd_extract!(a, 0); if (k & 0b00000001) != 0 { - let extractb: f32 = simd_extract(b, 0); - let extractc: f32 = simd_extract(c, 0); + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); fmadd = vfmadd132ss(fmadd, extractb, extractc, ROUNDING); } - simd_insert(a, 0, fmadd) + simd_insert!(a, 0, fmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -38152,12 +38152,12 @@ pub unsafe fn _mm_maskz_fmadd_round_ss( static_assert_rounding!(ROUNDING); let mut fmadd: f32 = 0.; if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); - let extractb: f32 = simd_extract(b, 0); - let extractc: f32 = simd_extract(c, 0); + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); fmadd = vfmadd132ss(extracta, extractb, extractc, ROUNDING); } - simd_insert(a, 0, fmadd) + simd_insert!(a, 0, fmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. 
Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\ @@ -38182,13 +38182,13 @@ pub unsafe fn _mm_mask3_fmadd_round_ss( k: __mmask8, ) -> __m128 { static_assert_rounding!(ROUNDING); - let mut fmadd: f32 = simd_extract(c, 0); + let mut fmadd: f32 = simd_extract!(c, 0); if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); - let extractb: f32 = simd_extract(b, 0); + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); fmadd = vfmadd132ss(extracta, extractb, fmadd, ROUNDING); } - simd_insert(c, 0, fmadd) + simd_insert!(c, 0, fmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ @@ -38212,11 +38212,11 @@ pub unsafe fn _mm_fmadd_round_sd( c: __m128d, ) -> __m128d { static_assert_rounding!(ROUNDING); - let extracta: f64 = simd_extract(a, 0); - let extractb: f64 = simd_extract(b, 0); - let extractc: f64 = simd_extract(c, 0); + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); let fmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING); - simd_insert(a, 0, fmadd) + simd_insert!(a, 0, fmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -38241,13 +38241,13 @@ pub unsafe fn _mm_mask_fmadd_round_sd( c: __m128d, ) -> __m128d { static_assert_rounding!(ROUNDING); - let mut fmadd: f64 = simd_extract(a, 0); + let mut fmadd: f64 = simd_extract!(a, 0); if (k & 0b00000001) != 0 { - let extractb: f64 = simd_extract(b, 0); - let extractc: f64 = simd_extract(c, 0); + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); fmadd = vfmadd132sd(fmadd, extractb, extractc, ROUNDING); } - simd_insert(a, 0, fmadd) + simd_insert!(a, 0, fmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -38274,12 +38274,12 @@ pub unsafe fn _mm_maskz_fmadd_round_sd( static_assert_rounding!(ROUNDING); let mut fmadd: f64 = 0.; if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); - let extractb: f64 = simd_extract(b, 0); - let extractc: f64 = simd_extract(c, 0); + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); fmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING); } - simd_insert(a, 0, fmadd) + simd_insert!(a, 0, fmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the intermediate result to the lower element in c. 
Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\ @@ -38304,13 +38304,13 @@ pub unsafe fn _mm_mask3_fmadd_round_sd( k: __mmask8, ) -> __m128d { static_assert_rounding!(ROUNDING); - let mut fmadd: f64 = simd_extract(c, 0); + let mut fmadd: f64 = simd_extract!(c, 0); if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); - let extractb: f64 = simd_extract(b, 0); + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); fmadd = vfmadd132sd(extracta, extractb, fmadd, ROUNDING); } - simd_insert(c, 0, fmadd) + simd_insert!(c, 0, fmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -38330,12 +38330,12 @@ pub unsafe fn _mm_mask3_fmadd_round_sd( #[rustc_legacy_const_generics(3)] pub unsafe fn _mm_fmsub_round_ss(a: __m128, b: __m128, c: __m128) -> __m128 { static_assert_rounding!(ROUNDING); - let extracta: f32 = simd_extract(a, 0); - let extractb: f32 = simd_extract(b, 0); - let extractc: f32 = simd_extract(c, 0); + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); let extractc = -extractc; let fmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); - simd_insert(a, 0, fmsub) + simd_insert!(a, 0, fmsub) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -38360,14 +38360,14 @@ pub unsafe fn _mm_mask_fmsub_round_ss( c: __m128, ) -> __m128 { static_assert_rounding!(ROUNDING); - let mut fmsub: f32 = simd_extract(a, 0); + let mut fmsub: f32 = simd_extract!(a, 0); if (k & 0b00000001) != 0 { - let extractb: f32 = simd_extract(b, 0); - let extractc: f32 = simd_extract(c, 0); + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); let extractc = -extractc; fmsub = vfmadd132ss(fmsub, extractb, extractc, ROUNDING); } - simd_insert(a, 0, fmsub) + simd_insert!(a, 0, fmsub) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -38394,13 +38394,13 @@ pub unsafe fn _mm_maskz_fmsub_round_ss( static_assert_rounding!(ROUNDING); let mut fmsub: f32 = 0.; if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); - let extractb: f32 = simd_extract(b, 0); - let extractc: f32 = simd_extract(c, 0); + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); let extractc = -extractc; fmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); } - simd_insert(a, 0, fmsub) + simd_insert!(a, 0, fmsub) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. 
Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\ @@ -38425,14 +38425,14 @@ pub unsafe fn _mm_mask3_fmsub_round_ss( k: __mmask8, ) -> __m128 { static_assert_rounding!(ROUNDING); - let mut fmsub: f32 = simd_extract(c, 0); + let mut fmsub: f32 = simd_extract!(c, 0); if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); - let extractb: f32 = simd_extract(b, 0); + let extracta: f32 = simd_extract!(a, 0); + let extractb: f32 = simd_extract!(b, 0); let extractc = -fmsub; fmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); } - simd_insert(c, 0, fmsub) + simd_insert!(c, 0, fmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ @@ -38456,12 +38456,12 @@ pub unsafe fn _mm_fmsub_round_sd( c: __m128d, ) -> __m128d { static_assert_rounding!(ROUNDING); - let extracta: f64 = simd_extract(a, 0); - let extractb: f64 = simd_extract(b, 0); - let extractc: f64 = simd_extract(c, 0); + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); let extractc = -extractc; let fmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); - simd_insert(a, 0, fmsub) + simd_insert!(a, 0, fmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -38486,14 +38486,14 @@ pub unsafe fn _mm_mask_fmsub_round_sd( c: __m128d, ) -> __m128d { static_assert_rounding!(ROUNDING); - let mut fmsub: f64 = simd_extract(a, 0); + let mut fmsub: f64 = simd_extract!(a, 0); if (k & 0b00000001) != 0 { - let extractb: f64 = simd_extract(b, 0); - let extractc: f64 = simd_extract(c, 0); + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); let extractc = -extractc; fmsub = vfmadd132sd(fmsub, extractb, extractc, ROUNDING); } - simd_insert(a, 0, fmsub) + simd_insert!(a, 0, fmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -38520,13 +38520,13 @@ pub unsafe fn _mm_maskz_fmsub_round_sd( static_assert_rounding!(ROUNDING); let mut fmsub: f64 = 0.; if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); - let extractb: f64 = simd_extract(b, 0); - let extractc: f64 = simd_extract(c, 0); + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); let extractc = -extractc; fmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); } - simd_insert(a, 0, fmsub) + simd_insert!(a, 0, fmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the intermediate result. 
Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\ @@ -38551,14 +38551,14 @@ pub unsafe fn _mm_mask3_fmsub_round_sd( k: __mmask8, ) -> __m128d { static_assert_rounding!(ROUNDING); - let mut fmsub: f64 = simd_extract(c, 0); + let mut fmsub: f64 = simd_extract!(c, 0); if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); - let extractb: f64 = simd_extract(b, 0); + let extracta: f64 = simd_extract!(a, 0); + let extractb: f64 = simd_extract!(b, 0); let extractc = -fmsub; fmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); } - simd_insert(c, 0, fmsub) + simd_insert!(c, 0, fmsub) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -38578,12 +38578,12 @@ pub unsafe fn _mm_mask3_fmsub_round_sd( #[rustc_legacy_const_generics(3)] pub unsafe fn _mm_fnmadd_round_ss(a: __m128, b: __m128, c: __m128) -> __m128 { static_assert_rounding!(ROUNDING); - let extracta: f32 = simd_extract(a, 0); + let extracta: f32 = simd_extract!(a, 0); let extracta = -extracta; - let extractb: f32 = simd_extract(b, 0); - let extractc: f32 = simd_extract(c, 0); + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); let fnmadd = vfmadd132ss(extracta, extractb, extractc, ROUNDING); - simd_insert(a, 0, fnmadd) + simd_insert!(a, 0, fnmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -38608,14 +38608,14 @@ pub unsafe fn _mm_mask_fnmadd_round_ss( c: __m128, ) -> __m128 { static_assert_rounding!(ROUNDING); - let mut fnmadd: f32 = simd_extract(a, 0); + let mut fnmadd: f32 = simd_extract!(a, 0); if (k & 0b00000001) != 0 { let extracta = -fnmadd; - let extractb: f32 = simd_extract(b, 0); - let extractc: f32 = simd_extract(c, 0); + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); fnmadd = vfmadd132ss(extracta, extractb, extractc, ROUNDING); } - simd_insert(a, 0, fnmadd) + simd_insert!(a, 0, fnmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. 
Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -38642,13 +38642,13 @@ pub unsafe fn _mm_maskz_fnmadd_round_ss( static_assert_rounding!(ROUNDING); let mut fnmadd: f32 = 0.; if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); + let extracta: f32 = simd_extract!(a, 0); let extracta = -extracta; - let extractb: f32 = simd_extract(b, 0); - let extractc: f32 = simd_extract(c, 0); + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); fnmadd = vfmadd132ss(extracta, extractb, extractc, ROUNDING); } - simd_insert(a, 0, fnmadd) + simd_insert!(a, 0, fnmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\ @@ -38673,14 +38673,14 @@ pub unsafe fn _mm_mask3_fnmadd_round_ss( k: __mmask8, ) -> __m128 { static_assert_rounding!(ROUNDING); - let mut fnmadd: f32 = simd_extract(c, 0); + let mut fnmadd: f32 = simd_extract!(c, 0); if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); + let extracta: f32 = simd_extract!(a, 0); let extracta = -extracta; - let extractb: f32 = simd_extract(b, 0); + let extractb: f32 = simd_extract!(b, 0); fnmadd = vfmadd132ss(extracta, extractb, fnmadd, ROUNDING); } - simd_insert(c, 0, fnmadd) + simd_insert!(c, 0, fnmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ @@ -38704,12 +38704,12 @@ pub unsafe fn _mm_fnmadd_round_sd( c: __m128d, ) -> __m128d { static_assert_rounding!(ROUNDING); - let extracta: f64 = simd_extract(a, 0); + let extracta: f64 = simd_extract!(a, 0); let extracta = -extracta; - let extractb: f64 = simd_extract(b, 0); - let extractc: f64 = simd_extract(c, 0); + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); let fnmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING); - simd_insert(a, 0, fnmadd) + simd_insert!(a, 0, fnmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from a when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -38734,14 +38734,14 @@ pub unsafe fn _mm_mask_fnmadd_round_sd( c: __m128d, ) -> __m128d { static_assert_rounding!(ROUNDING); - let mut fnmadd: f64 = simd_extract(a, 0); + let mut fnmadd: f64 = simd_extract!(a, 0); if (k & 0b00000001) != 0 { let extracta = -fnmadd; - let extractb: f64 = simd_extract(b, 0); - let extractc: f64 = simd_extract(c, 0); + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); fnmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING); } - simd_insert(a, 0, fnmadd) + simd_insert!(a, 0, fnmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. 
Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -38768,13 +38768,13 @@ pub unsafe fn _mm_maskz_fnmadd_round_sd( static_assert_rounding!(ROUNDING); let mut fnmadd: f64 = 0.; if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); + let extracta: f64 = simd_extract!(a, 0); let extracta = -extracta; - let extractb: f64 = simd_extract(b, 0); - let extractc: f64 = simd_extract(c, 0); + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); fnmadd = vfmadd132sd(extracta, extractb, extractc, ROUNDING); } - simd_insert(a, 0, fnmadd) + simd_insert!(a, 0, fnmadd) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and add the negated intermediate result to the lower element in c. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\ @@ -38799,14 +38799,14 @@ pub unsafe fn _mm_mask3_fnmadd_round_sd( k: __mmask8, ) -> __m128d { static_assert_rounding!(ROUNDING); - let mut fnmadd: f64 = simd_extract(c, 0); + let mut fnmadd: f64 = simd_extract!(c, 0); if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); + let extracta: f64 = simd_extract!(a, 0); let extracta = -extracta; - let extractb: f64 = simd_extract(b, 0); + let extractb: f64 = simd_extract!(b, 0); fnmadd = vfmadd132sd(extracta, extractb, fnmadd, ROUNDING); } - simd_insert(c, 0, fnmadd) + simd_insert!(c, 0, fnmadd) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, subtract the lower element in c from the negated intermediate result, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -38826,13 +38826,13 @@ pub unsafe fn _mm_mask3_fnmadd_round_sd( #[rustc_legacy_const_generics(3)] pub unsafe fn _mm_fnmsub_round_ss(a: __m128, b: __m128, c: __m128) -> __m128 { static_assert_rounding!(ROUNDING); - let extracta: f32 = simd_extract(a, 0); + let extracta: f32 = simd_extract!(a, 0); let extracta = -extracta; - let extractb: f32 = simd_extract(b, 0); - let extractc: f32 = simd_extract(c, 0); + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); let extractc = -extractc; let fnmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); - simd_insert(a, 0, fnmsub) + simd_insert!(a, 0, fnmsub) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. 
Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -38857,15 +38857,15 @@ pub unsafe fn _mm_mask_fnmsub_round_ss( c: __m128, ) -> __m128 { static_assert_rounding!(ROUNDING); - let mut fnmsub: f32 = simd_extract(a, 0); + let mut fnmsub: f32 = simd_extract!(a, 0); if (k & 0b00000001) != 0 { let extracta = -fnmsub; - let extractb: f32 = simd_extract(b, 0); - let extractc: f32 = simd_extract(c, 0); + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); let extractc = -extractc; fnmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); } - simd_insert(a, 0, fnmsub) + simd_insert!(a, 0, fnmsub) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper 3 packed elements from a to the upper elements of dst.\ @@ -38892,14 +38892,14 @@ pub unsafe fn _mm_maskz_fnmsub_round_ss( static_assert_rounding!(ROUNDING); let mut fnmsub: f32 = 0.; if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); + let extracta: f32 = simd_extract!(a, 0); let extracta = -extracta; - let extractb: f32 = simd_extract(b, 0); - let extractc: f32 = simd_extract(c, 0); + let extractb: f32 = simd_extract!(b, 0); + let extractc: f32 = simd_extract!(c, 0); let extractc = -extractc; fnmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); } - simd_insert(a, 0, fnmsub) + simd_insert!(a, 0, fnmsub) } /// Multiply the lower single-precision (32-bit) floating-point elements in a and b, subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper 3 packed elements from c to the upper elements of dst.\ @@ -38924,15 +38924,15 @@ pub unsafe fn _mm_mask3_fnmsub_round_ss( k: __mmask8, ) -> __m128 { static_assert_rounding!(ROUNDING); - let mut fnmsub: f32 = simd_extract(c, 0); + let mut fnmsub: f32 = simd_extract!(c, 0); if (k & 0b00000001) != 0 { - let extracta: f32 = simd_extract(a, 0); + let extracta: f32 = simd_extract!(a, 0); let extracta = -extracta; - let extractb: f32 = simd_extract(b, 0); + let extractb: f32 = simd_extract!(b, 0); let extractc = -fnmsub; fnmsub = vfmadd132ss(extracta, extractb, extractc, ROUNDING); } - simd_insert(c, 0, fnmsub) + simd_insert!(c, 0, fnmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. 
Store the result in the lower element of dst, and copy the upper element from a to the upper element of dst.\ @@ -38956,13 +38956,13 @@ pub unsafe fn _mm_fnmsub_round_sd( c: __m128d, ) -> __m128d { static_assert_rounding!(ROUNDING); - let extracta: f64 = simd_extract(a, 0); + let extracta: f64 = simd_extract!(a, 0); let extracta = -extracta; - let extractb: f64 = simd_extract(b, 0); - let extractc: f64 = simd_extract(c, 0); + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); let extractc = -extractc; let fnmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); - simd_insert(a, 0, fnmsub) + simd_insert!(a, 0, fnmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -38987,15 +38987,15 @@ pub unsafe fn _mm_mask_fnmsub_round_sd( c: __m128d, ) -> __m128d { static_assert_rounding!(ROUNDING); - let mut fnmsub: f64 = simd_extract(a, 0); + let mut fnmsub: f64 = simd_extract!(a, 0); if (k & 0b00000001) != 0 { let extracta = -fnmsub; - let extractb: f64 = simd_extract(b, 0); - let extractc: f64 = simd_extract(c, 0); + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); let extractc = -extractc; fnmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); } - simd_insert(a, 0, fnmsub) + simd_insert!(a, 0, fnmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. Store the result in dst using zeromask k (the element is zeroed out when mask bit 0 is not set), and copy the upper element from a to the upper element of dst.\ @@ -39022,14 +39022,14 @@ pub unsafe fn _mm_maskz_fnmsub_round_sd( static_assert_rounding!(ROUNDING); let mut fnmsub: f64 = 0.; if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); + let extracta: f64 = simd_extract!(a, 0); let extracta = -extracta; - let extractb: f64 = simd_extract(b, 0); - let extractc: f64 = simd_extract(c, 0); + let extractb: f64 = simd_extract!(b, 0); + let extractc: f64 = simd_extract!(c, 0); let extractc = -extractc; fnmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); } - simd_insert(a, 0, fnmsub) + simd_insert!(a, 0, fnmsub) } /// Multiply the lower double-precision (64-bit) floating-point elements in a and b, and subtract the lower element in c from the negated intermediate result. 
Store the result in the lower element of dst using writemask k (the element is copied from c when mask bit 0 is not set), and copy the upper element from c to the upper element of dst.\ @@ -39054,15 +39054,15 @@ pub unsafe fn _mm_mask3_fnmsub_round_sd( k: __mmask8, ) -> __m128d { static_assert_rounding!(ROUNDING); - let mut fnmsub: f64 = simd_extract(c, 0); + let mut fnmsub: f64 = simd_extract!(c, 0); if (k & 0b00000001) != 0 { - let extracta: f64 = simd_extract(a, 0); + let extracta: f64 = simd_extract!(a, 0); let extracta = -extracta; - let extractb: f64 = simd_extract(b, 0); + let extractb: f64 = simd_extract!(b, 0); let extractc = -fnmsub; fnmsub = vfmadd132sd(extracta, extractb, extractc, ROUNDING); } - simd_insert(c, 0, fnmsub) + simd_insert!(c, 0, fnmsub) } /// Fix up the lower single-precision (32-bit) floating-point elements in a and b using the lower 32-bit integer in c, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. imm8 is used to set the required flags reporting. @@ -39079,8 +39079,8 @@ pub unsafe fn _mm_fixupimm_ss(a: __m128, b: __m128, c: __m128i) let b = b.as_f32x4(); let c = c.as_i32x4(); let r = vfixupimmss(a, b, c, IMM8, 0b11111111, _MM_FROUND_CUR_DIRECTION); - let fixupimm: f32 = simd_extract(r, 0); - let r = simd_insert(a, 0, fixupimm); + let fixupimm: f32 = simd_extract!(r, 0); + let r = simd_insert!(a, 0, fixupimm); transmute(r) } @@ -39103,8 +39103,8 @@ pub unsafe fn _mm_mask_fixupimm_ss( let b = b.as_f32x4(); let c = c.as_i32x4(); let fixupimm = vfixupimmss(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION); - let fixupimm: f32 = simd_extract(fixupimm, 0); - let r = simd_insert(a, 0, fixupimm); + let fixupimm: f32 = simd_extract!(fixupimm, 0); + let r = simd_insert!(a, 0, fixupimm); transmute(r) } @@ -39127,8 +39127,8 @@ pub unsafe fn _mm_maskz_fixupimm_ss( let b = b.as_f32x4(); let c = c.as_i32x4(); let fixupimm = vfixupimmssz(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION); - let fixupimm: f32 = simd_extract(fixupimm, 0); - let r = simd_insert(a, 0, fixupimm); + let fixupimm: f32 = simd_extract!(fixupimm, 0); + let r = simd_insert!(a, 0, fixupimm); transmute(r) } @@ -39146,8 +39146,8 @@ pub unsafe fn _mm_fixupimm_sd(a: __m128d, b: __m128d, c: __m128 let b = b.as_f64x2(); let c = c.as_i64x2(); let fixupimm = vfixupimmsd(a, b, c, IMM8, 0b11111111, _MM_FROUND_CUR_DIRECTION); - let fixupimm: f64 = simd_extract(fixupimm, 0); - let r = simd_insert(a, 0, fixupimm); + let fixupimm: f64 = simd_extract!(fixupimm, 0); + let r = simd_insert!(a, 0, fixupimm); transmute(r) } @@ -39170,8 +39170,8 @@ pub unsafe fn _mm_mask_fixupimm_sd( let b = b.as_f64x2(); let c = c.as_i64x2(); let fixupimm = vfixupimmsd(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION); - let fixupimm: f64 = simd_extract(fixupimm, 0); - let r = simd_insert(a, 0, fixupimm); + let fixupimm: f64 = simd_extract!(fixupimm, 0); + let r = simd_insert!(a, 0, fixupimm); transmute(r) } @@ -39194,8 +39194,8 @@ pub unsafe fn _mm_maskz_fixupimm_sd( let b = b.as_f64x2(); let c = c.as_i64x2(); let fixupimm = vfixupimmsdz(a, b, c, IMM8, k, _MM_FROUND_CUR_DIRECTION); - let fixupimm: f64 = simd_extract(fixupimm, 0); - let r = simd_insert(a, 0, fixupimm); + let fixupimm: f64 = simd_extract!(fixupimm, 0); + let r = simd_insert!(a, 0, fixupimm); transmute(r) } @@ -39219,8 +39219,8 @@ pub unsafe fn _mm_fixupimm_round_ss( let b = b.as_f32x4(); let c = c.as_i32x4(); let r = vfixupimmss(a, b, c, IMM8, 0b11111111, SAE); - let fixupimm: f32 = simd_extract(r, 0); - let r = 
simd_insert(a, 0, fixupimm); + let fixupimm: f32 = simd_extract!(r, 0); + let r = simd_insert!(a, 0, fixupimm); transmute(r) } @@ -39245,8 +39245,8 @@ pub unsafe fn _mm_mask_fixupimm_round_ss( let b = b.as_f32x4(); let c = c.as_i32x4(); let r = vfixupimmss(a, b, c, IMM8, k, SAE); - let fixupimm: f32 = simd_extract(r, 0); - let r = simd_insert(a, 0, fixupimm); + let fixupimm: f32 = simd_extract!(r, 0); + let r = simd_insert!(a, 0, fixupimm); transmute(r) } @@ -39271,8 +39271,8 @@ pub unsafe fn _mm_maskz_fixupimm_round_ss( let b = b.as_f32x4(); let c = c.as_i32x4(); let r = vfixupimmssz(a, b, c, IMM8, k, SAE); - let fixupimm: f32 = simd_extract(r, 0); - let r = simd_insert(a, 0, fixupimm); + let fixupimm: f32 = simd_extract!(r, 0); + let r = simd_insert!(a, 0, fixupimm); transmute(r) } @@ -39296,8 +39296,8 @@ pub unsafe fn _mm_fixupimm_round_sd( let b = b.as_f64x2(); let c = c.as_i64x2(); let r = vfixupimmsd(a, b, c, IMM8, 0b11111111, SAE); - let fixupimm: f64 = simd_extract(r, 0); - let r = simd_insert(a, 0, fixupimm); + let fixupimm: f64 = simd_extract!(r, 0); + let r = simd_insert!(a, 0, fixupimm); transmute(r) } @@ -39322,8 +39322,8 @@ pub unsafe fn _mm_mask_fixupimm_round_sd( let b = b.as_f64x2(); let c = c.as_i64x2(); let r = vfixupimmsd(a, b, c, IMM8, k, SAE); - let fixupimm: f64 = simd_extract(r, 0); - let r = simd_insert(a, 0, fixupimm); + let fixupimm: f64 = simd_extract!(r, 0); + let r = simd_insert!(a, 0, fixupimm); transmute(r) } @@ -39348,8 +39348,8 @@ pub unsafe fn _mm_maskz_fixupimm_round_sd( let b = b.as_f64x2(); let c = c.as_i64x2(); let r = vfixupimmsdz(a, b, c, IMM8, k, SAE); - let fixupimm: f64 = simd_extract(r, 0); - let r = simd_insert(a, 0, fixupimm); + let fixupimm: f64 = simd_extract!(r, 0); + let r = simd_insert!(a, 0, fixupimm); transmute(r) } @@ -39800,7 +39800,7 @@ pub unsafe fn _mm_cvt_roundu32_ss(a: __m128, b: u32) -> __m #[cfg_attr(test, assert_instr(vcvtsi2ss))] pub unsafe fn _mm_cvti32_ss(a: __m128, b: i32) -> __m128 { let b = b as f32; - simd_insert(a, 0, b) + simd_insert!(a, 0, b) } /// Convert the signed 32-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. @@ -39812,7 +39812,7 @@ pub unsafe fn _mm_cvti32_ss(a: __m128, b: i32) -> __m128 { #[cfg_attr(test, assert_instr(vcvtsi2sd))] pub unsafe fn _mm_cvti32_sd(a: __m128d, b: i32) -> __m128d { let b = b as f64; - simd_insert(a, 0, b) + simd_insert!(a, 0, b) } /// Convert the lower single-precision (32-bit) floating-point element in a to a 32-bit integer with truncation, and store the result in dst.\ @@ -39958,7 +39958,7 @@ pub unsafe fn _mm_cvttsd_u32(a: __m128d) -> u32 { #[cfg_attr(test, assert_instr(vcvtusi2ss))] pub unsafe fn _mm_cvtu32_ss(a: __m128, b: u32) -> __m128 { let b = b as f32; - simd_insert(a, 0, b) + simd_insert!(a, 0, b) } /// Convert the unsigned 32-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. 
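Aside for readers of this patch: the scalar fmadd/fmsub/fnmadd/fnmsub variants rewritten above all share the same lane-0 masking pattern (only the sign handling differs). A minimal stand-alone sketch of that selection logic in plain Rust, illustrative only, with `mul_add` standing in for the `vfmadd132ss` intrinsic and the rounding argument omitted:

// How the writemask (`_mm_mask_*`), zeromask (`_mm_maskz_*`) and `_mm_mask3_*`
// scalar variants pick their lane-0 result when mask bit 0 is clear.
fn fma_lane0_variants(k: u8, a: f32, b: f32, c: f32) -> (f32, f32, f32) {
    // Fused a * b + c, standing in for the vfmadd132ss call in the intrinsics above.
    let computed = a.mul_add(b, c);
    let mask = if k & 1 != 0 { computed } else { a };    // writemask: fall back to a
    let maskz = if k & 1 != 0 { computed } else { 0.0 }; // zeromask: fall back to 0.0
    let mask3 = if k & 1 != 0 { computed } else { c };   // mask3: fall back to c
    (mask, maskz, mask3)
}

fn main() {
    assert_eq!(fma_lane0_variants(0, 2.0, 3.0, 4.0), (2.0, 0.0, 4.0));
    assert_eq!(fma_lane0_variants(1, 2.0, 3.0, 4.0), (10.0, 10.0, 10.0));
}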
@@ -39970,7 +39970,7 @@ pub unsafe fn _mm_cvtu32_ss(a: __m128, b: u32) -> __m128 { #[cfg_attr(test, assert_instr(vcvtusi2sd))] pub unsafe fn _mm_cvtu32_sd(a: __m128d, b: u32) -> __m128d { let b = b as f64; - simd_insert(a, 0, b) + simd_insert!(a, 0, b) } /// Compare the lower single-precision (32-bit) floating-point element in a and b based on the comparison operand specified by imm8, and return the boolean result (0 or 1).\ diff --git a/crates/core_arch/src/x86/sse.rs b/crates/core_arch/src/x86/sse.rs index 17c4c07e94..2ec0ad4c1b 100644 --- a/crates/core_arch/src/x86/sse.rs +++ b/crates/core_arch/src/x86/sse.rs @@ -853,7 +853,7 @@ pub unsafe fn _mm_cvtt_ss2si(a: __m128) -> i32 { // no-op, and on Windows it's just a `mov`. #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtss_f32(a: __m128) -> f32 { - simd_extract(a, 0) + simd_extract!(a, 0) } /// Converts a 32 bit integer to a 32 bit float. The result vector is the input @@ -1224,7 +1224,7 @@ pub unsafe fn _mm_loadu_si64(mem_addr: *const u8) -> __m128i { #[cfg_attr(test, assert_instr(movss))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_store_ss(p: *mut f32, a: __m128) { - *p = simd_extract(a, 0); + *p = simd_extract!(a, 0); } /// Stores the lowest 32 bit float of `a` repeated four times into *aligned* diff --git a/crates/core_arch/src/x86/sse2.rs b/crates/core_arch/src/x86/sse2.rs index afc2aaede1..d1bb92ce6a 100644 --- a/crates/core_arch/src/x86/sse2.rs +++ b/crates/core_arch/src/x86/sse2.rs @@ -955,7 +955,7 @@ pub unsafe fn _mm_cvtepi32_pd(a: __m128i) -> __m128d { #[cfg_attr(test, assert_instr(cvtsi2sd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtsi32_sd(a: __m128d, b: i32) -> __m128d { - simd_insert(a, 0, b as f64) + simd_insert!(a, 0, b as f64) } /// Converts packed 32-bit integers in `a` to packed single-precision (32-bit) @@ -1000,7 +1000,7 @@ pub unsafe fn _mm_cvtsi32_si128(a: i32) -> __m128i { #[target_feature(enable = "sse2")] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtsi128_si32(a: __m128i) -> i32 { - simd_extract(a.as_i32x4(), 0) + simd_extract!(a.as_i32x4(), 0) } /// Sets packed 64-bit integers with the supplied values, from highest to @@ -1399,7 +1399,7 @@ pub unsafe fn _mm_packus_epi16(a: __m128i, b: __m128i) -> __m128i { #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_extract_epi16(a: __m128i) -> i32 { static_assert_uimm_bits!(IMM8, 3); - simd_extract::<_, u16>(a.as_u16x8(), IMM8 as u32) as i32 + simd_extract!(a.as_u16x8(), IMM8 as u32, u16) as i32 } /// Returns a new vector where the `imm8` element of `a` is replaced with `i`. @@ -1412,7 +1412,7 @@ pub unsafe fn _mm_extract_epi16(a: __m128i) -> i32 { #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_insert_epi16(a: __m128i, i: i32) -> __m128i { static_assert_uimm_bits!(IMM8, 3); - transmute(simd_insert(a.as_i16x8(), IMM8 as u32, i as i16)) + transmute(simd_insert!(a.as_i16x8(), IMM8 as u32, i as i16)) } /// Returns a mask of the most significant bit of each element in `a`. 
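The call sites in these hunks use a two-argument form, `simd_extract!(v, idx)`, and a three-argument form with an explicit element type, `simd_extract!(v, idx, ty)`. The shape those call sites imply is roughly the sketch below; this is only an assumption about how such wrappers could look (the index wrapped in an inline const block), not a copy of the definitions the patch itself adds, which are not shown in this excerpt.

// Sketch only: plausible wrapper macros matching the call sites in this diff.
// `simd_extract` / `simd_insert` are the compiler intrinsics already in scope
// in these modules; the macros merely force the index into `const { .. }`
// so it is guaranteed to be a compile-time constant.
macro_rules! simd_extract {
    ($x:expr, $idx:expr $(,)?) => {
        simd_extract($x, const { $idx })
    };
    ($x:expr, $idx:expr, $ty:ty $(,)?) => {
        simd_extract::<_, $ty>($x, const { $idx })
    };
}

macro_rules! simd_insert {
    ($x:expr, $idx:expr, $val:expr $(,)?) => {
        simd_insert($x, const { $idx }, $val)
    };
}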
@@ -1623,7 +1623,7 @@ pub unsafe fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(addsd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_add_sd(a: __m128d, b: __m128d) -> __m128d { - simd_insert(a, 0, _mm_cvtsd_f64(a) + _mm_cvtsd_f64(b)) + simd_insert!(a, 0, _mm_cvtsd_f64(a) + _mm_cvtsd_f64(b)) } /// Adds packed double-precision (64-bit) floating-point elements in `a` and @@ -1647,7 +1647,7 @@ pub unsafe fn _mm_add_pd(a: __m128d, b: __m128d) -> __m128d { #[cfg_attr(test, assert_instr(divsd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_div_sd(a: __m128d, b: __m128d) -> __m128d { - simd_insert(a, 0, _mm_cvtsd_f64(a) / _mm_cvtsd_f64(b)) + simd_insert!(a, 0, _mm_cvtsd_f64(a) / _mm_cvtsd_f64(b)) } /// Divide packed double-precision (64-bit) floating-point elements in `a` by @@ -1719,7 +1719,7 @@ pub unsafe fn _mm_min_pd(a: __m128d, b: __m128d) -> __m128d { #[cfg_attr(test, assert_instr(mulsd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_mul_sd(a: __m128d, b: __m128d) -> __m128d { - simd_insert(a, 0, _mm_cvtsd_f64(a) * _mm_cvtsd_f64(b)) + simd_insert!(a, 0, _mm_cvtsd_f64(a) * _mm_cvtsd_f64(b)) } /// Multiplies packed double-precision (64-bit) floating-point elements in `a` @@ -1743,7 +1743,7 @@ pub unsafe fn _mm_mul_pd(a: __m128d, b: __m128d) -> __m128d { #[cfg_attr(test, assert_instr(sqrtsd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_sqrt_sd(a: __m128d, b: __m128d) -> __m128d { - simd_insert(a, 0, _mm_cvtsd_f64(sqrtsd(b))) + simd_insert!(a, 0, _mm_cvtsd_f64(sqrtsd(b))) } /// Returns a new vector with the square root of each of the values in `a`. @@ -1766,7 +1766,7 @@ pub unsafe fn _mm_sqrt_pd(a: __m128d) -> __m128d { #[cfg_attr(test, assert_instr(subsd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_sub_sd(a: __m128d, b: __m128d) -> __m128d { - simd_insert(a, 0, _mm_cvtsd_f64(a) - _mm_cvtsd_f64(b)) + simd_insert!(a, 0, _mm_cvtsd_f64(a) - _mm_cvtsd_f64(b)) } /// Subtract packed double-precision (64-bit) floating-point elements in `b` @@ -1879,7 +1879,7 @@ pub unsafe fn _mm_cmple_sd(a: __m128d, b: __m128d) -> __m128d { #[cfg_attr(test, assert_instr(cmpltsd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpgt_sd(a: __m128d, b: __m128d) -> __m128d { - simd_insert(_mm_cmplt_sd(b, a), 1, simd_extract::<_, f64>(a, 1)) + simd_insert!(_mm_cmplt_sd(b, a), 1, simd_extract!(a, 1, f64)) } /// Returns a new vector with the low element of `a` replaced by the @@ -1891,7 +1891,7 @@ pub unsafe fn _mm_cmpgt_sd(a: __m128d, b: __m128d) -> __m128d { #[cfg_attr(test, assert_instr(cmplesd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpge_sd(a: __m128d, b: __m128d) -> __m128d { - simd_insert(_mm_cmple_sd(b, a), 1, simd_extract::<_, f64>(a, 1)) + simd_insert!(_mm_cmple_sd(b, a), 1, simd_extract!(a, 1, f64)) } /// Returns a new vector with the low element of `a` replaced by the result @@ -1966,7 +1966,7 @@ pub unsafe fn _mm_cmpnle_sd(a: __m128d, b: __m128d) -> __m128d { #[cfg_attr(test, assert_instr(cmpnltsd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpngt_sd(a: __m128d, b: __m128d) -> __m128d { - simd_insert(_mm_cmpnlt_sd(b, a), 1, simd_extract::<_, f64>(a, 1)) + simd_insert!(_mm_cmpnlt_sd(b, a), 1, simd_extract!(a, 1, f64)) } /// Returns a new vector with the low element of `a` replaced by the @@ -1978,7 +1978,7 @@ pub unsafe fn _mm_cmpngt_sd(a: __m128d, b: __m128d) -> __m128d { 
#[cfg_attr(test, assert_instr(cmpnlesd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cmpnge_sd(a: __m128d, b: __m128d) -> __m128d { - simd_insert(_mm_cmpnle_sd(b, a), 1, simd_extract::<_, f64>(a, 1)) + simd_insert!(_mm_cmpnle_sd(b, a), 1, simd_extract!(a, 1, f64)) } /// Compares corresponding elements in `a` and `b` for equality. @@ -2319,7 +2319,7 @@ pub unsafe fn _mm_cvtsd_ss(a: __m128, b: __m128d) -> __m128 { #[target_feature(enable = "sse2")] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtsd_f64(a: __m128d) -> f64 { - simd_extract(a, 0) + simd_extract!(a, 0) } /// Converts the lower single-precision (32-bit) floating-point element in `b` @@ -2493,7 +2493,7 @@ pub unsafe fn _mm_load_sd(mem_addr: *const f64) -> __m128d { #[cfg_attr(test, assert_instr(movhps))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_loadh_pd(a: __m128d, mem_addr: *const f64) -> __m128d { - _mm_setr_pd(simd_extract(a, 0), *mem_addr) + _mm_setr_pd(simd_extract!(a, 0), *mem_addr) } /// Loads a double-precision value into the low-order bits of a 128-bit @@ -2506,7 +2506,7 @@ pub unsafe fn _mm_loadh_pd(a: __m128d, mem_addr: *const f64) -> __m128d { #[cfg_attr(test, assert_instr(movlps))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_loadl_pd(a: __m128d, mem_addr: *const f64) -> __m128d { - _mm_setr_pd(*mem_addr, simd_extract(a, 1)) + _mm_setr_pd(*mem_addr, simd_extract!(a, 1)) } /// Stores a 128-bit floating point vector of `[2 x double]` to a 128-bit @@ -2533,7 +2533,7 @@ pub unsafe fn _mm_stream_pd(mem_addr: *mut f64, a: __m128d) { #[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movlps))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_store_sd(mem_addr: *mut f64, a: __m128d) { - *mem_addr = simd_extract(a, 0) + *mem_addr = simd_extract!(a, 0) } /// Stores 128-bits (composed of 2 packed double-precision (64-bit) @@ -2615,7 +2615,7 @@ pub unsafe fn _mm_storer_pd(mem_addr: *mut f64, a: __m128d) { #[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movhps))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_storeh_pd(mem_addr: *mut f64, a: __m128d) { - *mem_addr = simd_extract(a, 1); + *mem_addr = simd_extract!(a, 1); } /// Stores the lower 64 bits of a 128-bit vector of `[2 x double]` to a @@ -2627,7 +2627,7 @@ pub unsafe fn _mm_storeh_pd(mem_addr: *mut f64, a: __m128d) { #[cfg_attr(all(test, not(target_os = "windows")), assert_instr(movlps))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_storel_pd(mem_addr: *mut f64, a: __m128d) { - *mem_addr = simd_extract(a, 0); + *mem_addr = simd_extract!(a, 0); } /// Loads a double-precision (64-bit) floating-point element from memory @@ -2713,7 +2713,7 @@ pub unsafe fn _mm_shuffle_pd(a: __m128d, b: __m128d) -> __m128d #[cfg_attr(test, assert_instr(movsd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_move_sd(a: __m128d, b: __m128d) -> __m128d { - _mm_setr_pd(simd_extract(b, 0), simd_extract(a, 1)) + _mm_setr_pd(simd_extract!(b, 0), simd_extract!(a, 1)) } /// Casts a 128-bit floating-point vector of `[2 x double]` into a 128-bit diff --git a/crates/core_arch/src/x86/sse41.rs b/crates/core_arch/src/x86/sse41.rs index af51a53feb..7fc3c79428 100644 --- a/crates/core_arch/src/x86/sse41.rs +++ b/crates/core_arch/src/x86/sse41.rs @@ -201,7 +201,7 @@ pub unsafe fn _mm_blend_ps(a: __m128, b: __m128) -> __m128 { #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_extract_ps(a: 
__m128) -> i32 { static_assert_uimm_bits!(IMM8, 2); - simd_extract::<_, f32>(a, IMM8 as u32).to_bits() as i32 + simd_extract!(a, IMM8 as u32, f32).to_bits() as i32 } /// Extracts an 8-bit integer from `a`, selected with `IMM8`. Returns a 32-bit @@ -217,7 +217,7 @@ pub unsafe fn _mm_extract_ps(a: __m128) -> i32 { #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_extract_epi8(a: __m128i) -> i32 { static_assert_uimm_bits!(IMM8, 4); - simd_extract::<_, u8>(a.as_u8x16(), IMM8 as u32) as i32 + simd_extract!(a.as_u8x16(), IMM8 as u32, u8) as i32 } /// Extracts an 32-bit integer from `a` selected with `IMM8` @@ -233,7 +233,7 @@ pub unsafe fn _mm_extract_epi8(a: __m128i) -> i32 { #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_extract_epi32(a: __m128i) -> i32 { static_assert_uimm_bits!(IMM8, 2); - simd_extract::<_, i32>(a.as_i32x4(), IMM8 as u32) + simd_extract!(a.as_i32x4(), IMM8 as u32, i32) } /// Select a single value in `a` to store at some position in `b`, @@ -281,7 +281,7 @@ pub unsafe fn _mm_insert_ps(a: __m128, b: __m128) -> __m128 { #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_insert_epi8(a: __m128i, i: i32) -> __m128i { static_assert_uimm_bits!(IMM8, 4); - transmute(simd_insert(a.as_i8x16(), IMM8 as u32, i as i8)) + transmute(simd_insert!(a.as_i8x16(), IMM8 as u32, i as i8)) } /// Returns a copy of `a` with the 32-bit integer from `i` inserted at a @@ -295,7 +295,7 @@ pub unsafe fn _mm_insert_epi8(a: __m128i, i: i32) -> __m128i { #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_insert_epi32(a: __m128i, i: i32) -> __m128i { static_assert_uimm_bits!(IMM8, 2); - transmute(simd_insert(a.as_i32x4(), IMM8 as u32, i)) + transmute(simd_insert!(a.as_i32x4(), IMM8 as u32, i)) } /// Compares packed 8-bit integers in `a` and `b` and returns packed maximum diff --git a/crates/core_arch/src/x86_64/avx.rs b/crates/core_arch/src/x86_64/avx.rs index f699f61648..5715097d72 100644 --- a/crates/core_arch/src/x86_64/avx.rs +++ b/crates/core_arch/src/x86_64/avx.rs @@ -29,7 +29,7 @@ use crate::{ #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_insert_epi64(a: __m256i, i: i64) -> __m256i { static_assert_uimm_bits!(INDEX, 2); - transmute(simd_insert(a.as_i64x4(), INDEX as u32, i)) + transmute(simd_insert!(a.as_i64x4(), INDEX as u32, i)) } #[cfg(test)] diff --git a/crates/core_arch/src/x86_64/avx2.rs b/crates/core_arch/src/x86_64/avx2.rs index 3388568eb4..b3b1431e56 100644 --- a/crates/core_arch/src/x86_64/avx2.rs +++ b/crates/core_arch/src/x86_64/avx2.rs @@ -30,7 +30,7 @@ use crate::core_arch::{simd_llvm::*, x86::*}; #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm256_extract_epi64(a: __m256i) -> i64 { static_assert_uimm_bits!(INDEX, 2); - simd_extract(a.as_i64x4(), INDEX as u32) + simd_extract!(a.as_i64x4(), INDEX as u32) } #[cfg(test)] diff --git a/crates/core_arch/src/x86_64/avx512f.rs b/crates/core_arch/src/x86_64/avx512f.rs index d31110d758..fa58a443dc 100644 --- a/crates/core_arch/src/x86_64/avx512f.rs +++ b/crates/core_arch/src/x86_64/avx512f.rs @@ -59,7 +59,7 @@ pub unsafe fn _mm_cvtsd_u64(a: __m128d) -> u64 { #[cfg_attr(test, assert_instr(vcvtsi2ss))] pub unsafe fn _mm_cvti64_ss(a: __m128, b: i64) -> __m128 { let b = b as f32; - simd_insert(a, 0, b) + simd_insert!(a, 0, b) } /// Convert the signed 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. 
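Usage aside (not part of the patch): the lane index of these extract/insert intrinsics is a const generic, so callers must pass a compile-time constant, which is the same property the const-block wrapping enforces on the underlying `simd_extract`/`simd_insert` calls. A small, hypothetical caller, assuming an x86_64 build target:

use std::arch::x86_64::{__m128i, _mm_extract_epi8, _mm_set1_epi8};

// Hypothetical helper; only sound to call after SSE4.1 support has been verified.
#[target_feature(enable = "sse4.1")]
unsafe fn byte3(v: __m128i) -> i32 {
    // The index 3 (zero-based lane) is passed as the IMM8 const generic,
    // so it must be known at compile time.
    _mm_extract_epi8::<3>(v)
}

fn main() {
    if is_x86_feature_detected!("sse4.1") {
        let v = unsafe { _mm_set1_epi8(7) };
        assert_eq!(unsafe { byte3(v) }, 7);
    }
}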
@@ -71,7 +71,7 @@ pub unsafe fn _mm_cvti64_ss(a: __m128, b: i64) -> __m128 { #[cfg_attr(test, assert_instr(vcvtsi2sd))] pub unsafe fn _mm_cvti64_sd(a: __m128d, b: i64) -> __m128d { let b = b as f64; - simd_insert(a, 0, b) + simd_insert!(a, 0, b) } /// Convert the unsigned 64-bit integer b to a single-precision (32-bit) floating-point element, store the result in the lower element of dst, and copy the upper 3 packed elements from a to the upper elements of dst. @@ -83,7 +83,7 @@ pub unsafe fn _mm_cvti64_sd(a: __m128d, b: i64) -> __m128d { #[cfg_attr(test, assert_instr(vcvtusi2ss))] pub unsafe fn _mm_cvtu64_ss(a: __m128, b: u64) -> __m128 { let b = b as f32; - simd_insert(a, 0, b) + simd_insert!(a, 0, b) } /// Convert the unsigned 64-bit integer b to a double-precision (64-bit) floating-point element, store the result in the lower element of dst, and copy the upper element from a to the upper element of dst. @@ -95,7 +95,7 @@ pub unsafe fn _mm_cvtu64_ss(a: __m128, b: u64) -> __m128 { #[cfg_attr(test, assert_instr(vcvtusi2sd))] pub unsafe fn _mm_cvtu64_sd(a: __m128d, b: u64) -> __m128d { let b = b as f64; - simd_insert(a, 0, b) + simd_insert!(a, 0, b) } /// Convert the lower double-precision (64-bit) floating-point element in a to a 64-bit integer with truncation, and store the result in dst. diff --git a/crates/core_arch/src/x86_64/sse2.rs b/crates/core_arch/src/x86_64/sse2.rs index 9619cb7480..f0c7623ac0 100644 --- a/crates/core_arch/src/x86_64/sse2.rs +++ b/crates/core_arch/src/x86_64/sse2.rs @@ -107,7 +107,7 @@ pub unsafe fn _mm_cvtsi64x_si128(a: i64) -> __m128i { #[cfg_attr(all(test, not(windows)), assert_instr(movq))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtsi128_si64(a: __m128i) -> i64 { - simd_extract(a.as_i64x2(), 0) + simd_extract!(a.as_i64x2(), 0) } /// Returns the lowest element of `a`. 
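For orientation, the scalar conversion hunks here all follow one pattern: convert `b`, write it into lane 0, and pass the remaining lane(s) through from `a`. A plain-Rust model of that behaviour, with arrays standing in for the SIMD vector types (illustrative only):

fn cvti64_sd_model(a: [f64; 2], b: i64) -> [f64; 2] {
    // Lane 0 receives the converted integer; lane 1 is copied from `a`.
    [b as f64, a[1]]
}

fn main() {
    assert_eq!(cvti64_sd_model([1.5, 2.5], 3), [3.0, 2.5]);
}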
@@ -130,7 +130,7 @@ pub unsafe fn _mm_cvtsi128_si64x(a: __m128i) -> i64 { #[cfg_attr(test, assert_instr(cvtsi2sd))] #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_cvtsi64_sd(a: __m128d, b: i64) -> __m128d { - simd_insert(a, 0, b as f64) + simd_insert!(a, 0, b as f64) } /// Returns `a` with its lower element replaced by `b` after converting it to diff --git a/crates/core_arch/src/x86_64/sse41.rs b/crates/core_arch/src/x86_64/sse41.rs index d815a69a7e..49c6d95943 100644 --- a/crates/core_arch/src/x86_64/sse41.rs +++ b/crates/core_arch/src/x86_64/sse41.rs @@ -18,7 +18,7 @@ use stdarch_test::assert_instr; #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_extract_epi64(a: __m128i) -> i64 { static_assert_uimm_bits!(IMM1, 1); - simd_extract(a.as_i64x2(), IMM1 as u32) + simd_extract!(a.as_i64x2(), IMM1 as u32) } /// Returns a copy of `a` with the 64-bit integer from `i` inserted at a @@ -32,7 +32,7 @@ pub unsafe fn _mm_extract_epi64(a: __m128i) -> i64 { #[stable(feature = "simd_x86", since = "1.27.0")] pub unsafe fn _mm_insert_epi64(a: __m128i, i: i64) -> __m128i { static_assert_uimm_bits!(IMM1, 1); - transmute(simd_insert(a.as_i64x2(), IMM1 as u32, i)) + transmute(simd_insert!(a.as_i64x2(), IMM1 as u32, i)) } #[cfg(test)] diff --git a/crates/stdarch-gen/neon.spec b/crates/stdarch-gen/neon.spec index 559b8f4473..d6a2212cf2 100644 --- a/crates/stdarch-gen/neon.spec +++ b/crates/stdarch-gen/neon.spec @@ -156,7 +156,7 @@ generate float*_t /// Floating-point absolute difference name = vabd -multi_fn = simd_extract, {vabd-in_ntt-noext, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0 +multi_fn = simd_extract!, {vabd-in_ntt-noext, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0 a = 1.0 b = 9.0 validate 8.0 @@ -341,7 +341,7 @@ generate i64:u64, u64 /// Floating-point compare equal name = vceq -multi_fn = simd_extract, {vceq-in_ntt-noext, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0 +multi_fn = simd_extract!, {vceq-in_ntt-noext, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0 a = 1. b = 2. validate 0 @@ -390,7 +390,7 @@ generate i64:u64, u64 /// Floating-point compare bitwise equal to zero name = vceqz -multi_fn = simd_extract, {vceqz-in_ntt-noext, {vdup_n-in_ntt-noext, a}}, 0 +multi_fn = simd_extract!, {vceqz-in_ntt-noext, {vdup_n-in_ntt-noext, a}}, 0 a = 1. validate 0 @@ -453,7 +453,7 @@ generate i32:u32:i32, i64:u64:i64 /// Signed saturating accumulate of unsigned value name = vuqadd out-suffix -multi_fn = simd_extract, {vuqadd-out_ntt-noext, {vdup_n-out_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0 +multi_fn = simd_extract!, {vuqadd-out_ntt-noext, {vdup_n-out_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0 a = 1 b = 2 validate 3 @@ -530,7 +530,7 @@ generate i64:u64, u64 /// Floating-point compare greater than name = vcgt -multi_fn = simd_extract, {vcgt-in_ntt-noext, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0 +multi_fn = simd_extract!, {vcgt-in_ntt-noext, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0 a = 1. b = 2. validate 0 @@ -592,7 +592,7 @@ generate i64:u64, u64 /// Floating-point compare less than name = vclt -multi_fn = simd_extract, {vclt-in_ntt-noext, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0 +multi_fn = simd_extract!, {vclt-in_ntt-noext, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0 a = 2. b = 1. 
validate 0 @@ -629,7 +629,7 @@ generate i64:u64, u64 /// Floating-point compare greater than or equal name = vcge -multi_fn = simd_extract, {vcge-in_ntt-noext, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0 +multi_fn = simd_extract!, {vcge-in_ntt-noext, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0 a = 1. b = 2. validate 0 @@ -674,7 +674,7 @@ generate i64:u64, u64 /// Floating-point compare less than or equal name = vcle -multi_fn = simd_extract, {vcle-in_ntt-noext, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0 +multi_fn = simd_extract!, {vcle-in_ntt-noext, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0 a = 2. b = 1. validate 0 @@ -756,7 +756,7 @@ generate i64:u64 /// Floating-point compare greater than or equal to zero name = vcgez -multi_fn = simd_extract, {vcgez-in_ntt-noext, {vdup_n-in_ntt-noext, a}}, 0 +multi_fn = simd_extract!, {vcgez-in_ntt-noext, {vdup_n-in_ntt-noext, a}}, 0 a = -1. validate 0 @@ -794,7 +794,7 @@ generate i64:u64 /// Floating-point compare greater than zero name = vcgtz -multi_fn = simd_extract, {vcgtz-in_ntt-noext, {vdup_n-in_ntt-noext, a}}, 0 +multi_fn = simd_extract!, {vcgtz-in_ntt-noext, {vdup_n-in_ntt-noext, a}}, 0 a = -1. validate 0 @@ -832,7 +832,7 @@ generate i64:u64 /// Floating-point compare less than or equal to zero name = vclez -multi_fn = simd_extract, {vclez-in_ntt-noext, {vdup_n-in_ntt-noext, a}}, 0 +multi_fn = simd_extract!, {vclez-in_ntt-noext, {vdup_n-in_ntt-noext, a}}, 0 a = 2. validate 0 @@ -870,7 +870,7 @@ generate i64:u64 /// Floating-point compare less than zero name = vcltz -multi_fn = simd_extract, {vcltz-in_ntt-noext, {vdup_n-in_ntt-noext, a}}, 0 +multi_fn = simd_extract!, {vcltz-in_ntt-noext, {vdup_n-in_ntt-noext, a}}, 0 a = 2. validate 0 @@ -1190,7 +1190,7 @@ generate float64x2_t:float32x2_t /// Floating-point convert to lower precision narrow, rounding to odd name = vcvtx double-suffixes -multi_fn = simd_extract, {vcvtx-_f32_f64-noext, {vdupq_n-in_ntt-noext, a}}, 0 +multi_fn = simd_extract!, {vcvtx-_f32_f64-noext, {vdupq_n-in_ntt-noext, a}}, 0 a = -1.0 validate -1.0 @@ -1513,7 +1513,7 @@ name = vdup lane-suffixes constn = N multi_fn = static_assert_imm-in_exp_len-N -multi_fn = transmute--, {simd_extract, a, N as u32} +multi_fn = transmute--, {simd_extract!, a, N as u32} a = 0, 1 n = HFLEN validate 1 @@ -1529,7 +1529,7 @@ name = vdup lane-suffixes constn = N multi_fn = static_assert_imm-in_exp_len-N -multi_fn = transmute--, {simd_extract, a, N as u32} +multi_fn = transmute--, {simd_extract!, a, N as u32} a = 0., 1. n = HFLEN validate 1. @@ -1542,7 +1542,7 @@ name = vdup lane-suffixes constn = N multi_fn = static_assert_imm-in_exp_len-N -multi_fn = simd_extract, a, N as u32 +multi_fn = simd_extract!, a, N as u32 a = 1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16 n = HFLEN validate 1 @@ -1557,7 +1557,7 @@ name = vdup lane-suffixes constn = N multi_fn = static_assert_imm-in_exp_len-N -multi_fn = simd_extract, a, N as u32 +multi_fn = simd_extract!, a, N as u32 a = 1., 1., 1., 4. n = HFLEN validate 1. 
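The `multi_fn = simd_extract!, ...` lines in this spec file are templates that stdarch-gen expands into intrinsic bodies, which is why the macro rename has to happen here as well as in the generated sources. As an illustration of the shape of the output for the `vcltz` scalar entry above (the function name and omitted attributes are assumptions; the authoritative result is whatever the generator writes into generated.rs):

// Rough shape of one generated scalar intrinsic for the `vcltz` spec entry.
// Attributes (#[inline], assert_instr, stability) are omitted here.
pub unsafe fn vcltzs_f32(a: f32) -> u32 {
    simd_extract!(vcltz_f32(vdup_n_f32(a)), 0)
}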
@@ -2092,7 +2092,7 @@ generate int*_t

 /// Signed saturating negate
 name = vqneg
-multi_fn = simd_extract, {vqneg-in_ntt-noext, {vdup_n-in_ntt-noext, a}}, 0
+multi_fn = simd_extract!, {vqneg-in_ntt-noext, {vdup_n-in_ntt-noext, a}}, 0
 a = 1
 validate -1

@@ -2121,7 +2121,7 @@ generate int*_t, int64x*_t
 name = vqsub
 multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a
 multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b
-multi_fn = simd_extract, {vqsub-in_ntt-noext, a, b}, 0
+multi_fn = simd_extract!, {vqsub-in_ntt-noext, a, b}, 0
 a = 42
 b = 1
 validate 41
@@ -2300,7 +2300,7 @@ generate int*_t, int64x*_t
 name = vqadd
 multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a
 multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b
-multi_fn = simd_extract, {vqadd-in_ntt-noext, a, b}, 0
+multi_fn = simd_extract!, {vqadd-in_ntt-noext, a, b}, 0
 a = 42
 b = 1
 validate 43
@@ -2984,7 +2984,7 @@ generate *const f32:float32x2x4_t:float32x2x4_t, *const f32:float32x4x4_t:float3
 name = vst1
 in1-lane-nox
 multi_fn = static_assert_imm-in_exp_len-LANE
-multi_fn = *a, {simd_extract, b, LANE as u32}
+multi_fn = *a, {simd_extract!, b, LANE as u32}
 constn = LANE
 a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
 n = 0
@@ -3005,7 +3005,7 @@ generate *mut p64:poly64x1_t:void, *mut p64:poly64x2_t:void
 name = vst1
 in1-lane-nox
 multi_fn = static_assert_imm-in_exp_len-LANE
-multi_fn = *a, {simd_extract, b, LANE as u32}
+multi_fn = *a, {simd_extract!, b, LANE as u32}
 constn = LANE
 a = 0., 1., 2., 3., 4., 5., 6., 7., 8.
 n = 0
@@ -3696,7 +3696,7 @@ name = vmul
 lane-suffixes
 constn = LANE
 multi_fn = static_assert_imm-in_exp_len-LANE
-multi_fn = simd_mul, a, {transmute--, {simd_extract, b, LANE as u32}}
+multi_fn = simd_mul, a, {transmute--, {simd_extract!, b, LANE as u32}}
 a = 1., 2., 3., 4.
 b = 2., 0., 0., 0.
 n = 0
@@ -3726,7 +3726,7 @@ generate float32x2_t, float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x2
 name = vmuls_lane
 constn = LANE
 multi_fn = static_assert_imm-in_exp_len-LANE
-multi_fn = simd_extract, b:f32, b, LANE as u32
+multi_fn = simd_extract!, b:f32, b, LANE as u32
 multi_fn = a * b
 a = 1.
 b = 2., 0., 0., 0.
@@ -3739,7 +3739,7 @@ generate f32:float32x2_t:f32, f32:float32x4_t:f32
 name = vmuld_lane
 constn = LANE
 multi_fn = static_assert_imm-in_exp_len-LANE
-multi_fn = simd_extract, b:f64, b, LANE as u32
+multi_fn = simd_extract!, b:f64, b, LANE as u32
 multi_fn = a * b
 a = 1.
 b = 2., 0.
@@ -3845,7 +3845,7 @@ generate poly8x16_t:poly8x16_t:poly16x8_t
 /// Polynomial multiply long
 name = vmull_high
 no-q
-multi_fn = vmull-noqself-noext, {simd_extract, a, 1}, {simd_extract, b, 1}
+multi_fn = vmull-noqself-noext, {simd_extract!, a, 1}, {simd_extract!, b, 1}
 a = 1, 15
 b = 1, 3
 validate 17
@@ -3931,7 +3931,7 @@ name = vmulx
 lane-suffixes
 constn = LANE
 multi_fn = static_assert_imm-in_exp_len-LANE
-multi_fn = vmulx-in0-noext, a, {transmute--, {simd_extract, b, LANE as u32}}
+multi_fn = vmulx-in0-noext, a, {transmute--, {simd_extract!, b, LANE as u32}}
 a = 1.
 b = 2., 0.
 n = 0
@@ -3970,7 +3970,7 @@ name = vmulx
 lane-suffixes
 constn = LANE
 multi_fn = static_assert_imm-in_exp_len-LANE
-multi_fn = vmulx-out-noext, a, {simd_extract, b, LANE as u32}
+multi_fn = vmulx-out-noext, a, {simd_extract!, b, LANE as u32}
 a = 2.
 b = 3., 0., 0., 0.

@@ -4022,7 +4022,7 @@ name = vfma
 in2-lane-suffixes
 constn = LANE
 multi_fn = static_assert_imm-in2_exp_len-LANE
-multi_fn = vfma-out-noext, a, b, {vdup-nout-noext, {simd_extract, c, LANE as u32}}
+multi_fn = vfma-out-noext, a, b, {vdup-nout-noext, {simd_extract!, c, LANE as u32}}
 a = 2., 3., 4., 5.
 b = 6., 4., 7., 8.
 c = 2., 0., 0., 0.
@@ -4041,7 +4041,7 @@ name = vfma
 in2-lane-suffixes
 constn = LANE
 multi_fn = static_assert_imm-in2_exp_len-LANE
-multi_fn = simd_extract, c:out_t, c, LANE as u32
+multi_fn = simd_extract!, c:out_t, c, LANE as u32
 multi_fn = vfma-in2lane-_, b, c, a
 a = 2.
 b = 6.
@@ -4096,7 +4096,7 @@ name = vfms
 in2-lane-suffixes
 constn = LANE
 multi_fn = static_assert_imm-in2_exp_len-LANE
-multi_fn = vfms-out-noext, a, b, {vdup-nout-noext, {simd_extract, c, LANE as u32}}
+multi_fn = vfms-out-noext, a, b, {vdup-nout-noext, {simd_extract!, c, LANE as u32}}
 a = 14., 11., 18., 21.
 b = 6., 4., 7., 8.
 c = 2., 0., 0., 0.
@@ -4982,8 +4982,8 @@ generate float32x2_t
 /// Floating-point add pairwise
 name = vpadd
 out-suffix
-multi_fn = simd_extract, a1:out_t, a, 0
-multi_fn = simd_extract, a2:out_t, a, 1
+multi_fn = simd_extract!, a1:out_t, a, 0
+multi_fn = simd_extract!, a2:out_t, a, 1
 multi_fn = a1 + a2
 a = 1., 2.
 validate 3.
@@ -5050,7 +5050,7 @@ generate int16x4_t:int16x4_t:int32x4_t, int32x2_t:int32x2_t:int64x2_t
 name = vqdmull
 multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a
 multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b
-multi_fn = simd_extract, {vqdmull-in_ntt-noext, a, b}, 0
+multi_fn = simd_extract!, {vqdmull-in_ntt-noext, a, b}, 0
 a = 2
 b = 3
 validate 12
@@ -5127,7 +5127,7 @@ generate int16x4_t:int16x4_t:int32x4_t, int32x2_t:int32x2_t:int64x2_t
 name = vqdmullh_lane
 constn = N
 multi_fn = static_assert_imm-in_exp_len-N
-multi_fn = simd_extract, b:in_t0, b, N as u32
+multi_fn = simd_extract!, b:in_t0, b, N as u32
 multi_fn = vqdmullh-noqself-noext, a, b
 a = 2
 b = 0, 2, 2, 0, 2, 0, 0, 0
@@ -5141,7 +5141,7 @@ generate i16:int16x4_t:i32, i16:int16x8_t:i32
 name = vqdmulls_lane
 constn = N
 multi_fn = static_assert_imm-in_exp_len-N
-multi_fn = simd_extract, b:in_t0, b, N as u32
+multi_fn = simd_extract!, b:in_t0, b, N as u32
 multi_fn = vqdmulls-noqself-noext, a, b
 a = 2
 b = 0, 2, 2, 0, 2, 0, 0, 0
@@ -5266,7 +5266,7 @@ generate int32x4_t:int16x8_t:int16x4_t:int32x4_t, int32x4_t:int16x8_t:int16x8_t:
 /// Signed saturating doubling multiply-add long
 name = vqdmlal
 multi_fn = vqdmull-in_ntt-noext, x:out_long_ntt, {vdup_n-in_ntt-noext, b}, {vdup_n-in_ntt-noext, c}
-multi_fn = vqadd-out-noext, a, {simd_extract, x, 0}
+multi_fn = vqadd-out-noext, a, {simd_extract!, x, 0}
 a = 1
 b = 1
 c = 2
@@ -5292,7 +5292,7 @@ name = vqdmlalh_lane
 in2-suffix
 constn = LANE
 multi_fn = static_assert_imm-in2_exp_len-LANE
-multi_fn = vqdmlal-self-noext, a, b, {simd_extract, c, LANE as u32}
+multi_fn = vqdmlal-self-noext, a, b, {simd_extract!, c, LANE as u32}
 a = 1
 b = 1
 c = 2, 1, 1, 1, 1, 1, 1, 1
@@ -5390,7 +5390,7 @@ generate int32x4_t:int16x8_t:int16x4_t:int32x4_t, int32x4_t:int16x8_t:int16x8_t:
 /// Signed saturating doubling multiply-subtract long
 name = vqdmlsl
 multi_fn = vqdmull-in_ntt-noext, x:out_long_ntt, {vdup_n-in_ntt-noext, b}, {vdup_n-in_ntt-noext, c}
-multi_fn = vqsub-out-noext, a, {simd_extract, x, 0}
+multi_fn = vqsub-out-noext, a, {simd_extract!, x, 0}
 a = 10
 b = 1
 c = 2
@@ -5416,7 +5416,7 @@ name = vqdmlslh_lane
 in2-suffix
 constn = LANE
 multi_fn = static_assert_imm-in2_exp_len-LANE
-multi_fn = vqdmlsl-self-noext, a, b, {simd_extract, c, LANE as u32}
+multi_fn = vqdmlsl-self-noext, a, b, {simd_extract!, c, LANE as u32}
 a = 10
 b = 1
 c = 2, 1, 1, 1, 1, 1, 1, 1
@@ -5445,7 +5445,7 @@ generate int16x4_t, int16x8_t, int32x2_t, int32x4_t
 name = vqdmulh
 multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a
 multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b
-multi_fn = simd_extract, {vqdmulh-in_ntt-noext, a, b}, 0
+multi_fn = simd_extract!, {vqdmulh-in_ntt-noext, a, b}, 0
 a = 1
 b = 2
 validate 0
@@ -5483,7 +5483,7 @@ generate int16x8_t:i16:int16x8_t, int32x4_t:i32:int32x4_t
 name = vqdmulhh_lane
 constn = N
 multi_fn = static_assert_imm-in_exp_len-N
-multi_fn = simd_extract, b:in_t0, b, N as u32
+multi_fn = simd_extract!, b:in_t0, b, N as u32
 multi_fn = vqdmulhh-out_ntt-noext, a, b
 a = 2
 b = 0, 0, MAX, 0, 0, 0, 0, 0
@@ -5497,7 +5497,7 @@ generate i16:int16x4_t:i16, i16:int16x8_t:i16
 name = vqdmulhs_lane
 constn = N
 multi_fn = static_assert_imm-in_exp_len-N
-multi_fn = simd_extract, b:in_t0, b, N as u32
+multi_fn = simd_extract!, b:in_t0, b, N as u32
 multi_fn = vqdmulhs-out_ntt-noext, a, b
 a = 2
 b = 0, MAX, 0, 0
@@ -5512,7 +5512,7 @@ name = vqdmulh
 lane-suffixes
 constn = LANE
 multi_fn = static_assert_imm-in2_exp_len-LANE
-multi_fn = vqdmulh-out-noext, a, {vdup-nout-noext, {simd_extract, b, LANE as u32}}
+multi_fn = vqdmulh-out-noext, a, {vdup-nout-noext, {simd_extract!, b, LANE as u32}}
 a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX
 b = 2, 1, 1, 1, 1, 1, 1, 1
 n = 0
@@ -5551,7 +5551,7 @@ generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t

 /// Saturating extract narrow
 name = vqmovn
-multi_fn = simd_extract, {vqmovn-in_ntt-noext, {vdupq_n-in_ntt-noext, a}}, 0
+multi_fn = simd_extract!, {vqmovn-in_ntt-noext, {vdupq_n-in_ntt-noext, a}}, 0
 a = 1
 validate 1

@@ -5600,7 +5600,7 @@ generate int16x8_t:uint8x8_t, int32x4_t:uint16x4_t, int64x2_t:uint32x2_t

 /// Signed saturating extract unsigned narrow
 name = vqmovun
-multi_fn = simd_extract, {vqmovun-in_ntt-noext, {vdupq_n-in_ntt-noext, a}}, 0
+multi_fn = simd_extract!, {vqmovun-in_ntt-noext, {vdupq_n-in_ntt-noext, a}}, 0
 a = 1
 validate 1

@@ -5632,7 +5632,7 @@ generate int16x4_t, int16x8_t, int32x2_t, int32x4_t

 /// Signed saturating rounding doubling multiply returning high half
 name = vqrdmulh
-multi_fn = simd_extract, {vqrdmulh-in_ntt-noext, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0
+multi_fn = simd_extract!, {vqrdmulh-in_ntt-noext, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0
 a = 1
 b = 2
 validate 0
@@ -5674,7 +5674,7 @@ name = vqrdmulh
 lane-suffixes
 constn = LANE
 multi_fn = static_assert_imm-in_exp_len-LANE
-multi_fn = vqrdmulh-out-noext, a, {simd_extract, b, LANE as u32}
+multi_fn = vqrdmulh-out-noext, a, {simd_extract!, b, LANE as u32}
 a = 1
 b = 0, 2, 0, 0, 0, 0, 0, 0,
 n = 1
@@ -5700,7 +5700,7 @@ name = vqrdmlah
 multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a
 multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b
 multi_fn = vdup_n-in_ntt-noext, c:in_ntt, c
-multi_fn = simd_extract, {vqrdmlah-in_ntt-noext, a, b, c}, 0
+multi_fn = simd_extract!, {vqrdmlah-in_ntt-noext, a, b, c}, 0
 a = 1
 b = 1
 c = 2
@@ -5733,7 +5733,7 @@ name = vqrdmlah
 in2-lane-suffixes
 constn = LANE
 multi_fn = static_assert_imm-in2_exp_len-LANE
-multi_fn = vqrdmlah-self-noext, a, b, {simd_extract, c, LANE as u32}
+multi_fn = vqrdmlah-self-noext, a, b, {simd_extract!, c, LANE as u32}
 a = 1
 b = 1
 c = 0, 2, 0, 0, 0, 0, 0, 0
@@ -5761,7 +5761,7 @@ name = vqrdmlsh
 multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a
 multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b
 multi_fn = vdup_n-in_ntt-noext, c:in_ntt, c
-multi_fn = simd_extract, {vqrdmlsh-in_ntt-noext, a, b, c}, 0
+multi_fn = simd_extract!, {vqrdmlsh-in_ntt-noext, a, b, c}, 0
 a = 1
 b = 1
 c = 2
@@ -5794,7 +5794,7 @@ name = vqrdmlsh
 in2-lane-suffixes
 constn = LANE
 multi_fn = static_assert_imm-in2_exp_len-LANE
-multi_fn = vqrdmlsh-self-noext, a, b, {simd_extract, c, LANE as u32}
+multi_fn = vqrdmlsh-self-noext, a, b, {simd_extract!, c, LANE as u32}
 a = 1
 b = 1
 c = 0, 2, 0, 0, 0, 0, 0, 0
@@ -5823,7 +5823,7 @@ generate int*_t, int64x*_t
 name = vqrshl
 multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a
 multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b
-multi_fn = simd_extract, {vqrshl-in_ntt-noext, a, b}, 0
+multi_fn = simd_extract!, {vqrshl-in_ntt-noext, a, b}, 0
 a = 1
 b = 2
 validate 4
@@ -5852,7 +5852,7 @@ name = vqrshl
 out-suffix
 multi_fn = vdup_n-out_ntt-noext, a:out_ntt, a
 multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b
-multi_fn = simd_extract, {vqrshl-out_ntt-noext, a, b}, 0
+multi_fn = simd_extract!, {vqrshl-out_ntt-noext, a, b}, 0
 a = 1
 b = 2
 validate 4
@@ -5885,7 +5885,7 @@ noq-n-suffix
 constn = N
 multi_fn = static_assert-N-1-halfbits
 multi_fn = vdupq_n-in_ntt-noext, a:in_long_ntt, a
-multi_fn = simd_extract, {vqrshrn_n-in_ntt-::, a}, 0
+multi_fn = simd_extract!, {vqrshrn_n-in_ntt-::, a}, 0
 a = 4
 n = 2
 validate 1
@@ -5932,7 +5932,7 @@ noq-n-suffix
 constn = N
 multi_fn = static_assert-N-1-halfbits
 multi_fn = vdupq_n-in_ntt-noext, a:in_long_ntt, a
-multi_fn = simd_extract, {vqrshrn_n-in_ntt-::, a}, 0
+multi_fn = simd_extract!, {vqrshrn_n-in_ntt-::, a}, 0
 a = 4
 n = 2
 validate 1
@@ -5979,7 +5979,7 @@ noq-n-suffix
 constn = N
 multi_fn = static_assert-N-1-halfbits
 multi_fn = vdupq_n-in_ntt-noext, a:in_long_ntt, a
-multi_fn = simd_extract, {vqrshrun_n-in_ntt-::, a}, 0
+multi_fn = simd_extract!, {vqrshrun_n-in_ntt-::, a}, 0
 a = 4
 n = 2
 validate 1
@@ -6018,7 +6018,7 @@ generate int*_t, int64x*_t
 /// Signed saturating shift left
 name = vqshl
 multi_fn = vqshl-in_ntt-noext, c:in_ntt, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}
-multi_fn = simd_extract, c, 0
+multi_fn = simd_extract!, c, 0
 a = 1
 b = 2
 validate 4
@@ -6046,7 +6046,7 @@ generate uint32x2_t:int32x2_t:uint32x2_t, uint32x4_t:int32x4_t:uint32x4_t, uint6
 name = vqshl
 out-suffix
 multi_fn = vqshl-out_ntt-noext, c:out_ntt, {vdup_n-out_ntt-noext, a}, {vdup_n-in_ntt-noext, b}
-multi_fn = simd_extract, c, 0
+multi_fn = simd_extract!, c, 0
 a = 1
 b = 2
 validate 4
@@ -6073,7 +6073,7 @@ name = vqshl
 n-suffix
 constn = N
 multi_fn = static_assert_imm-out_bits_exp_len-N
-multi_fn = simd_extract, {vqshl_n-in_ntt-::, {vdup_n-in_ntt-noext, a}}, 0
+multi_fn = simd_extract!, {vqshl_n-in_ntt-::, {vdup_n-in_ntt-noext, a}}, 0
 a = 1
 n = 2
 validate 4
@@ -6100,7 +6100,7 @@ name = vqshl
 n-suffix
 constn = N
 multi_fn = static_assert_imm-out_bits_exp_len-N
-multi_fn = simd_extract, {vqshl_n-in_ntt-::, {vdup_n-in_ntt-noext, a}}, 0
+multi_fn = simd_extract!, {vqshl_n-in_ntt-::, {vdup_n-in_ntt-noext, a}}, 0
 a = 1
 n = 2
 validate 4
@@ -6132,7 +6132,7 @@ name = vqshlu
 n-suffix
 constn = N
 multi_fn = static_assert_imm-out_bits_exp_len-N
-multi_fn = simd_extract, {vqshlu_n-in_ntt-::, {vdup_n-in_ntt-noext, a}}, 0
+multi_fn = simd_extract!, {vqshlu_n-in_ntt-::, {vdup_n-in_ntt-noext, a}}, 0
 a = 1
 n = 2
 validate 4
@@ -6165,7 +6165,7 @@ name = vqshrn
 noq-n-suffix
 constn = N
 multi_fn = static_assert-N-1-halfbits
-multi_fn = simd_extract, {vqshrn_n-in_ntt-::, {vdupq_n-in_ntt-noext, a}}, 0
+multi_fn = simd_extract!, {vqshrn_n-in_ntt-::, {vdupq_n-in_ntt-noext, a}}, 0
 a = 4
 n = 2
 validate 1
@@ -6212,7 +6212,7 @@ name = vqshrn
 noq-n-suffix
 constn = N
 multi_fn = static_assert-N-1-halfbits
-multi_fn = simd_extract, {vqshrn_n-in_ntt-::, {vdupq_n-in_ntt-noext, a}}, 0
+multi_fn = simd_extract!, {vqshrn_n-in_ntt-::, {vdupq_n-in_ntt-noext, a}}, 0
 a = 4
 n = 2
 validate 1
@@ -6258,7 +6258,7 @@ name = vqshrun
 noq-n-suffix
 constn = N
 multi_fn = static_assert-N-1-halfbits
-multi_fn = simd_extract, {vqshrun_n-in_ntt-::, {vdupq_n-in_ntt-noext, a}}, 0
+multi_fn = simd_extract!, {vqshrun_n-in_ntt-::, {vdupq_n-in_ntt-noext, a}}, 0
 a = 4
 n = 2
 validate 1
@@ -6283,7 +6283,7 @@ generate uint8x8_t:int16x8_t:uint8x16_t, uint16x4_t:int32x4_t:uint16x8_t, uint32
 /// Unsigned saturating accumulate of signed value
 name = vsqadd
 out-suffix
-multi_fn = simd_extract, {vsqadd-out_ntt-noext, {vdup_n-out_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0
+multi_fn = simd_extract!, {vsqadd-out_ntt-noext, {vdup_n-out_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0
 a = 2
 b = 2
 validate 4
@@ -6845,7 +6845,7 @@ generate uint8x8_t:uint16x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint3
 name = vset_lane
 constn = LANE
 multi_fn = static_assert_imm-in_exp_len-LANE
-multi_fn = simd_insert, b, LANE as u32, a
+multi_fn = simd_insert!, b, LANE as u32, a
 a = 1
 b = 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
 n = 0
@@ -6867,7 +6867,7 @@ name = vsetq_lane
 no-q
 constn = LANE
 multi_fn = static_assert_imm-in_exp_len-LANE
-multi_fn = simd_insert, b, LANE as u32, a
+multi_fn = simd_insert!, b, LANE as u32, a
 a = 1
 b = 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
 n = 0
@@ -6888,7 +6888,7 @@ generate p64:poly64x2_t:poly64x2_t
 name = vset_lane
 constn = LANE
 multi_fn = static_assert_imm-in_exp_len-LANE
-multi_fn = simd_insert, b, LANE as u32, a
+multi_fn = simd_insert!, b, LANE as u32, a
 a = 1.
 b = 0., 2., 3., 4.
 n = 0
@@ -6905,7 +6905,7 @@ name = vsetq_lane
 no-q
 constn = LANE
 multi_fn = static_assert_imm-in_exp_len-LANE
-multi_fn = simd_insert, b, LANE as u32, a
+multi_fn = simd_insert!, b, LANE as u32, a
 a = 1.
 b = 0., 2., 3., 4.
 n = 0
@@ -7241,7 +7241,7 @@ validate -2147483648.0
 a = -2147483648.500000477
 validate -2147483648.0

-multi_fn = transmute, {self-out-_, {simd_extract, a, 0}}
+multi_fn = transmute, {self-out-_, {simd_extract!, a, 0}}
 link-aarch64 = llvm.aarch64.frint32x.f64:f64:::f64
 generate float64x1_t

@@ -7282,7 +7282,7 @@ validate -2147483648.0
 a = -2147483649.0
 validate -2147483648.0

-multi_fn = transmute, {self-out-_, {simd_extract, a, 0}}
+multi_fn = transmute, {self-out-_, {simd_extract!, a, 0}}
 link-aarch64 = llvm.aarch64.frint32z.f64:f64:::f64
 generate float64x1_t

@@ -7324,7 +7324,7 @@ validate -9223372036854775808.0
 a = -9223372036854777856.0
 validate -9223372036854775808.0

-multi_fn = transmute, {self-out-_, {simd_extract, a, 0}}
+multi_fn = transmute, {self-out-_, {simd_extract!, a, 0}}
 link-aarch64 = llvm.aarch64.frint64x.f64:f64:::f64
 generate float64x1_t

@@ -7365,7 +7365,7 @@ validate -9223372036854775808.0
 a = -9223372036854777856.0
 validate -9223372036854775808.0

-multi_fn = transmute, {self-out-_, {simd_extract, a, 0}}
+multi_fn = transmute, {self-out-_, {simd_extract!, a, 0}}
 link-aarch64 = llvm.aarch64.frint64z.f64:f64:::f64
 generate float64x1_t

@@ -7799,7 +7799,7 @@ generate int64x*_t

 /// Signed saturating absolute value
 name = vqabs
-multi_fn = simd_extract, {vqabs-in_ntt-noext, {vdup_n-in_ntt-noext, a}}, 0
+multi_fn = simd_extract!, {vqabs-in_ntt-noext, {vdup_n-in_ntt-noext, a}}, 0
 a = -7
 validate 7
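[Editor's note: the simd_insert!/simd_extract! macros that neon.spec now emits are defined in crates/core_arch/src/macros.rs (listed in the diffstat of patch 1; that hunk is not reproduced here). As a hedged sketch only -- not the verbatim upstream macros -- they are expected to be thin wrappers that force the lane index into an inline const block, so the intrinsic always receives a compile-time constant:

    // Illustrative only: wrap the index in `const { .. }` before calling the
    // underlying platform intrinsic, so a non-constant index fails to compile.
    macro_rules! simd_extract {
        ($x:expr, $idx:expr $(,)?) => {
            simd_extract($x, const { $idx })
        };
    }
    macro_rules! simd_insert {
        ($x:expr, $idx:expr, $val:expr $(,)?) => {
            simd_insert($x, const { $idx }, $val)
        };
    }

This is also why patch 2 below must stop calling the simd_extract intrinsic inside SimdTy::extract: that test helper receives its index at run time.]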
From b754cf27ab44cf012f9480e2bec996d949c16752 Mon Sep 17 00:00:00 2001
From: Ralf Jung
Date: Sat, 17 Feb 2024 13:28:37 +0100
Subject: [PATCH 2/2] avoid using simd_extract in SimdTy::extract (since the index is not a constant there)

---
 crates/core_arch/src/simd.rs | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/crates/core_arch/src/simd.rs b/crates/core_arch/src/simd.rs
index 281fefba42..b386b17848 100644
--- a/crates/core_arch/src/simd.rs
+++ b/crates/core_arch/src/simd.rs
@@ -18,17 +18,28 @@ macro_rules! simd_ty {
             #[inline(always)]
             pub(crate) const fn splat(value: $ety) -> Self {
                 $id($({
+                    // We want this to be repeated for each element.
+                    // So we need to use `elem_name` in a `$(...)`.
+                    // But we don't actually need that name for anything so we use a dummy struct.
                     #[allow(non_camel_case_types, dead_code)]
                     struct $elem_name;
                     value
                 }),*)
             }

+            /// Extract the element at position `index`.
+            /// `index` is not a constant so this is not efficient!
+            /// Use for testing only.
             // FIXME: Workaround rust@60637
             #[inline(always)]
             pub(crate) fn extract(self, index: usize) -> $ety {
+                // Here we assume that there is no padding.
+                let len = crate::mem::size_of::<Self>() / crate::mem::size_of::<$ety>();
+                assert!(index < len);
+                // Now that we know this is in-bounds, use pointer arithmetic to access the right element.
+                let self_ptr = &self as *const Self as *const $ety;
                 unsafe {
-                    crate::core_arch::simd_llvm::simd_extract(self, index as u32)
+                    self_ptr.add(index).read()
                 }
             }
         }
@@ -62,15 +73,6 @@ macro_rules! simd_m_ty {
                     Self::bool_to_internal(value)
                 }),*)
             }
-
-            // FIXME: Workaround rust@60637
-            #[inline(always)]
-            pub(crate) fn extract(self, index: usize) -> bool {
-                let r: $ety = unsafe {
-                    crate::core_arch::simd_llvm::simd_extract(self, index as u32)
-                };
-                r != 0
-            }
         }
     }
 }
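[Editor's note: the reworked extract above reads the requested element through a raw pointer instead of calling the simd_extract intrinsic, because the intrinsic's index must now be a compile-time constant while this test-only helper gets it at run time. Below is a minimal standalone sketch of the same pointer-arithmetic technique, using a made-up U32x4 type and std::mem rather than crate::mem; names are illustrative, not stdarch code:

    #[derive(Copy, Clone)]
    #[repr(C)]
    struct U32x4(u32, u32, u32, u32);

    impl U32x4 {
        fn extract(self, index: usize) -> u32 {
            // Assumes no padding, so element i sits at byte offset i * size_of::<u32>().
            let len = std::mem::size_of::<Self>() / std::mem::size_of::<u32>();
            assert!(index < len);
            let ptr = &self as *const Self as *const u32;
            // In bounds by the assert above; read the i-th element directly.
            unsafe { ptr.add(index).read() }
        }
    }

    fn main() {
        let v = U32x4(10, 20, 30, 40);
        assert_eq!(v.extract(2), 30);
    }
]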