From e38d5ac0e9d42b0cf372e7e33ab41027b842ecfd Mon Sep 17 00:00:00 2001
From: Alex Crichton
Date: Sun, 28 Jan 2018 23:40:39 -0600
Subject: [PATCH] Move from #[inline(always)] to #[inline] (#306)

* Move from #[inline(always)] to #[inline]

This commit blanket changes all `#[inline(always)]` annotations to
`#[inline]`. Fear not, this should not be a regression! To clarify, this
change is made for correctness, to ensure that we don't hit stray LLVM
errors.

Most of the LLVM intrinsics and various LLVM functions we actually lower
down to only work correctly if they are invoked from a function with an
appropriate target feature set. For example, if we were to
out-of-the-blue invoke an AVX intrinsic then we get a
[codegen error][avx-error]. This error comes about because the
surrounding function isn't enabling the AVX feature.

Now in general we don't have a lot of control over how this crate is
consumed by downstream crates. It'd be a pretty bad mistake if downstream
mistakes showed up as scary, un-debuggable codegen errors in LLVM! On the
other side of this issue *we*, as the invokers of these intrinsics, are
"doing the right thing". All our functions in this crate are tagged
appropriately with target features to be codegen'd correctly. Indeed we
have plenty of tests asserting that we can codegen everything across
multiple platforms!

The error comes about here precisely because of the `#[inline(always)]`
attribute. Typically LLVM *won't* inline functions across target feature
sets. For example, if you have a normal function which calls a function
that enables AVX2, then the callee, no matter how small, won't be inlined
into the caller. This is done for correctness (register preservation and
all that) but is also how these codegen errors are prevented in practice.

We in stdsimd, however, are currently tagging all functions with "always
inline this, no matter what". That ends up, apparently, bypassing the
logic of "is this even possible to inline". In turn we start inlining
things like AVX intrinsics into functions that can't actually call AVX
intrinsics, creating codegen errors at compile time.

So with all that motivation, this commit switches these functions to the
normal inline hint, just `#[inline]`, instead of `#[inline(always)]`. Now
for the stdsimd crate it is absolutely critical that all functions are
inlined to have good performance. Using `#[inline]`, however, shouldn't
hamper that! The compiler will recognize the `#[inline]` attribute and
make sure that each of these functions is a *candidate* for being inlined
into any and all downstream codegen units. (If we were missing
`#[inline]`, LLVM wouldn't even have the definition available to inline
most of the time.) After that, though, we're relying on LLVM to naturally
inline these functions as opposed to forcing it to do so. Typically,
however, these intrinsics are one-liners and are trivially inlineable, so
I'd imagine that LLVM will go ahead and inline everything all over the
place.

All in all this change is brought about by #253, which noticed various
codegen errors. I originally thought it was due to ABI issues, but that
turned out to be wrong! (Although that was also a bug which has since
been resolved.) In any case, after this change I was able to get the
example in #253 to execute in both release and debug mode.

Closes #253

[avx-error]: https://play.rust-lang.org/?gist=50cb08f1e2242e22109a6d69318bd112&version=nightly

* Add inline(always) on eflags intrinsics

Their ABI actually relies on it!
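A minimal sketch of the failure mode described above (editor's illustration,
not part of this patch; it uses today's stabilized `std::arch` and
`is_x86_feature_detected!` names rather than the `coresimd` paths this patch
touches):

```rust
// Calling AVX intrinsics from a function that does not itself enable the
// `avx` target feature (here, plain `main`). With the intrinsics marked
// #[inline(always)], LLVM was forced to inline their AVX bodies into this
// non-AVX function and reported a codegen error; with plain #[inline] the
// calls simply stay out-of-line when the caller's feature set doesn't match.
#[cfg(target_arch = "x86_64")]
fn main() {
    use std::arch::x86_64::{_mm256_add_pd, _mm256_set1_pd, _mm256_storeu_pd};

    // Only take the AVX path on hardware that actually supports it.
    if is_x86_feature_detected!("avx") {
        let mut out = [0.0f64; 4];
        unsafe {
            let sum = _mm256_add_pd(_mm256_set1_pd(1.0), _mm256_set1_pd(2.0));
            _mm256_storeu_pd(out.as_mut_ptr(), sum);
        }
        println!("{:?}", out); // [3.0, 3.0, 3.0, 3.0]
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn main() {}
```

With the intrinsics marked `#[inline]` as in this patch, code shaped like the
above either inlines the intrinsic into an AVX-enabled caller or leaves it as
an ordinary call, which is the behavior the example in #253 needed in both
debug and release builds.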
* Leave #[inline(always)] on portable types They're causing test failures on ARM, let's investigate later. --- coresimd/src/aarch64/neon.rs | 8 +- coresimd/src/aarch64/v8.rs | 10 +- coresimd/src/arm/neon.rs | 46 ++-- coresimd/src/arm/v6.rs | 4 +- coresimd/src/arm/v7.rs | 8 +- coresimd/src/nvptx/mod.rs | 26 +- coresimd/src/x86/i386/fxsr.rs | 4 +- coresimd/src/x86/i586/abm.rs | 4 +- coresimd/src/x86/i586/avx.rs | 368 +++++++++++++-------------- coresimd/src/x86/i586/avx2.rs | 388 ++++++++++++++-------------- coresimd/src/x86/i586/bmi.rs | 16 +- coresimd/src/x86/i586/bmi2.rs | 8 +- coresimd/src/x86/i586/bswap.rs | 4 +- coresimd/src/x86/i586/cpuid.rs | 8 +- coresimd/src/x86/i586/rdtsc.rs | 4 +- coresimd/src/x86/i586/sse.rs | 220 ++++++++-------- coresimd/src/x86/i586/sse2.rs | 420 +++++++++++++++---------------- coresimd/src/x86/i586/sse3.rs | 22 +- coresimd/src/x86/i586/sse41.rs | 104 ++++---- coresimd/src/x86/i586/sse42.rs | 34 +-- coresimd/src/x86/i586/ssse3.rs | 32 +-- coresimd/src/x86/i586/tbm.rs | 44 ++-- coresimd/src/x86/i586/xsave.rs | 16 +- coresimd/src/x86/i686/mmx.rs | 98 ++++---- coresimd/src/x86/i686/sse2.rs | 22 +- coresimd/src/x86/i686/sse41.rs | 12 +- coresimd/src/x86/i686/sse42.rs | 2 +- coresimd/src/x86/i686/sse4a.rs | 8 +- coresimd/src/x86/i686/ssse3.rs | 32 +-- coresimd/src/x86/mod.rs | 38 +-- coresimd/src/x86/x86_64/abm.rs | 4 +- coresimd/src/x86/x86_64/avx.rs | 2 +- coresimd/src/x86/x86_64/avx2.rs | 2 +- coresimd/src/x86/x86_64/bmi.rs | 16 +- coresimd/src/x86/x86_64/bmi2.rs | 8 +- coresimd/src/x86/x86_64/fxsr.rs | 4 +- coresimd/src/x86/x86_64/sse.rs | 6 +- coresimd/src/x86/x86_64/sse2.rs | 22 +- coresimd/src/x86/x86_64/sse41.rs | 4 +- coresimd/src/x86/x86_64/sse42.rs | 2 +- coresimd/src/x86/x86_64/xsave.rs | 12 +- 41 files changed, 1046 insertions(+), 1046 deletions(-) diff --git a/coresimd/src/aarch64/neon.rs b/coresimd/src/aarch64/neon.rs index 353a5987c7..047fe2c4a7 100644 --- a/coresimd/src/aarch64/neon.rs +++ b/coresimd/src/aarch64/neon.rs @@ -8,7 +8,7 @@ use simd_llvm::simd_add; use v128::f64x2; /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(fadd))] pub unsafe fn vadd_f64(a: f64, b: f64) -> f64 { @@ -16,7 +16,7 @@ pub unsafe fn vadd_f64(a: f64, b: f64) -> f64 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(fadd))] pub unsafe fn vaddq_f64(a: f64x2, b: f64x2) -> f64x2 { @@ -24,7 +24,7 @@ pub unsafe fn vaddq_f64(a: f64x2, b: f64x2) -> f64x2 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vaddd_s64(a: i64, b: i64) -> i64 { @@ -32,7 +32,7 @@ pub unsafe fn vaddd_s64(a: i64, b: i64) -> i64 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vaddd_u64(a: u64, b: u64) -> u64 { diff --git a/coresimd/src/aarch64/v8.rs b/coresimd/src/aarch64/v8.rs index 55a352bcd8..e9c00338bd 100644 --- a/coresimd/src/aarch64/v8.rs +++ b/coresimd/src/aarch64/v8.rs @@ -9,14 +9,14 @@ use stdsimd_test::assert_instr; /// Reverse the order of the bytes. -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(rev))] pub unsafe fn _rev_u64(x: u64) -> u64 { x.swap_bytes() as u64 } /// Count Leading Zeros. -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(clz))] pub unsafe fn _clz_u64(x: u64) -> u64 { x.leading_zeros() as u64 @@ -29,7 +29,7 @@ extern "C" { } /// Reverse the bit order. 
-#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(rbit))] pub unsafe fn _rbit_u64(x: u64) -> u64 { rbit_u64(x as i64) as u64 @@ -39,7 +39,7 @@ pub unsafe fn _rbit_u64(x: u64) -> u64 { /// /// When all bits of the operand are set it returns the size of the operand in /// bits. -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(cls))] pub unsafe fn _cls_u32(x: u32) -> u32 { u32::leading_zeros((((((x as i32) >> 31) as u32) ^ x) << 1) | 1) as u32 @@ -49,7 +49,7 @@ pub unsafe fn _cls_u32(x: u32) -> u32 { /// /// When all bits of the operand are set it returns the size of the operand in /// bits. -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(cls))] pub unsafe fn _cls_u64(x: u64) -> u64 { u64::leading_zeros((((((x as i64) >> 63) as u64) ^ x) << 1) | 1) as u64 diff --git a/coresimd/src/arm/neon.rs b/coresimd/src/arm/neon.rs index 0c4efae29f..858594ccd4 100644 --- a/coresimd/src/arm/neon.rs +++ b/coresimd/src/arm/neon.rs @@ -9,7 +9,7 @@ use v64::{f32x2, i16x4, i32x2, i8x8, u16x4, u32x2, u8x8}; use v128::{f32x4, i16x8, i32x4, i64x2, i8x16, u16x8, u32x4, u64x2, u8x16}; /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vadd_s8(a: i8x8, b: i8x8) -> i8x8 { @@ -17,7 +17,7 @@ pub unsafe fn vadd_s8(a: i8x8, b: i8x8) -> i8x8 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vaddq_s8(a: i8x16, b: i8x16) -> i8x16 { @@ -25,7 +25,7 @@ pub unsafe fn vaddq_s8(a: i8x16, b: i8x16) -> i8x16 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vadd_s16(a: i16x4, b: i16x4) -> i16x4 { @@ -33,7 +33,7 @@ pub unsafe fn vadd_s16(a: i16x4, b: i16x4) -> i16x4 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vaddq_s16(a: i16x8, b: i16x8) -> i16x8 { @@ -41,7 +41,7 @@ pub unsafe fn vaddq_s16(a: i16x8, b: i16x8) -> i16x8 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vadd_s32(a: i32x2, b: i32x2) -> i32x2 { @@ -49,7 +49,7 @@ pub unsafe fn vadd_s32(a: i32x2, b: i32x2) -> i32x2 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vaddq_s32(a: i32x4, b: i32x4) -> i32x4 { @@ -57,7 +57,7 @@ pub unsafe fn vaddq_s32(a: i32x4, b: i32x4) -> i32x4 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vaddq_s64(a: i64x2, b: i64x2) -> i64x2 { @@ -65,7 +65,7 @@ pub unsafe fn vaddq_s64(a: i64x2, b: i64x2) -> i64x2 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vadd_u8(a: u8x8, b: u8x8) -> u8x8 { @@ -73,7 +73,7 @@ pub unsafe fn vadd_u8(a: u8x8, b: u8x8) -> u8x8 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vaddq_u8(a: u8x16, b: u8x16) -> u8x16 { @@ -81,7 +81,7 @@ pub unsafe fn vaddq_u8(a: u8x16, b: u8x16) -> u8x16 { } /// Vector add. 
-#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vadd_u16(a: u16x4, b: u16x4) -> u16x4 { @@ -89,7 +89,7 @@ pub unsafe fn vadd_u16(a: u16x4, b: u16x4) -> u16x4 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vaddq_u16(a: u16x8, b: u16x8) -> u16x8 { @@ -97,7 +97,7 @@ pub unsafe fn vaddq_u16(a: u16x8, b: u16x8) -> u16x8 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vadd_u32(a: u32x2, b: u32x2) -> u32x2 { @@ -105,7 +105,7 @@ pub unsafe fn vadd_u32(a: u32x2, b: u32x2) -> u32x2 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vaddq_u32(a: u32x4, b: u32x4) -> u32x4 { @@ -113,7 +113,7 @@ pub unsafe fn vaddq_u32(a: u32x4, b: u32x4) -> u32x4 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(add))] pub unsafe fn vaddq_u64(a: u64x2, b: u64x2) -> u64x2 { @@ -121,7 +121,7 @@ pub unsafe fn vaddq_u64(a: u64x2, b: u64x2) -> u64x2 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(fadd))] pub unsafe fn vadd_f32(a: f32x2, b: f32x2) -> f32x2 { @@ -129,7 +129,7 @@ pub unsafe fn vadd_f32(a: f32x2, b: f32x2) -> f32x2 { } /// Vector add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(fadd))] pub unsafe fn vaddq_f32(a: f32x4, b: f32x4) -> f32x4 { @@ -137,7 +137,7 @@ pub unsafe fn vaddq_f32(a: f32x4, b: f32x4) -> f32x4 { } /// Vector long add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(saddl))] pub unsafe fn vaddl_s8(a: i8x8, b: i8x8) -> i16x8 { @@ -147,7 +147,7 @@ pub unsafe fn vaddl_s8(a: i8x8, b: i8x8) -> i16x8 { } /// Vector long add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(saddl))] pub unsafe fn vaddl_s16(a: i16x4, b: i16x4) -> i32x4 { @@ -157,7 +157,7 @@ pub unsafe fn vaddl_s16(a: i16x4, b: i16x4) -> i32x4 { } /// Vector long add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(saddl))] pub unsafe fn vaddl_s32(a: i32x2, b: i32x2) -> i64x2 { @@ -167,7 +167,7 @@ pub unsafe fn vaddl_s32(a: i32x2, b: i32x2) -> i64x2 { } /// Vector long add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(uaddl))] pub unsafe fn vaddl_u8(a: u8x8, b: u8x8) -> u16x8 { @@ -177,7 +177,7 @@ pub unsafe fn vaddl_u8(a: u8x8, b: u8x8) -> u16x8 { } /// Vector long add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(uaddl))] pub unsafe fn vaddl_u16(a: u16x4, b: u16x4) -> u32x4 { @@ -187,7 +187,7 @@ pub unsafe fn vaddl_u16(a: u16x4, b: u16x4) -> u32x4 { } /// Vector long add. -#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(uaddl))] pub unsafe fn vaddl_u32(a: u32x2, b: u32x2) -> u64x2 { @@ -205,7 +205,7 @@ extern "C" { } /// Reciprocal square-root estimate. 
-#[inline(always)] +#[inline] #[target_feature(enable = "neon")] #[cfg_attr(test, assert_instr(frsqrte))] pub unsafe fn vrsqrte_f32(a: f32x2) -> f32x2 { diff --git a/coresimd/src/arm/v6.rs b/coresimd/src/arm/v6.rs index 33fdda67e9..c2011fba78 100644 --- a/coresimd/src/arm/v6.rs +++ b/coresimd/src/arm/v6.rs @@ -10,14 +10,14 @@ use stdsimd_test::assert_instr; /// Reverse the order of the bytes. -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(rev))] pub unsafe fn _rev_u16(x: u16) -> u16 { x.swap_bytes() as u16 } /// Reverse the order of the bytes. -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(rev))] pub unsafe fn _rev_u32(x: u32) -> u32 { x.swap_bytes() as u32 diff --git a/coresimd/src/arm/v7.rs b/coresimd/src/arm/v7.rs index b620013114..f8a735f157 100644 --- a/coresimd/src/arm/v7.rs +++ b/coresimd/src/arm/v7.rs @@ -13,28 +13,28 @@ pub use super::v6::*; use stdsimd_test::assert_instr; /// Count Leading Zeros. -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(clz))] pub unsafe fn _clz_u8(x: u8) -> u8 { x.leading_zeros() as u8 } /// Count Leading Zeros. -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(clz))] pub unsafe fn _clz_u16(x: u16) -> u16 { x.leading_zeros() as u16 } /// Count Leading Zeros. -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(clz))] pub unsafe fn _clz_u32(x: u32) -> u32 { x.leading_zeros() as u32 } /// Reverse the bit order. -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(rbit))] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg(dont_compile_me)] // FIXME need to add `v7` upstream in rustc diff --git a/coresimd/src/nvptx/mod.rs b/coresimd/src/nvptx/mod.rs index 21248e3193..8a444b43ae 100644 --- a/coresimd/src/nvptx/mod.rs +++ b/coresimd/src/nvptx/mod.rs @@ -42,79 +42,79 @@ extern "C" { } /// Synchronizes all threads in the block. -#[inline(always)] +#[inline] pub unsafe fn _syncthreads() -> () { syncthreads() } /// x-th thread-block dimension. -#[inline(always)] +#[inline] pub unsafe fn _block_dim_x() -> i32 { block_dim_x() } /// y-th thread-block dimension. -#[inline(always)] +#[inline] pub unsafe fn _block_dim_y() -> i32 { block_dim_y() } /// z-th thread-block dimension. -#[inline(always)] +#[inline] pub unsafe fn _block_dim_z() -> i32 { block_dim_z() } /// x-th thread-block index. -#[inline(always)] +#[inline] pub unsafe fn _block_idx_x() -> i32 { block_idx_x() } /// y-th thread-block index. -#[inline(always)] +#[inline] pub unsafe fn _block_idx_y() -> i32 { block_idx_y() } /// z-th thread-block index. -#[inline(always)] +#[inline] pub unsafe fn _block_idx_z() -> i32 { block_idx_z() } /// x-th block-grid dimension. -#[inline(always)] +#[inline] pub unsafe fn _grid_dim_x() -> i32 { grid_dim_x() } /// y-th block-grid dimension. -#[inline(always)] +#[inline] pub unsafe fn _grid_dim_y() -> i32 { grid_dim_y() } /// z-th block-grid dimension. -#[inline(always)] +#[inline] pub unsafe fn _grid_dim_z() -> i32 { grid_dim_z() } /// x-th thread index. -#[inline(always)] +#[inline] pub unsafe fn _thread_idx_x() -> i32 { thread_idx_x() } /// y-th thread index. -#[inline(always)] +#[inline] pub unsafe fn _thread_idx_y() -> i32 { thread_idx_y() } /// z-th thread index. 
-#[inline(always)] +#[inline] pub unsafe fn _thread_idx_z() -> i32 { thread_idx_z() } diff --git a/coresimd/src/x86/i386/fxsr.rs b/coresimd/src/x86/i386/fxsr.rs index 28c8fb5c2a..b67057880a 100644 --- a/coresimd/src/x86/i386/fxsr.rs +++ b/coresimd/src/x86/i386/fxsr.rs @@ -21,7 +21,7 @@ extern "C" { /// /// [fxsave]: http://www.felixcloutier.com/x86/FXSAVE.html /// [fxrstor]: http://www.felixcloutier.com/x86/FXRSTOR.html -#[inline(always)] +#[inline] #[target_feature(enable = "fxsr")] #[cfg_attr(test, assert_instr(fxsave))] pub unsafe fn _fxsave(mem_addr: *mut u8) { @@ -42,7 +42,7 @@ pub unsafe fn _fxsave(mem_addr: *mut u8) { /// /// [fxsave]: http://www.felixcloutier.com/x86/FXSAVE.html /// [fxrstor]: http://www.felixcloutier.com/x86/FXRSTOR.html -#[inline(always)] +#[inline] #[target_feature(enable = "fxsr")] #[cfg_attr(test, assert_instr(fxrstor))] pub unsafe fn _fxrstor(mem_addr: *const u8) { diff --git a/coresimd/src/x86/i586/abm.rs b/coresimd/src/x86/i586/abm.rs index 8ee4659d2d..5480b964ab 100644 --- a/coresimd/src/x86/i586/abm.rs +++ b/coresimd/src/x86/i586/abm.rs @@ -23,7 +23,7 @@ use stdsimd_test::assert_instr; /// Counts the leading most significant zero bits. /// /// When the operand is zero, it returns its size in bits. -#[inline(always)] +#[inline] #[target_feature(enable = "lzcnt")] #[cfg_attr(test, assert_instr(lzcnt))] pub unsafe fn _lzcnt_u32(x: u32) -> u32 { @@ -31,7 +31,7 @@ pub unsafe fn _lzcnt_u32(x: u32) -> u32 { } /// Counts the bits that are set. -#[inline(always)] +#[inline] #[target_feature(enable = "popcnt")] #[cfg_attr(test, assert_instr(popcnt))] pub unsafe fn _popcnt32(x: i32) -> i32 { diff --git a/coresimd/src/x86/i586/avx.rs b/coresimd/src/x86/i586/avx.rs index c21b9a0caf..cba133c734 100644 --- a/coresimd/src/x86/i586/avx.rs +++ b/coresimd/src/x86/i586/avx.rs @@ -26,7 +26,7 @@ use x86::*; /// Add packed double-precision (64-bit) floating-point elements /// in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vaddpd))] pub unsafe fn _mm256_add_pd(a: __m256d, b: __m256d) -> __m256d { @@ -35,7 +35,7 @@ pub unsafe fn _mm256_add_pd(a: __m256d, b: __m256d) -> __m256d { /// Add packed single-precision (32-bit) floating-point elements in `a` and /// `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vaddps))] pub unsafe fn _mm256_add_ps(a: __m256, b: __m256) -> __m256 { @@ -45,7 +45,7 @@ pub unsafe fn _mm256_add_ps(a: __m256, b: __m256) -> __m256 { /// Compute the bitwise AND of a packed double-precision (64-bit) /// floating-point elements /// in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // FIXME: Should be 'vandpd' instuction. // See https://github.com/rust-lang-nursery/stdsimd/issues/71 @@ -58,7 +58,7 @@ pub unsafe fn _mm256_and_pd(a: __m256d, b: __m256d) -> __m256d { /// Compute the bitwise AND of packed single-precision (32-bit) floating-point /// elements in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vandps))] pub unsafe fn _mm256_and_ps(a: __m256, b: __m256) -> __m256 { @@ -69,7 +69,7 @@ pub unsafe fn _mm256_and_ps(a: __m256, b: __m256) -> __m256 { /// Compute the bitwise OR packed double-precision (64-bit) floating-point /// elements in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // FIXME: Should be 'vorpd' instuction. 
// See https://github.com/rust-lang-nursery/stdsimd/issues/71 @@ -82,7 +82,7 @@ pub unsafe fn _mm256_or_pd(a: __m256d, b: __m256d) -> __m256d { /// Compute the bitwise OR packed single-precision (32-bit) floating-point /// elements in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vorps))] pub unsafe fn _mm256_or_ps(a: __m256, b: __m256) -> __m256 { @@ -93,7 +93,7 @@ pub unsafe fn _mm256_or_ps(a: __m256, b: __m256) -> __m256 { /// Shuffle double-precision (64-bit) floating-point elements within 128-bit /// lanes using the control in `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vshufpd, imm8 = 0x1))] pub unsafe fn _mm256_shuffle_pd(a: __m256d, b: __m256d, imm8: i32) -> __m256d { @@ -135,7 +135,7 @@ pub unsafe fn _mm256_shuffle_pd(a: __m256d, b: __m256d, imm8: i32) -> __m256d { /// Shuffle single-precision (32-bit) floating-point elements in `a` within /// 128-bit lanes using the control in `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vshufps, imm8 = 0x0))] pub unsafe fn _mm256_shuffle_ps(a: __m256, b: __m256, imm8: i32) -> __m256 { @@ -186,7 +186,7 @@ pub unsafe fn _mm256_shuffle_ps(a: __m256, b: __m256, imm8: i32) -> __m256 { /// Compute the bitwise NOT of packed double-precision (64-bit) floating-point /// elements in `a` /// and then AND with `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // FIXME: Should be 'vandnpd' instruction. #[cfg_attr(test, assert_instr(vandnps))] @@ -199,7 +199,7 @@ pub unsafe fn _mm256_andnot_pd(a: __m256d, b: __m256d) -> __m256d { /// Compute the bitwise NOT of packed single-precision (32-bit) floating-point /// elements in `a` /// and then AND with `b`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vandnps))] pub unsafe fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { @@ -210,7 +210,7 @@ pub unsafe fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256 { /// Compare packed double-precision (64-bit) floating-point elements /// in `a` and `b`, and return packed maximum values -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmaxpd))] pub unsafe fn _mm256_max_pd(a: __m256d, b: __m256d) -> __m256d { @@ -219,7 +219,7 @@ pub unsafe fn _mm256_max_pd(a: __m256d, b: __m256d) -> __m256d { /// Compare packed single-precision (32-bit) floating-point elements in `a` /// and `b`, and return packed maximum values -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmaxps))] pub unsafe fn _mm256_max_ps(a: __m256, b: __m256) -> __m256 { @@ -228,7 +228,7 @@ pub unsafe fn _mm256_max_ps(a: __m256, b: __m256) -> __m256 { /// Compare packed double-precision (64-bit) floating-point elements /// in `a` and `b`, and return packed minimum values -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vminpd))] pub unsafe fn _mm256_min_pd(a: __m256d, b: __m256d) -> __m256d { @@ -237,7 +237,7 @@ pub unsafe fn _mm256_min_pd(a: __m256d, b: __m256d) -> __m256d { /// Compare packed single-precision (32-bit) floating-point elements in `a` /// and `b`, and return packed minimum values -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vminps))] pub unsafe fn _mm256_min_ps(a: __m256, b: __m256) -> __m256 { @@ -246,7 +246,7 @@ pub unsafe fn _mm256_min_ps(a: __m256, b: __m256) -> __m256 { /// Add packed double-precision (64-bit) floating-point elements /// in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmulpd))] pub unsafe fn _mm256_mul_pd(a: __m256d, b: __m256d) -> __m256d { @@ -255,7 +255,7 @@ pub unsafe fn _mm256_mul_pd(a: __m256d, b: __m256d) -> __m256d { /// Add packed single-precision (32-bit) floating-point elements in `a` and /// `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmulps))] pub unsafe fn _mm256_mul_ps(a: __m256, b: __m256) -> __m256 { @@ -264,7 +264,7 @@ pub unsafe fn _mm256_mul_ps(a: __m256, b: __m256) -> __m256 { /// Alternatively add and subtract packed double-precision (64-bit) /// floating-point elements in `a` to/from packed elements in `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vaddsubpd))] pub unsafe fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d { @@ -273,7 +273,7 @@ pub unsafe fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d { /// Alternatively add and subtract packed single-precision (32-bit) /// floating-point elements in `a` to/from packed elements in `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vaddsubps))] pub unsafe fn _mm256_addsub_ps(a: __m256, b: __m256) -> __m256 { @@ -282,7 +282,7 @@ pub unsafe fn _mm256_addsub_ps(a: __m256, b: __m256) -> __m256 { /// Subtract packed double-precision (64-bit) floating-point elements in `b` /// from packed elements in `a`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vsubpd))] pub unsafe fn _mm256_sub_pd(a: __m256d, b: __m256d) -> __m256d { @@ -291,7 +291,7 @@ pub unsafe fn _mm256_sub_pd(a: __m256d, b: __m256d) -> __m256d { /// Subtract packed single-precision (32-bit) floating-point elements in `b` /// from packed elements in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vsubps))] pub unsafe fn _mm256_sub_ps(a: __m256, b: __m256) -> __m256 { @@ -300,7 +300,7 @@ pub unsafe fn _mm256_sub_ps(a: __m256, b: __m256) -> __m256 { /// Compute the division of each of the 8 packed 32-bit floating-point elements /// in `a` by the corresponding packed elements in `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vdivps))] pub unsafe fn _mm256_div_ps(a: __m256, b: __m256) -> __m256 { @@ -309,7 +309,7 @@ pub unsafe fn _mm256_div_ps(a: __m256, b: __m256) -> __m256 { /// Compute the division of each of the 4 packed 64-bit floating-point elements /// in `a` by the corresponding packed elements in `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vdivpd))] pub unsafe fn _mm256_div_pd(a: __m256d, b: __m256d) -> __m256d { @@ -327,7 +327,7 @@ pub unsafe fn _mm256_div_pd(a: __m256d, b: __m256d) -> __m256d { /// For a complete list of options, check [the LLVM docs][llvm_docs]. /// /// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382 -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vroundpd, b = 0x3))] pub unsafe fn _mm256_round_pd(a: __m256d, b: i32) -> __m256d { @@ -339,7 +339,7 @@ pub unsafe fn _mm256_round_pd(a: __m256d, b: i32) -> __m256d { /// Round packed double-precision (64-bit) floating point elements in `a` /// toward positive infinity. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vroundpd))] pub unsafe fn _mm256_ceil_pd(a: __m256d) -> __m256d { @@ -348,7 +348,7 @@ pub unsafe fn _mm256_ceil_pd(a: __m256d) -> __m256d { /// Round packed double-precision (64-bit) floating point elements in `a` /// toward negative infinity. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vroundpd))] pub unsafe fn _mm256_floor_pd(a: __m256d) -> __m256d { @@ -366,7 +366,7 @@ pub unsafe fn _mm256_floor_pd(a: __m256d) -> __m256d { /// For a complete list of options, check [the LLVM docs][llvm_docs]. /// /// [llvm_docs]: https://github.com/llvm-mirror/clang/blob/dcd8d797b20291f1a6b3e0ddda085aa2bbb382a8/lib/Headers/avxintrin.h#L382 -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vroundps, b = 0x00))] pub unsafe fn _mm256_round_ps(a: __m256, b: i32) -> __m256 { @@ -380,7 +380,7 @@ pub unsafe fn _mm256_round_ps(a: __m256, b: i32) -> __m256 { /// Round packed single-precision (32-bit) floating point elements in `a` /// toward positive infinity. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vroundps))] pub unsafe fn _mm256_ceil_ps(a: __m256) -> __m256 { @@ -389,7 +389,7 @@ pub unsafe fn _mm256_ceil_ps(a: __m256) -> __m256 { /// Round packed single-precision (32-bit) floating point elements in `a` /// toward negative infinity. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vroundps))] pub unsafe fn _mm256_floor_ps(a: __m256) -> __m256 { @@ -398,7 +398,7 @@ pub unsafe fn _mm256_floor_ps(a: __m256) -> __m256 { /// Return the square root of packed single-precision (32-bit) floating point /// elements in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vsqrtps))] pub unsafe fn _mm256_sqrt_ps(a: __m256) -> __m256 { @@ -407,7 +407,7 @@ pub unsafe fn _mm256_sqrt_ps(a: __m256) -> __m256 { /// Return the square root of packed double-precision (64-bit) floating point /// elements in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vsqrtpd))] pub unsafe fn _mm256_sqrt_pd(a: __m256d) -> __m256d { @@ -416,7 +416,7 @@ pub unsafe fn _mm256_sqrt_pd(a: __m256d) -> __m256d { /// Blend packed double-precision (64-bit) floating-point elements from /// `a` and `b` using control mask `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vblendpd, imm8 = 9))] pub unsafe fn _mm256_blend_pd(a: __m256d, b: __m256d, imm8: i32) -> __m256d { @@ -458,7 +458,7 @@ pub unsafe fn _mm256_blend_pd(a: __m256d, b: __m256d, imm8: i32) -> __m256d { /// Blend packed single-precision (32-bit) floating-point elements from /// `a` and `b` using control mask `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vblendps, imm8 = 9))] pub unsafe fn _mm256_blend_ps(a: __m256, b: __m256, imm8: i32) -> __m256 { @@ -508,7 +508,7 @@ pub unsafe fn _mm256_blend_ps(a: __m256, b: __m256, imm8: i32) -> __m256 { /// Blend packed double-precision (64-bit) floating-point elements from /// `a` and `b` using `c` as a mask. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vblendvpd))] pub unsafe fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { @@ -517,7 +517,7 @@ pub unsafe fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d { /// Blend packed single-precision (32-bit) floating-point elements from /// `a` and `b` using `c` as a mask. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vblendvps))] pub unsafe fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 { @@ -528,7 +528,7 @@ pub unsafe fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256 { /// elements in `a` and `b` using the high 4 bits in `imm8`, /// sum the four products, and conditionally return the sum /// using the low 4 bits of `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vdpps, imm8 = 0x0))] pub unsafe fn _mm256_dp_ps(a: __m256, b: __m256, imm8: i32) -> __m256 { @@ -542,7 +542,7 @@ pub unsafe fn _mm256_dp_ps(a: __m256, b: __m256, imm8: i32) -> __m256 { /// of 4 64-bit floating points `a` and `b`. /// In the result, sums of elements from `a` are returned in even locations, /// while sums of elements from `b` are returned in odd locations. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vhaddpd))] pub unsafe fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d { @@ -554,7 +554,7 @@ pub unsafe fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d { /// In the result, sums of elements from `a` are returned in locations of /// indices 0, 1, 4, 5; while sums of elements from `b` are locations /// 2, 3, 6, 7. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vhaddps))] pub unsafe fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 { @@ -565,7 +565,7 @@ pub unsafe fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256 { /// of 4 64-bit floating points `a` and `b`. /// In the result, sums of elements from `a` are returned in even locations, /// while sums of elements from `b` are returned in odd locations. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vhsubpd))] pub unsafe fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d { @@ -577,7 +577,7 @@ pub unsafe fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d { /// In the result, sums of elements from `a` are returned in locations of /// indices 0, 1, 4, 5; while sums of elements from `b` are locations /// 2, 3, 6, 7. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vhsubps))] pub unsafe fn _mm256_hsub_ps(a: __m256, b: __m256) -> __m256 { @@ -586,7 +586,7 @@ pub unsafe fn _mm256_hsub_ps(a: __m256, b: __m256) -> __m256 { /// Compute the bitwise XOR of packed double-precision (64-bit) floating-point /// elements in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // FIXME Should be 'vxorpd' instruction. #[cfg_attr(test, assert_instr(vxorps))] @@ -598,7 +598,7 @@ pub unsafe fn _mm256_xor_pd(a: __m256d, b: __m256d) -> __m256d { /// Compute the bitwise XOR of packed single-precision (32-bit) floating-point /// elements in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vxorps))] pub unsafe fn _mm256_xor_ps(a: __m256, b: __m256) -> __m256 { @@ -675,7 +675,7 @@ pub const _CMP_TRUE_US: i32 = 0x1f; /// Compare packed double-precision (64-bit) floating-point /// elements in `a` and `b` based on the comparison operand /// specified by `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx,sse2")] #[cfg_attr(test, assert_instr(vcmpeqpd, imm8 = 0))] // TODO Validate vcmppd pub unsafe fn _mm_cmp_pd(a: __m128d, b: __m128d, imm8: i32) -> __m128d { @@ -688,7 +688,7 @@ pub unsafe fn _mm_cmp_pd(a: __m128d, b: __m128d, imm8: i32) -> __m128d { /// Compare packed double-precision (64-bit) floating-point /// elements in `a` and `b` based on the comparison operand /// specified by `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vcmpeqpd, imm8 = 0))] // TODO Validate vcmppd pub unsafe fn _mm256_cmp_pd(a: __m256d, b: __m256d, imm8: i32) -> __m256d { @@ -701,7 +701,7 @@ pub unsafe fn _mm256_cmp_pd(a: __m256d, b: __m256d, imm8: i32) -> __m256d { /// Compare packed single-precision (32-bit) floating-point /// elements in `a` and `b` based on the comparison operand /// specified by `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx,sse")] #[cfg_attr(test, assert_instr(vcmpeqps, imm8 = 0))] // TODO Validate vcmpps pub unsafe fn _mm_cmp_ps(a: __m128, b: __m128, imm8: i32) -> __m128 { @@ -714,7 +714,7 @@ pub unsafe fn _mm_cmp_ps(a: __m128, b: __m128, imm8: i32) -> __m128 { /// Compare packed single-precision (32-bit) floating-point /// elements in `a` and `b` based on the comparison operand /// specified by `imm8`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vcmpeqps, imm8 = 0))] // TODO Validate vcmpps pub unsafe fn _mm256_cmp_ps(a: __m256, b: __m256, imm8: i32) -> __m256 { @@ -729,7 +729,7 @@ pub unsafe fn _mm256_cmp_ps(a: __m256, b: __m256, imm8: i32) -> __m256 { /// store the result in the lower element of returned vector, /// and copy the upper element from `a` to the upper element of returned /// vector. -#[inline(always)] +#[inline] #[target_feature(enable = "avx,sse2")] #[cfg_attr(test, assert_instr(vcmpeqsd, imm8 = 0))] // TODO Validate vcmpsd pub unsafe fn _mm_cmp_sd(a: __m128d, b: __m128d, imm8: i32) -> __m128d { @@ -744,7 +744,7 @@ pub unsafe fn _mm_cmp_sd(a: __m128d, b: __m128d, imm8: i32) -> __m128d { /// store the result in the lower element of returned vector, /// and copy the upper 3 packed elements from `a` to the upper elements of /// returned vector. -#[inline(always)] +#[inline] #[target_feature(enable = "avx,sse")] #[cfg_attr(test, assert_instr(vcmpeqss, imm8 = 0))] // TODO Validate vcmpss pub unsafe fn _mm_cmp_ss(a: __m128, b: __m128, imm8: i32) -> __m128 { @@ -756,7 +756,7 @@ pub unsafe fn _mm_cmp_ss(a: __m128, b: __m128, imm8: i32) -> __m128 { /// Convert packed 32-bit integers in `a` to packed double-precision (64-bit) /// floating-point elements. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vcvtdq2pd))] pub unsafe fn _mm256_cvtepi32_pd(a: __m128i) -> __m256d { @@ -765,7 +765,7 @@ pub unsafe fn _mm256_cvtepi32_pd(a: __m128i) -> __m256d { /// Convert packed 32-bit integers in `a` to packed single-precision (32-bit) /// floating-point elements. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vcvtdq2ps))] pub unsafe fn _mm256_cvtepi32_ps(a: __m256i) -> __m256 { @@ -774,7 +774,7 @@ pub unsafe fn _mm256_cvtepi32_ps(a: __m256i) -> __m256 { /// Convert packed double-precision (64-bit) floating-point elements in `a` /// to packed single-precision (32-bit) floating-point elements. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vcvtpd2ps))] pub unsafe fn _mm256_cvtpd_ps(a: __m256d) -> __m128 { @@ -783,7 +783,7 @@ pub unsafe fn _mm256_cvtpd_ps(a: __m256d) -> __m128 { /// Convert packed single-precision (32-bit) floating-point elements in `a` /// to packed 32-bit integers. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vcvtps2dq))] pub unsafe fn _mm256_cvtps_epi32(a: __m256) -> __m256i { @@ -792,7 +792,7 @@ pub unsafe fn _mm256_cvtps_epi32(a: __m256) -> __m256i { /// Convert packed single-precision (32-bit) floating-point elements in `a` /// to packed double-precision (64-bit) floating-point elements. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vcvtps2pd))] pub unsafe fn _mm256_cvtps_pd(a: __m128) -> __m256d { @@ -801,7 +801,7 @@ pub unsafe fn _mm256_cvtps_pd(a: __m128) -> __m256d { /// Convert packed double-precision (64-bit) floating-point elements in `a` /// to packed 32-bit integers with truncation. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vcvttpd2dq))] pub unsafe fn _mm256_cvttpd_epi32(a: __m256d) -> __m128i { @@ -810,7 +810,7 @@ pub unsafe fn _mm256_cvttpd_epi32(a: __m256d) -> __m128i { /// Convert packed double-precision (64-bit) floating-point elements in `a` /// to packed 32-bit integers. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vcvtpd2dq))] pub unsafe fn _mm256_cvtpd_epi32(a: __m256d) -> __m128i { @@ -819,7 +819,7 @@ pub unsafe fn _mm256_cvtpd_epi32(a: __m256d) -> __m128i { /// Convert packed single-precision (32-bit) floating-point elements in `a` /// to packed 32-bit integers with truncation. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vcvttps2dq))] pub unsafe fn _mm256_cvttps_epi32(a: __m256) -> __m256i { @@ -828,7 +828,7 @@ pub unsafe fn _mm256_cvttps_epi32(a: __m256) -> __m256i { /// Extract 128 bits (composed of 4 packed single-precision (32-bit) /// floating-point elements) from `a`, selected with `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vextractf128))] pub unsafe fn _mm256_extractf128_ps(a: __m256, imm8: i32) -> __m128 { @@ -840,7 +840,7 @@ pub unsafe fn _mm256_extractf128_ps(a: __m256, imm8: i32) -> __m128 { /// Extract 128 bits (composed of 2 packed double-precision (64-bit) /// floating-point elements) from `a`, selected with `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vextractf128))] pub unsafe fn _mm256_extractf128_pd(a: __m256d, imm8: i32) -> __m128d { @@ -851,7 +851,7 @@ pub unsafe fn _mm256_extractf128_pd(a: __m256d, imm8: i32) -> __m128d { } /// Extract 128 bits (composed of integer data) from `a`, selected with `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vextractf128))] pub unsafe fn _mm256_extractf128_si256(a: __m256i, imm8: i32) -> __m128i { @@ -864,7 +864,7 @@ pub unsafe fn _mm256_extractf128_si256(a: __m256i, imm8: i32) -> __m128i { } /// Zero the contents of all XMM or YMM registers. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vzeroall))] pub unsafe fn _mm256_zeroall() { @@ -873,7 +873,7 @@ pub unsafe fn _mm256_zeroall() { /// Zero the upper 128 bits of all YMM registers; /// the lower 128-bits of the registers are unmodified. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vzeroupper))] pub unsafe fn _mm256_zeroupper() { @@ -882,7 +882,7 @@ pub unsafe fn _mm256_zeroupper() { /// Shuffle single-precision (32-bit) floating-point elements in `a` /// within 128-bit lanes using the control in `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vpermilps))] pub unsafe fn _mm256_permutevar_ps(a: __m256, b: __m256i) -> __m256 { @@ -891,7 +891,7 @@ pub unsafe fn _mm256_permutevar_ps(a: __m256, b: __m256i) -> __m256 { /// Shuffle single-precision (32-bit) floating-point elements in `a` /// using the control in `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vpermilps))] pub unsafe fn _mm_permutevar_ps(a: __m128, b: __m128i) -> __m128 { @@ -900,7 +900,7 @@ pub unsafe fn _mm_permutevar_ps(a: __m128, b: __m128i) -> __m128 { /// Shuffle single-precision (32-bit) floating-point elements in `a` /// within 128-bit lanes using the control in `imm8`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vpermilps, imm8 = 9))] pub unsafe fn _mm256_permute_ps(a: __m256, imm8: i32) -> __m256 { @@ -952,7 +952,7 @@ pub unsafe fn _mm256_permute_ps(a: __m256, imm8: i32) -> __m256 { /// Shuffle single-precision (32-bit) floating-point elements in `a` /// using the control in `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx,sse")] #[cfg_attr(test, assert_instr(vpermilps, imm8 = 9))] pub unsafe fn _mm_permute_ps(a: __m128, imm8: i32) -> __m128 { @@ -1005,7 +1005,7 @@ pub unsafe fn _mm_permute_ps(a: __m128, imm8: i32) -> __m128 { /// Shuffle double-precision (64-bit) floating-point elements in `a` /// within 256-bit lanes using the control in `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vpermilpd))] pub unsafe fn _mm256_permutevar_pd(a: __m256d, b: __m256i) -> __m256d { @@ -1014,7 +1014,7 @@ pub unsafe fn _mm256_permutevar_pd(a: __m256d, b: __m256i) -> __m256d { /// Shuffle double-precision (64-bit) floating-point elements in `a` /// using the control in `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vpermilpd))] pub unsafe fn _mm_permutevar_pd(a: __m128d, b: __m128i) -> __m128d { @@ -1023,7 +1023,7 @@ pub unsafe fn _mm_permutevar_pd(a: __m128d, b: __m128i) -> __m128d { /// Shuffle double-precision (64-bit) floating-point elements in `a` /// within 128-bit lanes using the control in `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vpermilpd, imm8 = 0x1))] pub unsafe fn _mm256_permute_pd(a: __m256d, imm8: i32) -> __m256d { @@ -1065,7 +1065,7 @@ pub unsafe fn _mm256_permute_pd(a: __m256d, imm8: i32) -> __m256d { /// Shuffle double-precision (64-bit) floating-point elements in `a` /// using the control in `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx,sse2")] #[cfg_attr(test, assert_instr(vpermilpd, imm8 = 0x1))] pub unsafe fn _mm_permute_pd(a: __m128d, imm8: i32) -> __m128d { @@ -1091,7 +1091,7 @@ pub unsafe fn _mm_permute_pd(a: __m128d, imm8: i32) -> __m128d { /// Shuffle 256-bits (composed of 8 packed single-precision (32-bit) /// floating-point elements) selected by `imm8` from `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vperm2f128, imm8 = 0x5))] pub unsafe fn _mm256_permute2f128_ps(a: __m256, b: __m256, imm8: i32) -> __m256 { @@ -1103,7 +1103,7 @@ pub unsafe fn _mm256_permute2f128_ps(a: __m256, b: __m256, imm8: i32) -> __m256 /// Shuffle 256-bits (composed of 4 packed double-precision (64-bit) /// floating-point elements) selected by `imm8` from `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vperm2f128, imm8 = 0x31))] pub unsafe fn _mm256_permute2f128_pd(a: __m256d, b: __m256d, imm8: i32) -> __m256d { @@ -1115,7 +1115,7 @@ pub unsafe fn _mm256_permute2f128_pd(a: __m256d, b: __m256d, imm8: i32) -> __m25 /// Shuffle 258-bits (composed of integer data) selected by `imm8` /// from `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vperm2f128, imm8 = 0x31))] pub unsafe fn _mm256_permute2f128_si256( @@ -1132,7 +1132,7 @@ pub unsafe fn _mm256_permute2f128_si256( /// Broadcast a single-precision (32-bit) floating-point element from memory /// to all elements of the returned vector. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vbroadcastss))] pub unsafe fn _mm256_broadcast_ss(f: &f32) -> __m256 { @@ -1141,7 +1141,7 @@ pub unsafe fn _mm256_broadcast_ss(f: &f32) -> __m256 { /// Broadcast a single-precision (32-bit) floating-point element from memory /// to all elements of the returned vector. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vbroadcastss))] pub unsafe fn _mm_broadcast_ss(f: &f32) -> __m128 { @@ -1150,7 +1150,7 @@ pub unsafe fn _mm_broadcast_ss(f: &f32) -> __m128 { /// Broadcast a double-precision (64-bit) floating-point element from memory /// to all elements of the returned vector. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vbroadcastsd))] pub unsafe fn _mm256_broadcast_sd(f: &f64) -> __m256d { @@ -1159,7 +1159,7 @@ pub unsafe fn _mm256_broadcast_sd(f: &f64) -> __m256d { /// Broadcast 128 bits from memory (composed of 4 packed single-precision /// (32-bit) floating-point elements) to all elements of the returned vector. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vbroadcastf128))] pub unsafe fn _mm256_broadcast_ps(a: &__m128) -> __m256 { @@ -1168,7 +1168,7 @@ pub unsafe fn _mm256_broadcast_ps(a: &__m128) -> __m256 { /// Broadcast 128 bits from memory (composed of 2 packed double-precision /// (64-bit) floating-point elements) to all elements of the returned vector. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vbroadcastf128))] pub unsafe fn _mm256_broadcast_pd(a: &__m128d) -> __m256d { @@ -1178,7 +1178,7 @@ pub unsafe fn _mm256_broadcast_pd(a: &__m128d) -> __m256d { /// Copy `a` to result, then insert 128 bits (composed of 4 packed /// single-precision (32-bit) floating-point elements) from `b` into result /// at the location specified by `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vinsertf128, imm8 = 1))] pub unsafe fn _mm256_insertf128_ps(a: __m256, b: __m128, imm8: i32) -> __m256 { @@ -1192,7 +1192,7 @@ pub unsafe fn _mm256_insertf128_ps(a: __m256, b: __m128, imm8: i32) -> __m256 { /// Copy `a` to result, then insert 128 bits (composed of 2 packed /// double-precision (64-bit) floating-point elements) from `b` into result /// at the location specified by `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vinsertf128, imm8 = 1))] pub unsafe fn _mm256_insertf128_pd(a: __m256d, b: __m128d, imm8: i32) -> __m256d { @@ -1204,7 +1204,7 @@ pub unsafe fn _mm256_insertf128_pd(a: __m256d, b: __m128d, imm8: i32) -> __m256d /// Copy `a` to result, then insert 128 bits from `b` into result /// at the location specified by `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vinsertf128, imm8 = 1))] pub unsafe fn _mm256_insertf128_si256( @@ -1220,7 +1220,7 @@ pub unsafe fn _mm256_insertf128_si256( /// Copy `a` to result, and insert the 8-bit integer `i` into result /// at the location specified by `index`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. 
pub unsafe fn _mm256_insert_epi8(a: __m256i, i: i8, index: i32) -> __m256i { @@ -1229,7 +1229,7 @@ pub unsafe fn _mm256_insert_epi8(a: __m256i, i: i8, index: i32) -> __m256i { /// Copy `a` to result, and insert the 16-bit integer `i` into result /// at the location specified by `index`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_insert_epi16(a: __m256i, i: i16, index: i32) -> __m256i { @@ -1238,7 +1238,7 @@ pub unsafe fn _mm256_insert_epi16(a: __m256i, i: i16, index: i32) -> __m256i { /// Copy `a` to result, and insert the 32-bit integer `i` into result /// at the location specified by `index`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_insert_epi32(a: __m256i, i: i32, index: i32) -> __m256i { @@ -1249,7 +1249,7 @@ pub unsafe fn _mm256_insert_epi32(a: __m256i, i: i32, index: i32) -> __m256i { /// floating-point elements) from memory into result. /// `mem_addr` must be aligned on a 32-byte boundary or a /// general-protection exception may be generated. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovaps))] // FIXME vmovapd expected pub unsafe fn _mm256_load_pd(mem_addr: *const f64) -> __m256d { @@ -1260,7 +1260,7 @@ pub unsafe fn _mm256_load_pd(mem_addr: *const f64) -> __m256d { /// floating-point elements) from `a` into memory. /// `mem_addr` must be aligned on a 32-byte boundary or a /// general-protection exception may be generated. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovaps))] // FIXME vmovapd expected pub unsafe fn _mm256_store_pd(mem_addr: *const f64, a: __m256d) { @@ -1271,7 +1271,7 @@ pub unsafe fn _mm256_store_pd(mem_addr: *const f64, a: __m256d) { /// floating-point elements) from memory into result. /// `mem_addr` must be aligned on a 32-byte boundary or a /// general-protection exception may be generated. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovaps))] pub unsafe fn _mm256_load_ps(mem_addr: *const f32) -> __m256 { @@ -1282,7 +1282,7 @@ pub unsafe fn _mm256_load_ps(mem_addr: *const f32) -> __m256 { /// floating-point elements) from `a` into memory. /// `mem_addr` must be aligned on a 32-byte boundary or a /// general-protection exception may be generated. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovaps))] pub unsafe fn _mm256_store_ps(mem_addr: *const f32, a: __m256) { @@ -1292,7 +1292,7 @@ pub unsafe fn _mm256_store_ps(mem_addr: *const f32, a: __m256) { /// Load 256-bits (composed of 4 packed double-precision (64-bit) /// floating-point elements) from memory into result. /// `mem_addr` does not need to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovupd expected pub unsafe fn _mm256_loadu_pd(mem_addr: *const f64) -> __m256d { @@ -1308,7 +1308,7 @@ pub unsafe fn _mm256_loadu_pd(mem_addr: *const f64) -> __m256d { /// Store 256-bits (composed of 4 packed double-precision (64-bit) /// floating-point elements) from `a` into memory. /// `mem_addr` does not need to be aligned on any particular boundary. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovupd expected pub unsafe fn _mm256_storeu_pd(mem_addr: *mut f64, a: __m256d) { @@ -1318,7 +1318,7 @@ pub unsafe fn _mm256_storeu_pd(mem_addr: *mut f64, a: __m256d) { /// Load 256-bits (composed of 8 packed single-precision (32-bit) /// floating-point elements) from memory into result. /// `mem_addr` does not need to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovups))] pub unsafe fn _mm256_loadu_ps(mem_addr: *const f32) -> __m256 { @@ -1334,7 +1334,7 @@ pub unsafe fn _mm256_loadu_ps(mem_addr: *const f32) -> __m256 { /// Store 256-bits (composed of 8 packed single-precision (32-bit) /// floating-point elements) from `a` into memory. /// `mem_addr` does not need to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovups))] pub unsafe fn _mm256_storeu_ps(mem_addr: *mut f32, a: __m256) { @@ -1344,7 +1344,7 @@ pub unsafe fn _mm256_storeu_ps(mem_addr: *mut f32, a: __m256) { /// Load 256-bits of integer data from memory into result. /// `mem_addr` must be aligned on a 32-byte boundary or a /// general-protection exception may be generated. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovaps))] // FIXME vmovdqa expected pub unsafe fn _mm256_load_si256(mem_addr: *const __m256i) -> __m256i { @@ -1354,7 +1354,7 @@ pub unsafe fn _mm256_load_si256(mem_addr: *const __m256i) -> __m256i { /// Store 256-bits of integer data from `a` into memory. /// `mem_addr` must be aligned on a 32-byte boundary or a /// general-protection exception may be generated. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovaps))] // FIXME vmovdqa expected pub unsafe fn _mm256_store_si256(mem_addr: *mut __m256i, a: __m256i) { @@ -1363,7 +1363,7 @@ pub unsafe fn _mm256_store_si256(mem_addr: *mut __m256i, a: __m256i) { /// Load 256-bits of integer data from memory into result. /// `mem_addr` does not need to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected pub unsafe fn _mm256_loadu_si256(mem_addr: *const __m256i) -> __m256i { @@ -1378,7 +1378,7 @@ pub unsafe fn _mm256_loadu_si256(mem_addr: *const __m256i) -> __m256i { /// Store 256-bits of integer data from `a` into memory. /// `mem_addr` does not need to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovups))] // FIXME vmovdqu expected pub unsafe fn _mm256_storeu_si256(mem_addr: *mut __m256i, a: __m256i) { @@ -1388,7 +1388,7 @@ pub unsafe fn _mm256_storeu_si256(mem_addr: *mut __m256i, a: __m256i) { /// Load packed double-precision (64-bit) floating-point elements from memory /// into result using `mask` (elements are zeroed out when the high bit of the /// corresponding element is not set). -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmaskmovpd))] pub unsafe fn _mm256_maskload_pd(mem_addr: *const f64, mask: __m256i) -> __m256d { @@ -1397,7 +1397,7 @@ pub unsafe fn _mm256_maskload_pd(mem_addr: *const f64, mask: __m256i) -> __m256d /// Store packed double-precision (64-bit) floating-point elements from `a` /// into memory using `mask`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmaskmovpd))] pub unsafe fn _mm256_maskstore_pd(mem_addr: *mut f64, mask: __m256i, a: __m256d) { @@ -1407,7 +1407,7 @@ pub unsafe fn _mm256_maskstore_pd(mem_addr: *mut f64, mask: __m256i, a: __m256d) /// Load packed double-precision (64-bit) floating-point elements from memory /// into result using `mask` (elements are zeroed out when the high bit of the /// corresponding element is not set). -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmaskmovpd))] pub unsafe fn _mm_maskload_pd(mem_addr: *const f64, mask: __m128i) -> __m128d { @@ -1416,7 +1416,7 @@ pub unsafe fn _mm_maskload_pd(mem_addr: *const f64, mask: __m128i) -> __m128d { /// Store packed double-precision (64-bit) floating-point elements from `a` /// into memory using `mask`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmaskmovpd))] pub unsafe fn _mm_maskstore_pd(mem_addr: *mut f64, mask: __m128i, a: __m128d) { @@ -1426,7 +1426,7 @@ pub unsafe fn _mm_maskstore_pd(mem_addr: *mut f64, mask: __m128i, a: __m128d) { /// Load packed single-precision (32-bit) floating-point elements from memory /// into result using `mask` (elements are zeroed out when the high bit of the /// corresponding element is not set). -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmaskmovps))] pub unsafe fn _mm256_maskload_ps(mem_addr: *const f32, mask: __m256i) -> __m256 { @@ -1435,7 +1435,7 @@ pub unsafe fn _mm256_maskload_ps(mem_addr: *const f32, mask: __m256i) -> __m256 /// Store packed single-precision (32-bit) floating-point elements from `a` /// into memory using `mask`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmaskmovps))] pub unsafe fn _mm256_maskstore_ps(mem_addr: *mut f32, mask: __m256i, a: __m256) { @@ -1445,7 +1445,7 @@ pub unsafe fn _mm256_maskstore_ps(mem_addr: *mut f32, mask: __m256i, a: __m256) /// Load packed single-precision (32-bit) floating-point elements from memory /// into result using `mask` (elements are zeroed out when the high bit of the /// corresponding element is not set). -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmaskmovps))] pub unsafe fn _mm_maskload_ps(mem_addr: *const f32, mask: __m128i) -> __m128 { @@ -1454,7 +1454,7 @@ pub unsafe fn _mm_maskload_ps(mem_addr: *const f32, mask: __m128i) -> __m128 { /// Store packed single-precision (32-bit) floating-point elements from `a` /// into memory using `mask`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmaskmovps))] pub unsafe fn _mm_maskstore_ps(mem_addr: *mut f32, mask: __m128i, a: __m128) { @@ -1463,7 +1463,7 @@ pub unsafe fn _mm_maskstore_ps(mem_addr: *mut f32, mask: __m128i, a: __m128) { /// Duplicate odd-indexed single-precision (32-bit) floating-point elements /// from `a`, and return the results. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovshdup))] pub unsafe fn _mm256_movehdup_ps(a: __m256) -> __m256 { @@ -1472,7 +1472,7 @@ pub unsafe fn _mm256_movehdup_ps(a: __m256) -> __m256 { /// Duplicate even-indexed single-precision (32-bit) floating-point elements /// from `a`, and return the results. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovsldup))] pub unsafe fn _mm256_moveldup_ps(a: __m256) -> __m256 { @@ -1481,7 +1481,7 @@ pub unsafe fn _mm256_moveldup_ps(a: __m256) -> __m256 { /// Duplicate even-indexed double-precision (64-bit) floating-point elements /// from "a", and return the results. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovddup))] pub unsafe fn _mm256_movedup_pd(a: __m256d) -> __m256d { @@ -1491,7 +1491,7 @@ pub unsafe fn _mm256_movedup_pd(a: __m256d) -> __m256d { /// Load 256-bits of integer data from unaligned memory into result. /// This intrinsic may perform better than `_mm256_loadu_si256` when the /// data crosses a cache line boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vlddqu))] pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i { @@ -1501,7 +1501,7 @@ pub unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i { /// Moves integer data from a 256-bit integer vector to a 32-byte /// aligned memory location. To minimize caching, the data is flagged as /// non-temporal (unlikely to be used again soon) -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovntps))] // FIXME vmovntdq pub unsafe fn _mm256_stream_si256(mem_addr: *const __m256i, a: __m256i) { @@ -1511,7 +1511,7 @@ pub unsafe fn _mm256_stream_si256(mem_addr: *const __m256i, a: __m256i) { /// Moves double-precision values from a 256-bit vector of [4 x double] /// to a 32-byte aligned memory location. To minimize caching, the data is /// flagged as non-temporal (unlikely to be used again soon). -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovntps))] // FIXME vmovntpd pub unsafe fn _mm256_stream_pd(mem_addr: *const f64, a: __m256d) { @@ -1522,7 +1522,7 @@ pub unsafe fn _mm256_stream_pd(mem_addr: *const f64, a: __m256d) { /// of [8 x float] to a 32-byte aligned memory location. To minimize /// caching, the data is flagged as non-temporal (unlikely to be used again /// soon). -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovntps))] pub unsafe fn _mm256_stream_ps(mem_addr: *const f32, a: __m256) { @@ -1532,7 +1532,7 @@ pub unsafe fn _mm256_stream_ps(mem_addr: *const f32, a: __m256) { /// Compute the approximate reciprocal of packed single-precision (32-bit) /// floating-point elements in `a`, and return the results. The maximum /// relative error for this approximation is less than 1.5*2^-12. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vrcpps))] pub unsafe fn _mm256_rcp_ps(a: __m256) -> __m256 { @@ -1542,7 +1542,7 @@ pub unsafe fn _mm256_rcp_ps(a: __m256) -> __m256 { /// Compute the approximate reciprocal square root of packed single-precision /// (32-bit) floating-point elements in `a`, and return the results. /// The maximum relative error for this approximation is less than 1.5*2^-12. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vrsqrtps))] pub unsafe fn _mm256_rsqrt_ps(a: __m256) -> __m256 { @@ -1551,7 +1551,7 @@ pub unsafe fn _mm256_rsqrt_ps(a: __m256) -> __m256 { /// Unpack and interleave double-precision (64-bit) floating-point elements /// from the high half of each 128-bit lane in `a` and `b`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vunpckhpd))] pub unsafe fn _mm256_unpackhi_pd(a: __m256d, b: __m256d) -> __m256d { @@ -1560,7 +1560,7 @@ pub unsafe fn _mm256_unpackhi_pd(a: __m256d, b: __m256d) -> __m256d { /// Unpack and interleave single-precision (32-bit) floating-point elements /// from the high half of each 128-bit lane in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vunpckhps))] pub unsafe fn _mm256_unpackhi_ps(a: __m256, b: __m256) -> __m256 { @@ -1569,7 +1569,7 @@ pub unsafe fn _mm256_unpackhi_ps(a: __m256, b: __m256) -> __m256 { /// Unpack and interleave double-precision (64-bit) floating-point elements /// from the low half of each 128-bit lane in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vunpcklpd))] pub unsafe fn _mm256_unpacklo_pd(a: __m256d, b: __m256d) -> __m256d { @@ -1578,7 +1578,7 @@ pub unsafe fn _mm256_unpacklo_pd(a: __m256d, b: __m256d) -> __m256d { /// Unpack and interleave single-precision (32-bit) floating-point elements /// from the low half of each 128-bit lane in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vunpcklps))] pub unsafe fn _mm256_unpacklo_ps(a: __m256, b: __m256) -> __m256 { @@ -1589,7 +1589,7 @@ pub unsafe fn _mm256_unpacklo_ps(a: __m256, b: __m256) -> __m256 { /// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0. /// Compute the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if /// the result is zero, otherwise set `CF` to 0. Return the `ZF` value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vptest))] pub unsafe fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 { @@ -1600,7 +1600,7 @@ pub unsafe fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32 { /// `b`, and set `ZF` to 1 if the result is zero, otherwise set `ZF` to 0. /// Compute the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if /// the result is zero, otherwise set `CF` to 0. Return the `CF` value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vptest))] pub unsafe fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 { @@ -1612,7 +1612,7 @@ pub unsafe fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32 { /// Compute the bitwise NOT of `a` and then AND with `b`, and set `CF` to 1 if /// the result is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and /// `CF` values are zero, otherwise return 0. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vptest))] pub unsafe fn _mm256_testnzc_si256(a: __m256i, b: __m256i) -> i32 { @@ -1626,7 +1626,7 @@ pub unsafe fn _mm256_testnzc_si256(a: __m256i, b: __m256i) -> i32 { /// NOT of `a` and then AND with `b`, producing an intermediate value, and set /// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return the `ZF` value. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestpd))] pub unsafe fn _mm256_testz_pd(a: __m256d, b: __m256d) -> i32 { @@ -1640,7 +1640,7 @@ pub unsafe fn _mm256_testz_pd(a: __m256d, b: __m256d) -> i32 { /// NOT of `a` and then AND with `b`, producing an intermediate value, and set /// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return the `CF` value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestpd))] pub unsafe fn _mm256_testc_pd(a: __m256d, b: __m256d) -> i32 { @@ -1655,7 +1655,7 @@ pub unsafe fn _mm256_testc_pd(a: __m256d, b: __m256d) -> i32 { /// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values /// are zero, otherwise return 0. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestpd))] pub unsafe fn _mm256_testnzc_pd(a: __m256d, b: __m256d) -> i32 { @@ -1669,7 +1669,7 @@ pub unsafe fn _mm256_testnzc_pd(a: __m256d, b: __m256d) -> i32 { /// NOT of `a` and then AND with `b`, producing an intermediate value, and set /// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return the `ZF` value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestpd))] pub unsafe fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32 { @@ -1683,7 +1683,7 @@ pub unsafe fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32 { /// NOT of `a` and then AND with `b`, producing an intermediate value, and set /// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return the `CF` value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestpd))] pub unsafe fn _mm_testc_pd(a: __m128d, b: __m128d) -> i32 { @@ -1698,7 +1698,7 @@ pub unsafe fn _mm_testc_pd(a: __m128d, b: __m128d) -> i32 { /// `CF` to 1 if the sign bit of each 64-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values /// are zero, otherwise return 0. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestpd))] pub unsafe fn _mm_testnzc_pd(a: __m128d, b: __m128d) -> i32 { @@ -1712,7 +1712,7 @@ pub unsafe fn _mm_testnzc_pd(a: __m128d, b: __m128d) -> i32 { /// NOT of `a` and then AND with `b`, producing an intermediate value, and set /// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return the `ZF` value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestps))] pub unsafe fn _mm256_testz_ps(a: __m256, b: __m256) -> i32 { @@ -1726,7 +1726,7 @@ pub unsafe fn _mm256_testz_ps(a: __m256, b: __m256) -> i32 { /// NOT of `a` and then AND with `b`, producing an intermediate value, and set /// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return the `CF` value. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestps))] pub unsafe fn _mm256_testc_ps(a: __m256, b: __m256) -> i32 { @@ -1741,7 +1741,7 @@ pub unsafe fn _mm256_testc_ps(a: __m256, b: __m256) -> i32 { /// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values /// are zero, otherwise return 0. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestps))] pub unsafe fn _mm256_testnzc_ps(a: __m256, b: __m256) -> i32 { @@ -1755,7 +1755,7 @@ pub unsafe fn _mm256_testnzc_ps(a: __m256, b: __m256) -> i32 { /// NOT of `a` and then AND with `b`, producing an intermediate value, and set /// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return the `ZF` value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestps))] pub unsafe fn _mm_testz_ps(a: __m128, b: __m128) -> i32 { @@ -1769,7 +1769,7 @@ pub unsafe fn _mm_testz_ps(a: __m128, b: __m128) -> i32 { /// NOT of `a` and then AND with `b`, producing an intermediate value, and set /// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return the `CF` value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestps))] pub unsafe fn _mm_testc_ps(a: __m128, b: __m128) -> i32 { @@ -1784,7 +1784,7 @@ pub unsafe fn _mm_testc_ps(a: __m128, b: __m128) -> i32 { /// `CF` to 1 if the sign bit of each 32-bit element in the intermediate value /// is zero, otherwise set `CF` to 0. Return 1 if both the `ZF` and `CF` values /// are zero, otherwise return 0. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vtestps))] pub unsafe fn _mm_testnzc_ps(a: __m128, b: __m128) -> i32 { @@ -1794,7 +1794,7 @@ pub unsafe fn _mm_testnzc_ps(a: __m128, b: __m128) -> i32 { /// Set each bit of the returned mask based on the most significant bit of the /// corresponding packed double-precision (64-bit) floating-point element in /// `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovmskpd))] pub unsafe fn _mm256_movemask_pd(a: __m256d) -> i32 { @@ -1804,7 +1804,7 @@ pub unsafe fn _mm256_movemask_pd(a: __m256d) -> i32 { /// Set each bit of the returned mask based on the most significant bit of the /// corresponding packed single-precision (32-bit) floating-point element in /// `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vmovmskps))] pub unsafe fn _mm256_movemask_ps(a: __m256) -> i32 { @@ -1812,7 +1812,7 @@ pub unsafe fn _mm256_movemask_ps(a: __m256) -> i32 { } /// Return vector of type __m256d with all elements set to zero. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vxorps))] // FIXME vxorpd expected pub unsafe fn _mm256_setzero_pd() -> __m256d { @@ -1820,7 +1820,7 @@ pub unsafe fn _mm256_setzero_pd() -> __m256d { } /// Return vector of type __m256 with all elements set to zero. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vxorps))] pub unsafe fn _mm256_setzero_ps() -> __m256 { @@ -1828,7 +1828,7 @@ pub unsafe fn _mm256_setzero_ps() -> __m256 { } /// Return vector of type __m256i with all elements set to zero. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vxor))] pub unsafe fn _mm256_setzero_si256() -> __m256i { @@ -1837,7 +1837,7 @@ pub unsafe fn _mm256_setzero_si256() -> __m256i { /// Set packed double-precision (64-bit) floating-point elements in returned /// vector with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. #[cfg_attr(test, assert_instr(vinsertf128))] @@ -1847,7 +1847,7 @@ pub unsafe fn _mm256_set_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d { /// Set packed single-precision (32-bit) floating-point elements in returned /// vector with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_set_ps( @@ -1858,7 +1858,7 @@ pub unsafe fn _mm256_set_ps( /// Set packed 8-bit integers in returned vector with the supplied values in /// reverse order. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_set_epi8( @@ -1877,7 +1877,7 @@ pub unsafe fn _mm256_set_epi8( } /// Set packed 16-bit integers in returned vector with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_set_epi16( @@ -1895,7 +1895,7 @@ pub unsafe fn _mm256_set_epi16( } /// Set packed 32-bit integers in returned vector with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_set_epi32( @@ -1905,7 +1905,7 @@ pub unsafe fn _mm256_set_epi32( } /// Set packed 64-bit integers in returned vector with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. #[cfg_attr(test, assert_instr(vinsertf128))] @@ -1915,7 +1915,7 @@ pub unsafe fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i { /// Set packed double-precision (64-bit) floating-point elements in returned /// vector with the supplied values in reverse order. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_setr_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d { @@ -1924,7 +1924,7 @@ pub unsafe fn _mm256_setr_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d { /// Set packed single-precision (32-bit) floating-point elements in returned /// vector with the supplied values in reverse order. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_setr_ps( @@ -1935,7 +1935,7 @@ pub unsafe fn _mm256_setr_ps( /// Set packed 8-bit integers in returned vector with the supplied values in /// reverse order. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_setr_epi8( @@ -1955,7 +1955,7 @@ pub unsafe fn _mm256_setr_epi8( /// Set packed 16-bit integers in returned vector with the supplied values in /// reverse order. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_setr_epi16( @@ -1974,7 +1974,7 @@ pub unsafe fn _mm256_setr_epi16( /// Set packed 32-bit integers in returned vector with the supplied values in /// reverse order. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_setr_epi32( @@ -1985,7 +1985,7 @@ pub unsafe fn _mm256_setr_epi32( /// Set packed 64-bit integers in returned vector with the supplied values in /// reverse order. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. #[cfg_attr(test, assert_instr(vinsertf128))] @@ -1995,7 +1995,7 @@ pub unsafe fn _mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i { /// Broadcast double-precision (64-bit) floating-point value `a` to all /// elements of returned vector. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_set1_pd(a: f64) -> __m256d { @@ -2004,7 +2004,7 @@ pub unsafe fn _mm256_set1_pd(a: f64) -> __m256d { /// Broadcast single-precision (32-bit) floating-point value `a` to all /// elements of returned vector. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_set1_ps(a: f32) -> __m256 { @@ -2013,7 +2013,7 @@ pub unsafe fn _mm256_set1_ps(a: f32) -> __m256 { /// Broadcast 8-bit integer `a` to all elements of returned vector. /// This intrinsic may generate the `vpbroadcastb`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vpshufb))] #[cfg_attr(test, assert_instr(vinsertf128))] @@ -2030,7 +2030,7 @@ pub unsafe fn _mm256_set1_epi8(a: i8) -> __m256i { /// Broadcast 16-bit integer `a` to all all elements of returned vector. /// This intrinsic may generate the `vpbroadcastw`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] //#[cfg_attr(test, assert_instr(vpshufb))] #[cfg_attr(test, assert_instr(vinsertf128))] @@ -2041,7 +2041,7 @@ pub unsafe fn _mm256_set1_epi16(a: i16) -> __m256i { /// Broadcast 32-bit integer `a` to all elements of returned vector. /// This intrinsic may generate the `vpbroadcastd`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_set1_epi32(a: i32) -> __m256i { @@ -2050,7 +2050,7 @@ pub unsafe fn _mm256_set1_epi32(a: i32) -> __m256i { /// Broadcast 64-bit integer `a` to all elements of returned vector. /// This intrinsic may generate the `vpbroadcastq`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] //#[cfg_attr(test, assert_instr(vmovddup))] #[cfg_attr(test, assert_instr(vinsertf128))] @@ -2060,7 +2060,7 @@ pub unsafe fn _mm256_set1_epi64x(a: i64) -> __m256i { } /// Cast vector of type __m256d to type __m256. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. @@ -2069,7 +2069,7 @@ pub unsafe fn _mm256_castpd_ps(a: __m256d) -> __m256 { } /// Cast vector of type __m256 to type __m256d. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. @@ -2078,7 +2078,7 @@ pub unsafe fn _mm256_castps_pd(a: __m256) -> __m256d { } /// Casts vector of type __m256 to type __m256i. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. 
@@ -2087,7 +2087,7 @@ pub unsafe fn _mm256_castps_si256(a: __m256) -> __m256i { } /// Casts vector of type __m256i to type __m256. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. @@ -2096,7 +2096,7 @@ pub unsafe fn _mm256_castsi256_ps(a: __m256i) -> __m256 { } /// Casts vector of type __m256d to type __m256i. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. @@ -2105,7 +2105,7 @@ pub unsafe fn _mm256_castpd_si256(a: __m256d) -> __m256i { } /// Casts vector of type __m256i to type __m256d. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. @@ -2114,7 +2114,7 @@ pub unsafe fn _mm256_castsi256_pd(a: __m256i) -> __m256d { } /// Casts vector of type __m256 to type __m128. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. @@ -2123,7 +2123,7 @@ pub unsafe fn _mm256_castps256_ps128(a: __m256) -> __m128 { } /// Casts vector of type __m256d to type __m128d. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. @@ -2132,7 +2132,7 @@ pub unsafe fn _mm256_castpd256_pd128(a: __m256d) -> __m128d { } /// Casts vector of type __m256i to type __m128i. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. @@ -2144,7 +2144,7 @@ pub unsafe fn _mm256_castsi256_si128(a: __m256i) -> __m128i { /// Casts vector of type __m128 to type __m256; /// the upper 128 bits of the result are undefined. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. @@ -2155,7 +2155,7 @@ pub unsafe fn _mm256_castps128_ps256(a: __m128) -> __m256 { /// Casts vector of type __m128d to type __m256d; /// the upper 128 bits of the result are undefined. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. @@ -2166,7 +2166,7 @@ pub unsafe fn _mm256_castpd128_pd256(a: __m128d) -> __m256d { /// Casts vector of type __m128i to type __m256i; /// the upper 128 bits of the result are undefined. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. @@ -2180,7 +2180,7 @@ pub unsafe fn _mm256_castsi128_si256(a: __m128i) -> __m256i { /// Constructs a 256-bit floating-point vector of [8 x float] from a /// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain /// the value of the source vector. The upper 128 bits are set to zero. -#[inline(always)] +#[inline] #[target_feature(enable = "avx,sse")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. 
@@ -2191,7 +2191,7 @@ pub unsafe fn _mm256_zextps128_ps256(a: __m128) -> __m256 { /// Constructs a 256-bit integer vector from a 128-bit integer vector. /// The lower 128 bits contain the value of the source vector. The upper /// 128 bits are set to zero. -#[inline(always)] +#[inline] #[target_feature(enable = "avx,sse2")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. @@ -2205,7 +2205,7 @@ pub unsafe fn _mm256_zextsi128_si256(a: __m128i) -> __m256i { /// 128-bit floating-point vector of [2 x double]. The lower 128 bits /// contain the value of the source vector. The upper 128 bits are set /// to zero. -#[inline(always)] +#[inline] #[target_feature(enable = "avx,sse2")] // This intrinsic is only used for compilation and does not generate any // instructions, thus it has zero latency. @@ -2214,7 +2214,7 @@ pub unsafe fn _mm256_zextpd128_pd256(a: __m128d) -> __m256d { } /// Return vector of type `__m256` with undefined elements. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_undefined_ps() -> __m256 { @@ -2222,7 +2222,7 @@ pub unsafe fn _mm256_undefined_ps() -> __m256 { } /// Return vector of type `__m256d` with undefined elements. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_undefined_pd() -> __m256d { @@ -2230,7 +2230,7 @@ pub unsafe fn _mm256_undefined_pd() -> __m256d { } /// Return vector of type __m256i with undefined elements. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_undefined_si256() -> __m256i { @@ -2238,7 +2238,7 @@ pub unsafe fn _mm256_undefined_si256() -> __m256i { } /// Set packed __m256 returned vector with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vinsertf128))] pub unsafe fn _mm256_set_m128(hi: __m128, lo: __m128) -> __m256 { @@ -2246,7 +2246,7 @@ pub unsafe fn _mm256_set_m128(hi: __m128, lo: __m128) -> __m256 { } /// Set packed __m256d returned vector with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vinsertf128))] pub unsafe fn _mm256_set_m128d(hi: __m128d, lo: __m128d) -> __m256d { @@ -2256,7 +2256,7 @@ pub unsafe fn _mm256_set_m128d(hi: __m128d, lo: __m128d) -> __m256d { } /// Set packed __m256i returned vector with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vinsertf128))] pub unsafe fn _mm256_set_m128i(hi: __m128i, lo: __m128i) -> __m256i { @@ -2266,7 +2266,7 @@ pub unsafe fn _mm256_set_m128i(hi: __m128i, lo: __m128i) -> __m256i { } /// Set packed __m256 returned vector with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vinsertf128))] pub unsafe fn _mm256_setr_m128(lo: __m128, hi: __m128) -> __m256 { @@ -2274,7 +2274,7 @@ pub unsafe fn _mm256_setr_m128(lo: __m128, hi: __m128) -> __m256 { } /// Set packed __m256d returned vector with the supplied values. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vinsertf128))] pub unsafe fn _mm256_setr_m128d(lo: __m128d, hi: __m128d) -> __m256d { @@ -2282,7 +2282,7 @@ pub unsafe fn _mm256_setr_m128d(lo: __m128d, hi: __m128d) -> __m256d { } /// Set packed __m256i returned vector with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] #[cfg_attr(test, assert_instr(vinsertf128))] pub unsafe fn _mm256_setr_m128i(lo: __m128i, hi: __m128i) -> __m256i { @@ -2293,7 +2293,7 @@ pub unsafe fn _mm256_setr_m128i(lo: __m128i, hi: __m128i) -> __m256i { /// floating-point elements) from memory, and combine them into a 256-bit /// value. /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "avx,sse")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_loadu2_m128( @@ -2307,7 +2307,7 @@ pub unsafe fn _mm256_loadu2_m128( /// floating-point elements) from memory, and combine them into a 256-bit /// value. /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "avx,sse2")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_loadu2_m128d( @@ -2320,7 +2320,7 @@ pub unsafe fn _mm256_loadu2_m128d( /// Load two 128-bit values (composed of integer data) from memory, and combine /// them into a 256-bit value. /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "avx,sse2")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_loadu2_m128i( @@ -2335,7 +2335,7 @@ pub unsafe fn _mm256_loadu2_m128i( /// single-precision (32-bit) floating-point elements) from `a` into memory two /// different 128-bit locations. /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "avx,sse")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_storeu2_m128( @@ -2351,7 +2351,7 @@ pub unsafe fn _mm256_storeu2_m128( /// double-precision (64-bit) floating-point elements) from `a` into memory two /// different 128-bit locations. /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "avx,sse2")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_storeu2_m128d( @@ -2366,7 +2366,7 @@ pub unsafe fn _mm256_storeu2_m128d( /// Store the high and low 128-bit halves (each composed of integer data) from /// `a` into memory two different 128-bit locations. /// `hiaddr` and `loaddr` do not need to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "avx,sse2")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_storeu2_m128i( @@ -2380,7 +2380,7 @@ pub unsafe fn _mm256_storeu2_m128i( } /// Returns the first element of the input vector of [8 x float]. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] //#[cfg_attr(test, assert_instr(movss))] FIXME pub unsafe fn _mm256_cvtss_f32(a: __m256) -> f32 { diff --git a/coresimd/src/x86/i586/avx2.rs b/coresimd/src/x86/i586/avx2.rs index 72892913bb..540031009e 100644 --- a/coresimd/src/x86/i586/avx2.rs +++ b/coresimd/src/x86/i586/avx2.rs @@ -31,7 +31,7 @@ use x86::*; use stdsimd_test::assert_instr; /// Computes the absolute values of packed 32-bit integers in `a`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpabsd))] pub unsafe fn _mm256_abs_epi32(a: __m256i) -> __m256i { @@ -39,7 +39,7 @@ pub unsafe fn _mm256_abs_epi32(a: __m256i) -> __m256i { } /// Computes the absolute values of packed 16-bit integers in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpabsw))] pub unsafe fn _mm256_abs_epi16(a: __m256i) -> __m256i { @@ -47,7 +47,7 @@ pub unsafe fn _mm256_abs_epi16(a: __m256i) -> __m256i { } /// Computes the absolute values of packed 8-bit integers in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpabsb))] pub unsafe fn _mm256_abs_epi8(a: __m256i) -> __m256i { @@ -55,7 +55,7 @@ pub unsafe fn _mm256_abs_epi8(a: __m256i) -> __m256i { } /// Add packed 64-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpaddq))] pub unsafe fn _mm256_add_epi64(a: __m256i, b: __m256i) -> __m256i { @@ -63,7 +63,7 @@ pub unsafe fn _mm256_add_epi64(a: __m256i, b: __m256i) -> __m256i { } /// Add packed 32-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpaddd))] pub unsafe fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -71,7 +71,7 @@ pub unsafe fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i { } /// Add packed 16-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpaddw))] pub unsafe fn _mm256_add_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -79,7 +79,7 @@ pub unsafe fn _mm256_add_epi16(a: __m256i, b: __m256i) -> __m256i { } /// Add packed 8-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpaddb))] pub unsafe fn _mm256_add_epi8(a: __m256i, b: __m256i) -> __m256i { @@ -87,7 +87,7 @@ pub unsafe fn _mm256_add_epi8(a: __m256i, b: __m256i) -> __m256i { } /// Add packed 8-bit integers in `a` and `b` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpaddsb))] pub unsafe fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i { @@ -95,7 +95,7 @@ pub unsafe fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i { } /// Add packed 16-bit integers in `a` and `b` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpaddsw))] pub unsafe fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -103,7 +103,7 @@ pub unsafe fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i { } /// Add packed unsigned 8-bit integers in `a` and `b` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpaddusb))] pub unsafe fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i { @@ -111,7 +111,7 @@ pub unsafe fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i { } /// Add packed unsigned 16-bit integers in `a` and `b` using saturation. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpaddusw))] pub unsafe fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i { @@ -120,7 +120,7 @@ pub unsafe fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i { /// Concatenate pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary /// result, shift the result right by `n` bytes, and return the low 16 bytes. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpalignr, n = 15))] pub unsafe fn _mm256_alignr_epi8(a: __m256i, b: __m256i, n: i32) -> __m256i { @@ -187,7 +187,7 @@ pub unsafe fn _mm256_alignr_epi8(a: __m256i, b: __m256i, n: i32) -> __m256i { /// Compute the bitwise AND of 256 bits (representing integer data) /// in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vandps))] pub unsafe fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i { @@ -196,7 +196,7 @@ pub unsafe fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i { /// Compute the bitwise NOT of 256 bits (representing integer data) /// in `a` and then AND with `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vandnps))] pub unsafe fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i { @@ -205,7 +205,7 @@ pub unsafe fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i { } /// Average packed unsigned 16-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpavgw))] pub unsafe fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i { @@ -213,7 +213,7 @@ pub unsafe fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i { } /// Average packed unsigned 8-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpavgb))] pub unsafe fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i { @@ -221,7 +221,7 @@ pub unsafe fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i { } /// Blend packed 32-bit integers from `a` and `b` using control mask `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpblendd, imm8 = 9))] pub unsafe fn _mm_blend_epi32(a: __m128i, b: __m128i, imm8: i32) -> __m128i { @@ -253,7 +253,7 @@ pub unsafe fn _mm_blend_epi32(a: __m128i, b: __m128i, imm8: i32) -> __m128i { } /// Blend packed 32-bit integers from `a` and `b` using control mask `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpblendd, imm8 = 9))] pub unsafe fn _mm256_blend_epi32(a: __m256i, b: __m256i, imm8: i32) -> __m256i { @@ -305,7 +305,7 @@ pub unsafe fn _mm256_blend_epi32(a: __m256i, b: __m256i, imm8: i32) -> __m256i { } /// Blend packed 16-bit integers from `a` and `b` using control mask `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpblendw, imm8 = 9))] pub unsafe fn _mm256_blend_epi16(a: __m256i, b: __m256i, imm8: i32) -> __m256i { @@ -359,7 +359,7 @@ pub unsafe fn _mm256_blend_epi16(a: __m256i, b: __m256i, imm8: i32) -> __m256i { } /// Blend packed 8-bit integers from `a` and `b` using `mask`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpblendvb))] pub unsafe fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i { @@ -368,7 +368,7 @@ pub unsafe fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m25 /// Broadcast the low packed 8-bit integer from `a` to all elements of /// the 128-bit returned value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpbroadcastb))] pub unsafe fn _mm_broadcastb_epi8(a: __m128i) -> __m128i { @@ -379,7 +379,7 @@ pub unsafe fn _mm_broadcastb_epi8(a: __m128i) -> __m128i { /// Broadcast the low packed 8-bit integer from `a` to all elements of /// the 256-bit returned value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpbroadcastb))] pub unsafe fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i { @@ -392,7 +392,7 @@ pub unsafe fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i { // often compiled to vbroadcastss. /// Broadcast the low packed 32-bit integer from `a` to all elements of /// the 128-bit returned value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vbroadcastss))] pub unsafe fn _mm_broadcastd_epi32(a: __m128i) -> __m128i { @@ -405,7 +405,7 @@ pub unsafe fn _mm_broadcastd_epi32(a: __m128i) -> __m128i { // often compiled to vbroadcastss. /// Broadcast the low packed 32-bit integer from `a` to all elements of /// the 256-bit returned value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vbroadcastss))] pub unsafe fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i { @@ -416,7 +416,7 @@ pub unsafe fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i { /// Broadcast the low packed 64-bit integer from `a` to all elements of /// the 128-bit returned value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpbroadcastq))] pub unsafe fn _mm_broadcastq_epi64(a: __m128i) -> __m128i { @@ -429,7 +429,7 @@ pub unsafe fn _mm_broadcastq_epi64(a: __m128i) -> __m128i { // often compiled to vbroadcastsd. /// Broadcast the low packed 64-bit integer from `a` to all elements of /// the 256-bit returned value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vbroadcastsd))] pub unsafe fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i { @@ -440,7 +440,7 @@ pub unsafe fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i { /// Broadcast the low double-precision (64-bit) floating-point element /// from `a` to all elements of the 128-bit returned value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vmovddup))] pub unsafe fn _mm_broadcastsd_pd(a: __m128d) -> __m128d { @@ -449,7 +449,7 @@ pub unsafe fn _mm_broadcastsd_pd(a: __m128d) -> __m128d { /// Broadcast the low double-precision (64-bit) floating-point element /// from `a` to all elements of the 256-bit returned value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vbroadcastsd))] pub unsafe fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d { @@ -460,7 +460,7 @@ pub unsafe fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d { // vbroadcastf128. /// Broadcast 128 bits of integer data from a to all 128-bit lanes in /// the 256-bit returned value. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] pub unsafe fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i { let zero = _mm_setzero_si128(); @@ -470,7 +470,7 @@ pub unsafe fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i { /// Broadcast the low single-precision (32-bit) floating-point element /// from `a` to all elements of the 128-bit returned value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vbroadcastss))] pub unsafe fn _mm_broadcastss_ps(a: __m128) -> __m128 { @@ -479,7 +479,7 @@ pub unsafe fn _mm_broadcastss_ps(a: __m128) -> __m128 { /// Broadcast the low single-precision (32-bit) floating-point element /// from `a` to all elements of the 256-bit returned value. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vbroadcastss))] pub unsafe fn _mm256_broadcastss_ps(a: __m128) -> __m256 { @@ -488,7 +488,7 @@ pub unsafe fn _mm256_broadcastss_ps(a: __m128) -> __m256 { /// Broadcast the low packed 16-bit integer from a to all elements of /// the 128-bit returned value -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpbroadcastw))] pub unsafe fn _mm_broadcastw_epi16(a: __m128i) -> __m128i { @@ -499,7 +499,7 @@ pub unsafe fn _mm_broadcastw_epi16(a: __m128i) -> __m128i { /// Broadcast the low packed 16-bit integer from a to all elements of /// the 256-bit returned value -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpbroadcastw))] pub unsafe fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i { @@ -509,7 +509,7 @@ pub unsafe fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i { } /// Compare packed 64-bit integers in `a` and `b` for equality. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpcmpeqq))] pub unsafe fn _mm256_cmpeq_epi64(a: __m256i, b: __m256i) -> __m256i { @@ -517,7 +517,7 @@ pub unsafe fn _mm256_cmpeq_epi64(a: __m256i, b: __m256i) -> __m256i { } /// Compare packed 32-bit integers in `a` and `b` for equality. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpcmpeqd))] pub unsafe fn _mm256_cmpeq_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -525,7 +525,7 @@ pub unsafe fn _mm256_cmpeq_epi32(a: __m256i, b: __m256i) -> __m256i { } /// Compare packed 16-bit integers in `a` and `b` for equality. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpcmpeqw))] pub unsafe fn _mm256_cmpeq_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -533,7 +533,7 @@ pub unsafe fn _mm256_cmpeq_epi16(a: __m256i, b: __m256i) -> __m256i { } /// Compare packed 8-bit integers in `a` and `b` for equality. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpcmpeqb))] pub unsafe fn _mm256_cmpeq_epi8(a: __m256i, b: __m256i) -> __m256i { @@ -541,7 +541,7 @@ pub unsafe fn _mm256_cmpeq_epi8(a: __m256i, b: __m256i) -> __m256i { } /// Compare packed 64-bit integers in `a` and `b` for greater-than. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpcmpgtq))] pub unsafe fn _mm256_cmpgt_epi64(a: __m256i, b: __m256i) -> __m256i { @@ -549,7 +549,7 @@ pub unsafe fn _mm256_cmpgt_epi64(a: __m256i, b: __m256i) -> __m256i { } /// Compare packed 32-bit integers in `a` and `b` for greater-than. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpcmpgtd))] pub unsafe fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -557,7 +557,7 @@ pub unsafe fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i { } /// Compare packed 16-bit integers in `a` and `b` for greater-than. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpcmpgtw))] pub unsafe fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -565,7 +565,7 @@ pub unsafe fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i { } /// Compare packed 8-bit integers in `a` and `b` for greater-than. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpcmpgtb))] pub unsafe fn _mm256_cmpgt_epi8(a: __m256i, b: __m256i) -> __m256i { @@ -573,7 +573,7 @@ pub unsafe fn _mm256_cmpgt_epi8(a: __m256i, b: __m256i) -> __m256i { } /// Sign-extend 16-bit integers to 32-bit integers. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovsxwd))] pub unsafe fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i { @@ -581,7 +581,7 @@ pub unsafe fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i { } /// Sign-extend 16-bit integers to 64-bit integers. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovsxwq))] pub unsafe fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i { @@ -591,7 +591,7 @@ pub unsafe fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i { } /// Sign-extend 32-bit integers to 64-bit integers. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovsxdq))] pub unsafe fn _mm256_cvtepi32_epi64(a: __m128i) -> __m256i { @@ -599,7 +599,7 @@ pub unsafe fn _mm256_cvtepi32_epi64(a: __m128i) -> __m256i { } /// Sign-extend 8-bit integers to 16-bit integers. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovsxbw))] pub unsafe fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i { @@ -607,7 +607,7 @@ pub unsafe fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i { } /// Sign-extend 8-bit integers to 32-bit integers. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovsxbd))] pub unsafe fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i { @@ -617,7 +617,7 @@ pub unsafe fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i { } /// Sign-extend 8-bit integers to 64-bit integers. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovsxbq))] pub unsafe fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i { @@ -628,7 +628,7 @@ pub unsafe fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i { /// Zero extend packed unsigned 16-bit integers in `a` to packed 32-bit /// integers, and store the results in dst. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovzxwd))] pub unsafe fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i { @@ -637,7 +637,7 @@ pub unsafe fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i { /// Zero-extend the lower four unsigned 16-bit integers in `a` to 64-bit /// integers. The upper four elements of `a` are unused. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovzxwq))] pub unsafe fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i { @@ -647,7 +647,7 @@ pub unsafe fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i { } /// Zero-extend unsigned 32-bit integers in `a` to 64-bit integers. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovzxdq))] pub unsafe fn _mm256_cvtepu32_epi64(a: __m128i) -> __m256i { @@ -655,7 +655,7 @@ pub unsafe fn _mm256_cvtepu32_epi64(a: __m128i) -> __m256i { } /// Zero-extend unsigned 8-bit integers in `a` to 16-bit integers. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovzxbw))] pub unsafe fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i { @@ -664,7 +664,7 @@ pub unsafe fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i { /// Zero-extend the lower eight unsigned 8-bit integers in `a` to 32-bit /// integers. The upper eight elements of `a` are unused. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovzxbd))] pub unsafe fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i { @@ -675,7 +675,7 @@ pub unsafe fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i { /// Zero-extend the lower four unsigned 8-bit integers in `a` to 64-bit /// integers. The upper twelve elements of `a` are unused. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovzxbq))] pub unsafe fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i { @@ -685,7 +685,7 @@ pub unsafe fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i { } /// Extract 128 bits (of integer data) from `a` selected with `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vextractf128, imm8 = 1))] pub unsafe fn _mm256_extracti128_si256(a: __m256i, imm8: i32) -> __m128i { @@ -699,7 +699,7 @@ pub unsafe fn _mm256_extracti128_si256(a: __m256i, imm8: i32) -> __m128i { } /// Horizontally add adjacent pairs of 16-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vphaddw))] pub unsafe fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -707,7 +707,7 @@ pub unsafe fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i { } /// Horizontally add adjacent pairs of 32-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vphaddd))] pub unsafe fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -716,7 +716,7 @@ pub unsafe fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i { /// Horizontally add adjacent pairs of 16-bit integers in `a` and `b` /// using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vphaddsw))] pub unsafe fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -724,7 +724,7 @@ pub unsafe fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i { } /// Horizontally substract adjacent pairs of 16-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vphsubw))] pub unsafe fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -732,7 +732,7 @@ pub unsafe fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i { } /// Horizontally substract adjacent pairs of 32-bit integers in `a` and `b`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vphsubd))] pub unsafe fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -741,7 +741,7 @@ pub unsafe fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i { /// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b` /// using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vphsubsw))] pub unsafe fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -751,7 +751,7 @@ pub unsafe fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i { /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))] pub unsafe fn _mm_i32gather_epi32( @@ -772,7 +772,7 @@ pub unsafe fn _mm_i32gather_epi32( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))] pub unsafe fn _mm_mask_i32gather_epi32( @@ -792,7 +792,7 @@ pub unsafe fn _mm_mask_i32gather_epi32( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))] pub unsafe fn _mm256_i32gather_epi32( @@ -813,7 +813,7 @@ pub unsafe fn _mm256_i32gather_epi32( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherdd, scale = 1))] pub unsafe fn _mm256_mask_i32gather_epi32( @@ -833,7 +833,7 @@ pub unsafe fn _mm256_mask_i32gather_epi32( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))] pub unsafe fn _mm_i32gather_ps( @@ -853,7 +853,7 @@ pub unsafe fn _mm_i32gather_ps( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))] pub unsafe fn _mm_mask_i32gather_ps( @@ -870,7 +870,7 @@ pub unsafe fn _mm_mask_i32gather_ps( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))] pub unsafe fn _mm256_i32gather_ps( @@ -890,7 +890,7 @@ pub unsafe fn _mm256_i32gather_ps( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherdps, scale = 1))] pub unsafe fn _mm256_mask_i32gather_ps( @@ -907,7 +907,7 @@ pub unsafe fn _mm256_mask_i32gather_ps( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))] pub unsafe fn _mm_i32gather_epi64( @@ -928,7 +928,7 @@ pub unsafe fn _mm_i32gather_epi64( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))] pub unsafe fn _mm_mask_i32gather_epi64( @@ -948,7 +948,7 @@ pub unsafe fn _mm_mask_i32gather_epi64( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))] pub unsafe fn _mm256_i32gather_epi64( @@ -969,7 +969,7 @@ pub unsafe fn _mm256_i32gather_epi64( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherdq, scale = 1))] pub unsafe fn _mm256_mask_i32gather_epi64( @@ -989,7 +989,7 @@ pub unsafe fn _mm256_mask_i32gather_epi64( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))] pub unsafe fn _mm_i32gather_pd( @@ -1009,7 +1009,7 @@ pub unsafe fn _mm_i32gather_pd( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))] pub unsafe fn _mm_mask_i32gather_pd( @@ -1026,7 +1026,7 @@ pub unsafe fn _mm_mask_i32gather_pd( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))] pub unsafe fn _mm256_i32gather_pd( @@ -1046,7 +1046,7 @@ pub unsafe fn _mm256_i32gather_pd( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherdpd, scale = 1))] pub unsafe fn _mm256_mask_i32gather_pd( @@ -1063,7 +1063,7 @@ pub unsafe fn _mm256_mask_i32gather_pd( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))] pub unsafe fn _mm_i64gather_epi32( @@ -1084,7 +1084,7 @@ pub unsafe fn _mm_i64gather_epi32( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))] pub unsafe fn _mm_mask_i64gather_epi32( @@ -1104,7 +1104,7 @@ pub unsafe fn _mm_mask_i64gather_epi32( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))] pub unsafe fn _mm256_i64gather_epi32( @@ -1125,7 +1125,7 @@ pub unsafe fn _mm256_i64gather_epi32( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherqd, scale = 1))] pub unsafe fn _mm256_mask_i64gather_epi32( @@ -1145,7 +1145,7 @@ pub unsafe fn _mm256_mask_i64gather_epi32( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))] pub unsafe fn _mm_i64gather_ps( @@ -1165,7 +1165,7 @@ pub unsafe fn _mm_i64gather_ps( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))] pub unsafe fn _mm_mask_i64gather_ps( @@ -1182,7 +1182,7 @@ pub unsafe fn _mm_mask_i64gather_ps( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))] pub unsafe fn _mm256_i64gather_ps( @@ -1202,7 +1202,7 @@ pub unsafe fn _mm256_i64gather_ps( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherqps, scale = 1))] pub unsafe fn _mm256_mask_i64gather_ps( @@ -1219,7 +1219,7 @@ pub unsafe fn _mm256_mask_i64gather_ps( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))] pub unsafe fn _mm_i64gather_epi64( @@ -1240,7 +1240,7 @@ pub unsafe fn _mm_i64gather_epi64( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))] pub unsafe fn _mm_mask_i64gather_epi64( @@ -1260,7 +1260,7 @@ pub unsafe fn _mm_mask_i64gather_epi64( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))] pub unsafe fn _mm256_i64gather_epi64( @@ -1281,7 +1281,7 @@ pub unsafe fn _mm256_i64gather_epi64( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpgatherqq, scale = 1))] pub unsafe fn _mm256_mask_i64gather_epi64( @@ -1301,7 +1301,7 @@ pub unsafe fn _mm256_mask_i64gather_epi64( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))] pub unsafe fn _mm_i64gather_pd( @@ -1321,7 +1321,7 @@ pub unsafe fn _mm_i64gather_pd( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))] pub unsafe fn _mm_mask_i64gather_pd( @@ -1338,7 +1338,7 @@ pub unsafe fn _mm_mask_i64gather_pd( /// Return values from `slice` at offsets determined by `offsets * scale`, /// where /// `scale` is between 1 and 8. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))] pub unsafe fn _mm256_i64gather_pd( @@ -1358,7 +1358,7 @@ pub unsafe fn _mm256_i64gather_pd( /// where /// `scale` is between 1 and 8. If mask is set, load the value from `src` in /// that position instead. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vgatherqpd, scale = 1))] pub unsafe fn _mm256_mask_i64gather_pd( @@ -1374,7 +1374,7 @@ pub unsafe fn _mm256_mask_i64gather_pd( /// Copy `a` to `dst`, then insert 128 bits (of integer data) from `b` at the /// location specified by `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vinsertf128, imm8 = 1))] pub unsafe fn _mm256_inserti128_si256( @@ -1392,7 +1392,7 @@ pub unsafe fn _mm256_inserti128_si256( /// Multiply packed signed 16-bit integers in `a` and `b`, producing /// intermediate signed 32-bit integers. Horizontally add adjacent pairs /// of intermediate 32-bit integers. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaddwd))] pub unsafe fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -1403,7 +1403,7 @@ pub unsafe fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i { /// corresponding signed 8-bit integer from `b`, producing intermediate /// signed 16-bit integers. Horizontally add adjacent pairs of intermediate /// signed 16-bit integers -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaddubsw))] pub unsafe fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -1413,7 +1413,7 @@ pub unsafe fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i { /// Load packed 32-bit integers from memory pointed by `mem_addr` using `mask` /// (elements are zeroed out when the highest bit is not set in the /// corresponding element). -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaskmovd))] pub unsafe fn _mm_maskload_epi32(mem_addr: *const i32, mask: __m128i) -> __m128i { @@ -1423,7 +1423,7 @@ pub unsafe fn _mm_maskload_epi32(mem_addr: *const i32, mask: __m128i) -> __m128i /// Load packed 32-bit integers from memory pointed by `mem_addr` using `mask` /// (elements are zeroed out when the highest bit is not set in the /// corresponding element). -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaskmovd))] pub unsafe fn _mm256_maskload_epi32( @@ -1435,7 +1435,7 @@ pub unsafe fn _mm256_maskload_epi32( /// Load packed 64-bit integers from memory pointed by `mem_addr` using `mask` /// (elements are zeroed out when the highest bit is not set in the /// corresponding element). 
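// A short sketch of the masked loads above: only lanes whose mask element has
// its most significant bit set are read from memory, the rest come back as
// zero. `_mm_setr_epi32` is assumed available from the SSE2 module;
// `load_lanes_0_and_2` is an illustrative name.
#[target_feature(enable = "avx2")]
unsafe fn load_lanes_0_and_2(mem_addr: *const i32) -> __m128i {
    let mask = _mm_setr_epi32(-1, 0, -1, 0); // load lanes 0 and 2, zero lanes 1 and 3
    _mm_maskload_epi32(mem_addr, mask)
}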
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaskmovq))] pub unsafe fn _mm_maskload_epi64(mem_addr: *const i64, mask: __m128i) -> __m128i { @@ -1445,7 +1445,7 @@ pub unsafe fn _mm_maskload_epi64(mem_addr: *const i64, mask: __m128i) -> __m128i /// Load packed 64-bit integers from memory pointed by `mem_addr` using `mask` /// (elements are zeroed out when the highest bit is not set in the /// corresponding element). -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaskmovq))] pub unsafe fn _mm256_maskload_epi64( @@ -1457,7 +1457,7 @@ pub unsafe fn _mm256_maskload_epi64( /// Store packed 32-bit integers from `a` into memory pointed by `mem_addr` /// using `mask` (elements are not stored when the highest bit is not set /// in the corresponding element). -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaskmovd))] pub unsafe fn _mm_maskstore_epi32(mem_addr: *mut i32, mask: __m128i, a: __m128i) { @@ -1467,7 +1467,7 @@ pub unsafe fn _mm_maskstore_epi32(mem_addr: *mut i32, mask: __m128i, a: __m128i) /// Store packed 32-bit integers from `a` into memory pointed by `mem_addr` /// using `mask` (elements are not stored when the highest bit is not set /// in the corresponding element). -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaskmovd))] pub unsafe fn _mm256_maskstore_epi32( @@ -1479,7 +1479,7 @@ pub unsafe fn _mm256_maskstore_epi32( /// Store packed 64-bit integers from `a` into memory pointed by `mem_addr` /// using `mask` (elements are not stored when the highest bit is not set /// in the corresponding element). -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaskmovq))] pub unsafe fn _mm_maskstore_epi64(mem_addr: *mut i64, mask: __m128i, a: __m128i) { @@ -1489,7 +1489,7 @@ pub unsafe fn _mm_maskstore_epi64(mem_addr: *mut i64, mask: __m128i, a: __m128i) /// Store packed 64-bit integers from `a` into memory pointed by `mem_addr` /// using `mask` (elements are not stored when the highest bit is not set /// in the corresponding element). -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaskmovq))] pub unsafe fn _mm256_maskstore_epi64( @@ -1500,7 +1500,7 @@ pub unsafe fn _mm256_maskstore_epi64( /// Compare packed 16-bit integers in `a` and `b`, and return the packed /// maximum values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaxsw))] pub unsafe fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -1509,7 +1509,7 @@ pub unsafe fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i { /// Compare packed 32-bit integers in `a` and `b`, and return the packed /// maximum values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaxsd))] pub unsafe fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -1518,7 +1518,7 @@ pub unsafe fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i { /// Compare packed 8-bit integers in `a` and `b`, and return the packed /// maximum values. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaxsb))] pub unsafe fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i { @@ -1527,7 +1527,7 @@ pub unsafe fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i { /// Compare packed unsigned 16-bit integers in `a` and `b`, and return /// the packed maximum values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaxuw))] pub unsafe fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i { @@ -1536,7 +1536,7 @@ pub unsafe fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i { /// Compare packed unsigned 32-bit integers in `a` and `b`, and return /// the packed maximum values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaxud))] pub unsafe fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i { @@ -1545,7 +1545,7 @@ pub unsafe fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i { /// Compare packed unsigned 8-bit integers in `a` and `b`, and return /// the packed maximum values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmaxub))] pub unsafe fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i { @@ -1554,7 +1554,7 @@ pub unsafe fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i { /// Compare packed 16-bit integers in `a` and `b`, and return the packed /// minimum values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpminsw))] pub unsafe fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -1563,7 +1563,7 @@ pub unsafe fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i { /// Compare packed 32-bit integers in `a` and `b`, and return the packed /// minimum values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpminsd))] pub unsafe fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -1572,7 +1572,7 @@ pub unsafe fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i { /// Compare packed 8-bit integers in `a` and `b`, and return the packed /// minimum values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpminsb))] pub unsafe fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i { @@ -1581,7 +1581,7 @@ pub unsafe fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i { /// Compare packed unsigned 16-bit integers in `a` and `b`, and return /// the packed minimum values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpminuw))] pub unsafe fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i { @@ -1590,7 +1590,7 @@ pub unsafe fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i { /// Compare packed unsigned 32-bit integers in `a` and `b`, and return /// the packed minimum values. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpminud))] pub unsafe fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i { @@ -1599,7 +1599,7 @@ pub unsafe fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i { /// Compare packed unsigned 8-bit integers in `a` and `b`, and return /// the packed minimum values. 
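// Element-wise maximum with the intrinsic above, sketched with `transmute` so
// the example does not rely on any particular set/extract helper: `__m256i`
// is 256 bits wide, the same size as `[i16; 16]`. `max_i16` is an
// illustrative name only.
#[target_feature(enable = "avx2")]
unsafe fn max_i16(a: [i16; 16], b: [i16; 16]) -> [i16; 16] {
    let va: __m256i = ::core::mem::transmute(a);
    let vb: __m256i = ::core::mem::transmute(b);
    ::core::mem::transmute(_mm256_max_epi16(va, vb))
}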
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpminub))] pub unsafe fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i { @@ -1608,7 +1608,7 @@ pub unsafe fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i { /// Create mask from the most significant bit of each 8-bit element in `a`, /// return the result. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmovmskb))] pub unsafe fn _mm256_movemask_epi8(a: __m256i) -> i32 { @@ -1622,7 +1622,7 @@ pub unsafe fn _mm256_movemask_epi8(a: __m256i) -> i32 { /// selected from `b` starting at on the offset specified in `imm8`. Eight /// quadruplets are formed from sequential 8-bit integers selected from `a` /// starting at the offset specified in `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vmpsadbw, imm8 = 0))] pub unsafe fn _mm256_mpsadbw_epu8(a: __m256i, b: __m256i, imm8: i32) -> __m256i { @@ -1639,7 +1639,7 @@ pub unsafe fn _mm256_mpsadbw_epu8(a: __m256i, b: __m256i, imm8: i32) -> __m256i /// `a` and `b` /// /// Return the 64-bit results. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmuldq))] pub unsafe fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -1650,7 +1650,7 @@ pub unsafe fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i { /// element in `a` and `b` /// /// Return the unsigned 64-bit results. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmuludq))] pub unsafe fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i { @@ -1660,7 +1660,7 @@ pub unsafe fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i { /// Multiply the packed 16-bit integers in `a` and `b`, producing /// intermediate 32-bit integers and returning the high 16 bits of the /// intermediate integers. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmulhw))] pub unsafe fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -1670,7 +1670,7 @@ pub unsafe fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i { /// Multiply the packed unsigned 16-bit integers in `a` and `b`, producing /// intermediate 32-bit integers and returning the high 16 bits of the /// intermediate integers. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmulhuw))] pub unsafe fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i { @@ -1680,7 +1680,7 @@ pub unsafe fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i { /// Multiply the packed 16-bit integers in `a` and `b`, producing /// intermediate 32-bit integers, and return the low 16 bits of the /// intermediate integers -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmullw))] pub unsafe fn _mm256_mullo_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -1690,7 +1690,7 @@ pub unsafe fn _mm256_mullo_epi16(a: __m256i, b: __m256i) -> __m256i { /// Multiply the packed 32-bit integers in `a` and `b`, producing /// intermediate 64-bit integers, and return the low 16 bits of the /// intermediate integers -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmulld))] pub unsafe fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -1701,7 +1701,7 @@ pub unsafe fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i { /// intermediate signed 32-bit integers. 
Truncate each intermediate /// integer to the 18 most significant bits, round by adding 1, and /// return bits [16:1] -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpmulhrsw))] pub unsafe fn _mm256_mulhrs_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -1710,7 +1710,7 @@ pub unsafe fn _mm256_mulhrs_epi16(a: __m256i, b: __m256i) -> __m256i { /// Compute the bitwise OR of 256 bits (representing integer data) in `a` /// and `b` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vorps))] pub unsafe fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i { @@ -1719,7 +1719,7 @@ pub unsafe fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i { /// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers /// using signed saturation -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpacksswb))] pub unsafe fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -1728,7 +1728,7 @@ pub unsafe fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i { /// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers /// using signed saturation -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpackssdw))] pub unsafe fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -1737,7 +1737,7 @@ pub unsafe fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i { /// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers /// using unsigned saturation -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpackuswb))] pub unsafe fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -1746,7 +1746,7 @@ pub unsafe fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i { /// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers /// using unsigned saturation -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpackusdw))] pub unsafe fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -1757,7 +1757,7 @@ pub unsafe fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i { /// /// The last 3 bits of each integer of `b` are used as addresses into the 8 /// integers of `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpermd))] pub unsafe fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -1765,7 +1765,7 @@ pub unsafe fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i { } /// Permutes 64-bit integers from `a` using control mask `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpermq, imm8 = 9))] pub unsafe fn _mm256_permute4x64_epi64(a: __m256i, imm8: i32) -> __m256i { @@ -1817,7 +1817,7 @@ pub unsafe fn _mm256_permute4x64_epi64(a: __m256i, imm8: i32) -> __m256i { } /// Shuffle 128-bits of integer data selected by `imm8` from `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vperm2f128, imm8 = 9))] pub unsafe fn _mm256_permute2x128_si256( @@ -1835,7 +1835,7 @@ pub unsafe fn _mm256_permute2x128_si256( /// Shuffle 64-bit floating-point elements in `a` across lanes using the /// control in `imm8`. 
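// `_mm256_movemask_epi8` above collects the sign bit of each of the 32 bytes
// into an `i32` bitmask. A common pattern is a fast "any byte matched" test
// after a byte-wise compare; this sketch assumes `_mm256_cmpeq_epi8` from
// earlier in this module.
#[target_feature(enable = "avx2")]
unsafe fn any_byte_equal(a: __m256i, b: __m256i) -> bool {
    _mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b)) != 0
}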
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpermpd, imm8 = 1))] pub unsafe fn _mm256_permute4x64_pd(a: __m256d, imm8: i32) -> __m256d { @@ -1887,7 +1887,7 @@ pub unsafe fn _mm256_permute4x64_pd(a: __m256d, imm8: i32) -> __m256d { /// Shuffle eight 32-bit foating-point elements in `a` across lanes using /// the corresponding 32-bit integer index in `idx`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpermps))] pub unsafe fn _mm256_permutevar8x32_ps(a: __m256, idx: __m256i) -> __m256 { @@ -1898,7 +1898,7 @@ pub unsafe fn _mm256_permutevar8x32_ps(a: __m256, idx: __m256i) -> __m256 { /// and `b`, then horizontally sum each consecutive 8 differences to /// produce four unsigned 16-bit integers, and pack these unsigned 16-bit /// integers in the low 16 bits of the 64-bit return value -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsadbw))] pub unsafe fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i { @@ -1934,7 +1934,7 @@ pub unsafe fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i { /// r /// } /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpshufb))] pub unsafe fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i { @@ -1974,7 +1974,7 @@ pub unsafe fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i { /// # } /// # } /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpshufd, imm8 = 9))] pub unsafe fn _mm256_shuffle_epi32(a: __m256i, imm8: i32) -> __m256i { @@ -2035,7 +2035,7 @@ pub unsafe fn _mm256_shuffle_epi32(a: __m256i, imm8: i32) -> __m256i { /// Shuffle 16-bit integers in the high 64 bits of 128-bit lanes of `a` using /// the control in `imm8`. The low 64 bits of 128-bit lanes of `a` are copied /// to the output. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpshufhw, imm8 = 9))] pub unsafe fn _mm256_shufflehi_epi16(a: __m256i, imm8: i32) -> __m256i { @@ -2092,7 +2092,7 @@ pub unsafe fn _mm256_shufflehi_epi16(a: __m256i, imm8: i32) -> __m256i { /// Shuffle 16-bit integers in the low 64 bits of 128-bit lanes of `a` using /// the control in `imm8`. The high 64 bits of 128-bit lanes of `a` are copied /// to the output. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpshuflw, imm8 = 9))] pub unsafe fn _mm256_shufflelo_epi16(a: __m256i, imm8: i32) -> __m256i { @@ -2149,7 +2149,7 @@ pub unsafe fn _mm256_shufflelo_epi16(a: __m256i, imm8: i32) -> __m256i { /// Negate packed 16-bit integers in `a` when the corresponding signed /// 16-bit integer in `b` is negative, and return the results. /// Results are zeroed out when the corresponding element in `b` is zero. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsignw))] pub unsafe fn _mm256_sign_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -2159,7 +2159,7 @@ pub unsafe fn _mm256_sign_epi16(a: __m256i, b: __m256i) -> __m256i { /// Negate packed 32-bit integers in `a` when the corresponding signed /// 32-bit integer in `b` is negative, and return the results. /// Results are zeroed out when the corresponding element in `b` is zero. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsignd))] pub unsafe fn _mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -2169,7 +2169,7 @@ pub unsafe fn _mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i { /// Negate packed 8-bit integers in `a` when the corresponding signed /// 8-bit integer in `b` is negative, and return the results. /// Results are zeroed out when the corresponding element in `b` is zero. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsignb))] pub unsafe fn _mm256_sign_epi8(a: __m256i, b: __m256i) -> __m256i { @@ -2178,7 +2178,7 @@ pub unsafe fn _mm256_sign_epi8(a: __m256i, b: __m256i) -> __m256i { /// Shift packed 16-bit integers in `a` left by `count` while /// shifting in zeros, and return the result -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsllw))] pub unsafe fn _mm256_sll_epi16(a: __m256i, count: __m128i) -> __m256i { @@ -2187,7 +2187,7 @@ pub unsafe fn _mm256_sll_epi16(a: __m256i, count: __m128i) -> __m256i { /// Shift packed 32-bit integers in `a` left by `count` while /// shifting in zeros, and return the result -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpslld))] pub unsafe fn _mm256_sll_epi32(a: __m256i, count: __m128i) -> __m256i { @@ -2196,7 +2196,7 @@ pub unsafe fn _mm256_sll_epi32(a: __m256i, count: __m128i) -> __m256i { /// Shift packed 64-bit integers in `a` left by `count` while /// shifting in zeros, and return the result -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsllq))] pub unsafe fn _mm256_sll_epi64(a: __m256i, count: __m128i) -> __m256i { @@ -2205,7 +2205,7 @@ pub unsafe fn _mm256_sll_epi64(a: __m256i, count: __m128i) -> __m256i { /// Shift packed 16-bit integers in `a` left by `imm8` while /// shifting in zeros, return the results; -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsllw))] pub unsafe fn _mm256_slli_epi16(a: __m256i, imm8: i32) -> __m256i { @@ -2214,7 +2214,7 @@ pub unsafe fn _mm256_slli_epi16(a: __m256i, imm8: i32) -> __m256i { /// Shift packed 32-bit integers in `a` left by `imm8` while /// shifting in zeros, return the results; -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpslld))] pub unsafe fn _mm256_slli_epi32(a: __m256i, imm8: i32) -> __m256i { @@ -2223,7 +2223,7 @@ pub unsafe fn _mm256_slli_epi32(a: __m256i, imm8: i32) -> __m256i { /// Shift packed 64-bit integers in `a` left by `imm8` while /// shifting in zeros, return the results; -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsllq))] pub unsafe fn _mm256_slli_epi64(a: __m256i, imm8: i32) -> __m256i { @@ -2231,7 +2231,7 @@ pub unsafe fn _mm256_slli_epi64(a: __m256i, imm8: i32) -> __m256i { } /// Shift 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpslldq, imm8 = 3))] pub unsafe fn _mm256_slli_si256(a: __m256i, imm8: i32) -> __m256i { @@ -2245,7 +2245,7 @@ pub unsafe fn _mm256_slli_si256(a: __m256i, imm8: i32) -> __m256i { } /// Shift 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros. 
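// In this version of the crate the `slli` forms above take the shift count as
// a plain `i32` argument; every 32-bit lane is shifted by the same amount and
// zeros are shifted in. `times_four` is an illustrative name.
#[target_feature(enable = "avx2")]
unsafe fn times_four(a: __m256i) -> __m256i {
    _mm256_slli_epi32(a, 2) // each lane: x << 2 == x * 4
}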
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpslldq, imm8 = 3))] pub unsafe fn _mm256_bslli_epi128(a: __m256i, imm8: i32) -> __m256i { @@ -2255,7 +2255,7 @@ pub unsafe fn _mm256_bslli_epi128(a: __m256i, imm8: i32) -> __m256i { /// Shift packed 32-bit integers in `a` left by the amount /// specified by the corresponding element in `count` while /// shifting in zeros, and return the result. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsllvd))] pub unsafe fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i { @@ -2265,7 +2265,7 @@ pub unsafe fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i { /// Shift packed 32-bit integers in `a` left by the amount /// specified by the corresponding element in `count` while /// shifting in zeros, and return the result. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsllvd))] pub unsafe fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i { @@ -2275,7 +2275,7 @@ pub unsafe fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i { /// Shift packed 64-bit integers in `a` left by the amount /// specified by the corresponding element in `count` while /// shifting in zeros, and return the result. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsllvq))] pub unsafe fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i { @@ -2285,7 +2285,7 @@ pub unsafe fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i { /// Shift packed 64-bit integers in `a` left by the amount /// specified by the corresponding element in `count` while /// shifting in zeros, and return the result. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsllvq))] pub unsafe fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i { @@ -2294,7 +2294,7 @@ pub unsafe fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i { /// Shift packed 16-bit integers in `a` right by `count` while /// shifting in sign bits. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsraw))] pub unsafe fn _mm256_sra_epi16(a: __m256i, count: __m128i) -> __m256i { @@ -2303,7 +2303,7 @@ pub unsafe fn _mm256_sra_epi16(a: __m256i, count: __m128i) -> __m256i { /// Shift packed 32-bit integers in `a` right by `count` while /// shifting in sign bits. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrad))] pub unsafe fn _mm256_sra_epi32(a: __m256i, count: __m128i) -> __m256i { @@ -2312,7 +2312,7 @@ pub unsafe fn _mm256_sra_epi32(a: __m256i, count: __m128i) -> __m256i { /// Shift packed 16-bit integers in `a` right by `imm8` while /// shifting in sign bits. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsraw))] pub unsafe fn _mm256_srai_epi16(a: __m256i, imm8: i32) -> __m256i { @@ -2321,7 +2321,7 @@ pub unsafe fn _mm256_srai_epi16(a: __m256i, imm8: i32) -> __m256i { /// Shift packed 32-bit integers in `a` right by `imm8` while /// shifting in sign bits. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrad))] pub unsafe fn _mm256_srai_epi32(a: __m256i, imm8: i32) -> __m256i { @@ -2330,7 +2330,7 @@ pub unsafe fn _mm256_srai_epi32(a: __m256i, imm8: i32) -> __m256i { /// Shift packed 32-bit integers in `a` right by the amount specified by the /// corresponding element in `count` while shifting in sign bits. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsravd))] pub unsafe fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i { @@ -2339,7 +2339,7 @@ pub unsafe fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i { /// Shift packed 32-bit integers in `a` right by the amount specified by the /// corresponding element in `count` while shifting in sign bits. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsravd))] pub unsafe fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i { @@ -2347,7 +2347,7 @@ pub unsafe fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i { } /// Shift 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrldq, imm8 = 3))] pub unsafe fn _mm256_srli_si256(a: __m256i, imm8: i32) -> __m256i { @@ -2361,7 +2361,7 @@ pub unsafe fn _mm256_srli_si256(a: __m256i, imm8: i32) -> __m256i { } /// Shift 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrldq, imm8 = 3))] pub unsafe fn _mm256_bsrli_epi128(a: __m256i, imm8: i32) -> __m256i { @@ -2370,7 +2370,7 @@ pub unsafe fn _mm256_bsrli_epi128(a: __m256i, imm8: i32) -> __m256i { /// Shift packed 16-bit integers in `a` right by `count` while shifting in /// zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrlw))] pub unsafe fn _mm256_srl_epi16(a: __m256i, count: __m128i) -> __m256i { @@ -2379,7 +2379,7 @@ pub unsafe fn _mm256_srl_epi16(a: __m256i, count: __m128i) -> __m256i { /// Shift packed 32-bit integers in `a` right by `count` while shifting in /// zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrld))] pub unsafe fn _mm256_srl_epi32(a: __m256i, count: __m128i) -> __m256i { @@ -2388,7 +2388,7 @@ pub unsafe fn _mm256_srl_epi32(a: __m256i, count: __m128i) -> __m256i { /// Shift packed 64-bit integers in `a` right by `count` while shifting in /// zeros. 
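// `_mm_srav_epi32` above shifts every 32-bit lane right by the amount held in
// the corresponding lane of `count`, replicating the sign bit, so negative
// lanes stay negative. `_mm_setr_epi32` is assumed available from the SSE2
// module; the helper name is illustrative.
#[target_feature(enable = "avx2")]
unsafe fn per_lane_arithmetic_shift(a: __m128i) -> __m128i {
    let count = _mm_setr_epi32(0, 1, 2, 3); // lane i shifted right by i
    _mm_srav_epi32(a, count)
}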
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrlq))] pub unsafe fn _mm256_srl_epi64(a: __m256i, count: __m128i) -> __m256i { @@ -2397,7 +2397,7 @@ pub unsafe fn _mm256_srl_epi64(a: __m256i, count: __m128i) -> __m256i { /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in /// zeros -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrlw))] pub unsafe fn _mm256_srli_epi16(a: __m256i, imm8: i32) -> __m256i { @@ -2406,7 +2406,7 @@ pub unsafe fn _mm256_srli_epi16(a: __m256i, imm8: i32) -> __m256i { /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in /// zeros -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrld))] pub unsafe fn _mm256_srli_epi32(a: __m256i, imm8: i32) -> __m256i { @@ -2415,7 +2415,7 @@ pub unsafe fn _mm256_srli_epi32(a: __m256i, imm8: i32) -> __m256i { /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in /// zeros -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrlq))] pub unsafe fn _mm256_srli_epi64(a: __m256i, imm8: i32) -> __m256i { @@ -2424,7 +2424,7 @@ pub unsafe fn _mm256_srli_epi64(a: __m256i, imm8: i32) -> __m256i { /// Shift packed 32-bit integers in `a` right by the amount specified by /// the corresponding element in `count` while shifting in zeros, -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrlvd))] pub unsafe fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i { @@ -2433,7 +2433,7 @@ pub unsafe fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i { /// Shift packed 32-bit integers in `a` right by the amount specified by /// the corresponding element in `count` while shifting in zeros, -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrlvd))] pub unsafe fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i { @@ -2442,7 +2442,7 @@ pub unsafe fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i { /// Shift packed 64-bit integers in `a` right by the amount specified by /// the corresponding element in `count` while shifting in zeros, -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrlvq))] pub unsafe fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i { @@ -2451,7 +2451,7 @@ pub unsafe fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i { /// Shift packed 64-bit integers in `a` right by the amount specified by /// the corresponding element in `count` while shifting in zeros, -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsrlvq))] pub unsafe fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i { @@ -2461,7 +2461,7 @@ pub unsafe fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i { // TODO _mm256_stream_load_si256 (__m256i const* mem_addr) /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubw))] pub unsafe fn _mm256_sub_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -2469,7 +2469,7 @@ pub unsafe fn _mm256_sub_epi16(a: __m256i, b: __m256i) -> __m256i { } /// Subtract packed 32-bit integers in `b` from packed 16-bit integers in `a` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubd))] pub unsafe fn 
_mm256_sub_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -2477,7 +2477,7 @@ pub unsafe fn _mm256_sub_epi32(a: __m256i, b: __m256i) -> __m256i { } /// Subtract packed 64-bit integers in `b` from packed 16-bit integers in `a` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubq))] pub unsafe fn _mm256_sub_epi64(a: __m256i, b: __m256i) -> __m256i { @@ -2485,7 +2485,7 @@ pub unsafe fn _mm256_sub_epi64(a: __m256i, b: __m256i) -> __m256i { } /// Subtract packed 8-bit integers in `b` from packed 16-bit integers in `a` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubb))] pub unsafe fn _mm256_sub_epi8(a: __m256i, b: __m256i) -> __m256i { @@ -2494,7 +2494,7 @@ pub unsafe fn _mm256_sub_epi8(a: __m256i, b: __m256i) -> __m256i { /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in /// `a` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubsw))] pub unsafe fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -2503,7 +2503,7 @@ pub unsafe fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i { /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in /// `a` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubsb))] pub unsafe fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i { @@ -2512,7 +2512,7 @@ pub unsafe fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i { /// Subtract packed unsigned 16-bit integers in `b` from packed 16-bit /// integers in `a` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubusw))] pub unsafe fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i { @@ -2521,7 +2521,7 @@ pub unsafe fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i { /// Subtract packed unsigned 8-bit integers in `b` from packed 8-bit /// integers in `a` using saturation. 
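// The saturating forms above clamp instead of wrapping: for unsigned 16-bit
// lanes, 5 - 10 yields 0 rather than 65531. A minimal sketch:
#[target_feature(enable = "avx2")]
unsafe fn clamped_diff_u16(a: __m256i, b: __m256i) -> __m256i {
    _mm256_subs_epu16(a, b) // per lane: a - b if a >= b, else 0
}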
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpsubusb))] pub unsafe fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i { @@ -2560,7 +2560,7 @@ pub unsafe fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i { /// # } /// # } /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpunpckhbw))] pub unsafe fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i { @@ -2605,7 +2605,7 @@ pub unsafe fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i { /// # } /// # } /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpunpcklbw))] pub unsafe fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i { @@ -2648,7 +2648,7 @@ pub unsafe fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i { /// # } /// # } /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpunpckhwd))] pub unsafe fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -2689,7 +2689,7 @@ pub unsafe fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i { /// # } /// # } /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpunpcklwd))] pub unsafe fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i { @@ -2729,7 +2729,7 @@ pub unsafe fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i { /// # } /// # } /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpunpckhdq))] pub unsafe fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -2765,7 +2765,7 @@ pub unsafe fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i { /// # } /// # } /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpunpckldq))] pub unsafe fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i { @@ -2801,7 +2801,7 @@ pub unsafe fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i { /// # } /// # } /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpunpckhqdq))] pub unsafe fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i { @@ -2837,7 +2837,7 @@ pub unsafe fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i { /// # } /// # } /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vpunpcklqdq))] pub unsafe fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i { @@ -2847,7 +2847,7 @@ pub unsafe fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i { /// Compute the bitwise XOR of 256 bits (representing integer data) /// in `a` and `b` -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] #[cfg_attr(test, assert_instr(vxorps))] pub unsafe fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i { @@ -2858,7 +2858,7 @@ pub unsafe fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i { /// integer containing the zero-extended integer data. /// /// See [LLVM commit D20468][https://reviews.llvm.org/D20468]. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_extract_epi8(a: __m256i, imm8: i32) -> i8 { @@ -2870,7 +2870,7 @@ pub unsafe fn _mm256_extract_epi8(a: __m256i, imm8: i32) -> i8 { /// integer containing the zero-extended integer data. /// /// See [LLVM commit D20468][https://reviews.llvm.org/D20468]. 
-#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_extract_epi16(a: __m256i, imm8: i32) -> i16 { @@ -2879,7 +2879,7 @@ pub unsafe fn _mm256_extract_epi16(a: __m256i, imm8: i32) -> i16 { } /// Extract a 32-bit integer from `a`, selected with `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_extract_epi32(a: __m256i, imm8: i32) -> i32 { @@ -2888,7 +2888,7 @@ pub unsafe fn _mm256_extract_epi32(a: __m256i, imm8: i32) -> i32 { } /// Returns the first element of the input vector of [4 x double]. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] //#[cfg_attr(test, assert_instr(movsd))] FIXME pub unsafe fn _mm256_cvtsd_f64(a: __m256d) -> f64 { @@ -2896,7 +2896,7 @@ pub unsafe fn _mm256_cvtsd_f64(a: __m256d) -> f64 { } /// Returns the first element of the input vector of [8 x i32]. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] //#[cfg_attr(test, assert_instr(movd))] FIXME pub unsafe fn _mm256_cvtsi256_si32(a: __m256i) -> i32 { diff --git a/coresimd/src/x86/i586/bmi.rs b/coresimd/src/x86/i586/bmi.rs index 9b3eee2aa0..512695e049 100644 --- a/coresimd/src/x86/i586/bmi.rs +++ b/coresimd/src/x86/i586/bmi.rs @@ -14,7 +14,7 @@ use stdsimd_test::assert_instr; /// Extracts bits in range [`start`, `start` + `length`) from `a` into /// the least significant bits of the result. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(bextr))] pub unsafe fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 { @@ -26,7 +26,7 @@ pub unsafe fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 { /// /// Bits [7,0] of `control` specify the index to the first bit in the range to /// be extracted, and bits [15,8] specify the length of the range. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(bextr))] pub unsafe fn _bextr2_u32(a: u32, control: u32) -> u32 { @@ -34,7 +34,7 @@ pub unsafe fn _bextr2_u32(a: u32, control: u32) -> u32 { } /// Bitwise logical `AND` of inverted `a` with `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(andn))] pub unsafe fn _andn_u32(a: u32, b: u32) -> u32 { @@ -42,7 +42,7 @@ pub unsafe fn _andn_u32(a: u32, b: u32) -> u32 { } /// Extract lowest set isolated bit. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(blsi))] pub unsafe fn _blsi_u32(x: u32) -> u32 { @@ -50,7 +50,7 @@ pub unsafe fn _blsi_u32(x: u32) -> u32 { } /// Get mask up to lowest set bit. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(blsmsk))] pub unsafe fn _blsmsk_u32(x: u32) -> u32 { @@ -60,7 +60,7 @@ pub unsafe fn _blsmsk_u32(x: u32) -> u32 { /// Resets the lowest set bit of `x`. /// /// If `x` is sets CF. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(blsr))] pub unsafe fn _blsr_u32(x: u32) -> u32 { @@ -70,7 +70,7 @@ pub unsafe fn _blsr_u32(x: u32) -> u32 { /// Counts the number of trailing least significant zero bits. /// /// When the source operand is 0, it returns its size in bits. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(tzcnt))] pub unsafe fn _tzcnt_u32(x: u32) -> u32 { @@ -80,7 +80,7 @@ pub unsafe fn _tzcnt_u32(x: u32) -> u32 { /// Counts the number of trailing least significant zero bits. 
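// Worked values for the scalar BMI1 helpers above, easy to check by hand:
// `_bextr_u32` extracts `len` bits starting at bit `start`, `_blsi_u32`
// isolates the lowest set bit, and `_tzcnt_u32` counts trailing zero bits.
#[target_feature(enable = "bmi")]
unsafe fn bmi1_worked_examples() -> (u32, u32, u32) {
    let x = 0b1011_0000u32;
    (
        _bextr_u32(x, 4, 4), // bits [4, 8) of x -> 0b1011 == 11
        _blsi_u32(x),        // lowest set bit   -> 0b0001_0000 == 16
        _tzcnt_u32(x),       // trailing zeros   -> 4
    )
}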
/// /// When the source operand is 0, it returns its size in bits. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(tzcnt))] pub unsafe fn _mm_tzcnt_32(x: u32) -> i32 { diff --git a/coresimd/src/x86/i586/bmi2.rs b/coresimd/src/x86/i586/bmi2.rs index adc963e0f5..e4d393f990 100644 --- a/coresimd/src/x86/i586/bmi2.rs +++ b/coresimd/src/x86/i586/bmi2.rs @@ -17,7 +17,7 @@ use stdsimd_test::assert_instr; /// /// Unsigned multiplication of `a` with `b` returning a pair `(lo, hi)` with /// the low half and the high half of the result. -#[inline(always)] +#[inline] // LLVM BUG (should be mulxl): https://bugs.llvm.org/show_bug.cgi?id=34232 #[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(imul))] #[cfg_attr(all(test, target_arch = "x86"), assert_instr(mulx))] @@ -29,7 +29,7 @@ pub unsafe fn _mulx_u32(a: u32, b: u32, hi: &mut u32) -> u32 { } /// Zero higher bits of `a` >= `index`. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi2")] #[cfg_attr(test, assert_instr(bzhi))] pub unsafe fn _bzhi_u32(a: u32, index: u32) -> u32 { @@ -38,7 +38,7 @@ pub unsafe fn _bzhi_u32(a: u32, index: u32) -> u32 { /// Scatter contiguous low order bits of `a` to the result at the positions /// specified by the `mask`. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi2")] #[cfg_attr(test, assert_instr(pdep))] pub unsafe fn _pdep_u32(a: u32, mask: u32) -> u32 { @@ -47,7 +47,7 @@ pub unsafe fn _pdep_u32(a: u32, mask: u32) -> u32 { /// Gathers the bits of `x` specified by the `mask` into the contiguous low /// order bit positions of the result. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi2")] #[cfg_attr(test, assert_instr(pext))] pub unsafe fn _pext_u32(a: u32, mask: u32) -> u32 { diff --git a/coresimd/src/x86/i586/bswap.rs b/coresimd/src/x86/i586/bswap.rs index 8bac167569..92f1634bf5 100644 --- a/coresimd/src/x86/i586/bswap.rs +++ b/coresimd/src/x86/i586/bswap.rs @@ -6,14 +6,14 @@ use stdsimd_test::assert_instr; /// Return an integer with the reversed byte order of x -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(bswap))] pub unsafe fn _bswap(x: i32) -> i32 { bswap_i32(x) } /// Return an integer with the reversed byte order of x -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(bswap))] pub unsafe fn _bswap64(x: i64) -> i64 { bswap_i64(x) diff --git a/coresimd/src/x86/i586/cpuid.rs b/coresimd/src/x86/i586/cpuid.rs index 2480eb58e0..eeb7ac3681 100644 --- a/coresimd/src/x86/i586/cpuid.rs +++ b/coresimd/src/x86/i586/cpuid.rs @@ -42,7 +42,7 @@ pub struct CpuidResult { /// [wiki_cpuid]: https://en.wikipedia.org/wiki/CPUID /// [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf /// [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(cpuid))] pub unsafe fn __cpuid_count(leaf: u32, sub_leaf: u32) -> CpuidResult { let mut r = ::core::mem::uninitialized::(); @@ -62,14 +62,14 @@ pub unsafe fn __cpuid_count(leaf: u32, sub_leaf: u32) -> CpuidResult { } /// See [`__cpuid_count`](fn.__cpuid_count.html). -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(cpuid))] pub unsafe fn __cpuid(leaf: u32) -> CpuidResult { __cpuid_count(leaf, 0) } /// Does the host support the `cpuid` instruction? 
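// `_pdep_u32` scatters the low-order bits of `a` into the bit positions
// selected by `mask`, and `_pext_u32` gathers them back out; worked values:
#[target_feature(enable = "bmi2")]
unsafe fn bmi2_worked_examples() -> (u32, u32) {
    (
        _pdep_u32(0b1010, 0b1111_0000),      // -> 0b1010_0000 == 160
        _pext_u32(0b1010_0000, 0b1111_0000), // -> 0b1010 == 10
    )
}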
-#[inline(always)] +#[inline] pub fn has_cpuid() -> bool { #[cfg(target_arch = "x86_64")] { @@ -111,7 +111,7 @@ pub fn has_cpuid() -> bool { /// /// See also [`__cpuid`](fn.__cpuid.html) and /// [`__cpuid_count`](fn.__cpuid_count.html). -#[inline(always)] +#[inline] pub unsafe fn __get_cpuid_max(leaf: u32) -> (u32, u32) { let CpuidResult { eax, ebx, .. } = __cpuid(leaf); (eax, ebx) diff --git a/coresimd/src/x86/i586/rdtsc.rs b/coresimd/src/x86/i586/rdtsc.rs index f9929aaa6b..9649562cdc 100644 --- a/coresimd/src/x86/i586/rdtsc.rs +++ b/coresimd/src/x86/i586/rdtsc.rs @@ -15,7 +15,7 @@ use stdsimd_test::assert_instr; /// /// On processors that support the Intel 64 architecture, the /// high-order 32 bits of each of RAX and RDX are cleared. -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(rdtsc))] pub unsafe fn _rdtsc() -> u64 { rdtsc() @@ -35,7 +35,7 @@ pub unsafe fn _rdtsc() -> u64 { /// /// On processors that support the Intel 64 architecture, the /// high-order 32 bits of each of RAX, RDX, and RCX are cleared. -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(rdtscp))] pub unsafe fn _rdtscp(aux: *mut u32) -> u64 { rdtscp(aux as *mut _) diff --git a/coresimd/src/x86/i586/sse.rs b/coresimd/src/x86/i586/sse.rs index 8911429a48..57b3f42a24 100644 --- a/coresimd/src/x86/i586/sse.rs +++ b/coresimd/src/x86/i586/sse.rs @@ -13,7 +13,7 @@ use stdsimd_test::assert_instr; /// Adds the first component of `a` and `b`, the other components are copied /// from `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(addss))] pub unsafe fn _mm_add_ss(a: __m128, b: __m128) -> __m128 { @@ -21,7 +21,7 @@ pub unsafe fn _mm_add_ss(a: __m128, b: __m128) -> __m128 { } /// Adds __m128 vectors. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(addps))] pub unsafe fn _mm_add_ps(a: __m128, b: __m128) -> __m128 { @@ -30,7 +30,7 @@ pub unsafe fn _mm_add_ps(a: __m128, b: __m128) -> __m128 { /// Subtracts the first component of `b` from `a`, the other components are /// copied from `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(subss))] pub unsafe fn _mm_sub_ss(a: __m128, b: __m128) -> __m128 { @@ -38,7 +38,7 @@ pub unsafe fn _mm_sub_ss(a: __m128, b: __m128) -> __m128 { } /// Subtracts __m128 vectors. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(subps))] pub unsafe fn _mm_sub_ps(a: __m128, b: __m128) -> __m128 { @@ -47,7 +47,7 @@ pub unsafe fn _mm_sub_ps(a: __m128, b: __m128) -> __m128 { /// Multiplies the first component of `a` and `b`, the other components are /// copied from `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(mulss))] pub unsafe fn _mm_mul_ss(a: __m128, b: __m128) -> __m128 { @@ -55,7 +55,7 @@ pub unsafe fn _mm_mul_ss(a: __m128, b: __m128) -> __m128 { } /// Multiplies __m128 vectors. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(mulps))] pub unsafe fn _mm_mul_ps(a: __m128, b: __m128) -> __m128 { @@ -64,7 +64,7 @@ pub unsafe fn _mm_mul_ps(a: __m128, b: __m128) -> __m128 { /// Divides the first component of `b` by `a`, the other components are /// copied from `a`. 
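// Typical use of the `cpuid` helpers above: gate on `has_cpuid()` (the
// instruction can be absent on very old 32-bit CPUs), then query leaf 0,
// where eax reports the highest supported leaf and ebx/edx/ecx hold the
// 12-byte vendor string. `max_leaf_and_vendor` is an illustrative name.
fn max_leaf_and_vendor() -> Option<(u32, [u32; 3])> {
    if !has_cpuid() {
        return None;
    }
    let r = unsafe { __cpuid(0) };
    Some((r.eax, [r.ebx, r.edx, r.ecx]))
}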
-#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(divss))] pub unsafe fn _mm_div_ss(a: __m128, b: __m128) -> __m128 { @@ -72,7 +72,7 @@ pub unsafe fn _mm_div_ss(a: __m128, b: __m128) -> __m128 { } /// Divides __m128 vectors. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(divps))] pub unsafe fn _mm_div_ps(a: __m128, b: __m128) -> __m128 { @@ -81,7 +81,7 @@ pub unsafe fn _mm_div_ps(a: __m128, b: __m128) -> __m128 { /// Return the square root of the first single-precision (32-bit) /// floating-point element in `a`, the other elements are unchanged. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(sqrtss))] pub unsafe fn _mm_sqrt_ss(a: __m128) -> __m128 { @@ -90,7 +90,7 @@ pub unsafe fn _mm_sqrt_ss(a: __m128) -> __m128 { /// Return the square root of packed single-precision (32-bit) floating-point /// elements in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(sqrtps))] pub unsafe fn _mm_sqrt_ps(a: __m128) -> __m128 { @@ -99,7 +99,7 @@ pub unsafe fn _mm_sqrt_ps(a: __m128) -> __m128 { /// Return the approximate reciprocal of the first single-precision /// (32-bit) floating-point element in `a`, the other elements are unchanged. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(rcpss))] pub unsafe fn _mm_rcp_ss(a: __m128) -> __m128 { @@ -108,7 +108,7 @@ pub unsafe fn _mm_rcp_ss(a: __m128) -> __m128 { /// Return the approximate reciprocal of packed single-precision (32-bit) /// floating-point elements in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(rcpps))] pub unsafe fn _mm_rcp_ps(a: __m128) -> __m128 { @@ -117,7 +117,7 @@ pub unsafe fn _mm_rcp_ps(a: __m128) -> __m128 { /// Return the approximate reciprocal square root of the fist single-precision /// (32-bit) floating-point elements in `a`, the other elements are unchanged. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(rsqrtss))] pub unsafe fn _mm_rsqrt_ss(a: __m128) -> __m128 { @@ -126,7 +126,7 @@ pub unsafe fn _mm_rsqrt_ss(a: __m128) -> __m128 { /// Return the approximate reciprocal square root of packed single-precision /// (32-bit) floating-point elements in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(rsqrtps))] pub unsafe fn _mm_rsqrt_ps(a: __m128) -> __m128 { @@ -136,7 +136,7 @@ pub unsafe fn _mm_rsqrt_ps(a: __m128) -> __m128 { /// Compare the first single-precision (32-bit) floating-point element of `a` /// and `b`, and return the minimum value in the first element of the return /// value, the other elements are copied from `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(minss))] pub unsafe fn _mm_min_ss(a: __m128, b: __m128) -> __m128 { @@ -145,7 +145,7 @@ pub unsafe fn _mm_min_ss(a: __m128, b: __m128) -> __m128 { /// Compare packed single-precision (32-bit) floating-point elements in `a` and /// `b`, and return the corresponding minimum values. 
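// The `_ss` forms above touch lane 0 only and copy lanes 1..3 from `a`, while
// the `_ps` forms apply to all four lanes. Sketched with `transmute` (an
// `__m128` is the same size as `[f32; 4]`) to avoid assuming any particular
// set/extract helper; `min_lane0_only` is an illustrative name.
#[target_feature(enable = "sse")]
unsafe fn min_lane0_only(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
    let va: __m128 = ::core::mem::transmute(a);
    let vb: __m128 = ::core::mem::transmute(b);
    // lane 0 = min(a[0], b[0]); lanes 1..3 copied from `a`
    ::core::mem::transmute(_mm_min_ss(va, vb))
}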
-#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(minps))] pub unsafe fn _mm_min_ps(a: __m128, b: __m128) -> __m128 { @@ -155,7 +155,7 @@ pub unsafe fn _mm_min_ps(a: __m128, b: __m128) -> __m128 { /// Compare the first single-precision (32-bit) floating-point element of `a` /// and `b`, and return the maximum value in the first element of the return /// value, the other elements are copied from `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(maxss))] pub unsafe fn _mm_max_ss(a: __m128, b: __m128) -> __m128 { @@ -164,7 +164,7 @@ pub unsafe fn _mm_max_ss(a: __m128, b: __m128) -> __m128 { /// Compare packed single-precision (32-bit) floating-point elements in `a` and /// `b`, and return the corresponding maximum values. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(maxps))] pub unsafe fn _mm_max_ps(a: __m128, b: __m128) -> __m128 { @@ -172,7 +172,7 @@ pub unsafe fn _mm_max_ps(a: __m128, b: __m128) -> __m128 { } /// Bitwise AND of packed single-precision (32-bit) floating-point elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] // i586 only seems to generate plain `and` instructions, so ignore it. #[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2")), @@ -187,7 +187,7 @@ pub unsafe fn _mm_and_ps(a: __m128, b: __m128) -> __m128 { /// elements. /// /// Computes `!a & b` for each bit in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] // i586 only seems to generate plain `not` and `and` instructions, so ignore // it. @@ -201,7 +201,7 @@ pub unsafe fn _mm_andnot_ps(a: __m128, b: __m128) -> __m128 { } /// Bitwise OR of packed single-precision (32-bit) floating-point elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] // i586 only seems to generate plain `or` instructions, so we ignore it. #[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2")), @@ -214,7 +214,7 @@ pub unsafe fn _mm_or_ps(a: __m128, b: __m128) -> __m128 { /// Bitwise exclusive OR of packed single-precision (32-bit) floating-point /// elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] // i586 only seems to generate plain `xor` instructions, so we ignore it. #[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2")), @@ -228,7 +228,7 @@ pub unsafe fn _mm_xor_ps(a: __m128, b: __m128) -> __m128 { /// Compare the lowest `f32` of both inputs for equality. The lowest 32 bits of /// the result will be `0xffffffff` if the two inputs are equal, or `0` /// otherwise. The upper 96 bits of the result are the upper 96 bits of `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpeqss))] pub unsafe fn _mm_cmpeq_ss(a: __m128, b: __m128) -> __m128 { @@ -239,7 +239,7 @@ pub unsafe fn _mm_cmpeq_ss(a: __m128, b: __m128) -> __m128 { /// of the result will be `0xffffffff` if `a.extract(0)` is less than /// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the /// upper 96 bits of `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpltss))] pub unsafe fn _mm_cmplt_ss(a: __m128, b: __m128) -> __m128 { @@ -250,7 +250,7 @@ pub unsafe fn _mm_cmplt_ss(a: __m128, b: __m128) -> __m128 { /// 32 bits of the result will be `0xffffffff` if `a.extract(0)` is less than /// or equal `b.extract(0)`, or `0` otherwise. 
The upper 96 bits of the result /// are the upper 96 bits of `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpless))] pub unsafe fn _mm_cmple_ss(a: __m128, b: __m128) -> __m128 { @@ -261,7 +261,7 @@ pub unsafe fn _mm_cmple_ss(a: __m128, b: __m128) -> __m128 { /// bits of the result will be `0xffffffff` if `a.extract(0)` is greater /// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result /// are the upper 96 bits of `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpltss))] pub unsafe fn _mm_cmpgt_ss(a: __m128, b: __m128) -> __m128 { @@ -272,7 +272,7 @@ pub unsafe fn _mm_cmpgt_ss(a: __m128, b: __m128) -> __m128 { /// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is /// greater than or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits /// of the result are the upper 96 bits of `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpless))] pub unsafe fn _mm_cmpge_ss(a: __m128, b: __m128) -> __m128 { @@ -283,7 +283,7 @@ pub unsafe fn _mm_cmpge_ss(a: __m128, b: __m128) -> __m128 { /// of the result will be `0xffffffff` if `a.extract(0)` is not equal to /// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the /// upper 96 bits of `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpneqss))] pub unsafe fn _mm_cmpneq_ss(a: __m128, b: __m128) -> __m128 { @@ -294,7 +294,7 @@ pub unsafe fn _mm_cmpneq_ss(a: __m128, b: __m128) -> __m128 { /// bits of the result will be `0xffffffff` if `a.extract(0)` is not less than /// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the /// upper 96 bits of `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpnltss))] pub unsafe fn _mm_cmpnlt_ss(a: __m128, b: __m128) -> __m128 { @@ -305,7 +305,7 @@ pub unsafe fn _mm_cmpnlt_ss(a: __m128, b: __m128) -> __m128 { /// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not /// less than or equal to `b.extract(0)`, or `0` otherwise. The upper 96 bits /// of the result are the upper 96 bits of `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpnless))] pub unsafe fn _mm_cmpnle_ss(a: __m128, b: __m128) -> __m128 { @@ -316,7 +316,7 @@ pub unsafe fn _mm_cmpnle_ss(a: __m128, b: __m128) -> __m128 { /// bits of the result will be `0xffffffff` if `a.extract(0)` is not greater /// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are /// the upper 96 bits of `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpnltss))] pub unsafe fn _mm_cmpngt_ss(a: __m128, b: __m128) -> __m128 { @@ -327,7 +327,7 @@ pub unsafe fn _mm_cmpngt_ss(a: __m128, b: __m128) -> __m128 { /// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not /// greater than or equal to `b.extract(0)`, or `0` otherwise. The upper 96 /// bits of the result are the upper 96 bits of `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpnless))] pub unsafe fn _mm_cmpnge_ss(a: __m128, b: __m128) -> __m128 { @@ -338,7 +338,7 @@ pub unsafe fn _mm_cmpnge_ss(a: __m128, b: __m128) -> __m128 { /// the result will be `0xffffffff` if neither of `a.extract(0)` or /// `b.extract(0)` is a NaN, or `0` otherwise. 
The upper 96 bits of the result /// are the upper 96 bits of `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpordss))] pub unsafe fn _mm_cmpord_ss(a: __m128, b: __m128) -> __m128 { @@ -349,7 +349,7 @@ pub unsafe fn _mm_cmpord_ss(a: __m128, b: __m128) -> __m128 { /// of the result will be `0xffffffff` if any of `a.extract(0)` or /// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result /// are the upper 96 bits of `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpunordss))] pub unsafe fn _mm_cmpunord_ss(a: __m128, b: __m128) -> __m128 { @@ -359,7 +359,7 @@ pub unsafe fn _mm_cmpunord_ss(a: __m128, b: __m128) -> __m128 { /// Compare each of the four floats in `a` to the corresponding element in `b`. /// The result in the output vector will be `0xffffffff` if the input elements /// were equal, or `0` otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpeqps))] pub unsafe fn _mm_cmpeq_ps(a: __m128, b: __m128) -> __m128 { @@ -369,7 +369,7 @@ pub unsafe fn _mm_cmpeq_ps(a: __m128, b: __m128) -> __m128 { /// Compare each of the four floats in `a` to the corresponding element in `b`. /// The result in the output vector will be `0xffffffff` if the input element /// in `a` is less than the corresponding element in `b`, or `0` otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpltps))] pub unsafe fn _mm_cmplt_ps(a: __m128, b: __m128) -> __m128 { @@ -380,7 +380,7 @@ pub unsafe fn _mm_cmplt_ps(a: __m128, b: __m128) -> __m128 { /// The result in the output vector will be `0xffffffff` if the input element /// in `a` is less than or equal to the corresponding element in `b`, or `0` /// otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpleps))] pub unsafe fn _mm_cmple_ps(a: __m128, b: __m128) -> __m128 { @@ -390,7 +390,7 @@ pub unsafe fn _mm_cmple_ps(a: __m128, b: __m128) -> __m128 { /// Compare each of the four floats in `a` to the corresponding element in `b`. /// The result in the output vector will be `0xffffffff` if the input element /// in `a` is greater than the corresponding element in `b`, or `0` otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpltps))] pub unsafe fn _mm_cmpgt_ps(a: __m128, b: __m128) -> __m128 { @@ -401,7 +401,7 @@ pub unsafe fn _mm_cmpgt_ps(a: __m128, b: __m128) -> __m128 { /// The result in the output vector will be `0xffffffff` if the input element /// in `a` is greater than or equal to the corresponding element in `b`, or `0` /// otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpleps))] pub unsafe fn _mm_cmpge_ps(a: __m128, b: __m128) -> __m128 { @@ -411,7 +411,7 @@ pub unsafe fn _mm_cmpge_ps(a: __m128, b: __m128) -> __m128 { /// Compare each of the four floats in `a` to the corresponding element in `b`. /// The result in the output vector will be `0xffffffff` if the input elements /// are *not* equal, or `0` otherwise. 
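A usage sketch (illustrative only, not taken from the patch): the packed comparisons above yield all-ones/all-zeros lane masks, which the bitwise intrinsics shown earlier can combine into a branchless per-lane select.

    // Branchless per-lane minimum built from a comparison mask.
    unsafe fn select_min(a: __m128, b: __m128) -> __m128 {
        let mask = _mm_cmplt_ps(a, b);     // all-ones lanes where a < b
        _mm_or_ps(_mm_and_ps(mask, a),     // keep `a` where the mask is set
                  _mm_andnot_ps(mask, b))  // keep `b` in the remaining lanes
    }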
-#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpneqps))] pub unsafe fn _mm_cmpneq_ps(a: __m128, b: __m128) -> __m128 { @@ -422,7 +422,7 @@ pub unsafe fn _mm_cmpneq_ps(a: __m128, b: __m128) -> __m128 { /// The result in the output vector will be `0xffffffff` if the input element /// in `a` is *not* less than the corresponding element in `b`, or `0` /// otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpnltps))] pub unsafe fn _mm_cmpnlt_ps(a: __m128, b: __m128) -> __m128 { @@ -433,7 +433,7 @@ pub unsafe fn _mm_cmpnlt_ps(a: __m128, b: __m128) -> __m128 { /// The result in the output vector will be `0xffffffff` if the input element /// in `a` is *not* less than or equal to the corresponding element in `b`, or /// `0` otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpnleps))] pub unsafe fn _mm_cmpnle_ps(a: __m128, b: __m128) -> __m128 { @@ -444,7 +444,7 @@ pub unsafe fn _mm_cmpnle_ps(a: __m128, b: __m128) -> __m128 { /// The result in the output vector will be `0xffffffff` if the input element /// in `a` is *not* greater than the corresponding element in `b`, or `0` /// otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpnltps))] pub unsafe fn _mm_cmpngt_ps(a: __m128, b: __m128) -> __m128 { @@ -455,7 +455,7 @@ pub unsafe fn _mm_cmpngt_ps(a: __m128, b: __m128) -> __m128 { /// The result in the output vector will be `0xffffffff` if the input element /// in `a` is *not* greater than or equal to the corresponding element in `b`, /// or `0` otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpnleps))] pub unsafe fn _mm_cmpnge_ps(a: __m128, b: __m128) -> __m128 { @@ -466,7 +466,7 @@ pub unsafe fn _mm_cmpnge_ps(a: __m128, b: __m128) -> __m128 { /// Returns four floats that have one of two possible bit patterns. The element /// in the output vector will be `0xffffffff` if the input elements in `a` and /// `b` are ordered (i.e., neither of them is a NaN), or 0 otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpordps))] pub unsafe fn _mm_cmpord_ps(a: __m128, b: __m128) -> __m128 { @@ -477,7 +477,7 @@ pub unsafe fn _mm_cmpord_ps(a: __m128, b: __m128) -> __m128 { /// Returns four floats that have one of two possible bit patterns. The element /// in the output vector will be `0xffffffff` if the input elements in `a` and /// `b` are unordered (i.e., at least on of them is a NaN), or 0 otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cmpunordps))] pub unsafe fn _mm_cmpunord_ps(a: __m128, b: __m128) -> __m128 { @@ -486,7 +486,7 @@ pub unsafe fn _mm_cmpunord_ps(a: __m128, b: __m128) -> __m128 { /// Compare two 32-bit floats from the low-order bits of `a` and `b`. Returns /// `1` if they are equal, or `0` otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(comiss))] pub unsafe fn _mm_comieq_ss(a: __m128, b: __m128) -> i32 { @@ -495,7 +495,7 @@ pub unsafe fn _mm_comieq_ss(a: __m128, b: __m128) -> i32 { /// Compare two 32-bit floats from the low-order bits of `a` and `b`. Returns /// `1` if the value from `a` is less than the one from `b`, or `0` otherwise. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(comiss))] pub unsafe fn _mm_comilt_ss(a: __m128, b: __m128) -> i32 { @@ -505,7 +505,7 @@ pub unsafe fn _mm_comilt_ss(a: __m128, b: __m128) -> i32 { /// Compare two 32-bit floats from the low-order bits of `a` and `b`. Returns /// `1` if the value from `a` is less than or equal to the one from `b`, or `0` /// otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(comiss))] pub unsafe fn _mm_comile_ss(a: __m128, b: __m128) -> i32 { @@ -515,7 +515,7 @@ pub unsafe fn _mm_comile_ss(a: __m128, b: __m128) -> i32 { /// Compare two 32-bit floats from the low-order bits of `a` and `b`. Returns /// `1` if the value from `a` is greater than the one from `b`, or `0` /// otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(comiss))] pub unsafe fn _mm_comigt_ss(a: __m128, b: __m128) -> i32 { @@ -525,7 +525,7 @@ pub unsafe fn _mm_comigt_ss(a: __m128, b: __m128) -> i32 { /// Compare two 32-bit floats from the low-order bits of `a` and `b`. Returns /// `1` if the value from `a` is greater than or equal to the one from `b`, or /// `0` otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(comiss))] pub unsafe fn _mm_comige_ss(a: __m128, b: __m128) -> i32 { @@ -534,7 +534,7 @@ pub unsafe fn _mm_comige_ss(a: __m128, b: __m128) -> i32 { /// Compare two 32-bit floats from the low-order bits of `a` and `b`. Returns /// `1` if they are *not* equal, or `0` otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(comiss))] pub unsafe fn _mm_comineq_ss(a: __m128, b: __m128) -> i32 { @@ -544,7 +544,7 @@ pub unsafe fn _mm_comineq_ss(a: __m128, b: __m128) -> i32 { /// Compare two 32-bit floats from the low-order bits of `a` and `b`. Returns /// `1` if they are equal, or `0` otherwise. This instruction will not signal /// an exception if either argument is a quiet NaN. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(ucomiss))] pub unsafe fn _mm_ucomieq_ss(a: __m128, b: __m128) -> i32 { @@ -555,7 +555,7 @@ pub unsafe fn _mm_ucomieq_ss(a: __m128, b: __m128) -> i32 { /// `1` if the value from `a` is less than the one from `b`, or `0` otherwise. /// This instruction will not signal an exception if either argument is a quiet /// NaN. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(ucomiss))] pub unsafe fn _mm_ucomilt_ss(a: __m128, b: __m128) -> i32 { @@ -566,7 +566,7 @@ pub unsafe fn _mm_ucomilt_ss(a: __m128, b: __m128) -> i32 { /// `1` if the value from `a` is less than or equal to the one from `b`, or `0` /// otherwise. This instruction will not signal an exception if either argument /// is a quiet NaN. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(ucomiss))] pub unsafe fn _mm_ucomile_ss(a: __m128, b: __m128) -> i32 { @@ -577,7 +577,7 @@ pub unsafe fn _mm_ucomile_ss(a: __m128, b: __m128) -> i32 { /// `1` if the value from `a` is greater than the one from `b`, or `0` /// otherwise. This instruction will not signal an exception if either argument /// is a quiet NaN. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(ucomiss))] pub unsafe fn _mm_ucomigt_ss(a: __m128, b: __m128) -> i32 { @@ -588,7 +588,7 @@ pub unsafe fn _mm_ucomigt_ss(a: __m128, b: __m128) -> i32 { /// `1` if the value from `a` is greater than or equal to the one from `b`, or /// `0` otherwise. This instruction will not signal an exception if either /// argument is a quiet NaN. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(ucomiss))] pub unsafe fn _mm_ucomige_ss(a: __m128, b: __m128) -> i32 { @@ -598,7 +598,7 @@ pub unsafe fn _mm_ucomige_ss(a: __m128, b: __m128) -> i32 { /// Compare two 32-bit floats from the low-order bits of `a` and `b`. Returns /// `1` if they are *not* equal, or `0` otherwise. This instruction will not /// signal an exception if either argument is a quiet NaN. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(ucomiss))] pub unsafe fn _mm_ucomineq_ss(a: __m128, b: __m128) -> i32 { @@ -613,7 +613,7 @@ pub unsafe fn _mm_ucomineq_ss(a: __m128, b: __m128) -> i32 { /// unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)). /// /// This corresponds to the `CVTSS2SI` instruction (with 32 bit output). -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cvtss2si))] pub unsafe fn _mm_cvtss_si32(a: __m128) -> i32 { @@ -621,7 +621,7 @@ pub unsafe fn _mm_cvtss_si32(a: __m128) -> i32 { } /// Alias for [`_mm_cvtss_si32`](fn._mm_cvtss_si32.html). -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cvtss2si))] pub unsafe fn _mm_cvt_ss2si(a: __m128) -> i32 { @@ -638,7 +638,7 @@ pub unsafe fn _mm_cvt_ss2si(a: __m128) -> i32 { /// exception if unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)). /// /// This corresponds to the `CVTTSS2SI` instruction (with 32 bit output). -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cvttss2si))] pub unsafe fn _mm_cvttss_si32(a: __m128) -> i32 { @@ -646,7 +646,7 @@ pub unsafe fn _mm_cvttss_si32(a: __m128) -> i32 { } /// Alias for [`_mm_cvttss_si32`](fn._mm_cvttss_si32.html). -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cvttss2si))] pub unsafe fn _mm_cvtt_ss2si(a: __m128) -> i32 { @@ -654,7 +654,7 @@ pub unsafe fn _mm_cvtt_ss2si(a: __m128) -> i32 { } /// Extract the lowest 32 bit float from the input vector. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] // No point in using assert_instrs. In Unix x86_64 calling convention this is a // no-op, and on Windows it's just a `mov`. @@ -667,7 +667,7 @@ pub unsafe fn _mm_cvtss_f32(a: __m128) -> f32 { /// /// This intrinsic corresponds to the `CVTSI2SS` instruction (with 32 bit /// input). -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cvtsi2ss))] pub unsafe fn _mm_cvtsi32_ss(a: __m128, b: i32) -> __m128 { @@ -675,7 +675,7 @@ pub unsafe fn _mm_cvtsi32_ss(a: __m128, b: i32) -> __m128 { } /// Alias for [`_mm_cvtsi32_ss`](fn._mm_cvtsi32_ss.html). -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cvtsi2ss))] pub unsafe fn _mm_cvt_si2ss(a: __m128, b: i32) -> __m128 { @@ -684,7 +684,7 @@ pub unsafe fn _mm_cvt_si2ss(a: __m128, b: i32) -> __m128 { /// Construct a `__m128` with the lowest element set to `a` and the rest set to /// zero. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movss))] pub unsafe fn _mm_set_ss(a: f32) -> __m128 { @@ -692,7 +692,7 @@ pub unsafe fn _mm_set_ss(a: f32) -> __m128 { } /// Construct a `__m128` with all element set to `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(shufps))] pub unsafe fn _mm_set1_ps(a: f32) -> __m128 { @@ -700,7 +700,7 @@ pub unsafe fn _mm_set1_ps(a: f32) -> __m128 { } /// Alias for [`_mm_set1_ps`](fn._mm_set1_ps.html) -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(shufps))] pub unsafe fn _mm_set_ps1(a: f32) -> __m128 { @@ -724,7 +724,7 @@ pub unsafe fn _mm_set_ps1(a: f32) -> __m128 { /// ```text /// let v = _mm_set_ps(d, c, b, a); /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(unpcklps))] pub unsafe fn _mm_set_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 { @@ -739,7 +739,7 @@ pub unsafe fn _mm_set_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 { /// ```text /// assert_eq!(__m128::new(a, b, c, d), _mm_setr_ps(a, b, c, d)); /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(unpcklps))] // On a 32-bit architecture it just copies the operands from the stack. @@ -749,7 +749,7 @@ pub unsafe fn _mm_setr_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 { } /// Construct a `__m128` with all elements initialized to zero. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(xorps))] pub unsafe fn _mm_setzero_ps() -> __m128 { @@ -761,7 +761,7 @@ pub unsafe fn _mm_setzero_ps() -> __m128 { /// /// The lower half of result takes values from `a` and the higher half from /// `b`. Mask is split to 2 control bits each to index the element from inputs. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(shufps, mask = 3))] pub unsafe fn _mm_shuffle_ps(a: __m128, b: __m128, mask: u32) -> __m128 { @@ -812,7 +812,7 @@ pub unsafe fn _mm_shuffle_ps(a: __m128, b: __m128, mask: u32) -> __m128 { /// Unpack and interleave single-precision (32-bit) floating-point elements /// from the higher half of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(unpckhps))] pub unsafe fn _mm_unpackhi_ps(a: __m128, b: __m128) -> __m128 { @@ -821,7 +821,7 @@ pub unsafe fn _mm_unpackhi_ps(a: __m128, b: __m128) -> __m128 { /// Unpack and interleave single-precision (32-bit) floating-point elements /// from the lower half of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(unpcklps))] pub unsafe fn _mm_unpacklo_ps(a: __m128, b: __m128) -> __m128 { @@ -830,7 +830,7 @@ pub unsafe fn _mm_unpacklo_ps(a: __m128, b: __m128) -> __m128 { /// Combine higher half of `a` and `b`. The highwe half of `b` occupies the /// lower half of result. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(all(test, not(windows)), assert_instr(movhlps))] #[cfg_attr(all(test, windows), assert_instr(unpckhpd))] @@ -841,7 +841,7 @@ pub unsafe fn _mm_movehl_ps(a: __m128, b: __m128) -> __m128 { /// Combine lower half of `a` and `b`. The lower half of `b` occupies the /// higher half of result. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(all(test, target_feature = "sse2"), assert_instr(unpcklpd))] #[cfg_attr(all(test, not(target_feature = "sse2")), assert_instr(movlhps))] @@ -853,7 +853,7 @@ pub unsafe fn _mm_movelh_ps(a: __m128, b: __m128) -> __m128 { /// /// The mask is stored in the 4 least significant bits of the return value. /// All other bits are set to `0`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movmskps))] pub unsafe fn _mm_movemask_ps(a: __m128) -> i32 { @@ -892,7 +892,7 @@ pub unsafe fn _mm_movemask_ps(a: __m128) -> i32 { /// # } /// # } /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] // TODO: generates MOVHPD if the CPU supports SSE2. // #[cfg_attr(test, assert_instr(movhps))] @@ -943,7 +943,7 @@ pub unsafe fn _mm_loadh_pi(a: __m128, p: *const __m64) -> __m128 { /// # } /// # } /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] // TODO: generates MOVLPD if the CPU supports SSE2. // #[cfg_attr(test, assert_instr(movlps))] @@ -966,7 +966,7 @@ pub unsafe fn _mm_loadl_pi(a: __m128, p: *const __m64) -> __m128 { /// elements set to zero. /// /// This corresponds to instructions `VMOVSS` / `MOVSS`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movss))] pub unsafe fn _mm_load_ss(p: *const f32) -> __m128 { @@ -978,7 +978,7 @@ pub unsafe fn _mm_load_ss(p: *const f32) -> __m128 { /// /// This corresponds to instructions `VMOVSS` / `MOVSS` followed by some /// shuffling. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movss))] pub unsafe fn _mm_load1_ps(p: *const f32) -> __m128 { @@ -987,7 +987,7 @@ pub unsafe fn _mm_load1_ps(p: *const f32) -> __m128 { } /// Alias for [`_mm_load1_ps`](fn._mm_load1_ps.html) -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movss))] pub unsafe fn _mm_load_ps1(p: *const f32) -> __m128 { @@ -1002,7 +1002,7 @@ pub unsafe fn _mm_load_ps1(p: *const f32) -> __m128 { /// memory. /// /// This corresponds to instructions `VMOVAPS` / `MOVAPS`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movaps))] pub unsafe fn _mm_load_ps(p: *const f32) -> __m128 { @@ -1016,7 +1016,7 @@ pub unsafe fn _mm_load_ps(p: *const f32) -> __m128 { /// may be faster. /// /// This corresponds to instructions `VMOVUPS` / `MOVUPS`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movups))] pub unsafe fn _mm_loadu_ps(p: *const f32) -> __m128 { @@ -1049,7 +1049,7 @@ pub unsafe fn _mm_loadu_ps(p: *const f32) -> __m128 { /// /// This corresponds to instructions `VMOVAPS` / `MOVAPS` followed by some /// shuffling. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movaps))] pub unsafe fn _mm_loadr_ps(p: *const f32) -> __m128 { @@ -1061,7 +1061,7 @@ pub unsafe fn _mm_loadr_ps(p: *const f32) -> __m128 { /// /// This intrinsic corresponds to the `MOVHPS` instruction. The compiler may /// choose to generate an equivalent sequence of other instructions. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] // On i686 and up LLVM actually generates MOVHPD instead of MOVHPS, that's // fine. @@ -1091,7 +1091,7 @@ pub unsafe fn _mm_storeh_pi(p: *mut __m64, a: __m128) { /// /// This intrinsic corresponds to the `MOVQ` instruction. 
The compiler may /// choose to generate an equivalent sequence of other instructions. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] // On i586 the codegen just generates plain MOVs. No need to test for that. #[cfg_attr(all(test, any(target_arch = "x86_64", target_feature = "sse2"), @@ -1121,7 +1121,7 @@ pub unsafe fn _mm_storel_pi(p: *mut __m64, a: __m128) { /// Store the lowest 32 bit float of `a` into memory. /// /// This intrinsic corresponds to the `MOVSS` instruction. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movss))] pub unsafe fn _mm_store_ss(p: *mut f32, a: __m128) { @@ -1144,7 +1144,7 @@ pub unsafe fn _mm_store_ss(p: *mut f32, a: __m128) { /// *p.offset(2) = x; /// *p.offset(3) = x; /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movaps))] pub unsafe fn _mm_store1_ps(p: *mut f32, a: __m128) { @@ -1153,7 +1153,7 @@ pub unsafe fn _mm_store1_ps(p: *mut f32, a: __m128) { } /// Alias for [`_mm_store1_ps`](fn._mm_store1_ps.html) -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movaps))] pub unsafe fn _mm_store_ps1(p: *mut f32, a: __m128) { @@ -1169,7 +1169,7 @@ pub unsafe fn _mm_store_ps1(p: *mut f32, a: __m128) { /// memory. /// /// This corresponds to instructions `VMOVAPS` / `MOVAPS`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movaps))] pub unsafe fn _mm_store_ps(p: *mut f32, a: __m128) { @@ -1181,7 +1181,7 @@ pub unsafe fn _mm_store_ps(p: *mut f32, a: __m128) { /// faster. /// /// This corresponds to instructions `VMOVUPS` / `MOVUPS`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movups))] pub unsafe fn _mm_storeu_ps(p: *mut f32, a: __m128) { @@ -1206,7 +1206,7 @@ pub unsafe fn _mm_storeu_ps(p: *mut f32, a: __m128) { /// *p.offset(2) = a.extract(1); /// *p.offset(3) = a.extract(0); /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movaps))] pub unsafe fn _mm_storer_ps(p: *mut f32, a: __m128) { @@ -1221,7 +1221,7 @@ pub unsafe fn _mm_storer_ps(p: *mut f32, a: __m128) { /// ```text /// _mm_move_ss(a, b) == a.replace(0, b.extract(0)) /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movss))] pub unsafe fn _mm_move_ss(a: __m128, b: __m128) -> __m128 { @@ -1234,7 +1234,7 @@ pub unsafe fn _mm_move_ss(a: __m128, b: __m128) -> __m128 { /// Guarantees that every store instruction that precedes, in program order, is /// globally visible before any store instruction which follows the fence in /// program order. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(sfence))] pub unsafe fn _mm_sfence() { @@ -1244,7 +1244,7 @@ pub unsafe fn _mm_sfence() { /// Get the unsigned 32-bit value of the MXCSR control and status register.
/// /// For more info see [`_mm_setcsr`](fn._mm_setcsr.html) -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(stmxcsr))] pub unsafe fn _mm_getcsr() -> u32 { @@ -1378,7 +1378,7 @@ pub unsafe fn _mm_getcsr() -> u32 { /// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // turn on /// ``` /// -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(ldmxcsr))] pub unsafe fn _mm_setcsr(val: u32) { @@ -1435,7 +1435,7 @@ pub const _MM_FLUSH_ZERO_ON: u32 = 0x8000; pub const _MM_FLUSH_ZERO_OFF: u32 = 0x0000; /// See [`_mm_setcsr`](fn._mm_setcsr.html) -#[inline(always)] +#[inline] #[allow(non_snake_case)] #[target_feature(enable = "sse")] pub unsafe fn _MM_GET_EXCEPTION_MASK() -> u32 { @@ -1443,7 +1443,7 @@ pub unsafe fn _MM_GET_EXCEPTION_MASK() -> u32 { } /// See [`_mm_setcsr`](fn._mm_setcsr.html) -#[inline(always)] +#[inline] #[allow(non_snake_case)] #[target_feature(enable = "sse")] pub unsafe fn _MM_GET_EXCEPTION_STATE() -> u32 { @@ -1451,7 +1451,7 @@ pub unsafe fn _MM_GET_EXCEPTION_STATE() -> u32 { } /// See [`_mm_setcsr`](fn._mm_setcsr.html) -#[inline(always)] +#[inline] #[allow(non_snake_case)] #[target_feature(enable = "sse")] pub unsafe fn _MM_GET_FLUSH_ZERO_MODE() -> u32 { @@ -1459,7 +1459,7 @@ pub unsafe fn _MM_GET_FLUSH_ZERO_MODE() -> u32 { } /// See [`_mm_setcsr`](fn._mm_setcsr.html) -#[inline(always)] +#[inline] #[allow(non_snake_case)] #[target_feature(enable = "sse")] pub unsafe fn _MM_GET_ROUNDING_MODE() -> u32 { @@ -1467,7 +1467,7 @@ pub unsafe fn _MM_GET_ROUNDING_MODE() -> u32 { } /// See [`_mm_setcsr`](fn._mm_setcsr.html) -#[inline(always)] +#[inline] #[allow(non_snake_case)] #[target_feature(enable = "sse")] pub unsafe fn _MM_SET_EXCEPTION_MASK(x: u32) { @@ -1475,7 +1475,7 @@ pub unsafe fn _MM_SET_EXCEPTION_MASK(x: u32) { } /// See [`_mm_setcsr`](fn._mm_setcsr.html) -#[inline(always)] +#[inline] #[allow(non_snake_case)] #[target_feature(enable = "sse")] pub unsafe fn _MM_SET_EXCEPTION_STATE(x: u32) { @@ -1483,7 +1483,7 @@ pub unsafe fn _MM_SET_EXCEPTION_STATE(x: u32) { } /// See [`_mm_setcsr`](fn._mm_setcsr.html) -#[inline(always)] +#[inline] #[allow(non_snake_case)] #[target_feature(enable = "sse")] pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x: u32) { @@ -1493,7 +1493,7 @@ pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x: u32) { } /// See [`_mm_setcsr`](fn._mm_setcsr.html) -#[inline(always)] +#[inline] #[allow(non_snake_case)] #[target_feature(enable = "sse")] pub unsafe fn _MM_SET_ROUNDING_MODE(x: u32) { @@ -1548,7 +1548,7 @@ pub const _MM_HINT_NTA: i8 = 0; /// * Prefetching may also fail if there are not enough memory-subsystem /// resources (e.g., request buffers). /// -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(prefetcht0, strategy = _MM_HINT_T0))] #[cfg_attr(test, assert_instr(prefetcht1, strategy = _MM_HINT_T1))] @@ -1573,7 +1573,7 @@ pub unsafe fn _mm_prefetch(p: *const u8, strategy: i8) { } /// Return vector of type __m128 with undefined elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] pub unsafe fn _mm_undefined_ps() -> __m128 { __m128( @@ -1585,7 +1585,7 @@ pub unsafe fn _mm_undefined_ps() -> __m128 { } /// Transpose the 4x4 matrix formed by 4 rows of __m128 in place. 
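A sketch of the save/modify/restore pattern these MXCSR helpers are meant for (illustrative only; the flush-to-zero setting affects all subsequent SSE code on the thread):

    unsafe {
        let saved = _mm_getcsr();                    // remember the current MXCSR
        _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);  // flush denormal results to zero
        // ... denormal-sensitive SSE work goes here ...
        _mm_setcsr(saved);                           // restore the previous state
    }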
-#[inline(always)] +#[inline] #[allow(non_snake_case)] #[target_feature(enable = "sse")] pub unsafe fn _MM_TRANSPOSE4_PS( @@ -1684,7 +1684,7 @@ extern "C" { /// /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection /// exception _may_ be generated. -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(movntps))] pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128) { @@ -1693,7 +1693,7 @@ pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128) { /// Store 64-bits of integer data from `a` into memory using a non-temporal /// memory hint. -#[inline(always)] +#[inline] #[target_feature(enable = "sse,mmx")] #[cfg_attr(test, assert_instr(movntq))] pub unsafe fn _mm_stream_pi(mem_addr: *mut __m64, a: __m64) { diff --git a/coresimd/src/x86/i586/sse2.rs b/coresimd/src/x86/i586/sse2.rs index c0555679bf..ab4a574e2d 100644 --- a/coresimd/src/x86/i586/sse2.rs +++ b/coresimd/src/x86/i586/sse2.rs @@ -16,7 +16,7 @@ use x86::*; /// /// This can help improve the performance and power consumption of spin-wait /// loops. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pause))] pub unsafe fn _mm_pause() { @@ -25,7 +25,7 @@ pub unsafe fn _mm_pause() { /// Invalidate and flush the cache line that contains `p` from all levels of /// the cache hierarchy. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(clflush))] pub unsafe fn _mm_clflush(p: *mut u8) { @@ -38,7 +38,7 @@ pub unsafe fn _mm_clflush(p: *mut u8) { /// Guarantees that every load instruction that precedes, in program order, is /// globally visible before any load instruction which follows the fence in /// program order. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(lfence))] pub unsafe fn _mm_lfence() { @@ -51,7 +51,7 @@ pub unsafe fn _mm_lfence() { /// Guarantees that every memory access that precedes, in program order, the /// memory fence instruction is globally visible before any memory instruction /// which follows the fence in program order. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(mfence))] pub unsafe fn _mm_mfence() { @@ -59,7 +59,7 @@ pub unsafe fn _mm_mfence() { } /// Add packed 8-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(paddb))] pub unsafe fn _mm_add_epi8(a: __m128i, b: __m128i) -> __m128i { @@ -67,7 +67,7 @@ pub unsafe fn _mm_add_epi8(a: __m128i, b: __m128i) -> __m128i { } /// Add packed 16-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(paddw))] pub unsafe fn _mm_add_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -75,7 +75,7 @@ pub unsafe fn _mm_add_epi16(a: __m128i, b: __m128i) -> __m128i { } /// Add packed 32-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(paddd))] pub unsafe fn _mm_add_epi32(a: __m128i, b: __m128i) -> __m128i { @@ -83,7 +83,7 @@ pub unsafe fn _mm_add_epi32(a: __m128i, b: __m128i) -> __m128i { } /// Add packed 64-bit integers in `a` and `b`.
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(paddq))] pub unsafe fn _mm_add_epi64(a: __m128i, b: __m128i) -> __m128i { @@ -91,7 +91,7 @@ pub unsafe fn _mm_add_epi64(a: __m128i, b: __m128i) -> __m128i { } /// Add packed 8-bit integers in `a` and `b` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(paddsb))] pub unsafe fn _mm_adds_epi8(a: __m128i, b: __m128i) -> __m128i { @@ -99,7 +99,7 @@ pub unsafe fn _mm_adds_epi8(a: __m128i, b: __m128i) -> __m128i { } /// Add packed 16-bit integers in `a` and `b` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(paddsw))] pub unsafe fn _mm_adds_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -107,7 +107,7 @@ pub unsafe fn _mm_adds_epi16(a: __m128i, b: __m128i) -> __m128i { } /// Add packed unsigned 8-bit integers in `a` and `b` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(paddusb))] pub unsafe fn _mm_adds_epu8(a: __m128i, b: __m128i) -> __m128i { @@ -115,7 +115,7 @@ pub unsafe fn _mm_adds_epu8(a: __m128i, b: __m128i) -> __m128i { } /// Add packed unsigned 16-bit integers in `a` and `b` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(paddusw))] pub unsafe fn _mm_adds_epu16(a: __m128i, b: __m128i) -> __m128i { @@ -123,7 +123,7 @@ pub unsafe fn _mm_adds_epu16(a: __m128i, b: __m128i) -> __m128i { } /// Average packed unsigned 8-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pavgb))] pub unsafe fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i { @@ -131,7 +131,7 @@ pub unsafe fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i { } /// Average packed unsigned 16-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pavgw))] pub unsafe fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i { @@ -143,7 +143,7 @@ pub unsafe fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i { /// Multiply packed signed 16-bit integers in `a` and `b`, producing /// intermediate signed 32-bit integers. Horizontally add adjacent pairs of /// intermediate 32-bit integers. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pmaddwd))] pub unsafe fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -152,7 +152,7 @@ pub unsafe fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i { /// Compare packed 16-bit integers in `a` and `b`, and return the packed /// maximum values. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pmaxsw))] pub unsafe fn _mm_max_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -161,7 +161,7 @@ pub unsafe fn _mm_max_epi16(a: __m128i, b: __m128i) -> __m128i { /// Compare packed unsigned 8-bit integers in `a` and `b`, and return the /// packed maximum values. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pmaxub))] pub unsafe fn _mm_max_epu8(a: __m128i, b: __m128i) -> __m128i { @@ -170,7 +170,7 @@ pub unsafe fn _mm_max_epu8(a: __m128i, b: __m128i) -> __m128i { /// Compare packed 16-bit integers in `a` and `b`, and return the packed /// minimum values. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pminsw))] pub unsafe fn _mm_min_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -179,7 +179,7 @@ pub unsafe fn _mm_min_epi16(a: __m128i, b: __m128i) -> __m128i { /// Compare packed unsigned 8-bit integers in `a` and `b`, and return the /// packed minimum values. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pminub))] pub unsafe fn _mm_min_epu8(a: __m128i, b: __m128i) -> __m128i { @@ -190,7 +190,7 @@ pub unsafe fn _mm_min_epu8(a: __m128i, b: __m128i) -> __m128i { /// /// The multiplication produces intermediate 32-bit integers, and returns the /// high 16 bits of the intermediate integers. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pmulhw))] pub unsafe fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -201,7 +201,7 @@ pub unsafe fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i { /// /// The multiplication produces intermediate 32-bit integers, and returns the /// high 16 bits of the intermediate integers. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pmulhuw))] pub unsafe fn _mm_mulhi_epu16(a: __m128i, b: __m128i) -> __m128i { @@ -212,7 +212,7 @@ pub unsafe fn _mm_mulhi_epu16(a: __m128i, b: __m128i) -> __m128i { /// /// The multiplication produces intermediate 32-bit integers, and returns the /// low 16 bits of the intermediate integers. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pmullw))] pub unsafe fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -223,7 +223,7 @@ pub unsafe fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i { /// in `a` and `b`. /// /// Return the unsigned 64-bit results. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pmuludq))] pub unsafe fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i { @@ -236,7 +236,7 @@ pub unsafe fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i { /// and `b`, then horizontally sum each consecutive 8 differences to produce /// two unsigned 16-bit integers, and pack these unsigned 16-bit integers in /// the low 16 bits of 64-bit elements returned. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psadbw))] pub unsafe fn _mm_sad_epu8(a: __m128i, b: __m128i) -> __m128i { @@ -244,7 +244,7 @@ pub unsafe fn _mm_sad_epu8(a: __m128i, b: __m128i) -> __m128i { } /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psubb))] pub unsafe fn _mm_sub_epi8(a: __m128i, b: __m128i) -> __m128i { @@ -252,7 +252,7 @@ pub unsafe fn _mm_sub_epi8(a: __m128i, b: __m128i) -> __m128i { } /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psubw))] pub unsafe fn _mm_sub_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -260,7 +260,7 @@ pub unsafe fn _mm_sub_epi16(a: __m128i, b: __m128i) -> __m128i { } /// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psubd))] pub unsafe fn _mm_sub_epi32(a: __m128i, b: __m128i) -> __m128i { @@ -268,7 +268,7 @@ pub unsafe fn _mm_sub_epi32(a: __m128i, b: __m128i) -> __m128i { } /// Subtract packed 64-bit integers in `b` from packed 64-bit integers in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psubq))] pub unsafe fn _mm_sub_epi64(a: __m128i, b: __m128i) -> __m128i { @@ -277,7 +277,7 @@ pub unsafe fn _mm_sub_epi64(a: __m128i, b: __m128i) -> __m128i { /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` /// using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psubsb))] pub unsafe fn _mm_subs_epi8(a: __m128i, b: __m128i) -> __m128i { @@ -286,7 +286,7 @@ pub unsafe fn _mm_subs_epi8(a: __m128i, b: __m128i) -> __m128i { /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` /// using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psubsw))] pub unsafe fn _mm_subs_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -295,7 +295,7 @@ pub unsafe fn _mm_subs_epi16(a: __m128i, b: __m128i) -> __m128i { /// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit /// integers in `a` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psubusb))] pub unsafe fn _mm_subs_epu8(a: __m128i, b: __m128i) -> __m128i { @@ -304,7 +304,7 @@ pub unsafe fn _mm_subs_epu8(a: __m128i, b: __m128i) -> __m128i { /// Subtract packed unsigned 16-bit integers in `b` from packed unsigned 16-bit /// integers in `a` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psubusw))] pub unsafe fn _mm_subs_epu16(a: __m128i, b: __m128i) -> __m128i { @@ -312,7 +312,7 @@ pub unsafe fn _mm_subs_epu16(a: __m128i, b: __m128i) -> __m128i { } /// Shift `a` left by `imm8` bytes while shifting in zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pslldq, imm8 = 1))] pub unsafe fn _mm_slli_si128(a: __m128i, imm8: i32) -> __m128i { @@ -355,7 +355,7 @@ pub unsafe fn _mm_slli_si128(a: __m128i, imm8: i32) -> __m128i { } /// Shift `a` left by `imm8` bytes while shifting in zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pslldq, imm8 = 1))] pub unsafe fn _mm_bslli_si128(a: __m128i, imm8: i32) -> __m128i { @@ -363,7 +363,7 @@ pub unsafe fn _mm_bslli_si128(a: __m128i, imm8: i32) -> __m128i { } /// Shift `a` right by `imm8` bytes while shifting in zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psrldq, imm8 = 1))] pub unsafe fn _mm_bsrli_si128(a: __m128i, imm8: i32) -> __m128i { @@ -371,7 +371,7 @@ pub unsafe fn _mm_bsrli_si128(a: __m128i, imm8: i32) -> __m128i { } /// Shift packed 16-bit integers in `a` left by `imm8` while shifting in zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psllw))] pub unsafe fn _mm_slli_epi16(a: __m128i, imm8: i32) -> __m128i { @@ -380,7 +380,7 @@ pub unsafe fn _mm_slli_epi16(a: __m128i, imm8: i32) -> __m128i { /// Shift packed 16-bit integers in `a` left by `count` while shifting in /// zeros. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psllw))] pub unsafe fn _mm_sll_epi16(a: __m128i, count: __m128i) -> __m128i { @@ -388,7 +388,7 @@ pub unsafe fn _mm_sll_epi16(a: __m128i, count: __m128i) -> __m128i { } /// Shift packed 32-bit integers in `a` left by `imm8` while shifting in zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pslld))] pub unsafe fn _mm_slli_epi32(a: __m128i, imm8: i32) -> __m128i { @@ -397,7 +397,7 @@ pub unsafe fn _mm_slli_epi32(a: __m128i, imm8: i32) -> __m128i { /// Shift packed 32-bit integers in `a` left by `count` while shifting in /// zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pslld))] pub unsafe fn _mm_sll_epi32(a: __m128i, count: __m128i) -> __m128i { @@ -405,7 +405,7 @@ pub unsafe fn _mm_sll_epi32(a: __m128i, count: __m128i) -> __m128i { } /// Shift packed 64-bit integers in `a` left by `imm8` while shifting in zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psllq))] pub unsafe fn _mm_slli_epi64(a: __m128i, imm8: i32) -> __m128i { @@ -414,7 +414,7 @@ pub unsafe fn _mm_slli_epi64(a: __m128i, imm8: i32) -> __m128i { /// Shift packed 64-bit integers in `a` left by `count` while shifting in /// zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psllq))] pub unsafe fn _mm_sll_epi64(a: __m128i, count: __m128i) -> __m128i { @@ -423,7 +423,7 @@ pub unsafe fn _mm_sll_epi64(a: __m128i, count: __m128i) -> __m128i { /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in sign /// bits. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psraw))] pub unsafe fn _mm_srai_epi16(a: __m128i, imm8: i32) -> __m128i { @@ -432,7 +432,7 @@ pub unsafe fn _mm_srai_epi16(a: __m128i, imm8: i32) -> __m128i { /// Shift packed 16-bit integers in `a` right by `count` while shifting in sign /// bits. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psraw))] pub unsafe fn _mm_sra_epi16(a: __m128i, count: __m128i) -> __m128i { @@ -441,7 +441,7 @@ pub unsafe fn _mm_sra_epi16(a: __m128i, count: __m128i) -> __m128i { /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in sign /// bits. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psrad))] pub unsafe fn _mm_srai_epi32(a: __m128i, imm8: i32) -> __m128i { @@ -450,7 +450,7 @@ pub unsafe fn _mm_srai_epi32(a: __m128i, imm8: i32) -> __m128i { /// Shift packed 32-bit integers in `a` right by `count` while shifting in sign /// bits. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psrad))] pub unsafe fn _mm_sra_epi32(a: __m128i, count: __m128i) -> __m128i { @@ -458,7 +458,7 @@ pub unsafe fn _mm_sra_epi32(a: __m128i, count: __m128i) -> __m128i { } /// Shift `a` right by `imm8` bytes while shifting in zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psrldq, imm8 = 1))] pub unsafe fn _mm_srli_si128(a: __m128i, imm8: i32) -> __m128i { @@ -502,7 +502,7 @@ pub unsafe fn _mm_srli_si128(a: __m128i, imm8: i32) -> __m128i { /// Shift packed 16-bit integers in `a` right by `imm8` while shifting in /// zeros. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psrlw))] pub unsafe fn _mm_srli_epi16(a: __m128i, imm8: i32) -> __m128i { @@ -511,7 +511,7 @@ pub unsafe fn _mm_srli_epi16(a: __m128i, imm8: i32) -> __m128i { /// Shift packed 16-bit integers in `a` right by `count` while shifting in /// zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psrlw))] pub unsafe fn _mm_srl_epi16(a: __m128i, count: __m128i) -> __m128i { @@ -520,7 +520,7 @@ pub unsafe fn _mm_srl_epi16(a: __m128i, count: __m128i) -> __m128i { /// Shift packed 32-bit integers in `a` right by `imm8` while shifting in /// zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psrld))] pub unsafe fn _mm_srli_epi32(a: __m128i, imm8: i32) -> __m128i { @@ -529,7 +529,7 @@ pub unsafe fn _mm_srli_epi32(a: __m128i, imm8: i32) -> __m128i { /// Shift packed 32-bit integers in `a` right by `count` while shifting in /// zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psrld))] pub unsafe fn _mm_srl_epi32(a: __m128i, count: __m128i) -> __m128i { @@ -538,7 +538,7 @@ pub unsafe fn _mm_srl_epi32(a: __m128i, count: __m128i) -> __m128i { /// Shift packed 64-bit integers in `a` right by `imm8` while shifting in /// zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psrlq))] pub unsafe fn _mm_srli_epi64(a: __m128i, imm8: i32) -> __m128i { @@ -547,7 +547,7 @@ pub unsafe fn _mm_srli_epi64(a: __m128i, imm8: i32) -> __m128i { /// Shift packed 64-bit integers in `a` right by `count` while shifting in /// zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(psrlq))] pub unsafe fn _mm_srl_epi64(a: __m128i, count: __m128i) -> __m128i { @@ -556,7 +556,7 @@ pub unsafe fn _mm_srl_epi64(a: __m128i, count: __m128i) -> __m128i { /// Compute the bitwise AND of 128 bits (representing integer data) in `a` and /// `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(andps))] pub unsafe fn _mm_and_si128(a: __m128i, b: __m128i) -> __m128i { @@ -565,7 +565,7 @@ pub unsafe fn _mm_and_si128(a: __m128i, b: __m128i) -> __m128i { /// Compute the bitwise NOT of 128 bits (representing integer data) in `a` and /// then AND with `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(andnps))] pub unsafe fn _mm_andnot_si128(a: __m128i, b: __m128i) -> __m128i { @@ -574,7 +574,7 @@ pub unsafe fn _mm_andnot_si128(a: __m128i, b: __m128i) -> __m128i { /// Compute the bitwise OR of 128 bits (representing integer data) in `a` and /// `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(orps))] pub unsafe fn _mm_or_si128(a: __m128i, b: __m128i) -> __m128i { @@ -583,7 +583,7 @@ pub unsafe fn _mm_or_si128(a: __m128i, b: __m128i) -> __m128i { /// Compute the bitwise XOR of 128 bits (representing integer data) in `a` and /// `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(xorps))] pub unsafe fn _mm_xor_si128(a: __m128i, b: __m128i) -> __m128i { @@ -591,7 +591,7 @@ pub unsafe fn _mm_xor_si128(a: __m128i, b: __m128i) -> __m128i { } /// Compare packed 8-bit integers in `a` and `b` for equality. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pcmpeqb))] pub unsafe fn _mm_cmpeq_epi8(a: __m128i, b: __m128i) -> __m128i { @@ -599,7 +599,7 @@ pub unsafe fn _mm_cmpeq_epi8(a: __m128i, b: __m128i) -> __m128i { } /// Compare packed 16-bit integers in `a` and `b` for equality. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pcmpeqw))] pub unsafe fn _mm_cmpeq_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -607,7 +607,7 @@ pub unsafe fn _mm_cmpeq_epi16(a: __m128i, b: __m128i) -> __m128i { } /// Compare packed 32-bit integers in `a` and `b` for equality. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pcmpeqd))] pub unsafe fn _mm_cmpeq_epi32(a: __m128i, b: __m128i) -> __m128i { @@ -615,7 +615,7 @@ pub unsafe fn _mm_cmpeq_epi32(a: __m128i, b: __m128i) -> __m128i { } /// Compare packed 8-bit integers in `a` and `b` for greater-than. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pcmpgtb))] pub unsafe fn _mm_cmpgt_epi8(a: __m128i, b: __m128i) -> __m128i { @@ -623,7 +623,7 @@ pub unsafe fn _mm_cmpgt_epi8(a: __m128i, b: __m128i) -> __m128i { } /// Compare packed 16-bit integers in `a` and `b` for greater-than. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pcmpgtw))] pub unsafe fn _mm_cmpgt_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -631,7 +631,7 @@ pub unsafe fn _mm_cmpgt_epi16(a: __m128i, b: __m128i) -> __m128i { } /// Compare packed 32-bit integers in `a` and `b` for greater-than. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pcmpgtd))] pub unsafe fn _mm_cmpgt_epi32(a: __m128i, b: __m128i) -> __m128i { @@ -639,7 +639,7 @@ pub unsafe fn _mm_cmpgt_epi32(a: __m128i, b: __m128i) -> __m128i { } /// Compare packed 8-bit integers in `a` and `b` for less-than. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pcmpgtb))] pub unsafe fn _mm_cmplt_epi8(a: __m128i, b: __m128i) -> __m128i { @@ -647,7 +647,7 @@ pub unsafe fn _mm_cmplt_epi8(a: __m128i, b: __m128i) -> __m128i { } /// Compare packed 16-bit integers in `a` and `b` for less-than. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pcmpgtw))] pub unsafe fn _mm_cmplt_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -655,7 +655,7 @@ pub unsafe fn _mm_cmplt_epi16(a: __m128i, b: __m128i) -> __m128i { } /// Compare packed 32-bit integers in `a` and `b` for less-than. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pcmpgtd))] pub unsafe fn _mm_cmplt_epi32(a: __m128i, b: __m128i) -> __m128i { @@ -664,7 +664,7 @@ pub unsafe fn _mm_cmplt_epi32(a: __m128i, b: __m128i) -> __m128i { /// Convert the lower two packed 32-bit integers in `a` to packed /// double-precision (64-bit) floating-point elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtdq2pd))] pub unsafe fn _mm_cvtepi32_pd(a: __m128i) -> __m128d { @@ -674,7 +674,7 @@ pub unsafe fn _mm_cvtepi32_pd(a: __m128i) -> __m128d { /// Return `a` with its lower element replaced by `b` after converting it to /// an `f64`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtsi2sd))] pub unsafe fn _mm_cvtsi32_sd(a: __m128d, b: i32) -> __m128d { @@ -683,7 +683,7 @@ pub unsafe fn _mm_cvtsi32_sd(a: __m128d, b: i32) -> __m128d { /// Convert packed 32-bit integers in `a` to packed single-precision (32-bit) /// floating-point elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtdq2ps))] pub unsafe fn _mm_cvtepi32_ps(a: __m128i) -> __m128 { @@ -692,7 +692,7 @@ pub unsafe fn _mm_cvtepi32_ps(a: __m128i) -> __m128 { /// Convert packed single-precision (32-bit) floating-point elements in `a` /// to packed 32-bit integers. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtps2dq))] pub unsafe fn _mm_cvtps_epi32(a: __m128) -> __m128i { @@ -701,7 +701,7 @@ pub unsafe fn _mm_cvtps_epi32(a: __m128) -> __m128i { /// Return a vector whose lowest element is `a` and all higher elements are /// `0`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(all(test, target_arch = "x86_64"), assert_instr(movd))] pub unsafe fn _mm_cvtsi32_si128(a: i32) -> __m128i { @@ -709,7 +709,7 @@ pub unsafe fn _mm_cvtsi32_si128(a: i32) -> __m128i { } /// Return the lowest element of `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(all(test, not(windows)), assert_instr(movd))] // FIXME mov on windows pub unsafe fn _mm_cvtsi128_si32(a: __m128i) -> i32 { @@ -718,7 +718,7 @@ pub unsafe fn _mm_cvtsi128_si32(a: __m128i) -> i32 { /// Set packed 64-bit integers with the supplied values, from highest to /// lowest. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // no particular instruction to test pub unsafe fn _mm_set_epi64x(e1: i64, e0: i64) -> __m128i { @@ -726,7 +726,7 @@ pub unsafe fn _mm_set_epi64x(e1: i64, e0: i64) -> __m128i { } /// Set packed 32-bit integers with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // no particular instruction to test pub unsafe fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i { @@ -734,7 +734,7 @@ pub unsafe fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i { } /// Set packed 16-bit integers with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // no particular instruction to test pub unsafe fn _mm_set_epi16( @@ -744,7 +744,7 @@ pub unsafe fn _mm_set_epi16( } /// Set packed 8-bit integers with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // no particular instruction to test pub unsafe fn _mm_set_epi8( @@ -758,7 +758,7 @@ pub unsafe fn _mm_set_epi8( } /// Broadcast 64-bit integer `a` to all elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // no particular instruction to test pub unsafe fn _mm_set1_epi64x(a: i64) -> __m128i { @@ -766,7 +766,7 @@ pub unsafe fn _mm_set1_epi64x(a: i64) -> __m128i { } /// Broadcast 32-bit integer `a` to all elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // no particular instruction to test pub unsafe fn _mm_set1_epi32(a: i32) -> __m128i { @@ -774,7 +774,7 @@ pub unsafe fn _mm_set1_epi32(a: i32) -> __m128i { } /// Broadcast 16-bit integer `a` to all elements. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // no particular instruction to test pub unsafe fn _mm_set1_epi16(a: i16) -> __m128i { @@ -782,7 +782,7 @@ pub unsafe fn _mm_set1_epi16(a: i16) -> __m128i { } /// Broadcast 8-bit integer `a` to all elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // no particular instruction to test pub unsafe fn _mm_set1_epi8(a: i8) -> __m128i { @@ -790,7 +790,7 @@ pub unsafe fn _mm_set1_epi8(a: i8) -> __m128i { } /// Set packed 32-bit integers with the supplied values in reverse order. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // no particular instruction to test pub unsafe fn _mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i { @@ -798,7 +798,7 @@ pub unsafe fn _mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i { } /// Set packed 16-bit integers with the supplied values in reverse order. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // no particular instruction to test pub unsafe fn _mm_setr_epi16( @@ -808,7 +808,7 @@ pub unsafe fn _mm_setr_epi16( } /// Set packed 8-bit integers with the supplied values in reverse order. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // no particular instruction to test pub unsafe fn _mm_setr_epi8( @@ -822,7 +822,7 @@ pub unsafe fn _mm_setr_epi8( } /// Returns a vector with all elements set to zero. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(xorps))] pub unsafe fn _mm_setzero_si128() -> __m128i { @@ -830,7 +830,7 @@ pub unsafe fn _mm_setzero_si128() -> __m128i { } /// Load 64-bit integer from memory into first element of returned vector. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // FIXME movsd on windows #[cfg_attr(all(test, not(windows), @@ -844,7 +844,7 @@ pub unsafe fn _mm_loadl_epi64(mem_addr: *const __m128i) -> __m128i { /// Load 128-bits of integer data from memory into a new vector. /// /// `mem_addr` must be aligned on a 16-byte boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movaps))] pub unsafe fn _mm_load_si128(mem_addr: *const __m128i) -> __m128i { @@ -854,7 +854,7 @@ pub unsafe fn _mm_load_si128(mem_addr: *const __m128i) -> __m128i { /// Load 128-bits of integer data from memory into a new vector. /// /// `mem_addr` does not need to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movups))] pub unsafe fn _mm_loadu_si128(mem_addr: *const __m128i) -> __m128i { @@ -875,7 +875,7 @@ pub unsafe fn _mm_loadu_si128(mem_addr: *const __m128i) -> __m128i { /// /// `mem_addr` should correspond to a 128-bit memory location and does not need /// to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(maskmovdqu))] pub unsafe fn _mm_maskmoveu_si128(a: __m128i, mask: __m128i, mem_addr: *mut i8) { @@ -885,7 +885,7 @@ pub unsafe fn _mm_maskmoveu_si128(a: __m128i, mask: __m128i, mem_addr: *mut i8) /// Store 128-bits of integer data from `a` into memory. /// /// `mem_addr` must be aligned on a 16-byte boundary. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movaps))] pub unsafe fn _mm_store_si128(mem_addr: *mut __m128i, a: __m128i) { @@ -895,7 +895,7 @@ pub unsafe fn _mm_store_si128(mem_addr: *mut __m128i, a: __m128i) { /// Store 128-bits of integer data from `a` into memory. /// /// `mem_addr` does not need to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movups))] // FIXME movdqu expected pub unsafe fn _mm_storeu_si128(mem_addr: *mut __m128i, a: __m128i) { @@ -905,7 +905,7 @@ pub unsafe fn _mm_storeu_si128(mem_addr: *mut __m128i, a: __m128i) { /// Store the lower 64-bit integer `a` to a memory location. /// /// `mem_addr` does not need to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // FIXME mov on windows, movlps on i686 #[cfg_attr(all(test, not(windows), @@ -923,7 +923,7 @@ pub unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i) { /// Stores a 128-bit integer vector to a 128-bit aligned memory location. /// To minimize caching, the data is flagged as non-temporal (unlikely to be /// used again soon). -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movntps))] // FIXME movntdq pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) { @@ -933,7 +933,7 @@ pub unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i) { /// Stores a 32-bit integer value in the specified memory location. /// To minimize caching, the data is flagged as non-temporal (unlikely to be /// used again soon). -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movnti))] pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) { @@ -942,7 +942,7 @@ pub unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32) { /// Return a vector where the low element is extracted from `a` and its upper /// element is zero. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // FIXME movd on windows, movd on i686 #[cfg_attr(all(test, not(windows), target_arch = "x86_64"), @@ -955,7 +955,7 @@ pub unsafe fn _mm_move_epi64(a: __m128i) -> __m128i { /// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers /// using signed saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(packsswb))] pub unsafe fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -964,7 +964,7 @@ pub unsafe fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i { /// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers /// using signed saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(packssdw))] pub unsafe fn _mm_packs_epi32(a: __m128i, b: __m128i) -> __m128i { @@ -973,7 +973,7 @@ pub unsafe fn _mm_packs_epi32(a: __m128i, b: __m128i) -> __m128i { /// Convert packed 16-bit integers from `a` and `b` to packed 8-bit integers /// using unsigned saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(packuswb))] pub unsafe fn _mm_packus_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -981,7 +981,7 @@ pub unsafe fn _mm_packus_epi16(a: __m128i, b: __m128i) -> __m128i { } /// Return the `imm8` element of `a`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pextrw, imm8 = 9))] pub unsafe fn _mm_extract_epi16(a: __m128i, imm8: i32) -> i32 { @@ -989,7 +989,7 @@ pub unsafe fn _mm_extract_epi16(a: __m128i, imm8: i32) -> i32 { } /// Return a new vector where the `imm8` element of `a` is replaced with `i`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pinsrw, imm8 = 9))] pub unsafe fn _mm_insert_epi16(a: __m128i, i: i32, imm8: i32) -> __m128i { @@ -997,7 +997,7 @@ pub unsafe fn _mm_insert_epi16(a: __m128i, i: i32, imm8: i32) -> __m128i { } /// Return a mask of the most significant bit of each element in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pmovmskb))] pub unsafe fn _mm_movemask_epi8(a: __m128i) -> i32 { @@ -1005,7 +1005,7 @@ pub unsafe fn _mm_movemask_epi8(a: __m128i) -> i32 { } /// Shuffle 32-bit integers in `a` using the control in `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pshufd, imm8 = 9))] pub unsafe fn _mm_shuffle_epi32(a: __m128i, imm8: i32) -> __m128i { @@ -1068,7 +1068,7 @@ pub unsafe fn _mm_shuffle_epi32(a: __m128i, imm8: i32) -> __m128i { /// /// Put the results in the high 64 bits of the returned vector, with the low 64 /// bits being copied from from `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pshufhw, imm8 = 9))] pub unsafe fn _mm_shufflehi_epi16(a: __m128i, imm8: i32) -> __m128i { @@ -1126,7 +1126,7 @@ pub unsafe fn _mm_shufflehi_epi16(a: __m128i, imm8: i32) -> __m128i { /// /// Put the results in the low 64 bits of the returned vector, with the high 64 /// bits being copied from from `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(pshuflw, imm8 = 9))] pub unsafe fn _mm_shufflelo_epi16(a: __m128i, imm8: i32) -> __m128i { @@ -1179,7 +1179,7 @@ pub unsafe fn _mm_shufflelo_epi16(a: __m128i, imm8: i32) -> __m128i { } /// Unpack and interleave 8-bit integers from the high half of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(punpckhbw))] pub unsafe fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i { @@ -1191,7 +1191,7 @@ pub unsafe fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i { } /// Unpack and interleave 16-bit integers from the high half of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(punpckhwd))] pub unsafe fn _mm_unpackhi_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -1200,7 +1200,7 @@ pub unsafe fn _mm_unpackhi_epi16(a: __m128i, b: __m128i) -> __m128i { } /// Unpack and interleave 32-bit integers from the high half of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(punpckhdq))] pub unsafe fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i { @@ -1208,7 +1208,7 @@ pub unsafe fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i { } /// Unpack and interleave 64-bit integers from the high half of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(punpckhqdq))] pub unsafe fn _mm_unpackhi_epi64(a: __m128i, b: __m128i) -> __m128i { @@ -1216,7 +1216,7 @@ pub unsafe fn _mm_unpackhi_epi64(a: __m128i, b: __m128i) -> __m128i { } /// Unpack and interleave 8-bit integers from the low half of `a` and `b`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(punpcklbw))] pub unsafe fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i { @@ -1228,7 +1228,7 @@ pub unsafe fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i { } /// Unpack and interleave 16-bit integers from the low half of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(punpcklwd))] pub unsafe fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -1237,7 +1237,7 @@ pub unsafe fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i { } /// Unpack and interleave 32-bit integers from the low half of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(punpckldq))] pub unsafe fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i { @@ -1245,7 +1245,7 @@ pub unsafe fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i { } /// Unpack and interleave 64-bit integers from the low half of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(punpcklqdq))] pub unsafe fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i { @@ -1254,7 +1254,7 @@ pub unsafe fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i { /// Return a new vector with the low element of `a` replaced by the sum of the /// low elements of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(addsd))] pub unsafe fn _mm_add_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1263,7 +1263,7 @@ pub unsafe fn _mm_add_sd(a: __m128d, b: __m128d) -> __m128d { /// Add packed double-precision (64-bit) floating-point elements in `a` and /// `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(addpd))] pub unsafe fn _mm_add_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1272,7 +1272,7 @@ pub unsafe fn _mm_add_pd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the result of /// diving the lower element of `a` by the lower element of `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(divsd))] pub unsafe fn _mm_div_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1281,7 +1281,7 @@ pub unsafe fn _mm_div_sd(a: __m128d, b: __m128d) -> __m128d { /// Divide packed double-precision (64-bit) floating-point elements in `a` by /// packed elements in `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(divpd))] pub unsafe fn _mm_div_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1290,7 +1290,7 @@ pub unsafe fn _mm_div_pd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the maximum /// of the lower elements of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(maxsd))] pub unsafe fn _mm_max_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1299,7 +1299,7 @@ pub unsafe fn _mm_max_sd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the maximum values from corresponding elements in /// `a` and `b`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(maxpd))] pub unsafe fn _mm_max_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1308,7 +1308,7 @@ pub unsafe fn _mm_max_pd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the minimum /// of the lower elements of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(minsd))] pub unsafe fn _mm_min_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1317,7 +1317,7 @@ pub unsafe fn _mm_min_sd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the minimum values from corresponding elements in /// `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(minpd))] pub unsafe fn _mm_min_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1326,7 +1326,7 @@ pub unsafe fn _mm_min_pd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by multiplying the /// low elements of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(mulsd))] pub unsafe fn _mm_mul_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1335,7 +1335,7 @@ pub unsafe fn _mm_mul_sd(a: __m128d, b: __m128d) -> __m128d { /// Multiply packed double-precision (64-bit) floating-point elements in `a` /// and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(mulpd))] pub unsafe fn _mm_mul_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1344,7 +1344,7 @@ pub unsafe fn _mm_mul_pd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the square /// root of the lower element `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(sqrtsd))] pub unsafe fn _mm_sqrt_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1352,7 +1352,7 @@ pub unsafe fn _mm_sqrt_sd(a: __m128d, b: __m128d) -> __m128d { } /// Return a new vector with the square root of each of the values in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(sqrtpd))] pub unsafe fn _mm_sqrt_pd(a: __m128d) -> __m128d { @@ -1361,7 +1361,7 @@ pub unsafe fn _mm_sqrt_pd(a: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by subtracting the /// low element by `b` from the low element of `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(subsd))] pub unsafe fn _mm_sub_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1370,7 +1370,7 @@ pub unsafe fn _mm_sub_sd(a: __m128d, b: __m128d) -> __m128d { /// Subtract packed double-precision (64-bit) floating-point elements in `b` /// from `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(subpd))] pub unsafe fn _mm_sub_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1379,7 +1379,7 @@ pub unsafe fn _mm_sub_pd(a: __m128d, b: __m128d) -> __m128d { /// Compute the bitwise AND of packed double-precision (64-bit) floating-point /// elements in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(andps))] pub unsafe fn _mm_and_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1389,7 +1389,7 @@ pub unsafe fn _mm_and_pd(a: __m128d, b: __m128d) -> __m128d { } /// Compute the bitwise NOT of `a` and then AND with `b`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(andnps))] pub unsafe fn _mm_andnot_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1399,7 +1399,7 @@ pub unsafe fn _mm_andnot_pd(a: __m128d, b: __m128d) -> __m128d { } /// Compute the bitwise OR of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(orps))] pub unsafe fn _mm_or_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1409,7 +1409,7 @@ pub unsafe fn _mm_or_pd(a: __m128d, b: __m128d) -> __m128d { } /// Compute the bitwise OR of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(xorps))] pub unsafe fn _mm_xor_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1420,7 +1420,7 @@ pub unsafe fn _mm_xor_pd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the equality /// comparison of the lower elements of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpeqsd))] pub unsafe fn _mm_cmpeq_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1429,7 +1429,7 @@ pub unsafe fn _mm_cmpeq_sd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the less-than /// comparison of the lower elements of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpltsd))] pub unsafe fn _mm_cmplt_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1438,7 +1438,7 @@ pub unsafe fn _mm_cmplt_sd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the /// less-than-or-equal comparison of the lower elements of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmplesd))] pub unsafe fn _mm_cmple_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1447,7 +1447,7 @@ pub unsafe fn _mm_cmple_sd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the /// greater-than comparison of the lower elements of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpltsd))] pub unsafe fn _mm_cmpgt_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1456,7 +1456,7 @@ pub unsafe fn _mm_cmpgt_sd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the /// greater-than-or-equal comparison of the lower elements of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmplesd))] pub unsafe fn _mm_cmpge_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1467,7 +1467,7 @@ pub unsafe fn _mm_cmpge_sd(a: __m128d, b: __m128d) -> __m128d { /// of comparing both of the lower elements of `a` and `b` to `NaN`. If /// neither are equal to `NaN` then `0xFFFFFFFFFFFFFFFF` is used and `0` /// otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpordsd))] pub unsafe fn _mm_cmpord_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1477,7 +1477,7 @@ pub unsafe fn _mm_cmpord_sd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the result of /// comparing both of the lower elements of `a` and `b` to `NaN`. If either is /// equal to `NaN` then `0xFFFFFFFFFFFFFFFF` is used and `0` otherwise. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpunordsd))] pub unsafe fn _mm_cmpunord_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1486,7 +1486,7 @@ pub unsafe fn _mm_cmpunord_sd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the not-equal /// comparison of the lower elements of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpneqsd))] pub unsafe fn _mm_cmpneq_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1495,7 +1495,7 @@ pub unsafe fn _mm_cmpneq_sd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the /// not-less-than comparison of the lower elements of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpnltsd))] pub unsafe fn _mm_cmpnlt_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1504,7 +1504,7 @@ pub unsafe fn _mm_cmpnlt_sd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the /// not-less-than-or-equal comparison of the lower elements of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpnlesd))] pub unsafe fn _mm_cmpnle_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1513,7 +1513,7 @@ pub unsafe fn _mm_cmpnle_sd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the /// not-greater-than comparison of the lower elements of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpnltsd))] pub unsafe fn _mm_cmpngt_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1522,7 +1522,7 @@ pub unsafe fn _mm_cmpngt_sd(a: __m128d, b: __m128d) -> __m128d { /// Return a new vector with the low element of `a` replaced by the /// not-greater-than-or-equal comparison of the lower elements of `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpnlesd))] pub unsafe fn _mm_cmpnge_sd(a: __m128d, b: __m128d) -> __m128d { @@ -1530,7 +1530,7 @@ pub unsafe fn _mm_cmpnge_sd(a: __m128d, b: __m128d) -> __m128d { } /// Compare corresponding elements in `a` and `b` for equality. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpeqpd))] pub unsafe fn _mm_cmpeq_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1538,7 +1538,7 @@ pub unsafe fn _mm_cmpeq_pd(a: __m128d, b: __m128d) -> __m128d { } /// Compare corresponding elements in `a` and `b` for less-than. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpltpd))] pub unsafe fn _mm_cmplt_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1546,7 +1546,7 @@ pub unsafe fn _mm_cmplt_pd(a: __m128d, b: __m128d) -> __m128d { } /// Compare corresponding elements in `a` and `b` for less-than-or-equal -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmplepd))] pub unsafe fn _mm_cmple_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1554,7 +1554,7 @@ pub unsafe fn _mm_cmple_pd(a: __m128d, b: __m128d) -> __m128d { } /// Compare corresponding elements in `a` and `b` for greater-than. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpltpd))] pub unsafe fn _mm_cmpgt_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1562,7 +1562,7 @@ pub unsafe fn _mm_cmpgt_pd(a: __m128d, b: __m128d) -> __m128d { } /// Compare corresponding elements in `a` and `b` for greater-than-or-equal. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmplepd))] pub unsafe fn _mm_cmpge_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1570,7 +1570,7 @@ pub unsafe fn _mm_cmpge_pd(a: __m128d, b: __m128d) -> __m128d { } /// Compare corresponding elements in `a` and `b` to see if neither is `NaN`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpordpd))] pub unsafe fn _mm_cmpord_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1578,7 +1578,7 @@ pub unsafe fn _mm_cmpord_pd(a: __m128d, b: __m128d) -> __m128d { } /// Compare corresponding elements in `a` and `b` to see if either is `NaN`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpunordpd))] pub unsafe fn _mm_cmpunord_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1586,7 +1586,7 @@ pub unsafe fn _mm_cmpunord_pd(a: __m128d, b: __m128d) -> __m128d { } /// Compare corresponding elements in `a` and `b` for not-equal. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpneqpd))] pub unsafe fn _mm_cmpneq_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1594,7 +1594,7 @@ pub unsafe fn _mm_cmpneq_pd(a: __m128d, b: __m128d) -> __m128d { } /// Compare corresponding elements in `a` and `b` for not-less-than. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpnltpd))] pub unsafe fn _mm_cmpnlt_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1602,7 +1602,7 @@ pub unsafe fn _mm_cmpnlt_pd(a: __m128d, b: __m128d) -> __m128d { } /// Compare corresponding elements in `a` and `b` for not-less-than-or-equal. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpnlepd))] pub unsafe fn _mm_cmpnle_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1610,7 +1610,7 @@ pub unsafe fn _mm_cmpnle_pd(a: __m128d, b: __m128d) -> __m128d { } /// Compare corresponding elements in `a` and `b` for not-greater-than. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpnltpd))] pub unsafe fn _mm_cmpngt_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1619,7 +1619,7 @@ pub unsafe fn _mm_cmpngt_pd(a: __m128d, b: __m128d) -> __m128d { /// Compare corresponding elements in `a` and `b` for /// not-greater-than-or-equal. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cmpnlepd))] pub unsafe fn _mm_cmpnge_pd(a: __m128d, b: __m128d) -> __m128d { @@ -1627,7 +1627,7 @@ pub unsafe fn _mm_cmpnge_pd(a: __m128d, b: __m128d) -> __m128d { } /// Compare the lower element of `a` and `b` for equality. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(comisd))] pub unsafe fn _mm_comieq_sd(a: __m128d, b: __m128d) -> bool { @@ -1635,7 +1635,7 @@ pub unsafe fn _mm_comieq_sd(a: __m128d, b: __m128d) -> bool { } /// Compare the lower element of `a` and `b` for less-than. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(comisd))] pub unsafe fn _mm_comilt_sd(a: __m128d, b: __m128d) -> bool { @@ -1643,7 +1643,7 @@ pub unsafe fn _mm_comilt_sd(a: __m128d, b: __m128d) -> bool { } /// Compare the lower element of `a` and `b` for less-than-or-equal. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(comisd))] pub unsafe fn _mm_comile_sd(a: __m128d, b: __m128d) -> bool { @@ -1651,7 +1651,7 @@ pub unsafe fn _mm_comile_sd(a: __m128d, b: __m128d) -> bool { } /// Compare the lower element of `a` and `b` for greater-than. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(comisd))] pub unsafe fn _mm_comigt_sd(a: __m128d, b: __m128d) -> bool { @@ -1659,7 +1659,7 @@ pub unsafe fn _mm_comigt_sd(a: __m128d, b: __m128d) -> bool { } /// Compare the lower element of `a` and `b` for greater-than-or-equal. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(comisd))] pub unsafe fn _mm_comige_sd(a: __m128d, b: __m128d) -> bool { @@ -1667,7 +1667,7 @@ pub unsafe fn _mm_comige_sd(a: __m128d, b: __m128d) -> bool { } /// Compare the lower element of `a` and `b` for not-equal. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(comisd))] pub unsafe fn _mm_comineq_sd(a: __m128d, b: __m128d) -> bool { @@ -1675,7 +1675,7 @@ pub unsafe fn _mm_comineq_sd(a: __m128d, b: __m128d) -> bool { } /// Compare the lower element of `a` and `b` for equality. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(ucomisd))] pub unsafe fn _mm_ucomieq_sd(a: __m128d, b: __m128d) -> bool { @@ -1683,7 +1683,7 @@ pub unsafe fn _mm_ucomieq_sd(a: __m128d, b: __m128d) -> bool { } /// Compare the lower element of `a` and `b` for less-than. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(ucomisd))] pub unsafe fn _mm_ucomilt_sd(a: __m128d, b: __m128d) -> bool { @@ -1691,7 +1691,7 @@ pub unsafe fn _mm_ucomilt_sd(a: __m128d, b: __m128d) -> bool { } /// Compare the lower element of `a` and `b` for less-than-or-equal. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(ucomisd))] pub unsafe fn _mm_ucomile_sd(a: __m128d, b: __m128d) -> bool { @@ -1699,7 +1699,7 @@ pub unsafe fn _mm_ucomile_sd(a: __m128d, b: __m128d) -> bool { } /// Compare the lower element of `a` and `b` for greater-than. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(ucomisd))] pub unsafe fn _mm_ucomigt_sd(a: __m128d, b: __m128d) -> bool { @@ -1707,7 +1707,7 @@ pub unsafe fn _mm_ucomigt_sd(a: __m128d, b: __m128d) -> bool { } /// Compare the lower element of `a` and `b` for greater-than-or-equal. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(ucomisd))] pub unsafe fn _mm_ucomige_sd(a: __m128d, b: __m128d) -> bool { @@ -1715,7 +1715,7 @@ pub unsafe fn _mm_ucomige_sd(a: __m128d, b: __m128d) -> bool { } /// Compare the lower element of `a` and `b` for not-equal. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(ucomisd))] pub unsafe fn _mm_ucomineq_sd(a: __m128d, b: __m128d) -> bool { @@ -1724,7 +1724,7 @@ pub unsafe fn _mm_ucomineq_sd(a: __m128d, b: __m128d) -> bool { /// Convert packed double-precision (64-bit) floating-point elements in "a" to /// packed single-precision (32-bit) floating-point elements -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtpd2ps))] pub unsafe fn _mm_cvtpd_ps(a: __m128d) -> __m128 { @@ -1734,7 +1734,7 @@ pub unsafe fn _mm_cvtpd_ps(a: __m128d) -> __m128 { /// Convert packed single-precision (32-bit) floating-point elements in `a` to /// packed /// double-precision (64-bit) floating-point elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtps2pd))] pub unsafe fn _mm_cvtps_pd(a: __m128) -> __m128d { @@ -1743,7 +1743,7 @@ pub unsafe fn _mm_cvtps_pd(a: __m128) -> __m128d { /// Convert packed double-precision (64-bit) floating-point elements in `a` to /// packed 32-bit integers. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtpd2dq))] pub unsafe fn _mm_cvtpd_epi32(a: __m128d) -> __m128i { @@ -1752,7 +1752,7 @@ pub unsafe fn _mm_cvtpd_epi32(a: __m128d) -> __m128i { /// Convert the lower double-precision (64-bit) floating-point element in a to /// a 32-bit integer. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtsd2si))] pub unsafe fn _mm_cvtsd_si32(a: __m128d) -> i32 { @@ -1763,7 +1763,7 @@ pub unsafe fn _mm_cvtsd_si32(a: __m128d) -> i32 { /// to a single-precision (32-bit) floating-point element, store the result in /// the lower element of the return value, and copy the upper element from `a` /// to the upper element the return value. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtsd2ss))] pub unsafe fn _mm_cvtsd_ss(a: __m128, b: __m128d) -> __m128 { @@ -1771,7 +1771,7 @@ pub unsafe fn _mm_cvtsd_ss(a: __m128, b: __m128d) -> __m128 { } /// Return the lower double-precision (64-bit) floating-point element of "a". -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(all(test, windows), assert_instr(movsd))] // FIXME movq/movlps/mov on other platform pub unsafe fn _mm_cvtsd_f64(a: __m128d) -> f64 { @@ -1782,7 +1782,7 @@ pub unsafe fn _mm_cvtsd_f64(a: __m128d) -> f64 { /// to a double-precision (64-bit) floating-point element, store the result in /// the lower element of the return value, and copy the upper element from `a` /// to the upper element the return value. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtss2sd))] pub unsafe fn _mm_cvtss_sd(a: __m128d, b: __m128) -> __m128d { @@ -1791,7 +1791,7 @@ pub unsafe fn _mm_cvtss_sd(a: __m128d, b: __m128) -> __m128d { /// Convert packed double-precision (64-bit) floating-point elements in `a` to /// packed 32-bit integers with truncation. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvttpd2dq))] pub unsafe fn _mm_cvttpd_epi32(a: __m128d) -> __m128i { @@ -1800,7 +1800,7 @@ pub unsafe fn _mm_cvttpd_epi32(a: __m128d) -> __m128i { /// Convert the lower double-precision (64-bit) floating-point element in `a` /// to a 32-bit integer with truncation. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvttsd2si))] pub unsafe fn _mm_cvttsd_si32(a: __m128d) -> i32 { @@ -1809,7 +1809,7 @@ pub unsafe fn _mm_cvttsd_si32(a: __m128d) -> i32 { /// Convert packed single-precision (32-bit) floating-point elements in `a` to /// packed 32-bit integers with truncation. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvttps2dq))] pub unsafe fn _mm_cvttps_epi32(a: __m128) -> __m128i { @@ -1818,7 +1818,7 @@ pub unsafe fn _mm_cvttps_epi32(a: __m128) -> __m128i { /// Copy double-precision (64-bit) floating-point element `a` to the lower /// element of the packed 64-bit return value. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_set_sd(a: f64) -> __m128d { _mm_set_pd(0.0, a) @@ -1826,7 +1826,7 @@ pub unsafe fn _mm_set_sd(a: f64) -> __m128d { /// Broadcast double-precision (64-bit) floating-point value a to all elements /// of the return value. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_set1_pd(a: f64) -> __m128d { _mm_set_pd(a, a) @@ -1834,7 +1834,7 @@ pub unsafe fn _mm_set1_pd(a: f64) -> __m128d { /// Broadcast double-precision (64-bit) floating-point value a to all elements /// of the return value. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_set_pd1(a: f64) -> __m128d { _mm_set_pd(a, a) @@ -1842,7 +1842,7 @@ pub unsafe fn _mm_set_pd1(a: f64) -> __m128d { /// Set packed double-precision (64-bit) floating-point elements in the return /// value with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_set_pd(a: f64, b: f64) -> __m128d { __m128d(b, a) @@ -1850,7 +1850,7 @@ pub unsafe fn _mm_set_pd(a: f64, b: f64) -> __m128d { /// Set packed double-precision (64-bit) floating-point elements in the return /// value with the supplied values in reverse order. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_setr_pd(a: f64, b: f64) -> __m128d { _mm_set_pd(b, a) @@ -1858,7 +1858,7 @@ pub unsafe fn _mm_setr_pd(a: f64, b: f64) -> __m128d { /// Returns packed double-precision (64-bit) floating-point elements with all /// zeros. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(xorps))] // FIXME xorpd expected pub unsafe fn _mm_setzero_pd() -> __m128d { @@ -1869,7 +1869,7 @@ pub unsafe fn _mm_setzero_pd() -> __m128d { /// /// The mask is stored in the 2 least significant bits of the return value. /// All other bits are set to `0`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movmskpd))] pub unsafe fn _mm_movemask_pd(a: __m128d) -> i32 { @@ -1880,7 +1880,7 @@ pub unsafe fn _mm_movemask_pd(a: __m128d) -> i32 { /// floating-point elements) from memory into the returned vector. /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection /// exception may be generated. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movaps))] pub unsafe fn _mm_load_pd(mem_addr: *const f64) -> __m128d { @@ -1889,7 +1889,7 @@ pub unsafe fn _mm_load_pd(mem_addr: *const f64) -> __m128d { /// Loads a 64-bit double-precision value to the low element of a /// 128-bit integer vector and clears the upper element. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movsd))] pub unsafe fn _mm_load_sd(mem_addr: *const f64) -> __m128d { @@ -1899,7 +1899,7 @@ pub unsafe fn _mm_load_sd(mem_addr: *const f64) -> __m128d { /// Loads a double-precision value into the high-order bits of a 128-bit /// vector of [2 x double]. The low-order bits are copied from the low-order /// bits of the first operand. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movhpd))] pub unsafe fn _mm_loadh_pd(a: __m128d, mem_addr: *const f64) -> __m128d { @@ -1909,7 +1909,7 @@ pub unsafe fn _mm_loadh_pd(a: __m128d, mem_addr: *const f64) -> __m128d { /// Loads a double-precision value into the low-order bits of a 128-bit /// vector of [2 x double]. The high-order bits are copied from the /// high-order bits of the first operand. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movlpd))] pub unsafe fn _mm_loadl_pd(a: __m128d, mem_addr: *const f64) -> __m128d { @@ -1920,7 +1920,7 @@ pub unsafe fn _mm_loadl_pd(a: __m128d, mem_addr: *const f64) -> __m128d { /// aligned memory location. /// To minimize caching, the data is flagged as non-temporal (unlikely to be /// used again soon). -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movntps))] // FIXME movntpd pub unsafe fn _mm_stream_pd(mem_addr: *mut f64, a: __m128d) { @@ -1929,7 +1929,7 @@ pub unsafe fn _mm_stream_pd(mem_addr: *mut f64, a: __m128d) { /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a /// memory location. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(all(test, not(windows)), assert_instr(movlps))] // FIXME movsd only on windows pub unsafe fn _mm_store_sd(mem_addr: *mut f64, a: __m128d) { @@ -1939,7 +1939,7 @@ pub unsafe fn _mm_store_sd(mem_addr: *mut f64, a: __m128d) { /// Store 128-bits (composed of 2 packed double-precision (64-bit) /// floating-point elements) from `a` into memory. `mem_addr` must be aligned /// on a 16-byte boundary or a general-protection exception may be generated. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movaps))] pub unsafe fn _mm_store_pd(mem_addr: *mut f64, a: __m128d) { @@ -1949,7 +1949,7 @@ pub unsafe fn _mm_store_pd(mem_addr: *mut f64, a: __m128d) { /// Store 128-bits (composed of 2 packed double-precision (64-bit) /// floating-point elements) from `a` into memory. /// `mem_addr` does not need to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movups))] // FIXME movupd expected pub unsafe fn _mm_storeu_pd(mem_addr: *mut f64, a: __m128d) { @@ -1959,7 +1959,7 @@ pub unsafe fn _mm_storeu_pd(mem_addr: *mut f64, a: __m128d) { /// Store the lower double-precision (64-bit) floating-point element from `a` /// into 2 contiguous elements in memory. `mem_addr` must be aligned on a /// 16-byte boundary or a general-protection exception may be generated. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_store1_pd(mem_addr: *mut f64, a: __m128d) { let b: __m128d = simd_shuffle2(a, a, [0, 0]); @@ -1969,7 +1969,7 @@ pub unsafe fn _mm_store1_pd(mem_addr: *mut f64, a: __m128d) { /// Store the lower double-precision (64-bit) floating-point element from `a` /// into 2 contiguous elements in memory. 
`mem_addr` must be aligned on a /// 16-byte boundary or a general-protection exception may be generated. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_store_pd1(mem_addr: *mut f64, a: __m128d) { let b: __m128d = simd_shuffle2(a, a, [0, 0]); @@ -1980,7 +1980,7 @@ pub unsafe fn _mm_store_pd1(mem_addr: *mut f64, a: __m128d) { /// memory in reverse order. /// `mem_addr` must be aligned on a 16-byte boundary or a general-protection /// exception may be generated. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_storer_pd(mem_addr: *mut f64, a: __m128d) { let b: __m128d = simd_shuffle2(a, a, [1, 0]); @@ -1989,7 +1989,7 @@ pub unsafe fn _mm_storer_pd(mem_addr: *mut f64, a: __m128d) { /// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a /// memory location. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movhpd))] pub unsafe fn _mm_storeh_pd(mem_addr: *mut f64, a: __m128d) { @@ -1998,7 +1998,7 @@ pub unsafe fn _mm_storeh_pd(mem_addr: *mut f64, a: __m128d) { /// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a /// memory location. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(all(test, not(windows)), assert_instr(movlps))] // FIXME movlpd (movsd on windows) pub unsafe fn _mm_storel_pd(mem_addr: *mut f64, a: __m128d) { @@ -2007,7 +2007,7 @@ pub unsafe fn _mm_storel_pd(mem_addr: *mut f64, a: __m128d) { /// Load a double-precision (64-bit) floating-point element from memory /// into both elements of returned vector. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // #[cfg_attr(test, assert_instr(movapd))] // FIXME LLVM uses different codegen pub unsafe fn _mm_load1_pd(mem_addr: *const f64) -> __m128d { @@ -2017,7 +2017,7 @@ pub unsafe fn _mm_load1_pd(mem_addr: *const f64) -> __m128d { /// Load a double-precision (64-bit) floating-point element from memory /// into both elements of returned vector. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] // #[cfg_attr(test, assert_instr(movapd))] // FIXME same as _mm_load1_pd pub unsafe fn _mm_load_pd1(mem_addr: *const f64) -> __m128d { @@ -2027,7 +2027,7 @@ pub unsafe fn _mm_load_pd1(mem_addr: *const f64) -> __m128d { /// Load 2 double-precision (64-bit) floating-point elements from memory into /// the returned vector in reverse order. `mem_addr` must be aligned on a /// 16-byte boundary or a general-protection exception may be generated. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movapd))] pub unsafe fn _mm_loadr_pd(mem_addr: *const f64) -> __m128d { @@ -2038,7 +2038,7 @@ pub unsafe fn _mm_loadr_pd(mem_addr: *const f64) -> __m128d { /// Load 128-bits (composed of 2 packed double-precision (64-bit) /// floating-point elements) from memory into the returned vector. /// `mem_addr` does not need to be aligned on any particular boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movups))] pub unsafe fn _mm_loadu_pd(mem_addr: *const f64) -> __m128d { @@ -2054,7 +2054,7 @@ pub unsafe fn _mm_loadu_pd(mem_addr: *const f64) -> __m128d { /// Constructs a 128-bit floating-point vector of [2 x double] from two /// 128-bit vector parameters of [2 x double], using the immediate-value /// parameter as a specifier. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(shufpd, imm8 = 1))] pub unsafe fn _mm_shuffle_pd(a: __m128d, b: __m128d, imm8: i32) -> __m128d { @@ -2069,7 +2069,7 @@ pub unsafe fn _mm_shuffle_pd(a: __m128d, b: __m128d, imm8: i32) -> __m128d { /// Constructs a 128-bit floating-point vector of [2 x double]. The lower /// 64 bits are set to the lower 64 bits of the second parameter. The upper /// 64 bits are set to the upper 64 bits of the first parameter. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movsd))] pub unsafe fn _mm_move_sd(a: __m128d, b: __m128d) -> __m128d { @@ -2078,7 +2078,7 @@ pub unsafe fn _mm_move_sd(a: __m128d, b: __m128d) -> __m128d { /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit /// floating-point vector of [4 x float]. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_castpd_ps(a: __m128d) -> __m128 { mem::transmute(a) @@ -2086,7 +2086,7 @@ pub unsafe fn _mm_castpd_ps(a: __m128d) -> __m128 { /// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit /// integer vector. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_castpd_si128(a: __m128d) -> __m128i { mem::transmute::(simd_cast(a)) @@ -2094,7 +2094,7 @@ pub unsafe fn _mm_castpd_si128(a: __m128d) -> __m128i { /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit /// floating-point vector of [2 x double]. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_castps_pd(a: __m128) -> __m128d { mem::transmute(a) @@ -2102,7 +2102,7 @@ pub unsafe fn _mm_castps_pd(a: __m128) -> __m128d { /// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit /// integer vector. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_castps_si128(a: __m128) -> __m128i { mem::transmute(a) @@ -2110,7 +2110,7 @@ pub unsafe fn _mm_castps_si128(a: __m128) -> __m128i { /// Casts a 128-bit integer vector into a 128-bit floating-point vector /// of [2 x double]. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_castsi128_pd(a: __m128i) -> __m128d { simd_cast(a.as_i64x2()) @@ -2118,21 +2118,21 @@ pub unsafe fn _mm_castsi128_pd(a: __m128i) -> __m128d { /// Casts a 128-bit integer vector into a 128-bit floating-point vector /// of [4 x float]. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_castsi128_ps(a: __m128i) -> __m128 { mem::transmute(a) } /// Return vector of type __m128d with undefined elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_undefined_pd() -> __m128d { _mm_set1_pd(mem::uninitialized()) } /// Return vector of type __m128i with undefined elements. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] pub unsafe fn _mm_undefined_si128() -> __m128i { _mm_set1_epi8(mem::uninitialized()) @@ -2143,7 +2143,7 @@ pub unsafe fn _mm_undefined_si128() -> __m128i { /// /// * The [127:64] bits are copied from the [127:64] bits of the second input /// * The [63:0] bits are copied from the [127:64] bits of the first input -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(unpckhpd))] pub unsafe fn _mm_unpackhi_pd(a: __m128d, b: __m128d) -> __m128d { @@ -2155,7 +2155,7 @@ pub unsafe fn _mm_unpackhi_pd(a: __m128d, b: __m128d) -> __m128d { /// /// * The [127:64] bits are copied from the [63:0] bits of the second input /// * The [63:0] bits are copied from the [63:0] bits of the first input -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(unpcklpd))] pub unsafe fn _mm_unpacklo_pd(a: __m128d, b: __m128d) -> __m128d { diff --git a/coresimd/src/x86/i586/sse3.rs b/coresimd/src/x86/i586/sse3.rs index cf26319612..f74341dffe 100644 --- a/coresimd/src/x86/i586/sse3.rs +++ b/coresimd/src/x86/i586/sse3.rs @@ -9,7 +9,7 @@ use stdsimd_test::assert_instr; /// Alternatively add and subtract packed single-precision (32-bit) /// floating-point elements in `a` to/from packed elements in `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse3")] #[cfg_attr(test, assert_instr(addsubps))] pub unsafe fn _mm_addsub_ps(a: __m128, b: __m128) -> __m128 { @@ -18,7 +18,7 @@ pub unsafe fn _mm_addsub_ps(a: __m128, b: __m128) -> __m128 { /// Alternatively add and subtract packed double-precision (64-bit) /// floating-point elements in `a` to/from packed elements in `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse3")] #[cfg_attr(test, assert_instr(addsubpd))] pub unsafe fn _mm_addsub_pd(a: __m128d, b: __m128d) -> __m128d { @@ -27,7 +27,7 @@ pub unsafe fn _mm_addsub_pd(a: __m128d, b: __m128d) -> __m128d { /// Horizontally add adjacent pairs of double-precision (64-bit) /// floating-point elements in `a` and `b`, and pack the results. -#[inline(always)] +#[inline] #[target_feature(enable = "sse3")] #[cfg_attr(test, assert_instr(haddpd))] pub unsafe fn _mm_hadd_pd(a: __m128d, b: __m128d) -> __m128d { @@ -36,7 +36,7 @@ pub unsafe fn _mm_hadd_pd(a: __m128d, b: __m128d) -> __m128d { /// Horizontally add adjacent pairs of single-precision (32-bit) /// floating-point elements in `a` and `b`, and pack the results. -#[inline(always)] +#[inline] #[target_feature(enable = "sse3")] #[cfg_attr(test, assert_instr(haddps))] pub unsafe fn _mm_hadd_ps(a: __m128, b: __m128) -> __m128 { @@ -45,7 +45,7 @@ pub unsafe fn _mm_hadd_ps(a: __m128, b: __m128) -> __m128 { /// Horizontally subtract adjacent pairs of double-precision (64-bit) /// floating-point elements in `a` and `b`, and pack the results. -#[inline(always)] +#[inline] #[target_feature(enable = "sse3")] #[cfg_attr(test, assert_instr(hsubpd))] pub unsafe fn _mm_hsub_pd(a: __m128d, b: __m128d) -> __m128d { @@ -54,7 +54,7 @@ pub unsafe fn _mm_hsub_pd(a: __m128d, b: __m128d) -> __m128d { /// Horizontally add adjacent pairs of single-precision (32-bit) /// floating-point elements in `a` and `b`, and pack the results. -#[inline(always)] +#[inline] #[target_feature(enable = "sse3")] #[cfg_attr(test, assert_instr(hsubps))] pub unsafe fn _mm_hsub_ps(a: __m128, b: __m128) -> __m128 { @@ -64,7 +64,7 @@ pub unsafe fn _mm_hsub_ps(a: __m128, b: __m128) -> __m128 { /// Load 128-bits of integer data from unaligned memory. 
/// This intrinsic may perform better than `_mm_loadu_si128` /// when the data crosses a cache line boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "sse3")] #[cfg_attr(test, assert_instr(lddqu))] pub unsafe fn _mm_lddqu_si128(mem_addr: *const __m128i) -> __m128i { @@ -73,7 +73,7 @@ pub unsafe fn _mm_lddqu_si128(mem_addr: *const __m128i) -> __m128i { /// Duplicate the low double-precision (64-bit) floating-point element /// from `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse3")] #[cfg_attr(test, assert_instr(movddup))] pub unsafe fn _mm_movedup_pd(a: __m128d) -> __m128d { @@ -82,7 +82,7 @@ pub unsafe fn _mm_movedup_pd(a: __m128d) -> __m128d { /// Load a double-precision (64-bit) floating-point element from memory /// into both elements of the return vector. -#[inline(always)] +#[inline] #[target_feature(enable = "sse3")] #[cfg_attr(test, assert_instr(movddup))] pub unsafe fn _mm_loaddup_pd(mem_addr: *const f64) -> __m128d { @@ -91,7 +91,7 @@ pub unsafe fn _mm_loaddup_pd(mem_addr: *const f64) -> __m128d { /// Duplicate odd-indexed single-precision (32-bit) floating-point elements /// from `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse3")] #[cfg_attr(test, assert_instr(movshdup))] pub unsafe fn _mm_movehdup_ps(a: __m128) -> __m128 { @@ -100,7 +100,7 @@ pub unsafe fn _mm_movehdup_ps(a: __m128) -> __m128 { /// Duplicate even-indexed single-precision (32-bit) floating-point elements /// from `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse3")] #[cfg_attr(test, assert_instr(movsldup))] pub unsafe fn _mm_moveldup_ps(a: __m128) -> __m128 { diff --git a/coresimd/src/x86/i586/sse41.rs b/coresimd/src/x86/i586/sse41.rs index bd63ed3122..4ca8397bd8 100644 --- a/coresimd/src/x86/i586/sse41.rs +++ b/coresimd/src/x86/i586/sse41.rs @@ -47,7 +47,7 @@ pub const _MM_FROUND_NEARBYINT: i32 = /// The high bit of each corresponding mask byte determines the selection. /// If the high bit is set the element of `b` is selected. The element /// of `a` is selected otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pblendvb))] pub unsafe fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i { @@ -59,7 +59,7 @@ pub unsafe fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i /// The mask bits determine the selection. A clear bit selects the /// corresponding element of `a`, and a set bit the corresponding /// element of `b`.
-#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pblendw, imm8 = 0xF0))] pub unsafe fn _mm_blend_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i { @@ -73,7 +73,7 @@ pub unsafe fn _mm_blend_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i { /// Blend packed double-precision (64-bit) floating-point elements from `a` /// and `b` using `mask` -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(blendvpd))] pub unsafe fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d { @@ -82,7 +82,7 @@ pub unsafe fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d { /// Blend packed single-precision (32-bit) floating-point elements from `a` /// and `b` using `mask` -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(blendvps))] pub unsafe fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 { @@ -91,7 +91,7 @@ pub unsafe fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128 { /// Blend packed double-precision (64-bit) floating-point elements from `a` /// and `b` using control mask `imm2` -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(blendpd, imm2 = 0b10))] pub unsafe fn _mm_blend_pd(a: __m128d, b: __m128d, imm2: i32) -> __m128d { @@ -103,7 +103,7 @@ pub unsafe fn _mm_blend_pd(a: __m128d, b: __m128d, imm2: i32) -> __m128d { /// Blend packed single-precision (32-bit) floating-point elements from `a` /// and `b` using mask `imm4` -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(blendps, imm4 = 0b0101))] pub unsafe fn _mm_blend_ps(a: __m128, b: __m128, imm4: i32) -> __m128 { @@ -115,7 +115,7 @@ pub unsafe fn _mm_blend_ps(a: __m128, b: __m128, imm4: i32) -> __m128 { /// Extract a single-precision (32-bit) floating-point element from `a`, /// selected with `imm8` -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] // TODO: Add test for Windows #[cfg_attr(all(test, not(windows)), assert_instr(extractps, imm8 = 0))] @@ -127,7 +127,7 @@ pub unsafe fn _mm_extract_ps(a: __m128, imm8: i32) -> i32 { /// integer containing the zero-extended integer data. /// /// See [LLVM commit D20468][https://reviews.llvm.org/D20468]. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pextrb, imm8 = 0))] pub unsafe fn _mm_extract_epi8(a: __m128i, imm8: i32) -> i32 { @@ -136,7 +136,7 @@ pub unsafe fn _mm_extract_epi8(a: __m128i, imm8: i32) -> i32 { } /// Extract an 32-bit integer from `a` selected with `imm8` -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] // TODO: Add test for Windows #[cfg_attr(all(test, not(windows)), assert_instr(pextrd, imm8 = 1))] @@ -167,7 +167,7 @@ pub unsafe fn _mm_extract_epi32(a: __m128i, imm8: i32) -> i32 { /// /// * Bits `[3:0]`: If any of these bits are set, the corresponding result /// element is cleared. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(insertps, imm8 = 0b1010))] pub unsafe fn _mm_insert_ps(a: __m128, b: __m128, imm8: i32) -> __m128 { @@ -179,7 +179,7 @@ pub unsafe fn _mm_insert_ps(a: __m128, b: __m128, imm8: i32) -> __m128 { /// Return a copy of `a` with the 8-bit integer from `i` inserted at a /// location specified by `imm8`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pinsrb, imm8 = 0))] pub unsafe fn _mm_insert_epi8(a: __m128i, i: i8, imm8: i32) -> __m128i { @@ -188,7 +188,7 @@ pub unsafe fn _mm_insert_epi8(a: __m128i, i: i8, imm8: i32) -> __m128i { /// Return a copy of `a` with the 32-bit integer from `i` inserted at a /// location specified by `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pinsrd, imm8 = 0))] pub unsafe fn _mm_insert_epi32(a: __m128i, i: i32, imm8: i32) -> __m128i { @@ -197,7 +197,7 @@ pub unsafe fn _mm_insert_epi32(a: __m128i, i: i32, imm8: i32) -> __m128i { /// Compare packed 8-bit integers in `a` and `b` and return packed maximum /// values in dst. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmaxsb))] pub unsafe fn _mm_max_epi8(a: __m128i, b: __m128i) -> __m128i { @@ -206,7 +206,7 @@ pub unsafe fn _mm_max_epi8(a: __m128i, b: __m128i) -> __m128i { /// Compare packed unsigned 16-bit integers in `a` and `b`, and return packed /// maximum. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmaxuw))] pub unsafe fn _mm_max_epu16(a: __m128i, b: __m128i) -> __m128i { @@ -215,7 +215,7 @@ pub unsafe fn _mm_max_epu16(a: __m128i, b: __m128i) -> __m128i { /// Compare packed 32-bit integers in `a` and `b`, and return packed maximum /// values. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmaxsd))] pub unsafe fn _mm_max_epi32(a: __m128i, b: __m128i) -> __m128i { @@ -224,7 +224,7 @@ pub unsafe fn _mm_max_epi32(a: __m128i, b: __m128i) -> __m128i { /// Compare packed unsigned 32-bit integers in `a` and `b`, and return packed /// maximum values. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmaxud))] pub unsafe fn _mm_max_epu32(a: __m128i, b: __m128i) -> __m128i { @@ -233,7 +233,7 @@ pub unsafe fn _mm_max_epu32(a: __m128i, b: __m128i) -> __m128i { /// Compare packed 8-bit integers in `a` and `b` and return packed minimum /// values in dst. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pminsb))] pub unsafe fn _mm_min_epi8(a: __m128i, b: __m128i) -> __m128i { @@ -242,7 +242,7 @@ pub unsafe fn _mm_min_epi8(a: __m128i, b: __m128i) -> __m128i { /// Compare packed unsigned 16-bit integers in `a` and `b`, and return packed /// minimum. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pminuw))] pub unsafe fn _mm_min_epu16(a: __m128i, b: __m128i) -> __m128i { @@ -251,7 +251,7 @@ pub unsafe fn _mm_min_epu16(a: __m128i, b: __m128i) -> __m128i { /// Compare packed 32-bit integers in `a` and `b`, and return packed minimum /// values. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pminsd))] pub unsafe fn _mm_min_epi32(a: __m128i, b: __m128i) -> __m128i { @@ -260,7 +260,7 @@ pub unsafe fn _mm_min_epi32(a: __m128i, b: __m128i) -> __m128i { /// Compare packed unsigned 32-bit integers in `a` and `b`, and return packed /// minimum values. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pminud))] pub unsafe fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i { @@ -269,7 +269,7 @@ pub unsafe fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i { /// Convert packed 32-bit integers from `a` and `b` to packed 16-bit integers /// using unsigned saturation -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(packusdw))] pub unsafe fn _mm_packus_epi32(a: __m128i, b: __m128i) -> __m128i { @@ -277,7 +277,7 @@ pub unsafe fn _mm_packus_epi32(a: __m128i, b: __m128i) -> __m128i { } /// Compare packed 64-bit integers in `a` and `b` for equality -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pcmpeqq))] pub unsafe fn _mm_cmpeq_epi64(a: __m128i, b: __m128i) -> __m128i { @@ -285,7 +285,7 @@ pub unsafe fn _mm_cmpeq_epi64(a: __m128i, b: __m128i) -> __m128i { } /// Sign extend packed 8-bit integers in `a` to packed 16-bit integers -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmovsxbw))] pub unsafe fn _mm_cvtepi8_epi16(a: __m128i) -> __m128i { @@ -295,7 +295,7 @@ pub unsafe fn _mm_cvtepi8_epi16(a: __m128i) -> __m128i { } /// Sign extend packed 8-bit integers in `a` to packed 32-bit integers -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmovsxbd))] pub unsafe fn _mm_cvtepi8_epi32(a: __m128i) -> __m128i { @@ -306,7 +306,7 @@ pub unsafe fn _mm_cvtepi8_epi32(a: __m128i) -> __m128i { /// Sign extend packed 8-bit integers in the low 8 bytes of `a` to packed /// 64-bit integers -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmovsxbq))] pub unsafe fn _mm_cvtepi8_epi64(a: __m128i) -> __m128i { @@ -316,7 +316,7 @@ pub unsafe fn _mm_cvtepi8_epi64(a: __m128i) -> __m128i { } /// Sign extend packed 16-bit integers in `a` to packed 32-bit integers -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmovsxwd))] pub unsafe fn _mm_cvtepi16_epi32(a: __m128i) -> __m128i { @@ -326,7 +326,7 @@ pub unsafe fn _mm_cvtepi16_epi32(a: __m128i) -> __m128i { } /// Sign extend packed 16-bit integers in `a` to packed 64-bit integers -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmovsxwq))] pub unsafe fn _mm_cvtepi16_epi64(a: __m128i) -> __m128i { @@ -336,7 +336,7 @@ pub unsafe fn _mm_cvtepi16_epi64(a: __m128i) -> __m128i { } /// Sign extend packed 32-bit integers in `a` to packed 64-bit integers -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmovsxdq))] pub unsafe fn _mm_cvtepi32_epi64(a: __m128i) -> __m128i { @@ -346,7 +346,7 @@ pub unsafe fn _mm_cvtepi32_epi64(a: __m128i) -> __m128i { } /// Zero extend packed unsigned 8-bit integers in `a` to packed 16-bit integers -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmovzxbw))] pub unsafe fn _mm_cvtepu8_epi16(a: __m128i) -> __m128i { @@ -356,7 +356,7 @@ pub unsafe fn _mm_cvtepu8_epi16(a: __m128i) -> __m128i { } /// Zero extend packed unsigned 8-bit integers in `a` to packed 32-bit integers -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmovzxbd))] pub unsafe fn _mm_cvtepu8_epi32(a: __m128i) -> __m128i { @@ -366,7 +366,7 @@ pub unsafe fn _mm_cvtepu8_epi32(a: __m128i) -> __m128i { } /// Zero extend 
packed unsigned 8-bit integers in `a` to packed 64-bit integers -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmovzxbq))] pub unsafe fn _mm_cvtepu8_epi64(a: __m128i) -> __m128i { @@ -377,7 +377,7 @@ pub unsafe fn _mm_cvtepu8_epi64(a: __m128i) -> __m128i { /// Zero extend packed unsigned 16-bit integers in `a` /// to packed 32-bit integers -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmovzxwd))] pub unsafe fn _mm_cvtepu16_epi32(a: __m128i) -> __m128i { @@ -388,7 +388,7 @@ pub unsafe fn _mm_cvtepu16_epi32(a: __m128i) -> __m128i { /// Zero extend packed unsigned 16-bit integers in `a` /// to packed 64-bit integers -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmovzxwq))] pub unsafe fn _mm_cvtepu16_epi64(a: __m128i) -> __m128i { @@ -399,7 +399,7 @@ pub unsafe fn _mm_cvtepu16_epi64(a: __m128i) -> __m128i { /// Zero extend packed unsigned 32-bit integers in `a` /// to packed 64-bit integers -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmovzxdq))] pub unsafe fn _mm_cvtepu32_epi64(a: __m128i) -> __m128i { @@ -415,7 +415,7 @@ pub unsafe fn _mm_cvtepu32_epi64(a: __m128i) -> __m128i { /// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of /// the dot product will be stored in the return value component. Otherwise if /// the broadcast mask bit is zero then the return component will be zero. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(dppd, imm8 = 0))] pub unsafe fn _mm_dp_pd(a: __m128d, b: __m128d, imm8: i32) -> __m128d { @@ -432,7 +432,7 @@ pub unsafe fn _mm_dp_pd(a: __m128d, b: __m128d, imm8: i32) -> __m128d { /// replaced by a value of `0.0`. If a broadcast mask bit is one, the result of /// the dot product will be stored in the return value component. Otherwise if /// the broadcast mask bit is zero then the return component will be zero. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(dpps, imm8 = 0))] pub unsafe fn _mm_dp_ps(a: __m128, b: __m128, imm8: i32) -> __m128 { @@ -445,7 +445,7 @@ pub unsafe fn _mm_dp_ps(a: __m128, b: __m128, imm8: i32) -> __m128 { /// Round the packed double-precision (64-bit) floating-point elements in `a` /// down to an integer value, and store the results as packed double-precision /// floating-point elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundpd))] pub unsafe fn _mm_floor_pd(a: __m128d) -> __m128d { @@ -455,7 +455,7 @@ pub unsafe fn _mm_floor_pd(a: __m128d) -> __m128d { /// Round the packed single-precision (32-bit) floating-point elements in `a` /// down to an integer value, and store the results as packed single-precision /// floating-point elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundps))] pub unsafe fn _mm_floor_ps(a: __m128) -> __m128 { @@ -467,7 +467,7 @@ pub unsafe fn _mm_floor_ps(a: __m128) -> __m128 { /// floating-point element in the lower element of the intrinsic result, /// and copy the upper element from `a` to the upper element of the intrinsic /// result. 
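The two `imm8` masks described above for the dot-product intrinsics (condition bits in `[7:4]`, broadcast bits in `[3:0]`) can be modelled per lane in scalar code. A sketch under that reading of the docs, with an invented helper name:

```rust
// Scalar model of the `dpps` mask handling: bits [7:4] of `imm8` gate which
// products enter the sum, bits [3:0] gate which output lanes receive it.
fn dp_ps_scalar(a: [f32; 4], b: [f32; 4], imm8: u8) -> [f32; 4] {
    let mut sum = 0.0f32;
    for i in 0..4 {
        if ((imm8 >> (4 + i)) & 1) == 1 {
            sum += a[i] * b[i];
        }
    }
    let mut r = [0.0f32; 4];
    for i in 0..4 {
        if ((imm8 >> i) & 1) == 1 {
            r[i] = sum;
        }
    }
    r
}

fn main() {
    let a = [1.0, 2.0, 3.0, 4.0];
    let b = [1.0, 1.0, 1.0, 1.0];
    // Accumulate all four products, broadcast the sum to lane 0 only.
    assert_eq!(dp_ps_scalar(a, b, 0b1111_0001), [10.0, 0.0, 0.0, 0.0]);
}
```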
-#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundsd))] pub unsafe fn _mm_floor_sd(a: __m128d, b: __m128d) -> __m128d { @@ -479,7 +479,7 @@ pub unsafe fn _mm_floor_sd(a: __m128d, b: __m128d) -> __m128d { /// floating-point element in the lower element of the intrinsic result, /// and copy the upper 3 packed elements from `a` to the upper elements /// of the intrinsic result. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundss))] pub unsafe fn _mm_floor_ss(a: __m128, b: __m128) -> __m128 { @@ -489,7 +489,7 @@ pub unsafe fn _mm_floor_ss(a: __m128, b: __m128) -> __m128 { /// Round the packed double-precision (64-bit) floating-point elements in `a` /// up to an integer value, and store the results as packed double-precision /// floating-point elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundpd))] pub unsafe fn _mm_ceil_pd(a: __m128d) -> __m128d { @@ -499,7 +499,7 @@ pub unsafe fn _mm_ceil_pd(a: __m128d) -> __m128d { /// Round the packed single-precision (32-bit) floating-point elements in `a` /// up to an integer value, and store the results as packed single-precision /// floating-point elements. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundps))] pub unsafe fn _mm_ceil_ps(a: __m128) -> __m128 { @@ -511,7 +511,7 @@ pub unsafe fn _mm_ceil_ps(a: __m128) -> __m128 { /// floating-point element in the lower element of the intrisic result, /// and copy the upper element from `a` to the upper element /// of the intrinsic result. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundsd))] pub unsafe fn _mm_ceil_sd(a: __m128d, b: __m128d) -> __m128d { @@ -523,7 +523,7 @@ pub unsafe fn _mm_ceil_sd(a: __m128d, b: __m128d) -> __m128d { /// floating-point element in the lower element of the intrinsic result, /// and copy the upper 3 packed elements from `a` to the upper elements /// of the intrinsic result. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundss))] pub unsafe fn _mm_ceil_ss(a: __m128, b: __m128) -> __m128 { @@ -549,7 +549,7 @@ pub unsafe fn _mm_ceil_ss(a: __m128, b: __m128) -> __m128 { /// // use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE`: /// vendor::_MM_FROUND_CUR_DIRECTION; /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundpd, rounding = 0))] pub unsafe fn _mm_round_pd(a: __m128d, rounding: i32) -> __m128d { @@ -578,7 +578,7 @@ pub unsafe fn _mm_round_pd(a: __m128d, rounding: i32) -> __m128d { /// // use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE`: /// vendor::_MM_FROUND_CUR_DIRECTION; /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundps, rounding = 0))] pub unsafe fn _mm_round_ps(a: __m128, rounding: i32) -> __m128 { @@ -609,7 +609,7 @@ pub unsafe fn _mm_round_ps(a: __m128, rounding: i32) -> __m128 { /// // use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE`: /// vendor::_MM_FROUND_CUR_DIRECTION; /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundsd, rounding = 0))] pub unsafe fn _mm_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { @@ -640,7 +640,7 @@ pub unsafe fn _mm_round_sd(a: __m128d, b: __m128d, rounding: i32) -> __m128d { /// // use MXCSR.RC; see `vendor::_MM_SET_ROUNDING_MODE`: /// vendor::_MM_FROUND_CUR_DIRECTION; /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(roundss, rounding = 0))] pub unsafe fn _mm_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { @@ -669,7 +669,7 @@ pub unsafe fn _mm_round_ss(a: __m128, b: __m128, rounding: i32) -> __m128 { /// * bits `[15:0]` - contain the minimum value found in parameter `a`, /// * bits `[18:16]` - contain the index of the minimum value /// * remaining bits are set to `0`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(phminposuw))] pub unsafe fn _mm_minpos_epu16(a: __m128i) -> __m128i { @@ -678,7 +678,7 @@ pub unsafe fn _mm_minpos_epu16(a: __m128i) -> __m128i { /// Multiply the low 32-bit integers from each packed 64-bit /// element in `a` and `b`, and return the signed 64-bit result. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmuldq))] pub unsafe fn _mm_mul_epi32(a: __m128i, b: __m128i) -> __m128i { @@ -691,7 +691,7 @@ pub unsafe fn _mm_mul_epi32(a: __m128i, b: __m128i) -> __m128i { /// __m128i::splat(2)` returns the obvious `__m128i::splat(4)`, due to wrapping /// arithmetic `pmulld __m128i::splat(i32::MAX), __m128i::splat(2)` would return a /// negative number. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pmulld))] pub unsafe fn _mm_mullo_epi32(a: __m128i, b: __m128i) -> __m128i { @@ -729,7 +729,7 @@ pub unsafe fn _mm_mullo_epi32(a: __m128i, b: __m128i) -> __m128i { /// /// * A `__m128i` vector containing the sums of the sets of /// absolute differences between both operands. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(mpsadbw, imm8 = 0))] pub unsafe fn _mm_mpsadbw_epu8(a: __m128i, b: __m128i, imm8: i32) -> __m128i { diff --git a/coresimd/src/x86/i586/sse42.rs b/coresimd/src/x86/i586/sse42.rs index f358426d31..f850306d29 100644 --- a/coresimd/src/x86/i586/sse42.rs +++ b/coresimd/src/x86/i586/sse42.rs @@ -48,7 +48,7 @@ pub const _SIDD_UNIT_MASK: i32 = 0b0100_0000; /// Compare packed strings with implicit lengths in `a` and `b` using the /// control in `imm8`, and return the generated mask. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpistrm, imm8 = 0))] pub unsafe fn _mm_cmpistrm(a: __m128i, b: __m128i, imm8: i32) -> __m128i { @@ -258,7 +258,7 @@ pub unsafe fn _mm_cmpistrm(a: __m128i, b: __m128i, imm8: i32) -> __m128i { /// [`_SIDD_LEAST_SIGNIFICANT`]: constant._SIDD_LEAST_SIGNIFICANT.html /// [`_SIDD_MOST_SIGNIFICANT`]: constant._SIDD_MOST_SIGNIFICANT.html /// [`_mm_cmpestri`]: fn._mm_cmpestri.html -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))] pub unsafe fn _mm_cmpistri(a: __m128i, b: __m128i, imm8: i32) -> i32 { @@ -273,7 +273,7 @@ pub unsafe fn _mm_cmpistri(a: __m128i, b: __m128i, imm8: i32) -> i32 { /// Compare packed strings with implicit lengths in `a` and `b` using the /// control in `imm8`, and return `1` if any character in `b` was null. /// and `0` otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))] pub unsafe fn _mm_cmpistrz(a: __m128i, b: __m128i, imm8: i32) -> i32 { @@ -288,7 +288,7 @@ pub unsafe fn _mm_cmpistrz(a: __m128i, b: __m128i, imm8: i32) -> i32 { /// Compare packed strings with implicit lengths in `a` and `b` using the /// control in `imm8`, and return `1` if the resulting mask was non-zero, /// and `0` otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))] pub unsafe fn _mm_cmpistrc(a: __m128i, b: __m128i, imm8: i32) -> i32 { @@ -303,7 +303,7 @@ pub unsafe fn _mm_cmpistrc(a: __m128i, b: __m128i, imm8: i32) -> i32 { /// Compare packed strings with implicit lengths in `a` and `b` using the /// control in `imm8`, and returns `1` if any character in `a` was null, /// and `0` otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))] pub unsafe fn _mm_cmpistrs(a: __m128i, b: __m128i, imm8: i32) -> i32 { @@ -317,7 +317,7 @@ pub unsafe fn _mm_cmpistrs(a: __m128i, b: __m128i, imm8: i32) -> i32 { /// Compare packed strings with implicit lengths in `a` and `b` using the /// control in `imm8`, and return bit `0` of the resulting bit mask. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))] pub unsafe fn _mm_cmpistro(a: __m128i, b: __m128i, imm8: i32) -> i32 { @@ -332,7 +332,7 @@ pub unsafe fn _mm_cmpistro(a: __m128i, b: __m128i, imm8: i32) -> i32 { /// Compare packed strings with implicit lengths in `a` and `b` using the /// control in `imm8`, and return `1` if `b` did not contain a null /// character and the resulting mask was zero, and `0` otherwise. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpistri, imm8 = 0))] pub unsafe fn _mm_cmpistra(a: __m128i, b: __m128i, imm8: i32) -> i32 { @@ -346,7 +346,7 @@ pub unsafe fn _mm_cmpistra(a: __m128i, b: __m128i, imm8: i32) -> i32 { /// Compare packed strings in `a` and `b` with lengths `la` and `lb` /// using the control in `imm8`, and return the generated mask. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpestrm, imm8 = 0))] pub unsafe fn _mm_cmpestrm( @@ -439,7 +439,7 @@ pub unsafe fn _mm_cmpestrm( /// [`_SIDD_LEAST_SIGNIFICANT`]: constant._SIDD_LEAST_SIGNIFICANT.html /// [`_SIDD_MOST_SIGNIFICANT`]: constant._SIDD_MOST_SIGNIFICANT.html /// [`_mm_cmpistri`]: fn._mm_cmpistri.html -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))] pub unsafe fn _mm_cmpestri( @@ -456,7 +456,7 @@ pub unsafe fn _mm_cmpestri( /// Compare packed strings in `a` and `b` with lengths `la` and `lb` /// using the control in `imm8`, and return `1` if any character in /// `b` was null, and `0` otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))] pub unsafe fn _mm_cmpestrz( @@ -473,7 +473,7 @@ pub unsafe fn _mm_cmpestrz( /// Compare packed strings in `a` and `b` with lengths `la` and `lb` /// using the control in `imm8`, and return `1` if the resulting mask /// was non-zero, and `0` otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))] pub unsafe fn _mm_cmpestrc( @@ -490,7 +490,7 @@ pub unsafe fn _mm_cmpestrc( /// Compare packed strings in `a` and `b` with lengths `la` and `lb` /// using the control in `imm8`, and return `1` if any character in /// a was null, and `0` otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))] pub unsafe fn _mm_cmpestrs( @@ -507,7 +507,7 @@ pub unsafe fn _mm_cmpestrs( /// Compare packed strings in `a` and `b` with lengths `la` and `lb` /// using the control in `imm8`, and return bit `0` of the resulting /// bit mask. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))] pub unsafe fn _mm_cmpestro( @@ -525,7 +525,7 @@ pub unsafe fn _mm_cmpestro( /// using the control in `imm8`, and return `1` if `b` did not /// contain a null character and the resulting mask was zero, and `0` /// otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpestri, imm8 = 0))] pub unsafe fn _mm_cmpestra( @@ -541,7 +541,7 @@ pub unsafe fn _mm_cmpestra( /// Starting with the initial value in `crc`, return the accumulated /// CRC32 value for unsigned 8-bit integer `v`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(crc32))] pub unsafe fn _mm_crc32_u8(crc: u32, v: u8) -> u32 { @@ -550,7 +550,7 @@ pub unsafe fn _mm_crc32_u8(crc: u32, v: u8) -> u32 { /// Starting with the initial value in `crc`, return the accumulated /// CRC32 value for unsigned 16-bit integer `v`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(crc32))] pub unsafe fn _mm_crc32_u16(crc: u32, v: u16) -> u32 { @@ -559,7 +559,7 @@ pub unsafe fn _mm_crc32_u16(crc: u32, v: u16) -> u32 { /// Starting with the initial value in `crc`, return the accumulated /// CRC32 value for unsigned 32-bit integer `v`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(crc32))] pub unsafe fn _mm_crc32_u32(crc: u32, v: u32) -> u32 { diff --git a/coresimd/src/x86/i586/ssse3.rs b/coresimd/src/x86/i586/ssse3.rs index f0498ef531..01e461bd79 100644 --- a/coresimd/src/x86/i586/ssse3.rs +++ b/coresimd/src/x86/i586/ssse3.rs @@ -11,7 +11,7 @@ use x86::*; /// Compute the absolute value of packed 8-bit signed integers in `a` and /// return the unsigned results. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(pabsb))] pub unsafe fn _mm_abs_epi8(a: __m128i) -> __m128i { @@ -21,7 +21,7 @@ pub unsafe fn _mm_abs_epi8(a: __m128i) -> __m128i { /// Compute the absolute value of each of the packed 16-bit signed integers in /// `a` and /// return the 16-bit unsigned integer -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(pabsw))] pub unsafe fn _mm_abs_epi16(a: __m128i) -> __m128i { @@ -31,7 +31,7 @@ pub unsafe fn _mm_abs_epi16(a: __m128i) -> __m128i { /// Compute the absolute value of each of the packed 32-bit signed integers in /// `a` and /// return the 32-bit unsigned integer -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(pabsd))] pub unsafe fn _mm_abs_epi32(a: __m128i) -> __m128i { @@ -62,7 +62,7 @@ pub unsafe fn _mm_abs_epi32(a: __m128i) -> __m128i { /// r /// } /// ``` -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(pshufb))] pub unsafe fn _mm_shuffle_epi8(a: __m128i, b: __m128i) -> __m128i { @@ -71,7 +71,7 @@ pub unsafe fn _mm_shuffle_epi8(a: __m128i, b: __m128i) -> __m128i { /// Concatenate 16-byte blocks in `a` and `b` into a 32-byte temporary result, /// shift the result right by `n` bytes, and return the low 16 bytes. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(palignr, n = 15))] pub unsafe fn _mm_alignr_epi8(a: __m128i, b: __m128i, n: i32) -> __m128i { @@ -129,7 +129,7 @@ pub unsafe fn _mm_alignr_epi8(a: __m128i, b: __m128i, n: i32) -> __m128i { /// Horizontally add the adjacent pairs of values contained in 2 packed /// 128-bit vectors of [8 x i16]. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(phaddw))] pub unsafe fn _mm_hadd_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -139,7 +139,7 @@ pub unsafe fn _mm_hadd_epi16(a: __m128i, b: __m128i) -> __m128i { /// Horizontally add the adjacent pairs of values contained in 2 packed /// 128-bit vectors of [8 x i16]. Positive sums greater than 7FFFh are /// saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(phaddsw))] pub unsafe fn _mm_hadds_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -148,7 +148,7 @@ pub unsafe fn _mm_hadds_epi16(a: __m128i, b: __m128i) -> __m128i { /// Horizontally add the adjacent pairs of values contained in 2 packed /// 128-bit vectors of [4 x i32]. 
-#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(phaddd))] pub unsafe fn _mm_hadd_epi32(a: __m128i, b: __m128i) -> __m128i { @@ -157,7 +157,7 @@ pub unsafe fn _mm_hadd_epi32(a: __m128i, b: __m128i) -> __m128i { /// Horizontally subtract the adjacent pairs of values contained in 2 /// packed 128-bit vectors of [8 x i16]. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(phsubw))] pub unsafe fn _mm_hsub_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -168,7 +168,7 @@ pub unsafe fn _mm_hsub_epi16(a: __m128i, b: __m128i) -> __m128i { /// packed 128-bit vectors of [8 x i16]. Positive differences greater than /// 7FFFh are saturated to 7FFFh. Negative differences less than 8000h are /// saturated to 8000h. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(phsubsw))] pub unsafe fn _mm_hsubs_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -177,7 +177,7 @@ pub unsafe fn _mm_hsubs_epi16(a: __m128i, b: __m128i) -> __m128i { /// Horizontally subtract the adjacent pairs of values contained in 2 /// packed 128-bit vectors of [4 x i32]. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(phsubd))] pub unsafe fn _mm_hsub_epi32(a: __m128i, b: __m128i) -> __m128i { @@ -189,7 +189,7 @@ pub unsafe fn _mm_hsub_epi32(a: __m128i, b: __m128i) -> __m128i { /// integer values contained in the second source operand, add pairs of /// contiguous products with signed saturation, and writes the 16-bit sums to /// the corresponding bits in the destination. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(pmaddubsw))] pub unsafe fn _mm_maddubs_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -199,7 +199,7 @@ pub unsafe fn _mm_maddubs_epi16(a: __m128i, b: __m128i) -> __m128i { /// Multiply packed 16-bit signed integer values, truncate the 32-bit /// product to the 18 most significant bits by right-shifting, round the /// truncated value by adding 1, and write bits [16:1] to the destination. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(pmulhrsw))] pub unsafe fn _mm_mulhrs_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -210,7 +210,7 @@ pub unsafe fn _mm_mulhrs_epi16(a: __m128i, b: __m128i) -> __m128i { /// integer in `b` is negative, and return the result. /// Elements in result are zeroed out when the corresponding element in `b` /// is zero. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(psignb))] pub unsafe fn _mm_sign_epi8(a: __m128i, b: __m128i) -> __m128i { @@ -221,7 +221,7 @@ pub unsafe fn _mm_sign_epi8(a: __m128i, b: __m128i) -> __m128i { /// integer in `b` is negative, and return the results. /// Elements in result are zeroed out when the corresponding element in `b` /// is zero. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(psignw))] pub unsafe fn _mm_sign_epi16(a: __m128i, b: __m128i) -> __m128i { @@ -232,7 +232,7 @@ pub unsafe fn _mm_sign_epi16(a: __m128i, b: __m128i) -> __m128i { /// integer in `b` is negative, and return the results. /// Element in result are zeroed out when the corresponding element in `b` /// is zero. 
-#[inline(always)] +#[inline] #[target_feature(enable = "ssse3")] #[cfg_attr(test, assert_instr(psignd))] pub unsafe fn _mm_sign_epi32(a: __m128i, b: __m128i) -> __m128i { diff --git a/coresimd/src/x86/i586/tbm.rs b/coresimd/src/x86/i586/tbm.rs index 30019673a2..1a9b48ca29 100644 --- a/coresimd/src/x86/i586/tbm.rs +++ b/coresimd/src/x86/i586/tbm.rs @@ -27,7 +27,7 @@ extern "C" { /// Extracts bits in range [`start`, `start` + `length`) from `a` into /// the least significant bits of the result. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] pub fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 { _bextr2_u32(a, (start & 0xffu32) | ((len & 0xffu32) << 8u32)) @@ -35,7 +35,7 @@ pub fn _bextr_u32(a: u32, start: u32, len: u32) -> u32 { /// Extracts bits in range [`start`, `start` + `length`) from `a` into /// the least significant bits of the result. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] pub fn _bextr_u64(a: u64, start: u64, len: u64) -> u64 { _bextr2_u64(a, (start & 0xffu64) | ((len & 0xffu64) << 8u64)) @@ -46,7 +46,7 @@ pub fn _bextr_u64(a: u64, start: u64, len: u64) -> u64 { /// /// Bits [7,0] of `control` specify the index to the first bit in the range to /// be extracted, and bits [15,8] specify the length of the range. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] pub fn _bextr2_u32(a: u32, control: u32) -> u32 { unsafe { x86_tbm_bextri_u32(a, control) } @@ -57,7 +57,7 @@ pub fn _bextr2_u32(a: u32, control: u32) -> u32 { /// /// Bits [7,0] of `control` specify the index to the first bit in the range to /// be extracted, and bits [15,8] specify the length of the range. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] pub fn _bextr2_u64(a: u64, control: u64) -> u64 { unsafe { x86_tbm_bextri_u64(a, control) } @@ -67,7 +67,7 @@ pub fn _bextr2_u64(a: u64, control: u64) -> u64 { /// Clears all bits below the least significant zero bit of `x`. /// /// If there is no zero bit in `x`, it returns zero. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blcfill))] pub unsafe fn _blcfill_u32(x: u32) -> u32 { @@ -77,7 +77,7 @@ pub unsafe fn _blcfill_u32(x: u32) -> u32 { /// Clears all bits below the least significant zero bit of `x`. /// /// If there is no zero bit in `x`, it returns zero. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blcfill))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions @@ -88,7 +88,7 @@ pub unsafe fn _blcfill_u64(x: u64) -> u64 { /// Sets all bits of `x` to 1 except for the least significant zero bit. /// /// If there is no zero bit in `x`, it sets all bits. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blci))] pub unsafe fn _blci_u32(x: u32) -> u32 { @@ -98,7 +98,7 @@ pub unsafe fn _blci_u32(x: u32) -> u32 { /// Sets all bits of `x` to 1 except for the least significant zero bit. /// /// If there is no zero bit in `x`, it sets all bits. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blci))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions @@ -109,7 +109,7 @@ pub unsafe fn _blci_u64(x: u64) -> u64 { /// Sets the least significant zero bit of `x` and clears all other bits. /// /// If there is no zero bit in `x`, it returns zero. 
-#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blcic))] pub unsafe fn _blcic_u32(x: u32) -> u32 { @@ -119,7 +119,7 @@ pub unsafe fn _blcic_u32(x: u32) -> u32 { /// Sets the least significant zero bit of `x` and clears all other bits. /// /// If there is no zero bit in `x`, it returns zero. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blcic))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions @@ -131,7 +131,7 @@ pub unsafe fn _blcic_u64(x: u64) -> u64 { /// that bit. /// /// If there is no zero bit in `x`, it sets all the bits. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blcmsk))] pub unsafe fn _blcmsk_u32(x: u32) -> u32 { @@ -142,7 +142,7 @@ pub unsafe fn _blcmsk_u32(x: u32) -> u32 { /// that bit. /// /// If there is no zero bit in `x`, it sets all the bits. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blcmsk))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions @@ -153,7 +153,7 @@ pub unsafe fn _blcmsk_u64(x: u64) -> u64 { /// Sets the least significant zero bit of `x`. /// /// If there is no zero bit in `x`, it returns `x`. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blcs))] pub unsafe fn _blcs_u32(x: u32) -> u32 { @@ -163,7 +163,7 @@ pub unsafe fn _blcs_u32(x: u32) -> u32 { /// Sets the least significant zero bit of `x`. /// /// If there is no zero bit in `x`, it returns `x`. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blcs))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions @@ -174,7 +174,7 @@ pub unsafe fn _blcs_u64(x: u64) -> u64 { /// Sets all bits of `x` below the least significant one. /// /// If there is no set bit in `x`, it sets all the bits. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blsfill))] pub unsafe fn _blsfill_u32(x: u32) -> u32 { @@ -184,7 +184,7 @@ pub unsafe fn _blsfill_u32(x: u32) -> u32 { /// Sets all bits of `x` below the least significant one. /// /// If there is no set bit in `x`, it sets all the bits. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blsfill))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions @@ -195,7 +195,7 @@ pub unsafe fn _blsfill_u64(x: u64) -> u64 { /// Clears least significant bit and sets all other bits. /// /// If there is no set bit in `x`, it sets all the bits. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blsic))] pub unsafe fn _blsic_u32(x: u32) -> u32 { @@ -205,7 +205,7 @@ pub unsafe fn _blsic_u32(x: u32) -> u32 { /// Clears least significant bit and sets all other bits. /// /// If there is no set bit in `x`, it sets all the bits. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(blsic))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions @@ -217,7 +217,7 @@ pub unsafe fn _blsic_u64(x: u64) -> u64 { /// bits. /// /// If the least significant bit of `x` is 0, it sets all bits. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(t1mskc))] pub unsafe fn _t1mskc_u32(x: u32) -> u32 { @@ -228,7 +228,7 @@ pub unsafe fn _t1mskc_u32(x: u32) -> u32 { /// bits. /// /// If the least significant bit of `x` is 0, it sets all bits. 
-#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(t1mskc))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions @@ -240,7 +240,7 @@ pub unsafe fn _t1mskc_u64(x: u64) -> u64 { /// bits. /// /// If the least significant bit of `x` is 1, it returns zero. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(tzmsk))] pub unsafe fn _tzmsk_u32(x: u32) -> u32 { @@ -251,7 +251,7 @@ pub unsafe fn _tzmsk_u32(x: u32) -> u32 { /// bits. /// /// If the least significant bit of `x` is 1, it returns zero. -#[inline(always)] +#[inline] #[target_feature(enable = "tbm")] #[cfg_attr(test, assert_instr(tzmsk))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions diff --git a/coresimd/src/x86/i586/xsave.rs b/coresimd/src/x86/i586/xsave.rs index 9a7611a82e..ead6cd09f7 100644 --- a/coresimd/src/x86/i586/xsave.rs +++ b/coresimd/src/x86/i586/xsave.rs @@ -33,7 +33,7 @@ extern "C" { /// /// The format of the XSAVE area is detailed in Section 13.4, “XSAVE Area,” of /// Intel® 64 and IA-32 Architectures Software Developer’s Manual, Volume 1. -#[inline(always)] +#[inline] #[target_feature(enable = "xsave")] #[cfg_attr(test, assert_instr(xsave))] pub unsafe fn _xsave(mem_addr: *mut u8, save_mask: u64) { @@ -46,7 +46,7 @@ pub unsafe fn _xsave(mem_addr: *mut u8, save_mask: u64) { /// State is restored based on bits [62:0] in `rs_mask`, `XCR0`, and /// `mem_addr.HEADER.XSTATE_BV`. `mem_addr` must be aligned on a 64-byte /// boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "xsave")] #[cfg_attr(test, assert_instr(xrstor))] pub unsafe fn _xrstor(mem_addr: *const u8, rs_mask: u64) { @@ -62,7 +62,7 @@ const _XCR_XFEATURE_ENABLED_MASK: u32 = 0; /// by `a`. /// /// Currently only `XFEATURE_ENABLED_MASK` `XCR` is supported. -#[inline(always)] +#[inline] #[target_feature(enable = "xsave")] #[cfg_attr(test, assert_instr(xsetbv))] pub unsafe fn _xsetbv(a: u32, val: u64) { @@ -71,7 +71,7 @@ pub unsafe fn _xsetbv(a: u32, val: u64) { /// Reads the contents of the extended control register `XCR` /// specified in `xcr_no`. -#[inline(always)] +#[inline] #[target_feature(enable = "xsave")] #[cfg_attr(test, assert_instr(xgetbv))] pub unsafe fn _xgetbv(xcr_no: u32) -> u64 { @@ -85,7 +85,7 @@ pub unsafe fn _xgetbv(xcr_no: u32) -> u64 { /// `mem_addr` must be aligned on a 64-byte boundary. The hardware may optimize /// the manner in which data is saved. The performance of this instruction will /// be equal to or better than using the `XSAVE` instruction. -#[inline(always)] +#[inline] #[target_feature(enable = "xsave,xsaveopt")] #[cfg_attr(test, assert_instr(xsaveopt))] pub unsafe fn _xsaveopt(mem_addr: *mut u8, save_mask: u64) { @@ -98,7 +98,7 @@ pub unsafe fn _xsaveopt(mem_addr: *mut u8, save_mask: u64) { /// `xsavec` differs from `xsave` in that it uses compaction and that it may /// use init optimization. State is saved based on bits [62:0] in `save_mask` /// and `XCR0`. `mem_addr` must be aligned on a 64-byte boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "xsave,xsavec")] #[cfg_attr(test, assert_instr(xsavec))] pub unsafe fn _xsavec(mem_addr: *mut u8, save_mask: u64) { @@ -112,7 +112,7 @@ pub unsafe fn _xsavec(mem_addr: *mut u8, save_mask: u64) { /// corresponding to bits set in `IA32_XSS` `MSR` and that it may use the /// modified optimization. State is saved based on bits [62:0] in `save_mask` /// and `XCR0`. `mem_addr` must be aligned on a 64-byte boundary. 
-#[inline(always)] +#[inline] #[target_feature(enable = "xsave,xsaves")] #[cfg_attr(test, assert_instr(xsaves))] pub unsafe fn _xsaves(mem_addr: *mut u8, save_mask: u64) { @@ -128,7 +128,7 @@ pub unsafe fn _xsaves(mem_addr: *mut u8, save_mask: u64) { /// State is restored based on bits [62:0] in `rs_mask`, `XCR0`, and /// `mem_addr.HEADER.XSTATE_BV`. `mem_addr` must be aligned on a 64-byte /// boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "xsave,xsaves")] #[cfg_attr(test, assert_instr(xrstors))] pub unsafe fn _xrstors(mem_addr: *const u8, rs_mask: u64) { diff --git a/coresimd/src/x86/i686/mmx.rs b/coresimd/src/x86/i686/mmx.rs index e013fb3dc6..c5d69bcec0 100644 --- a/coresimd/src/x86/i686/mmx.rs +++ b/coresimd/src/x86/i686/mmx.rs @@ -16,7 +16,7 @@ use core::mem; use stdsimd_test::assert_instr; /// Constructs a 64-bit integer vector initialized to zero. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] // FIXME: this produces a movl instead of xorps on x86 // FIXME: this produces a xor intrinsic instead of xorps on x86_64 @@ -26,7 +26,7 @@ pub unsafe fn _mm_setzero_si64() -> __m64 { } /// Add packed 8-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(paddb))] pub unsafe fn _mm_add_pi8(a: __m64, b: __m64) -> __m64 { @@ -34,7 +34,7 @@ pub unsafe fn _mm_add_pi8(a: __m64, b: __m64) -> __m64 { } /// Add packed 8-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(paddb))] pub unsafe fn _m_paddb(a: __m64, b: __m64) -> __m64 { @@ -42,7 +42,7 @@ pub unsafe fn _m_paddb(a: __m64, b: __m64) -> __m64 { } /// Add packed 16-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(paddw))] pub unsafe fn _mm_add_pi16(a: __m64, b: __m64) -> __m64 { @@ -50,7 +50,7 @@ pub unsafe fn _mm_add_pi16(a: __m64, b: __m64) -> __m64 { } /// Add packed 16-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(paddw))] pub unsafe fn _m_paddw(a: __m64, b: __m64) -> __m64 { @@ -58,7 +58,7 @@ pub unsafe fn _m_paddw(a: __m64, b: __m64) -> __m64 { } /// Add packed 32-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(paddd))] pub unsafe fn _mm_add_pi32(a: __m64, b: __m64) -> __m64 { @@ -66,7 +66,7 @@ pub unsafe fn _mm_add_pi32(a: __m64, b: __m64) -> __m64 { } /// Add packed 32-bit integers in `a` and `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(paddd))] pub unsafe fn _m_paddd(a: __m64, b: __m64) -> __m64 { @@ -74,7 +74,7 @@ pub unsafe fn _m_paddd(a: __m64, b: __m64) -> __m64 { } /// Add packed 8-bit integers in `a` and `b` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(paddsb))] pub unsafe fn _mm_adds_pi8(a: __m64, b: __m64) -> __m64 { @@ -82,7 +82,7 @@ pub unsafe fn _mm_adds_pi8(a: __m64, b: __m64) -> __m64 { } /// Add packed 8-bit integers in `a` and `b` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(paddsb))] pub unsafe fn _m_paddsb(a: __m64, b: __m64) -> __m64 { @@ -90,7 +90,7 @@ pub unsafe fn _m_paddsb(a: __m64, b: __m64) -> __m64 { } /// Add packed 16-bit integers in `a` and `b` using saturation. 
-#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(paddsw))] pub unsafe fn _mm_adds_pi16(a: __m64, b: __m64) -> __m64 { @@ -98,7 +98,7 @@ pub unsafe fn _mm_adds_pi16(a: __m64, b: __m64) -> __m64 { } /// Add packed 16-bit integers in `a` and `b` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(paddsw))] pub unsafe fn _m_paddsw(a: __m64, b: __m64) -> __m64 { @@ -106,7 +106,7 @@ pub unsafe fn _m_paddsw(a: __m64, b: __m64) -> __m64 { } /// Add packed unsigned 8-bit integers in `a` and `b` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(paddusb))] pub unsafe fn _mm_adds_pu8(a: __m64, b: __m64) -> __m64 { @@ -114,7 +114,7 @@ pub unsafe fn _mm_adds_pu8(a: __m64, b: __m64) -> __m64 { } /// Add packed unsigned 8-bit integers in `a` and `b` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(paddusb))] pub unsafe fn _m_paddusb(a: __m64, b: __m64) -> __m64 { @@ -122,7 +122,7 @@ pub unsafe fn _m_paddusb(a: __m64, b: __m64) -> __m64 { } /// Add packed unsigned 16-bit integers in `a` and `b` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(paddusw))] pub unsafe fn _mm_adds_pu16(a: __m64, b: __m64) -> __m64 { @@ -130,7 +130,7 @@ pub unsafe fn _mm_adds_pu16(a: __m64, b: __m64) -> __m64 { } /// Add packed unsigned 16-bit integers in `a` and `b` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(paddusw))] pub unsafe fn _m_paddusw(a: __m64, b: __m64) -> __m64 { @@ -138,7 +138,7 @@ pub unsafe fn _m_paddusw(a: __m64, b: __m64) -> __m64 { } /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(psubb))] pub unsafe fn _mm_sub_pi8(a: __m64, b: __m64) -> __m64 { @@ -146,7 +146,7 @@ pub unsafe fn _mm_sub_pi8(a: __m64, b: __m64) -> __m64 { } /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(psubb))] pub unsafe fn _m_psubb(a: __m64, b: __m64) -> __m64 { @@ -154,7 +154,7 @@ pub unsafe fn _m_psubb(a: __m64, b: __m64) -> __m64 { } /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(psubw))] pub unsafe fn _mm_sub_pi16(a: __m64, b: __m64) -> __m64 { @@ -162,7 +162,7 @@ pub unsafe fn _mm_sub_pi16(a: __m64, b: __m64) -> __m64 { } /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(psubw))] pub unsafe fn _m_psubw(a: __m64, b: __m64) -> __m64 { @@ -170,7 +170,7 @@ pub unsafe fn _m_psubw(a: __m64, b: __m64) -> __m64 { } /// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(psubd))] pub unsafe fn _mm_sub_pi32(a: __m64, b: __m64) -> __m64 { @@ -178,7 +178,7 @@ pub unsafe fn _mm_sub_pi32(a: __m64, b: __m64) -> __m64 { } /// Subtract packed 32-bit integers in `b` from packed 32-bit integers in `a`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(psubd))] pub unsafe fn _m_psubd(a: __m64, b: __m64) -> __m64 { @@ -187,7 +187,7 @@ pub unsafe fn _m_psubd(a: __m64, b: __m64) -> __m64 { /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` /// using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(psubsb))] pub unsafe fn _mm_subs_pi8(a: __m64, b: __m64) -> __m64 { @@ -196,7 +196,7 @@ pub unsafe fn _mm_subs_pi8(a: __m64, b: __m64) -> __m64 { /// Subtract packed 8-bit integers in `b` from packed 8-bit integers in `a` /// using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(psubsb))] pub unsafe fn _m_psubsb(a: __m64, b: __m64) -> __m64 { @@ -205,7 +205,7 @@ pub unsafe fn _m_psubsb(a: __m64, b: __m64) -> __m64 { /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` /// using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(psubsw))] pub unsafe fn _mm_subs_pi16(a: __m64, b: __m64) -> __m64 { @@ -214,7 +214,7 @@ pub unsafe fn _mm_subs_pi16(a: __m64, b: __m64) -> __m64 { /// Subtract packed 16-bit integers in `b` from packed 16-bit integers in `a` /// using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(psubsw))] pub unsafe fn _m_psubsw(a: __m64, b: __m64) -> __m64 { @@ -223,7 +223,7 @@ pub unsafe fn _m_psubsw(a: __m64, b: __m64) -> __m64 { /// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit /// integers in `a` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(psubusb))] pub unsafe fn _mm_subs_pu8(a: __m64, b: __m64) -> __m64 { @@ -232,7 +232,7 @@ pub unsafe fn _mm_subs_pu8(a: __m64, b: __m64) -> __m64 { /// Subtract packed unsigned 8-bit integers in `b` from packed unsigned 8-bit /// integers in `a` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(psubusb))] pub unsafe fn _m_psubusb(a: __m64, b: __m64) -> __m64 { @@ -241,7 +241,7 @@ pub unsafe fn _m_psubusb(a: __m64, b: __m64) -> __m64 { /// Subtract packed unsigned 16-bit integers in `b` from packed unsigned /// 16-bit integers in `a` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(psubusw))] pub unsafe fn _mm_subs_pu16(a: __m64, b: __m64) -> __m64 { @@ -250,7 +250,7 @@ pub unsafe fn _mm_subs_pu16(a: __m64, b: __m64) -> __m64 { /// Subtract packed unsigned 16-bit integers in `b` from packed unsigned /// 16-bit integers in `a` using saturation. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(psubusw))] pub unsafe fn _m_psubusw(a: __m64, b: __m64) -> __m64 { @@ -262,7 +262,7 @@ pub unsafe fn _m_psubusw(a: __m64, b: __m64) -> __m64 { /// /// Positive values greater than 0x7F are saturated to 0x7F. Negative values /// less than 0x80 are saturated to 0x80. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(packsswb))] pub unsafe fn _mm_packs_pi16(a: __m64, b: __m64) -> __m64 { @@ -274,7 +274,7 @@ pub unsafe fn _mm_packs_pi16(a: __m64, b: __m64) -> __m64 { /// /// Positive values greater than 0x7F are saturated to 0x7F. Negative values /// less than 0x80 are saturated to 0x80. 
-#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(packssdw))] pub unsafe fn _mm_packs_pi32(a: __m64, b: __m64) -> __m64 { @@ -283,7 +283,7 @@ pub unsafe fn _mm_packs_pi32(a: __m64, b: __m64) -> __m64 { /// Compares whether each element of `a` is greater than the corresponding /// element of `b` returning `0` for `false` and `-1` for `true`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(pcmpgtb))] pub unsafe fn _mm_cmpgt_pi8(a: __m64, b: __m64) -> __m64 { @@ -292,7 +292,7 @@ pub unsafe fn _mm_cmpgt_pi8(a: __m64, b: __m64) -> __m64 { /// Compares whether each element of `a` is greater than the corresponding /// element of `b` returning `0` for `false` and `-1` for `true`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(pcmpgtw))] pub unsafe fn _mm_cmpgt_pi16(a: __m64, b: __m64) -> __m64 { @@ -301,7 +301,7 @@ pub unsafe fn _mm_cmpgt_pi16(a: __m64, b: __m64) -> __m64 { /// Compares whether each element of `a` is greater than the corresponding /// element of `b` returning `0` for `false` and `-1` for `true`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(pcmpgtd))] pub unsafe fn _mm_cmpgt_pi32(a: __m64, b: __m64) -> __m64 { @@ -310,7 +310,7 @@ pub unsafe fn _mm_cmpgt_pi32(a: __m64, b: __m64) -> __m64 { /// Unpacks the upper two elements from two `i16x4` vectors and interleaves /// them into the result: `[a.2, b.2, a.3, b.3]`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(punpckhwd))] // FIXME punpcklbw expected pub unsafe fn _mm_unpackhi_pi16(a: __m64, b: __m64) -> __m64 { @@ -319,7 +319,7 @@ pub unsafe fn _mm_unpackhi_pi16(a: __m64, b: __m64) -> __m64 { /// Unpacks the upper four elements from two `i8x8` vectors and interleaves /// them into the result: `[a.4, b.4, a.5, b.5, a.6, b.6, a.7, b.7]`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(punpckhbw))] pub unsafe fn _mm_unpackhi_pi8(a: __m64, b: __m64) -> __m64 { @@ -328,7 +328,7 @@ pub unsafe fn _mm_unpackhi_pi8(a: __m64, b: __m64) -> __m64 { /// Unpacks the lower four elements from two `i8x8` vectors and interleaves /// them into the result: `[a.0, b.0, a.1, b.1, a.2, b.2, a.3, b.3]`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(punpcklbw))] pub unsafe fn _mm_unpacklo_pi8(a: __m64, b: __m64) -> __m64 { @@ -337,7 +337,7 @@ pub unsafe fn _mm_unpacklo_pi8(a: __m64, b: __m64) -> __m64 { /// Unpacks the lower two elements from two `i16x4` vectors and interleaves /// them into the result: `[a.0 b.0 a.1 b.1]`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(punpcklwd))] pub unsafe fn _mm_unpacklo_pi16(a: __m64, b: __m64) -> __m64 { @@ -346,7 +346,7 @@ pub unsafe fn _mm_unpacklo_pi16(a: __m64, b: __m64) -> __m64 { /// Unpacks the upper element from two `i32x2` vectors and interleaves them /// into the result: `[a.1, b.1]`. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(punpckhdq))] pub unsafe fn _mm_unpackhi_pi32(a: __m64, b: __m64) -> __m64 { @@ -355,7 +355,7 @@ pub unsafe fn _mm_unpackhi_pi32(a: __m64, b: __m64) -> __m64 { /// Unpacks the lower element from two `i32x2` vectors and interleaves them /// into the result: `[a.0, b.0]`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] #[cfg_attr(test, assert_instr(punpckldq))] pub unsafe fn _mm_unpacklo_pi32(a: __m64, b: __m64) -> __m64 { @@ -363,21 +363,21 @@ pub unsafe fn _mm_unpacklo_pi32(a: __m64, b: __m64) -> __m64 { } /// Set packed 16-bit integers in dst with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] pub unsafe fn _mm_set_pi16(e3: i16, e2: i16, e1: i16, e0: i16) -> __m64 { _mm_setr_pi16(e0, e1, e2, e3) } /// Set packed 32-bit integers in dst with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] pub unsafe fn _mm_set_pi32(e1: i32, e0: i32) -> __m64 { _mm_setr_pi32(e0, e1) } /// Set packed 8-bit integers in dst with the supplied values. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] pub unsafe fn _mm_set_pi8( e7: i8, e6: i8, e5: i8, e4: i8, e3: i8, e2: i8, e1: i8, e0: i8 @@ -386,21 +386,21 @@ pub unsafe fn _mm_set_pi8( } /// Broadcast 16-bit integer a to all all elements of dst. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] pub unsafe fn _mm_set1_pi16(a: i16) -> __m64 { _mm_setr_pi16(a, a, a, a) } /// Broadcast 32-bit integer a to all all elements of dst. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] pub unsafe fn _mm_set1_pi32(a: i32) -> __m64 { _mm_setr_pi32(a, a) } /// Broadcast 8-bit integer a to all all elements of dst. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] pub unsafe fn _mm_set1_pi8(a: i8) -> __m64 { _mm_setr_pi8(a, a, a, a, a, a, a, a) @@ -408,7 +408,7 @@ pub unsafe fn _mm_set1_pi8(a: i8) -> __m64 { /// Set packed 16-bit integers in dst with the supplied values in reverse /// order. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] pub unsafe fn _mm_setr_pi16(e0: i16, e1: i16, e2: i16, e3: i16) -> __m64 { mem::transmute(i16x4::new(e0, e1, e2, e3)) @@ -416,14 +416,14 @@ pub unsafe fn _mm_setr_pi16(e0: i16, e1: i16, e2: i16, e3: i16) -> __m64 { /// Set packed 32-bit integers in dst with the supplied values in reverse /// order. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] pub unsafe fn _mm_setr_pi32(e0: i32, e1: i32) -> __m64 { mem::transmute(i32x2::new(e0, e1)) } /// Set packed 8-bit integers in dst with the supplied values in reverse order. -#[inline(always)] +#[inline] #[target_feature(enable = "mmx")] pub unsafe fn _mm_setr_pi8( e0: i8, e1: i8, e2: i8, e3: i8, e4: i8, e5: i8, e6: i8, e7: i8 diff --git a/coresimd/src/x86/i686/sse2.rs b/coresimd/src/x86/i686/sse2.rs index f82c4372a9..ba34838c03 100644 --- a/coresimd/src/x86/i686/sse2.rs +++ b/coresimd/src/x86/i686/sse2.rs @@ -10,7 +10,7 @@ use stdsimd_test::assert_instr; /// Adds two signed or unsigned 64-bit integer values, returning the /// lower 64 bits of the sum. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2,mmx")] #[cfg_attr(test, assert_instr(paddq))] pub unsafe fn _mm_add_si64(a: __m64, b: __m64) -> __m64 { @@ -20,7 +20,7 @@ pub unsafe fn _mm_add_si64(a: __m64, b: __m64) -> __m64 { /// Multiplies 32-bit unsigned integer values contained in the lower bits /// of the two 64-bit integer vectors and returns the 64-bit unsigned /// product. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2,mmx")] #[cfg_attr(test, assert_instr(pmuludq))] pub unsafe fn _mm_mul_su32(a: __m64, b: __m64) -> __m64 { @@ -29,7 +29,7 @@ pub unsafe fn _mm_mul_su32(a: __m64, b: __m64) -> __m64 { /// Subtracts signed or unsigned 64-bit integer values and writes the /// difference to the corresponding bits in the destination. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2,mmx")] #[cfg_attr(test, assert_instr(psubq))] pub unsafe fn _mm_sub_si64(a: __m64, b: __m64) -> __m64 { @@ -39,7 +39,7 @@ pub unsafe fn _mm_sub_si64(a: __m64, b: __m64) -> __m64 { /// Converts the two signed 32-bit integer elements of a 64-bit vector of /// [2 x i32] into two double-precision floating-point values, returned in a /// 128-bit vector of [2 x double]. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2,mmx")] #[cfg_attr(test, assert_instr(cvtpi2pd))] pub unsafe fn _mm_cvtpi32_pd(a: __m64) -> __m128d { @@ -48,7 +48,7 @@ pub unsafe fn _mm_cvtpi32_pd(a: __m64) -> __m128d { /// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with /// the specified 64-bit integer values. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2,mmx")] // no particular instruction to test pub unsafe fn _mm_set_epi64(e1: __m64, e0: __m64) -> __m128i { @@ -57,7 +57,7 @@ pub unsafe fn _mm_set_epi64(e1: __m64, e0: __m64) -> __m128i { /// Initializes both values in a 128-bit vector of [2 x i64] with the /// specified 64-bit value. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2,mmx")] // no particular instruction to test pub unsafe fn _mm_set1_epi64(a: __m64) -> __m128i { @@ -66,7 +66,7 @@ pub unsafe fn _mm_set1_epi64(a: __m64) -> __m128i { /// Constructs a 128-bit integer vector, initialized in reverse order /// with the specified 64-bit integral values. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2,mmx")] // no particular instruction to test pub unsafe fn _mm_setr_epi64(e1: __m64, e0: __m64) -> __m128i { @@ -75,7 +75,7 @@ pub unsafe fn _mm_setr_epi64(e1: __m64, e0: __m64) -> __m128i { /// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit /// integer. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2,mmx")] // #[cfg_attr(test, assert_instr(movdq2q))] // FIXME: llvm codegens wrong // instr? @@ -85,7 +85,7 @@ pub unsafe fn _mm_movepi64_pi64(a: __m128i) -> __m64 { /// Moves the 64-bit operand to a 128-bit integer vector, zeroing the /// upper bits. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2,mmx")] // #[cfg_attr(test, assert_instr(movq2dq))] // FIXME: llvm codegens wrong // instr? @@ -96,7 +96,7 @@ pub unsafe fn _mm_movpi64_epi64(a: __m64) -> __m128i { /// Converts the two double-precision floating-point elements of a /// 128-bit vector of [2 x double] into two signed 32-bit integer values, /// returned in a 64-bit vector of [2 x i32]. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2,mmx")] #[cfg_attr(test, assert_instr(cvtpd2pi))] pub unsafe fn _mm_cvtpd_pi32(a: __m128d) -> __m64 { @@ -108,7 +108,7 @@ pub unsafe fn _mm_cvtpd_pi32(a: __m128d) -> __m64 { /// returned in a 64-bit vector of [2 x i32]. /// If the result of either conversion is inexact, the result is truncated /// (rounded towards zero) regardless of the current MXCSR setting. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2,mmx")] #[cfg_attr(test, assert_instr(cvttpd2pi))] pub unsafe fn _mm_cvttpd_pi32(a: __m128d) -> __m64 { diff --git a/coresimd/src/x86/i686/sse41.rs b/coresimd/src/x86/i686/sse41.rs index a8dd65cfe0..3f35305b18 100644 --- a/coresimd/src/x86/i686/sse41.rs +++ b/coresimd/src/x86/i686/sse41.rs @@ -29,7 +29,7 @@ extern "C" { /// /// * `1` - if the specified bits are all zeros, /// * `0` - otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(ptest))] pub unsafe fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 { @@ -49,7 +49,7 @@ pub unsafe fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32 { /// /// * `1` - if the specified bits are all ones, /// * `0` - otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(ptest))] pub unsafe fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32 { @@ -69,7 +69,7 @@ pub unsafe fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32 { /// /// * `1` - if the specified bits are neither all zeros nor all ones, /// * `0` - otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(ptest))] pub unsafe fn _mm_testnzc_si128(a: __m128i, mask: __m128i) -> i32 { @@ -89,7 +89,7 @@ pub unsafe fn _mm_testnzc_si128(a: __m128i, mask: __m128i) -> i32 { /// /// * `1` - if the specified bits are all zeros, /// * `0` - otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(ptest))] pub unsafe fn _mm_test_all_zeros(a: __m128i, mask: __m128i) -> i32 { @@ -107,7 +107,7 @@ pub unsafe fn _mm_test_all_zeros(a: __m128i, mask: __m128i) -> i32 { /// /// * `1` - if the bits specified in the operand are all set to 1, /// * `0` - otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pcmpeqd))] #[cfg_attr(test, assert_instr(ptest))] @@ -128,7 +128,7 @@ pub unsafe fn _mm_test_all_ones(a: __m128i) -> i32 { /// /// * `1` - if the specified bits are neither all zeros nor all ones, /// * `0` - otherwise. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(ptest))] pub unsafe fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32 { diff --git a/coresimd/src/x86/i686/sse42.rs b/coresimd/src/x86/i686/sse42.rs index 301dd3ea77..f092fe412f 100644 --- a/coresimd/src/x86/i686/sse42.rs +++ b/coresimd/src/x86/i686/sse42.rs @@ -9,7 +9,7 @@ use stdsimd_test::assert_instr; /// Compare packed 64-bit integers in `a` and `b` for greater-than, /// return the results. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(pcmpgtq))] pub unsafe fn _mm_cmpgt_epi64(a: __m128i, b: __m128i) -> __m128i { diff --git a/coresimd/src/x86/i686/sse4a.rs b/coresimd/src/x86/i686/sse4a.rs index f35ffb3e50..5e226322f8 100644 --- a/coresimd/src/x86/i686/sse4a.rs +++ b/coresimd/src/x86/i686/sse4a.rs @@ -33,7 +33,7 @@ extern "C" { /// /// If `length == 0 && index > 0` or `lenght + index > 64` the result is /// undefined. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4a")] #[cfg_attr(test, assert_instr(extrq))] pub unsafe fn _mm_extract_si64(x: __m128i, y: __m128i) -> __m128i { @@ -49,7 +49,7 @@ pub unsafe fn _mm_extract_si64(x: __m128i, y: __m128i) -> __m128i { /// /// If the `length` is zero it is interpreted as `64`. If `index + length > 64` /// or `index > 0 && length == 0` the result is undefined. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse4a")] #[cfg_attr(test, assert_instr(insertq))] pub unsafe fn _mm_insert_si64(x: __m128i, y: __m128i) -> __m128i { @@ -57,7 +57,7 @@ pub unsafe fn _mm_insert_si64(x: __m128i, y: __m128i) -> __m128i { } /// Non-temporal store of `a.0` into `p`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4a")] #[cfg_attr(test, assert_instr(movntsd))] pub unsafe fn _mm_stream_sd(p: *mut f64, a: __m128d) { @@ -65,7 +65,7 @@ pub unsafe fn _mm_stream_sd(p: *mut f64, a: __m128d) { } /// Non-temporal store of `a.0` into `p`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4a")] #[cfg_attr(test, assert_instr(movntss))] pub unsafe fn _mm_stream_ss(p: *mut f32, a: __m128) { diff --git a/coresimd/src/x86/i686/ssse3.rs b/coresimd/src/x86/i686/ssse3.rs index 647074096c..c386d8a0a4 100644 --- a/coresimd/src/x86/i686/ssse3.rs +++ b/coresimd/src/x86/i686/ssse3.rs @@ -7,7 +7,7 @@ use x86::*; /// Compute the absolute value of packed 8-bit integers in `a` and /// return the unsigned results. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(pabsb))] pub unsafe fn _mm_abs_pi8(a: __m64) -> __m64 { @@ -16,7 +16,7 @@ pub unsafe fn _mm_abs_pi8(a: __m64) -> __m64 { /// Compute the absolute value of packed 8-bit integers in `a`, and return the /// unsigned results. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(pabsw))] pub unsafe fn _mm_abs_pi16(a: __m64) -> __m64 { @@ -25,7 +25,7 @@ pub unsafe fn _mm_abs_pi16(a: __m64) -> __m64 { /// Compute the absolute value of packed 32-bit integers in `a`, and return the /// unsigned results. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(pabsd))] pub unsafe fn _mm_abs_pi32(a: __m64) -> __m64 { @@ -34,7 +34,7 @@ pub unsafe fn _mm_abs_pi32(a: __m64) -> __m64 { /// Shuffle packed 8-bit integers in `a` according to shuffle control mask in /// the corresponding 8-bit element of `b`, and return the results -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(pshufb))] pub unsafe fn _mm_shuffle_pi8(a: __m64, b: __m64) -> __m64 { @@ -43,7 +43,7 @@ pub unsafe fn _mm_shuffle_pi8(a: __m64, b: __m64) -> __m64 { /// Concatenates the two 64-bit integer vector operands, and right-shifts /// the result by the number of bytes specified in the immediate operand. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(palignr, n = 15))] pub unsafe fn _mm_alignr_pi8(a: __m64, b: __m64, n: i32) -> __m64 { @@ -57,7 +57,7 @@ pub unsafe fn _mm_alignr_pi8(a: __m64, b: __m64, n: i32) -> __m64 { /// Horizontally add the adjacent pairs of values contained in 2 packed /// 64-bit vectors of [4 x i16]. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(phaddw))] pub unsafe fn _mm_hadd_pi16(a: __m64, b: __m64) -> __m64 { @@ -66,7 +66,7 @@ pub unsafe fn _mm_hadd_pi16(a: __m64, b: __m64) -> __m64 { /// Horizontally add the adjacent pairs of values contained in 2 packed /// 64-bit vectors of [2 x i32]. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(phaddd))] pub unsafe fn _mm_hadd_pi32(a: __m64, b: __m64) -> __m64 { @@ -76,7 +76,7 @@ pub unsafe fn _mm_hadd_pi32(a: __m64, b: __m64) -> __m64 { /// Horizontally add the adjacent pairs of values contained in 2 packed /// 64-bit vectors of [4 x i16]. 
Positive sums greater than 7FFFh are /// saturated to 7FFFh. Negative sums less than 8000h are saturated to 8000h. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(phaddsw))] pub unsafe fn _mm_hadds_pi16(a: __m64, b: __m64) -> __m64 { @@ -85,7 +85,7 @@ pub unsafe fn _mm_hadds_pi16(a: __m64, b: __m64) -> __m64 { /// Horizontally subtracts the adjacent pairs of values contained in 2 /// packed 64-bit vectors of [4 x i16]. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(phsubw))] pub unsafe fn _mm_hsub_pi16(a: __m64, b: __m64) -> __m64 { @@ -94,7 +94,7 @@ pub unsafe fn _mm_hsub_pi16(a: __m64, b: __m64) -> __m64 { /// Horizontally subtracts the adjacent pairs of values contained in 2 /// packed 64-bit vectors of [2 x i32]. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(phsubd))] pub unsafe fn _mm_hsub_pi32(a: __m64, b: __m64) -> __m64 { @@ -105,7 +105,7 @@ pub unsafe fn _mm_hsub_pi32(a: __m64, b: __m64) -> __m64 { /// packed 64-bit vectors of [4 x i16]. Positive differences greater than /// 7FFFh are saturated to 7FFFh. Negative differences less than 8000h are /// saturated to 8000h. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(phsubsw))] pub unsafe fn _mm_hsubs_pi16(a: __m64, b: __m64) -> __m64 { @@ -117,7 +117,7 @@ pub unsafe fn _mm_hsubs_pi16(a: __m64, b: __m64) -> __m64 { /// integer values contained in the second source operand, adds pairs of /// contiguous products with signed saturation, and writes the 16-bit sums to /// the corresponding bits in the destination. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(pmaddubsw))] pub unsafe fn _mm_maddubs_pi16(a: __m64, b: __m64) -> __m64 { @@ -127,7 +127,7 @@ pub unsafe fn _mm_maddubs_pi16(a: __m64, b: __m64) -> __m64 { /// Multiplies packed 16-bit signed integer values, truncates the 32-bit /// products to the 18 most significant bits by right-shifting, rounds the /// truncated value by adding 1, and writes bits [16:1] to the destination. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(pmulhrsw))] pub unsafe fn _mm_mulhrs_pi16(a: __m64, b: __m64) -> __m64 { @@ -138,7 +138,7 @@ pub unsafe fn _mm_mulhrs_pi16(a: __m64, b: __m64) -> __m64 { /// integer in `b` is negative, and return the results. /// Elements in the result are zeroed out when the corresponding element in `b` is /// zero. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(psignb))] pub unsafe fn _mm_sign_pi8(a: __m64, b: __m64) -> __m64 { @@ -149,7 +149,7 @@ pub unsafe fn _mm_sign_pi8(a: __m64, b: __m64) -> __m64 { /// integer in `b` is negative, and return the results. /// Elements in the result are zeroed out when the corresponding element in `b` is /// zero. -#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(psignw))] pub unsafe fn _mm_sign_pi16(a: __m64, b: __m64) -> __m64 { @@ -160,7 +160,7 @@ pub unsafe fn _mm_sign_pi16(a: __m64, b: __m64) -> __m64 { /// integer in `b` is negative, and return the results. /// Elements in the result are zeroed out when the corresponding element in `b` is /// zero.
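// A one-lane scalar sketch of the `psign` behaviour documented above; the
// real intrinsics apply this rule to every element of the two 64-bit vectors,
// and `sign_lane` is only a hypothetical reference helper.
fn sign_lane(a: i16, b: i16) -> i16 {
    if b < 0 {
        a.wrapping_neg() // negate when the control element is negative
    } else if b == 0 {
        0 // zero out when the control element is zero
    } else {
        a // pass through unchanged otherwise
    }
}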
-#[inline(always)] +#[inline] #[target_feature(enable = "ssse3,mmx")] #[cfg_attr(test, assert_instr(psignd))] pub unsafe fn _mm_sign_pi32(a: __m64, b: __m64) -> __m64 { diff --git a/coresimd/src/x86/mod.rs b/coresimd/src/x86/mod.rs index 05e99a9b9c..d1f42f6cc9 100644 --- a/coresimd/src/x86/mod.rs +++ b/coresimd/src/x86/mod.rs @@ -17,7 +17,7 @@ macro_rules! types { pub struct $name($($fields)*); impl Clone for $name { - #[inline(always)] // currently needed for correctness + #[inline] // currently needed for correctness fn clone(&self) -> $name { *self } @@ -307,49 +307,49 @@ pub use self::test::*; trait m128iExt: Sized { fn as_m128i(self) -> __m128i; - #[inline(always)] + #[inline] fn as_u8x16(self) -> ::v128::u8x16 { unsafe { mem::transmute(self.as_m128i()) } } - #[inline(always)] + #[inline] fn as_u16x8(self) -> ::v128::u16x8 { unsafe { mem::transmute(self.as_m128i()) } } - #[inline(always)] + #[inline] fn as_u32x4(self) -> ::v128::u32x4 { unsafe { mem::transmute(self.as_m128i()) } } - #[inline(always)] + #[inline] fn as_u64x2(self) -> ::v128::u64x2 { unsafe { mem::transmute(self.as_m128i()) } } - #[inline(always)] + #[inline] fn as_i8x16(self) -> ::v128::i8x16 { unsafe { mem::transmute(self.as_m128i()) } } - #[inline(always)] + #[inline] fn as_i16x8(self) -> ::v128::i16x8 { unsafe { mem::transmute(self.as_m128i()) } } - #[inline(always)] + #[inline] fn as_i32x4(self) -> ::v128::i32x4 { unsafe { mem::transmute(self.as_m128i()) } } - #[inline(always)] + #[inline] fn as_i64x2(self) -> ::v128::i64x2 { unsafe { mem::transmute(self.as_m128i()) } } } impl m128iExt for __m128i { - #[inline(always)] + #[inline] fn as_m128i(self) -> __m128i { self } } @@ -358,49 +358,49 @@ impl m128iExt for __m128i { trait m256iExt: Sized { fn as_m256i(self) -> __m256i; - #[inline(always)] + #[inline] fn as_u8x32(self) -> ::v256::u8x32 { unsafe { mem::transmute(self.as_m256i()) } } - #[inline(always)] + #[inline] fn as_u16x16(self) -> ::v256::u16x16 { unsafe { mem::transmute(self.as_m256i()) } } - #[inline(always)] + #[inline] fn as_u32x8(self) -> ::v256::u32x8 { unsafe { mem::transmute(self.as_m256i()) } } - #[inline(always)] + #[inline] fn as_u64x4(self) -> ::v256::u64x4 { unsafe { mem::transmute(self.as_m256i()) } } - #[inline(always)] + #[inline] fn as_i8x32(self) -> ::v256::i8x32 { unsafe { mem::transmute(self.as_m256i()) } } - #[inline(always)] + #[inline] fn as_i16x16(self) -> ::v256::i16x16 { unsafe { mem::transmute(self.as_m256i()) } } - #[inline(always)] + #[inline] fn as_i32x8(self) -> ::v256::i32x8 { unsafe { mem::transmute(self.as_m256i()) } } - #[inline(always)] + #[inline] fn as_i64x4(self) -> ::v256::i64x4 { unsafe { mem::transmute(self.as_m256i()) } } } impl m256iExt for __m256i { - #[inline(always)] + #[inline] fn as_m256i(self) -> __m256i { self } } diff --git a/coresimd/src/x86/x86_64/abm.rs b/coresimd/src/x86/x86_64/abm.rs index 9889501048..235fa8bb17 100644 --- a/coresimd/src/x86/x86_64/abm.rs +++ b/coresimd/src/x86/x86_64/abm.rs @@ -4,7 +4,7 @@ use stdsimd_test::assert_instr; /// Counts the leading most significant zero bits. /// /// When the operand is zero, it returns its size in bits. -#[inline(always)] +#[inline] #[target_feature(enable = "lzcnt")] #[cfg_attr(test, assert_instr(lzcnt))] pub unsafe fn _lzcnt_u64(x: u64) -> u64 { @@ -12,7 +12,7 @@ pub unsafe fn _lzcnt_u64(x: u64) -> u64 { } /// Counts the bits that are set. 
-#[inline(always)] +#[inline] #[target_feature(enable = "popcnt")] #[cfg_attr(test, assert_instr(popcnt))] pub unsafe fn _popcnt64(x: i64) -> i32 { diff --git a/coresimd/src/x86/x86_64/avx.rs b/coresimd/src/x86/x86_64/avx.rs index ae92ccd711..3f9fda1451 100644 --- a/coresimd/src/x86/x86_64/avx.rs +++ b/coresimd/src/x86/x86_64/avx.rs @@ -5,7 +5,7 @@ use x86::*; /// Copy `a` to result, and insert the 64-bit integer `i` into result /// at the location specified by `index`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_insert_epi64(a: __m256i, i: i64, index: i32) -> __m256i { diff --git a/coresimd/src/x86/x86_64/avx2.rs b/coresimd/src/x86/x86_64/avx2.rs index 840569e7c9..6cdb542bfb 100644 --- a/coresimd/src/x86/x86_64/avx2.rs +++ b/coresimd/src/x86/x86_64/avx2.rs @@ -2,7 +2,7 @@ use simd_llvm::*; use x86::*; /// Extract a 64-bit integer from `a`, selected with `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "avx2")] // This intrinsic has no corresponding instruction. pub unsafe fn _mm256_extract_epi64(a: __m256i, imm8: i32) -> i64 { diff --git a/coresimd/src/x86/x86_64/bmi.rs b/coresimd/src/x86/x86_64/bmi.rs index cda1daa7e7..d8d8a74972 100644 --- a/coresimd/src/x86/x86_64/bmi.rs +++ b/coresimd/src/x86/x86_64/bmi.rs @@ -3,7 +3,7 @@ use stdsimd_test::assert_instr; /// Extracts bits in range [`start`, `start` + `length`) from `a` into /// the least significant bits of the result. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(bextr))] #[cfg(not(target_arch = "x86"))] @@ -16,7 +16,7 @@ pub unsafe fn _bextr_u64(a: u64, start: u32, len: u32) -> u64 { /// /// Bits [7,0] of `control` specify the index to the first bit in the range to /// be extracted, and bits [15,8] specify the length of the range. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(bextr))] #[cfg(not(target_arch = "x86"))] @@ -25,7 +25,7 @@ pub unsafe fn _bextr2_u64(a: u64, control: u64) -> u64 { } /// Bitwise logical `AND` of inverted `a` with `b`. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(andn))] pub unsafe fn _andn_u64(a: u64, b: u64) -> u64 { @@ -33,7 +33,7 @@ pub unsafe fn _andn_u64(a: u64, b: u64) -> u64 { } /// Extract lowest set isolated bit. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(blsi))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions @@ -42,7 +42,7 @@ pub unsafe fn _blsi_u64(x: u64) -> u64 { } /// Get mask up to lowest set bit. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(blsmsk))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions @@ -53,7 +53,7 @@ pub unsafe fn _blsmsk_u64(x: u64) -> u64 { /// Resets the lowest set bit of `x`. /// /// If `x` is sets CF. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(blsr))] #[cfg(not(target_arch = "x86"))] // generates lots of instructions @@ -64,7 +64,7 @@ pub unsafe fn _blsr_u64(x: u64) -> u64 { /// Counts the number of trailing least significant zero bits. /// /// When the source operand is 0, it returns its size in bits. 
-#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(tzcnt))] pub unsafe fn _tzcnt_u64(x: u64) -> u64 { @@ -74,7 +74,7 @@ pub unsafe fn _tzcnt_u64(x: u64) -> u64 { /// Counts the number of trailing least significant zero bits. /// /// When the source operand is 0, it returns its size in bits. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi")] #[cfg_attr(test, assert_instr(tzcnt))] pub unsafe fn _mm_tzcnt_64(x: u64) -> i64 { diff --git a/coresimd/src/x86/x86_64/bmi2.rs b/coresimd/src/x86/x86_64/bmi2.rs index 761fa5fec1..b1c74d15c8 100644 --- a/coresimd/src/x86/x86_64/bmi2.rs +++ b/coresimd/src/x86/x86_64/bmi2.rs @@ -5,7 +5,7 @@ use stdsimd_test::assert_instr; /// /// Unsigned multiplication of `a` with `b` returning a pair `(lo, hi)` with /// the low half and the high half of the result. -#[inline(always)] +#[inline] #[cfg_attr(test, assert_instr(mulx))] #[target_feature(enable = "bmi2")] #[cfg(not(target_arch = "x86"))] // calls an intrinsic @@ -16,7 +16,7 @@ pub unsafe fn _mulx_u64(a: u64, b: u64, hi: &mut u64) -> u64 { } /// Zero higher bits of `a` >= `index`. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi2")] #[cfg_attr(test, assert_instr(bzhi))] #[cfg(not(target_arch = "x86"))] @@ -26,7 +26,7 @@ pub unsafe fn _bzhi_u64(a: u64, index: u32) -> u64 { /// Scatter contiguous low order bits of `a` to the result at the positions /// specified by the `mask`. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi2")] #[cfg_attr(test, assert_instr(pdep))] #[cfg(not(target_arch = "x86"))] @@ -36,7 +36,7 @@ pub unsafe fn _pdep_u64(a: u64, mask: u64) -> u64 { /// Gathers the bits of `x` specified by the `mask` into the contiguous low /// order bit positions of the result. -#[inline(always)] +#[inline] #[target_feature(enable = "bmi2")] #[cfg_attr(test, assert_instr(pext))] #[cfg(not(target_arch = "x86"))] diff --git a/coresimd/src/x86/x86_64/fxsr.rs b/coresimd/src/x86/x86_64/fxsr.rs index c2a7391a2b..d717db15dc 100644 --- a/coresimd/src/x86/x86_64/fxsr.rs +++ b/coresimd/src/x86/x86_64/fxsr.rs @@ -21,7 +21,7 @@ extern "C" { /// /// [fxsave]: http://www.felixcloutier.com/x86/FXSAVE.html /// [fxrstor]: http://www.felixcloutier.com/x86/FXRSTOR.html -#[inline(always)] +#[inline] #[target_feature(enable = "fxsr")] #[cfg_attr(test, assert_instr(fxsave64))] pub unsafe fn _fxsave64(mem_addr: *mut u8) { @@ -42,7 +42,7 @@ pub unsafe fn _fxsave64(mem_addr: *mut u8) { /// /// [fxsave]: http://www.felixcloutier.com/x86/FXSAVE.html /// [fxrstor]: http://www.felixcloutier.com/x86/FXRSTOR.html -#[inline(always)] +#[inline] #[target_feature(enable = "fxsr")] #[cfg_attr(test, assert_instr(fxrstor64))] pub unsafe fn _fxrstor64(mem_addr: *const u8) { diff --git a/coresimd/src/x86/x86_64/sse.rs b/coresimd/src/x86/x86_64/sse.rs index ff7929afc9..4763e81b6f 100644 --- a/coresimd/src/x86/x86_64/sse.rs +++ b/coresimd/src/x86/x86_64/sse.rs @@ -24,7 +24,7 @@ extern "C" { /// [`_mm_setcsr`](fn._mm_setcsr.html)). /// /// This corresponds to the `CVTSS2SI` instruction (with 64 bit output). -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cvtss2si))] pub unsafe fn _mm_cvtss_si64(a: __m128) -> i64 { @@ -40,7 +40,7 @@ pub unsafe fn _mm_cvtss_si64(a: __m128) -> i64 { /// point exception if unmasked (see [`_mm_setcsr`](fn._mm_setcsr.html)). /// /// This corresponds to the `CVTTSS2SI` instruction (with 64 bit output). 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cvttss2si))] pub unsafe fn _mm_cvttss_si64(a: __m128) -> i64 { @@ -52,7 +52,7 @@ pub unsafe fn _mm_cvttss_si64(a: __m128) -> i64 { /// /// This intrinsic corresponds to the `CVTSI2SS` instruction (with 64 bit /// input). -#[inline(always)] +#[inline] #[target_feature(enable = "sse")] #[cfg_attr(test, assert_instr(cvtsi2ss))] pub unsafe fn _mm_cvtsi64_ss(a: __m128, b: i64) -> __m128 { diff --git a/coresimd/src/x86/x86_64/sse2.rs b/coresimd/src/x86/x86_64/sse2.rs index d99a391157..ff16d1f957 100644 --- a/coresimd/src/x86/x86_64/sse2.rs +++ b/coresimd/src/x86/x86_64/sse2.rs @@ -16,7 +16,7 @@ extern "C" { /// Convert the lower double-precision (64-bit) floating-point element in a to /// a 64-bit integer. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtsd2si))] pub unsafe fn _mm_cvtsd_si64(a: __m128d) -> i64 { @@ -24,7 +24,7 @@ pub unsafe fn _mm_cvtsd_si64(a: __m128d) -> i64 { } /// Alias for [`_mm_cvtsd_si64`](fn._mm_cvtsd_si64_ss.html). -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtsd2si))] pub unsafe fn _mm_cvtsd_si64x(a: __m128d) -> i64 { @@ -33,7 +33,7 @@ pub unsafe fn _mm_cvtsd_si64x(a: __m128d) -> i64 { /// Convert the lower double-precision (64-bit) floating-point element in `a` /// to a 64-bit integer with truncation. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvttsd2si))] pub unsafe fn _mm_cvttsd_si64(a: __m128d) -> i64 { @@ -41,7 +41,7 @@ pub unsafe fn _mm_cvttsd_si64(a: __m128d) -> i64 { } /// Alias for [`_mm_cvttsd_si64`](fn._mm_cvttsd_si64_ss.html). -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvttsd2si))] pub unsafe fn _mm_cvttsd_si64x(a: __m128d) -> i64 { @@ -51,7 +51,7 @@ pub unsafe fn _mm_cvttsd_si64x(a: __m128d) -> i64 { /// Stores a 64-bit integer value in the specified memory location. /// To minimize caching, the data is flagged as non-temporal (unlikely to be /// used again soon). -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(movnti))] pub unsafe fn _mm_stream_si64(mem_addr: *mut i64, a: i64) { @@ -60,7 +60,7 @@ pub unsafe fn _mm_stream_si64(mem_addr: *mut i64, a: i64) { /// Return a vector whose lowest element is `a` and all higher elements are /// `0`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(all(test, not(windows)), assert_instr(movq))] pub unsafe fn _mm_cvtsi64_si128(a: i64) -> __m128i { @@ -69,7 +69,7 @@ pub unsafe fn _mm_cvtsi64_si128(a: i64) -> __m128i { /// Return a vector whose lowest element is `a` and all higher elements are /// `0`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(all(test, not(windows)), assert_instr(movq))] pub unsafe fn _mm_cvtsi64x_si128(a: i64) -> __m128i { @@ -77,7 +77,7 @@ pub unsafe fn _mm_cvtsi64x_si128(a: i64) -> __m128i { } /// Return the lowest element of `a`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(all(test, not(windows)), assert_instr(movq))] pub unsafe fn _mm_cvtsi128_si64(a: __m128i) -> i64 { @@ -85,7 +85,7 @@ pub unsafe fn _mm_cvtsi128_si64(a: __m128i) -> i64 { } /// Return the lowest element of `a`. 
-#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(all(test, not(windows)), assert_instr(movq))] pub unsafe fn _mm_cvtsi128_si64x(a: __m128i) -> i64 { @@ -94,7 +94,7 @@ pub unsafe fn _mm_cvtsi128_si64x(a: __m128i) -> i64 { /// Return `a` with its lower element replaced by `b` after converting it to /// an `f64`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtsi2sd))] pub unsafe fn _mm_cvtsi64_sd(a: __m128d, b: i64) -> __m128d { @@ -103,7 +103,7 @@ pub unsafe fn _mm_cvtsi64_sd(a: __m128d, b: i64) -> __m128d { /// Return `a` with its lower element replaced by `b` after converting it to /// an `f64`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse2")] #[cfg_attr(test, assert_instr(cvtsi2sd))] pub unsafe fn _mm_cvtsi64x_sd(a: __m128d, b: i64) -> __m128d { diff --git a/coresimd/src/x86/x86_64/sse41.rs b/coresimd/src/x86/x86_64/sse41.rs index a7f25a4ae3..2747ad4471 100644 --- a/coresimd/src/x86/x86_64/sse41.rs +++ b/coresimd/src/x86/x86_64/sse41.rs @@ -9,7 +9,7 @@ use simd_llvm::*; use stdsimd_test::assert_instr; /// Extract an 64-bit integer from `a` selected with `imm8` -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] // TODO: Add test for Windows #[cfg_attr(all(test, not(windows)), assert_instr(pextrq, imm8 = 1))] @@ -20,7 +20,7 @@ pub unsafe fn _mm_extract_epi64(a: __m128i, imm8: i32) -> i64 { /// Return a copy of `a` with the 64-bit integer from `i` inserted at a /// location specified by `imm8`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.1")] #[cfg_attr(test, assert_instr(pinsrq, imm8 = 0))] pub unsafe fn _mm_insert_epi64(a: __m128i, i: i64, imm8: i32) -> __m128i { diff --git a/coresimd/src/x86/x86_64/sse42.rs b/coresimd/src/x86/x86_64/sse42.rs index 12fd87ea2b..6fe79ea8c0 100644 --- a/coresimd/src/x86/x86_64/sse42.rs +++ b/coresimd/src/x86/x86_64/sse42.rs @@ -11,7 +11,7 @@ extern "C" { /// Starting with the initial value in `crc`, return the accumulated /// CRC32 value for unsigned 64-bit integer `v`. -#[inline(always)] +#[inline] #[target_feature(enable = "sse4.2")] #[cfg_attr(test, assert_instr(crc32))] pub unsafe fn _mm_crc32_u64(crc: u64, v: u64) -> u64 { diff --git a/coresimd/src/x86/x86_64/xsave.rs b/coresimd/src/x86/x86_64/xsave.rs index fc8b38ced6..0ddd8b1476 100644 --- a/coresimd/src/x86/x86_64/xsave.rs +++ b/coresimd/src/x86/x86_64/xsave.rs @@ -29,7 +29,7 @@ extern "C" { /// /// The format of the XSAVE area is detailed in Section 13.4, “XSAVE Area,” of /// Intel® 64 and IA-32 Architectures Software Developer’s Manual, Volume 1. -#[inline(always)] +#[inline] #[target_feature(enable = "xsave")] #[cfg_attr(test, assert_instr(xsave64))] pub unsafe fn _xsave64(mem_addr: *mut u8, save_mask: u64) { @@ -42,7 +42,7 @@ pub unsafe fn _xsave64(mem_addr: *mut u8, save_mask: u64) { /// State is restored based on bits [62:0] in `rs_mask`, `XCR0`, and /// `mem_addr.HEADER.XSTATE_BV`. `mem_addr` must be aligned on a 64-byte /// boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "xsave")] #[cfg_attr(test, assert_instr(xrstor64))] pub unsafe fn _xrstor64(mem_addr: *const u8, rs_mask: u64) { @@ -56,7 +56,7 @@ pub unsafe fn _xrstor64(mem_addr: *const u8, rs_mask: u64) { /// `mem_addr` must be aligned on a 64-byte boundary. The hardware may optimize /// the manner in which data is saved. The performance of this instruction will /// be equal to or better than using the `XSAVE64` instruction. 
-#[inline(always)] +#[inline] #[target_feature(enable = "xsave,xsaveopt")] #[cfg_attr(test, assert_instr(xsaveopt64))] pub unsafe fn _xsaveopt64(mem_addr: *mut u8, save_mask: u64) { @@ -69,7 +69,7 @@ pub unsafe fn _xsaveopt64(mem_addr: *mut u8, save_mask: u64) { /// `xsavec` differs from `xsave` in that it uses compaction and that it may /// use init optimization. State is saved based on bits [62:0] in `save_mask` /// and `XCR0`. `mem_addr` must be aligned on a 64-byte boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "xsave,xsavec")] #[cfg_attr(test, assert_instr(xsavec64))] pub unsafe fn _xsavec64(mem_addr: *mut u8, save_mask: u64) { @@ -83,7 +83,7 @@ pub unsafe fn _xsavec64(mem_addr: *mut u8, save_mask: u64) { /// corresponding to bits set in `IA32_XSS` `MSR` and that it may use the /// modified optimization. State is saved based on bits [62:0] in `save_mask` /// and `XCR0`. `mem_addr` must be aligned on a 64-byte boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "xsave,xsaves")] #[cfg_attr(test, assert_instr(xsaves64))] pub unsafe fn _xsaves64(mem_addr: *mut u8, save_mask: u64) { @@ -99,7 +99,7 @@ pub unsafe fn _xsaves64(mem_addr: *mut u8, save_mask: u64) { /// State is restored based on bits [62:0] in `rs_mask`, `XCR0`, and /// `mem_addr.HEADER.XSTATE_BV`. `mem_addr` must be aligned on a 64-byte /// boundary. -#[inline(always)] +#[inline] #[target_feature(enable = "xsave,xsaves")] #[cfg_attr(test, assert_instr(xrstors64))] pub unsafe fn _xrstors64(mem_addr: *const u8, rs_mask: u64) {