From 374bc27b882b6857f169b5cd9cff7ad5bc4f49ee Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20BRANSTETT?=
Date: Tue, 1 Feb 2022 19:19:01 +0100
Subject: [PATCH 01/12] Simplify the code of fixup by making its code flow
 more natural

---
 compiler/rustc_middle/src/ty/layout.rs | 30 +++++++++++++++-----------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/compiler/rustc_middle/src/ty/layout.rs b/compiler/rustc_middle/src/ty/layout.rs
index 3b05e42a53ead..fa4e0e85a4e84 100644
--- a/compiler/rustc_middle/src/ty/layout.rs
+++ b/compiler/rustc_middle/src/ty/layout.rs
@@ -3360,7 +3360,22 @@ impl<'tcx> LayoutCx<'tcx, TyCtxt<'tcx>> {
         }
 
         match arg.layout.abi {
-            Abi::Aggregate { .. } => {}
+            Abi::Aggregate { .. } => {
+                // Pass and return structures up to 2 pointers in size by value,
+                // matching `ScalarPair`. LLVM will usually pass these in 2 registers
+                // which is more efficient than by-ref.
+                let max_by_val_size = Pointer.size(self) * 2;
+                let size = arg.layout.size;
+
+                if arg.layout.is_unsized() || size > max_by_val_size {
+                    arg.make_indirect();
+                } else {
+                    // We want to pass small aggregates as immediates, but using
+                    // a LLVM aggregate type for this leads to bad optimizations,
+                    // so we pick an appropriately sized integer type instead.
+                    arg.cast_to(Reg { kind: RegKind::Integer, size });
+                }
+            }
 
             // This is a fun case! The gist of what this is doing is
             // that we want callers and callees to always agree on the
@@ -3386,20 +3401,9 @@ impl<'tcx> LayoutCx<'tcx, TyCtxt<'tcx>> {
                 && self.tcx.sess.target.simd_types_indirect =>
             {
                 arg.make_indirect();
-                return;
             }
-            _ => return,
-        }
-
-        let size = arg.layout.size;
-        if arg.layout.is_unsized() || size > Pointer.size(self) {
-            arg.make_indirect();
-        } else {
-            // We want to pass small aggregates as immediates, but using
-            // a LLVM aggregate type for this leads to bad optimizations,
-            // so we pick an appropriately sized integer type instead.
-            arg.cast_to(Reg { kind: RegKind::Integer, size });
+            _ => {},
         }
     };
     fixup(&mut fn_abi.ret);

From bf97e79437100cb76003dd16b4626d045201154c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20BRANSTETT?=
Date: Tue, 1 Feb 2022 22:10:47 +0100
Subject: [PATCH 02/12] Don't aggregate homogeneous floats in the Rust ABI

---
 compiler/rustc_middle/src/ty/layout.rs        | 22 ++++-
 src/test/assembly/x86-64-homogenous-floats.rs | 45 ++++++++++++++++++
 src/test/codegen/homogeneous-floats.rs        | 32 +++++++++++++
 src/test/ui/abi/homogenous-floats.rs          | 46 +++++++++++++++++++
 4 files changed, 143 insertions(+), 2 deletions(-)
 create mode 100644 src/test/assembly/x86-64-homogenous-floats.rs
 create mode 100644 src/test/codegen/homogeneous-floats.rs
 create mode 100644 src/test/ui/abi/homogenous-floats.rs

diff --git a/compiler/rustc_middle/src/ty/layout.rs b/compiler/rustc_middle/src/ty/layout.rs
index fa4e0e85a4e84..847a0e3e58e34 100644
--- a/compiler/rustc_middle/src/ty/layout.rs
+++ b/compiler/rustc_middle/src/ty/layout.rs
@@ -14,7 +14,8 @@ use rustc_session::{config::OptLevel, DataTypeKind, FieldInfo, SizeKind, Variant
 use rustc_span::symbol::Symbol;
 use rustc_span::{Span, DUMMY_SP};
 use rustc_target::abi::call::{
-    ArgAbi, ArgAttribute, ArgAttributes, ArgExtension, Conv, FnAbi, PassMode, Reg, RegKind,
+    ArgAbi, ArgAttribute, ArgAttributes, ArgExtension, Conv, FnAbi, HomogeneousAggregate, PassMode,
+    Reg, RegKind,
 };
 use rustc_target::abi::*;
 use rustc_target::spec::{abi::Abi as SpecAbi, HasTargetSpec, PanicStrategy, Target};
@@ -3369,10 +3370,27 @@ impl<'tcx> LayoutCx<'tcx, TyCtxt<'tcx>> {
                 if arg.layout.is_unsized() || size > max_by_val_size {
                     arg.make_indirect();
+                } else if let Ok(HomogeneousAggregate::Homogeneous(Reg {
+                    kind: RegKind::Float,
+                    ..
+                })) = arg.layout.homogeneous_aggregate(self)
+                {
+                    // We don't want to aggregate floats as an aggregates of Integer
+                    // because this will hurt the generated assembly (#93490)
+                    //
+                    // As an optimization we want to pass homogeneous aggregate of floats
+                    // greater than pointer size as indirect
+                    if size > Pointer.size(self) {
+                        arg.make_indirect();
+                    }
                 } else {
                     // We want to pass small aggregates as immediates, but using
                     // a LLVM aggregate type for this leads to bad optimizations,
                     // so we pick an appropriately sized integer type instead.
+                    //
+                    // NOTE: This is sub-optimal because in the case of (f32, f32, u32, u32)
+                    // we could do ([f32; 2], u64) which is better but this is the best we
+                    // can do right now.
                     arg.cast_to(Reg { kind: RegKind::Integer, size });
                 }
             }
@@ -3403,7 +3421,7 @@ impl<'tcx> LayoutCx<'tcx, TyCtxt<'tcx>> {
                 arg.make_indirect();
             }
 
-            _ => {},
+            _ => {}
         }
     };
     fixup(&mut fn_abi.ret);
diff --git a/src/test/assembly/x86-64-homogenous-floats.rs b/src/test/assembly/x86-64-homogenous-floats.rs
new file mode 100644
index 0000000000000..5b725bab07790
--- /dev/null
+++ b/src/test/assembly/x86-64-homogenous-floats.rs
@@ -0,0 +1,45 @@
+// assembly-output: emit-asm
+// needs-llvm-components: x86
+// compile-flags: --target x86_64-unknown-linux-gnu
+// compile-flags: -C llvm-args=--x86-asm-syntax=intel
+// compile-flags: -C opt-level=3
+
+#![crate_type = "rlib"]
+#![no_std]
+
+// CHECK-LABEL: sum_f32:
+// CHECK: addss xmm0, xmm1
+// CHECK-NEXT: ret
+#[no_mangle]
+pub fn sum_f32(a: f32, b: f32) -> f32 {
+    a + b
+}
+
+// CHECK-LABEL: sum_f32x2:
+// CHECK: addss xmm{{[0-9]}}, xmm{{[0-9]}}
+// CHECK-NEXT: addss xmm{{[0-9]}}, xmm{{[0-9]}}
+// CHECK-NEXT: ret
+#[no_mangle]
+pub fn sum_f32x2(a: [f32; 2], b: [f32; 2]) -> [f32; 2] {
+    [
+        a[0] + b[0],
+        a[1] + b[1],
+    ]
+}
+
+// CHECK-LABEL: sum_f32x4:
+// CHECK: mov rax, [[PTR_IN:.*]]
+// CHECK-NEXT: movups [[XMMA:xmm[0-9]]], xmmword ptr [rsi]
+// CHECK-NEXT: movups [[XMMB:xmm[0-9]]], xmmword ptr [rdx]
+// CHECK-NEXT: addps [[XMMB]], [[XMMA]]
+// CHECK-NEXT: movups xmmword ptr {{\[}}[[PTR_IN]]{{\]}}, [[XMMB]]
+// CHECK-NEXT: ret
+#[no_mangle]
+pub fn sum_f32x4(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
+    [
+        a[0] + b[0],
+        a[1] + b[1],
+        a[2] + b[2],
+        a[3] + b[3],
+    ]
+}
diff --git a/src/test/codegen/homogeneous-floats.rs b/src/test/codegen/homogeneous-floats.rs
new file mode 100644
index 0000000000000..0b729156d2842
--- /dev/null
+++ b/src/test/codegen/homogeneous-floats.rs
@@ -0,0 +1,32 @@
+//! Check that small (less than 128 bits on x86_64) homogeneous floats are passed either as an
+//! array or by a pointer
+
+// compile-flags: -C no-prepopulate-passes -O
+// only-x86_64
+
+#![crate_type = "lib"]
+
+pub struct Foo {
+    bar1: f32,
+    bar2: f32,
+    bar3: f32,
+    bar4: f32,
+}
+
+// CHECK: define [2 x float] @array_f32x2([2 x float] %0, [2 x float] %1)
+#[no_mangle]
+pub fn array_f32x2(a: [f32; 2], b: [f32; 2]) -> [f32; 2] {
+    todo!()
+}
+
+// CHECK: define void @array_f32x4([4 x float]* {{.*}} sret([4 x float]) {{.*}} %0, [4 x float]* {{.*}} %a, [4 x float]* {{.*}} %b)
+#[no_mangle]
+pub fn array_f32x4(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
+    todo!()
+}
+
+// CHECK: define void @array_f32x4_nested(%Foo* {{.*}} sret(%Foo) {{.*}} %0, %Foo* {{.*}} %a, %Foo* {{.*}} %b)
+#[no_mangle]
+pub fn array_f32x4_nested(a: Foo, b: Foo) -> Foo {
+    todo!()
+}
diff --git a/src/test/ui/abi/homogenous-floats.rs b/src/test/ui/abi/homogenous-floats.rs
new file mode 100644
index 0000000000000..cbbcd2a47e82c
--- /dev/null
+++ b/src/test/ui/abi/homogenous-floats.rs
@@ -0,0 +1,46 @@
+// This tests that, no matter the optimization level or the target features enabled, the
+// non-aggregation of homogeneous floats in the ABI is sound and still produces the right answer.
+
+// revisions: opt-0 opt-0-native opt-1 opt-1-native opt-2 opt-2-native opt-3 opt-3-native
+// [opt-0]: compile-flags: -C opt-level=0
+// [opt-1]: compile-flags: -C opt-level=1
+// [opt-2]: compile-flags: -C opt-level=2
+// [opt-3]: compile-flags: -C opt-level=3
+// [opt-0-native]: compile-flags: -C target-cpu=native
+// [opt-1-native]: compile-flags: -C target-cpu=native
+// [opt-2-native]: compile-flags: -C target-cpu=native
+// [opt-3-native]: compile-flags: -C target-cpu=native
+// run-pass
+
+#![feature(core_intrinsics)]
+
+use std::intrinsics::black_box;
+
+pub fn sum_f32(a: f32, b: f32) -> f32 {
+    a + b
+}
+
+pub fn sum_f32x2(a: [f32; 2], b: [f32; 2]) -> [f32; 2] {
+    [a[0] + b[0], a[1] + b[1]]
+}
+
+pub fn sum_f32x3(a: [f32; 3], b: [f32; 3]) -> [f32; 3] {
+    [a[0] + b[0], a[1] + b[1], a[2] + b[2]]
+}
+
+pub fn sum_f32x4(a: [f32; 4], b: [f32; 4]) -> [f32; 4] {
+    [a[0] + b[0], a[1] + b[1], a[2] + b[2], a[3] + b[3]]
+}
+
+fn main() {
+    assert_eq!(1., black_box(sum_f32(black_box(0.), black_box(1.))));
+    assert_eq!([2., 2.], black_box(sum_f32x2(black_box([2., 0.]), black_box([0., 2.]))));
+    assert_eq!(
+        [3., 3., 3.],
+        black_box(sum_f32x3(black_box([1., 2., 3.]), black_box([2., 1., 0.])))
+    );
+    assert_eq!(
+        [4., 4., 4., 4.],
+        black_box(sum_f32x4(black_box([1., 2., 3., 4.]), black_box([3., 2., 1., 0.])))
+    );
+}

From dcc75bfc664160fde43b636ac49508ed98cbb247 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20BRANSTETT?=
Date: Mon, 21 Feb 2022 01:45:03 +0100
Subject: [PATCH 03/12] Test that target feature mix-up with homogeneous
 floats is sound

This is basically a rip-off of src/test/ui/simd/target-feature-mixup.rs but
for floats and without #[repr(simd)]
---
 .../homogenous-floats-target-feature-mixup.rs | 184 ++++++++++++++++++
 1 file changed, 184 insertions(+)
 create mode 100644 src/test/ui/abi/homogenous-floats-target-feature-mixup.rs

diff --git a/src/test/ui/abi/homogenous-floats-target-feature-mixup.rs b/src/test/ui/abi/homogenous-floats-target-feature-mixup.rs
new file mode 100644
index 0000000000000..536ad4522e742
--- /dev/null
+++ b/src/test/ui/abi/homogenous-floats-target-feature-mixup.rs
@@ -0,0 +1,184 @@
+// This test checks that even if we mix up the target features of functions with homogeneous
+// floats, the ABI is sound and still produces the right answer.
+//
+// This is basically the same test as src/test/ui/simd/target-feature-mixup.rs but for floats and
+// without #[repr(simd)]
+
+// run-pass
+// ignore-emscripten
+// ignore-sgx no processes
+
+#![feature(target_feature, cfg_target_feature)]
+#![feature(avx512_target_feature)]
+
+#![allow(overflowing_literals)]
+#![allow(unused_variables)]
+#![allow(stable_features)]
+
+use std::process::{Command, ExitStatus};
+use std::env;
+
+fn main() {
+    if let Some(level) = env::args().nth(1) {
+        return test::main(&level)
+    }
+
+    let me = env::current_exe().unwrap();
+    for level in ["sse", "avx", "avx512"].iter() {
+        let status = Command::new(&me).arg(level).status().unwrap();
+        if status.success() {
+            println!("success with {}", level);
+            continue
+        }
+
+        // We don't actually know if our computer has the requisite target features
+        // for the test below. Testing for that will get added to libstd later, so
+        // for now just assume sigill means this is a machine that can't run this test.
+        if is_sigill(status) {
+            println!("sigill with {}, assuming spurious", level);
+            continue
+        }
+        panic!("invalid status at {}: {}", level, status);
+    }
+}
+
+#[cfg(unix)]
+fn is_sigill(status: ExitStatus) -> bool {
+    use std::os::unix::prelude::*;
+    status.signal() == Some(4)
+}
+
+#[cfg(windows)]
+fn is_sigill(status: ExitStatus) -> bool {
+    status.code() == Some(0xc000001d)
+}
+
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+#[allow(nonstandard_style)]
+mod test {
+    #[derive(PartialEq, Debug, Clone, Copy)]
+    struct f32x2(f32, f32);
+
+    #[derive(PartialEq, Debug, Clone, Copy)]
+    struct f32x4(f32, f32, f32, f32);
+
+    #[derive(PartialEq, Debug, Clone, Copy)]
+    struct f32x8(f32, f32, f32, f32, f32, f32, f32, f32);
+
+    pub fn main(level: &str) {
+        unsafe {
+            main_normal(level);
+            main_sse(level);
+            if level == "sse" {
+                return
+            }
+            main_avx(level);
+            if level == "avx" {
+                return
+            }
+            main_avx512(level);
+        }
+    }
+
+    macro_rules! mains {
+        ($(
+            $(#[$attr:meta])*
+            unsafe fn $main:ident(level: &str) {
+                ...
+            }
+        )*) => ($(
+            $(#[$attr])*
+            unsafe fn $main(level: &str) {
+                let m128 = f32x2(1., 2.);
+                let m256 = f32x4(3., 4., 5., 6.);
+                let m512 = f32x8(7., 8., 9., 10., 11., 12., 13., 14.);
+                assert_eq!(id_sse_128(m128), m128);
+                assert_eq!(id_sse_256(m256), m256);
+                assert_eq!(id_sse_512(m512), m512);
+
+                if level == "sse" {
+                    return
+                }
+                assert_eq!(id_avx_128(m128), m128);
+                assert_eq!(id_avx_256(m256), m256);
+                assert_eq!(id_avx_512(m512), m512);
+
+                if level == "avx" {
+                    return
+                }
+                assert_eq!(id_avx512_128(m128), m128);
+                assert_eq!(id_avx512_256(m256), m256);
+                assert_eq!(id_avx512_512(m512), m512);
+            }
+        )*)
+    }
+
+    mains! {
+        unsafe fn main_normal(level: &str) { ... }
+        #[target_feature(enable = "sse2")]
+        unsafe fn main_sse(level: &str) { ... }
+        #[target_feature(enable = "avx")]
+        unsafe fn main_avx(level: &str) { ... }
+        #[target_feature(enable = "avx512bw")]
+        unsafe fn main_avx512(level: &str) { ... }
+    }
+
+    #[target_feature(enable = "sse2")]
+    unsafe fn id_sse_128(a: f32x2) -> f32x2 {
+        assert_eq!(a, f32x2(1., 2.));
+        a.clone()
+    }
+
+    #[target_feature(enable = "sse2")]
+    unsafe fn id_sse_256(a: f32x4) -> f32x4 {
+        assert_eq!(a, f32x4(3., 4., 5., 6.));
+        a.clone()
+    }
+
+    #[target_feature(enable = "sse2")]
+    unsafe fn id_sse_512(a: f32x8) -> f32x8 {
+        assert_eq!(a, f32x8(7., 8., 9., 10., 11., 12., 13., 14.));
+        a.clone()
+    }
+
+    #[target_feature(enable = "avx")]
+    unsafe fn id_avx_128(a: f32x2) -> f32x2 {
+        assert_eq!(a, f32x2(1., 2.));
+        a.clone()
+    }
+
+    #[target_feature(enable = "avx")]
+    unsafe fn id_avx_256(a: f32x4) -> f32x4 {
+        assert_eq!(a, f32x4(3., 4., 5., 6.));
+        a.clone()
+    }
+
+    #[target_feature(enable = "avx")]
+    unsafe fn id_avx_512(a: f32x8) -> f32x8 {
+        assert_eq!(a, f32x8(7., 8., 9., 10., 11., 12., 13., 14.));
+        a.clone()
+    }
+
+    #[target_feature(enable = "avx512bw")]
+    unsafe fn id_avx512_128(a: f32x2) -> f32x2 {
+        assert_eq!(a, f32x2(1., 2.));
+        a.clone()
+    }
+
+    #[target_feature(enable = "avx512bw")]
+    unsafe fn id_avx512_256(a: f32x4) -> f32x4 {
+        assert_eq!(a, f32x4(3., 4., 5., 6.));
+        a.clone()
+    }
+
+    #[target_feature(enable = "avx512bw")]
+    unsafe fn id_avx512_512(a: f32x8) -> f32x8 {
+        assert_eq!(a, f32x8(7., 8., 9., 10., 11., 12., 13., 14.));
+        a.clone()
+    }
+}
+
+#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))]
+mod test {
+    pub fn main(level: &str) {}
+}

From ec16a323e8b90f892dd42031657fafe850705edd Mon Sep 17 00:00:00 2001
From: Urgau
Date: Mon, 30 May 2022 20:37:10 +0200
Subject: [PATCH 04/12] Fix some codegen tests

---
 src/test/codegen/array-equality.rs | 3 +--
 src/test/codegen/union-abi.rs      | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/test/codegen/array-equality.rs b/src/test/codegen/array-equality.rs
index cd5e82a9205c1..b509bc20775b0 100644
--- a/src/test/codegen/array-equality.rs
+++ b/src/test/codegen/array-equality.rs
@@ -56,8 +56,7 @@ pub fn array_eq_zero_short(x: [u16; 3]) -> bool {
 #[no_mangle]
 pub fn array_eq_zero_mid(x: [u16; 8]) -> bool {
     // CHECK-NEXT: start:
-    // CHECK: %[[LOAD:.+]] = load i128,
-    // CHECK-NEXT: %[[EQ:.+]] = icmp eq i128 %[[LOAD]], 0
+    // CHECK-NEXT: %[[EQ:.+]] = icmp eq i128 %0, 0
     // CHECK-NEXT: ret i1 %[[EQ]]
     x == [0; 8]
 }
diff --git a/src/test/codegen/union-abi.rs b/src/test/codegen/union-abi.rs
index c18f2a49fc369..6e3866778a391 100644
--- a/src/test/codegen/union-abi.rs
+++ b/src/test/codegen/union-abi.rs
@@ -65,7 +65,7 @@ pub fn test_UnionU128(_: UnionU128) -> UnionU128 { loop {} }
 #[repr(C)]
 pub union CUnionU128{a:u128}
-// CHECK: define void @test_CUnionU128({{%CUnionU128\*|ptr}} {{.*}} %_1)
+// CHECK: define void @test_CUnionU128(i128 %0)
 #[no_mangle]
 pub fn test_CUnionU128(_: CUnionU128) { loop {} }

From 9ed05ed87f0b1627836ab555ba86af55eb38168e Mon Sep 17 00:00:00 2001
From: Urgau
Date: Mon, 30 May 2022 21:12:22 +0200
Subject: [PATCH 05/12] Use simpler heuristic for determining if a layout
 contains only floats

---
 compiler/rustc_middle/src/ty/layout.rs | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/compiler/rustc_middle/src/ty/layout.rs b/compiler/rustc_middle/src/ty/layout.rs
index 847a0e3e58e34..f638063c4408c 100644
--- a/compiler/rustc_middle/src/ty/layout.rs
+++ b/compiler/rustc_middle/src/ty/layout.rs
@@ -14,8 +14,7 @@ use rustc_session::{config::OptLevel, DataTypeKind, FieldInfo, SizeKind, Variant
 use rustc_span::symbol::Symbol;
 use rustc_span::{Span, DUMMY_SP};
 use rustc_target::abi::call::{
-    ArgAbi, ArgAttribute, ArgAttributes, ArgExtension, Conv, FnAbi, HomogeneousAggregate, PassMode,
-    Reg, RegKind,
+    ArgAbi, ArgAttribute, ArgAttributes, ArgExtension, Conv, FnAbi, PassMode, Reg, RegKind,
 };
 use rustc_target::abi::*;
 use rustc_target::spec::{abi::Abi as SpecAbi, HasTargetSpec, PanicStrategy, Target};
@@ -3341,6 +3340,17 @@ impl<'tcx> LayoutCx<'tcx, TyCtxt<'tcx>> {
         Ok(self.tcx.arena.alloc(fn_abi))
     }
 
+    /// Small heuristic for determining if layout has any float primitive
+    fn has_all_float(&self, layout: &'_ TyAndLayout<'tcx>) -> bool {
+        match layout.abi {
+            Abi::Uninhabited | Abi::Vector { .. } => false,
+            Abi::Scalar(scalar) => matches!(scalar.primitive(), Primitive::F32 | Primitive::F64),
+            Abi::ScalarPair(..) | Abi::Aggregate { .. } => {
+                (0..layout.fields.count()).all(|i| self.has_all_float(&layout.field(self, i)))
+            }
+        }
+    }
+
     fn fn_abi_adjust_for_abi(
         &self,
         fn_abi: &mut FnAbi<'tcx, Ty<'tcx>>,
@@ -3370,11 +3380,7 @@ impl<'tcx> LayoutCx<'tcx, TyCtxt<'tcx>> {
                 if arg.layout.is_unsized() || size > max_by_val_size {
                     arg.make_indirect();
-                } else if let Ok(HomogeneousAggregate::Homogeneous(Reg {
-                    kind: RegKind::Float,
-                    ..
-                })) = arg.layout.homogeneous_aggregate(self)
-                {
+                } else if unlikely!(self.has_all_float(&arg.layout)) {
                     // We don't want to aggregate floats as an aggregates of Integer
                     // because this will hurt the generated assembly (#93490)
                     //

From f1c72be1ebb734ff036390626178f04a4df71799 Mon Sep 17 00:00:00 2001
From: Urgau
Date: Tue, 31 May 2022 10:27:48 +0200
Subject: [PATCH 06/12] Use nbdd0121 suggestion for reducing the perf impact

---
 compiler/rustc_middle/src/ty/layout.rs        | 16 +++++++---------
 src/test/assembly/x86-64-homogenous-floats.rs | 11 +++++++----
 src/test/codegen/homogeneous-floats.rs        |  2 +-
 3 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/compiler/rustc_middle/src/ty/layout.rs b/compiler/rustc_middle/src/ty/layout.rs
index f638063c4408c..a9385595bfbe0 100644
--- a/compiler/rustc_middle/src/ty/layout.rs
+++ b/compiler/rustc_middle/src/ty/layout.rs
@@ -3375,20 +3375,18 @@ impl<'tcx> LayoutCx<'tcx, TyCtxt<'tcx>> {
                 // Pass and return structures up to 2 pointers in size by value,
                 // matching `ScalarPair`. LLVM will usually pass these in 2 registers
                 // which is more efficient than by-ref.
-                let max_by_val_size = Pointer.size(self) * 2;
+                let ptr_size = Pointer.size(self);
+                let max_by_val_size = ptr_size * 2;
                 let size = arg.layout.size;
 
                 if arg.layout.is_unsized() || size > max_by_val_size {
                     arg.make_indirect();
-                } else if unlikely!(self.has_all_float(&arg.layout)) {
+                } else if size > ptr_size && unlikely!(self.has_all_float(&arg.layout)) {
                     // We don't want to aggregate floats as an aggregates of Integer
-                    // because this will hurt the generated assembly (#93490)
-                    //
-                    // As an optimization we want to pass homogeneous aggregate of floats
-                    // greater than pointer size as indirect
-                    if size > Pointer.size(self) {
-                        arg.make_indirect();
-                    }
+                    // because this will hurt the generated assembly (#93490) but as an
+                    // optimization we want to pass homogeneous aggregate of floats
+                    // greater than pointer size as indirect.
+                    arg.make_indirect();
                 } else {
                     // We want to pass small aggregates as immediates, but using
                     // a LLVM aggregate type for this leads to bad optimizations,
diff --git a/src/test/assembly/x86-64-homogenous-floats.rs b/src/test/assembly/x86-64-homogenous-floats.rs
index 5b725bab07790..00b434a688d67 100644
--- a/src/test/assembly/x86-64-homogenous-floats.rs
+++ b/src/test/assembly/x86-64-homogenous-floats.rs
@@ -15,12 +15,15 @@ pub fn sum_f32(a: f32, b: f32) -> f32 {
     a + b
 }
 
-// CHECK-LABEL: sum_f32x2:
-// CHECK: addss xmm{{[0-9]}}, xmm{{[0-9]}}
-// CHECK-NEXT: addss xmm{{[0-9]}}, xmm{{[0-9]}}
+// CHECK-LABEL: sum_f64x2:
+// CHECK: mov rax, [[PTR_IN:.*]]
+// CHECK-NEXT: movupd [[XMMA:xmm[0-9]]], xmmword ptr [rsi]
+// CHECK-NEXT: movupd [[XMMB:xmm[0-9]]], xmmword ptr [rdx]
+// CHECK-NEXT: addpd [[XMMB]], [[XMMA]]
+// CHECK-NEXT: movupd xmmword ptr {{\[}}[[PTR_IN]]{{\]}}, [[XMMB]]
 // CHECK-NEXT: ret
 #[no_mangle]
-pub fn sum_f32x2(a: [f32; 2], b: [f32; 2]) -> [f32; 2] {
+pub fn sum_f64x2(a: [f64; 2], b: [f64; 2]) -> [f64; 2] {
     [
         a[0] + b[0],
         a[1] + b[1],
diff --git a/src/test/codegen/homogeneous-floats.rs b/src/test/codegen/homogeneous-floats.rs
index 0b729156d2842..cf8a62e488ee1 100644
--- a/src/test/codegen/homogeneous-floats.rs
+++ b/src/test/codegen/homogeneous-floats.rs
@@ -13,7 +13,7 @@ pub struct Foo {
     bar4: f32,
 }
 
-// CHECK: define [2 x float] @array_f32x2([2 x float] %0, [2 x float] %1)
+// CHECK: define i64 @array_f32x2(i64 %0, i64 %1)
 #[no_mangle]
 pub fn array_f32x2(a: [f32; 2], b: [f32; 2]) -> [f32; 2] {
     todo!()
 }

From 1be1d4a5402f8378df8537e0d103b1e18b4b6429 Mon Sep 17 00:00:00 2001
From: Urgau
Date: Fri, 1 Jul 2022 10:14:28 +0200
Subject: [PATCH 07/12] Remove undefined unlikely! macro

---
 compiler/rustc_middle/src/ty/layout.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compiler/rustc_middle/src/ty/layout.rs b/compiler/rustc_middle/src/ty/layout.rs
index a9385595bfbe0..6696da5062256 100644
--- a/compiler/rustc_middle/src/ty/layout.rs
+++ b/compiler/rustc_middle/src/ty/layout.rs
@@ -3381,7 +3381,7 @@ impl<'tcx> LayoutCx<'tcx, TyCtxt<'tcx>> {
                 if arg.layout.is_unsized() || size > max_by_val_size {
                     arg.make_indirect();
-                } else if size > ptr_size && unlikely!(self.has_all_float(&arg.layout)) {
+                } else if size > ptr_size && self.has_all_float(&arg.layout) {
                     // We don't want to aggregate floats as an aggregates of Integer
                     // because this will hurt the generated assembly (#93490) but as an
                     // optimization we want to pass homogeneous aggregate of floats

From b2fba9a67f32bc2b8ba609eefd0b21061ca0f27f Mon Sep 17 00:00:00 2001
From: Urgau
Date: Fri, 1 Jul 2022 10:15:17 +0200
Subject: [PATCH 08/12] Revert "Use nbdd0121 suggestion for reducing the perf
 impact"

This reverts commit 2e0cf271285089316db55b995312712638126245.
---
 compiler/rustc_middle/src/ty/layout.rs        | 16 +++++++++-------
 src/test/assembly/x86-64-homogenous-floats.rs | 11 ++++-------
 src/test/codegen/homogeneous-floats.rs        |  2 +-
 3 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/compiler/rustc_middle/src/ty/layout.rs b/compiler/rustc_middle/src/ty/layout.rs
index 6696da5062256..f5fe117c403b6 100644
--- a/compiler/rustc_middle/src/ty/layout.rs
+++ b/compiler/rustc_middle/src/ty/layout.rs
@@ -3375,18 +3375,20 @@ impl<'tcx> LayoutCx<'tcx, TyCtxt<'tcx>> {
                 // Pass and return structures up to 2 pointers in size by value,
                 // matching `ScalarPair`. LLVM will usually pass these in 2 registers
                 // which is more efficient than by-ref.
-                let ptr_size = Pointer.size(self);
-                let max_by_val_size = ptr_size * 2;
+                let max_by_val_size = Pointer.size(self) * 2;
                 let size = arg.layout.size;
 
                 if arg.layout.is_unsized() || size > max_by_val_size {
                     arg.make_indirect();
-                } else if size > ptr_size && self.has_all_float(&arg.layout) {
+                } else if self.has_all_float(&arg.layout) {
                     // We don't want to aggregate floats as an aggregates of Integer
-                    // because this will hurt the generated assembly (#93490) but as an
-                    // optimization we want to pass homogeneous aggregate of floats
-                    // greater than pointer size as indirect.
-                    arg.make_indirect();
+                    // because this will hurt the generated assembly (#93490)
+                    //
+                    // As an optimization we want to pass homogeneous aggregate of floats
+                    // greater than pointer size as indirect
+                    if size > Pointer.size(self) {
+                        arg.make_indirect();
+                    }
                 } else {
                     // We want to pass small aggregates as immediates, but using
                     // a LLVM aggregate type for this leads to bad optimizations,
diff --git a/src/test/assembly/x86-64-homogenous-floats.rs b/src/test/assembly/x86-64-homogenous-floats.rs
index 00b434a688d67..5b725bab07790 100644
--- a/src/test/assembly/x86-64-homogenous-floats.rs
+++ b/src/test/assembly/x86-64-homogenous-floats.rs
@@ -15,12 +15,15 @@ pub fn sum_f32(a: f32, b: f32) -> f32 {
     a + b
 }
 
-// CHECK-LABEL: sum_f64x2:
-// CHECK: mov rax, [[PTR_IN:.*]]
-// CHECK-NEXT: movupd [[XMMA:xmm[0-9]]], xmmword ptr [rsi]
-// CHECK-NEXT: movupd [[XMMB:xmm[0-9]]], xmmword ptr [rdx]
-// CHECK-NEXT: addpd [[XMMB]], [[XMMA]]
-// CHECK-NEXT: movupd xmmword ptr {{\[}}[[PTR_IN]]{{\]}}, [[XMMB]]
+// CHECK-LABEL: sum_f32x2:
+// CHECK: addss xmm{{[0-9]}}, xmm{{[0-9]}}
+// CHECK-NEXT: addss xmm{{[0-9]}}, xmm{{[0-9]}}
 // CHECK-NEXT: ret
 #[no_mangle]
-pub fn sum_f64x2(a: [f64; 2], b: [f64; 2]) -> [f64; 2] {
+pub fn sum_f32x2(a: [f32; 2], b: [f32; 2]) -> [f32; 2] {
     [
         a[0] + b[0],
         a[1] + b[1],
diff --git a/src/test/codegen/homogeneous-floats.rs b/src/test/codegen/homogeneous-floats.rs
index cf8a62e488ee1..0b729156d2842 100644
--- a/src/test/codegen/homogeneous-floats.rs
+++ b/src/test/codegen/homogeneous-floats.rs
@@ -13,7 +13,7 @@ pub struct Foo {
     bar4: f32,
 }
 
-// CHECK: define i64 @array_f32x2(i64 %0, i64 %1)
+// CHECK: define [2 x float] @array_f32x2([2 x float] %0, [2 x float] %1)
 #[no_mangle]
 pub fn array_f32x2(a: [f32; 2], b: [f32; 2]) -> [f32; 2] {
     todo!()
 }

From a84f4c9259aafa9fbd1aaa4ba212e45f7f1fac2a Mon Sep 17 00:00:00 2001
From: Urgau
Date: Sat, 2 Jul 2022 18:12:41 +0200
Subject: [PATCH 09/12] Let LLVM also handle small aggregates

---
 compiler/rustc_middle/src/ty/layout.rs | 58 +++++++++++++-------------
 1 file changed, 29 insertions(+), 29 deletions(-)

diff --git a/compiler/rustc_middle/src/ty/layout.rs b/compiler/rustc_middle/src/ty/layout.rs
index f5fe117c403b6..36c4cfdc38028 100644
--- a/compiler/rustc_middle/src/ty/layout.rs
+++ b/compiler/rustc_middle/src/ty/layout.rs
@@ -14,7 +14,7 @@ use rustc_session::{config::OptLevel, DataTypeKind, FieldInfo, SizeKind, Variant
 use rustc_span::symbol::Symbol;
 use rustc_span::{Span, DUMMY_SP};
 use rustc_target::abi::call::{
-    ArgAbi, ArgAttribute, ArgAttributes, ArgExtension, Conv, FnAbi, PassMode, Reg, RegKind,
+    ArgAbi, ArgAttribute, ArgAttributes, ArgExtension, Conv, FnAbi, PassMode, /* Reg, RegKind, */
 };
 use rustc_target::abi::*;
 use rustc_target::spec::{abi::Abi as SpecAbi, HasTargetSpec, PanicStrategy, Target};
@@ -3340,16 +3340,16 @@ impl<'tcx> LayoutCx<'tcx, TyCtxt<'tcx>> {
         Ok(self.tcx.arena.alloc(fn_abi))
     }
 
-    /// Small heuristic for determining if layout has any float primitive
-    fn has_all_float(&self, layout: &'_ TyAndLayout<'tcx>) -> bool {
-        match layout.abi {
-            Abi::Uninhabited | Abi::Vector { .. } => false,
-            Abi::Scalar(scalar) => matches!(scalar.primitive(), Primitive::F32 | Primitive::F64),
-            Abi::ScalarPair(..) | Abi::Aggregate { .. } => {
-                (0..layout.fields.count()).all(|i| self.has_all_float(&layout.field(self, i)))
-            }
-        }
-    }
+    // /// Small heuristic for determining if layout has any float primitive
+    // fn has_all_float(&self, layout: &'_ TyAndLayout<'tcx>) -> bool {
+    //     match layout.abi {
+    //         Abi::Uninhabited | Abi::Vector { .. } => false,
+    //         Abi::Scalar(scalar) => matches!(scalar.primitive(), Primitive::F32 | Primitive::F64),
+    //         Abi::ScalarPair(..) | Abi::Aggregate { .. } => {
+    //             (0..layout.fields.count()).all(|i| self.has_all_float(&layout.field(self, i)))
+    //         }
+    //     }
+    // }
 
     fn fn_abi_adjust_for_abi(
         &self,
@@ -3380,24 +3380,24 @@ impl<'tcx> LayoutCx<'tcx, TyCtxt<'tcx>> {
                 if arg.layout.is_unsized() || size > max_by_val_size {
                     arg.make_indirect();
-                } else if self.has_all_float(&arg.layout) {
-                    // We don't want to aggregate floats as an aggregates of Integer
-                    // because this will hurt the generated assembly (#93490)
-                    //
-                    // As an optimization we want to pass homogeneous aggregate of floats
-                    // greater than pointer size as indirect
-                    if size > Pointer.size(self) {
-                        arg.make_indirect();
-                    }
-                } else {
-                    // We want to pass small aggregates as immediates, but using
-                    // a LLVM aggregate type for this leads to bad optimizations,
-                    // so we pick an appropriately sized integer type instead.
-                    //
-                    // NOTE: This is sub-optimal because in the case of (f32, f32, u32, u32)
-                    // we could do ([f32; 2], u64) which is better but this is the best we
-                    // can do right now.
-                    arg.cast_to(Reg { kind: RegKind::Integer, size });
+                // } else if self.has_all_float(&arg.layout) {
+                //     // We don't want to aggregate floats as an aggregates of Integer
+                //     // because this will hurt the generated assembly (#93490)
+                //     //
+                //     // As an optimization we want to pass homogeneous aggregate of floats
+                //     // greater than pointer size as indirect
+                //     if size > Pointer.size(self) {
+                //         arg.make_indirect();
+                //     }
+                // } else {
+                //     // We want to pass small aggregates as immediates, but using
+                //     // a LLVM aggregate type for this leads to bad optimizations,
+                //     // so we pick an appropriately sized integer type instead.
+                //     //
+                //     // NOTE: This is sub-optimal because in the case of (f32, f32, u32, u32)
+                //     // we could do ([f32; 2], u64) which is better but this is the best we
+                //     // can do right now.
+                //     arg.cast_to(Reg { kind: RegKind::Integer, size });
                 }
             }

From 0c1451cd57c0312989c0a26f33722799667fe21f Mon Sep 17 00:00:00 2001
From: Urgau
Date: Sat, 2 Jul 2022 21:36:28 +0200
Subject: [PATCH 10/12] Revert "Revert "Use nbdd0121 suggestion for reducing
 the perf impact""

This reverts commit e136c3a9348200c261b9b3c1c50a2f6f6a68b4bd.
---
 compiler/rustc_middle/src/ty/layout.rs        | 58 +++++++++----------
 src/test/assembly/x86-64-homogenous-floats.rs | 11 ++--
 src/test/codegen/homogeneous-floats.rs        |  2 +-
 3 files changed, 36 insertions(+), 35 deletions(-)

diff --git a/compiler/rustc_middle/src/ty/layout.rs b/compiler/rustc_middle/src/ty/layout.rs
index 36c4cfdc38028..6696da5062256 100644
--- a/compiler/rustc_middle/src/ty/layout.rs
+++ b/compiler/rustc_middle/src/ty/layout.rs
@@ -14,7 +14,7 @@ use rustc_session::{config::OptLevel, DataTypeKind, FieldInfo, SizeKind, Variant
 use rustc_span::symbol::Symbol;
 use rustc_span::{Span, DUMMY_SP};
 use rustc_target::abi::call::{
-    ArgAbi, ArgAttribute, ArgAttributes, ArgExtension, Conv, FnAbi, PassMode, /* Reg, RegKind, */
+    ArgAbi, ArgAttribute, ArgAttributes, ArgExtension, Conv, FnAbi, PassMode, Reg, RegKind,
 };
 use rustc_target::abi::*;
 use rustc_target::spec::{abi::Abi as SpecAbi, HasTargetSpec, PanicStrategy, Target};
@@ -3340,16 +3340,16 @@ impl<'tcx> LayoutCx<'tcx, TyCtxt<'tcx>> {
         Ok(self.tcx.arena.alloc(fn_abi))
     }
 
-    // /// Small heuristic for determining if layout has any float primitive
-    // fn has_all_float(&self, layout: &'_ TyAndLayout<'tcx>) -> bool {
-    //     match layout.abi {
-    //         Abi::Uninhabited | Abi::Vector { .. } => false,
-    //         Abi::Scalar(scalar) => matches!(scalar.primitive(), Primitive::F32 | Primitive::F64),
-    //         Abi::ScalarPair(..) | Abi::Aggregate { .. } => {
-    //             (0..layout.fields.count()).all(|i| self.has_all_float(&layout.field(self, i)))
-    //         }
-    //     }
-    // }
+    /// Small heuristic for determining if layout has any float primitive
+    fn has_all_float(&self, layout: &'_ TyAndLayout<'tcx>) -> bool {
+        match layout.abi {
+            Abi::Uninhabited | Abi::Vector { .. } => false,
+            Abi::Scalar(scalar) => matches!(scalar.primitive(), Primitive::F32 | Primitive::F64),
+            Abi::ScalarPair(..) | Abi::Aggregate { .. } => {
+                (0..layout.fields.count()).all(|i| self.has_all_float(&layout.field(self, i)))
+            }
+        }
+    }
 
     fn fn_abi_adjust_for_abi(
         &self,
@@ -3375,29 +3375,27 @@ impl<'tcx> LayoutCx<'tcx, TyCtxt<'tcx>> {
                 // Pass and return structures up to 2 pointers in size by value,
                 // matching `ScalarPair`. LLVM will usually pass these in 2 registers
                 // which is more efficient than by-ref.
-                let max_by_val_size = Pointer.size(self) * 2;
+                let ptr_size = Pointer.size(self);
+                let max_by_val_size = ptr_size * 2;
                 let size = arg.layout.size;
 
                 if arg.layout.is_unsized() || size > max_by_val_size {
                     arg.make_indirect();
-                // } else if self.has_all_float(&arg.layout) {
-                //     // We don't want to aggregate floats as an aggregates of Integer
-                //     // because this will hurt the generated assembly (#93490)
-                //     //
-                //     // As an optimization we want to pass homogeneous aggregate of floats
-                //     // greater than pointer size as indirect
-                //     if size > Pointer.size(self) {
-                //         arg.make_indirect();
-                //     }
-                // } else {
-                //     // We want to pass small aggregates as immediates, but using
-                //     // a LLVM aggregate type for this leads to bad optimizations,
-                //     // so we pick an appropriately sized integer type instead.
-                //     //
-                //     // NOTE: This is sub-optimal because in the case of (f32, f32, u32, u32)
-                //     // we could do ([f32; 2], u64) which is better but this is the best we
-                //     // can do right now.
-                //     arg.cast_to(Reg { kind: RegKind::Integer, size });
+                } else if size > ptr_size && self.has_all_float(&arg.layout) {
+                    // We don't want to aggregate floats as an aggregates of Integer
+                    // because this will hurt the generated assembly (#93490) but as an
+                    // optimization we want to pass homogeneous aggregate of floats
+                    // greater than pointer size as indirect.
+                    arg.make_indirect();
+                } else {
+                    // We want to pass small aggregates as immediates, but using
+                    // a LLVM aggregate type for this leads to bad optimizations,
+                    // so we pick an appropriately sized integer type instead.
+                    //
+                    // NOTE: This is sub-optimal because in the case of (f32, f32, u32, u32)
+                    // we could do ([f32; 2], u64) which is better but this is the best we
+                    // can do right now.
+                    arg.cast_to(Reg { kind: RegKind::Integer, size });
                 }
             }
diff --git a/src/test/assembly/x86-64-homogenous-floats.rs b/src/test/assembly/x86-64-homogenous-floats.rs
index 5b725bab07790..00b434a688d67 100644
--- a/src/test/assembly/x86-64-homogenous-floats.rs
+++ b/src/test/assembly/x86-64-homogenous-floats.rs
@@ -15,12 +15,15 @@ pub fn sum_f32(a: f32, b: f32) -> f32 {
     a + b
 }
 
-// CHECK-LABEL: sum_f32x2:
-// CHECK: addss xmm{{[0-9]}}, xmm{{[0-9]}}
-// CHECK-NEXT: addss xmm{{[0-9]}}, xmm{{[0-9]}}
+// CHECK-LABEL: sum_f64x2:
+// CHECK: mov rax, [[PTR_IN:.*]]
+// CHECK-NEXT: movupd [[XMMA:xmm[0-9]]], xmmword ptr [rsi]
+// CHECK-NEXT: movupd [[XMMB:xmm[0-9]]], xmmword ptr [rdx]
+// CHECK-NEXT: addpd [[XMMB]], [[XMMA]]
+// CHECK-NEXT: movupd xmmword ptr {{\[}}[[PTR_IN]]{{\]}}, [[XMMB]]
 // CHECK-NEXT: ret
 #[no_mangle]
-pub fn sum_f32x2(a: [f32; 2], b: [f32; 2]) -> [f32; 2] {
+pub fn sum_f64x2(a: [f64; 2], b: [f64; 2]) -> [f64; 2] {
     [
         a[0] + b[0],
         a[1] + b[1],
diff --git a/src/test/codegen/homogeneous-floats.rs b/src/test/codegen/homogeneous-floats.rs
index 0b729156d2842..cf8a62e488ee1 100644
--- a/src/test/codegen/homogeneous-floats.rs
+++ b/src/test/codegen/homogeneous-floats.rs
@@ -13,7 +13,7 @@ pub struct Foo {
     bar4: f32,
 }
 
-// CHECK: define [2 x float] @array_f32x2([2 x float] %0, [2 x float] %1)
+// CHECK: define i64 @array_f32x2(i64 %0, i64 %1)
 #[no_mangle]
 pub fn array_f32x2(a: [f32; 2], b: [f32; 2]) -> [f32; 2] {
     todo!()
 }

From c7e8880f484f939a042ddbab964c57aa0237691e Mon Sep 17 00:00:00 2001
From: Urgau
Date: Sun, 3 Jul 2022 14:08:06 +0200
Subject: [PATCH 11/12] Retry with the homogeneous aggregate concept

---
 compiler/rustc_middle/src/ty/layout.rs | 31 ++++++++++++--------------
 src/test/codegen/homogeneous-floats.rs |  2 +-
 2 files changed, 15 insertions(+), 18 deletions(-)

diff --git a/compiler/rustc_middle/src/ty/layout.rs b/compiler/rustc_middle/src/ty/layout.rs
index 6696da5062256..db385b0e1d2d3 100644
--- a/compiler/rustc_middle/src/ty/layout.rs
+++ b/compiler/rustc_middle/src/ty/layout.rs
@@ -14,7 +14,8 @@ use rustc_session::{config::OptLevel, DataTypeKind, FieldInfo, SizeKind, Variant
 use rustc_span::symbol::Symbol;
 use rustc_span::{Span, DUMMY_SP};
 use rustc_target::abi::call::{
-    ArgAbi, ArgAttribute, ArgAttributes, ArgExtension, Conv, FnAbi, PassMode, Reg, RegKind,
+    ArgAbi, ArgAttribute, ArgAttributes, ArgExtension, Conv, FnAbi, HomogeneousAggregate, PassMode,
+    Reg, RegKind,
 };
 use rustc_target::abi::*;
 use rustc_target::spec::{abi::Abi as SpecAbi, HasTargetSpec, PanicStrategy, Target};
@@ -3340,17 +3341,6 @@ impl<'tcx> LayoutCx<'tcx, TyCtxt<'tcx>> {
         Ok(self.tcx.arena.alloc(fn_abi))
     }
 
-    /// Small heuristic for determining if layout has any float primitive
-    fn has_all_float(&self, layout: &'_ TyAndLayout<'tcx>) -> bool {
-        match layout.abi {
-            Abi::Uninhabited | Abi::Vector { .. } => false,
-            Abi::Scalar(scalar) => matches!(scalar.primitive(), Primitive::F32 | Primitive::F64),
-            Abi::ScalarPair(..) | Abi::Aggregate { .. } => {
-                (0..layout.fields.count()).all(|i| self.has_all_float(&layout.field(self, i)))
-            }
-        }
-    }
-
     fn fn_abi_adjust_for_abi(
         &self,
         fn_abi: &mut FnAbi<'tcx, Ty<'tcx>>,
@@ -3381,12 +3371,19 @@ impl<'tcx> LayoutCx<'tcx, TyCtxt<'tcx>> {
                 if arg.layout.is_unsized() || size > max_by_val_size {
                     arg.make_indirect();
-                } else if size > ptr_size && self.has_all_float(&arg.layout) {
+                } else if let Ok(HomogeneousAggregate::Homogeneous(Reg {
+                    kind: RegKind::Float,
+                    ..
+                })) = arg.layout.homogeneous_aggregate(self)
+                {
                     // We don't want to aggregate floats as an aggregates of Integer
-                    // because this will hurt the generated assembly (#93490) but as an
-                    // optimization we want to pass homogeneous aggregate of floats
-                    // greater than pointer size as indirect.
-                    arg.make_indirect();
+                    // because this will hurt the generated assembly (#93490)
+                    //
+                    // As an optimization we want to pass homogeneous aggregate of floats
+                    // greater than pointer size as indirect
+                    if size > ptr_size {
+                        arg.make_indirect();
+                    }
                 } else {
                     // We want to pass small aggregates as immediates, but using
                     // a LLVM aggregate type for this leads to bad optimizations,
diff --git a/src/test/codegen/homogeneous-floats.rs b/src/test/codegen/homogeneous-floats.rs
index cf8a62e488ee1..0b729156d2842 100644
--- a/src/test/codegen/homogeneous-floats.rs
+++ b/src/test/codegen/homogeneous-floats.rs
@@ -13,7 +13,7 @@ pub struct Foo {
     bar4: f32,
 }
 
-// CHECK: define i64 @array_f32x2(i64 %0, i64 %1)
+// CHECK: define [2 x float] @array_f32x2([2 x float] %0, [2 x float] %1)
 #[no_mangle]
 pub fn array_f32x2(a: [f32; 2], b: [f32; 2]) -> [f32; 2] {
     todo!()
 }

From 683e13f3476ea488e77b2a9e7babcab051957aba Mon Sep 17 00:00:00 2001
From: Urgau
Date: Tue, 5 Jul 2022 11:43:42 +0200
Subject: [PATCH 12/12] Revert max_by_val_size * 2

---
 compiler/rustc_middle/src/ty/layout.rs |  6 ++----
 src/test/codegen/array-equality.rs     |  3 ++-
 src/test/codegen/issue-37945.rs        |  4 ++--
 src/test/codegen/union-abi.rs          |  2 +-
 4 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/compiler/rustc_middle/src/ty/layout.rs b/compiler/rustc_middle/src/ty/layout.rs
index db385b0e1d2d3..498cbf0a462ba 100644
--- a/compiler/rustc_middle/src/ty/layout.rs
+++ b/compiler/rustc_middle/src/ty/layout.rs
@@ -3362,11 +3362,9 @@ impl<'tcx> LayoutCx<'tcx, TyCtxt<'tcx>> {
         match arg.layout.abi {
             Abi::Aggregate { .. } => {
-                // Pass and return structures up to 2 pointers in size by value,
-                // matching `ScalarPair`. LLVM will usually pass these in 2 registers
-                // which is more efficient than by-ref.
+                // Pass and return structures up to 1 pointer in size by value.
                 let ptr_size = Pointer.size(self);
-                let max_by_val_size = ptr_size * 2;
+                let max_by_val_size = ptr_size;
                 let size = arg.layout.size;
 
                 if arg.layout.is_unsized() || size > max_by_val_size {
diff --git a/src/test/codegen/array-equality.rs b/src/test/codegen/array-equality.rs
index b509bc20775b0..cd5e82a9205c1 100644
--- a/src/test/codegen/array-equality.rs
+++ b/src/test/codegen/array-equality.rs
@@ -56,7 +56,8 @@ pub fn array_eq_zero_short(x: [u16; 3]) -> bool {
 #[no_mangle]
 pub fn array_eq_zero_mid(x: [u16; 8]) -> bool {
     // CHECK-NEXT: start:
-    // CHECK-NEXT: %[[EQ:.+]] = icmp eq i128 %0, 0
+    // CHECK: %[[LOAD:.+]] = load i128,
+    // CHECK-NEXT: %[[EQ:.+]] = icmp eq i128 %[[LOAD]], 0
     // CHECK-NEXT: ret i1 %[[EQ]]
     x == [0; 8]
 }
diff --git a/src/test/codegen/issue-37945.rs b/src/test/codegen/issue-37945.rs
index 4234c26b5e89b..24d3dfb237e70 100644
--- a/src/test/codegen/issue-37945.rs
+++ b/src/test/codegen/issue-37945.rs
@@ -17,7 +17,7 @@ pub fn is_empty_1(xs: Iter) -> bool {
 // CHECK-NEXT: start:
 // CHECK-NEXT: [[A:%.*]] = icmp ne {{i32\*|ptr}} %xs.1, null
 // CHECK-NEXT: tail call void @llvm.assume(i1 [[A]])
-// CHECK-NEXT: [[B:%.*]] = icmp eq {{i32\*|ptr}} %xs.1, %xs.0
+// CHECK-NEXT: [[B:%.*]] = icmp eq {{i32\*|ptr}} {{%xs.1, %xs.0|%xs.0, %xs.1}}
 // CHECK-NEXT: ret i1 [[B:%.*]]
     {xs}.next().is_none()
 }
@@ -28,7 +28,7 @@ pub fn is_empty_2(xs: Iter) -> bool {
 // CHECK-NEXT: start:
 // CHECK-NEXT: [[C:%.*]] = icmp ne {{i32\*|ptr}} %xs.1, null
 // CHECK-NEXT: tail call void @llvm.assume(i1 [[C]])
-// CHECK-NEXT: [[D:%.*]] = icmp eq {{i32\*|ptr}} %xs.1, %xs.0
+// CHECK-NEXT: [[D:%.*]] = icmp eq {{i32\*|ptr}} {{%xs.1, %xs.0|%xs.0, %xs.1}}
 // CHECK-NEXT: ret i1 [[D:%.*]]
     xs.map(|&x| x).next().is_none()
 }
diff --git a/src/test/codegen/union-abi.rs b/src/test/codegen/union-abi.rs
index 6e3866778a391..c18f2a49fc369 100644
--- a/src/test/codegen/union-abi.rs
+++ b/src/test/codegen/union-abi.rs
@@ -65,7 +65,7 @@ pub fn test_UnionU128(_: UnionU128) -> UnionU128 { loop {} }
 #[repr(C)]
 pub union CUnionU128{a:u128}
-// CHECK: define void @test_CUnionU128(i128 %0)
+// CHECK: define void @test_CUnionU128({{%CUnionU128\*|ptr}} {{.*}} %_1)
 #[no_mangle]
 pub fn test_CUnionU128(_: CUnionU128) { loop {} }
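
Taken together, patches 11 and 12 leave the Rust ABI fixup with a three-way decision for aggregates: unsized or larger-than-pointer values go indirect, homogeneous float aggregates are left alone so LLVM can keep them in SSE registers, and the remaining small aggregates are cast to a same-sized integer. The free-standing sketch below restates that control flow only; PassStyle, fixup_aggregate, and the all_float flag are illustrative stand-ins invented for this note (the real code works on rustc's TyAndLayout, ArgAbi, and homogeneous_aggregate), so treat it as a model of the heuristic rather than the actual rustc_middle implementation.

// Hypothetical model of the final fixup heuristic; none of these
// types or functions exist in rustc under these names.
#[derive(Debug, PartialEq)]
enum PassStyle {
    /// Leave the aggregate as-is (e.g. an all-float [f32; 2], which
    /// LLVM can keep in SSE registers instead of GPRs, per #93490).
    Direct,
    /// Cast to an integer of the same total size (e.g. i64).
    CastToInt { bytes: u64 },
    /// Pass behind a pointer.
    Indirect,
}

/// `all_float` stands in for the "homogeneous aggregate of Float
/// registers" check; `ptr_size` is the pointer width in bytes.
fn fixup_aggregate(size: u64, is_unsized: bool, all_float: bool, ptr_size: u64) -> PassStyle {
    // Patch 12 reverts the earlier `* 2`, so by-value passing now
    // stops at one pointer in size.
    let max_by_val_size = ptr_size;
    if is_unsized || size > max_by_val_size {
        // Unsized or too large: pass indirectly.
        PassStyle::Indirect
    } else if all_float {
        // With max_by_val_size == ptr_size, the size check inside the
        // float branch of the real code can never fire here, so this
        // branch collapses to "leave it direct".
        PassStyle::Direct
    } else {
        // Other small aggregates become a same-sized integer, because
        // an LLVM aggregate type leads to worse optimizations.
        PassStyle::CastToInt { bytes: size }
    }
}

fn main() {
    let ptr = 8; // pointer size in bytes on x86_64
    // [f32; 2]: 8 bytes, all floats -> stays a float aggregate.
    assert_eq!(fixup_aggregate(8, false, true, ptr), PassStyle::Direct);
    // [f32; 4]: 16 bytes -> indirect (the sret in the codegen test).
    assert_eq!(fixup_aggregate(16, false, true, ptr), PassStyle::Indirect);
    // A mixed 8-byte struct such as (f32, u32) -> cast to i64.
    assert_eq!(fixup_aggregate(8, false, false, ptr), PassStyle::CastToInt { bytes: 8 });
}

On x86_64 this model reproduces the expectations in the final codegen test above: array_f32x2 keeps its [2 x float] signature, array_f32x4 is returned through an sret pointer, and small mixed aggregates are still passed as integers.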