From 4e557210b7f9fd669ff66c6958327eb2d4262d80 Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@orcam.me.uk>
Date: Sun, 12 Jan 2025 16:48:53 +0000
Subject: [PATCH] Alpha: Optimize block moves coming from longword-aligned
 source

Now that we have proper alignment determination for block moves in place,
the case of copying a block of longword-aligned data has become real.
Implement the merging of loaded data from pairs of SImode registers into
single DImode registers, so that the data can be stored efficiently with
unaligned quadword stores, as suggested by a comment in
`alpha_expand_block_move', and discard that comment.  Provide test cases
accordingly.

	gcc/
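	* config/alpha/alpha.cc (alpha_expand_block_move): Merge loaded
	data from pairs of SImode registers into single DImode registers
	if it is to be used with unaligned stores.  Remove the comment
	suggesting this and assert that at most one SImode piece is left
	for the unaligned longword store.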

	gcc/testsuite/
	* gcc.target/alpha/memcpy-si-aligned.c: New file.
	* gcc.target/alpha/memcpy-si-unaligned.c: New file.
	* gcc.target/alpha/memcpy-si-unaligned-dst.c: New file.
	* gcc.target/alpha/memcpy-si-unaligned-src.c: New file.
	* gcc.target/alpha/memcpy-si-unaligned-src-bwx.c: New file.
---
 gcc/config/alpha/alpha.cc                     | 45 +++++++++++++---
 .../gcc.target/alpha/memcpy-si-aligned.c      | 16 ++++++
 .../alpha/memcpy-si-unaligned-dst.c           | 16 ++++++
 .../alpha/memcpy-si-unaligned-src-bwx.c       | 11 ++++
 .../alpha/memcpy-si-unaligned-src.c           | 15 ++++++
 .../gcc.target/alpha/memcpy-si-unaligned.c    | 51 +++++++++++++++++++
 6 files changed, 146 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/alpha/memcpy-si-aligned.c
 create mode 100644 gcc/testsuite/gcc.target/alpha/memcpy-si-unaligned-dst.c
 create mode 100644 gcc/testsuite/gcc.target/alpha/memcpy-si-unaligned-src-bwx.c
 create mode 100644 gcc/testsuite/gcc.target/alpha/memcpy-si-unaligned-src.c
 create mode 100644 gcc/testsuite/gcc.target/alpha/memcpy-si-unaligned.c

diff --git a/gcc/config/alpha/alpha.cc b/gcc/config/alpha/alpha.cc
index 958a785ffd0e..8ec9e8c5d399 100644
--- a/gcc/config/alpha/alpha.cc
+++ b/gcc/config/alpha/alpha.cc
@@ -3931,14 +3931,44 @@ alpha_expand_block_move (rtx operands[])
     {
       words = bytes / 4;
 
-      for (i = 0; i < words; ++i)
-	data_regs[nregs + i] = gen_reg_rtx (SImode);
+      /* Load an even quantity of SImode data pieces only.  */
+      unsigned int hwords = words / 2;
+      for (i = 0; i / 2 < hwords; ++i)
+	{
+	  data_regs[nregs + i] = gen_reg_rtx (SImode);
+	  emit_move_insn (data_regs[nregs + i],
+			  adjust_address (orig_src, SImode, ofs + i * 4));
+	}
 
-      for (i = 0; i < words; ++i)
-	emit_move_insn (data_regs[nregs + i],
-			adjust_address (orig_src, SImode, ofs + i * 4));
+      /* If we'll be using unaligned stores, merge data from pairs
+	 of SImode registers into DImode registers so that we can
+	 store it more efficiently via quadword unaligned stores.  */
+      unsigned int j;
+      if (dst_align < 32)
+	for (i = 0, j = 0; i < words / 2; ++i, j = i * 2)
+	  {
+	    rtx hi = expand_simple_binop (DImode, ASHIFT,
+					  data_regs[nregs + j + 1],
+					  GEN_INT (32), NULL_RTX,
+					  1, OPTAB_WIDEN);
+	    data_regs[nregs + i] = expand_simple_binop (DImode, IOR, hi,
+							data_regs[nregs + j],
+							NULL_RTX,
+							1, OPTAB_WIDEN);
+	  }
+      else
+	j = i;
 
-      nregs += words;
+      /* Take care of any remaining odd trailing SImode data piece.  */
+      if (j < words)
+	{
+	  data_regs[nregs + i] = gen_reg_rtx (SImode);
+	  emit_move_insn (data_regs[nregs + i],
+			  adjust_address (orig_src, SImode, ofs + j * 4));
+	  ++i;
+	}
+
+      nregs += i;
       bytes -= words * 4;
       ofs += words * 4;
     }
@@ -4057,13 +4087,12 @@ alpha_expand_block_move (rtx operands[])
     }
 
   /* Due to the above, this won't be aligned.  */
-  /* ??? If we have more than one of these, consider constructing full
-     words in registers and using alpha_expand_unaligned_store_words.  */
   while (i < nregs && GET_MODE (data_regs[i]) == SImode)
     {
       alpha_expand_unaligned_store (orig_dst, data_regs[i], 4, ofs);
       ofs += 4;
       i++;
+      gcc_assert (i == nregs || GET_MODE (data_regs[i]) != SImode);
     }
 
   if (dst_align >= 16)
diff --git a/gcc/testsuite/gcc.target/alpha/memcpy-si-aligned.c b/gcc/testsuite/gcc.target/alpha/memcpy-si-aligned.c
new file mode 100644
index 000000000000..2572a3187e9d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/alpha/memcpy-si-aligned.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "" } */
+/* { dg-skip-if "" { *-*-* } { "-O0" } } */
+
+unsigned int aligned_src_si[17] = { [0 ... 16] = 0xeaebeced };
+unsigned int aligned_dst_si[17] = { [0 ... 16] = 0xdcdbdad9 };
+
+void
+memcpy_aligned_data_si (void)
+{
+  __builtin_memcpy (aligned_dst_si + 1, aligned_src_si + 1, 60);
+}
+
+/* { dg-final { scan-assembler-times "\\sldl\\s" 15 } } */
+/* { dg-final { scan-assembler-times "\\sstl\\s" 15 } } */
+/* { dg-final { scan-assembler-not "\\s(?:ldq_u|stq_u)\\s" } } */
diff --git a/gcc/testsuite/gcc.target/alpha/memcpy-si-unaligned-dst.c b/gcc/testsuite/gcc.target/alpha/memcpy-si-unaligned-dst.c
new file mode 100644
index 000000000000..a2efade87ca4
--- /dev/null
+++ b/gcc/testsuite/gcc.target/alpha/memcpy-si-unaligned-dst.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "" } */
+/* { dg-skip-if "" { *-*-* } { "-O0" } } */
+
+unsigned int unaligned_src_si[17] = { [0 ... 16] = 0xfefdfcfb };
+
+void
+memcpy_unaligned_dst_si (void *dst)
+{
+  __builtin_memcpy (dst, unaligned_src_si + 1, 60);
+}
+
+/* { dg-final { scan-assembler-times "\\sldl\\s" 15 } } */
+/* { dg-final { scan-assembler-times "\\sldq_u\\s" 4 } } */
+/* { dg-final { scan-assembler-times "\\sstq_u\\s" 10 } } */
+/* { dg-final { scan-assembler-not "\\sstl\\s" } } */
diff --git a/gcc/testsuite/gcc.target/alpha/memcpy-si-unaligned-src-bwx.c b/gcc/testsuite/gcc.target/alpha/memcpy-si-unaligned-src-bwx.c
new file mode 100644
index 000000000000..df25c772a6ca
--- /dev/null
+++ b/gcc/testsuite/gcc.target/alpha/memcpy-si-unaligned-src-bwx.c
@@ -0,0 +1,11 @@
+/* { dg-do compile } */
+/* { dg-options "-mbwx" } */
+/* { dg-skip-if "" { *-*-* } { "-O0" } } */
+
+#include "memcpy-si-unaligned-src.c"
+
+/* { dg-final { scan-assembler-times "\\sldbu\\s" 4 } } */
+/* { dg-final { scan-assembler-times "\\sldq_u\\s" 8 } } */
+/* { dg-final { scan-assembler-times "\\sstb\\s" 4 } } */
+/* { dg-final { scan-assembler-times "\\sstl\\s" 14 } } */
+/* { dg-final { scan-assembler-not "\\s(?:ldl|stq_u)\\s" } } */
diff --git a/gcc/testsuite/gcc.target/alpha/memcpy-si-unaligned-src.c b/gcc/testsuite/gcc.target/alpha/memcpy-si-unaligned-src.c
new file mode 100644
index 000000000000..5140d2f8f47e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/alpha/memcpy-si-unaligned-src.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-options "-mno-bwx" } */
+/* { dg-skip-if "" { *-*-* } { "-O0" } } */
+
+unsigned int unaligned_dst_si[17] = { [0 ... 16] = 0xc8c9cacb };
+
+void
+memcpy_unaligned_src_si (const void *src)
+{
+  __builtin_memcpy (unaligned_dst_si + 1, src, 60);
+}
+
+/* { dg-final { scan-assembler-times "\\sldq_u\\s" 10 } } */
+/* { dg-final { scan-assembler-times "\\sstl\\s" 15 } } */
+/* { dg-final { scan-assembler-not "\\s(?:ldl|stq_u)\\s" } } */
diff --git a/gcc/testsuite/gcc.target/alpha/memcpy-si-unaligned.c b/gcc/testsuite/gcc.target/alpha/memcpy-si-unaligned.c
new file mode 100644
index 000000000000..9ce61ab801fb
--- /dev/null
+++ b/gcc/testsuite/gcc.target/alpha/memcpy-si-unaligned.c
@@ -0,0 +1,51 @@
+/* { dg-do run } */
+/* { dg-additional-sources memcpy-si-aligned.c } */
+/* { dg-additional-sources memcpy-si-unaligned-src.c } */
+/* { dg-additional-sources memcpy-si-unaligned-dst.c } */
+/* { dg-options "" } */
+
+void memcpy_aligned_data_si (void);
+void memcpy_unaligned_dst_si (void *);
+void memcpy_unaligned_src_si (const void *);
+
+extern unsigned int aligned_src_si[];
+extern unsigned int aligned_dst_si[];
+extern unsigned int unaligned_src_si[];
+extern unsigned int unaligned_dst_si[];
+
+int
+main (void)
+{
+  unsigned int v;
+  int i;
+
+  for (i = 1, v = 0x04030201; i < 16; i++, v += 0x04040404)
+    unaligned_src_si[i] = v;
+  asm ("" : : : "memory");
+  memcpy_unaligned_dst_si (aligned_src_si + 1);
+  asm ("" : : : "memory");
+  memcpy_aligned_data_si ();
+  asm ("" : : : "memory");
+  memcpy_unaligned_src_si (aligned_dst_si + 1);
+  asm ("" : : : "memory");
+  for (i = 1, v = 0x04030201; i < 16; i++, v += 0x04040404)
+    if (unaligned_dst_si[i] != v)
+      return 1;
+  if (unaligned_src_si[0] != 0xfefdfcfb)
+      return 1;
+  if (unaligned_src_si[16] != 0xfefdfcfb)
+      return 1;
+  if (aligned_src_si[0] != 0xeaebeced)
+      return 1;
+  if (aligned_src_si[16] != 0xeaebeced)
+      return 1;
+  if (aligned_dst_si[0] != 0xdcdbdad9)
+      return 1;
+  if (aligned_dst_si[16] != 0xdcdbdad9)
+      return 1;
+  if (unaligned_dst_si[0] != 0xc8c9cacb)
+      return 1;
+  if (unaligned_dst_si[16] != 0xc8c9cacb)
+      return 1;
+  return 0;
+}
-- 
GitLab