From ed8cd42d138fa048e0c0eff1ea28b39f5abe1c29 Mon Sep 17 00:00:00 2001
From: "Maciej W. Rozycki" <macro@orcam.me.uk>
Date: Sun, 12 Jan 2025 16:48:54 +0000
Subject: [PATCH] Alpha: Fix a block move pessimisation with zero-extension
 after LDWU

For the BWX case we have a pessimisation in `alpha_expand_block_move'
for HImode loads where we place the data loaded into a HImode register
as well, therefore losing information that indeed the data loaded has
already been zero-extended to the full DImode width of the register.
Later on when we store this data in QImode quantities into an unaligned
destination, we zero-extend it again for the purpose of right-shifting,
such as with the test case included producing code at `-O2' as follows:

	ldah $2,unaligned_src_hi($29)		!gprelhigh
	lda $1,unaligned_src_hi($2)		!gprellow
	ldwu $6,unaligned_src_hi($2)		!gprellow
	ldwu $5,2($1)
	ldwu $4,4($1)
	bis $31,$31,$31
	zapnot $6,3,$3				# Redundant!
	ldbu $7,6($1)
	zapnot $5,3,$2				# Redundant!
	stb $6,0($16)
	zapnot $4,3,$1				# Redundant!
	stb $5,2($16)
	srl $3,8,$3
	stb $4,4($16)
	srl $2,8,$2
	stb $3,1($16)
	srl $1,8,$1
	stb $2,3($16)
	stb $1,5($16)
	stb $7,6($16)

The non-BWX case is unaffected, because there we use byte insertion, so
we don't care that data is held in a HImode register.

Address this by making the holding RTX a HImode subreg of the original
DImode register, which the RTL passes can then see through and eliminate
the zero-extension where otherwise required, resulting in this shortened
code:

	ldah $2,unaligned_src_hi($29)		!gprelhigh
	lda $1,unaligned_src_hi($2)		!gprellow
	ldwu $4,unaligned_src_hi($2)		!gprellow
	ldwu $3,2($1)
	ldwu $2,4($1)
	bis $31,$31,$31
	srl $4,8,$6
	ldbu $1,6($1)
	srl $3,8,$5
	stb $4,0($16)
	stb $6,1($16)
	srl $2,8,$4
	stb $3,2($16)
	stb $5,3($16)
	stb $2,4($16)
	stb $4,5($16)
	stb $1,6($16)

While at it reformat the enclosing do-while statement according to the
GNU Coding Standards, observing that in this case it does not obfuscate
the change owing to the odd original indentation.

	gcc/
	* config/alpha/alpha.cc (alpha_expand_block_move): Use a HImode
	subreg of a DImode register to hold data from an aligned HImode
	load.
---
 gcc/config/alpha/alpha.cc                       | 17 +++++++++++------
 .../gcc.target/alpha/memcpy-hi-unaligned-dst.c  | 16 ++++++++++++++++
 2 files changed, 27 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/alpha/memcpy-hi-unaligned-dst.c

diff --git a/gcc/config/alpha/alpha.cc b/gcc/config/alpha/alpha.cc
index 8ec9e8c5d399..6965ece16d0b 100644
--- a/gcc/config/alpha/alpha.cc
+++ b/gcc/config/alpha/alpha.cc
@@ -3999,14 +3999,19 @@ alpha_expand_block_move (rtx operands[])
   if (bytes >= 2)
     {
       if (src_align >= 16)
-	{
-	  do {
-	    data_regs[nregs++] = tmp = gen_reg_rtx (HImode);
-	    emit_move_insn (tmp, adjust_address (orig_src, HImode, ofs));
+	do
+	  {
+	    tmp = gen_reg_rtx (DImode);
+	    emit_move_insn (tmp,
+			    expand_simple_unop (DImode, SET,
+						adjust_address (orig_src,
+								HImode, ofs),
+						NULL_RTX, 1));
+	    data_regs[nregs++] = gen_rtx_SUBREG (HImode, tmp, 0);
 	    bytes -= 2;
 	    ofs += 2;
-	  } while (bytes >= 2);
-	}
+	  }
+	while (bytes >= 2);
       else if (! TARGET_BWX)
 	{
 	  data_regs[nregs++] = tmp = gen_reg_rtx (HImode);
diff --git a/gcc/testsuite/gcc.target/alpha/memcpy-hi-unaligned-dst.c b/gcc/testsuite/gcc.target/alpha/memcpy-hi-unaligned-dst.c
new file mode 100644
index 000000000000..4e3c02f5b906
--- /dev/null
+++ b/gcc/testsuite/gcc.target/alpha/memcpy-hi-unaligned-dst.c
@@ -0,0 +1,16 @@
+/* { dg-do compile } */
+/* { dg-options "-mbwx" } */
+/* { dg-skip-if "" { *-*-* } { "-O0" } } */
+
+unsigned short unaligned_src_hi[4];
+
+void
+memcpy_unaligned_dst_hi (void *dst)
+{
+  __builtin_memcpy (dst, unaligned_src_hi, 7);
+}
+
+/* { dg-final { scan-assembler-times "\\sldwu\\s" 3 } } */
+/* { dg-final { scan-assembler-times "\\sldbu\\s" 1 } } */
+/* { dg-final { scan-assembler-times "\\sstb\\s" 7 } } */
+/* { dg-final { scan-assembler-not "\\szapnot\\s" } } */
-- 
GitLab