From 606527f3d9e35164fc8139e5e2c53e66ff850b1a Mon Sep 17 00:00:00 2001
From: Richard Sandiford <richard.sandiford@arm.com>
Date: Mon, 3 Feb 2025 17:35:06 +0000
Subject: [PATCH] aarch64: Fix dupq_* testsuite failures

This patch fixes the dupq_* testsuite failures.  The tests were
introduced with r15-3669-ga92f54f580c3 (which was a nice improvement)
and Pengxuan originally had a follow-on patch to recognise INDEX
constants during vec_init.

I'd originally wanted to solve this a different way, using wildcards
when building a vector and letting vector_builder::finalize find the
best way of filling them in.  I no longer think that's the best
approach though.  Stepped constants are likely to be more expensive
than unstepped constants, so we should first try finding an unstepped
constant that is valid, even if it has a longer representation than
the stepped version.

This patch therefore uses a variant of Pengxuan's idea.

While there, I noticed that the (old) code for finding an unstepped
constant only tried varying one bit at a time.  So for index 0 in a
16-element constant, the code would try picking a constant from index
8, 4, 2, and then 1.  But since the goal is to create "fewer, larger,
repeating parts", it would be better to iterate over a bit-reversed
increment, so that after trying an XOR with 0 and 8, we try adding 4
to each previous attempt, then 2 to each previous attempt, and so on.
In the previous example this would give 8, 4, 12, 2, 10, 6, 14, ...
(a short standalone sketch of this probe order follows the patch).

The test shows an example of this for 8 shorts.

gcc/
	* config/aarch64/aarch64.cc (aarch64_choose_vector_init_constant):
	New function, split out from...
	(aarch64_expand_vector_init_fallback): ...here.  Use a
	bit-reversed increment to find a constant index.  Add support
	for stepped constants.

gcc/testsuite/
	* gcc.target/aarch64/sve/acle/general/dupq_12.c: New test.
---
 gcc/config/aarch64/aarch64.cc                 | 110 +++++++++++++-----
 .../aarch64/sve/acle/general/dupq_12.c        |  13 +++
 2 files changed, 95 insertions(+), 28 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_12.c

diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index be99137b052d..16754fa9e7bd 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -24036,6 +24036,85 @@ aarch64_simd_make_constant (rtx vals)
   return NULL_RTX;
 }
 
+/* VALS is a PARALLEL rtx that contains element values for a vector of
+   mode MODE.  Return a constant that contains all the CONST_INT and
+   CONST_DOUBLE elements of VALS, using any convenient values for the
+   other elements.  */
+
+static rtx
+aarch64_choose_vector_init_constant (machine_mode mode, rtx vals)
+{
+  unsigned int n_elts = XVECLEN (vals, 0);
+
+  /* We really don't care what goes into the parts we will overwrite, but we're
+     more likely to be able to load the constant efficiently if it has fewer,
+     larger, repeating parts (see aarch64_simd_valid_imm).  */
+  rtvec copy = shallow_copy_rtvec (XVEC (vals, 0));
+  for (unsigned int i = 0; i < n_elts; ++i)
+    {
+      rtx x = RTVEC_ELT (copy, i);
+      if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
+        continue;
+      /* This is effectively a bit-reversed increment, e.g.: 8, 4, 12,
+         2, 10, 6, 14, ... for n_elts == 16.  The early break makes the
+         outer "i" loop O(n_elts * log(n_elts)).  */
+      unsigned int j = 0;
+      for (;;)
+        {
+          for (unsigned int bit = n_elts / 2; bit > 0; bit /= 2)
+            {
+              j ^= bit;
+              if (j & bit)
+                break;
+            }
+          rtx test = XVECEXP (vals, 0, i ^ j);
+          if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
+            {
+              RTVEC_ELT (copy, i) = test;
+              break;
+            }
+          gcc_assert (j != 0);
+        }
+    }
+
+  rtx c = gen_rtx_CONST_VECTOR (mode, copy);
+  if (aarch64_simd_valid_mov_imm (c))
+    return c;
+
+  /* Try generating a stepped sequence.  */
+  if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
+    for (unsigned int i = 0; i < n_elts; ++i)
+      if (CONST_INT_P (XVECEXP (vals, 0, i)))
+        {
+          auto base = UINTVAL (XVECEXP (vals, 0, i));
+          for (unsigned int j = i + 1; j < n_elts; ++j)
+            if (CONST_INT_P (XVECEXP (vals, 0, j)))
+              {
+                /* It doesn't matter whether this division is exact.
+                   All that matters is whether the constant we produce
+                   is valid.  */
+                HOST_WIDE_INT diff = UINTVAL (XVECEXP (vals, 0, j)) - base;
+                unsigned HOST_WIDE_INT step = diff / int (j - i);
+                rtx_vector_builder builder (mode, n_elts, 1);
+                for (unsigned int k = 0; k < n_elts; ++k)
+                  {
+                    rtx x = XVECEXP (vals, 0, k);
+                    if (!CONST_INT_P (x))
+                      x = gen_int_mode (int (k - i) * step + base,
+                                        GET_MODE_INNER (mode));
+                    builder.quick_push (x);
+                  }
+                rtx step_c = builder.build ();
+                if (aarch64_simd_valid_mov_imm (step_c))
+                  return step_c;
+                break;
+              }
+          break;
+        }
+
+  return c;
+}
+
 /* A subroutine of aarch64_expand_vector_init, with the same interface.
    The caller has already tried a divide-and-conquer approach, so do
    not consider that case here.  */
@@ -24049,7 +24128,6 @@ aarch64_expand_vector_init_fallback (rtx target, rtx vals)
   int n_elts = XVECLEN (vals, 0);
   /* The number of vector elements which are not constant.  */
   int n_var = 0;
-  rtx any_const = NULL_RTX;
   /* The first element of vals.  */
   rtx v0 = XVECEXP (vals, 0, 0);
   bool all_same = true;
@@ -24075,8 +24153,6 @@ aarch64_expand_vector_init_fallback (rtx target, rtx vals)
       rtx x = XVECEXP (vals, 0, i);
       if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
        ++n_var;
-      else
-       any_const = x;
 
       all_same &= rtx_equal_p (x, v0);
     }
@@ -24220,31 +24296,9 @@ aarch64_expand_vector_init_fallback (rtx target, rtx vals)
      can.  */
   if (n_var != n_elts)
     {
-      rtx copy = copy_rtx (vals);
-
-      /* Load constant part of vector.  We really don't care what goes into the
-        parts we will overwrite, but we're more likely to be able to load the
-        constant efficiently if it has fewer, larger, repeating parts
-        (see aarch64_simd_valid_imm).  */
-      for (int i = 0; i < n_elts; i++)
-       {
-         rtx x = XVECEXP (vals, 0, i);
-         if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
-           continue;
-         rtx subst = any_const;
-         for (int bit = n_elts / 2; bit > 0; bit /= 2)
-           {
-             /* Look in the copied vector, as more elements are const.  */
-             rtx test = XVECEXP (copy, 0, i ^ bit);
-             if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
-               {
-                 subst = test;
-                 break;
-               }
-           }
-         XVECEXP (copy, 0, i) = subst;
-       }
-      aarch64_expand_vector_init_fallback (target, copy);
+      /* Load the constant part of the vector.  */
+      rtx constant = aarch64_choose_vector_init_constant (mode, vals);
+      emit_move_insn (target, constant);
     }
 
   /* Insert the variable lanes directly.  */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_12.c b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_12.c
new file mode 100644
index 000000000000..690cb134ad5c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/acle/general/dupq_12.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-require-effective-target aarch64_little_endian } */
+
+#include <arm_sve.h>
+
+svint16_t
+dupq (int x)
+{
+  return svdupq_s16 (x, 0, x, 0, x, 0, 11, 0);
+}
+
+/* { dg-final { scan-assembler {\tmovi\tv[0-9]+\.4s, #?(?:0xb|11)} } } */
-- 
GitLab
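
For reference, the following is a minimal standalone sketch (not part of the
patch) of the probe order produced by the bit-reversed increment described in
the commit message.  The inner loop mirrors the one in the new
aarch64_choose_vector_init_constant; the fixed n_elts value and the printing
are purely illustrative.

/* Print the donor-index offsets tried for one lane, using the same
   bit-reversed increment as aarch64_choose_vector_init_constant.  */
#include <cstdio>

int
main ()
{
  const unsigned int n_elts = 16;   /* illustrative vector length */
  unsigned int j = 0;
  printf ("probe offsets for n_elts == %u:", n_elts);
  for (unsigned int count = 1; count < n_elts; ++count)
    {
      /* Add 1 to a bit-reversed counter: flip bits from the top down and
         stop at the first flip that sets a bit (i.e. produces no carry).  */
      for (unsigned int bit = n_elts / 2; bit > 0; bit /= 2)
        {
          j ^= bit;
          if (j & bit)
            break;
        }
      printf (" %u", j);   /* 8 4 12 2 10 6 14 1 9 5 13 3 11 7 15 */
    }
  printf ("\n");
  return 0;
}

Lane i then probes element i ^ j for each offset j in this order, so halves
and quarters of the vector are reused before single neighbouring lanes,
matching the goal of building the constant from "fewer, larger, repeating
parts".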