From cd676dfa57e643a4f7d8445e6ebad0f21cf3fd84 Mon Sep 17 00:00:00 2001
From: Jakub Jelinek <jakub@redhat.com>
Date: Wed, 16 Dec 2020 13:08:07 +0100
Subject: [PATCH] bswap: Handle vector CONSTRUCTORs [PR96239]

The following patch teaches the bswap pass to handle for small (2/4/8 byte
long) vectors a CONSTRUCTOR by determining if the bytes of the constructor
come from non-vector sources and are either nop or bswap and changing the
CONSTRUCTOR in that case to VIEW_CONVERT_EXPR from scalar integer to
the vector type.

Unfortunately, as I found after the patch was written, due to pass ordering
this doesn't really fix the original testcase, just the one I wrote,
because both loop and slp vectorization is done only after the bswap pass.
A possible way out of that would be to perform just this particular bswap
optimization (i.e. for CONSTRUCTOR assignments with integral vector types
call find_bswap_or_nop and bswap_replace if successful) also during the
store merging pass, it isn't really a store, but the store merging pass
already performs bswapping when handling store, so it wouldn't be that big
hack.  What do you think?

2020-12-16  Jakub Jelinek  <jakub@redhat.com>

	PR tree-optimization/96239
	* gimple-ssa-store-merging.c (find_bswap_or_nop): Handle a vector
	CONSTRUCTOR.
	(bswap_replace): Likewise.

	* gcc.dg/pr96239.c: New test.
---
 gcc/gimple-ssa-store-merging.c | 91 ++++++++++++++++++++++++++++++++--
 gcc/testsuite/gcc.dg/pr96239.c | 54 ++++++++++++++++++++
 2 files changed, 140 insertions(+), 5 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/pr96239.c

diff --git a/gcc/gimple-ssa-store-merging.c b/gcc/gimple-ssa-store-merging.c
index 17a4250d77f4..2894205fa3c3 100644
--- a/gcc/gimple-ssa-store-merging.c
+++ b/gcc/gimple-ssa-store-merging.c
@@ -865,7 +865,66 @@ find_bswap_or_nop (gimple *stmt, struct symbolic_number *n, bool *bswap)
   gimple *ins_stmt = find_bswap_or_nop_1 (stmt, n, limit);
 
   if (!ins_stmt)
-    return NULL;
+    {
+      if (gimple_assign_rhs_code (stmt) != CONSTRUCTOR
+	  || BYTES_BIG_ENDIAN != WORDS_BIG_ENDIAN)
+	return NULL;
+      unsigned HOST_WIDE_INT sz = tree_to_uhwi (type_size) * BITS_PER_UNIT;
+      if (sz != 16 && sz != 32 && sz != 64)
+	return NULL;
+      tree rhs = gimple_assign_rhs1 (stmt);
+      tree eltype = TREE_TYPE (TREE_TYPE (rhs));
+      unsigned HOST_WIDE_INT eltsz
+	= int_size_in_bytes (eltype) * BITS_PER_UNIT;
+      if (TYPE_PRECISION (eltype) != eltsz)
+	return NULL;
+      constructor_elt *elt;
+      unsigned int i;
+      tree type = build_nonstandard_integer_type (sz, 1);
+      FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (rhs), i, elt)
+	{
+	  if (TREE_CODE (elt->value) != SSA_NAME
+	      || !INTEGRAL_TYPE_P (TREE_TYPE (elt->value)))
+	    return NULL;
+	  struct symbolic_number n1;
+	  gimple *source_stmt
+	    = find_bswap_or_nop_1 (SSA_NAME_DEF_STMT (elt->value), &n1,
+				   limit - 1);
+
+	  if (!source_stmt)
+	    return NULL;
+
+	  n1.type = type;
+	  if (!n1.base_addr)
+	    n1.range = sz / BITS_PER_UNIT;
+
+	  if (i == 0)
+	    {
+	      ins_stmt = source_stmt;
+	      *n = n1;
+	    }
+	  else
+	    {
+	      if (n->vuse != n1.vuse)
+		return NULL;
+
+	      struct symbolic_number n0 = *n;
+
+	      if (!BYTES_BIG_ENDIAN)
+		{
+		  if (!do_shift_rotate (LSHIFT_EXPR, &n1, i * eltsz))
+		    return NULL;
+		}
+	      else if (!do_shift_rotate (LSHIFT_EXPR, &n0, eltsz))
+		return NULL;
+	      ins_stmt
+		= perform_symbolic_merge (ins_stmt, &n0, source_stmt, &n1, n);
+
+	      if (!ins_stmt)
+		return NULL;
+	    }
+	}
+    }
 
   uint64_t cmpxchg, cmpnop;
   find_bswap_or_nop_finalize (n, &cmpxchg, &cmpnop);
@@ -939,11 +998,18 @@ bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl,
 {
   tree src, tmp, tgt = NULL_TREE;
   gimple *bswap_stmt;
+  tree_code conv_code = NOP_EXPR;
 
   gimple *cur_stmt = gsi_stmt (gsi);
   src = n->src;
   if (cur_stmt)
-    tgt = gimple_assign_lhs (cur_stmt);
+    {
+      tgt = gimple_assign_lhs (cur_stmt);
+      if (gimple_assign_rhs_code (cur_stmt) == CONSTRUCTOR
+	  && tgt
+	  && VECTOR_TYPE_P (TREE_TYPE (tgt)))
+	conv_code = VIEW_CONVERT_EXPR;
+    }
 
   /* Need to load the value from memory first.  */
   if (n->base_addr)
@@ -1031,7 +1097,9 @@ bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl,
 	      load_stmt = gimple_build_assign (val_tmp, val_expr);
 	      gimple_set_vuse (load_stmt, n->vuse);
 	      gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
-	      gimple_assign_set_rhs_with_ops (&gsi, NOP_EXPR, val_tmp);
+	      if (conv_code == VIEW_CONVERT_EXPR)
+		val_tmp = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (tgt), val_tmp);
+	      gimple_assign_set_rhs_with_ops (&gsi, conv_code, val_tmp);
 	      update_stmt (cur_stmt);
 	    }
 	  else if (cur_stmt)
@@ -1073,7 +1141,9 @@ bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl,
 	{
 	  if (!is_gimple_val (src))
 	    return NULL_TREE;
-	  g = gimple_build_assign (tgt, NOP_EXPR, src);
+	  if (conv_code == VIEW_CONVERT_EXPR)
+	    src = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (tgt), src);
+	  g = gimple_build_assign (tgt, conv_code, src);
 	}
       else if (cur_stmt)
 	g = gimple_build_assign (tgt, src);
@@ -1153,7 +1223,10 @@ bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl,
       gimple *convert_stmt;
 
       tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
-      convert_stmt = gimple_build_assign (tgt, NOP_EXPR, tmp);
+      tree atmp = tmp;
+      if (conv_code == VIEW_CONVERT_EXPR)
+	atmp = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (tgt), tmp);
+      convert_stmt = gimple_build_assign (tgt, conv_code, atmp);
       gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
     }
 
@@ -1258,6 +1331,14 @@ pass_optimize_bswap::execute (function *fun)
 	      /* Fall through.  */
 	    case BIT_IOR_EXPR:
 	      break;
+	    case CONSTRUCTOR:
+	      {
+		tree rhs = gimple_assign_rhs1 (cur_stmt);
+		if (VECTOR_TYPE_P (TREE_TYPE (rhs))
+		    && INTEGRAL_TYPE_P (TREE_TYPE (TREE_TYPE (rhs))))
+		  break;
+	      }
+	      continue;
 	    default:
 	      continue;
 	    }
diff --git a/gcc/testsuite/gcc.dg/pr96239.c b/gcc/testsuite/gcc.dg/pr96239.c
new file mode 100644
index 000000000000..8af56e12bb43
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr96239.c
@@ -0,0 +1,54 @@
+/* PR tree-optimization/96239 */
+/* { dg-do run { target { ilp32 || lp64 } } } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+/* { dg-final { scan-tree-dump-times " r>> 8;" 1 "optimized" { target bswap } } } */
+/* { dg-final { scan-tree-dump-times " = __builtin_bswap64 " 1 "optimized" { target bswap } } } */
+/* { dg-final { scan-tree-dump-not " >> \(8\|16\|24\|32\|40\|48\|56\);" "optimized" { target bswap } } } */
+
+typedef unsigned char V __attribute__((vector_size (2)));
+typedef unsigned char W __attribute__((vector_size (8)));
+
+__attribute__((noipa)) void
+foo (unsigned short x, V *p)
+{
+  *p = (V) { x >> 8, x };
+}
+
+__attribute__((noipa)) void
+bar (unsigned long long x, W *p)
+{
+  *p = (W) { x >> 56, x >> 48, x >> 40, x >> 32, x >> 24, x >> 16, x >> 8, x };
+}
+
+__attribute__((noipa)) void
+baz (unsigned short x, V *p)
+{
+  *p = (V) { x, x >> 8 };
+}
+
+__attribute__((noipa)) void
+qux (unsigned long long x, W *p)
+{
+  *p = (W) { x, x >> 8, x >> 16, x >> 24, x >> 32, x >> 40, x >> 48, x >> 56 };
+}
+
+int
+main ()
+{
+  V a, c, e, g;
+  W b, d, f, h;
+  foo (0xcafe, &a);
+  bar (0xdeadbeefcafebabeULL, &b);
+  baz (0xdead, &c);
+  qux (0xfeedbac1beefdeadULL, &d);
+  e = (V) { 0xca, 0xfe };
+  f = (W) { 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe, 0xba, 0xbe };
+  g = (V) { 0xad, 0xde };
+  h = (W) { 0xad, 0xde, 0xef, 0xbe, 0xc1, 0xba, 0xed, 0xfe };
+  if (__builtin_memcmp (&a, &e, sizeof (V))
+      || __builtin_memcmp (&b, &f, sizeof (W))
+      || __builtin_memcmp (&c, &g, sizeof (V))
+      || __builtin_memcmp (&d, &h, sizeof (W)))
+    __builtin_abort ();
+  return 0;
+}
-- 
GitLab