From 962a994d57f5b39c92d26b0446dbe6de9aa8910a Mon Sep 17 00:00:00 2001
From: Richard Biener <rguenther@suse.de>
Date: Wed, 9 Oct 2024 14:38:48 +0200
Subject: [PATCH] Remove SLP_INSTANCE_UNROLLING_FACTOR, compute VF in
 vect_make_slp_decision

The following prepares us for SLP instances with a non-uniform number
of lanes.  We already have this with load permutation lowering, but
we managed to keep that within the constraints of the per SLP instance
computed VF based on its max_nunits (with a vector type fixed for
each node) and the instance group size which is the number of lanes
in the SLP instance root.  But in the case where arbitrary splitting
and merging SLP nodes at non-power-of-two lane boundaries is allowed
this simple calculation based on the outgoing group size falls apart.

The following, instead of computing a VF during SLP instance
discovery, computes it at vect_make_slp_decision time by walking
the SLP graph and looking at each SLP node in isolation.  We do
track max_nunits per node which could be a VF per node instead or
forgo with both completely (though for BB vectorization we need
to communicate a VF > 1 requirement upward, or compute that after
the fact).  In the end we'd like to delay vector type assignment
and only compute a minimum VF here, allowing vector types to
grow when the actual VF is bigger.

There's slight complication with permutes of externs / constants
as those get their vector type (and thus max_nunits) assigned late.
While we force them to have the same vector type as the result at
the moment their number of lanes can differ.  So those get handled
explicitly there right now to up the VF as needed - the alternative
is to fail vectorization, I have an addition to
vect_maybe_update_slp_op_vectype that would FAIL if the set
vector type isn't within the constraints of the VF.

	* tree-vectorizer.h (SLP_INSTANCE_UNROLLING_FACTOR): Remove.
	(slp_instance::unrolling_factor): Likewise.
	* tree-vect-slp.cc (vect_build_slp_instance): Do not set
	SLP_INSTANCE_UNROLLING_FACTOR.  Remove then dead code.
	Compute and set max_nunits from the RHS nodes merged.
	(vect_update_slp_vf_for_node): New function.
	(vect_make_slp_decision): Use vect_update_slp_vf_for_node
	to compute VF recursively.
	(vect_build_slp_store_interleaving): Get max_nunits and
	properly set that on the permute nodes built.
	(vect_analyze_slp): Do not set SLP_INSTANCE_UNROLLING_FACTOR.
---
 gcc/tree-vect-slp.cc  | 72 +++++++++++++++++++++++++++++++++----------
 gcc/tree-vectorizer.h |  4 ---
 2 files changed, 55 insertions(+), 21 deletions(-)

diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 8727246c27a6..f053198b86b9 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -3557,13 +3557,15 @@ vect_analyze_slp_instance (vec_info *vinfo,
 
 static slp_tree
 vect_build_slp_store_interleaving (vec<slp_tree> &rhs_nodes,
-				   vec<stmt_vec_info> &scalar_stmts)
+				   vec<stmt_vec_info> &scalar_stmts,
+				   poly_uint64 max_nunits)
 {
   unsigned int group_size = scalar_stmts.length ();
   slp_tree node = vect_create_new_slp_node (scalar_stmts,
 					    SLP_TREE_CHILDREN
 					      (rhs_nodes[0]).length ());
   SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
+  node->max_nunits = max_nunits;
   for (unsigned l = 0;
        l < SLP_TREE_CHILDREN (rhs_nodes[0]).length (); ++l)
     {
@@ -3573,6 +3575,7 @@ vect_build_slp_store_interleaving (vec<slp_tree> &rhs_nodes,
       SLP_TREE_CHILDREN (node).quick_push (perm);
       SLP_TREE_LANE_PERMUTATION (perm).create (group_size);
       SLP_TREE_VECTYPE (perm) = SLP_TREE_VECTYPE (node);
+      perm->max_nunits = max_nunits;
       SLP_TREE_LANES (perm) = group_size;
       /* ???  We should set this NULL but that's not expected.  */
       SLP_TREE_REPRESENTATIVE (perm)
@@ -3628,6 +3631,7 @@ vect_build_slp_store_interleaving (vec<slp_tree> &rhs_nodes,
 	      SLP_TREE_LANES (permab) = n;
 	      SLP_TREE_LANE_PERMUTATION (permab).create (n);
 	      SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm);
+	      permab->max_nunits = max_nunits;
 	      /* ???  Should be NULL but that's not expected.  */
 	      SLP_TREE_REPRESENTATIVE (permab) = SLP_TREE_REPRESENTATIVE (perm);
 	      SLP_TREE_CHILDREN (permab).quick_push (a);
@@ -3698,6 +3702,7 @@ vect_build_slp_store_interleaving (vec<slp_tree> &rhs_nodes,
 	  SLP_TREE_LANES (permab) = n;
 	  SLP_TREE_LANE_PERMUTATION (permab).create (n);
 	  SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm);
+	  permab->max_nunits = max_nunits;
 	  /* ???  Should be NULL but that's not expected.  */
 	  SLP_TREE_REPRESENTATIVE (permab) = SLP_TREE_REPRESENTATIVE (perm);
 	  SLP_TREE_CHILDREN (permab).quick_push (a);
@@ -3828,7 +3833,6 @@ vect_build_slp_instance (vec_info *vinfo,
 	  /* Create a new SLP instance.  */
 	  slp_instance new_instance = XNEW (class _slp_instance);
 	  SLP_INSTANCE_TREE (new_instance) = node;
-	  SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
 	  SLP_INSTANCE_LOADS (new_instance) = vNULL;
 	  SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
 	  SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
@@ -4027,9 +4031,7 @@ vect_build_slp_instance (vec_info *vinfo,
 	  /* Analyze the stored values and pinch them together with
 	     a permute node so we can preserve the whole store group.  */
 	  auto_vec<slp_tree> rhs_nodes;
-
-	  /* Calculate the unrolling factor based on the smallest type.  */
-	  poly_uint64 unrolling_factor = 1;
+	  poly_uint64 max_nunits = 1;
 
 	  unsigned int rhs_common_nlanes = 0;
 	  unsigned int start = 0, end = i;
@@ -4046,13 +4048,8 @@ vect_build_slp_instance (vec_info *vinfo,
 					  matches, limit, &tree_size, bst_map);
 	      if (node)
 		{
-		  /* ???  Possibly not safe, but not sure how to check
-		     and fail SLP build?  */
-		  unrolling_factor
-		    = force_common_multiple (unrolling_factor,
-					     calculate_unrolling_factor
-					       (max_nunits, end - start));
 		  rhs_nodes.safe_push (node);
+		  vect_update_max_nunits (&max_nunits, node->max_nunits);
 		  if (start == 0)
 		    rhs_common_nlanes = SLP_TREE_LANES (node);
 		  else if (rhs_common_nlanes != SLP_TREE_LANES (node))
@@ -4116,6 +4113,7 @@ vect_build_slp_instance (vec_info *vinfo,
 					       SLP_TREE_CHILDREN
 						 (rhs_nodes[0]).length ());
 	      SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
+	      node->max_nunits = max_nunits;
 	      node->ldst_lanes = true;
 	      SLP_TREE_CHILDREN (node)
 		.reserve_exact (SLP_TREE_CHILDREN (rhs_nodes[0]).length ()
@@ -4132,7 +4130,8 @@ vect_build_slp_instance (vec_info *vinfo,
 		child->refcnt++;
 	    }
 	  else
-	    node = vect_build_slp_store_interleaving (rhs_nodes, scalar_stmts);
+	    node = vect_build_slp_store_interleaving (rhs_nodes, scalar_stmts,
+						      max_nunits);
 
 	  while (!rhs_nodes.is_empty ())
 	    vect_free_slp_tree (rhs_nodes.pop ());
@@ -4140,7 +4139,6 @@ vect_build_slp_instance (vec_info *vinfo,
 	  /* Create a new SLP instance.  */
 	  slp_instance new_instance = XNEW (class _slp_instance);
 	  SLP_INSTANCE_TREE (new_instance) = node;
-	  SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
 	  SLP_INSTANCE_LOADS (new_instance) = vNULL;
 	  SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
 	  SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
@@ -4881,7 +4879,6 @@ vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
 	      slp_tree invnode = vect_create_new_slp_node (ops);
 	      SLP_TREE_DEF_TYPE (invnode) = vect_external_def;
 	      SLP_INSTANCE_TREE (new_instance) = invnode;
-	      SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = 1;
 	      SLP_INSTANCE_LOADS (new_instance) = vNULL;
 	      SLP_INSTANCE_ROOT_STMTS (new_instance) = roots;
 	      SLP_INSTANCE_REMAIN_DEFS (new_instance) = vNULL;
@@ -7198,6 +7195,47 @@ vect_gather_slp_loads (vec_info *vinfo)
     }
 }
 
+/* For NODE update VF based on the number of lanes and the vector types
+   used.  */
+
+static void
+vect_update_slp_vf_for_node (slp_tree node, poly_uint64 &vf,
+			     hash_set<slp_tree> &visited)
+{
+  if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
+    return;
+  if (visited.add (node))
+    return;
+
+  for (slp_tree child : SLP_TREE_CHILDREN (node))
+    vect_update_slp_vf_for_node (child, vf, visited);
+
+  /* We do not visit SLP nodes for constants or externals - those neither
+     have a vector type set yet (vectorizable_* does this) nor do they
+     have max_nunits set.  Instead we rely on internal nodes max_nunit
+     to cover constant/external operands.
+     Note that when we stop using fixed size vectors externs and constants
+     shouldn't influence the (minimum) vectorization factor, instead
+     vectorizable_* should honor the vectorization factor when trying to
+     assign vector types to constants and externals and cause iteration
+     to a higher vectorization factor when required.  */
+  poly_uint64 node_vf
+    = calculate_unrolling_factor (node->max_nunits, SLP_TREE_LANES (node));
+  vf = force_common_multiple (vf, node_vf);
+
+  /* For permute nodes that are fed from externs or constants we have to
+     consider their number of lanes as well.  Likewise for store-lanes.  */
+  if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
+      || node->ldst_lanes)
+    for (slp_tree child : SLP_TREE_CHILDREN (node))
+      if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
+	{
+	  poly_uint64 child_vf
+	    = calculate_unrolling_factor (node->max_nunits,
+					  SLP_TREE_LANES (child));
+	  vf = force_common_multiple (vf, child_vf);
+	}
+}
 
 /* For each possible SLP instance decide whether to SLP it and calculate overall
    unrolling factor needed to SLP the loop.  Return TRUE if decided to SLP at
@@ -7215,6 +7253,7 @@ vect_make_slp_decision (loop_vec_info loop_vinfo)
 
   DUMP_VECT_SCOPE ("vect_make_slp_decision");
 
+  hash_set<slp_tree> visited;
   FOR_EACH_VEC_ELT (slp_instances, i, instance)
     {
       /* FORNOW: SLP if you can.  */
@@ -7223,9 +7262,8 @@ vect_make_slp_decision (loop_vec_info loop_vinfo)
 	   GET_MODE_SIZE (vinfo->vector_mode) * X
 
 	 for some rational X, so they must have a common multiple.  */
-      unrolling_factor
-	= force_common_multiple (unrolling_factor,
-				 SLP_INSTANCE_UNROLLING_FACTOR (instance));
+      vect_update_slp_vf_for_node (SLP_INSTANCE_TREE (instance),
+				   unrolling_factor, visited);
 
       /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
 	 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index b7f2708fec0f..b51771f836ca 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -315,9 +315,6 @@ public:
      otherwise.  */
   vec<tree> remain_defs;
 
-  /* The unrolling factor required to vectorized this SLP instance.  */
-  poly_uint64 unrolling_factor;
-
   /* The group of nodes that contain loads of this SLP instance.  */
   vec<slp_tree> loads;
 
@@ -340,7 +337,6 @@ public:
 
 /* Access Functions.  */
 #define SLP_INSTANCE_TREE(S)                     (S)->root
-#define SLP_INSTANCE_UNROLLING_FACTOR(S)         (S)->unrolling_factor
 #define SLP_INSTANCE_LOADS(S)                    (S)->loads
 #define SLP_INSTANCE_ROOT_STMTS(S)               (S)->root_stmts
 #define SLP_INSTANCE_REMAIN_DEFS(S)              (S)->remain_defs
-- 
GitLab