From 7cd2df4dc0b53c172d04bc7f238ce55775fbb7df Mon Sep 17 00:00:00 2001
From: Kewen Lin <linkw@linux.ibm.com>
Date: Fri, 18 Aug 2023 02:26:52 -0500
Subject: [PATCH] vect: Move VMAT_GATHER_SCATTER handling from final loop nest

Following Richi's suggestion [1], this patch moves the handling of
VMAT_GATHER_SCATTER in the final loop nest of function
vectorizable_load to its own loop.  Basically it duplicates the
final loop nest, cleans up some useless setup code for the
VMAT_GATHER_SCATTER case, and removes some unreachable code.  It
also removes the corresponding handling from the final loop nest.

[1] https://gcc.gnu.org/pipermail/gcc-patches/2023-June/623329.html
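
For reference, a minimal sketch of a loop whose indirect access is
classified as VMAT_GATHER_SCATTER by vectorizable_load (the function
name and the -mavx2 target choice are just for illustration; any
target with gather support behaves the same):

  void
  foo (double *__restrict__ out, double *__restrict__ a,
       int *__restrict__ idx, int n)
  {
    for (int i = 0; i < n; ++i)
      out[i] = a[idx[i]];	/* a[idx[i]] becomes a gather load.  */
  }

On the internal-fn path (gs_info.ifn != IFN_LAST) the new dedicated
loop emits calls like .MASK_LEN_GATHER_LOAD (dataref_ptr, vec_offset,
scale, zero, final_mask, final_len, bias), matching the seven
operands built below; without gather support the emulated path
instead extracts each offset and issues scalar loads.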

gcc/ChangeLog:

	* tree-vect-stmts.cc (vectorizable_load): Move the handling of
	VMAT_GATHER_SCATTER in the final loop nest to its own loop,
	and update the final nest accordingly.
---
 gcc/tree-vect-stmts.cc | 358 +++++++++++++++++++++++++----------------
 1 file changed, 216 insertions(+), 142 deletions(-)

diff --git a/gcc/tree-vect-stmts.cc b/gcc/tree-vect-stmts.cc
index cd8e0a763746..935bc6037ff2 100644
--- a/gcc/tree-vect-stmts.cc
+++ b/gcc/tree-vect-stmts.cc
@@ -10518,6 +10518,215 @@ vectorizable_load (vec_info *vinfo,
       return true;
     }
 
+  if (memory_access_type == VMAT_GATHER_SCATTER)
+    {
+      gcc_assert (alignment_support_scheme == dr_aligned
+		  || alignment_support_scheme == dr_unaligned_supported);
+      gcc_assert (!grouped_load && !slp_perm);
+
+      unsigned int inside_cost = 0, prologue_cost = 0;
+      for (j = 0; j < ncopies; j++)
+	{
+	  /* 1. Create the vector or array pointer update chain.  */
+	  if (j == 0 && !costing_p)
+	    {
+	      if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+		vect_get_gather_scatter_ops (loop_vinfo, loop, stmt_info,
+					     slp_node, &gs_info, &dataref_ptr,
+					     &vec_offsets);
+	      else
+		dataref_ptr
+		  = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
+					      at_loop, offset, &dummy, gsi,
+					      &ptr_incr, false, bump);
+	    }
+	  else if (!costing_p)
+	    {
+	      gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
+	      if (!STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+		dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
+					       gsi, stmt_info, bump);
+	    }
+
+	  if (mask && !costing_p)
+	    vec_mask = vec_masks[j];
+
+	  gimple *new_stmt = NULL;
+	  for (i = 0; i < vec_num; i++)
+	    {
+	      tree final_mask = NULL_TREE;
+	      tree final_len = NULL_TREE;
+	      tree bias = NULL_TREE;
+	      if (!costing_p)
+		{
+		  if (loop_masks)
+		    final_mask
+		      = vect_get_loop_mask (loop_vinfo, gsi, loop_masks,
+					    vec_num * ncopies, vectype,
+					    vec_num * j + i);
+		  if (vec_mask)
+		    final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
+						   final_mask, vec_mask, gsi);
+
+		  if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+		    dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
+						   gsi, stmt_info, bump);
+		}
+
+	      /* 2. Create the vector-load in the loop.  */
+	      unsigned HOST_WIDE_INT align;
+	      if (gs_info.ifn != IFN_LAST)
+		{
+		  if (costing_p)
+		    {
+		      unsigned int cnunits = vect_nunits_for_cost (vectype);
+		      inside_cost
+			= record_stmt_cost (cost_vec, cnunits, scalar_load,
+					    stmt_info, 0, vect_body);
+		      continue;
+		    }
+		  if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+		    vec_offset = vec_offsets[vec_num * j + i];
+		  tree zero = build_zero_cst (vectype);
+		  tree scale = size_int (gs_info.scale);
+
+		  if (gs_info.ifn == IFN_MASK_LEN_GATHER_LOAD)
+		    {
+		      if (loop_lens)
+			final_len
+			  = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
+					       vec_num * ncopies, vectype,
+					       vec_num * j + i, 1);
+		      else
+			final_len
+			  = build_int_cst (sizetype,
+					   TYPE_VECTOR_SUBPARTS (vectype));
+		      signed char biasval
+			= LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
+		      bias = build_int_cst (intQI_type_node, biasval);
+		      if (!final_mask)
+			{
+			  mask_vectype = truth_type_for (vectype);
+			  final_mask = build_minus_one_cst (mask_vectype);
+			}
+		    }
+
+		  gcall *call;
+		  if (final_len && final_mask)
+		    call
+		      = gimple_build_call_internal (IFN_MASK_LEN_GATHER_LOAD, 7,
+						    dataref_ptr, vec_offset,
+						    scale, zero, final_mask,
+						    final_len, bias);
+		  else if (final_mask)
+		    call = gimple_build_call_internal (IFN_MASK_GATHER_LOAD, 5,
+						       dataref_ptr, vec_offset,
+						       scale, zero, final_mask);
+		  else
+		    call = gimple_build_call_internal (IFN_GATHER_LOAD, 4,
+						       dataref_ptr, vec_offset,
+						       scale, zero);
+		  gimple_call_set_nothrow (call, true);
+		  new_stmt = call;
+		  data_ref = NULL_TREE;
+		}
+	      else
+		{
+		  /* Emulated gather-scatter.  */
+		  gcc_assert (!final_mask);
+		  unsigned HOST_WIDE_INT const_nunits = nunits.to_constant ();
+		  if (costing_p)
+		    {
+		      /* For emulated gathers, N offset vector element
+			 extracts (the offset add is consumed by the load).  */
+		      inside_cost = record_stmt_cost (cost_vec, const_nunits,
+						      vec_to_scalar, stmt_info,
+						      0, vect_body);
+		      /* N scalar loads plus gathering them into a
+			 vector.  */
+		      inside_cost
+			= record_stmt_cost (cost_vec, const_nunits, scalar_load,
+					    stmt_info, 0, vect_body);
+		      inside_cost
+			= record_stmt_cost (cost_vec, 1, vec_construct,
+					    stmt_info, 0, vect_body);
+		      continue;
+		    }
+		  unsigned HOST_WIDE_INT const_offset_nunits
+		    = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype)
+			.to_constant ();
+		  vec<constructor_elt, va_gc> *ctor_elts;
+		  vec_alloc (ctor_elts, const_nunits);
+		  gimple_seq stmts = NULL;
+		  /* We support offset vectors with more elements
+		     than the data vector for now.  */
+		  unsigned HOST_WIDE_INT factor
+		    = const_offset_nunits / const_nunits;
+		  vec_offset = vec_offsets[j / factor];
+		  unsigned elt_offset = (j % factor) * const_nunits;
+		  tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset));
+		  tree scale = size_int (gs_info.scale);
+		  align = get_object_alignment (DR_REF (first_dr_info->dr));
+		  tree ltype = build_aligned_type (TREE_TYPE (vectype), align);
+		  for (unsigned k = 0; k < const_nunits; ++k)
+		    {
+		      tree boff = size_binop (MULT_EXPR, TYPE_SIZE (idx_type),
+					      bitsize_int (k + elt_offset));
+		      tree idx
+			= gimple_build (&stmts, BIT_FIELD_REF, idx_type,
+					vec_offset, TYPE_SIZE (idx_type), boff);
+		      idx = gimple_convert (&stmts, sizetype, idx);
+		      idx = gimple_build (&stmts, MULT_EXPR, sizetype, idx,
+					  scale);
+		      tree ptr = gimple_build (&stmts, PLUS_EXPR,
+					       TREE_TYPE (dataref_ptr),
+					       dataref_ptr, idx);
+		      ptr = gimple_convert (&stmts, ptr_type_node, ptr);
+		      tree elt = make_ssa_name (TREE_TYPE (vectype));
+		      tree ref = build2 (MEM_REF, ltype, ptr,
+					 build_int_cst (ref_type, 0));
+		      new_stmt = gimple_build_assign (elt, ref);
+		      gimple_set_vuse (new_stmt, gimple_vuse (gsi_stmt (*gsi)));
+		      gimple_seq_add_stmt (&stmts, new_stmt);
+		      CONSTRUCTOR_APPEND_ELT (ctor_elts, NULL_TREE, elt);
+		    }
+		  gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
+		  new_stmt = gimple_build_assign (
+		    NULL_TREE, build_constructor (vectype, ctor_elts));
+		  data_ref = NULL_TREE;
+		}
+
+	      vec_dest = vect_create_destination_var (scalar_dest, vectype);
+	      /* DATA_REF is null if we've already built the statement.  */
+	      if (data_ref)
+		{
+		  vect_copy_ref_info (data_ref, DR_REF (first_dr_info->dr));
+		  new_stmt = gimple_build_assign (vec_dest, data_ref);
+		}
+	      new_temp = make_ssa_name (vec_dest, new_stmt);
+	      gimple_set_lhs (new_stmt, new_temp);
+	      vect_finish_stmt_generation (vinfo, stmt_info, new_stmt, gsi);
+
+	      /* Store vector loads in the corresponding SLP_NODE.  */
+	      if (slp)
+		slp_node->push_vec_def (new_stmt);
+	    }
+
+	  if (!slp && !costing_p)
+	    STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
+	}
+
+      if (!slp && !costing_p)
+	*vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
+
+      if (costing_p && dump_enabled_p ())
+	dump_printf_loc (MSG_NOTE, vect_location,
+			 "vect_model_load_cost: inside_cost = %u, "
+			 "prologue_cost = %u .\n",
+			 inside_cost, prologue_cost);
+      return true;
+    }
+
   poly_uint64 group_elt = 0;
   unsigned int inside_cost = 0, prologue_cost = 0;
   for (j = 0; j < ncopies; j++)
@@ -10567,12 +10776,6 @@ vectorizable_load (vec_info *vinfo,
 		  gcc_assert (!compute_in_loop);
 		}
 	    }
-	  else if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
-	    {
-	      vect_get_gather_scatter_ops (loop_vinfo, loop, stmt_info,
-					   slp_node, &gs_info, &dataref_ptr,
-					   &vec_offsets);
-	    }
 	  else
 	    dataref_ptr
 	      = vect_create_data_ref_ptr (vinfo, first_stmt_info, aggr_type,
@@ -10588,7 +10791,7 @@ vectorizable_load (vec_info *vinfo,
 	  if (dataref_offset)
 	    dataref_offset = int_const_binop (PLUS_EXPR, dataref_offset,
 					      bump);
-	  else if (!STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+	  else
 	    dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr, gsi,
 					   stmt_info, bump);
 	  if (mask)
@@ -10614,7 +10817,7 @@ vectorizable_load (vec_info *vinfo,
 		final_mask = prepare_vec_mask (loop_vinfo, mask_vectype,
 					       final_mask, vec_mask, gsi);
 
-	      if (i > 0 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info))
+	      if (i > 0)
 		dataref_ptr = bump_vector_ptr (vinfo, dataref_ptr, ptr_incr,
 					       gsi, stmt_info, bump);
 	    }
@@ -10625,139 +10828,11 @@ vectorizable_load (vec_info *vinfo,
 	    case dr_aligned:
 	    case dr_unaligned_supported:
 	      {
-		unsigned int misalign;
-		unsigned HOST_WIDE_INT align;
-
-		if (memory_access_type == VMAT_GATHER_SCATTER
-		    && gs_info.ifn != IFN_LAST)
-		  {
-		    if (costing_p)
-		      {
-			unsigned int cnunits = vect_nunits_for_cost (vectype);
-			inside_cost
-			  = record_stmt_cost (cost_vec, cnunits, scalar_load,
-					      stmt_info, 0, vect_body);
-			break;
-		      }
-		    if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
-		      vec_offset = vec_offsets[vec_num * j + i];
-		    tree zero = build_zero_cst (vectype);
-		    tree scale = size_int (gs_info.scale);
-
-		    if (gs_info.ifn == IFN_MASK_LEN_GATHER_LOAD)
-		      {
-			if (loop_lens)
-			  final_len
-			    = vect_get_loop_len (loop_vinfo, gsi, loop_lens,
-						 vec_num * ncopies, vectype,
-						 vec_num * j + i, 1);
-			else
-			  final_len
-			    = build_int_cst (sizetype,
-					     TYPE_VECTOR_SUBPARTS (vectype));
-			signed char biasval
-			  = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
-			bias = build_int_cst (intQI_type_node, biasval);
-			if (!final_mask)
-			  {
-			    mask_vectype = truth_type_for (vectype);
-			    final_mask = build_minus_one_cst (mask_vectype);
-			  }
-		      }
-
-		    gcall *call;
-		    if (final_len && final_mask)
-		      call = gimple_build_call_internal (
-			IFN_MASK_LEN_GATHER_LOAD, 7, dataref_ptr, vec_offset,
-			scale, zero, final_mask, final_len, bias);
-		    else if (final_mask)
-		      call
-			= gimple_build_call_internal (IFN_MASK_GATHER_LOAD, 5,
-						      dataref_ptr, vec_offset,
-						      scale, zero, final_mask);
-		    else
-		      call
-			= gimple_build_call_internal (IFN_GATHER_LOAD, 4,
-						      dataref_ptr, vec_offset,
-						      scale, zero);
-		    gimple_call_set_nothrow (call, true);
-		    new_stmt = call;
-		    data_ref = NULL_TREE;
-		    break;
-		  }
-		else if (memory_access_type == VMAT_GATHER_SCATTER)
-		  {
-		    /* Emulated gather-scatter.  */
-		    gcc_assert (!final_mask);
-		    unsigned HOST_WIDE_INT const_nunits = nunits.to_constant ();
-		    if (costing_p)
-		      {
-			/* For emulated gathers N offset vector element
-			   offset add is consumed by the load).  */
-			inside_cost
-			  = record_stmt_cost (cost_vec, const_nunits,
-					      vec_to_scalar, stmt_info, 0,
-					      vect_body);
-			/* N scalar loads plus gathering them into a
-			   vector.  */
-			inside_cost = record_stmt_cost (cost_vec, const_nunits,
-							scalar_load, stmt_info,
-							0, vect_body);
-			inside_cost
-			  = record_stmt_cost (cost_vec, 1, vec_construct,
-					      stmt_info, 0, vect_body);
-			break;
-		      }
-		    unsigned HOST_WIDE_INT const_offset_nunits
-		      = TYPE_VECTOR_SUBPARTS (gs_info.offset_vectype)
-			  .to_constant ();
-		    vec<constructor_elt, va_gc> *ctor_elts;
-		    vec_alloc (ctor_elts, const_nunits);
-		    gimple_seq stmts = NULL;
-		    /* We support offset vectors with more elements
-		       than the data vector for now.  */
-		    unsigned HOST_WIDE_INT factor
-		      = const_offset_nunits / const_nunits;
-		    vec_offset = vec_offsets[j / factor];
-		    unsigned elt_offset = (j % factor) * const_nunits;
-		    tree idx_type = TREE_TYPE (TREE_TYPE (vec_offset));
-		    tree scale = size_int (gs_info.scale);
-		    align = get_object_alignment (DR_REF (first_dr_info->dr));
-		    tree ltype
-		      = build_aligned_type (TREE_TYPE (vectype), align);
-		    for (unsigned k = 0; k < const_nunits; ++k)
-		      {
-			tree boff = size_binop (MULT_EXPR, TYPE_SIZE (idx_type),
-						bitsize_int (k + elt_offset));
-			tree idx = gimple_build (&stmts, BIT_FIELD_REF,
-						 idx_type, vec_offset,
-						 TYPE_SIZE (idx_type), boff);
-			idx = gimple_convert (&stmts, sizetype, idx);
-			idx = gimple_build (&stmts, MULT_EXPR, sizetype, idx,
-					    scale);
-			tree ptr = gimple_build (&stmts, PLUS_EXPR,
-						 TREE_TYPE (dataref_ptr),
-						 dataref_ptr, idx);
-			ptr = gimple_convert (&stmts, ptr_type_node, ptr);
-			tree elt = make_ssa_name (TREE_TYPE (vectype));
-			tree ref = build2 (MEM_REF, ltype, ptr,
-					   build_int_cst (ref_type, 0));
-			new_stmt = gimple_build_assign (elt, ref);
-			gimple_set_vuse (new_stmt,
-					 gimple_vuse (gsi_stmt (*gsi)));
-			gimple_seq_add_stmt (&stmts, new_stmt);
-			CONSTRUCTOR_APPEND_ELT (ctor_elts, NULL_TREE, elt);
-		      }
-		    gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
-		    new_stmt = gimple_build_assign (
-		      NULL_TREE, build_constructor (vectype, ctor_elts));
-		    data_ref = NULL_TREE;
-		    break;
-		  }
-
 		if (costing_p)
 		  break;
 
+		unsigned int misalign;
+		unsigned HOST_WIDE_INT align;
 		align = known_alignment (DR_TARGET_ALIGNMENT (first_dr_info));
 		if (alignment_support_scheme == dr_aligned)
 		  misalign = 0;
@@ -11219,10 +11294,9 @@ vectorizable_load (vec_info *vinfo,
 
   if (costing_p)
     {
-      gcc_assert (memory_access_type != VMAT_INVARIANT
-		  && memory_access_type != VMAT_ELEMENTWISE
-		  && memory_access_type != VMAT_STRIDED_SLP
-		  && memory_access_type != VMAT_LOAD_STORE_LANES);
+      gcc_assert (memory_access_type == VMAT_CONTIGUOUS
+		  || memory_access_type == VMAT_CONTIGUOUS_REVERSE
+		  || memory_access_type == VMAT_CONTIGUOUS_PERMUTE);
       if (dump_enabled_p ())
 	dump_printf_loc (MSG_NOTE, vect_location,
 			 "vect_model_load_cost: inside_cost = %u, "
-- 
GitLab