From b2f26af32b5b031fce761aa090de9476a53e6e5a Mon Sep 17 00:00:00 2001
From: Richard Biener <rguenther@suse.de>
Date: Wed, 20 May 2020 09:22:58 +0200
Subject: [PATCH] tree-optimization/95219 - improve IV selection for induction

This improves code generation with SSE2 for the testcase by
making sure to only generate a single IV when the group size
is a multiple of the vector size.  It also adjusts the testcase
which was passing before.

2020-05-20  Richard Biener  <rguenther@suse.de>

	PR tree-optimization/95219
	* tree-vect-loop.c (vectorizable_induction): Reduce
	group_size before computing the number of required IVs.

	* gcc.dg/vect/costmodel/x86_64/costmodel-pr30843.c: Adjust.
---
 gcc/ChangeLog                                      |  6 ++++++
 gcc/testsuite/ChangeLog                            |  6 ++++++
 .../vect/costmodel/x86_64/costmodel-pr30843.c      |  4 +++-
 gcc/tree-vect-loop.c                               | 14 +++++++++++++-
 4 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 2eba6db9a911..88b03be71353 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,9 @@
+2020-05-20  Richard Biener  <rguenther@suse.de>
+
+	PR tree-optimization/95219
+	* tree-vect-loop.c (vectorizable_induction): Reduce
+	group_size before computing the number of required IVs.
+
 2020-05-20  Richard Biener  <rguenther@suse.de>
 
 	PR middle-end/95231
diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog
index 3d63c57f3089..9552d201b5e4 100644
--- a/gcc/testsuite/ChangeLog
+++ b/gcc/testsuite/ChangeLog
@@ -1,3 +1,9 @@
+2020-05-20  Richard Biener  <rguenther@suse.de>
+
+	PR tree-optimization/95219
+	* gcc.dg/vect/costmodel/x86_64/costmodel-pr30843.c: Adjust
+	to expect a single vector IV and a vectorized loop.
+
 2020-05-20  Richard Biener  <rguenther@suse.de>
 
 	PR middle-end/95231
diff --git a/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr30843.c b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr30843.c
index 257d098cf8bc..9a75b987d6d8 100644
--- a/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr30843.c
+++ b/gcc/testsuite/gcc.dg/vect/costmodel/x86_64/costmodel-pr30843.c
@@ -20,4 +20,6 @@ void dacP98FillRGBMap (unsigned char *pBuffer)
     }
 }
 
-/* { dg-final { scan-tree-dump-times "vectorization not profitable" 1 "vect" { target vect_interleave } } } */
+/* Even with SSE2 we should only generate one IV for the induction.  */
+/* { dg-final { scan-tree-dump-times "# vect_vec_iv" 1 "vect" } } */
+/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } */
diff --git a/gcc/tree-vect-loop.c b/gcc/tree-vect-loop.c
index f065acc12f50..ecce348b39c8 100644
--- a/gcc/tree-vect-loop.c
+++ b/gcc/tree-vect-loop.c
@@ -7528,7 +7528,13 @@ vectorizable_induction (loop_vec_info loop_vinfo,
       unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
       unsigned elts = const_nunits * nvects;
-      unsigned nivs = least_common_multiple (group_size,
+      /* Compute the number of distinct IVs we need.  First reduce
+	 group_size if it is a multiple of const_nunits so we get
+	 one IV for a group_size of 4 but const_nunits 2.  */
+      unsigned group_sizep = group_size;
+      if (group_sizep % const_nunits == 0)
+	group_sizep = group_sizep / const_nunits;
+      unsigned nivs = least_common_multiple (group_sizep,
 					     const_nunits) / const_nunits;
       gcc_assert (elts % group_size == 0);
       tree elt = init_expr;
@@ -7576,6 +7582,12 @@ vectorizable_induction (loop_vec_info loop_vinfo,
 
 	  SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
 	}
+      /* Fill up to the number of vectors we need for the whole group.  */
+      nivs = least_common_multiple (group_size,
+				    const_nunits) / const_nunits;
+      for (; ivn < nivs; ++ivn)
+	SLP_TREE_VEC_STMTS (slp_node)
+	  .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
 
       /* Re-use IVs when we can.  */
       if (ivn < nvects)
-- 
GitLab