diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-pr101207.c b/gcc/testsuite/gcc.dg/vect/bb-slp-pr101207.c
new file mode 100644
index 0000000000000000000000000000000000000000..1f51d66a5fe81f9ad2fcbfc019197e77d449b8a4
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-pr101207.c
@@ -0,0 +1,25 @@
+/* { dg-do run } */
+/* { dg-additional-options "-ffast-math" } */
+
+#include "tree-vect.h"
+
+double a[2];
+double x, y;
+
+void __attribute__((noipa)) foo ()  /* noipa: no IPA between foo and its callers.  */
+{
+  x = a[1] - a[0];  /* The difference reads the array as a[1], a[0]...  */
+  y = a[0] + a[1];  /* ...while the sum reads it as a[0], a[1].  */
+}
+
+int main()
+{
+  check_vect ();  /* From tree-vect.h; presumably bails out without vector support -- confirm.  */
+
+  a[0] = 0.;
+  a[1] = 1.;
+  foo ();
+  if (x != 1. || y != 1.)  /* Expect x = 1 - 0 and y = 0 + 1; swapped subtraction operands give x == -1.  */
+    __builtin_abort ();
+  return 0;
+}
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 17fe5f23c0982a137ed9a274c75d4a5ac735e832..5401dbe4d5e614fcfaa734c8666bc8f3f2c72f92 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -3921,6 +3921,52 @@ vect_optimize_slp (vec_info *vinfo)
 	}
     }
 
+  /* Elide any permutations at BB reduction roots.  */
+  if (is_a <bb_vec_info> (vinfo))
+    {
+      for (slp_instance instance : vinfo->slp_instances)
+	{
+	  if (SLP_INSTANCE_KIND (instance) != slp_inst_kind_bb_reduc)
+	    continue;
+	  slp_tree old = SLP_INSTANCE_TREE (instance);
+	  if (SLP_TREE_CODE (old) == VEC_PERM_EXPR
+	      && SLP_TREE_CHILDREN (old).length () == 1)
+	    {
+	      slp_tree child = SLP_TREE_CHILDREN (old)[0];
+	      if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
+		{
+		  /* Preserve the special VEC_PERM we use to shield existing
+		     vector defs from the rest.  But make it a no-op.  */
+		  unsigned i = 0;
+		  for (std::pair<unsigned, unsigned> &p
+		       : SLP_TREE_LANE_PERMUTATION (old))
+		    p.second = i++;
+		}
+	      else
+		{
+		  SLP_INSTANCE_TREE (instance) = child;
+		  SLP_TREE_REF_COUNT (child)++;
+		  vect_free_slp_tree (old);
+		}
+	    }
+	  else if (SLP_TREE_LOAD_PERMUTATION (old).exists ()
+		   && SLP_TREE_REF_COUNT (old) == 1
+		   && vertices[old->vertex].materialize)
+	    {
+	      /* ???  For loads the situation is more complex since
+		 we can't modify the permute in place in case the
+		 node is used multiple times.  In fact for loads this
+		 should be somehow handled in the propagation engine.  */
+	      /* Apply the reverse permutation to our stmts.  */
+	      int perm = vertices[old->vertex].get_perm_in ();
+	      vect_slp_permute (perms[perm],
+				SLP_TREE_SCALAR_STMTS (old), true);
+	      vect_slp_permute (perms[perm],
+				SLP_TREE_LOAD_PERMUTATION (old), true);
+	    }
+	}
+    }
+
   /* Free the perms vector used for propagation.  */
   while (!perms.is_empty ())
     perms.pop ().release ();
@@ -3987,48 +4033,6 @@ vect_optimize_slp (vec_info *vinfo)
 	    }
 	}
     }
-
-  /* And any permutations of BB reductions.  */
-  if (is_a <bb_vec_info> (vinfo))
-    {
-      for (slp_instance instance : vinfo->slp_instances)
-	{
-	  if (SLP_INSTANCE_KIND (instance) != slp_inst_kind_bb_reduc)
-	    continue;
-	  slp_tree old = SLP_INSTANCE_TREE (instance);
-	  if (SLP_TREE_CODE (old) == VEC_PERM_EXPR
-	      && SLP_TREE_CHILDREN (old).length () == 1)
-	    {
-	      slp_tree child = SLP_TREE_CHILDREN (old)[0];
-	      if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
-		{
-		  /* Preserve the special VEC_PERM we use to shield existing
-		     vector defs from the rest.  But make it a no-op.  */
-		  unsigned i = 0;
-		  for (std::pair<unsigned, unsigned> &p
-		       : SLP_TREE_LANE_PERMUTATION (old))
-		    p.second = i++;
-		}
-	      else
-		{
-		  SLP_INSTANCE_TREE (instance) = child;
-		  SLP_TREE_REF_COUNT (child)++;
-		  vect_free_slp_tree (old);
-		}
-	    }
-	  else if (SLP_TREE_LOAD_PERMUTATION (old).exists ()
-		   && SLP_TREE_REF_COUNT (old) == 1)
-	    {
-	      /* ???  For loads the situation is more complex since
-		 we can't modify the permute in place in case the
-		 node is used multiple times.  In fact for loads this
-		 should be somehow handled in the propagation engine.  */
-	      auto fn = [] (const void *a, const void *b)
-			      { return *(const int *)a - *(const int *)b; };
-	      SLP_TREE_LOAD_PERMUTATION (old).qsort (fn);
-	    }
-	}
-    }
 }
 
 /* Gather loads reachable from the individual SLP graph entries.  */