From 7a6c31f0f84a7295433ebac09b94fae2d5cc2892 Mon Sep 17 00:00:00 2001
From: Richard Biener <rguenther@suse.de>
Date: Mon, 31 May 2021 13:19:01 +0200
Subject: [PATCH] Add x86 addsub SLP pattern

This addds SLP pattern recognition for the SSE3/AVX [v]addsubp{ds} v0, v1
instructions which compute { v0[0] - v1[0], v0[1], + v1[1], ... }
thus subtract, add alternating on lanes, starting with subtract.

It adds a corresponding optab and direct internal function,
vec_addsub$a3 and renames the existing i386 backend patterns to
the new canonical name.

The SLP pattern matches the exact alternating lane sequence rather
than trying to be clever and anticipating incoming permutes - we
could permute the two input vectors to the needed lane alternation,
do the addsub and then permute the result vector back but that's
only profitable in case the two input or the output permute will
vanish - something Tamars refactoring of SLP pattern recog should
make possible.

2021-06-17  Richard Biener  <rguenther@suse.de>

	* config/i386/sse.md (avx_addsubv4df3): Rename to
	vec_addsubv4df3.
	(avx_addsubv8sf3): Rename to vec_addsubv8sf3.
	(sse3_addsubv2df3): Rename to vec_addsubv2df3.
	(sse3_addsubv4sf3): Rename to vec_addsubv4sf3.
	* config/i386/i386-builtin.def: Adjust.
	* internal-fn.def (VEC_ADDSUB): New internal optab fn.
	* optabs.def (vec_addsub_optab): New optab.
	* tree-vect-slp-patterns.c (class addsub_pattern): New.
	(slp_patterns): Add addsub_pattern.
	* tree-vect-slp.c (vect_optimize_slp): Disable propagation
	across CFN_VEC_ADDSUB.
	* tree-vectorizer.h (vect_pattern::vect_pattern): Make
	m_ops optional.
	* doc/md.texi (vec_addsub<mode>3): Document.

	* gcc.target/i386/vect-addsubv2df.c: New testcase.
	* gcc.target/i386/vect-addsubv4sf.c: Likewise.
	* gcc.target/i386/vect-addsubv4df.c: Likewise.
	* gcc.target/i386/vect-addsubv8sf.c: Likewise.
	* gcc.target/i386/vect-addsub-2.c: Likewise.
	* gcc.target/i386/vect-addsub-3.c: Likewise.
---
 gcc/config/i386/i386-builtin.def              |   8 +-
 gcc/config/i386/sse.md                        |   8 +-
 gcc/doc/md.texi                               |   8 ++
 gcc/internal-fn.def                           |   1 +
 gcc/optabs.def                                |   1 +
 gcc/testsuite/gcc.target/i386/vect-addsub-2.c |  21 ++++
 gcc/testsuite/gcc.target/i386/vect-addsub-3.c |  38 +++++++
 .../gcc.target/i386/vect-addsubv2df.c         |  42 ++++++++
 .../gcc.target/i386/vect-addsubv4df.c         |  36 +++++++
 .../gcc.target/i386/vect-addsubv4sf.c         |  46 ++++++++
 .../gcc.target/i386/vect-addsubv8sf.c         |  46 ++++++++
 gcc/tree-vect-slp-patterns.c                  | 100 ++++++++++++++++++
 gcc/tree-vect-slp.c                           |   1 +
 gcc/tree-vectorizer.h                         |   3 +-
 14 files changed, 350 insertions(+), 9 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/vect-addsub-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/vect-addsub-3.c
 create mode 100644 gcc/testsuite/gcc.target/i386/vect-addsubv2df.c
 create mode 100644 gcc/testsuite/gcc.target/i386/vect-addsubv4df.c
 create mode 100644 gcc/testsuite/gcc.target/i386/vect-addsubv4sf.c
 create mode 100644 gcc/testsuite/gcc.target/i386/vect-addsubv8sf.c

diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index 31df3a613dd3..ea79e0bdda27 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -855,8 +855,8 @@ BDESC (OPTION_MASK_ISA_SSE2 | OPTION_MASK_ISA_MMX, 0, CODE_FOR_mmx_subv1di3, "__
 BDESC (OPTION_MASK_ISA_SSE3, 0, CODE_FOR_sse3_movshdup, "__builtin_ia32_movshdup", IX86_BUILTIN_MOVSHDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF)
 BDESC (OPTION_MASK_ISA_SSE3, 0, CODE_FOR_sse3_movsldup, "__builtin_ia32_movsldup", IX86_BUILTIN_MOVSLDUP, UNKNOWN, (int) V4SF_FTYPE_V4SF)
 
-BDESC (OPTION_MASK_ISA_SSE3, 0, CODE_FOR_sse3_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF)
-BDESC (OPTION_MASK_ISA_SSE3, 0, CODE_FOR_sse3_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF)
+BDESC (OPTION_MASK_ISA_SSE3, 0, CODE_FOR_vec_addsubv4sf3, "__builtin_ia32_addsubps", IX86_BUILTIN_ADDSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF)
+BDESC (OPTION_MASK_ISA_SSE3, 0, CODE_FOR_vec_addsubv2df3, "__builtin_ia32_addsubpd", IX86_BUILTIN_ADDSUBPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF)
 BDESC (OPTION_MASK_ISA_SSE3, 0, CODE_FOR_sse3_haddv4sf3, "__builtin_ia32_haddps", IX86_BUILTIN_HADDPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF)
 BDESC (OPTION_MASK_ISA_SSE3, 0, CODE_FOR_sse3_haddv2df3, "__builtin_ia32_haddpd", IX86_BUILTIN_HADDPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF)
 BDESC (OPTION_MASK_ISA_SSE3, 0, CODE_FOR_sse3_hsubv4sf3, "__builtin_ia32_hsubps", IX86_BUILTIN_HSUBPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF)
@@ -996,8 +996,8 @@ BDESC (OPTION_MASK_ISA_SSE2, 0, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128
 /* AVX */
 BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF)
 BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF)
-BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF)
-BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF)
+BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_vec_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF)
+BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_vec_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF)
 BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF)
 BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF)
 BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_andnotv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF)
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 5bd65dd93121..1f1db8214ccf 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -2410,7 +2410,7 @@
    (set_attr "prefix" "<round_saeonly_scalar_prefix>")
    (set_attr "mode" "<ssescalarmode>")])
 
-(define_insn "avx_addsubv4df3"
+(define_insn "vec_addsubv4df3"
   [(set (match_operand:V4DF 0 "register_operand" "=x")
 	(vec_merge:V4DF
 	  (minus:V4DF
@@ -2424,7 +2424,7 @@
    (set_attr "prefix" "vex")
    (set_attr "mode" "V4DF")])
 
-(define_insn "sse3_addsubv2df3"
+(define_insn "vec_addsubv2df3"
   [(set (match_operand:V2DF 0 "register_operand" "=x,x")
 	(vec_merge:V2DF
 	  (minus:V2DF
@@ -2442,7 +2442,7 @@
    (set_attr "prefix" "orig,vex")
    (set_attr "mode" "V2DF")])
 
-(define_insn "avx_addsubv8sf3"
+(define_insn "vec_addsubv8sf3"
   [(set (match_operand:V8SF 0 "register_operand" "=x")
 	(vec_merge:V8SF
 	  (minus:V8SF
@@ -2456,7 +2456,7 @@
    (set_attr "prefix" "vex")
    (set_attr "mode" "V8SF")])
 
-(define_insn "sse3_addsubv4sf3"
+(define_insn "vec_addsubv4sf3"
   [(set (match_operand:V4SF 0 "register_operand" "=x,x")
 	(vec_merge:V4SF
 	  (minus:V4SF
diff --git a/gcc/doc/md.texi b/gcc/doc/md.texi
index 00caf3844ccf..1b9181443305 100644
--- a/gcc/doc/md.texi
+++ b/gcc/doc/md.texi
@@ -5682,6 +5682,14 @@ signed/unsigned elements of size S@.  Subtract the high/low elements of 2 from
 1 and widen the resulting elements. Put the N/2 results of size 2*S in the
 output vector (operand 0).
 
+@cindex @code{vec_addsub@var{m}3} instruction pattern
+@item @samp{vec_addsub@var{m}3}
+Alternating subtract, add with even lanes doing subtract and odd
+lanes doing addition.  Operands 1 and 2 and the outout operand are vectors
+with mode @var{m}.
+
+These instructions are not allowed to @code{FAIL}.
+
 @cindex @code{mulhisi3} instruction pattern
 @item @samp{mulhisi3}
 Multiply operands 1 and 2, which have mode @code{HImode}, and store
diff --git a/gcc/internal-fn.def b/gcc/internal-fn.def
index b2f414d2131b..c3b8e730960c 100644
--- a/gcc/internal-fn.def
+++ b/gcc/internal-fn.def
@@ -281,6 +281,7 @@ DEF_INTERNAL_OPTAB_FN (COMPLEX_ADD_ROT90, ECF_CONST, cadd90, binary)
 DEF_INTERNAL_OPTAB_FN (COMPLEX_ADD_ROT270, ECF_CONST, cadd270, binary)
 DEF_INTERNAL_OPTAB_FN (COMPLEX_MUL, ECF_CONST, cmul, binary)
 DEF_INTERNAL_OPTAB_FN (COMPLEX_MUL_CONJ, ECF_CONST, cmul_conj, binary)
+DEF_INTERNAL_OPTAB_FN (VEC_ADDSUB, ECF_CONST, vec_addsub, binary)
 
 
 /* FP scales.  */
diff --git a/gcc/optabs.def b/gcc/optabs.def
index b192a9d070b8..41ab2598eb6c 100644
--- a/gcc/optabs.def
+++ b/gcc/optabs.def
@@ -407,6 +407,7 @@ OPTAB_D (vec_widen_usubl_hi_optab, "vec_widen_usubl_hi_$a")
 OPTAB_D (vec_widen_usubl_lo_optab, "vec_widen_usubl_lo_$a")
 OPTAB_D (vec_widen_uaddl_hi_optab, "vec_widen_uaddl_hi_$a")
 OPTAB_D (vec_widen_uaddl_lo_optab, "vec_widen_uaddl_lo_$a")
+OPTAB_D (vec_addsub_optab, "vec_addsub$a3")
 
 OPTAB_D (sync_add_optab, "sync_add$I$a")
 OPTAB_D (sync_and_optab, "sync_and$I$a")
diff --git a/gcc/testsuite/gcc.target/i386/vect-addsub-2.c b/gcc/testsuite/gcc.target/i386/vect-addsub-2.c
new file mode 100644
index 000000000000..a6b941461e8e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-addsub-2.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target sse3 } */
+/* { dg-options "-O3 -msse3" } */
+
+float a[1024], b[1024];
+
+void foo()
+{
+  for (int i = 0; i < 256; i++)
+    {
+      a[4*i+0] = a[4*i+0] - b[4*i+0];
+      a[4*i+1] = a[4*i+1] + b[4*i+1];
+      a[4*i+2] = a[4*i+2] - b[4*i+2];
+      a[4*i+3] = a[4*i+3] + b[4*i+3];
+    }
+}
+
+/* We should be able to vectorize this with SLP using the addsub
+   SLP pattern.  */
+/* { dg-final { scan-assembler "addsubps" } } */
+/* { dg-final { scan-assembler-not "shuf" } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-addsub-3.c b/gcc/testsuite/gcc.target/i386/vect-addsub-3.c
new file mode 100644
index 000000000000..b27ee56bd73e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-addsub-3.c
@@ -0,0 +1,38 @@
+/* { dg-do run } */
+/* { dg-require-effective-target sse3 } */
+/* { dg-options "-O3 -msse3" } */
+
+#ifndef CHECK_H
+#define CHECK_H "sse3-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse3_test
+#endif
+
+#include CHECK_H
+
+double a[2], b[2], c[2];
+
+void __attribute__((noipa))
+foo ()
+{
+  /* When we want to use addsubpd we have to keep permuting both
+     loads, if instead we blend the result of an add and a sub we
+     can combine the blend with the permute.  Both are similar in cost,
+     verify we did not wrongly apply both.  */
+  double tem0 = a[1] - b[1];
+  double tem1 = a[0] + b[0];
+  c[0] = tem0;
+  c[1] = tem1;
+}
+
+static void
+TEST (void)
+{
+  a[0] = 1.; a[1] = 2.;
+  b[0] = 2.; b[1] = 4.;
+  foo ();
+  if (c[0] != -2. || c[1] != 3.)
+    __builtin_abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/vect-addsubv2df.c b/gcc/testsuite/gcc.target/i386/vect-addsubv2df.c
new file mode 100644
index 000000000000..547485d5519d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-addsubv2df.c
@@ -0,0 +1,42 @@
+/* { dg-do run } */
+/* { dg-require-effective-target sse3 } */
+/* { dg-options "-O3 -msse3 -fdump-tree-slp2" } */
+
+#ifndef CHECK_H
+#define CHECK_H "sse3-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse3_test
+#endif
+
+#include CHECK_H
+
+double x[2], y[2], z[2];
+void __attribute__((noipa)) foo ()
+{
+  x[0] = y[0] - z[0];
+  x[1] = y[1] + z[1];
+}
+void __attribute__((noipa)) bar ()
+{
+  x[0] = y[0] + z[0];
+  x[1] = y[1] - z[1];
+}
+static void
+TEST (void)
+{
+  for (int i = 0; i < 2; ++i)
+    {
+      y[i] = i + 1;
+      z[i] = 2 * i + 1;
+    }
+  foo ();
+  if (x[0] != 0 || x[1] != 5)
+    __builtin_abort ();
+  bar ();
+  if (x[0] != 2 || x[1] != -1)
+    __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump-times "ADDSUB" 1 "slp2" } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-addsubv4df.c b/gcc/testsuite/gcc.target/i386/vect-addsubv4df.c
new file mode 100644
index 000000000000..e0a1b3d9d00f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-addsubv4df.c
@@ -0,0 +1,36 @@
+/* { dg-do run { target avx_runtime } } */
+/* { dg-require-effective-target avx } */
+/* { dg-options "-O3 -mavx -fdump-tree-slp2" } */
+
+double x[4], y[4], z[4];
+void __attribute__((noipa)) foo ()
+{
+  x[0] = y[0] - z[0];
+  x[1] = y[1] + z[1];
+  x[2] = y[2] - z[2];
+  x[3] = y[3] + z[3];
+}
+void __attribute__((noipa)) bar ()
+{
+  x[0] = y[0] + z[0];
+  x[1] = y[1] - z[1];
+  x[2] = y[2] + z[2];
+  x[3] = y[3] - z[3];
+}
+int main()
+{
+  for (int i = 0; i < 4; ++i)
+    {
+      y[i] = i + 1;
+      z[i] = 2 * i + 1;
+    }
+  foo ();
+  if (x[0] != 0 || x[1] != 5 || x[2] != -2 || x[3] != 11)
+    __builtin_abort ();
+  bar ();
+  if (x[0] != 2 || x[1] != -1 || x[2] != 8 || x[3] != -3)
+    __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "ADDSUB" 1 "slp2" } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-addsubv4sf.c b/gcc/testsuite/gcc.target/i386/vect-addsubv4sf.c
new file mode 100644
index 000000000000..b524f0c35a80
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-addsubv4sf.c
@@ -0,0 +1,46 @@
+/* { dg-do run } */
+/* { dg-require-effective-target sse3 } */
+/* { dg-options "-O3 -msse3 -fdump-tree-slp2" } */
+
+#ifndef CHECK_H
+#define CHECK_H "sse3-check.h"
+#endif
+
+#ifndef TEST
+#define TEST sse3_test
+#endif
+
+#include CHECK_H
+
+float x[4], y[4], z[4];
+void __attribute__((noipa)) foo ()
+{
+  x[0] = y[0] - z[0];
+  x[1] = y[1] + z[1];
+  x[2] = y[2] - z[2];
+  x[3] = y[3] + z[3];
+}
+void __attribute__((noipa)) bar ()
+{
+  x[0] = y[0] + z[0];
+  x[1] = y[1] - z[1];
+  x[2] = y[2] + z[2];
+  x[3] = y[3] - z[3];
+}
+static void
+TEST (void)
+{
+  for (int i = 0; i < 4; ++i)
+    {
+      y[i] = i + 1;
+      z[i] = 2 * i + 1;
+    }
+  foo ();
+  if (x[0] != 0 || x[1] != 5 || x[2] != -2 || x[3] != 11)
+    __builtin_abort ();
+  bar ();
+  if (x[0] != 2 || x[1] != -1 || x[2] != 8 || x[3] != -3)
+    __builtin_abort ();
+}
+
+/* { dg-final { scan-tree-dump-times "ADDSUB" 1 "slp2" } } */
diff --git a/gcc/testsuite/gcc.target/i386/vect-addsubv8sf.c b/gcc/testsuite/gcc.target/i386/vect-addsubv8sf.c
new file mode 100644
index 000000000000..0eed33b65319
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/vect-addsubv8sf.c
@@ -0,0 +1,46 @@
+/* { dg-do run { target avx_runtime } } */
+/* { dg-require-effective-target avx } */
+/* { dg-options "-O3 -mavx -fdump-tree-slp2" } */
+
+float x[8], y[8], z[8];
+void __attribute__((noipa)) foo ()
+{
+  x[0] = y[0] - z[0];
+  x[1] = y[1] + z[1];
+  x[2] = y[2] - z[2];
+  x[3] = y[3] + z[3];
+  x[4] = y[4] - z[4];
+  x[5] = y[5] + z[5];
+  x[6] = y[6] - z[6];
+  x[7] = y[7] + z[7];
+}
+void __attribute__((noipa)) bar ()
+{
+  x[0] = y[0] + z[0];
+  x[1] = y[1] - z[1];
+  x[2] = y[2] + z[2];
+  x[3] = y[3] - z[3];
+  x[4] = y[4] + z[4];
+  x[5] = y[5] - z[5];
+  x[6] = y[6] + z[6];
+  x[7] = y[7] - z[7];
+}
+int main()
+{
+  for (int i = 0; i < 8; ++i)
+    {
+      y[i] = i + 1;
+      z[i] = 2 * i + 1;
+    }
+  foo ();
+  if (x[0] != 0 || x[1] != 5 || x[2] != -2 || x[3] != 11
+      || x[4] != -4 || x[5] != 17 || x[6] != -6 || x[7] != 23)
+    __builtin_abort ();
+  bar ();
+  if (x[0] != 2 || x[1] != -1 || x[2] != 8 || x[3] != -3
+      || x[4] != 14 || x[5] != -5 || x[6] != 20 || x[7] != -7)
+    __builtin_abort ();
+  return 0;
+}
+
+/* { dg-final { scan-tree-dump-times "ADDSUB" 1 "slp2" } } */
diff --git a/gcc/tree-vect-slp-patterns.c b/gcc/tree-vect-slp-patterns.c
index 2ed49cd9edca..d536494a1bd7 100644
--- a/gcc/tree-vect-slp-patterns.c
+++ b/gcc/tree-vect-slp-patterns.c
@@ -1490,6 +1490,105 @@ complex_operations_pattern::build (vec_info * /* vinfo */)
   gcc_unreachable ();
 }
 
+
+/* The addsub_pattern.  */
+
+class addsub_pattern : public vect_pattern
+{
+  public:
+    addsub_pattern (slp_tree *node)
+	: vect_pattern (node, NULL, IFN_VEC_ADDSUB) {};
+
+    void build (vec_info *);
+
+    static vect_pattern*
+    recognize (slp_tree_to_load_perm_map_t *, slp_tree *);
+};
+
+vect_pattern *
+addsub_pattern::recognize (slp_tree_to_load_perm_map_t *, slp_tree *node_)
+{
+  slp_tree node = *node_;
+  if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
+      || SLP_TREE_CHILDREN (node).length () != 2)
+    return NULL;
+
+  /* Match a blend of a plus and a minus op with the same number of plus and
+     minus lanes on the same operands.  */
+  slp_tree sub = SLP_TREE_CHILDREN (node)[0];
+  slp_tree add = SLP_TREE_CHILDREN (node)[1];
+  bool swapped_p = false;
+  if (vect_match_expression_p (sub, PLUS_EXPR))
+    {
+      std::swap (add, sub);
+      swapped_p = true;
+    }
+  if (!(vect_match_expression_p (add, PLUS_EXPR)
+	&& vect_match_expression_p (sub, MINUS_EXPR)))
+    return NULL;
+  if (!((SLP_TREE_CHILDREN (sub)[0] == SLP_TREE_CHILDREN (add)[0]
+	 && SLP_TREE_CHILDREN (sub)[1] == SLP_TREE_CHILDREN (add)[1])
+	|| (SLP_TREE_CHILDREN (sub)[0] == SLP_TREE_CHILDREN (add)[1]
+	    && SLP_TREE_CHILDREN (sub)[1] == SLP_TREE_CHILDREN (add)[0])))
+    return NULL;
+
+  for (unsigned i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
+    {
+      std::pair<unsigned, unsigned> perm = SLP_TREE_LANE_PERMUTATION (node)[i];
+      if (swapped_p)
+	perm.first = perm.first == 0 ? 1 : 0;
+      /* It has to be alternating -, +, -, ...
+	 While we could permute the .ADDSUB inputs and the .ADDSUB output
+	 that's only profitable over the add + sub + blend if at least
+	 one of the permute is optimized which we can't determine here.  */
+      if (perm.first != (i & 1)
+	  || perm.second != i)
+	return NULL;
+    }
+
+  if (!vect_pattern_validate_optab (IFN_VEC_ADDSUB, node))
+    return NULL;
+
+  return new addsub_pattern (node_);
+}
+
+void
+addsub_pattern::build (vec_info *vinfo)
+{
+  slp_tree node = *m_node;
+
+  slp_tree sub = SLP_TREE_CHILDREN (node)[0];
+  slp_tree add = SLP_TREE_CHILDREN (node)[1];
+  if (vect_match_expression_p (sub, PLUS_EXPR))
+    std::swap (add, sub);
+
+  /* Modify the blend node in-place.  */
+  SLP_TREE_CHILDREN (node)[0] = SLP_TREE_CHILDREN (sub)[0];
+  SLP_TREE_CHILDREN (node)[1] = SLP_TREE_CHILDREN (sub)[1];
+  SLP_TREE_REF_COUNT (SLP_TREE_CHILDREN (node)[0])++;
+  SLP_TREE_REF_COUNT (SLP_TREE_CHILDREN (node)[1])++;
+
+  /* Build IFN_VEC_ADDSUB from the sub representative operands.  */
+  stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (sub);
+  gcall *call = gimple_build_call_internal (IFN_VEC_ADDSUB, 2,
+					    gimple_assign_rhs1 (rep->stmt),
+					    gimple_assign_rhs2 (rep->stmt));
+  gimple_call_set_lhs (call, make_ssa_name
+			       (TREE_TYPE (gimple_assign_lhs (rep->stmt))));
+  gimple_call_set_nothrow (call, true);
+  gimple_set_bb (call, gimple_bb (rep->stmt));
+  SLP_TREE_REPRESENTATIVE (node) = vinfo->add_pattern_stmt (call, rep);
+  STMT_VINFO_RELEVANT (SLP_TREE_REPRESENTATIVE (node)) = vect_used_in_scope;
+  STMT_SLP_TYPE (SLP_TREE_REPRESENTATIVE (node)) = pure_slp;
+  STMT_VINFO_VECTYPE (SLP_TREE_REPRESENTATIVE (node)) = SLP_TREE_VECTYPE (node);
+  STMT_VINFO_SLP_VECT_ONLY_PATTERN (SLP_TREE_REPRESENTATIVE (node)) = true;
+  SLP_TREE_CODE (node) = ERROR_MARK;
+  SLP_TREE_LANE_PERMUTATION (node).release ();
+
+  vect_free_slp_tree (sub);
+  vect_free_slp_tree (add);
+}
+
 /*******************************************************************************
  * Pattern matching definitions
  ******************************************************************************/
@@ -1502,6 +1601,7 @@ vect_pattern_decl_t slp_patterns[]
      overlap in what they can detect.  */
 
   SLP_PATTERN (complex_operations_pattern),
+  SLP_PATTERN (addsub_pattern)
 };
 #undef SLP_PATTERN
 
diff --git a/gcc/tree-vect-slp.c b/gcc/tree-vect-slp.c
index 69ee8faed094..227d6aa3ee87 100644
--- a/gcc/tree-vect-slp.c
+++ b/gcc/tree-vect-slp.c
@@ -3705,6 +3705,7 @@ vect_optimize_slp (vec_info *vinfo)
 	      case CFN_COMPLEX_ADD_ROT270:
 	      case CFN_COMPLEX_MUL:
 	      case CFN_COMPLEX_MUL_CONJ:
+	      case CFN_VEC_ADDSUB:
 		continue;
 	      default:;
 	      }
diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h
index 5c71fbc487f7..fa28336d4295 100644
--- a/gcc/tree-vectorizer.h
+++ b/gcc/tree-vectorizer.h
@@ -2100,7 +2100,8 @@ class vect_pattern
       this->m_ifn = ifn;
       this->m_node = node;
       this->m_ops.create (0);
-      this->m_ops.safe_splice (*m_ops);
+      if (m_ops)
+	this->m_ops.safe_splice (*m_ops);
     }
 
   public:
-- 
GitLab