From 3635e8c67e13e3da7e1e23a617dd9952218e93e0 Mon Sep 17 00:00:00 2001
From: Roger Sayle <roger@nextmovesoftware.com>
Date: Thu, 1 Jun 2023 15:10:09 +0100
Subject: [PATCH] PR target/109973: CCZmode and CCCmode variants of [v]ptest on
 x86.

This is my proposed minimal fix for PR target/109973 (hopefully suitable
for backporting) that follows Jakub Jelinek's suggestion that we introduce
CCZmode and CCCmode variants of ptest and vptest, so that the i386
backend treats [v]ptest instructions similarly to testl instructions;
using different CCmodes to indicate which condition flags are desired,
and then relying on the RTL cmpelim pass to eliminate redundant tests.

This conveniently matches Intel's intrinsics, that provide different
functions for retrieving different flags, _mm_testz_si128 tests the
Z flag, _mm_testc_si128 tests the carry flag.  Currently we use the
same instruction (pattern) for both, and unfortunately the *ptest<mode>_and
optimization is only valid when the ptest/vptest instruction is used to
set/test the Z flag.

The downside, as predicted by Jakub, is that GCC's cmpelim pass is
currently COMPARE-centric and not able to merge the ptests from expressions
such as _mm256_testc_si256 (a, b) + _mm256_testz_si256 (a, b), which is a
known issue, PR target/80040.

2023-06-01  Roger Sayle  <roger@nextmovesoftware.com>
	    Uros Bizjak  <ubizjak@gmail.com>

gcc/ChangeLog
	PR target/109973
	* config/i386/i386-builtin.def (__builtin_ia32_ptestz128): Use new
	CODE_for_sse4_1_ptestzv2di.
	(__builtin_ia32_ptestc128): Use new CODE_for_sse4_1_ptestcv2di.
	(__builtin_ia32_ptestz256): Use new CODE_for_avx_ptestzv4di.
	(__builtin_ia32_ptestc256): Use new CODE_for_avx_ptestcv4di.
	* config/i386/i386-expand.cc (ix86_expand_branch): Use CCZmode
	when expanding UNSPEC_PTEST to compare against zero.
	* config/i386/i386-features.cc (scalar_chain::convert_compare):
	Likewise generate CCZmode UNSPEC_PTESTs when converting comparisons.
	(general_scalar_chain::convert_insn): Use CCZmode for COMPARE result.
	(timode_scalar_chain::convert_insn): Use CCZmode for COMPARE result.
	* config/i386/i386-protos.h (ix86_match_ptest_ccmode): Prototype.
	* config/i386/i386.cc (ix86_match_ptest_ccmode): New predicate to
	check for suitable matching modes for the UNSPEC_PTEST pattern.
	* config/i386/sse.md (define_split): When splitting UNSPEC_MOVMSK
	to UNSPEC_PTEST, preserve the FLAG_REG mode as CCZ.
	(*<sse4_1>_ptest<mode>): Add asterisk to hide define_insn.  Remove
	":CC" mode of FLAGS_REG, instead use ix86_match_ptest_ccmode.
	(<sse4_1>_ptestz<mode>): New define_expand to specify CCZ.
	(<sse4_1>_ptestc<mode>): New define_expand to specify CCC.
	(<sse4_1>_ptest<mode>): A define_expand using CC to preserve the
	current behavior.
	(*ptest<mode>_and): Specify CCZ to only perform this optimization
	when only the Z flag is required.

gcc/testsuite/ChangeLog
	PR target/109973
	* gcc.target/i386/pr109973-1.c: New test case.
	* gcc.target/i386/pr109973-2.c: Likewise.
---
 gcc/config/i386/i386-builtin.def           |  8 +--
 gcc/config/i386/i386-expand.cc             |  4 +-
 gcc/config/i386/i386-features.cc           | 10 ++--
 gcc/config/i386/i386-protos.h              |  1 +
 gcc/config/i386/i386.cc                    | 23 +++++++++
 gcc/config/i386/sse.md                     | 58 +++++++++++++++-------
 gcc/testsuite/gcc.target/i386/pr109973-1.c | 13 +++++
 gcc/testsuite/gcc.target/i386/pr109973-2.c | 13 +++++
 8 files changed, 102 insertions(+), 28 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr109973-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr109973-2.c

diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def
index c91e3809c75e..383b68a9bb8a 100644
--- a/gcc/config/i386/i386-builtin.def
+++ b/gcc/config/i386/i386-builtin.def
@@ -1004,8 +1004,8 @@ BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_roundps_sfix, "__builtin_ia32_
 BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_roundv4sf2, "__builtin_ia32_roundps_az", IX86_BUILTIN_ROUNDPS_AZ, UNKNOWN, (int) V4SF_FTYPE_V4SF)
 BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_roundv4sf2_sfix, "__builtin_ia32_roundps_az_sfix", IX86_BUILTIN_ROUNDPS_AZ_SFIX, UNKNOWN, (int) V4SI_FTYPE_V4SF)
 
-BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_ptestv2di, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST)
-BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_ptestv2di, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST)
+BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_ptestzv2di, "__builtin_ia32_ptestz128", IX86_BUILTIN_PTESTZ, EQ, (int) INT_FTYPE_V2DI_V2DI_PTEST)
+BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_ptestcv2di, "__builtin_ia32_ptestc128", IX86_BUILTIN_PTESTC, LTU, (int) INT_FTYPE_V2DI_V2DI_PTEST)
 BDESC (OPTION_MASK_ISA_SSE4_1, 0, CODE_FOR_sse4_1_ptestv2di, "__builtin_ia32_ptestnzc128", IX86_BUILTIN_PTESTNZC, GTU, (int) INT_FTYPE_V2DI_V2DI_PTEST)
 
 /* SSE4.2 */
@@ -1164,8 +1164,8 @@ BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzc
 BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST)
 BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST)
 BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST)
-BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_ptestv4di, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST)
-BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_ptestv4di, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST)
+BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_ptestzv4di, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST)
+BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_ptestcv4di, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST)
 BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_ptestv4di, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST)
 
 BDESC (OPTION_MASK_ISA_AVX, 0, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF )
diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 70c6971ac03f..ac674418b960 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -2370,8 +2370,8 @@ ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
       tmp = gen_reg_rtx (mode);
       emit_insn (gen_rtx_SET (tmp, gen_rtx_XOR (mode, op0, op1)));
       tmp = gen_lowpart (p_mode, tmp);
-      emit_insn (gen_rtx_SET (gen_rtx_REG (CCmode, FLAGS_REG),
-			      gen_rtx_UNSPEC (CCmode,
+      emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
+			      gen_rtx_UNSPEC (CCZmode,
 					      gen_rtvec (2, tmp, tmp),
 					      UNSPEC_PTEST)));
       tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index a0a73486d369..3417f6bd0d13 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -974,7 +974,7 @@ general_scalar_chain::convert_op (rtx *op, rtx_insn *insn)
     }
 }
 
-/* Convert COMPARE to vector mode.  */
+/* Convert CCZmode COMPARE to vector mode.  */
 
 rtx
 scalar_chain::convert_compare (rtx op1, rtx op2, rtx_insn *insn)
@@ -1023,7 +1023,7 @@ scalar_chain::convert_compare (rtx op1, rtx op2, rtx_insn *insn)
 	  emit_insn_before (gen_rtx_SET (tmp, op11), insn);
 	  op11 = tmp;
 	}
-      return gen_rtx_UNSPEC (CCmode, gen_rtvec (2, op11, op12),
+      return gen_rtx_UNSPEC (CCZmode, gen_rtvec (2, op11, op12),
 			     UNSPEC_PTEST);
     }
   else
@@ -1052,7 +1052,7 @@ scalar_chain::convert_compare (rtx op1, rtx op2, rtx_insn *insn)
       src = tmp;
     }
 
-  return gen_rtx_UNSPEC (CCmode, gen_rtvec (2, src, src), UNSPEC_PTEST);
+  return gen_rtx_UNSPEC (CCZmode, gen_rtvec (2, src, src), UNSPEC_PTEST);
 }
 
 /* Helper function for converting INSN to vector mode.  */
@@ -1219,7 +1219,7 @@ general_scalar_chain::convert_insn (rtx_insn *insn)
       break;
 
     case COMPARE:
-      dst = gen_rtx_REG (CCmode, FLAGS_REG);
+      dst = gen_rtx_REG (CCZmode, FLAGS_REG);
       src = convert_compare (XEXP (src, 0), XEXP (src, 1), insn);
       break;
 
@@ -1726,7 +1726,7 @@ timode_scalar_chain::convert_insn (rtx_insn *insn)
       break;
 
     case COMPARE:
-      dst = gen_rtx_REG (CCmode, FLAGS_REG);
+      dst = gen_rtx_REG (CCZmode, FLAGS_REG);
       src = convert_compare (XEXP (src, 0), XEXP (src, 1), insn);
       break;
 
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index d0f5783173ee..af01299c1303 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -140,6 +140,7 @@ extern void ix86_expand_copysign (rtx []);
 extern void ix86_expand_xorsign (rtx []);
 extern bool ix86_unary_operator_ok (enum rtx_code, machine_mode, rtx[2]);
 extern bool ix86_match_ccmode (rtx, machine_mode);
+extern bool ix86_match_ptest_ccmode (rtx);
 extern void ix86_expand_branch (enum rtx_code, rtx, rtx, rtx);
 extern void ix86_expand_setcc (rtx, enum rtx_code, rtx, rtx);
 extern bool ix86_expand_int_movcc (rtx[]);
diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
index e548d6588289..d4ff56ee8dd8 100644
--- a/gcc/config/i386/i386.cc
+++ b/gcc/config/i386/i386.cc
@@ -15987,6 +15987,29 @@ ix86_cc_mode (enum rtx_code code, rtx op0, rtx op1)
     }
 }
 
+/* Return TRUE or FALSE depending on whether the ptest instruction
+   INSN has source and destination with suitable matching CC modes.  */
+
+bool
+ix86_match_ptest_ccmode (rtx insn)
+{
+  rtx set, src;
+  machine_mode set_mode;
+
+  set = PATTERN (insn);
+  gcc_assert (GET_CODE (set) == SET);
+  src = SET_SRC (set);
+  gcc_assert (GET_CODE (src) == UNSPEC
+	      && XINT (src, 1) == UNSPEC_PTEST);
+
+  set_mode = GET_MODE (src);
+  if (set_mode != CCZmode
+      && set_mode != CCCmode
+      && set_mode != CCmode)
+    return false;
+  return GET_MODE (SET_DEST (set)) == set_mode;
+}
+
 /* Return the fixed registers used for condition codes.  */
 
 static bool
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index a92f50e96b59..9bec09d354aa 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -20423,10 +20423,10 @@
 		    UNSPEC_MOVMSK)
 		 (match_operand 2 "const_int_operand")))]
   "TARGET_SSE4_1 && (INTVAL (operands[2]) == (int) (<vi1avx2const>))"
-  [(set (reg:CC FLAGS_REG)
-	(unspec:CC [(match_dup 0)
-		    (match_dup 0)]
-		   UNSPEC_PTEST))])
+  [(set (reg:CCZ FLAGS_REG)
+	(unspec:CCZ [(match_dup 0)
+		     (match_dup 0)]
+		    UNSPEC_PTEST))])
 
 (define_expand "sse2_maskmovdqu"
   [(set (match_operand:V16QI 0 "memory_operand")
@@ -23078,13 +23078,13 @@
    (set_attr "mode" "<MODE>")])
 
 ;; ptest is very similar to comiss and ucomiss when setting FLAGS_REG.
-;; But it is not a really compare instruction.
-(define_insn "<sse4_1>_ptest<mode>"
-  [(set (reg:CC FLAGS_REG)
-	(unspec:CC [(match_operand:V_AVX 0 "register_operand" "Yr, *x, x")
-		    (match_operand:V_AVX 1 "vector_operand" "YrBm, *xBm, xm")]
-		   UNSPEC_PTEST))]
-  "TARGET_SSE4_1"
+;; But it is not really a compare instruction.
+(define_insn "*<sse4_1>_ptest<mode>"
+  [(set (reg FLAGS_REG)
+	(unspec [(match_operand:V_AVX 0 "register_operand" "Yr, *x, x")
+		 (match_operand:V_AVX 1 "vector_operand" "YrBm, *xBm, xm")]
+		UNSPEC_PTEST))]
+  "TARGET_SSE4_1 && ix86_match_ptest_ccmode (insn)"
   "%vptest\t{%1, %0|%0, %1}"
   [(set_attr "isa" "noavx,noavx,avx")
    (set_attr "type" "ssecomi")
@@ -23097,6 +23097,30 @@
      (const_string "*")))
    (set_attr "mode" "<sseinsnmode>")])
 
+;; Expand a ptest to set the Z flag.
+(define_expand "<sse4_1>_ptestz<mode>"
+  [(set (reg:CCZ FLAGS_REG)
+	(unspec:CCZ [(match_operand:V_AVX 0 "register_operand")
+		     (match_operand:V_AVX 1 "vector_operand")]
+		    UNSPEC_PTEST))]
+  "TARGET_SSE4_1")
+
+;; Expand a ptest to set the C flag
+(define_expand "<sse4_1>_ptestc<mode>"
+  [(set (reg:CCC FLAGS_REG)
+	(unspec:CCC [(match_operand:V_AVX 0 "register_operand")
+		     (match_operand:V_AVX 1 "vector_operand")]
+		    UNSPEC_PTEST))]
+  "TARGET_SSE4_1")
+
+;; Expand a ptest to set both the Z and C flags
+(define_expand "<sse4_1>_ptest<mode>"
+  [(set (reg:CC FLAGS_REG)
+	(unspec:CC [(match_operand:V_AVX 0 "register_operand")
+		    (match_operand:V_AVX 1 "vector_operand")]
+		   UNSPEC_PTEST))]
+  "TARGET_SSE4_1")
+
 (define_insn "ptesttf2"
   [(set (reg:CC FLAGS_REG)
 	(unspec:CC [(match_operand:TF 0 "register_operand" "Yr, *x, x")
@@ -23111,17 +23135,17 @@
    (set_attr "mode" "TI")])
 
 (define_insn_and_split "*ptest<mode>_and"
-  [(set (reg:CC FLAGS_REG)
-	(unspec:CC [(and:V_AVX (match_operand:V_AVX 0 "register_operand")
-			       (match_operand:V_AVX 1 "vector_operand"))
-		    (and:V_AVX (match_dup 0) (match_dup 1))]
+  [(set (reg:CCZ FLAGS_REG)
+	(unspec:CCZ [(and:V_AVX (match_operand:V_AVX 0 "register_operand")
+				(match_operand:V_AVX 1 "vector_operand"))
+		     (and:V_AVX (match_dup 0) (match_dup 1))]
 		   UNSPEC_PTEST))]
   "TARGET_SSE4_1
    && ix86_pre_reload_split ()"
   "#"
   "&& 1"
-  [(set (reg:CC FLAGS_REG)
-	(unspec:CC [(match_dup 0) (match_dup 1)] UNSPEC_PTEST))])
+  [(set (reg:CCZ FLAGS_REG)
+	(unspec:CCZ [(match_dup 0) (match_dup 1)] UNSPEC_PTEST))])
 
 (define_expand "nearbyint<mode>2"
   [(set (match_operand:VFH 0 "register_operand")
diff --git a/gcc/testsuite/gcc.target/i386/pr109973-1.c b/gcc/testsuite/gcc.target/i386/pr109973-1.c
new file mode 100644
index 000000000000..a1b6136bfeaa
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr109973-1.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -mavx2" } */
+
+typedef long long __m256i __attribute__ ((__vector_size__ (32)));
+
+int
+foo (__m256i x, __m256i y)
+{
+  __m256i a = x & y;
+  return __builtin_ia32_ptestc256 (a, a);
+}
+
+/* { dg-final { scan-assembler "vpand" } } */
diff --git a/gcc/testsuite/gcc.target/i386/pr109973-2.c b/gcc/testsuite/gcc.target/i386/pr109973-2.c
new file mode 100644
index 000000000000..167f6ee02e25
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/pr109973-2.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -msse4.1" } */
+
+typedef long long __m128i __attribute__ ((__vector_size__ (16)));
+
+int
+foo (__m128i x, __m128i y)
+{
+  __m128i a = x & y;
+  return __builtin_ia32_ptestc128 (a, a);
+}
+
+/* { dg-final { scan-assembler "pand" } } */
-- 
GitLab