From d92d26ff366c5ceb4795d1ad678847c1b3d88e63 Mon Sep 17 00:00:00 2001
From: Haochen Gui <guihaoc@gcc.gnu.org>
Date: Wed, 27 Dec 2023 10:32:21 +0800
Subject: [PATCH] rs6000: Clean up the pre-checkings of expand_block_compare

Remove the P7 CPU test, as only P7 and above can enter this function and
P7 LE is excluded by the targetm.slow_unaligned_access check on word_mode.
Also, performance testing shows that expanding the block compare inline is
better than calling the library on P7 BE when the length is from 16 bytes
to 64 bytes.

gcc/
	* config/rs6000/rs6000-string.cc (expand_block_compare): Assert
	that only P7 and above can enter this function.  Remove the P7 CPU
	test and let P7 BE do the expand.

gcc/testsuite/
	* gcc.target/powerpc/block-cmp-4.c: New.
---
 gcc/config/rs6000/rs6000-string.cc            | 35 ++++++-------------
 .../gcc.target/powerpc/block-cmp-4.c          | 11 ++++++
 2 files changed, 21 insertions(+), 25 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/block-cmp-4.c

diff --git a/gcc/config/rs6000/rs6000-string.cc b/gcc/config/rs6000/rs6000-string.cc
index fa80298470e5..133e5382af20 100644
--- a/gcc/config/rs6000/rs6000-string.cc
+++ b/gcc/config/rs6000/rs6000-string.cc
@@ -1947,11 +1947,8 @@ expand_block_compare_gpr(unsigned HOST_WIDE_INT bytes, unsigned int base_align,
 bool
 expand_block_compare (rtx operands[])
 {
-  rtx target = operands[0];
-  rtx orig_src1 = operands[1];
-  rtx orig_src2 = operands[2];
-  rtx bytes_rtx = operands[3];
-  rtx align_rtx = operands[4];
+  /* The cmpmemsi expander already guards on TARGET_POPCNTD.  */
+  gcc_assert (TARGET_POPCNTD);
 
   /* This case is complicated to handle because the subtract
      with carry instructions do not generate the 64-bit
@@ -1960,23 +1957,19 @@ expand_block_compare (rtx operands[])
   if (TARGET_32BIT && TARGET_POWERPC64)
     return false;
 
-  bool isP7 = (rs6000_tune == PROCESSOR_POWER7);
-
   /* Allow this param to shut off all expansion.  */
   if (rs6000_block_compare_inline_limit == 0)
     return false;
 
-  /* targetm.slow_unaligned_access -- don't do unaligned stuff.
-     However slow_unaligned_access returns true on P7 even though the
-     performance of this code is good there.  */
-  if (!isP7
-      && (targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src1))
-	  || targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src2))))
-    return false;
+  rtx target = operands[0];
+  rtx orig_src1 = operands[1];
+  rtx orig_src2 = operands[2];
+  rtx bytes_rtx = operands[3];
+  rtx align_rtx = operands[4];
 
-  /* Unaligned l*brx traps on P7 so don't do this.  However this should
-     not affect much because LE isn't really supported on P7 anyway.  */
-  if (isP7 && !BYTES_BIG_ENDIAN)
+  /* targetm.slow_unaligned_access -- don't do unaligned stuff.  */
+  if (targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src1))
+      || targetm.slow_unaligned_access (word_mode, MEM_ALIGN (orig_src2)))
     return false;
 
   /* If this is not a fixed size compare, try generating loop code and
@@ -2025,14 +2018,6 @@ expand_block_compare (rtx operands[])
   if (!IN_RANGE (bytes, 1, max_bytes))
     return expand_compare_loop (operands);
 
-  /* The code generated for p7 and older is not faster than glibc
-     memcmp if alignment is small and length is not short, so bail
-     out to avoid those conditions.  */
-  if (targetm.slow_unaligned_access (word_mode, align_by_bits)
-      && ((base_align == 1 && bytes > 16)
-	  || (base_align == 2 && bytes > 32)))
-    return false;
-
   rtx final_label = NULL;
 
   if (use_vec)
diff --git a/gcc/testsuite/gcc.target/powerpc/block-cmp-4.c b/gcc/testsuite/gcc.target/powerpc/block-cmp-4.c
new file mode 100644
index 000000000000..c86febae68a8
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/block-cmp-4.c
@@ -0,0 +1,11 @@
+/* { dg-do compile { target be } } */
+/* { dg-options "-O2 -mdejagnu-cpu=power7" } */
+/* { dg-final { scan-assembler-not {\mb[l]? memcmp\M} } }  */
+
+/* Test that cmpmemsi is expanded inline instead of calling the library on
+   P7 BE when the length is less than 32 bytes.  */
+
+int foo (const char* s1, const char* s2)
+{
+  return __builtin_memcmp (s1, s2, 31);
+}
-- 
GitLab