From 886ce970eb096bb302228c891f0c8a889c79ad40 Mon Sep 17 00:00:00 2001
From: Jakub Jelinek <jakub@redhat.com>
Date: Wed, 5 Feb 2025 13:16:17 +0100
Subject: [PATCH] cselib: For CALL_INSNs to const/pure fns invalidate memory
 below sp [PR117239]

The following testcase is miscompiled on x86_64 during postreload.
After reload (with IPA-RA figuring out the calls don't modify any
registers but %rax for return value) postreload sees
(insn 14 12 15 2 (set (mem:DI (plus:DI (reg/f:DI 7 sp)
                (const_int 16 [0x10])) [0  S8 A64])
        (reg:DI 1 dx [orig:105 q+16 ] [105])) "pr117239.c":18:7 95 {*movdi_internal}
     (nil))
(call_insn/i 15 14 16 2 (set (reg:SI 0 ax)
        (call (mem:QI (symbol_ref:DI ("baz") [flags 0x3]  <function_decl 0x7ffb2e2bdf00 r>) [0 baz S1 A8])
            (const_int 24 [0x18]))) "pr117239.c":18:7 1476 {*call_value}
     (expr_list:REG_CALL_DECL (symbol_ref:DI ("baz") [flags 0x3]  <function_decl 0x7ffb2e2bdf00 baz>)
        (expr_list:REG_EH_REGION (const_int 0 [0])
            (nil)))
    (nil))
(insn 16 15 18 2 (parallel [
            (set (reg/f:DI 7 sp)
                (plus:DI (reg/f:DI 7 sp)
                    (const_int 24 [0x18])))
            (clobber (reg:CC 17 flags))
        ]) "pr117239.c":18:7 285 {*adddi_1}
     (expr_list:REG_ARGS_SIZE (const_int 0 [0])
        (nil)))
...
(call_insn/i 19 18 21 2 (set (reg:SI 0 ax)
        (call (mem:QI (symbol_ref:DI ("foo") [flags 0x3]  <function_decl 0x7ffb2e2bdb00 l>) [0 foo S1 A8])
            (const_int 0 [0]))) "pr117239.c":19:3 1476 {*call_value}
     (expr_list:REG_CALL_DECL (symbol_ref:DI ("foo") [flags 0x3]  <function_decl 0x7ffb2e2bdb00 foo>)
        (expr_list:REG_EH_REGION (const_int 0 [0])
            (nil)))
    (nil))
(insn 21 19 26 2 (parallel [
            (set (reg/f:DI 7 sp)
                (plus:DI (reg/f:DI 7 sp)
                    (const_int -24 [0xffffffffffffffe8])))
            (clobber (reg:CC 17 flags))
        ]) "pr117239.c":19:3 discrim 1 285 {*adddi_1}
     (expr_list:REG_ARGS_SIZE (const_int 24 [0x18])
        (nil)))
(insn 26 21 24 2 (set (mem:DI (plus:DI (reg/f:DI 7 sp)
                (const_int 16 [0x10])) [0  S8 A64])
        (reg:DI 1 dx [orig:105 q+16 ] [105])) "pr117239.c":19:3 discrim 1 95 {*movdi_internal}
     (nil))
i.e.
        movq    %rdx, 16(%rsp)
        call    baz
        addq    $24, %rsp
...
        call    foo
        subq    $24, %rsp
        movq    %rdx, 16(%rsp)
Now, postreload uses cselib and cselib remembered that %rdx value has been
stored into 16(%rsp).  Both baz and foo are pure calls.  If they weren't,
when processing those CALL_INSNs cselib would invalidate all MEMs
      if (RTL_LOOPING_CONST_OR_PURE_CALL_P (insn)
          || !(RTL_CONST_OR_PURE_CALL_P (insn)))
        cselib_invalidate_mem (callmem);
where callmem is (mem:BLK (scratch)).  But they are pure, so instead the
code just invalidates the argument slots from CALL_INSN_FUNCTION_USAGE.
The calls actually clobber more than that, even const/pure calls clobber
all memory below the stack pointer.  And that is something that hasn't been
invalidated.  In this failing testcase, the call to baz is not a big deal,
we don't have anything remembered in memory below %rsp at that call.
But then we increment %rsp by 24, so the %rsp+16 is now 8 bytes below stack
and do the call to foo.  And that call now actually, not just in theory,
clobbers the memory below the stack pointer (in particular overwrites it
with the return value).  But cselib does not invalidate.  Then %rsp
is decremented again (in preparation for another call, to bar) and cselib
is processing store of %rdx (which IPA-RA says has not been modified by
either baz or foo calls) to %rsp + 16, and it sees the memory already has
that value, so the store is useless, let's remove it.
But it is not, the call to foo has changed it, so it needs to be stored
again.

The following patch adds targeted invalidation of memory below stack
pointer (or on SPARC memory below stack pointer + 2047 when stack bias is
used, or on PA memory above stack pointer instead).
It does so only in !ACCUMULATE_OUTGOING_ARGS or cfun->calls_alloca functions,
because in other functions the stack pointer should be constant from
the end of prologue till start of epilogue and so nothing should be stored
within the function below the stack pointer.

Now, memory below stack pointer is special, except for functions using
alloca/VLAs I believe no addressable memory should be there, it should be
purely outgoing function argument area, if we take address of some automatic
variable, it should live all the time above the outgoing function argument
area.  So on top of just trying to flush memory below stack pointer
(represented by %rsp - PTRDIFF_MAX with PTRDIFF_MAX size on most arches),
the patch tries to optimize and only invalidate memory that has address
clearly derived from stack pointer (memory with other bases is not
invalidated) and if we can prove (we see same SP_DERIVED_VALUE_P bases in
both VALUEs) it is above current stack, also don't call
canon_anti_dependence which might just give up in certain cases.

I've gathered statistics from x86_64-linux and i686-linux
bootstraps/regtests.  During -m64 compilations from those, there were
3718396 + 42634 + 27761 cases of processing MEMs in cselib_invalidate_mem
(callmem[1]) calls, the first number is number of MEMs not invalidated
because of the optimization, i.e.
+             if (sp_derived_base == NULL_RTX)
+               {
+                 has_mem = true;
+                 num_mems++;
+                 p = &(*p)->next;
+                 continue;
+               }
in the patch, the second number is number of MEMs not invalidated because
canon_anti_dependence returned false and finally the last number is number
of MEMs actually invalidated (so that is what hasn't been invalidated
before).  During -m32 compilations the numbers were
1422412 + 39354 + 16509 with the same meaning.

Note, when there is no red zone, in theory even the sp = sp + incr
instruction invalidates memory below the new stack pointer, as signal
can come and overwrite the memory.  So maybe we should be invalidating
something at those instructions as well.  But in leaf functions we certainly
can have even addressable automatic vars in the red zone (which would make
it harder to distinguish), though on the other hand we aren't normally
storing anything below the red zone, and in non-leaf functions it should
normally be just the outgoing arguments area.

2025-02-05  Jakub Jelinek  <jakub@redhat.com>

	PR rtl-optimization/117239
	* cselib.cc: Include predict.h.
	(callmem): Change type from rtx to rtx[2].
	(cselib_preserve_only_values): Use callmem[0] rather than callmem.
	(cselib_invalidate_mem): Optimize and don't try to invalidate
	for the mem_rtx == callmem[1] case MEMs which clearly can't be
	below the stack pointer.
	(cselib_process_insn): Use callmem[0] rather than callmem.
	For const/pure calls also call cselib_invalidate_mem (callmem[1])
	in !ACCUMULATE_OUTGOING_ARGS or cfun->calls_alloca functions.
	(cselib_init): Initialize callmem[0] rather than callmem and also
	initialize callmem[1].

	* gcc.dg/pr117239.c: New test.
---
 gcc/cselib.cc                   | 169 +++++++++++++++++++++++++++++---
 gcc/testsuite/gcc.dg/pr117239.c |  42 ++++++++
 2 files changed, 199 insertions(+), 12 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/pr117239.c

diff --git a/gcc/cselib.cc b/gcc/cselib.cc
index aedde9fdd607..d18208e51440 100644
--- a/gcc/cselib.cc
+++ b/gcc/cselib.cc
@@ -33,6 +33,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "cselib.h"
 #include "function-abi.h"
 #include "alias.h"
+#include "predict.h"
 
 /* A list of cselib_val structures.  */
 struct elt_list
@@ -248,8 +249,9 @@ static unsigned int *used_regs;
 static unsigned int n_used_regs;
 
 /* We pass this to cselib_invalidate_mem to invalidate all of
-   memory for a non-const call instruction.  */
-static GTY(()) rtx callmem;
+   memory for a non-const call instruction and memory below stack pointer
+   for const/pure calls.  */
+static GTY(()) rtx callmem[2];
 
 /* Set by discard_useless_locs if it deleted the last location of any
    value.  */
@@ -808,7 +810,7 @@ cselib_preserve_only_values (void)
   for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
     cselib_invalidate_regno (i, reg_raw_mode[i]);
 
-  cselib_invalidate_mem (callmem);
+  cselib_invalidate_mem (callmem[0]);
 
   remove_useless_values ();
 
@@ -2630,6 +2632,8 @@ cselib_invalidate_mem (rtx mem_rtx)
       struct elt_loc_list **p = &v->locs;
       bool had_locs = v->locs != NULL;
       rtx_insn *setting_insn = v->locs ? v->locs->setting_insn : NULL;
+      rtx sp_base = NULL_RTX;
+      HOST_WIDE_INT sp_off = 0;
 
       while (*p)
 	{
@@ -2644,6 +2648,114 @@ cselib_invalidate_mem (rtx mem_rtx)
 	      p = &(*p)->next;
 	      continue;
 	    }
+
+	  /* When invalidating memory below the stack pointer for const/pure
+	     calls and alloca/VLAs aren't used, attempt to optimize.  Values
+	     stored into area sometimes below the stack pointer shouldn't be
+	     addressable and should be stored just through stack pointer
+	     derived expressions, so don't invalidate MEMs not using stack
+	     derived addresses, or if the MEMs clearly aren't below the stack
+	     pointer.  This isn't a fully conservative approach, the hope is
+	     that invalidating more MEMs than this isn't actually needed.  */
+	  if (mem_rtx == callmem[1]
+	      && num_mems < param_max_cselib_memory_locations
+	      && GET_CODE (XEXP (x, 0)) == VALUE
+	      && !cfun->calls_alloca)
+	    {
+	      cselib_val *v2 = CSELIB_VAL_PTR (XEXP (x, 0));
+	      rtx x_base = NULL_RTX;
+	      HOST_WIDE_INT x_off = 0;
+	      if (SP_DERIVED_VALUE_P (v2->val_rtx))
+		x_base = v2->val_rtx;
+	      else
+		for (struct elt_loc_list *l = v2->locs; l; l = l->next)
+		  if (GET_CODE (l->loc) == PLUS
+		      && GET_CODE (XEXP (l->loc, 0)) == VALUE
+		      && SP_DERIVED_VALUE_P (XEXP (l->loc, 0))
+		      && CONST_INT_P (XEXP (l->loc, 1)))
+		    {
+		      x_base = XEXP (l->loc, 0);
+		      x_off = INTVAL (XEXP (l->loc, 1));
+		      break;
+		    }
+	      /* If x_base is NULL here, don't invalidate x as its address
+		 isn't derived from sp such that it could be in outgoing
+		 argument area of some call in !ACCUMULATE_OUTGOING_ARGS
+		 function.  */
+	      if (x_base)
+		{
+		  if (sp_base == NULL_RTX)
+		    {
+		      if (cselib_val *v3
+			  = cselib_lookup_1 (stack_pointer_rtx, Pmode, 0,
+					     VOIDmode))
+			{
+			  if (SP_DERIVED_VALUE_P (v3->val_rtx))
+			    sp_base = v3->val_rtx;
+			  else
+			    for (struct elt_loc_list *l = v3->locs;
+				 l; l = l->next)
+			      if (GET_CODE (l->loc) == PLUS
+				  && GET_CODE (XEXP (l->loc, 0)) == VALUE
+				  && SP_DERIVED_VALUE_P (XEXP (l->loc, 0))
+				  && CONST_INT_P (XEXP (l->loc, 1)))
+				{
+				  sp_base = XEXP (l->loc, 0);
+				  sp_off = INTVAL (XEXP (l->loc, 1));
+				  break;
+				}
+			}
+		      if (sp_base == NULL_RTX)
+			sp_base = pc_rtx;
+		    }
+		  /* Otherwise, if x_base and sp_base are the same,
+		     we know that x_base + x_off is the x's address and
+		     sp_base + sp_off is current value of stack pointer,
+		     so try to determine if x is certainly not below stack
+		     pointer.  */
+		  if (sp_base == x_base)
+		    {
+		      if (STACK_GROWS_DOWNWARD)
+			{
+			  HOST_WIDE_INT off = sp_off;
+#ifdef STACK_ADDRESS_OFFSET
+			  /* On SPARC take stack pointer bias into account as
+			     well.  */
+			  off += (STACK_ADDRESS_OFFSET
+				  - FIRST_PARM_OFFSET (current_function_decl));
+#endif
+			  if (x_off >= off)
+			    /* x is at or above the current stack pointer,
+			       no need to invalidate it.  */
+			    x_base = NULL_RTX;
+			}
+		      else
+			{
+			  HOST_WIDE_INT sz;
+			  enum machine_mode mode = GET_MODE (x);
+			  if ((MEM_SIZE_KNOWN_P (x)
+			       && MEM_SIZE (x).is_constant (&sz))
+			      || (mode != BLKmode
+				  && GET_MODE_SIZE (mode).is_constant (&sz)))
+			    if (x_off < sp_off
+				&& ((HOST_WIDE_INT) ((unsigned HOST_WIDE_INT)
+						     x_off + sz) <= sp_off))
+			      /* x's end is below or at the current stack
+				 pointer in !STACK_GROWS_DOWNWARD target,
+				 no need to invalidate it.  */
+			      x_base = NULL_RTX;
+			}
+		    }
+		}
+	      if (x_base == NULL_RTX)
+		{
+		  has_mem = true;
+		  num_mems++;
+		  p = &(*p)->next;
+		  continue;
+		}
+	    }
+
 	  if (num_mems < param_max_cselib_memory_locations
 	      && ! canon_anti_dependence (x, false, mem_rtx,
 					  GET_MODE (mem_rtx), mem_addr))
@@ -3196,14 +3308,24 @@ cselib_process_insn (rtx_insn *insn)
 	 as if they were regular functions.  */
       if (RTL_LOOPING_CONST_OR_PURE_CALL_P (insn)
 	  || !(RTL_CONST_OR_PURE_CALL_P (insn)))
-	cselib_invalidate_mem (callmem);
+	cselib_invalidate_mem (callmem[0]);
       else
-	/* For const/pure calls, invalidate any argument slots because
-	   they are owned by the callee.  */
-	for (x = CALL_INSN_FUNCTION_USAGE (insn); x; x = XEXP (x, 1))
-	  if (GET_CODE (XEXP (x, 0)) == USE
-	      && MEM_P (XEXP (XEXP (x, 0), 0)))
-	    cselib_invalidate_mem (XEXP (XEXP (x, 0), 0));
+	{
+	  /* For const/pure calls, invalidate any argument slots because
+	     they are owned by the callee.  */
+	  for (x = CALL_INSN_FUNCTION_USAGE (insn); x; x = XEXP (x, 1))
+	    if (GET_CODE (XEXP (x, 0)) == USE
+		&& MEM_P (XEXP (XEXP (x, 0), 0)))
+	      cselib_invalidate_mem (XEXP (XEXP (x, 0), 0));
+	  /* And invalidate memory below the stack (or above for
+	     !STACK_GROWS_DOWNWARD), as even const/pure call can invalidate
+	     that.  Do this only if !ACCUMULATE_OUTGOING_ARGS or if
+	     cfun->calls_alloca, otherwise the stack pointer shouldn't be
+	     changing in the middle of the function and nothing should be
+	     stored below the stack pointer.  */
+	  if (!ACCUMULATE_OUTGOING_ARGS || cfun->calls_alloca)
+	    cselib_invalidate_mem (callmem[1]);
+	}
     }
 
   cselib_record_sets (insn);
@@ -3256,8 +3378,31 @@ cselib_init (int record_what)
 
   /* (mem:BLK (scratch)) is a special mechanism to conflict with everything,
      see canon_true_dependence.  This is only created once.  */
-  if (! callmem)
-    callmem = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (VOIDmode));
+  if (! callmem[0])
+    callmem[0] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (VOIDmode));
+  /* Similarly create a MEM representing roughly everything below
+     the stack for STACK_GROWS_DOWNWARD targets or everything above
+     it otherwise.  Do this only when !ACCUMULATE_OUTGOING_ARGS or
+     if cfun->calls_alloca, otherwise the stack pointer shouldn't be
+     changing in the middle of the function and nothing should be stored
+     below the stack pointer.  */
+  if (!callmem[1] && (!ACCUMULATE_OUTGOING_ARGS || cfun->calls_alloca))
+    {
+      if (STACK_GROWS_DOWNWARD)
+	{
+	  unsigned HOST_WIDE_INT off = -(GET_MODE_MASK (Pmode) >> 1);
+#ifdef STACK_ADDRESS_OFFSET
+	  /* On SPARC take stack pointer bias into account as well.  */
+	  off += (STACK_ADDRESS_OFFSET
+		  - FIRST_PARM_OFFSET (current_function_decl));
+#endif
+	  callmem[1] = plus_constant (Pmode, stack_pointer_rtx, off);
+	}
+      else
+	callmem[1] = stack_pointer_rtx;
+      callmem[1] = gen_rtx_MEM (BLKmode, callmem[1]);
+      set_mem_size (callmem[1], GET_MODE_MASK (Pmode) >> 1);
+    }
 
   cselib_nregs = max_reg_num ();
 
diff --git a/gcc/testsuite/gcc.dg/pr117239.c b/gcc/testsuite/gcc.dg/pr117239.c
new file mode 100644
index 000000000000..0ff33d19677e
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr117239.c
@@ -0,0 +1,42 @@
+/* PR rtl-optimization/117239 */
+/* { dg-do run } */
+/* { dg-options "-fno-inline -O2" } */
+/* { dg-additional-options "-fschedule-insns" { target i?86-*-* x86_64-*-* } } */
+
+int a, b, c = 1, d;
+
+int
+foo (void)
+{
+  return a;
+}
+
+struct A {
+  int e, f, g, h;
+  short i;
+  int j;
+};
+
+void
+bar (int x, struct A y)
+{
+  if (y.j == 1)
+    c = 0;
+}
+
+int
+baz (struct A x)
+{
+  return b;
+}
+
+int
+main ()
+{
+  struct A k = { 0, 0, 0, 0, 0, 1 };
+  d = baz (k);
+  bar (foo (), k);
+  if (c != 0)
+    __builtin_abort ();
+  return 0;
+}
-- 
GitLab