From 2862d99bfdae96a1d4b275fa3f3daad6206ff761 Mon Sep 17 00:00:00 2001 From: Raphael Moreira Zinsly <rzinsly@ventanamicro.com> Date: Mon, 22 Jul 2024 11:23:23 -0300 Subject: [PATCH] RISC-V: Add support to vector stack-clash protection Adds basic support to vector stack-clash protection using a loop to do the probing and stack adjustments. gcc/ChangeLog: * config/riscv/riscv.cc (riscv_allocate_and_probe_stack_loop): New function. (riscv_v_adjust_scalable_frame): Add stack-clash protection support. (riscv_allocate_and_probe_stack_space): Move the probe loop implementation to riscv_allocate_and_probe_stack_loop. * config/riscv/riscv.h: Define RISCV_STACK_CLASH_VECTOR_CFA_REGNUM. gcc/testsuite/ChangeLog: * gcc.target/riscv/stack-check-cfa-3.c: New test. * gcc.target/riscv/stack-check-prologue-16.c: New test. * gcc.target/riscv/struct_vect_24.c: New test. --- gcc/config/riscv/riscv.cc | 99 +++++++++++++++---- gcc/config/riscv/riscv.h | 5 + .../gcc.target/riscv/stack-check-cfa-3.c | 13 +++ .../riscv/stack-check-prologue-16.c | 30 ++++++ .../gcc.target/riscv/struct_vect_24.c | 47 +++++++++ 5 files changed, 173 insertions(+), 21 deletions(-) create mode 100644 gcc/testsuite/gcc.target/riscv/stack-check-cfa-3.c create mode 100644 gcc/testsuite/gcc.target/riscv/stack-check-prologue-16.c create mode 100644 gcc/testsuite/gcc.target/riscv/struct_vect_24.c diff --git a/gcc/config/riscv/riscv.cc b/gcc/config/riscv/riscv.cc index 63139afd3e36..034290617624 100644 --- a/gcc/config/riscv/riscv.cc +++ b/gcc/config/riscv/riscv.cc @@ -7900,6 +7900,35 @@ static const code_for_push_pop_t code_for_push_pop[ZCMP_MAX_GRP_SLOTS][ZCMP_OP_N code_for_gpr_multi_popret_up_to_s11, code_for_gpr_multi_popretz_up_to_s11}}; +/* Set a probe loop for stack clash protection. */ +static void +riscv_allocate_and_probe_stack_loop (rtx tmp, enum rtx_code code, + rtx op0, rtx op1, bool vector, + HOST_WIDE_INT offset) +{ + tmp = riscv_force_temporary (tmp, gen_int_mode (offset, Pmode)); + + /* Loop. */ + rtx label = gen_label_rtx (); + emit_label (label); + + /* Allocate and probe stack. */ + emit_insn (gen_sub3_insn (stack_pointer_rtx, stack_pointer_rtx, tmp)); + emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx, + STACK_CLASH_CALLER_GUARD)); + emit_insn (gen_blockage ()); + + /* Adjust the remaining vector length. */ + if (vector) + emit_insn (gen_sub3_insn (op0, op0, tmp)); + + /* Branch if there's still more bytes to probe. */ + riscv_expand_conditional_branch (label, code, op0, op1); + JUMP_LABEL (get_last_insn ()) = label; + + emit_insn (gen_blockage ()); +} + /* Adjust scalable frame of vector for prologue && epilogue. */ static void @@ -7912,6 +7941,49 @@ riscv_v_adjust_scalable_frame (rtx target, poly_int64 offset, bool epilogue) riscv_legitimize_poly_move (Pmode, adjust_size, tmp, gen_int_mode (offset, Pmode)); + /* If doing stack clash protection then we use a loop to allocate and probe + the stack. */ + if (flag_stack_clash_protection && !epilogue) + { + HOST_WIDE_INT min_probe_threshold + = (1 << param_stack_clash_protection_guard_size) - STACK_CLASH_CALLER_GUARD; + + if (!frame_pointer_needed) + { + /* This is done to provide unwinding information for the stack + adjustments we're about to do, however to prevent the optimizers + from removing the T3 move and leaving the CFA note (which would be + very wrong) we tie the old and new stack pointer together. + The tie will expand to nothing but the optimizers will not touch + the instruction. */ + insn = get_last_insn (); + rtx stack_ptr_copy = gen_rtx_REG (Pmode, RISCV_STACK_CLASH_VECTOR_CFA_REGNUM); + emit_move_insn (stack_ptr_copy, stack_pointer_rtx); + riscv_emit_stack_tie (stack_ptr_copy); + + /* We want the CFA independent of the stack pointer for the + duration of the loop. */ + add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy); + RTX_FRAME_RELATED_P (insn) = 1; + } + + riscv_allocate_and_probe_stack_loop (tmp, GE, adjust_size, tmp, true, + min_probe_threshold); + + /* Allocate the residual. */ + insn = emit_insn (gen_sub3_insn (target, target, adjust_size)); + + /* Now reset the CFA register if needed. */ + if (!frame_pointer_needed) + { + add_reg_note (insn, REG_CFA_DEF_CFA, + plus_constant (Pmode, stack_pointer_rtx, -offset)); + RTX_FRAME_RELATED_P (insn) = 1; + } + + return; + } + if (epilogue) insn = gen_add3_insn (target, target, adjust_size); else @@ -8059,8 +8131,9 @@ riscv_allocate_and_probe_stack_space (rtx temp1, HOST_WIDE_INT size) else { /* Compute the ending address. */ - temp1 = riscv_force_temporary (temp1, gen_int_mode (rounded_size, Pmode)); - insn = emit_insn (gen_sub3_insn (temp1, stack_pointer_rtx, temp1)); + rtx temp2 = gen_rtx_REG (Pmode, RISCV_PROLOGUE_TEMP2_REGNUM); + temp2 = riscv_force_temporary (temp2, gen_int_mode (rounded_size, Pmode)); + insn = emit_insn (gen_sub3_insn (temp2, stack_pointer_rtx, temp2)); if (!frame_pointer_needed) { @@ -8071,25 +8144,9 @@ riscv_allocate_and_probe_stack_space (rtx temp1, HOST_WIDE_INT size) RTX_FRAME_RELATED_P (insn) = 1; } - /* Allocate and probe the stack. */ - - rtx temp2 = gen_rtx_REG (Pmode, RISCV_PROLOGUE_TEMP2_REGNUM); - temp2 = riscv_force_temporary (temp2, gen_int_mode (guard_size, Pmode)); - - /* Loop. */ - rtx label = gen_label_rtx (); - emit_label (label); - - emit_insn (gen_sub3_insn (stack_pointer_rtx, stack_pointer_rtx, temp2)); - emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx, - guard_used_by_caller)); - emit_insn (gen_blockage ()); - - /* Check if the stack pointer is at the ending address. */ - riscv_expand_conditional_branch (label, NE, stack_pointer_rtx, temp1); - JUMP_LABEL (get_last_insn ()) = label; - - emit_insn (gen_blockage ()); + /* This allocates and probes the stack. */ + riscv_allocate_and_probe_stack_loop (temp1, NE, stack_pointer_rtx, temp2, + false, guard_size); /* Now reset the CFA register if needed. */ if (!frame_pointer_needed) diff --git a/gcc/config/riscv/riscv.h b/gcc/config/riscv/riscv.h index 7e9093fed3c8..58e17178212b 100644 --- a/gcc/config/riscv/riscv.h +++ b/gcc/config/riscv/riscv.h @@ -429,6 +429,11 @@ ASM_MISA_SPEC #define RISCV_PROLOGUE_TEMP2_REGNUM (GP_TEMP_FIRST + 1) #define RISCV_PROLOGUE_TEMP2(MODE) gen_rtx_REG (MODE, RISCV_PROLOGUE_TEMP2_REGNUM) +/* Both prologue temp registers are used in the vector probe loop for when + stack-clash protection is enabled, so we need to copy SP to a new register + and set it as CFA during the loop, we are using T3 for that. */ +#define RISCV_STACK_CLASH_VECTOR_CFA_REGNUM (GP_TEMP_FIRST + 23) + #define RISCV_CALL_ADDRESS_TEMP_REGNUM (GP_TEMP_FIRST + 1) #define RISCV_CALL_ADDRESS_TEMP(MODE) \ gen_rtx_REG (MODE, RISCV_CALL_ADDRESS_TEMP_REGNUM) diff --git a/gcc/testsuite/gcc.target/riscv/stack-check-cfa-3.c b/gcc/testsuite/gcc.target/riscv/stack-check-cfa-3.c new file mode 100644 index 000000000000..e45f7bb7df51 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/stack-check-cfa-3.c @@ -0,0 +1,13 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -march=rv64gcv -mabi=lp64d -fstack-clash-protection -funwind-tables -fno-stack-protector" } */ +/* { dg-require-effective-target supports_stack_clash_protection } */ + +#include "stack-check-prologue-16.c" + +/* Checks that the CFA notes are correct for every sp adjustment, but we also + need to make sure we can unwind correctly before the frame is set up. So + check that we're emitting t3 with a copy of sp an setting the CFA there. */ + +/* { dg-final { scan-assembler-times {mv\tt3,sp} 1 } } */ +/* { dg-final { scan-assembler-times {\.cfi_def_cfa [0-9]+, 0} 1 } } */ +/* { dg-final { scan-assembler-times {\.cfi_escape 0xf,0xa,0x72,0,0x92,0xa2,0x38,0,0x9,0xec,0x1e,0x22} 1 } } */ diff --git a/gcc/testsuite/gcc.target/riscv/stack-check-prologue-16.c b/gcc/testsuite/gcc.target/riscv/stack-check-prologue-16.c new file mode 100644 index 000000000000..c74dce04b236 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/stack-check-prologue-16.c @@ -0,0 +1,30 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target supports_stack_clash_protection } */ +/* { dg-options "-O3 -march=rv64gcv -mabi=lp64d -fstack-clash-protection" } */ + +/* Invoke X (P##n) for n in [0, 7]. */ +#define REPEAT8(X, P) \ + X (P##0) X (P##1) X (P##2) X (P##3) X (P##4) X (P##5) X (P##6) X (P##7) + +/* Invoke X (n) for all octal n in [0, 39]. */ +#define REPEAT40(X) \ + REPEAT8 (X, 0) REPEAT8 (X, 1) REPEAT8 (X, 2) REPEAT8 (X, 3) REPEAT8 (X, 4) + +/* Expect vector work to be done, with spilling of vector registers. */ +void +f2 (int x[40][100], int *y) +{ + /* Try to force some spilling. */ +#define DECLARE(N) int y##N = y[N]; + REPEAT40 (DECLARE); +#pragma omp simd + for (int i = 0; i < 100; ++i) + { +#define INC(N) x[N][i] += y##N; + REPEAT40 (INC); + } +} + +/* Vector spill, requires probing as vector size is unknown at compile time. */ + +/* { dg-final { scan-assembler-times {sd\tzero,1024\(sp\)} 1 } } */ diff --git a/gcc/testsuite/gcc.target/riscv/struct_vect_24.c b/gcc/testsuite/gcc.target/riscv/struct_vect_24.c new file mode 100644 index 000000000000..7c0852f1a552 --- /dev/null +++ b/gcc/testsuite/gcc.target/riscv/struct_vect_24.c @@ -0,0 +1,47 @@ +/* { dg-do compile } */ +/* { dg-require-effective-target supports_stack_clash_protection } */ +/* { dg-options "-O3 -march=rv64gcv -mabi=lp64d -fstack-clash-protection -fno-stack-protector" } */ +/* { dg-skip-if "" { *-*-* } {"-O0" "-O1" "-O2" "-Og" "-Os" "-Oz" "-funroll-loops"} } */ + +#include <stdint.h> + +#define N 50 +#define S 2 * 4 * 1024 + +/* Invoke X (P##n) for n in [0, 9]. */ +#define REPEAT8(X, P) \ + X (P##0) X (P##1) X (P##2) X (P##3) X (P##4) X (P##5) X (P##6) X (P##7) \ + X (P##8) X (P##9) + +/* Invoke X (n) for all n in [0, 49]. */ +#define REPEAT50(X) \ + REPEAT8 (X, ) REPEAT8 (X, 1) REPEAT8 (X, 2) REPEAT8 (X, 3) REPEAT8 (X, 4) + + /* Try to force some spilling. */ +#define DECLARE(N) int src##N = src[N * 4]; +#define INC(N) dest[i] += src##N; + +#define TEST_LOOP(NAME, TYPE) \ + void __attribute__ ((noinline)) \ + NAME (TYPE *restrict dest, TYPE *restrict src) \ + { \ + REPEAT50 (DECLARE); \ + volatile char foo[S]; \ + foo[S-1]=1; \ + for (int i = 0; i < N; i++) \ + { \ + REPEAT50 (INC); \ + } \ + } + +#define TEST(NAME) \ + TEST_LOOP (NAME##_i32, int32_t) \ + TEST_LOOP (NAME##_i64, int64_t) + +TEST (test) + +/* Check the vectorized loop for stack clash probing. */ + +/* { dg-final { scan-assembler-times {sd\tzero,1024\(sp\)} 6 } } */ +/* { dg-final { scan-assembler-times {bge\tt1,t0,.[^\\r\\n]*} 2 } } */ +/* { dg-final { scan-assembler-times {sub\s+t1,t1,t0} 2 } } */ -- GitLab