From c25c172959e7fb424455ee6acc60571c68b72443 Mon Sep 17 00:00:00 2001 From: Jakub Jelinek <jakub@redhat.com> Date: Fri, 22 Nov 2024 19:50:22 +0100 Subject: [PATCH] match.pd: Fix up the new simpliofiers using with_possible_nonzero_bits2 [PR117420] The following testcase shows wrong-code caused by incorrect use of with_possible_nonzero_bits2. That matcher is defined as /* Slightly extended version, do not make it recursive to keep it cheap. */ (match (with_possible_nonzero_bits2 @0) with_possible_nonzero_bits@0) (match (with_possible_nonzero_bits2 @0) (bit_and:c with_possible_nonzero_bits@0 @2)) and because with_possible_nonzero_bits includes the SSA_NAME case with integral/pointer argument, both forms can actually match when a SSA_NAME with integral/pointer type has a def stmt which is BIT_AND_EXPR assignment with say SSA_NAME with integral/pointer type as one of its operands (or INTEGER_CST, another with_possible_nonzero_bits case). And in match.pd the latter actually wins if both match and so when using (with_possible_nonzero_bits2 @0) the @0 will actually be one of the BIT_AND_EXPR operands if that form is matched. Now, with_possible_nonzero_bits2 and with_certain_nonzero_bits2 were added for the /* X == C (or X & Z == Y | C) is impossible if ~nonzero(X) & C != 0. */ (for cmp (eq ne) (simplify (cmp:c (with_possible_nonzero_bits2 @0) (with_certain_nonzero_bits2 @1)) (if (wi::bit_and_not (wi::to_wide (@1), get_nonzero_bits (@0)) != 0) { constant_boolean_node (cmp == NE_EXPR, type); }))) simplifier, but even for that one I think they do not do a good job, they might actually pessimize stuff rather than optimize, but at least does not result in wrong-code, because the operands are solely tested with wi::to_wide or get_nonzero_bits, but not actually used in the simplification. The reason why it can pessimize stuff is say if we have # RANGE [irange] int ... MASK 0xb VALUE 0x0 x_1 = ...; # RANGE [irange] int ... MASK 0x8 VALUE 0x0 _2 = x_1 & 0xc; _3 = _2 == 2; then if it used just with_possible_nonzero_bits@0, @0 would have get_nonzero_bits (@0) 0x8 and (2 & ~8) != 0, so we can fold it into _3 = 0; But as it uses (with_possible_nonzero_bits2 @0), @0 is x_1 rather than _2 and get_nonzero_bits (@0) is unnecessarily conservative, 0xb rather than 0x8 and (2 & ~0xb) == 0, so we don't optimize. Now, with_possible_nonzero_bits2 can actually improve stuff as well in that pattern, if say value ranges aren't fully computed yet or the BIT_AND_EXPR assignment has been added later and the lhs doesn't have range computed yet, get_nonzero_range on the BIT_AND_EXPR lhs will be all bits set, while on the BIT_AND_EXPR operand might actually succeed. I believe better would be to either modify get_nonzero_bits so that it special cases the SSA_NAME with BIT_AND_EXPR def_stmt (but one level deep only like with_possible_nonzero_bits2, no recursion), in that case return bitwise and of get_nonzero_bits (non-recursive) for the lhs and both operands, and possibly BIT_AND_EXPR itself e.g. for GENERIC matching during by returning bitwise and of both operands. Then with_possible_nonzero_bits2 could be needed for the GENERIC case, perhaps have the second match #if GENERIC, but changed so that the @N operand always is the whole thing rather than its operand which is error-prone. Or add get_nonzero_bits wrapper with a different name which would do that. with_certain_nonzero_bits2 could be changed similarly, these days we can test known non-zero bits rather than possible non-zero bits on SSA_NAMEs too, we record both mask and value, so possible nonzero bits (aka. get_nonzero_bits) is mask () | value (), while known nonzero bits is value () & ~mask (), with a new function (get_known_nonzero_bits or get_certain_nonzero_bits etc.) which handles that. Anyway, the following patch doesn't do what I wrote above just yet, for that single pattern it is just a missed optimization. But the with_possible_nonzero_bits2 uses in the 3 new simplifiers are just completely incorrect, because they don't just use the @0 operand in get_nonzero_bits (pessimizing stuff if value ranges are fully computed), but also use it in the replacement, then they act as if the BIT_AND_EXPR wasn't there at all. While we could use (with_possible_nonzero_bits2@3 @0) and use get_nonzero_bits (@0) and use @3 in the replacement, that would still often be a pessimization, so I've just used with_possible_nonzero_bits@0. 2024-11-22 Jakub Jelinek <jakub@redhat.com> PR tree-optimization/117420 * match.pd ((X >> C1) << (C1 + C2) -> X << C2, (X >> C1) * (C2 << C1) -> X * C2, X / (1 << C) -> X /[ex] (1 << C)): Use with_possible_nonzero_bits@0 rather than (with_possible_nonzero_bits2 @0). * gcc.dg/torture/pr117420.c: New test. --- gcc/match.pd | 6 +-- gcc/testsuite/gcc.dg/torture/pr117420.c | 52 +++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 3 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/torture/pr117420.c diff --git a/gcc/match.pd b/gcc/match.pd index 48317dc80b63..ad354274b2a1 100644 --- a/gcc/match.pd +++ b/gcc/match.pd @@ -4933,7 +4933,7 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) #if GIMPLE /* (X >> C1) << (C1 + C2) -> X << C2 if the low C1 bits of X are zero. */ (simplify - (lshift (convert? (rshift (with_possible_nonzero_bits2 @0) INTEGER_CST@1)) + (lshift (convert? (rshift with_possible_nonzero_bits@0 INTEGER_CST@1)) INTEGER_CST@2) (if (INTEGRAL_TYPE_P (type) && wi::ltu_p (wi::to_wide (@1), element_precision (type)) @@ -4944,7 +4944,7 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) /* (X >> C1) * (C2 << C1) -> X * C2 if the low C1 bits of X are zero. */ (simplify - (mult (convert? (rshift (with_possible_nonzero_bits2 @0) INTEGER_CST@1)) + (mult (convert? (rshift with_possible_nonzero_bits@0 INTEGER_CST@1)) poly_int_tree_p@2) (with { poly_widest_int factor; } (if (INTEGRAL_TYPE_P (type) @@ -5519,7 +5519,7 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) #if GIMPLE /* X / (1 << C) -> X /[ex] (1 << C) if the low C bits of X are clear. */ (simplify - (trunc_div (with_possible_nonzero_bits2 @0) integer_pow2p@1) + (trunc_div with_possible_nonzero_bits@0 integer_pow2p@1) (if (INTEGRAL_TYPE_P (type) && !TYPE_UNSIGNED (type) && wi::multiple_of_p (get_nonzero_bits (@0), wi::to_wide (@1), SIGNED)) diff --git a/gcc/testsuite/gcc.dg/torture/pr117420.c b/gcc/testsuite/gcc.dg/torture/pr117420.c new file mode 100644 index 000000000000..854768e61d3e --- /dev/null +++ b/gcc/testsuite/gcc.dg/torture/pr117420.c @@ -0,0 +1,52 @@ +/* PR tree-optimization/117420 */ +/* { dg-do run } */ + +int a; + +__attribute__((noipa)) int +foo () +{ + int b = -(1 | -(a < 1)); + int c = (~b & 2) / 2; + if (b != 1) + __builtin_abort (); +} + +__attribute__((noipa)) int +bar () +{ + int b = -(1 | -(a < 1)); + int c = (~b & 2) / 2; + if (b != -1) + __builtin_abort (); +} + +__attribute__((noipa)) int +baz () +{ + int b = -(1 | -(a < 1)); + int c = (~b & 2) / 2; + if (c != 1) + __builtin_abort (); +} + +__attribute__((noipa)) int +qux () +{ + int b = -(1 | -(a < 1)); + int c = (~b & 2) / 2; + if (c != 0) + __builtin_abort (); +} + +int +main () +{ + foo (); + a = 1; + bar (); + a = 0; + baz (); + a = 1; + qux (); +} -- GitLab