diff --git a/gcc/testsuite/gcc.dg/tree-ssa/satd-hadamard.c b/gcc/testsuite/gcc.dg/tree-ssa/satd-hadamard.c
new file mode 100644
index 0000000000000000000000000000000000000000..576ef01628cc6caf3d6f2d2f6d1913df7a38bb91
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/satd-hadamard.c
@@ -0,0 +1,43 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -fdump-tree-forwprop4-details" } */
+
+#include <stdint.h>
+
+#define HADAMARD4(d0, d1, d2, d3, s0, s1, s2, s3) {\
+  int t0 = s0 + s1;\
+  int t1 = s0 - s1;\
+  int t2 = s2 + s3;\
+  int t3 = s2 - s3;\
+  d0 = t0 + t2;\
+  d1 = t1 + t3;\
+  d2 = t0 - t2;\
+  d3 = t1 - t3;\
+}
+
+int
+x264_pixel_satd_8x4_simplified (uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2)
+{
+  uint32_t tmp[4][4];
+  uint32_t a0, a1, a2, a3;
+  int sum = 0;
+  int i;
+
+  for (i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2)
+    {
+      a0 = (pix1[0] - pix2[0]) + ((pix1[4] - pix2[4]) << 16);
+      a1 = (pix1[1] - pix2[1]) + ((pix1[5] - pix2[5]) << 16);
+      a2 = (pix1[2] - pix2[2]) + ((pix1[6] - pix2[6]) << 16);
+      a3 = (pix1[3] - pix2[3]) + ((pix1[7] - pix2[7]) << 16);
+      HADAMARD4(tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0, a1, a2, a3);
+    }
+
+  for (i = 0; i < 4; i++)
+    {
+      HADAMARD4(a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i]);
+      sum += a0 + a1 + a2 + a3;
+    }
+
+  return (((uint16_t)sum) + ((uint32_t)sum>>16)) >> 1;
+}
+
+/* { dg-final { scan-tree-dump "VEC_PERM_EXPR.*{ 2, 3, 6, 7 }" "forwprop4" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/vector-10.c b/gcc/testsuite/gcc.dg/tree-ssa/vector-10.c
new file mode 100644
index 0000000000000000000000000000000000000000..d5caebdf1742823fa158192a0ca0f77b3bdf2e7a
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/vector-10.c
@@ -0,0 +1,122 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -fdump-tree-forwprop1-details" } */
+
+typedef int vec __attribute__((vector_size (4 * sizeof (int))));
+
+void f1 (vec *p_v_in_1, vec *p_v_in_2, vec *p_v_out_1, vec *p_v_out_2)
+{
+  vec sel0 = { 0, 2, 0, 2 };
+  vec sel1 = { 1, 3, 1, 3 };
+  vec sel = { 0, 1, 6, 7 };
+  vec v_1, v_2, v_x, v_y, v_out_1, v_out_2;
+  vec v_in_1 = *p_v_in_1;
+  vec v_in_2;
+
+  /* First vec perm sequence.  */
+  v_1 = __builtin_shuffle (v_in_1, v_in_1, sel0);
+  v_2 = __builtin_shuffle (v_in_1, v_in_1, sel1);
+  v_x = v_1 + v_2;
+  v_y = v_1 - v_2;
+  v_out_1 = __builtin_shuffle (v_x, v_y, sel);
+
+  /* Won't blend because v_in_2 is defined after v_1 above.  */
+  v_in_2 = *p_v_in_2;
+  /* Second vec perm sequence.  */
+  v_1 = __builtin_shuffle (v_in_2, v_in_2, sel0);
+  v_2 = __builtin_shuffle (v_in_2, v_in_2, sel1);
+  v_x = v_1 + v_2;
+  v_y = v_1 - v_2;
+  v_out_2 = __builtin_shuffle (v_x, v_y, sel);
+
+  *p_v_out_1 = v_out_1;
+  *p_v_out_2 = v_out_2;
+}
+
+void f2 (vec *p_v_in_1, vec *p_v_in_2, vec *p_v_out_1, vec *p_v_out_2)
+{
+  vec sel0 = { 0, 2, 0, 2 };
+  vec sel1 = { 1, 3, 1, 3 };
+  vec sel = { 0, 1, 6, 7 };
+  vec v_1, v_2, v_x, v_y, v_out_1, v_out_2;
+  vec v_in_1 = *p_v_in_1;
+  vec v_in_2 = *p_v_in_2;
+
+  /* First vec perm sequence.  */
+  v_1 = __builtin_shuffle (v_in_1, v_in_1, sel0);
+  v_2 = __builtin_shuffle (v_in_1, v_in_1, sel1);
+  v_x = v_1 + v_2;
+  v_y = v_1 - v_2;
+  v_out_1 = __builtin_shuffle (v_x, v_y, sel);
+  /* Won't blend because of this store between the sequences.  */
+  *p_v_out_1 = v_out_1;
+
+  /* Second vec perm sequence.  */
+  v_1 = __builtin_shuffle (v_in_2, v_in_2, sel0);
+  v_2 = __builtin_shuffle (v_in_2, v_in_2, sel1);
+  v_x = v_1 + v_2;
+  v_y = v_1 - v_2;
+  v_out_2 = __builtin_shuffle (v_x, v_y, sel);
+
+  *p_v_out_2 = v_out_2;
+}
+
+void f3 (vec *p_v_in_1, vec *p_v_in_2, vec *p_v_out_1, vec *p_v_out_2)
+{
+  vec sel0 = { 0, 2, 0, 2 };
+  vec sel1 = { 1, 3, 1, 3 };
+  vec sel = { 0, 1, 6, 7 };
+  vec v_1, v_2, v_x, v_y, v_out_1, v_out_2;
+  vec v_in_1 = *p_v_in_1;
+  vec v_in_2 = *p_v_in_2;
+
+  /* First vec perm sequence.  */
+  v_1 = __builtin_shuffle (v_in_1, v_in_1, sel0);
+  v_2 = __builtin_shuffle (v_in_1, v_in_1, sel1);
+  v_x = v_1 + v_2;
+  v_y = v_1 - v_2;
+  v_out_1 = __builtin_shuffle (v_x, v_y, sel);
+
+  /* Second vec perm sequence.  */
+  v_1 = __builtin_shuffle (v_in_2, v_in_2, sel0);
+  v_2 = __builtin_shuffle (v_in_2, v_in_2, sel1);
+  v_x = v_1 + v_2;
+  /* Won't blend because v_2 is RHS1 here.  */
+  v_y = v_2 - v_1;
+  v_out_2 = __builtin_shuffle (v_x, v_y, sel);
+
+  *p_v_out_1 = v_out_1;
+  *p_v_out_2 = v_out_2;
+}
+
+extern vec foo (vec v);
+void f4 (vec *p_v_in_1, vec *p_v_out_1, vec *p_v_out_2)
+{
+  vec sel0 = { 0, 2, 0, 2 };
+  vec sel1 = { 1, 3, 1, 3 };
+  vec sel = { 0, 1, 6, 7 };
+  vec v_1, v_2, v_x, v_y, v_out_1, v_out_2;
+  vec v_in_1 = *p_v_in_1;
+  vec v_in_2;
+
+  /* First vec perm sequence.  */
+  v_1 = __builtin_shuffle (v_in_1, v_in_1, sel0);
+  v_2 = __builtin_shuffle (v_in_1, v_in_1, sel1);
+  v_x = v_1 + v_2;
+  v_y = v_1 - v_2;
+  v_out_1 = __builtin_shuffle (v_x, v_y, sel);
+
+  /* Won't merge because of dependency.  */
+  v_in_2 = foo (v_out_1);
+
+  /* Second vec perm sequence.  */
+  v_1 = __builtin_shuffle (v_in_2, v_in_2, sel0);
+  v_2 = __builtin_shuffle (v_in_2, v_in_2, sel1);
+  v_x = v_1 + v_2;
+  v_y = v_1 - v_2;
+  v_out_2 = __builtin_shuffle (v_x, v_y, sel);
+
+  *p_v_out_1 = v_out_1;
+  *p_v_out_2 = v_out_2;
+}
+
+/* { dg-final { scan-tree-dump-not "Vec perm simplify sequences have been merged" "forwprop1" } } */
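For reference (illustration only, not part of the patch): the { 2, 3, 6, 7 } selector that vector-8.c and vector-9.c below scan for falls out of the greedy lane assignment in calc_perm_vec_perm_simplify_seqs. A minimal scalar model, assuming both sequences have already been narrowed to the selector { 0, 1, 4, 5 }:

#include <assert.h>

int
main (void)
{
  const unsigned nelts = 4;
  unsigned seq1_sel[4] = { 0, 1, 4, 5 };  /* narrowed selector of seq1 (assumed) */
  unsigned seq2_sel[4] = { 0, 1, 4, 5 };  /* narrowed selector of seq2 (assumed) */
  int lane_assignment[8] = { 0 };	  /* 0 = free, 1 = seq1, 2 = seq2 */
  unsigned seq2_stmt_sel[4];

  /* Reserve the lanes consumed by seq1.  */
  for (unsigned i = 0; i < nelts; i++)
    lane_assignment[seq1_sel[i]] = 1;

  /* Move each lane wanted by seq2 up to the next free lane.  */
  for (unsigned i = 0; i < nelts; i++)
    {
      unsigned l = seq2_sel[i];
      while (lane_assignment[l] != 0)
	l++;
      lane_assignment[l] = 2;
      seq2_stmt_sel[i] = l;
    }

  /* Both sequences fit into one pair of vectors; seq2 ends up in the
     lanes { 2, 3, 6, 7 } that the dump scans look for.  */
  assert (seq2_stmt_sel[0] == 2 && seq2_stmt_sel[1] == 3
	  && seq2_stmt_sel[2] == 6 && seq2_stmt_sel[3] == 7);
  return 0;
}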
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/vector-8.c b/gcc/testsuite/gcc.dg/tree-ssa/vector-8.c
new file mode 100644
index 0000000000000000000000000000000000000000..bc2269065e4ffb6a21a7f1ed85169614105bac49
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/vector-8.c
@@ -0,0 +1,34 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -fdump-tree-forwprop1-details" } */
+
+typedef int vec __attribute__((vector_size (4 * sizeof (int))));
+
+void f (vec *p_v_in_1, vec *p_v_in_2, vec *p_v_out_1, vec *p_v_out_2)
+{
+  vec sel0 = { 0, 2, 0, 2 };
+  vec sel1 = { 1, 3, 1, 3 };
+  vec sel = { 0, 1, 6, 7 };
+  vec v_1, v_2, v_x, v_y, v_out_1, v_out_2;
+  vec v_in_1 = *p_v_in_1;
+  vec v_in_2 = *p_v_in_2;
+
+  /* First vec perm sequence.  */
+  v_1 = __builtin_shuffle (v_in_1, v_in_1, sel0);
+  v_2 = __builtin_shuffle (v_in_1, v_in_1, sel1);
+  v_x = v_1 + v_2;
+  v_y = v_1 - v_2;
+  v_out_1 = __builtin_shuffle (v_x, v_y, sel);
+
+  /* Second vec perm sequence.  */
+  v_1 = __builtin_shuffle (v_in_2, v_in_2, sel0);
+  v_2 = __builtin_shuffle (v_in_2, v_in_2, sel1);
+  v_x = v_1 + v_2;
+  v_y = v_1 - v_2;
+  v_out_2 = __builtin_shuffle (v_x, v_y, sel);
+
+  *p_v_out_1 = v_out_1;
+  *p_v_out_2 = v_out_2;
+}
+
+/* { dg-final { scan-tree-dump "Vec perm simplify sequences have been blended" "forwprop1" } } */
+/* { dg-final { scan-tree-dump "VEC_PERM_EXPR.*{ 2, 3, 6, 7 }" "forwprop1" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/vector-9.c b/gcc/testsuite/gcc.dg/tree-ssa/vector-9.c
new file mode 100644
index 0000000000000000000000000000000000000000..e5f898e02814468f428623d2663856b9159428d2
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/vector-9.c
@@ -0,0 +1,34 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O3 -fdump-tree-forwprop1-details" } */
+
+typedef int vec __attribute__((vector_size (4 * sizeof (int))));
+
+void f (vec *p_v_in_1, vec *p_v_in_2, vec *p_v_out_1, vec *p_v_out_2)
+{
+  vec sel0 = { 0, 2, 0, 2 };
+  vec sel1 = { 1, 3, 1, 3 };
+  vec sel = { 0, 1, 6, 7 };
+  vec v_1, v_2, v_x, v_y, v_out_1, v_out_2;
+  vec v_in_1 = *p_v_in_1;
+  vec v_in_2 = *p_v_in_2;
+
+  /* First vec perm sequence.  */
+  v_1 = __builtin_shuffle (v_in_1, v_in_1, sel0);
+  v_2 = __builtin_shuffle (v_in_1, v_in_1, sel1);
+  v_x = v_1 * v_2;
+  v_y = v_2 - v_1;
+  v_out_1 = __builtin_shuffle (v_x, v_y, sel);
+
+  /* Second vec perm sequence.  */
+  v_1 = __builtin_shuffle (v_in_2, v_in_2, sel0);
+  v_2 = __builtin_shuffle (v_in_2, v_in_2, sel1);
+  v_x = v_2 * v_1;
+  v_y = v_2 - v_1;
+  v_out_2 = __builtin_shuffle (v_x, v_y, sel);
+
+  *p_v_out_1 = v_out_1;
+  *p_v_out_2 = v_out_2;
+}
+
+/* { dg-final { scan-tree-dump "Vec perm simplify sequences have been blended" "forwprop1" } } */
+/* { dg-final { scan-tree-dump "VEC_PERM_EXPR.*{ 2, 3, 6, 7 }" "forwprop1" } } */
diff --git a/gcc/testsuite/gcc.target/aarch64/sve/satd-hadamard.c b/gcc/testsuite/gcc.target/aarch64/sve/satd-hadamard.c
new file mode 100644
index 0000000000000000000000000000000000000000..fcd140e3584c79ed605456bd14b145a160ec0802
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/sve/satd-hadamard.c
@@ -0,0 +1,3 @@
+#include "../../../gcc.dg/tree-ssa/satd-hadamard.c"
+
+/* { dg-final { scan-assembler-not {\ttbl\t} } } */
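For reference (illustration only, not part of the patch): the narrowing step below is valid because the two initial permutes duplicate the lower lanes into the upper half, so the final selector { 0, 1, 6, 7 } reads the same values as { 0, 1, 4, 5 }. A minimal scalar model of the worked example from the comment in recognise_vec_perm_simplify_seq:

#include <assert.h>

int
main (void)
{
  int e0 = 10, e1 = 20, e2 = 30, e3 = 40;

  /* With v_1 = {e0, e2, e0, e2} and v_2 = {e1, e3, e1, e3}, the
     lane-preserving operations yield vectors with duplicated halves.  */
  int v_x[4] = { e0 + e1, e2 + e3, e0 + e1, e2 + e3 };
  int v_y[4] = { e0 - e1, e2 - e3, e0 - e1, e2 - e3 };

  /* A two-operand selector indexes into the concatenation of v_x and v_y.  */
  int lanes[8];
  for (int i = 0; i < 4; i++)
    {
      lanes[i] = v_x[i];
      lanes[i + 4] = v_y[i];
    }

  /* The original and the narrowed selector pick identical values, so the
     upper lanes of v_x and v_y are free for another sequence.  */
  int old_sel[4] = { 0, 1, 6, 7 };
  int new_sel[4] = { 0, 1, 4, 5 };
  for (int i = 0; i < 4; i++)
    assert (lanes[old_sel[i]] == lanes[new_sel[i]]);

  return 0;
}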
diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc
index 5c690a2b03e216250f7ae427c64068f9cce8f84f..8088cc190d11a2203950369b3d0313e3cfe15471 100644
--- a/gcc/tree-ssa-forwprop.cc
+++ b/gcc/tree-ssa-forwprop.cc
@@ -50,6 +50,8 @@ along with GCC; see the file COPYING3.  If not see
 #include "optabs-tree.h"
 #include "insn-config.h"
 #include "recog.h"
+#include "cfgloop.h"
+#include "tree-vectorizer.h"
 #include "tree-vector-builder.h"
 #include "vec-perm-indices.h"
 #include "internal-fn.h"
@@ -184,6 +186,25 @@ along with GCC; see the file COPYING3.  If not see
 
    This will (of course) be extended as other needs arise.  */
 
+/* Data structure that contains simplifiable vectorized permute sequences.
+   See recognise_vec_perm_simplify_seq () for a description of the sequence.  */
+
+struct _vec_perm_simplify_seq
+{
+  /* Defining stmts of vectors in the sequence.  */
+  gassign *v_1_stmt;
+  gassign *v_2_stmt;
+  gassign *v_x_stmt;
+  gassign *v_y_stmt;
+  /* Final permute statement.  */
+  gassign *stmt;
+  /* New selector indices for stmt.  */
+  tree new_sel;
+  /* Elements of each vector and selector.  */
+  unsigned int nelts;
+};
+typedef struct _vec_perm_simplify_seq *vec_perm_simplify_seq;
+
 static bool forward_propagate_addr_expr (tree, tree, bool);
 
 /* Set to true if we delete dead edges during the optimization.  */
@@ -225,7 +246,6 @@ fwprop_invalidate_lattice (tree name)
     lattice[SSA_NAME_VERSION (name)] = NULL_TREE;
 }
 
-
 /* Get the statement we can propagate from into NAME skipping
    trivial copies.  Returns the statement which defines the
    propagation source or NULL_TREE if there is no such one.
@@ -3460,6 +3480,546 @@ fwprop_ssa_val (tree name)
   return name;
 }
 
+/* Get an index map from the provided vector permute selector
+   and return the number of unique indices.
+   E.g.: { 1, 3, 1, 3 } -> <0, 1, 0, 1>, 2
+	 { 0, 2, 0, 2 } -> <0, 1, 0, 1>, 2
+	 { 3, 2, 1, 0 } -> <0, 1, 2, 3>, 4.  */
+
+static unsigned int
+get_vect_selector_index_map (tree sel, vec<unsigned int> *index_map)
+{
+  gcc_assert (VECTOR_CST_NELTS (sel).is_constant ());
+  unsigned int nelts = VECTOR_CST_NELTS (sel).to_constant ();
+  unsigned int n = 0;
+
+  for (unsigned int i = 0; i < nelts; i++)
+    {
+      /* Extract the i-th value from the selector.  */
+      tree sel_cst_tree = VECTOR_CST_ELT (sel, i);
+      unsigned int sel_cst = TREE_INT_CST_LOW (sel_cst_tree);
+
+      unsigned int j = 0;
+      for (; j <= i; j++)
+	{
+	  tree prev_sel_cst_tree = VECTOR_CST_ELT (sel, j);
+	  unsigned int prev_sel_cst
+	    = TREE_INT_CST_LOW (prev_sel_cst_tree);
+	  if (prev_sel_cst == sel_cst)
+	    break;
+	}
+      index_map->quick_push (j);
+      n += (i == j) ? 1 : 0;
+    }
+
+  return n;
+}
+
+/* Search for opportunities to free half of the lanes in the following pattern:
+
+     v_in = {e0, e1, e2, e3}
+     v_1 = VEC_PERM <v_in, v_in, {0, 2, 0, 2}>
+     // v_1 = {e0, e2, e0, e2}
+     v_2 = VEC_PERM <v_in, v_in, {1, 3, 1, 3}>
+     // v_2 = {e1, e3, e1, e3}
+
+     v_x = v_1 + v_2
+     // v_x = {e0+e1, e2+e3, e0+e1, e2+e3}
+     v_y = v_1 - v_2
+     // v_y = {e0-e1, e2-e3, e0-e1, e2-e3}
+
+     v_out = VEC_PERM <v_x, v_y, {0, 1, 6, 7}>
+     // v_out = {e0+e1, e2+e3, e0-e1, e2-e3}
+
+   The last statement could be simplified to:
+     v_out' = VEC_PERM <v_x, v_y, {0, 1, 4, 5}>
+     // v_out' = {e0+e1, e2+e3, e0-e1, e2-e3}
+
+   Characteristic properties:
+   - v_1 and v_2 are created from the same input vector v_in and introduce the
+     lane duplication (in the selection operand) that we can eliminate.
+   - v_x and v_y are results from lane-preserving operations that use v_1 and
+     v_2 as inputs.
+   - v_out is created by selecting from duplicated lanes.  */
+
+static bool
+recognise_vec_perm_simplify_seq (gassign *stmt, vec_perm_simplify_seq *seq)
+{
+  gcc_checking_assert (stmt);
+  gcc_checking_assert (gimple_assign_rhs_code (stmt) == VEC_PERM_EXPR);
+  basic_block bb = gimple_bb (stmt);
+
+  /* Decompose the final vec permute statement.  */
+  tree v_x = gimple_assign_rhs1 (stmt);
+  tree v_y = gimple_assign_rhs2 (stmt);
+  tree sel = gimple_assign_rhs3 (stmt);
+
+  if (!VECTOR_CST_NELTS (sel).is_constant ()
+      || TREE_CODE (v_x) != SSA_NAME
+      || TREE_CODE (v_y) != SSA_NAME
+      || !has_single_use (v_x)
+      || !has_single_use (v_y))
+    return false;
+
+  unsigned int nelts = VECTOR_CST_NELTS (sel).to_constant ();
+
+  /* Lookup the definition of v_x and v_y.  */
+  gassign *v_x_stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (v_x));
+  gassign *v_y_stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (v_y));
+  if (!v_x_stmt || gimple_bb (v_x_stmt) != bb
+      || !v_y_stmt || gimple_bb (v_y_stmt) != bb)
+    return false;
+
+  /* Check the operations that define v_x and v_y.  */
+  if (TREE_CODE_CLASS (gimple_assign_rhs_code (v_x_stmt)) != tcc_binary
+      || TREE_CODE_CLASS (gimple_assign_rhs_code (v_y_stmt)) != tcc_binary)
+    return false;
+
+  tree v_x_1 = gimple_assign_rhs1 (v_x_stmt);
+  tree v_x_2 = gimple_assign_rhs2 (v_x_stmt);
+  tree v_y_1 = gimple_assign_rhs1 (v_y_stmt);
+  tree v_y_2 = gimple_assign_rhs2 (v_y_stmt);
+
+  if (v_x_stmt == v_y_stmt
+      || TREE_CODE (v_x_1) != SSA_NAME
+      || TREE_CODE (v_x_2) != SSA_NAME
+      || num_imm_uses (v_x_1) != 2
+      || num_imm_uses (v_x_2) != 2)
+    return false;
+
+  if (v_x_1 != v_y_1 || v_x_2 != v_y_2)
+    {
+      /* Allow operands of commutative operators to swap.  */
+      if (commutative_tree_code (gimple_assign_rhs_code (v_x_stmt)))
+	{
+	  /* Keep v_x_1 the first operand for non-commutative operators.  */
+	  v_x_1 = gimple_assign_rhs2 (v_x_stmt);
+	  v_x_2 = gimple_assign_rhs1 (v_x_stmt);
+	  if (v_x_1 != v_y_1 || v_x_2 != v_y_2)
+	    return false;
+	}
+      else if (commutative_tree_code (gimple_assign_rhs_code (v_y_stmt)))
+	{
+	  if (v_x_1 != v_y_2 || v_x_2 != v_y_1)
+	    return false;
+	}
+      else
+	return false;
+    }
+  gassign *v_1_stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (v_x_1));
+  gassign *v_2_stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (v_x_2));
+  if (!v_1_stmt || gimple_bb (v_1_stmt) != bb
+      || !v_2_stmt || gimple_bb (v_2_stmt) != bb)
+    return false;
+
+  if (gimple_assign_rhs_code (v_1_stmt) != VEC_PERM_EXPR
+      || gimple_assign_rhs_code (v_2_stmt) != VEC_PERM_EXPR)
+    return false;
+
+  /* Decompose initial VEC_PERM_EXPRs.  */
+  tree v_in = gimple_assign_rhs1 (v_1_stmt);
+  tree v_1_sel = gimple_assign_rhs3 (v_1_stmt);
+  tree v_2_sel = gimple_assign_rhs3 (v_2_stmt);
+  if (v_in != gimple_assign_rhs2 (v_1_stmt)
+      || v_in != gimple_assign_rhs1 (v_2_stmt)
+      || v_in != gimple_assign_rhs2 (v_2_stmt))
+    return false;
+
+  if (!VECTOR_CST_NELTS (v_1_sel).is_constant ()
+      || !VECTOR_CST_NELTS (v_2_sel).is_constant ())
+    return false;
+
+  if (nelts != VECTOR_CST_NELTS (v_1_sel).to_constant ()
+      || nelts != VECTOR_CST_NELTS (v_2_sel).to_constant ())
+    return false;
+
+  /* Now check permutation selection operands.  */
+  auto_vec<unsigned int> v_1_lane_map, v_2_lane_map;
+  v_1_lane_map.reserve (nelts);
+  v_2_lane_map.reserve (nelts);
+  unsigned int v_1_lanes, v_2_lanes;
+  v_1_lanes = get_vect_selector_index_map (v_1_sel, &v_1_lane_map);
+  v_2_lanes = get_vect_selector_index_map (v_2_sel, &v_2_lane_map);
+
+  /* Check if we could free up half of the lanes.  */
+  if (v_1_lanes != v_2_lanes || v_1_lanes > (nelts / 2))
+    return false;
+
+  /* Create the new selector.  */
+  vec_perm_builder new_sel_perm (nelts, nelts, 1);
+  for (unsigned int i = 0; i < nelts; i++)
+    {
+      /* Extract the i-th value from the selector.  */
+      tree sel_cst_tree = VECTOR_CST_ELT (sel, i);
+      unsigned int sel_cst = TREE_INT_CST_LOW (sel_cst_tree);
+
+      unsigned int j;
+      if (sel_cst < nelts)
+	j = v_1_lane_map[sel_cst];
+      else
+	j = v_2_lane_map[sel_cst - nelts] + nelts;
+
+      new_sel_perm.quick_push (j);
+
+      if (dump_file && (dump_flags & TDF_DETAILS))
+	{
+	  fprintf (dump_file, "%u", j);
+	  if (i != (nelts - 1))
+	    fprintf (dump_file, ", ");
+	}
+    }
+
+  vec_perm_indices new_indices (new_sel_perm, 2, nelts);
+  tree vectype = TREE_TYPE (gimple_assign_lhs (stmt));
+  machine_mode vmode = TYPE_MODE (vectype);
+  if (!can_vec_perm_const_p (vmode, vmode, new_indices, false))
+    return false;
+
+  *seq = XNEW (struct _vec_perm_simplify_seq);
+  (*seq)->stmt = stmt;
+  (*seq)->v_1_stmt = v_1_stmt;
+  (*seq)->v_2_stmt = v_2_stmt;
+  (*seq)->v_x_stmt = v_x_stmt;
+  (*seq)->v_y_stmt = v_y_stmt;
+  (*seq)->nelts = nelts;
+  (*seq)->new_sel = vect_gen_perm_mask_checked (vectype, new_indices);
+
+  if (dump_file)
+    {
+      fprintf (dump_file, "Found vec perm simplify sequence ending with: ");
+      print_gimple_stmt (dump_file, stmt, 0);
+    }
+
+  return true;
+}
+
+/* Reduce the lane consumption of a simplifiable vec perm sequence.  */
+
+static void
+narrow_vec_perm_simplify_seq (const vec_perm_simplify_seq &seq)
+{
+  gassign *stmt = seq->stmt;
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    {
+      fprintf (dump_file, "Updating VEC_PERM statement:\n");
+      fprintf (dump_file, "Old stmt: ");
+      print_gimple_stmt (dump_file, stmt, 0);
+    }
+
+  /* Update the last VEC_PERM statement.  */
+  gimple_assign_set_rhs3 (stmt, seq->new_sel);
+  update_stmt (stmt);
+
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    {
+      fprintf (dump_file, "New stmt: ");
+      print_gimple_stmt (dump_file, stmt, 0);
+    }
+}
+
+/* Test if we can blend two simplifiable vec permute sequences.
+   NEED_SWAP will be set if the sequences must be swapped for blending.  */
+
+static bool
+can_blend_vec_perm_simplify_seqs_p (vec_perm_simplify_seq seq1,
+				    vec_perm_simplify_seq seq2,
+				    bool *need_swap)
+{
+  unsigned int nelts = seq1->nelts;
+  basic_block bb = gimple_bb (seq1->stmt);
+
+  gcc_assert (gimple_bb (seq2->stmt) == bb);
+
+  /* BBs and number of elements must be equal.  */
+  if (gimple_bb (seq2->stmt) != bb || seq2->nelts != nelts)
+    return false;
+
+  /* We need vectors of the same type.  */
+  if (TREE_TYPE (gimple_assign_lhs (seq1->stmt))
+      != TREE_TYPE (gimple_assign_lhs (seq2->stmt)))
+    return false;
+
+  /* We require isomorphic operators.  */
+  if (((gimple_assign_rhs_code (seq1->v_x_stmt)
+	!= gimple_assign_rhs_code (seq2->v_x_stmt))
+       || (gimple_assign_rhs_code (seq1->v_y_stmt)
+	   != gimple_assign_rhs_code (seq2->v_y_stmt))))
+    return false;
+
+  /* We cannot have any dependencies between the sequences.
+
+     For merging, we will reuse seq1->v_1_stmt and seq1->v_2_stmt.
+     seq1's v_in is defined before these statements, but we need
+     to check if seq2's v_in is defined before them as well.
+
+     Further, we will reuse seq2->stmt.  We need to ensure that
+     seq1->v_x_stmt and seq1->v_y_stmt are before it.
+
+     Note that we don't need to check the BBs here, because all
+     statements of both sequences have to be in the same BB.  */
+ */ + + tree seq2_v_in = gimple_assign_rhs1 (seq2->v_1_stmt); + if (TREE_CODE (seq2_v_in) != SSA_NAME) + return false; + + gassign *seq2_v_in_stmt = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (seq2_v_in)); + if (!seq2_v_in_stmt || gimple_bb (seq2_v_in_stmt) != bb + || (gimple_uid (seq2_v_in_stmt) > gimple_uid (seq1->v_1_stmt)) + || (gimple_uid (seq1->v_x_stmt) > gimple_uid (seq2->stmt)) + || (gimple_uid (seq1->v_y_stmt) > gimple_uid (seq2->stmt))) + { + tree seq1_v_in = gimple_assign_rhs1 (seq1->v_1_stmt); + if (TREE_CODE (seq1_v_in) != SSA_NAME) + return false; + + gassign *seq1_v_in_stmt + = dyn_cast<gassign *> (SSA_NAME_DEF_STMT (seq1_v_in)); + /* Let's try to see if we succeed when swapping the sequences. */ + if (!seq1_v_in_stmt || gimple_bb (seq1_v_in_stmt) + || (gimple_uid (seq1_v_in_stmt) > gimple_uid (seq2->v_1_stmt)) + || (gimple_uid (seq2->v_x_stmt) > gimple_uid (seq1->stmt)) + || (gimple_uid (seq2->v_y_stmt) > gimple_uid (seq1->stmt))) + return false; + *need_swap = true; + } + else + *need_swap = false; + + if (dump_file && (dump_flags & TDF_DETAILS)) + fprintf (dump_file, "Found vec perm simplify sequence pair.\n"); + + return true; +} + +/* Calculate the permutations for blending the two given vec permute + sequences. This may fail if the resulting permutation is not + supported. */ + +static bool +calc_perm_vec_perm_simplify_seqs (vec_perm_simplify_seq seq1, + vec_perm_simplify_seq seq2, + vec_perm_indices *seq2_stmt_indices, + vec_perm_indices *seq1_v_1_stmt_indices, + vec_perm_indices *seq1_v_2_stmt_indices) +{ + unsigned int i; + unsigned int nelts = seq1->nelts; + auto_vec<int> lane_assignment; + lane_assignment.create (2 * nelts); + + /* Mark all lanes as free. */ + lane_assignment.quick_grow_cleared (2 * nelts); + + /* Reserve lanes for seq1. */ + for (i = 0; i < nelts; i++) + { + unsigned int l = TREE_INT_CST_LOW (VECTOR_CST_ELT (seq1->new_sel, i)); + lane_assignment[l] = 1; + } + + /* Reserve lanes for seq2 and calculate selector for seq2->stmt. */ + vec_perm_builder seq2_stmt_sel_perm (nelts, nelts, 1); + for (i = 0; i < nelts; i++) + { + unsigned int l = TREE_INT_CST_LOW (VECTOR_CST_ELT (seq2->new_sel, i)); + while (lane_assignment[l] != 0) + l++; + lane_assignment[l] = 2; + seq2_stmt_sel_perm.quick_push (l); + } + + seq2_stmt_indices->new_vector (seq2_stmt_sel_perm, 2, nelts); + tree vectype = TREE_TYPE (gimple_assign_lhs (seq2->stmt)); + machine_mode vmode = TYPE_MODE (vectype); + if (!can_vec_perm_const_p (vmode, vmode, *seq2_stmt_indices, false)) + return false; + + /* Calculate selectors for seq1->v_1_stmt and seq1->v_2_stmt. */ + vec_perm_builder seq1_v_1_stmt_sel_perm (nelts, nelts, 1); + vec_perm_builder seq1_v_2_stmt_sel_perm (nelts, nelts, 1); + for (i = 0; i < nelts; i++) + { + bool use_seq1 = lane_assignment[i] == 1; + tree s1 = gimple_assign_rhs3 (use_seq1 ? seq1->v_1_stmt + : seq2->v_1_stmt); + tree s2 = gimple_assign_rhs3 (use_seq1 ? seq1->v_2_stmt + : seq2->v_2_stmt); + unsigned int l1 = TREE_INT_CST_LOW (VECTOR_CST_ELT (s1, i)) % nelts; + unsigned int l2 = TREE_INT_CST_LOW (VECTOR_CST_ELT (s2, i)) % nelts; + + seq1_v_1_stmt_sel_perm.quick_push (l1 + (use_seq1 ? 0 : nelts)); + seq1_v_2_stmt_sel_perm.quick_push (l2 + (use_seq1 ? 
+    }
+
+  seq1_v_1_stmt_indices->new_vector (seq1_v_1_stmt_sel_perm, 2, nelts);
+  vectype = TREE_TYPE (gimple_assign_lhs (seq1->v_1_stmt));
+  vmode = TYPE_MODE (vectype);
+  if (!can_vec_perm_const_p (vmode, vmode, *seq1_v_1_stmt_indices, false))
+    return false;
+
+  seq1_v_2_stmt_indices->new_vector (seq1_v_2_stmt_sel_perm, 2, nelts);
+  vectype = TREE_TYPE (gimple_assign_lhs (seq1->v_2_stmt));
+  vmode = TYPE_MODE (vectype);
+  if (!can_vec_perm_const_p (vmode, vmode, *seq1_v_2_stmt_indices, false))
+    return false;
+
+  return true;
+}
+
+/* Blend the two given simplifiable vec permute sequences using the
+   given permutations.  */
+
+static void
+blend_vec_perm_simplify_seqs (vec_perm_simplify_seq seq1,
+			      vec_perm_simplify_seq seq2,
+			      const vec_perm_indices &seq2_stmt_indices,
+			      const vec_perm_indices &seq1_v_1_stmt_indices,
+			      const vec_perm_indices &seq1_v_2_stmt_indices)
+{
+  /* We don't need to adjust seq1->stmt because its lane consumption
+     was already narrowed before entering this function.  */
+
+  /* Adjust seq2->stmt: copy RHS1/RHS2 from seq1->stmt and set new sel.  */
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    {
+      fprintf (dump_file, "Updating VEC_PERM statement:\n");
+      fprintf (dump_file, "Old stmt: ");
+      print_gimple_stmt (dump_file, seq2->stmt, 0);
+    }
+
+  gimple_assign_set_rhs1 (seq2->stmt, gimple_assign_rhs1 (seq1->stmt));
+  gimple_assign_set_rhs2 (seq2->stmt, gimple_assign_rhs2 (seq1->stmt));
+  tree vectype = TREE_TYPE (gimple_assign_lhs (seq2->stmt));
+  tree sel = vect_gen_perm_mask_checked (vectype, seq2_stmt_indices);
+  gimple_assign_set_rhs3 (seq2->stmt, sel);
+  update_stmt (seq2->stmt);
+
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    {
+      fprintf (dump_file, "New stmt: ");
+      print_gimple_stmt (dump_file, seq2->stmt, 0);
+    }
+
+  /* Adjust seq1->v_1_stmt: copy RHS2 from seq2->v_1_stmt and set new sel.  */
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    {
+      fprintf (dump_file, "Updating VEC_PERM statement:\n");
+      fprintf (dump_file, "Old stmt: ");
+      print_gimple_stmt (dump_file, seq1->v_1_stmt, 0);
+    }
+
+  gimple_assign_set_rhs2 (seq1->v_1_stmt, gimple_assign_rhs1 (seq2->v_1_stmt));
+  vectype = TREE_TYPE (gimple_assign_lhs (seq1->v_1_stmt));
+  sel = vect_gen_perm_mask_checked (vectype, seq1_v_1_stmt_indices);
+  gimple_assign_set_rhs3 (seq1->v_1_stmt, sel);
+  update_stmt (seq1->v_1_stmt);
+
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    {
+      fprintf (dump_file, "New stmt: ");
+      print_gimple_stmt (dump_file, seq1->v_1_stmt, 0);
+    }
+
+  /* Adjust seq1->v_2_stmt: copy RHS2 from seq2->v_2_stmt and set new sel.  */
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    {
+      fprintf (dump_file, "Updating VEC_PERM statement:\n");
+      fprintf (dump_file, "Old stmt: ");
+      print_gimple_stmt (dump_file, seq1->v_2_stmt, 0);
+    }
+
+  gimple_assign_set_rhs2 (seq1->v_2_stmt, gimple_assign_rhs1 (seq2->v_2_stmt));
+  vectype = TREE_TYPE (gimple_assign_lhs (seq1->v_2_stmt));
+  sel = vect_gen_perm_mask_checked (vectype, seq1_v_2_stmt_indices);
+  gimple_assign_set_rhs3 (seq1->v_2_stmt, sel);
+  update_stmt (seq1->v_2_stmt);
+
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    {
+      fprintf (dump_file, "New stmt: ");
+      print_gimple_stmt (dump_file, seq1->v_2_stmt, 0);
+    }
+
+  /* At this point, we have four unmodified seq2 stmts, which will be
+     eliminated by DCE.  */
+
+  if (dump_file)
+    fprintf (dump_file, "Vec perm simplify sequences have been blended.\n\n");
+}
+
+/* Try to blend narrowed vec_perm_simplify_seqs pairwise.
+   The provided list will be empty after this call.  */
+
+static void
+process_vec_perm_simplify_seq_list (vec<vec_perm_simplify_seq> *l)
+{
+  unsigned int i, j;
+  vec_perm_simplify_seq seq1, seq2;
+
+  if (l->is_empty ())
+    return;
+
+  if (dump_file && (dump_flags & TDF_DETAILS))
+    fprintf (dump_file, "Processing %u vec perm simplify sequences.\n",
+	     l->length ());
+
+  FOR_EACH_VEC_ELT (*l, i, seq1)
+    {
+      if (i + 1 < l->length ())
+	{
+	  FOR_EACH_VEC_ELT_FROM (*l, j, seq2, i + 1)
+	    {
+	      bool swap = false;
+	      if (can_blend_vec_perm_simplify_seqs_p (seq1, seq2, &swap))
+		{
+		  vec_perm_indices seq2_stmt_indices;
+		  vec_perm_indices seq1_v_1_stmt_indices;
+		  vec_perm_indices seq1_v_2_stmt_indices;
+		  if (calc_perm_vec_perm_simplify_seqs (swap ? seq2 : seq1,
+							swap ? seq1 : seq2,
+							&seq2_stmt_indices,
+							&seq1_v_1_stmt_indices,
+							&seq1_v_2_stmt_indices))
+		    {
+		      /* Narrow lane usage.  */
+		      narrow_vec_perm_simplify_seq (seq1);
+		      narrow_vec_perm_simplify_seq (seq2);
+
+		      /* Blend sequences.  */
+		      blend_vec_perm_simplify_seqs (swap ? seq2 : seq1,
+						    swap ? seq1 : seq2,
+						    seq2_stmt_indices,
+						    seq1_v_1_stmt_indices,
+						    seq1_v_2_stmt_indices);
+
+		      /* We can use unordered_remove as we break the loop.  */
+		      l->unordered_remove (j);
+		      XDELETE (seq2);
+		      break;
+		    }
+		}
+	    }
+	}
+
+      /* We don't need to call l->remove for seq1.  */
+      XDELETE (seq1);
+    }
+
+  l->truncate (0);
+}
+
+/* Add SEQ to the list of simplifiable vec permute sequences,
+   processing the list first if it has run out of space.  */
+
+static void
+append_vec_perm_simplify_seq_list (vec<vec_perm_simplify_seq> *l,
+				   const vec_perm_simplify_seq &seq)
+{
+  /* If no space on list left, then process the list.  */
+  if (!l->space (1))
+    process_vec_perm_simplify_seq_list (l);
+
+  l->quick_push (seq);
+}
+
 /* Main entry point for the forward propagation and statement combine
    optimizer.  */
 
@@ -3526,6 +4086,7 @@ pass_forwprop::execute (function *fun)
   auto_bitmap simple_dce_worklist;
   auto_bitmap need_ab_cleanup;
   to_purge = BITMAP_ALLOC (NULL);
+  auto_vec<vec_perm_simplify_seq, 8> vec_perm_simplify_seq_list;
   for (int i = 0; i < postorder_num; ++i)
     {
       gimple_stmt_iterator gsi;
@@ -3605,14 +4166,18 @@ pass_forwprop::execute (function *fun)
 
       /* Apply forward propagation to all stmts in the basic-block.
 	 Note we update GSI within the loop as necessary.  */
+      unsigned int uid = 1;
       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); )
 	{
 	  gimple *stmt = gsi_stmt (gsi);
 	  tree lhs, rhs;
 	  enum tree_code code;
 
+	  gimple_set_uid (stmt, uid++);
+
 	  if (!is_gimple_assign (stmt))
 	    {
+	      process_vec_perm_simplify_seq_list (&vec_perm_simplify_seq_list);
 	      gsi_next (&gsi);
 	      continue;
 	    }
@@ -3620,9 +4185,11 @@ pass_forwprop::execute (function *fun)
 	  lhs = gimple_assign_lhs (stmt);
 	  rhs = gimple_assign_rhs1 (stmt);
 	  code = gimple_assign_rhs_code (stmt);
+
 	  if (TREE_CODE (lhs) != SSA_NAME
 	      || has_zero_uses (lhs))
 	    {
+	      process_vec_perm_simplify_seq_list (&vec_perm_simplify_seq_list);
 	      gsi_next (&gsi);
 	      continue;
 	    }
@@ -3901,10 +4468,25 @@ pass_forwprop::execute (function *fun)
 	      else
 		gsi_next (&gsi);
 	    }
+	  else if (code == VEC_PERM_EXPR)
+	    {
+	      /* Find vectorized sequences where we can reduce the lane
+		 utilization.  The narrowing will be done later and only
+		 if we find a pair of sequences that can be blended.  */
+	      gassign *assign = dyn_cast <gassign *> (stmt);
+	      vec_perm_simplify_seq seq;
+	      if (recognise_vec_perm_simplify_seq (assign, &seq))
+		append_vec_perm_simplify_seq_list (&vec_perm_simplify_seq_list,
+						   seq);
+
+	      gsi_next (&gsi);
+	    }
 	  else
 	    gsi_next (&gsi);
 	}
 
+      process_vec_perm_simplify_seq_list (&vec_perm_simplify_seq_list);
+
       /* Combine stmts with the stmts defining their operands.
 	 Note we update GSI within the loop as necessary.  */
       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
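For reference (illustration only, not part of the patch): a scalar model of get_vect_selector_index_map, checked against the examples given in its comment. Each selector element maps to the position of its first occurrence, and the return value counts the unique elements:

#include <assert.h>

static unsigned
index_map_model (const unsigned *sel, unsigned nelts, unsigned *map)
{
  unsigned n = 0;
  for (unsigned i = 0; i < nelts; i++)
    {
      /* Find the first occurrence of sel[i] in sel[0..i].  */
      unsigned j = 0;
      while (j < i && sel[j] != sel[i])
	j++;
      map[i] = j;
      /* A first occurrence means one more unique index.  */
      n += (i == j);
    }
  return n;
}

int
main (void)
{
  unsigned map[4];

  unsigned sel1[4] = { 1, 3, 1, 3 };	/* -> <0, 1, 0, 1>, 2 */
  assert (index_map_model (sel1, 4, map) == 2);
  assert (map[0] == 0 && map[1] == 1 && map[2] == 0 && map[3] == 1);

  unsigned sel2[4] = { 3, 2, 1, 0 };	/* -> <0, 1, 2, 3>, 4 */
  assert (index_map_model (sel2, 4, map) == 4);
  assert (map[0] == 0 && map[1] == 1 && map[2] == 2 && map[3] == 3);

  return 0;
}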