From 0fca40f598654f83453b17e44f902183859b16e4 Mon Sep 17 00:00:00 2001 From: Ira Rosen <irar@il.ibm.com> Date: Thu, 28 Aug 2008 11:11:14 +0000 Subject: [PATCH] target.h (struct vectorize): Add new target builtin. * target.h (struct vectorize): Add new target builtin. * tree-vectorizer.c (destroy_loop_vec_info): Call vect_free_slp_instance instead of vect_free_slp_node. * tree-vectorizer.h (enum slp_load_perm_type): New. (struct _slp_instance): Add new fields. (SLP_INSTANCE_LOAD_PERMUTATION): New. (SLP_INSTANCE_LOADS): New. (vect_free_slp_tree): Remove. (vect_free_slp_instance): Declare. (SLP_TREE_LOADS_PERM_TYPE, TARG_VEC_PERMUTE_COST): New. (vectorizable_load): Add argument. (vect_transform_slp_perm_load): New. * tree-vect-analyze.c (vect_analyze_operations): Add an argument to vectorizable_load. (vect_get_place_in_interleaving_chain): New function. (vect_free_slp_tree): Make static. (vect_free_slp_instance): New function. (vect_build_slp_tree): Add new arguments. Allow load permutations and collect the load location in the interleaving chain. (vect_supported_slp_permutation_p): New function. (vect_supported_load_permutation_p): Likewise. (vect_analyze_slp_instance): In case of loads permutation, call vect_supported_load_permutation_p to check that the permutation is supported. * target-def.h (TARGET_VECTORIZE_BUILTIN_VEC_PERM): New. * tree-vect-transform.c (vect_transform_stmt): Add new argument. (vect_create_mask_and_perm): New function. (vect_get_mask_element, vect_transform_slp_perm_load): Likewise. (vectorizable_load): Add an argument. Don't keep the created vector statements in the node if permutation is required. Call vect_transform_slp_perm_load to generate the permutation. (vect_transform_stmt): Add new argument. Call vectorizable_load with additional argument. (vect_schedule_slp_instance): In case of loads permutation, allocate vectorized statements structure for all the related SLP nodes. Call vect_transform_stmt with additional argument. 
(vect_transform_loop): Call vect_transform_stmt with correct arguments. * config/spu/spu.c (spu_builtin_vec_perm): New. (TARGET_VECTORIZE_BUILTIN_VEC_PERM): Redefine. * config/spu/spu.h (TARG_VEC_PERMUTE_COST): Define. * config/rs6000/rs6000.c (rs6000_builtin_vec_perm): New. (TARGET_VECTORIZE_BUILTIN_VEC_PERM): Redefine. From-SVN: r139706 --- gcc/ChangeLog | 45 +++ gcc/config/rs6000/rs6000.c | 37 +++ gcc/config/spu/spu.c | 58 ++++ gcc/config/spu/spu.h | 5 + gcc/target-def.h | 4 +- gcc/target.h | 5 +- gcc/testsuite/ChangeLog | 13 + gcc/testsuite/gcc.dg/vect/slp-perm-1.c | 60 ++++ gcc/testsuite/gcc.dg/vect/slp-perm-2.c | 55 ++++ gcc/testsuite/gcc.dg/vect/slp-perm-3.c | 70 +++++ gcc/testsuite/gcc.dg/vect/slp-perm-4.c | 85 ++++++ gcc/testsuite/gcc.dg/vect/slp-perm-5.c | 77 +++++ gcc/testsuite/gcc.dg/vect/slp-perm-6.c | 77 +++++ gcc/testsuite/gcc.dg/vect/slp-perm-7.c | 76 +++++ gcc/testsuite/gcc.dg/vect/slp-perm-8.c | 57 ++++ gcc/testsuite/gcc.dg/vect/slp-perm-9.c | 58 ++++ gcc/testsuite/lib/target-supports.exp | 22 ++ gcc/tree-vect-analyze.c | 339 ++++++++++++++++----- gcc/tree-vect-transform.c | 405 +++++++++++++++++++++++-- gcc/tree-vectorizer.c | 3 +- gcc/tree-vectorizer.h | 23 +- 21 files changed, 1461 insertions(+), 113 deletions(-) create mode 100644 gcc/testsuite/gcc.dg/vect/slp-perm-1.c create mode 100644 gcc/testsuite/gcc.dg/vect/slp-perm-2.c create mode 100644 gcc/testsuite/gcc.dg/vect/slp-perm-3.c create mode 100644 gcc/testsuite/gcc.dg/vect/slp-perm-4.c create mode 100644 gcc/testsuite/gcc.dg/vect/slp-perm-5.c create mode 100644 gcc/testsuite/gcc.dg/vect/slp-perm-6.c create mode 100644 gcc/testsuite/gcc.dg/vect/slp-perm-7.c create mode 100644 gcc/testsuite/gcc.dg/vect/slp-perm-8.c create mode 100644 gcc/testsuite/gcc.dg/vect/slp-perm-9.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 6430e4f8fd0e..1f7396fa12ac 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,48 @@ +2008-08-28 Ira Rosen <irar@il.ibm.com> + + * target.h (struct vectorize): Add new 
target builtin. + * tree-vectorizer.c (destroy_loop_vec_info): Call + vect_free_slp_instance instead of vect_free_slp_node. + * tree-vectorizer.h (enum slp_load_perm_type): New. + (struct _slp_instance): Add new fields. + (SLP_INSTANCE_LOAD_PERMUTATION): New. + (SLP_INSTANCE_LOADS): New. + (vect_free_slp_tree): Remove. + (vect_free_slp_instance): Declare. + (SLP_TREE_LOADS_PERM_TYPE, TARG_VEC_PERMUTE_COST): New. + (vectorizable_load): Add argument. + (vect_transform_slp_perm_load): New. + * tree-vect-analyze.c (vect_analyze_operations): Add an argument to + vectorizable_load. + (vect_get_place_in_interleaving_chain): New function. + (vect_free_slp_tree): Make static. + (vect_free_slp_instance): New function. + (vect_build_slp_tree): Add new arguments. Allow load permutations and + collect the load location in the interleaving chain. + (vect_supported_slp_permutation_p): New function. + (vect_supported_load_permutation_p): Likewise. + (vect_analyze_slp_instance): In case of loads permutation, call + vect_supported_load_permutation_p to check that the permutation is + supported. + * target-def.h (TARGET_VECTORIZE_BUILTIN_VEC_PERM): New. + * tree-vect-transform.c (vect_transform_stmt): Add new argument. + (vect_create_mask_and_perm): New function. + (vect_get_mask_element, vect_transform_slp_perm_load): Likewise. + (vectorizable_load): Add an argument. Don't keep the created vector + statements in the node if permutation is required. Call + vect_transform_slp_perm_load to generate the permutation. + (vect_transform_stmt): Add new argument. Call vectorizable_load with + additional argument. + (vect_schedule_slp_instance): In case of loads permutation, allocate + vectorized statements structure for all the related SLP nodes. Call + vect_transform_stmt with additional argument. + (vect_transform_loop): Call vect_transform_stmt with correct arguments. + * config/spu/spu.c (spu_builtin_vec_perm): New. + (TARGET_VECTORIZE_BUILTIN_VEC_PERM): Redefine. 
+ * config/spu/spu.h (TARG_VEC_PERMUTE_COST): Define. + * config/rs6000/rs6000.c (rs6000_builtin_vec_perm): New. + (TARGET_VECTORIZE_BUILTIN_VEC_PERM): Redefine. + 2008-08-28 Chris Fairles <chris.fairles@gmail.com> * gthr-posix.h (__gthread_create, __gthread_join, __gthread_detach, diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c index 83997ff2a0df..2124ea3c50d3 100644 --- a/gcc/config/rs6000/rs6000.c +++ b/gcc/config/rs6000/rs6000.c @@ -862,6 +862,7 @@ static tree rs6000_builtin_mask_for_load (void); static tree rs6000_builtin_mul_widen_even (tree); static tree rs6000_builtin_mul_widen_odd (tree); static tree rs6000_builtin_conversion (enum tree_code, tree); +static tree rs6000_builtin_vec_perm (tree, tree *); static void def_builtin (int, const char *, tree, int); static bool rs6000_vector_alignment_reachable (const_tree, bool); @@ -1138,6 +1139,8 @@ static const char alt_reg_names[][8] = #define TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_ODD rs6000_builtin_mul_widen_odd #undef TARGET_VECTORIZE_BUILTIN_CONVERSION #define TARGET_VECTORIZE_BUILTIN_CONVERSION rs6000_builtin_conversion +#undef TARGET_VECTORIZE_BUILTIN_VEC_PERM +#define TARGET_VECTORIZE_BUILTIN_VEC_PERM rs6000_builtin_vec_perm #undef TARGET_VECTOR_ALIGNMENT_REACHABLE #define TARGET_VECTOR_ALIGNMENT_REACHABLE rs6000_vector_alignment_reachable @@ -2080,6 +2083,40 @@ rs6000_vector_alignment_reachable (const_tree type ATTRIBUTE_UNUSED, bool is_pac } } +/* Implement targetm.vectorize.builtin_vec_perm. 
*/ +tree +rs6000_builtin_vec_perm (tree type, tree *mask_element_type) +{ + tree d; + + *mask_element_type = unsigned_char_type_node; + + switch (TYPE_MODE (type)) + { + case V16QImode: + d = rs6000_builtin_decls[ALTIVEC_BUILTIN_VPERM_16QI]; + break; + + case V8HImode: + d = rs6000_builtin_decls[ALTIVEC_BUILTIN_VPERM_8HI]; + break; + + case V4SImode: + d = rs6000_builtin_decls[ALTIVEC_BUILTIN_VPERM_4SI]; + break; + + case V4SFmode: + d = rs6000_builtin_decls[ALTIVEC_BUILTIN_VPERM_4SF]; + break; + + default: + return NULL_TREE; + } + + gcc_assert (d); + return d; +} + /* Handle generic options of the form -mfoo=yes/no. NAME is the option name. VALUE is the option value. diff --git a/gcc/config/spu/spu.c b/gcc/config/spu/spu.c index 1021a918275b..da99d3f108a8 100644 --- a/gcc/config/spu/spu.c +++ b/gcc/config/spu/spu.c @@ -137,6 +137,7 @@ static tree spu_builtin_mul_widen_odd (tree); static tree spu_builtin_mask_for_load (void); static int spu_builtin_vectorization_cost (bool); static bool spu_vector_alignment_reachable (const_tree, bool); +static tree spu_builtin_vec_perm (tree, tree *); static int spu_sms_res_mii (struct ddg *g); extern const char *reg_names[]; @@ -288,6 +289,9 @@ const struct attribute_spec spu_attribute_table[]; #undef TARGET_VECTOR_ALIGNMENT_REACHABLE #define TARGET_VECTOR_ALIGNMENT_REACHABLE spu_vector_alignment_reachable +#undef TARGET_VECTORIZE_BUILTIN_VEC_PERM +#define TARGET_VECTORIZE_BUILTIN_VEC_PERM spu_builtin_vec_perm + #undef TARGET_LIBGCC_CMP_RETURN_MODE #define TARGET_LIBGCC_CMP_RETURN_MODE spu_libgcc_cmp_return_mode @@ -5543,6 +5547,60 @@ spu_vector_alignment_reachable (const_tree type ATTRIBUTE_UNUSED, bool is_packed return true; } +/* Implement targetm.vectorize.builtin_vec_perm. 
*/ +tree +spu_builtin_vec_perm (tree type, tree *mask_element_type) +{ + struct spu_builtin_description *d; + + *mask_element_type = unsigned_char_type_node; + + switch (TYPE_MODE (type)) + { + case V16QImode: + if (TYPE_UNSIGNED (type)) + d = &spu_builtins[SPU_SHUFFLE_0]; + else + d = &spu_builtins[SPU_SHUFFLE_1]; + break; + + case V8HImode: + if (TYPE_UNSIGNED (type)) + d = &spu_builtins[SPU_SHUFFLE_2]; + else + d = &spu_builtins[SPU_SHUFFLE_3]; + break; + + case V4SImode: + if (TYPE_UNSIGNED (type)) + d = &spu_builtins[SPU_SHUFFLE_4]; + else + d = &spu_builtins[SPU_SHUFFLE_5]; + break; + + case V2DImode: + if (TYPE_UNSIGNED (type)) + d = &spu_builtins[SPU_SHUFFLE_6]; + else + d = &spu_builtins[SPU_SHUFFLE_7]; + break; + + case V4SFmode: + d = &spu_builtins[SPU_SHUFFLE_8]; + break; + + case V2DFmode: + d = &spu_builtins[SPU_SHUFFLE_9]; + break; + + default: + return NULL_TREE; + } + + gcc_assert (d); + return d->fndecl; +} + /* Count the total number of instructions in each pipe and return the maximum, which is used as the Minimum Iteration Interval (MII) in the modulo scheduler. get_pipe() will return -2, -1, 0, or 1. diff --git a/gcc/config/spu/spu.h b/gcc/config/spu/spu.h index 86042aacb2fa..b27e9b7b12ed 100644 --- a/gcc/config/spu/spu.h +++ b/gcc/config/spu/spu.h @@ -572,6 +572,11 @@ targetm.resolve_overloaded_builtin = spu_resolve_overloaded_builtin; \ #undef TARG_VEC_STORE_COST #define TARG_VEC_STORE_COST 1 +/* Cost of vector permutation. 
*/ +#ifndef TARG_VEC_PERMUTE_COST +#define TARG_VEC_PERMUTE_COST 1 +#endif + /* Misc */ diff --git a/gcc/target-def.h b/gcc/target-def.h index cd64b386e9ed..18b0eb583172 100644 --- a/gcc/target-def.h +++ b/gcc/target-def.h @@ -364,6 +364,7 @@ #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST 0 #define TARGET_VECTOR_ALIGNMENT_REACHABLE \ default_builtin_vector_alignment_reachable +#define TARGET_VECTORIZE_BUILTIN_VEC_PERM 0 #define TARGET_VECTORIZE \ { \ @@ -373,7 +374,8 @@ TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_EVEN, \ TARGET_VECTORIZE_BUILTIN_MUL_WIDEN_ODD, \ TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST, \ - TARGET_VECTOR_ALIGNMENT_REACHABLE \ + TARGET_VECTOR_ALIGNMENT_REACHABLE, \ + TARGET_VECTORIZE_BUILTIN_VEC_PERM \ } #define TARGET_DEFAULT_TARGET_FLAGS 0 diff --git a/gcc/target.h b/gcc/target.h index 3a104c5632b5..610d7650198a 100644 --- a/gcc/target.h +++ b/gcc/target.h @@ -438,7 +438,10 @@ struct gcc_target /* Return true if vector alignment is reachable (by peeling N iterations) for the given type. */ bool (* vector_alignment_reachable) (const_tree, bool); - } vectorize; + + /* Target builtin that implements vector permute. */ + tree (* builtin_vec_perm) (tree, tree*); +} vectorize; /* The initial value of target_flags. */ int default_target_flags; diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index 707ccb7431d5..d5b6c83a13d0 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,3 +1,16 @@ +2008-08-28 Ira Rosen <irar@il.ibm.com> + + * lib/target-supports.exp (check_effective_target_vect_perm): New. + * gcc.dg/vect/slp-perm-1.c: New testcase. + * gcc.dg/vect/slp-perm-2.c: New testcase. + * gcc.dg/vect/slp-perm-3.c: New testcase. + * gcc.dg/vect/slp-perm-4.c: New testcase. + * gcc.dg/vect/slp-perm-5.c: New testcase. + * gcc.dg/vect/slp-perm-6.c: New testcase. + * gcc.dg/vect/slp-perm-7.c: New testcase. + * gcc.dg/vect/slp-perm-8.c: New testcase. + * gcc.dg/vect/slp-perm-9.c: New testcase. 
+ 2008-08-27 Manuel Lopez-Ibanez <manu@gcc.gnu.org> PR 37217 diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-1.c b/gcc/testsuite/gcc.dg/vect/slp-perm-1.c new file mode 100644 index 000000000000..410758ce2931 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-perm-1.c @@ -0,0 +1,60 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define M00 100 +#define M10 216 +#define M20 23 +#define M01 1322 +#define M11 13 +#define M21 27271 +#define M02 74 +#define M12 191 +#define M22 500 + +#define N 16 + +void foo (unsigned int *__restrict__ pInput, unsigned int *__restrict__ pOutput) +{ + unsigned int i, a, b, c; + + for (i = 0; i < N / 3; i++) + { + a = *pInput++; + b = *pInput++; + c = *pInput++; + + *pOutput++ = M00 * a + M01 * b + M02 * c; + *pOutput++ = M10 * a + M11 * b + M12 * c; + *pOutput++ = M20 * a + M21 * b + M22 * c; + } +} + +int main (int argc, const char* argv[]) +{ + unsigned int input[N], output[N], i; + unsigned int check_results[N] = {1470, 395, 28271, 5958, 1655, 111653, 10446, 2915, 195035, 14934, 4175, 278417, 19422, 5435, 361799, 0}; + + for (i = 0; i < N; i++) + { + input[i] = i%256; + if (input[i] > 200) + abort(); + output[i] = 0; + } + + foo (input, output); + + for (i = 0; i < N; i++) + if (output[i] != check_results[i]) + abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_perm } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-2.c b/gcc/testsuite/gcc.dg/vect/slp-perm-2.c new file mode 100644 index 000000000000..da38a8dd5d65 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-perm-2.c @@ -0,0 +1,55 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define M00 100 +#define M10 216 +#define 
M01 1322 +#define M11 13 +#define M02 74 +#define M12 191 + +#define N 16 + +void foo (unsigned int *__restrict__ pInput, unsigned int *__restrict__ pOutput) +{ + unsigned int i, a, b; + + for (i = 0; i < N / 2; i++) + { + a = *pInput++; + b = *pInput++; + + *pOutput++ = M00 * a + M01 * b; + *pOutput++ = M10 * a + M11 * b; + } +} + +int main (int argc, const char* argv[]) +{ + unsigned int input[N], output[N], i; + unsigned int check_results[N] = {1322, 13, 4166, 471, 7010, 929, 9854, 1387, 12698, 1845, 15542, 2303, 18386, 2761, 21230, 3219}; + + for (i = 0; i < N; i++) + { + input[i] = i%256; + if (input[i] > 200) + abort(); + output[i] = 0; + } + + foo (input, output); + + for (i = 0; i < N; i++) + if (output[i] != check_results[i]) + abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_perm } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-3.c b/gcc/testsuite/gcc.dg/vect/slp-perm-3.c new file mode 100644 index 000000000000..312db31e30cf --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-perm-3.c @@ -0,0 +1,70 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define M00 100 +#define M10 216 +#define M20 23 +#define M30 237 +#define M01 1322 +#define M11 13 +#define M21 27271 +#define M31 2280 +#define M02 74 +#define M12 191 +#define M22 500 +#define M32 111 +#define M03 134 +#define M13 117 +#define M23 11 +#define M33 771 + +#define N 16 + +void foo (unsigned int *__restrict__ pInput, unsigned int *__restrict__ pOutput) +{ + unsigned int i, a, b, c, d; + + for (i = 0; i < N / 4; i++) + { + a = *pInput++; + b = *pInput++; + c = *pInput++; + d = *pInput++; + + *pOutput++ = M00 * a + M01 * b + M02 * c + M03 * d; + *pOutput++ = M10 * a + M11 * b + M12 * c + M13 * d; + *pOutput++ 
= M20 * a + M21 * b + M22 * c + M23 * d; + *pOutput++ = M30 * a + M31 * b + M32 * c + M33 * d; + } +} + +int main (int argc, const char* argv[]) +{ + unsigned int input[N], output[N], i; + unsigned int check_results[N] = {1872, 746, 28304, 4815, 8392, 2894, 139524, 18411, 14912, 5042, 250744, 32007, 21432, 7190, 361964, 45603}; + + for (i = 0; i < N; i++) + { + input[i] = i%256; + if (input[i] > 200) + abort(); + output[i] = 0; + } + + foo (input, output); + + for (i = 0; i < N; i++) + if (output[i] != check_results[i]) + abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_perm } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + + diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-4.c b/gcc/testsuite/gcc.dg/vect/slp-perm-4.c new file mode 100644 index 000000000000..f4db75d000e2 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-perm-4.c @@ -0,0 +1,85 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define M00 100 +#define M10 216 +#define M20 23 +#define M30 237 +#define M40 437 + +#define M01 1322 +#define M11 13 +#define M21 27271 +#define M31 2280 +#define M41 284 + +#define M02 74 +#define M12 191 +#define M22 500 +#define M32 111 +#define M42 1114 + +#define M03 134 +#define M13 117 +#define M23 11 +#define M33 771 +#define M43 71 + +#define M04 334 +#define M14 147 +#define M24 115 +#define M34 7716 +#define M44 16 + +#define N 16 + +void foo (unsigned int *__restrict__ pInput, unsigned int *__restrict__ pOutput) +{ + unsigned int i, a, b, c, d, e; + + for (i = 0; i < N / 5; i++) + { + a = *pInput++; + b = *pInput++; + c = *pInput++; + d = *pInput++; + e = *pInput++; + + *pOutput++ = M00 * a + M01 * b + M02 * c + M03 * d + M04 * e; + *pOutput++ = M10 * a + M11 * b + M12 * c + M13 * d + M14 * e; + *pOutput++ = M20 * a + 
M21 * b + M22 * c + M23 * d + M24 * e; + *pOutput++ = M30 * a + M31 * b + M32 * c + M33 * d + M34 * e; + *pOutput++ = M40 * a + M41 * b + M42 * c + M43 * d + M44 * e; + } +} + +int main (int argc, const char* argv[]) +{ + unsigned int input[N], output[N], i; + unsigned int check_results[N] = {3208, 1334, 28764, 35679, 2789, 13028, 4754, 168364, 91254, 12399, 22848, 8174, 307964, 146829, 22009, 0}; + + for (i = 0; i < N; i++) + { + input[i] = i%256; + if (input[i] > 200) + abort(); + output[i] = 0; + } + + foo (input, output); + + for (i = 0; i < N; i++) + if (output[i] != check_results[i]) + abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 0 "vect" } } */ +/* { dg-final { scan-tree-dump-times "permutation requires at least three vectors" 1 "vect" { target vect_perm } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-5.c b/gcc/testsuite/gcc.dg/vect/slp-perm-5.c new file mode 100644 index 000000000000..ca59a5e014bb --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-perm-5.c @@ -0,0 +1,77 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define M00 100 +#define M10 216 +#define M20 23 +#define M01 1322 +#define M11 13 +#define M21 27271 +#define M02 74 +#define M12 191 +#define M22 500 + +#define K00 405 +#define K10 112 +#define K01 4322 +#define K11 135 + +#define N 16 + +void foo (int *__restrict__ pInput, int *__restrict__ pOutput, + int *__restrict__ pInput2, int *__restrict__ pOutput2) +{ + int i, a, b, c, d, e; + + for (i = 0; i < N / 3; i++) + { + a = *pInput++; + b = *pInput++; + c = *pInput++; + + d = *pInput2++; + e = *pInput2++; + + *pOutput++ = M00 * a + M01 * b + M02 * c; + *pOutput++ = M10 * a + M11 * b + M12 * c; + *pOutput++ = M20 * a + M21 * b + M22 * c; + + *pOutput2++ = K00 * d + K01 * e; + 
*pOutput2++ = K10 * d + K11 * e; + } +} + +int main (int argc, const char* argv[]) +{ + int input[N], output[N], i; + int check_results[N] = {1470, 395, 28271, 5958, 1655, 111653, 10446, 2915, 195035, 14934, 4175, 278417, 19422, 5435, 361799, 0}; + int input2[N], output2[N]; + int check_results2[N] = {4322, 135, 13776, 629, 23230, 1123, 32684, 1617, 42138, 2111, 0, 0, 0, 0, 0, 0}; + + for (i = 0; i < N; i++) + { + input[i] = i%256; + input2[i] = i%256; + output[i] = 0; + output2[i] = 0; + if (input[i] > 256) + abort (); + } + + foo (input, output, input2, output2); + + for (i = 0; i < N; i++) + if (output[i] != check_results[i] || output2[i] != check_results2[i]) + abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target vect_perm } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + + diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-6.c b/gcc/testsuite/gcc.dg/vect/slp-perm-6.c new file mode 100644 index 000000000000..ff9be8aa8e23 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-perm-6.c @@ -0,0 +1,77 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define M00 100 +#define M10 216 +#define M20 23 +#define M01 1322 +#define M11 13 +#define M21 27271 +#define M02 74 +#define M12 191 +#define M22 500 + +#define K00 405 +#define K10 112 +#define K01 4322 +#define K11 135 + +#define N 16 + +void foo (int *__restrict__ pInput, int *__restrict__ pOutput, + int *__restrict__ pInput2, int *__restrict__ pOutput2) +{ + int i, a, b, c, d, e; + + for (i = 0; i < N / 3; i++) + { + a = *pInput++; + b = *pInput++; + c = *pInput++; + + d = *pInput2++; + e = *pInput2++; + + *pOutput++ = M00 * a + M01 * b + M02 * c; + *pOutput++ = M10 * a + M11 * b + M12 * c; + *pOutput++ = M20 * a + M21 * b + M22 * c; + + /* Regular SLP - no permutation required. 
*/ + *pOutput2++ = K00 * d; + *pOutput2++ = K10 * e; + } +} + +int main (int argc, const char* argv[]) +{ + int input[N], output[N], i; + int check_results[N] = {1470, 395, 28271, 5958, 1655, 111653, 10446, 2915, 195035, 14934, 4175, 278417, 19422, 5435, 361799, 0}; + int input2[N], output2[N]; + int check_results2[N] = {0, 112, 810, 336, 1620, 560, 2430, 784, 3240, 1008, 0, 0, 0, 0, 0, 0}; + + for (i = 0; i < N; i++) + { + input[i] = i%256; + input2[i] = i%256; + output[i] = 0; + output2[i] = 0; + if (input[i] > 256) + abort (); + } + + foo (input, output, input2, output2); + + for (i = 0; i < N; i++) + if (output[i] != check_results[i] || output2[i] != check_results2[i]) + abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 2 "vect" { target vect_perm } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-7.c b/gcc/testsuite/gcc.dg/vect/slp-perm-7.c new file mode 100644 index 000000000000..0065407e238f --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-perm-7.c @@ -0,0 +1,76 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define M00 100 +#define M10 216 +#define M20 23 +#define M01 1322 +#define M11 13 +#define M21 27271 +#define M02 74 +#define M12 191 +#define M22 500 + +#define K00 405 +#define K10 112 +#define K01 4322 +#define K11 135 + +#define N 16 + +/* SLP with load permutation and loop-based vectorization. 
*/ +void foo (int *__restrict__ pInput, int *__restrict__ pOutput, + int *__restrict__ pInput2, int *__restrict__ pOutput2) +{ + int i, a, b, c, d; + + for (i = 0; i < N / 3; i++) + { + a = *pInput++; + b = *pInput++; + c = *pInput++; + d = *pInput2++; + + *pOutput++ = M00 * a + M01 * b + M02 * c; + *pOutput++ = M10 * a + M11 * b + M12 * c; + *pOutput++ = M20 * a + M21 * b + M22 * c; + + /* Loop-based vectorization. */ + *pOutput2++ = K00 * d; + } +} + +int main (int argc, const char* argv[]) +{ + int input[N], output[N], i; + int check_results[N] = {1470, 395, 28271, 5958, 1655, 111653, 10446, 2915, 195035, 14934, 4175, 278417, 19422, 5435, 361799, 0}; + int input2[N], output2[N]; + int check_results2[N] = {0, 405, 810, 1215, 1620, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + + for (i = 0; i < N; i++) + { + input[i] = i%256; + input2[i] = i%256; + output[i] = 0; + output2[i] = 0; + if (input[i] > 200) + abort (); + } + + foo (input, output, input2, output2); + + for (i = 0; i < N; i++) + if (output[i] != check_results[i] || output2[i] != check_results2[i]) + abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" { target vect_perm } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_perm } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + + diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-8.c b/gcc/testsuite/gcc.dg/vect/slp-perm-8.c new file mode 100644 index 000000000000..8c60d44ed0c4 --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-perm-8.c @@ -0,0 +1,57 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define N 200 + +void foo (unsigned char *__restrict__ pInput, unsigned char *__restrict__ pOutput) +{ + unsigned char i, a, b, c; + + for (i = 0; i < N / 3; i++) + { + a = *pInput++; + b = *pInput++; + c = *pInput++; + + *pOutput++ = a + b + c + 3; + *pOutput++ = a + b + c + 12; + *pOutput++ = a + b + c + 
1; + } +} + +int main (int argc, const char* argv[]) +{ + unsigned char input[N], output[N], i; + unsigned char check_results[N]; + + for (i = 0; i < N; i++) + { + input[i] = i; + output[i] = 0; + if (input[i] > 256) + abort (); + } + + for (i = 0; i < N / 3; i++) + { + check_results[3*i] = 9 * i + 6; + check_results[3*i+1] = 9 * i + 15; + check_results[3*i+2] = 9 * i + 4; + } + + foo (input, output); + + for (i = 0; i < N - (N % 3); i++) + if (output[i] != check_results[i]) + abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" { target vect_perm } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 1 "vect" { target vect_perm } } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/gcc.dg/vect/slp-perm-9.c b/gcc/testsuite/gcc.dg/vect/slp-perm-9.c new file mode 100644 index 000000000000..964e691cc68f --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/slp-perm-9.c @@ -0,0 +1,58 @@ +/* { dg-require-effective-target vect_int } */ + +#include <stdarg.h> +#include <stdio.h> +#include "tree-vect.h" + +#define N 200 + +void foo (unsigned short *__restrict__ pInput, unsigned short *__restrict__ pOutput) +{ + unsigned short i, a, b, c; + + for (i = 0; i < N / 3; i++) + { + a = *pInput++; + b = *pInput++; + c = *pInput++; + + *pOutput++ = a + b + c + 3; + *pOutput++ = a + b + c + 12; + *pOutput++ = a + b + c + 1; + } +} + +int main (int argc, const char* argv[]) +{ + unsigned short input[N], output[N], i; + unsigned short check_results[N]; + + for (i = 0; i < N; i++) + { + input[i] = i; + output[i] = 0; + if (input[i] > 256) + abort (); + } + + for (i = 0; i < N / 3; i++) + { + check_results[3*i] = 9 * i + 6; + check_results[3*i+1] = 9 * i + 15; + check_results[3*i+2] = 9 * i + 4; + } + + foo (input, output); + + for (i = 0; i < N - (N % 3); i++) + if (output[i] != check_results[i]) + abort (); + + return 0; +} + +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 1 "vect" } } 
*/ +/* { dg-final { scan-tree-dump-times "permutation requires at least three vectors" 1 "vect" { target vect_perm } } } */ +/* { dg-final { scan-tree-dump-times "vectorizing stmts using SLP" 0 "vect" } } */ +/* { dg-final { cleanup-tree-dump "vect" } } */ + diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-supports.exp index 69c8ea438c65..e508038dfdbd 100644 --- a/gcc/testsuite/lib/target-supports.exp +++ b/gcc/testsuite/lib/target-supports.exp @@ -1608,6 +1608,28 @@ proc check_effective_target_vect_no_bitwise { } { return $et_vect_no_bitwise_saved } +# Return 1 if the target plus current options supports vector permutation, +# 0 otherwise. +# +# This won't change for different subtargets so cache the result. + +proc check_effective_target_vect_perm { } { + global et_vect_perm_saved + + if [info exists et_vect_perm_saved] { + verbose "check_effective_target_vect_perm: using cached result" 2 + } else { + set et_vect_perm_saved 0 + if { [istarget powerpc*-*-*] + || [istarget spu-*-*] } { + set et_vect_perm_saved 1 + } + } + verbose "check_effective_target_vect_perm: returning $et_vect_perm_saved" 2 + return $et_vect_perm_saved +} + + # Return 1 if the target plus current options supports a vector # widening summation of *short* args into *int* result, 0 otherwise. 
# A target can also support this widening summation if it can support diff --git a/gcc/tree-vect-analyze.c b/gcc/tree-vect-analyze.c index 305ba4c66037..c672d7affecb 100644 --- a/gcc/tree-vect-analyze.c +++ b/gcc/tree-vect-analyze.c @@ -486,7 +486,7 @@ vect_analyze_operations (loop_vec_info loop_vinfo) || vectorizable_conversion (stmt, NULL, NULL, NULL) || vectorizable_operation (stmt, NULL, NULL, NULL) || vectorizable_assignment (stmt, NULL, NULL, NULL) - || vectorizable_load (stmt, NULL, NULL, NULL) + || vectorizable_load (stmt, NULL, NULL, NULL, NULL) || vectorizable_call (stmt, NULL, NULL) || vectorizable_store (stmt, NULL, NULL, NULL) || vectorizable_condition (stmt, NULL, NULL) @@ -846,6 +846,31 @@ vect_analyze_scalar_cycles (loop_vec_info loop_vinfo) } +/* Find the place of the data-ref in STMT in the interleaving chain that starts + from FIRST_STMT. Return -1 if the data-ref is not a part of the chain. */ + +static int +vect_get_place_in_interleaving_chain (gimple stmt, gimple first_stmt) +{ + gimple next_stmt = first_stmt; + int result = 0; + + if (first_stmt != DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt))) + return -1; + + while (next_stmt && next_stmt != stmt) + { + result++; + next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt)); + } + + if (next_stmt) + return result; + else + return -1; +} + + /* Function vect_insert_into_interleaving_chain. Insert DRA into the interleaving chain of DRB according to DRA's INIT. */ @@ -2482,7 +2507,7 @@ vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo) /* Recursively free the memory allocated for the SLP tree rooted at NODE. */ -void +static void vect_free_slp_tree (slp_tree node) { if (!node) @@ -2503,6 +2528,17 @@ vect_free_slp_tree (slp_tree node) } +/* Free the memory allocated for the SLP instance. 
*/ + +void +vect_free_slp_instance (slp_instance instance) +{ + vect_free_slp_tree (SLP_INSTANCE_TREE (instance)); + VEC_free (int, heap, SLP_INSTANCE_LOAD_PERMUTATION (instance)); + VEC_free (slp_tree, heap, SLP_INSTANCE_LOADS (instance)); +} + + /* Get the defs for the rhs of STMT (collect them in DEF_STMTS0/1), check that they are of a legal type and that they match the defs of the first stmt of the SLP group (stored in FIRST_STMT_...). */ @@ -2705,7 +2741,9 @@ static bool vect_build_slp_tree (loop_vec_info loop_vinfo, slp_tree *node, unsigned int group_size, int *inside_cost, int *outside_cost, - int ncopies_for_cost, unsigned int *max_nunits) + int ncopies_for_cost, unsigned int *max_nunits, + VEC (int, heap) **load_permutation, + VEC (slp_tree, heap) **loads) { VEC (gimple, heap) *def_stmts0 = VEC_alloc (gimple, heap, group_size); VEC (gimple, heap) *def_stmts1 = VEC_alloc (gimple, heap, group_size); @@ -2716,7 +2754,6 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, slp_tree *node, enum tree_code first_stmt_code = 0, rhs_code; tree first_stmt_def1_type = NULL_TREE, first_stmt_def0_type = NULL_TREE; tree lhs; - gimple prev_stmt = NULL; bool stop_recursion = false, need_same_oprnds = false; tree vectype, scalar_type, first_op1 = NULL_TREE; unsigned int vectorization_factor = 0, ncopies; @@ -2728,6 +2765,9 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, slp_tree *node, struct data_reference *first_dr; bool pattern0 = false, pattern1 = false; HOST_WIDE_INT dummy; + bool permutation = false; + unsigned int load_place; + gimple first_load; /* For every stmt in NODE find its def stmt/s. 
*/ for (i = 0; VEC_iterate (gimple, stmts, i, stmt); i++) @@ -2813,8 +2853,8 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, slp_tree *node, if (icode == CODE_FOR_nothing) { if (vect_print_dump_info (REPORT_SLP)) - fprintf (vect_dump, - "Build SLP failed: op not supported by target."); + fprintf (vect_dump, "Build SLP failed: " + "op not supported by target."); return false; } optab_op2_mode = insn_data[icode].operand[2].mode; @@ -2878,70 +2918,60 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, slp_tree *node, else { /* Load. */ - if (i == 0) - { - /* First stmt of the SLP group should be the first load of - the interleaving loop if data permutation is not allowed. - Check that there is no gap between the loads. */ - if (DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) != stmt - || DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 0) - { - /* FORNOW: data permutations and gaps in loads are not - supported. */ - if (vect_print_dump_info (REPORT_SLP)) - { - fprintf (vect_dump, "Build SLP failed: strided " - " loads need permutation or have gaps "); - print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); - } - - return false; - } - - first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)); - if (vect_supportable_dr_alignment (first_dr) - == dr_unaligned_unsupported) - { - if (vect_print_dump_info (REPORT_SLP)) - { - fprintf (vect_dump, "Build SLP failed: unsupported " - " unaligned load "); - print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); - } - - return false; - } - - /* Analyze costs (for the first stmt in the group). */ - vect_model_load_cost (vinfo_for_stmt (stmt), - ncopies_for_cost, *node); - } - else - { - /* Check that we have consecutive loads from interleaving - chain and that there is no gap between the loads. */ - if (DR_GROUP_NEXT_DR (vinfo_for_stmt (prev_stmt)) != stmt - || DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 1) - { - /* FORNOW: data permutations and gaps in loads are not - supported. 
*/ - if (vect_print_dump_info (REPORT_SLP)) - { - fprintf (vect_dump, "Build SLP failed: strided " - " loads need permutation or have gaps "); - print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); - } - return false; - } - } - - prev_stmt = stmt; - - /* We stop the tree when we reach a group of loads. */ - stop_recursion = true; - continue; - } - } /* Strided access. */ + /* FORNOW: Check that there is no gap between the loads. */ + if ((DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) == stmt + && DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 0) + || (DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)) != stmt + && DR_GROUP_GAP (vinfo_for_stmt (stmt)) != 1)) + { + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, "Build SLP failed: strided " + "loads have gaps "); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } + + return false; + } + + first_load = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)); + + if (first_load == stmt) + { + first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)); + if (vect_supportable_dr_alignment (first_dr) + == dr_unaligned_unsupported) + { + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, "Build SLP failed: unsupported " + "unaligned load "); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } + + return false; + } + + /* Analyze costs (for the first stmt in the group). */ + vect_model_load_cost (vinfo_for_stmt (stmt), + ncopies_for_cost, *node); + } + + /* Store the place of this load in the interleaving chain. In + case that permutation is needed we later decide if a specific + permutation is supported. */ + load_place = vect_get_place_in_interleaving_chain (stmt, + first_load); + if (load_place != i) + permutation = true; + + VEC_safe_push (int, heap, *load_permutation, load_place); + + /* We stop the tree when we reach a group of loads. */ + stop_recursion = true; + continue; + } + } /* Strided access. 
*/ else { if (TREE_CODE_CLASS (rhs_code) == tcc_reference) @@ -2990,7 +3020,15 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, slp_tree *node, /* Strided loads were reached - stop the recursion. */ if (stop_recursion) - return true; + { + if (permutation) + { + VEC_safe_push (slp_tree, heap, *loads, *node); + *inside_cost += TARG_VEC_PERMUTE_COST * group_size; + } + + return true; + } /* Create SLP_TREE nodes for the definition node/s. */ if (first_stmt_dt0 == vect_loop_def) @@ -3003,8 +3041,8 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, slp_tree *node, SLP_TREE_OUTSIDE_OF_LOOP_COST (left_node) = 0; SLP_TREE_INSIDE_OF_LOOP_COST (left_node) = 0; if (!vect_build_slp_tree (loop_vinfo, &left_node, group_size, - inside_cost, outside_cost, - ncopies_for_cost, max_nunits)) + inside_cost, outside_cost, ncopies_for_cost, + max_nunits, load_permutation, loads)) return false; SLP_TREE_LEFT (*node) = left_node; @@ -3020,8 +3058,8 @@ vect_build_slp_tree (loop_vec_info loop_vinfo, slp_tree *node, SLP_TREE_OUTSIDE_OF_LOOP_COST (right_node) = 0; SLP_TREE_INSIDE_OF_LOOP_COST (right_node) = 0; if (!vect_build_slp_tree (loop_vinfo, &right_node, group_size, - inside_cost, outside_cost, - ncopies_for_cost, max_nunits)) + inside_cost, outside_cost, ncopies_for_cost, + max_nunits, load_permutation, loads)) return false; SLP_TREE_RIGHT (*node) = right_node; @@ -3076,6 +3114,116 @@ vect_mark_slp_stmts (slp_tree node, enum slp_vect_type mark, int j) } +/* Check if the permutation required by the SLP INSTANCE is supported. + Reorganize the SLP nodes stored in SLP_INSTANCE_LOADS if needed. 
*/ + +static bool +vect_supported_slp_permutation_p (slp_instance instance) +{ + slp_tree node = VEC_index (slp_tree, SLP_INSTANCE_LOADS (instance), 0); + gimple stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (node), 0); + gimple first_load = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt)); + VEC (slp_tree, heap) *sorted_loads = NULL; + int index; + slp_tree *tmp_loads = NULL; + int group_size = SLP_INSTANCE_GROUP_SIZE (instance), i, j; + slp_tree load; + + /* FORNOW: The only supported loads permutation is loads from the same + location in all the loads in the node, when the data-refs in + nodes of LOADS constitute an interleaving chain. + Sort the nodes according to the order of accesses in the chain. */ + tmp_loads = (slp_tree *) xmalloc (sizeof (slp_tree) * group_size); + for (i = 0, j = 0; + VEC_iterate (int, SLP_INSTANCE_LOAD_PERMUTATION (instance), i, index) + && VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (instance), j, load); + i += group_size, j++) + { + gimple scalar_stmt = VEC_index (gimple, SLP_TREE_SCALAR_STMTS (load), 0); + /* Check that the loads are all in the same interleaving chain. */ + if (DR_GROUP_FIRST_DR (vinfo_for_stmt (scalar_stmt)) != first_load) + { + if (vect_print_dump_info (REPORT_DETAILS)) + { + fprintf (vect_dump, "Build SLP failed: unsupported data " + "permutation "); + print_gimple_stmt (vect_dump, scalar_stmt, 0, TDF_SLIM); + } + + free (tmp_loads); + return false; + } + + tmp_loads[index] = load; + } + + sorted_loads = VEC_alloc (slp_tree, heap, group_size); + for (i = 0; i < group_size; i++) + VEC_safe_push (slp_tree, heap, sorted_loads, tmp_loads[i]); + + VEC_free (slp_tree, heap, SLP_INSTANCE_LOADS (instance)); + SLP_INSTANCE_LOADS (instance) = sorted_loads; + free (tmp_loads); + + if (!vect_transform_slp_perm_load (stmt, NULL, NULL, + SLP_INSTANCE_UNROLLING_FACTOR (instance), + instance, true)) + return false; + + return true; +} + + +/* Check if the required load permutation is supported. 
+ LOAD_PERMUTATION contains a list of indices of the loads. + In SLP this permutation is relative to the order of strided stores that are + the base of the SLP instance. */ + +static bool +vect_supported_load_permutation_p (slp_instance slp_instn, int group_size, + VEC (int, heap) *load_permutation) +{ + int i = 0, j, prev = -1, next, k; + bool supported; + + /* FORNOW: permutations are only supported for loop-aware SLP. */ + if (!slp_instn) + return false; + + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, "Load permutation "); + for (i = 0; VEC_iterate (int, load_permutation, i, next); i++) + fprintf (vect_dump, "%d ", next); + } + + /* FORNOW: the only supported permutation is 0..01..1.. of length equal to + GROUP_SIZE and where each sequence of same drs is of GROUP_SIZE length as + well. */ + supported = true; + for (j = 0; j < group_size; j++) + { + for (i = j * group_size, k = 0; + VEC_iterate (int, load_permutation, i, next) && k < group_size; + i++, k++) + { + if (i != j * group_size && next != prev) + { + supported = false; + break; + } + + prev = next; + } + } + + if (supported && i == group_size * group_size + && vect_supported_slp_permutation_p (slp_instn)) + return true; + + return false; +} + /* Analyze an SLP instance starting from a group of strided stores. Call vect_build_slp_tree to build a tree of packed stmts if possible. Return FALSE if it's impossible to SLP any stmt in the loop. 
*/ @@ -3093,8 +3241,11 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, gimple stmt) bool slp_impossible = false; int inside_cost = 0, outside_cost = 0, ncopies_for_cost; unsigned int max_nunits = 0; - - scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))); + VEC (int, heap) *load_permutation; + VEC (slp_tree, heap) *loads; + + scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF ( + vinfo_for_stmt (stmt)))); vectype = get_vectype_for_scalar_type (scalar_type); if (!vectype) { @@ -3134,16 +3285,21 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, gimple stmt) factor (number of vectors is 1 if NUNITS >= GROUP_SIZE, and is GROUP_SIZE / NUNITS otherwise. */ ncopies_for_cost = unrolling_factor * group_size / nunits; + + load_permutation = VEC_alloc (int, heap, group_size * group_size); + loads = VEC_alloc (slp_tree, heap, group_size); /* Build the tree for the SLP instance. */ if (vect_build_slp_tree (loop_vinfo, &node, group_size, &inside_cost, - &outside_cost, ncopies_for_cost, &max_nunits)) + &outside_cost, ncopies_for_cost, &max_nunits, + &load_permutation, &loads)) { /* Create a new SLP instance. */ new_instance = XNEW (struct _slp_instance); SLP_INSTANCE_TREE (new_instance) = node; SLP_INSTANCE_GROUP_SIZE (new_instance) = group_size; - /* Calculate the unrolling factor based on the smallest type. */ + /* Calculate the unrolling factor based on the smallest type in the + loop. 
*/ if (max_nunits > nunits) unrolling_factor = least_common_multiple (max_nunits, group_size) / group_size; @@ -3151,6 +3307,27 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, gimple stmt) SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor; SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (new_instance) = outside_cost; SLP_INSTANCE_INSIDE_OF_LOOP_COST (new_instance) = inside_cost; + SLP_INSTANCE_LOADS (new_instance) = loads; + SLP_INSTANCE_LOAD_PERMUTATION (new_instance) = load_permutation; + if (VEC_length (slp_tree, loads)) + { + if (!vect_supported_load_permutation_p (new_instance, group_size, + load_permutation)) + { + if (vect_print_dump_info (REPORT_SLP)) + { + fprintf (vect_dump, "Build SLP failed: unsupported load " + "permutation "); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } + + vect_free_slp_instance (new_instance); + return false; + } + } + else + VEC_free (int, heap, SLP_INSTANCE_LOAD_PERMUTATION (new_instance)); + VEC_safe_push (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo), new_instance); if (vect_print_dump_info (REPORT_SLP)) @@ -3162,7 +3339,9 @@ vect_analyze_slp_instance (loop_vec_info loop_vinfo, gimple stmt) /* Failed to SLP. */ /* Free the allocated memory. */ vect_free_slp_tree (node); - + VEC_free (int, heap, load_permutation); + VEC_free (slp_tree, heap, loads); + if (slp_impossible) return false; diff --git a/gcc/tree-vect-transform.c b/gcc/tree-vect-transform.c index 0744527794dd..3a77c5bab1c0 100644 --- a/gcc/tree-vect-transform.c +++ b/gcc/tree-vect-transform.c @@ -47,7 +47,7 @@ along with GCC; see the file COPYING3. If not see /* Utility functions for the code transformation. 
*/ static bool vect_transform_stmt (gimple, gimple_stmt_iterator *, bool *, - slp_tree); + slp_tree, slp_instance); static tree vect_create_destination_var (tree, tree); static tree vect_create_data_ref_ptr (gimple, struct loop*, tree, tree *, gimple *, bool, bool *); @@ -936,7 +936,7 @@ vect_create_addr_base_for_vector_ref (gimple stmt, base_offset = force_gimple_operand (base_offset, &seq, false, tmp); gimple_seq_add_seq (new_stmt_list, seq); } - + /* base + base_offset */ addr_base = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (data_ref_base), data_ref_base, base_offset); @@ -5962,6 +5962,313 @@ vect_transform_strided_load (gimple stmt, VEC(tree,heap) *dr_chain, int size, } +/* Create NCOPIES permutation statements using the mask MASK_BYTES (by + building a vector of type MASK_TYPE from it) and two input vectors placed in + DR_CHAIN at FIRST_VEC_INDX and SECOND_VEC_INDX for the first copy and + shifting by STRIDE elements of DR_CHAIN for every copy. + (STRIDE is the number of vectorized stmts for NODE divided by the number of + copies). + VECT_STMTS_COUNTER specifies the index in the vectorized stmts of NODE, where + the created stmts must be inserted. */ + +static inline void +vect_create_mask_and_perm (gimple stmt, gimple next_scalar_stmt, + int *mask_array, int mask_nunits, + tree mask_element_type, tree mask_type, + int first_vec_indx, int second_vec_indx, + gimple_stmt_iterator *gsi, slp_tree node, + tree builtin_decl, tree vectype, + VEC(tree,heap) *dr_chain, + int ncopies, int vect_stmts_counter) +{ + tree t = NULL_TREE, mask_vec, mask, perm_dest; + gimple perm_stmt = NULL; + stmt_vec_info next_stmt_info; + int i, group_size, stride, dr_chain_size; + tree first_vec, second_vec, data_ref; + tree sym; + ssa_op_iter iter; + VEC (tree, heap) *params = NULL; + + /* Create a vector mask. 
*/ + for (i = mask_nunits - 1; i >= 0; --i) + t = tree_cons (NULL_TREE, build_int_cst (mask_element_type, mask_array[i]), + t); + + mask_vec = build_vector (mask_type, t); + mask = vect_init_vector (stmt, mask_vec, mask_type, NULL); + + group_size = VEC_length (gimple, SLP_TREE_SCALAR_STMTS (node)); + stride = SLP_TREE_NUMBER_OF_VEC_STMTS (node) / ncopies; + dr_chain_size = VEC_length (tree, dr_chain); + + /* Initialize the vect stmts of NODE to properly insert the generated + stmts later. */ + for (i = VEC_length (gimple, SLP_TREE_VEC_STMTS (node)); + i < (int) SLP_TREE_NUMBER_OF_VEC_STMTS (node); i++) + VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (node), NULL); + + perm_dest = vect_create_destination_var (gimple_assign_lhs (stmt), vectype); + for (i = 0; i < ncopies; i++) + { + first_vec = VEC_index (tree, dr_chain, first_vec_indx); + second_vec = VEC_index (tree, dr_chain, second_vec_indx); + + /* Build argument list for the vectorized call. */ + VEC_free (tree, heap, params); + params = VEC_alloc (tree, heap, 3); + VEC_quick_push (tree, params, first_vec); + VEC_quick_push (tree, params, second_vec); + VEC_quick_push (tree, params, mask); + + /* Generate the permute statement. */ + perm_stmt = gimple_build_call_vec (builtin_decl, params); + data_ref = make_ssa_name (perm_dest, perm_stmt); + gimple_call_set_lhs (perm_stmt, data_ref); + vect_finish_stmt_generation (stmt, perm_stmt, gsi); + FOR_EACH_SSA_TREE_OPERAND (sym, perm_stmt, iter, SSA_OP_ALL_VIRTUALS) + { + if (TREE_CODE (sym) == SSA_NAME) + sym = SSA_NAME_VAR (sym); + mark_sym_for_renaming (sym); + } + + /* Store the vector statement in NODE. */ + VEC_replace (gimple, SLP_TREE_VEC_STMTS (node), + stride * i + vect_stmts_counter, perm_stmt); + + first_vec_indx += stride; + second_vec_indx += stride; + } + + /* Mark the scalar stmt as vectorized. 
*/ + next_stmt_info = vinfo_for_stmt (next_scalar_stmt); + STMT_VINFO_VEC_STMT (next_stmt_info) = perm_stmt; +} + + +/* Given FIRST_MASK_ELEMENT - the mask element in element representation, + return in CURRENT_MASK_ELEMENT its equivalent in target specific + representation. Check that the mask is valid and return FALSE if not. + Return TRUE in NEED_NEXT_VECTOR if the permutation requires to move to + the next vector, i.e., the current first vector is not needed. */ + +static bool +vect_get_mask_element (gimple stmt, int first_mask_element, int m, + int mask_nunits, bool only_one_vec, int index, + int *mask, int *current_mask_element, + bool *need_next_vector) +{ + int i; + static int number_of_mask_fixes = 1; + static bool mask_fixed = false; + static bool needs_first_vector = false; + + /* Convert to target specific representation. */ + *current_mask_element = first_mask_element + m; + /* Adjust the value in case it's a mask for second and third vectors. */ + *current_mask_element -= mask_nunits * (number_of_mask_fixes - 1); + + if (*current_mask_element < mask_nunits) + needs_first_vector = true; + + /* We have only one input vector to permute but the mask accesses values in + the next vector as well. */ + if (only_one_vec && *current_mask_element >= mask_nunits) + { + if (vect_print_dump_info (REPORT_DETAILS)) + { + fprintf (vect_dump, "permutation requires at least two vectors "); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } + + return false; + } + + /* The mask requires the next vector. */ + if (*current_mask_element >= mask_nunits * 2) + { + if (needs_first_vector || mask_fixed) + { + /* We either need the first vector too or have already moved to the + next vector. In both cases, this permutation needs three + vectors. 
*/ + if (vect_print_dump_info (REPORT_DETAILS)) + { + fprintf (vect_dump, "permutation requires at " + "least three vectors "); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } + + return false; + } + + /* We move to the next vector, dropping the first one and working with + the second and the third - we need to adjust the values of the mask + accordingly. */ + *current_mask_element -= mask_nunits * number_of_mask_fixes; + + for (i = 0; i < index; i++) + mask[i] -= mask_nunits * number_of_mask_fixes; + + (number_of_mask_fixes)++; + mask_fixed = true; + } + + *need_next_vector = mask_fixed; + + /* This was the last element of this mask. Start a new one. */ + if (index == mask_nunits - 1) + { + number_of_mask_fixes = 1; + mask_fixed = false; + needs_first_vector = false; + } + + return true; +} + + +/* Generate vector permute statements from a list of loads in DR_CHAIN. + If ANALYZE_ONLY is TRUE, only check that it is possible to create valid + permute statements for SLP_NODE_INSTANCE. 
*/ +bool +vect_transform_slp_perm_load (gimple stmt, VEC (tree, heap) *dr_chain, + gimple_stmt_iterator *gsi, int vf, + slp_instance slp_node_instance, bool analyze_only) +{ + stmt_vec_info stmt_info = vinfo_for_stmt (stmt); + tree mask_element_type = NULL_TREE, mask_type; + int i, j, k, m, scale, mask_nunits, nunits, vec_index = 0, scalar_index; + slp_tree node; + tree vectype = STMT_VINFO_VECTYPE (stmt_info), builtin_decl; + gimple next_scalar_stmt; + int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance); + int first_mask_element; + int index, unroll_factor, *mask, current_mask_element, ncopies; + bool only_one_vec = false, need_next_vector = false; + int first_vec_index, second_vec_index, orig_vec_stmts_num, vect_stmts_counter; + + if (!targetm.vectorize.builtin_vec_perm) + { + if (vect_print_dump_info (REPORT_DETAILS)) + { + fprintf (vect_dump, "no builtin for vect permute for "); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } + + return false; + } + + builtin_decl = targetm.vectorize.builtin_vec_perm (vectype, + &mask_element_type); + if (!builtin_decl || !mask_element_type) + { + if (vect_print_dump_info (REPORT_DETAILS)) + { + fprintf (vect_dump, "no builtin for vect permute for "); + print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); + } + + return false; + } + + mask_type = get_vectype_for_scalar_type (mask_element_type); + mask_nunits = TYPE_VECTOR_SUBPARTS (mask_type); + mask = (int *) xmalloc (sizeof (int) * mask_nunits); + nunits = TYPE_VECTOR_SUBPARTS (vectype); + scale = mask_nunits / nunits; + unroll_factor = SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance); + + /* The number of vector stmts to generate based only on SLP_NODE_INSTANCE + unrolling factor. */ + orig_vec_stmts_num = group_size * + SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance) / nunits; + if (orig_vec_stmts_num == 1) + only_one_vec = true; + + /* Number of copies is determined by the final vectorization factor + relatively to SLP_NODE_INSTANCE unrolling factor. 
*/ + ncopies = vf / SLP_INSTANCE_UNROLLING_FACTOR (slp_node_instance); + + /* Generate permutation masks for every NODE. Number of masks for each NODE + is equal to GROUP_SIZE. + E.g., we have a group of three nodes with three loads from the same + location in each node, and the vector size is 4. I.e., we have a + a0b0c0a1b1c1... sequence and we need to create the following vectors: + for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3 + for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3 + ... + + The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9} (in target + specific type, e.g., in bytes for Altivec). + The last mask is illegal since we assume two operands for permute + operation, and the mask element values can't be outside that range. Hence, + the last mask must be converted into {2,5,5,5}. + For the first two permutations we need the first and the second input + vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation + we need the second and the third vectors: {b1,c1,a2,b2} and + {c2,a3,b3,c3}. 
*/ + + for (i = 0; + VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (slp_node_instance), + i, node); + i++) + { + scalar_index = 0; + index = 0; + vect_stmts_counter = 0; + vec_index = 0; + first_vec_index = vec_index++; + if (only_one_vec) + second_vec_index = first_vec_index; + else + second_vec_index = vec_index++; + + for (j = 0; j < unroll_factor; j++) + { + for (k = 0; k < group_size; k++) + { + first_mask_element = (i + j * group_size) * scale; + for (m = 0; m < scale; m++) + { + if (!vect_get_mask_element (stmt, first_mask_element, m, + mask_nunits, only_one_vec, index, mask, + &current_mask_element, &need_next_vector)) + return false; + + mask[index++] = current_mask_element; + } + + if (index == mask_nunits) + { + index = 0; + if (!analyze_only) + { + if (need_next_vector) + { + first_vec_index = second_vec_index; + second_vec_index = vec_index; + } + + next_scalar_stmt = VEC_index (gimple, + SLP_TREE_SCALAR_STMTS (node), scalar_index++); + + vect_create_mask_and_perm (stmt, next_scalar_stmt, + mask, mask_nunits, mask_element_type, mask_type, + first_vec_index, second_vec_index, gsi, node, + builtin_decl, vectype, dr_chain, ncopies, + vect_stmts_counter++); + } + } + } + } + } + + free (mask); + return true; +} + /* vectorizable_load. 
Check if STMT reads a non scalar data-ref (array/pointer/structure) that @@ -5972,7 +6279,7 @@ vect_transform_strided_load (gimple stmt, VEC(tree,heap) *dr_chain, int size, bool vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt, - slp_tree slp_node) + slp_tree slp_node, slp_instance slp_node_instance) { tree scalar_dest; tree vec_dest = NULL; @@ -6008,6 +6315,7 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt, struct loop *at_loop; int vec_num; bool slp = (slp_node != NULL); + bool slp_perm = false; enum tree_code code; /* Multiple types in SLP are handled by creating the appropriate number of @@ -6028,6 +6336,9 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt, return false; } + if (slp && SLP_INSTANCE_LOAD_PERMUTATION (slp_node_instance)) + slp_perm = true; + if (!STMT_VINFO_RELEVANT_P (stmt_info)) return false; @@ -6397,33 +6708,47 @@ vectorizable_load (gimple stmt, gimple_stmt_iterator *gsi, gimple *vec_stmt, /* Collect vector loads and later create their permutation in vect_transform_strided_load (). */ - if (strided_load) + if (strided_load || slp_perm) VEC_quick_push (tree, dr_chain, new_temp); /* Store vector loads in the corresponding SLP_NODE. 
*/ - if (slp) + if (slp && !slp_perm) VEC_quick_push (gimple, SLP_TREE_VEC_STMTS (slp_node), new_stmt); } - if (slp) + if (slp && !slp_perm) continue; - if (strided_load) - { - if (!vect_transform_strided_load (stmt, dr_chain, group_size, gsi)) - return false; - *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info); - VEC_free (tree, heap, dr_chain); - dr_chain = VEC_alloc (tree, heap, group_size); - } + if (slp_perm) + { + if (!vect_transform_slp_perm_load (stmt, dr_chain, gsi, + LOOP_VINFO_VECT_FACTOR (loop_vinfo), + slp_node_instance, false)) + { + VEC_free (tree, heap, dr_chain); + return false; + } + } else - { - if (j == 0) - STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt; - else - STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; - prev_stmt_info = vinfo_for_stmt (new_stmt); - } + { + if (strided_load) + { + if (!vect_transform_strided_load (stmt, dr_chain, group_size, gsi)) + return false; + + *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info); + VEC_free (tree, heap, dr_chain); + dr_chain = VEC_alloc (tree, heap, group_size); + } + else + { + if (j == 0) + STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt; + else + STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; + prev_stmt_info = vinfo_for_stmt (new_stmt); + } + } } if (dr_chain) @@ -6690,7 +7015,8 @@ vectorizable_condition (gimple stmt, gimple_stmt_iterator *gsi, static bool vect_transform_stmt (gimple stmt, gimple_stmt_iterator *gsi, - bool *strided_store, slp_tree slp_node) + bool *strided_store, slp_tree slp_node, + slp_instance slp_node_instance) { bool is_store = false; gimple vec_stmt = NULL; @@ -6732,7 +7058,8 @@ vect_transform_stmt (gimple stmt, gimple_stmt_iterator *gsi, break; case load_vec_info_type: - done = vectorizable_load (stmt, gsi, &vec_stmt, slp_node); + done = vectorizable_load (stmt, gsi, &vec_stmt, slp_node, + slp_node_instance); gcc_assert (done); break; @@ -7807,6 +8134,8 @@ vect_schedule_slp_instance (slp_tree node, slp_instance instance, stmt_vec_info stmt_info; unsigned int 
vec_stmts_size, nunits, group_size; tree vectype; + int i; + slp_tree loads_node; if (!node) return false; @@ -7830,8 +8159,28 @@ vect_schedule_slp_instance (slp_tree node, slp_instance instance, size. */ vec_stmts_size = (vectorization_factor * group_size) / nunits; - SLP_TREE_VEC_STMTS (node) = VEC_alloc (gimple, heap, vec_stmts_size); - SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size; + /* In case of load permutation we have to allocate vectorized statements for + all the nodes that participate in that permutation. */ + if (SLP_INSTANCE_LOAD_PERMUTATION (instance)) + { + for (i = 0; + VEC_iterate (slp_tree, SLP_INSTANCE_LOADS (instance), i, loads_node); + i++) + { + if (!SLP_TREE_VEC_STMTS (loads_node)) + { + SLP_TREE_VEC_STMTS (loads_node) = VEC_alloc (gimple, heap, + vec_stmts_size); + SLP_TREE_NUMBER_OF_VEC_STMTS (loads_node) = vec_stmts_size; + } + } + } + + if (!SLP_TREE_VEC_STMTS (node)) + { + SLP_TREE_VEC_STMTS (node) = VEC_alloc (gimple, heap, vec_stmts_size); + SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size; + } if (vect_print_dump_info (REPORT_DETAILS)) { @@ -7840,7 +8189,7 @@ vect_schedule_slp_instance (slp_tree node, slp_instance instance, } si = gsi_for_stmt (stmt); - is_store = vect_transform_stmt (stmt, &si, &strided_store, node); + is_store = vect_transform_stmt (stmt, &si, &strided_store, node, instance); if (is_store) { if (DR_GROUP_FIRST_DR (stmt_info)) @@ -7980,7 +8329,7 @@ vect_transform_loop (loop_vec_info loop_vinfo) { if (vect_print_dump_info (REPORT_DETAILS)) fprintf (vect_dump, "transform phi."); - vect_transform_stmt (phi, NULL, NULL, NULL); + vect_transform_stmt (phi, NULL, NULL, NULL, NULL); } } @@ -8059,7 +8408,7 @@ vect_transform_loop (loop_vec_info loop_vinfo) fprintf (vect_dump, "transform statement."); strided_store = false; - is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL); + is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL, NULL); if (is_store) { if (STMT_VINFO_STRIDED_ACCESS 
(stmt_info)) diff --git a/gcc/tree-vectorizer.c b/gcc/tree-vectorizer.c index 437b145db0f6..cdab0b54df0d 100644 --- a/gcc/tree-vectorizer.c +++ b/gcc/tree-vectorizer.c @@ -1802,7 +1802,8 @@ destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts) VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)); slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo); for (j = 0; VEC_iterate (slp_instance, slp_instances, j, instance); j++) - vect_free_slp_tree (SLP_INSTANCE_TREE (instance)); + vect_free_slp_instance (instance); + VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo)); VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo)); diff --git a/gcc/tree-vectorizer.h b/gcc/tree-vectorizer.h index 10e7aa309004..678dc59da729 100644 --- a/gcc/tree-vectorizer.h +++ b/gcc/tree-vectorizer.h @@ -105,6 +105,8 @@ typedef struct _slp_tree { } cost; } *slp_tree; +DEF_VEC_P(slp_tree); +DEF_VEC_ALLOC_P(slp_tree, heap); /* SLP instance is a sequence of stmts in a loop that can be packed into SIMD stmts. */ @@ -124,6 +126,13 @@ typedef struct _slp_instance { int outside_of_loop; /* Statements generated outside loop. */ int inside_of_loop; /* Statements generated inside loop. */ } cost; + + /* Loads permutation relatively to the stores, NULL if there is no + permutation. */ + VEC (int, heap) *load_permutation; + + /* The group of nodes that contain loads of this SLP instance. 
*/ + VEC (slp_tree, heap) *loads; } *slp_instance; DEF_VEC_P(slp_instance); @@ -135,6 +144,8 @@ DEF_VEC_ALLOC_P(slp_instance, heap); #define SLP_INSTANCE_UNROLLING_FACTOR(S) (S)->unrolling_factor #define SLP_INSTANCE_OUTSIDE_OF_LOOP_COST(S) (S)->cost.outside_of_loop #define SLP_INSTANCE_INSIDE_OF_LOOP_COST(S) (S)->cost.inside_of_loop +#define SLP_INSTANCE_LOAD_PERMUTATION(S) (S)->load_permutation +#define SLP_INSTANCE_LOADS(S) (S)->loads #define SLP_TREE_LEFT(S) (S)->left #define SLP_TREE_RIGHT(S) (S)->right @@ -522,6 +533,11 @@ typedef struct _stmt_vec_info { #define TARG_VEC_STORE_COST 1 #endif +/* Cost of vector permutation. */ +#ifndef TARG_VEC_PERMUTE_COST +#define TARG_VEC_PERMUTE_COST 1 +#endif + /* The maximum number of intermediate steps required in multi-step type conversion. */ #define MAX_INTERM_CVT_STEPS 3 @@ -700,7 +716,7 @@ extern void free_stmt_vec_info (gimple stmt); /** In tree-vect-analyze.c **/ /* Driver for analysis stage. */ extern loop_vec_info vect_analyze_loop (struct loop *); -extern void vect_free_slp_tree (slp_tree); +extern void vect_free_slp_instance (slp_instance); extern loop_vec_info vect_analyze_loop_form (struct loop *); extern tree vect_get_smallest_scalar_type (gimple, HOST_WIDE_INT *, HOST_WIDE_INT *); @@ -716,7 +732,7 @@ void vect_pattern_recog (loop_vec_info); /** In tree-vect-transform.c **/ extern bool vectorizable_load (gimple, gimple_stmt_iterator *, gimple *, - slp_tree); + slp_tree, slp_instance); extern bool vectorizable_store (gimple, gimple_stmt_iterator *, gimple *, slp_tree); extern bool vectorizable_operation (gimple, gimple_stmt_iterator *, gimple *, @@ -742,6 +758,9 @@ extern void vect_model_simple_cost (stmt_vec_info, int, enum vect_def_type *, extern void vect_model_store_cost (stmt_vec_info, int, enum vect_def_type, slp_tree); extern void vect_model_load_cost (stmt_vec_info, int, slp_tree); +extern bool vect_transform_slp_perm_load (gimple, VEC (tree, heap) *, + gimple_stmt_iterator *, int, slp_instance, 
bool); + /* Driver for transformation stage. */ extern void vect_transform_loop (loop_vec_info); -- GitLab