diff --git a/gcc/ChangeLog b/gcc/ChangeLog index e9a8e0703391bec50ad2ada295a5274b1b411422..e774b5855da4cfb585e6e766e9fdadc8c3f771df 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,5 +1,35 @@ 2011-10-12 Jakub Jelinek <jakub@redhat.com> + * config/i386/i386.md (UNSPEC_VPERMDI): Remove. + * config/i386/i386.c (ix86_expand_vec_perm): Handle + V16QImode and V32QImode for TARGET_AVX2. + (MAX_VECT_LEN): Increase to 32. + (expand_vec_perm_blend): Add support for 32-byte integer + vectors with TARGET_AVX2. + (valid_perm_using_mode_p): New function. + (expand_vec_perm_pshufb): Add support for 32-byte integer + vectors with TARGET_AVX2. + (expand_vec_perm_vpshufb2_vpermq): New function. + (expand_vec_perm_vpshufb2_vpermq_even_odd): New function. + (expand_vec_perm_even_odd_1): Handle 32-byte integer vectors + with TARGET_AVX2. + (ix86_expand_vec_perm_builtin_1): Try expand_vec_perm_vpshufb2_vpermq + and expand_vec_perm_vpshufb2_vpermq_even_odd. + * config/i386/sse.md (VEC_EXTRACT_EVENODD_MODE): Add for TARGET_AVX2 + 32-byte integer vector modes. + (vec_pack_trunc_<mode>): Use VI248_AVX2 instead of VI248_128. + (avx2_interleave_highv32qi, avx2_interleave_lowv32qi): Remove pasto. + (avx2_pshufdv3, avx2_pshuflwv3, avx2_pshufhwv3): Generate + 4 new operands. + (avx2_pshufd_1, avx2_pshuflw_1, avx2_pshufhw_1): Don't use + match_dup, instead add 4 new operands and require they have + right cross-lane values. + (avx2_permv4di): Change into define_expand. + (avx2_permv4di_1): New instruction. + (avx2_permv2ti): Use nonimmediate_operand instead of register_operand + for "xm" constrained operand. + (VEC_PERM_AVX2): Add V32QI and V16QI for TARGET_AVX2. + * config/i386/sse.md (avx2_gathersi<mode>, avx2_gatherdi<mode>, avx2_gatherdi<mode>256): Add clobber of match_scratch, change memory_operand to register_operand, diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index f73a96912cc9396541d3c0eebd015f0d729041f7..26a49241619253954f5485bc3a42f1cea5fc6a95 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -19334,7 +19334,7 @@ ix86_expand_vec_perm (rtx operands[]) rtx op0 = operands[1]; rtx op1 = operands[2]; rtx mask = operands[3]; - rtx t1, t2, vt, vec[16]; + rtx t1, t2, t3, t4, vt, vt2, vec[32]; enum machine_mode mode = GET_MODE (op0); enum machine_mode maskmode = GET_MODE (mask); int w, e, i; @@ -19343,50 +19343,68 @@ ix86_expand_vec_perm (rtx operands[]) /* Number of elements in the vector. */ w = GET_MODE_NUNITS (mode); e = GET_MODE_UNIT_SIZE (mode); - gcc_assert (w <= 16); + gcc_assert (w <= 32); if (TARGET_AVX2) { - if (mode == V4DImode || mode == V4DFmode) + if (mode == V4DImode || mode == V4DFmode || mode == V16HImode) { /* Unfortunately, the VPERMQ and VPERMPD instructions only support an constant shuffle operand. With a tiny bit of effort we can use VPERMD instead. A re-interpretation stall for V4DFmode is - unfortunate but there's no avoiding it. */ - t1 = gen_reg_rtx (V8SImode); + unfortunate but there's no avoiding it. + Similarly for V16HImode we don't have instructions for variable + shuffling, while for V32QImode we can use after preparing suitable + masks vpshufb; vpshufb; vpermq; vpor. */ + + if (mode == V16HImode) + { + maskmode = mode = V32QImode; + w = 32; + e = 1; + } + else + { + maskmode = mode = V8SImode; + w = 8; + e = 4; + } + t1 = gen_reg_rtx (maskmode); /* Replicate the low bits of the V4DImode mask into V8SImode: mask = { A B C D } t1 = { A A B B C C D D }. 
*/ - for (i = 0; i < 4; ++i) + for (i = 0; i < w / 2; ++i) vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2); - vt = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, vec)); - vt = force_reg (V8SImode, vt); - mask = gen_lowpart (V8SImode, mask); - emit_insn (gen_avx2_permvarv8si (t1, vt, mask)); + vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec)); + vt = force_reg (maskmode, vt); + mask = gen_lowpart (maskmode, mask); + if (maskmode == V8SImode) + emit_insn (gen_avx2_permvarv8si (t1, vt, mask)); + else + emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt)); /* Multiply the shuffle indicies by two. */ - emit_insn (gen_avx2_lshlv8si3 (t1, t1, const1_rtx)); + t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1, + OPTAB_DIRECT); /* Add one to the odd shuffle indicies: t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */ - for (i = 0; i < 4; ++i) + for (i = 0; i < w / 2; ++i) { vec[i * 2] = const0_rtx; vec[i * 2 + 1] = const1_rtx; } - vt = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, vec)); - vt = force_const_mem (V8SImode, vt); - emit_insn (gen_addv8si3 (t1, t1, vt)); + vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec)); + vt = force_const_mem (maskmode, vt); + t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1, + OPTAB_DIRECT); - /* Continue as if V8SImode was used initially. */ + /* Continue as if V8SImode (resp. V32QImode) was used initially. */ operands[3] = mask = t1; - target = gen_lowpart (V8SImode, target); - op0 = gen_lowpart (V8SImode, op0); - op1 = gen_lowpart (V8SImode, op1); - maskmode = mode = V8SImode; - w = 8; - e = 4; + target = gen_lowpart (mode, target); + op0 = gen_lowpart (mode, op0); + op1 = gen_lowpart (mode, op1); } switch (mode) @@ -19443,6 +19461,92 @@ ix86_expand_vec_perm (rtx operands[]) emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx)); return; + case V32QImode: + t1 = gen_reg_rtx (V32QImode); + t2 = gen_reg_rtx (V32QImode); + t3 = gen_reg_rtx (V32QImode); + vt2 = GEN_INT (128); + for (i = 0; i < 32; i++) + vec[i] = vt2; + vt = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec)); + vt = force_reg (V32QImode, vt); + for (i = 0; i < 32; i++) + vec[i] = i < 16 ? vt2 : const0_rtx; + vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec)); + vt2 = force_reg (V32QImode, vt2); + /* From mask create two adjusted masks, which contain the same + bits as mask in the low 7 bits of each vector element. + The first mask will have the most significant bit clear + if it requests element from the same 128-bit lane + and MSB set if it requests element from the other 128-bit lane. + The second mask will have the opposite values of the MSB, + and additionally will have its 128-bit lanes swapped. + E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have + t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and + t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ... + stands for other 12 bytes. */ + /* The bit whether element is from the same lane or the other + lane is bit 4, so shift it up by 3 to the MSB position. */ + emit_insn (gen_avx2_lshlv4di3 (gen_lowpart (V4DImode, t1), + gen_lowpart (V4DImode, mask), + GEN_INT (3))); + /* Clear MSB bits from the mask just in case it had them set. */ + emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask)); + /* After this t1 will have MSB set for elements from other lane. */ + emit_insn (gen_xorv32qi3 (t1, t1, vt2)); + /* Clear bits other than MSB. */ + emit_insn (gen_andv32qi3 (t1, t1, vt)); + /* Or in the lower bits from mask into t3. 
*/ + emit_insn (gen_iorv32qi3 (t3, t1, t2)); + /* And invert MSB bits in t1, so MSB is set for elements from the same + lane. */ + emit_insn (gen_xorv32qi3 (t1, t1, vt)); + /* Swap 128-bit lanes in t3. */ + emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3), + gen_lowpart (V4DImode, t3), + const2_rtx, GEN_INT (3), + const0_rtx, const1_rtx)); + /* And or in the lower bits from mask into t1. */ + emit_insn (gen_iorv32qi3 (t1, t1, t2)); + if (one_operand_shuffle) + { + /* Each of these shuffles will put 0s in places where + element from the other 128-bit lane is needed, otherwise + will shuffle in the requested value. */ + emit_insn (gen_avx2_pshufbv32qi3 (t3, op0, t3)); + emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1)); + /* For t3 the 128-bit lanes are swapped again. */ + emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3), + gen_lowpart (V4DImode, t3), + const2_rtx, GEN_INT (3), + const0_rtx, const1_rtx)); + /* And oring both together leads to the result. */ + emit_insn (gen_iorv32qi3 (target, t1, t3)); + return; + } + + t4 = gen_reg_rtx (V32QImode); + /* Similarly to the above one_operand_shuffle code, + just for repeated twice for each operand. merge_two: + code will merge the two results together. */ + emit_insn (gen_avx2_pshufbv32qi3 (t4, op0, t3)); + emit_insn (gen_avx2_pshufbv32qi3 (t3, op1, t3)); + emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1)); + emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1)); + emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t4), + gen_lowpart (V4DImode, t4), + const2_rtx, GEN_INT (3), + const0_rtx, const1_rtx)); + emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, t3), + gen_lowpart (V4DImode, t3), + const2_rtx, GEN_INT (3), + const0_rtx, const1_rtx)); + emit_insn (gen_iorv32qi3 (t4, t2, t4)); + emit_insn (gen_iorv32qi3 (t3, t1, t3)); + t1 = t4; + t2 = t3; + goto merge_two; + default: gcc_assert (GET_MODE_SIZE (mode) <= 16); break; @@ -31773,9 +31877,9 @@ x86_emit_floatuns (rtx operands[2]) emit_label (donelab); } -/* AVX does not support 32-byte integer vector operations, - thus the longest vector we are faced with is V16QImode. */ -#define MAX_VECT_LEN 16 +/* AVX2 does support 32-byte integer vector operations, + thus the longest vector we are faced with is V32QImode. */ +#define MAX_VECT_LEN 32 struct expand_vec_perm_d { @@ -34582,7 +34686,7 @@ expand_vselect_vconcat (rtx target, rtx op0, rtx op1, } /* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D - in terms of blendp[sd] / pblendw / pblendvb. */ + in terms of blendp[sd] / pblendw / pblendvb / vpblendd. */ static bool expand_vec_perm_blend (struct expand_vec_perm_d *d) @@ -34590,10 +34694,17 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d) enum machine_mode vmode = d->vmode; unsigned i, mask, nelt = d->nelt; rtx target, op0, op1, x; + rtx rperm[32], vperm; - if (!TARGET_SSE4_1 || d->op0 == d->op1) + if (d->op0 == d->op1) return false; - if (!(GET_MODE_SIZE (vmode) == 16 || vmode == V4DFmode || vmode == V8SFmode)) + if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32) + ; + else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode)) + ; + else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16) + ; + else return false; /* This is a blend, not a permute. Elements must stay in their @@ -34611,30 +34722,6 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d) /* ??? Without SSE4.1, we could implement this with and/andn/or. This decision should be extracted elsewhere, so that we only try that sequence once all budget==3 options have been tried. 
*/ - - /* For bytes, see if bytes move in pairs so we can use pblendw with - an immediate argument, rather than pblendvb with a vector argument. */ - if (vmode == V16QImode) - { - bool pblendw_ok = true; - for (i = 0; i < 16 && pblendw_ok; i += 2) - pblendw_ok = (d->perm[i] + 1 == d->perm[i + 1]); - - if (!pblendw_ok) - { - rtx rperm[16], vperm; - - for (i = 0; i < nelt; ++i) - rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx); - - vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm)); - vperm = force_reg (V16QImode, vperm); - - emit_insn (gen_sse4_1_pblendvb (d->target, d->op0, d->op1, vperm)); - return true; - } - } - target = d->target; op0 = d->op0; op1 = d->op1; @@ -34647,6 +34734,7 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d) case V2DFmode: case V4SFmode: case V8HImode: + case V8SImode: for (i = 0; i < nelt; ++i) mask |= (d->perm[i] >= nelt) << i; break; @@ -34654,24 +34742,122 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d) case V2DImode: for (i = 0; i < 2; ++i) mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4); + vmode = V8HImode; goto do_subreg; case V4SImode: for (i = 0; i < 4; ++i) mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2); + vmode = V8HImode; goto do_subreg; case V16QImode: + /* See if bytes move in pairs so we can use pblendw with + an immediate argument, rather than pblendvb with a vector + argument. */ + for (i = 0; i < 16; i += 2) + if (d->perm[i] + 1 != d->perm[i + 1]) + { + use_pblendvb: + for (i = 0; i < nelt; ++i) + rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx); + + finish_pblendvb: + vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm)); + vperm = force_reg (vmode, vperm); + + if (GET_MODE_SIZE (vmode) == 16) + emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm)); + else + emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm)); + return true; + } + for (i = 0; i < 8; ++i) mask |= (d->perm[i * 2] >= 16) << i; + vmode = V8HImode; + /* FALLTHRU */ do_subreg: - vmode = V8HImode; target = gen_lowpart (vmode, target); op0 = gen_lowpart (vmode, op0); op1 = gen_lowpart (vmode, op1); break; + case V32QImode: + /* See if bytes move in pairs. If not, vpblendvb must be used. */ + for (i = 0; i < 32; i += 2) + if (d->perm[i] + 1 != d->perm[i + 1]) + goto use_pblendvb; + /* See if bytes move in quadruplets. If yes, vpblendd + with immediate can be used. */ + for (i = 0; i < 32; i += 4) + if (d->perm[i] + 2 != d->perm[i + 2]) + break; + if (i < 32) + { + /* See if bytes move the same in both lanes. If yes, + vpblendw with immediate can be used. */ + for (i = 0; i < 16; i += 2) + if (d->perm[i] + 16 != d->perm[i + 16]) + goto use_pblendvb; + + /* Use vpblendw. */ + for (i = 0; i < 16; ++i) + mask |= (d->perm[i * 2] >= 32) << i; + vmode = V16HImode; + goto do_subreg; + } + + /* Use vpblendd. */ + for (i = 0; i < 8; ++i) + mask |= (d->perm[i * 4] >= 32) << i; + vmode = V8SImode; + goto do_subreg; + + case V16HImode: + /* See if words move in pairs. If yes, vpblendd can be used. */ + for (i = 0; i < 16; i += 2) + if (d->perm[i] + 1 != d->perm[i + 1]) + break; + if (i < 16) + { + /* See if words move the same in both lanes. If not, + vpblendvb must be used. */ + for (i = 0; i < 8; i++) + if (d->perm[i] + 8 != d->perm[i + 8]) + { + /* Use vpblendvb. */ + for (i = 0; i < 32; ++i) + rperm[i] = (d->perm[i / 2] < 16 ? 
const0_rtx : constm1_rtx); + + vmode = V32QImode; + nelt = 32; + target = gen_lowpart (vmode, target); + op0 = gen_lowpart (vmode, op0); + op1 = gen_lowpart (vmode, op1); + goto finish_pblendvb; + } + + /* Use vpblendw. */ + for (i = 0; i < 16; ++i) + mask |= (d->perm[i] >= 16) << i; + break; + } + + /* Use vpblendd. */ + for (i = 0; i < 8; ++i) + mask |= (d->perm[i * 2] >= 16) << i; + vmode = V8SImode; + goto do_subreg; + + case V4DImode: + /* Use vpblendd. */ + for (i = 0; i < 4; ++i) + mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2); + vmode = V8SImode; + goto do_subreg; + default: gcc_unreachable (); } @@ -34732,43 +34918,164 @@ expand_vec_perm_vpermil (struct expand_vec_perm_d *d) return true; } -/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D - in terms of pshufb or vpperm. */ +/* Return true if permutation D can be performed as VMODE permutation + instead. */ static bool -expand_vec_perm_pshufb (struct expand_vec_perm_d *d) +valid_perm_using_mode_p (enum machine_mode vmode, struct expand_vec_perm_d *d) { - unsigned i, nelt, eltsz; - rtx rperm[16], vperm, target, op0, op1; + unsigned int i, j, chunk; - if (!(d->op0 == d->op1 ? TARGET_SSSE3 : TARGET_XOP)) - return false; - if (GET_MODE_SIZE (d->vmode) != 16) + if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT + || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT + || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode)) return false; - if (d->testing_p) + if (GET_MODE_NUNITS (vmode) >= d->nelt) return true; + chunk = d->nelt / GET_MODE_NUNITS (vmode); + for (i = 0; i < d->nelt; i += chunk) + if (d->perm[i] & (chunk - 1)) + return false; + else + for (j = 1; j < chunk; ++j) + if ((d->perm[i] & (d->nelt - 1)) + j + != (d->perm[i + j] & (d->nelt - 1))) + return false; + + return true; +} + +/* A subroutine of ix86_expand_vec_perm_builtin_1. Try to implement D + in terms of pshufb, vpperm, vpermq, vpermd or vperm2i128. */ + +static bool +expand_vec_perm_pshufb (struct expand_vec_perm_d *d) +{ + unsigned i, nelt, eltsz, mask; + unsigned char perm[32]; + enum machine_mode vmode = V16QImode; + rtx rperm[32], vperm, target, op0, op1; + nelt = d->nelt; - eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode)); - for (i = 0; i < nelt; ++i) + if (d->op0 != d->op1) { - unsigned j, e = d->perm[i]; - for (j = 0; j < eltsz; ++j) - rperm[i * eltsz + j] = GEN_INT (e * eltsz + j); + if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16) + { + if (TARGET_AVX2 + && valid_perm_using_mode_p (V2TImode, d)) + { + if (d->testing_p) + return true; + + /* Use vperm2i128 insn. The pattern uses + V4DImode instead of V2TImode. */ + target = gen_lowpart (V4DImode, d->target); + op0 = gen_lowpart (V4DImode, d->op0); + op1 = gen_lowpart (V4DImode, d->op1); + rperm[0] + = GEN_INT (((d->perm[0] & (nelt / 2)) ? 1 : 0) + || ((d->perm[nelt / 2] & (nelt / 2)) ? 2 : 0)); + emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0])); + return true; + } + return false; + } } + else + { + if (GET_MODE_SIZE (d->vmode) == 16) + { + if (!TARGET_SSSE3) + return false; + } + else if (GET_MODE_SIZE (d->vmode) == 32) + { + if (!TARGET_AVX2) + return false; - vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm)); - vperm = force_reg (V16QImode, vperm); + /* V4DImode should be already handled through + expand_vselect by vpermq instruction. 
*/ + gcc_assert (d->vmode != V4DImode); - target = gen_lowpart (V16QImode, d->target); - op0 = gen_lowpart (V16QImode, d->op0); + vmode = V32QImode; + if (d->vmode == V8SImode + || d->vmode == V16HImode + || d->vmode == V32QImode) + { + /* First see if vpermq can be used for + V8SImode/V16HImode/V32QImode. */ + if (valid_perm_using_mode_p (V4DImode, d)) + { + for (i = 0; i < 4; i++) + perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3; + if (d->testing_p) + return true; + return expand_vselect (gen_lowpart (V4DImode, d->target), + gen_lowpart (V4DImode, d->op0), + perm, 4); + } + + /* Next see if vpermd can be used. */ + if (valid_perm_using_mode_p (V8SImode, d)) + vmode = V8SImode; + } + + if (vmode == V32QImode) + { + /* vpshufb only works intra lanes, it is not + possible to shuffle bytes in between the lanes. */ + for (i = 0; i < nelt; ++i) + if ((d->perm[i] ^ i) & (nelt / 2)) + return false; + } + } + else + return false; + } + + if (d->testing_p) + return true; + + if (vmode == V8SImode) + for (i = 0; i < 8; ++i) + rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7); + else + { + eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode)); + if (d->op0 != d->op1) + mask = 2 * nelt - 1; + else if (vmode == V16QImode) + mask = nelt - 1; + else + mask = nelt / 2 - 1; + + for (i = 0; i < nelt; ++i) + { + unsigned j, e = d->perm[i] & mask; + for (j = 0; j < eltsz; ++j) + rperm[i * eltsz + j] = GEN_INT (e * eltsz + j); + } + } + + vperm = gen_rtx_CONST_VECTOR (vmode, + gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm)); + vperm = force_reg (vmode, vperm); + + target = gen_lowpart (vmode, d->target); + op0 = gen_lowpart (vmode, d->op0); if (d->op0 == d->op1) - emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm)); + { + if (vmode == V16QImode) + emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm)); + else if (vmode == V32QImode) + emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm)); + } else { - op1 = gen_lowpart (V16QImode, d->op1); + op1 = gen_lowpart (vmode, d->op1); emit_insn (gen_xop_pperm (target, op0, op1, vperm)); } @@ -34856,7 +35163,8 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) if (expand_vec_perm_vpermil (d)) return true; - /* Try the SSSE3 pshufb or XOP vpperm variable permutation. */ + /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128, + vpshufb, vpermd or vpermq variable permutation. */ if (expand_vec_perm_pshufb (d)) return true; @@ -35156,6 +35464,150 @@ expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d) return true; } +/* Implement arbitrary permutation of one V32QImode and V16QImode operand + with two vpshufb insns, vpermq and vpor. We should have already failed + all two or three instruction sequences. */ + +static bool +expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d) +{ + rtx rperm[2][32], vperm, l, h, hp, op, m128; + unsigned int i, nelt, eltsz; + + if (!TARGET_AVX2 + || d->op0 != d->op1 + || (d->vmode != V32QImode && d->vmode != V16HImode)) + return false; + + nelt = d->nelt; + eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode)); + + /* Generate two permutation masks. If the required element is within + the same lane, it is shuffled in. If the required element from the + other lane, force a zero by setting bit 7 in the permutation mask. + In the other mask the mask has non-negative elements if element + is requested from the other lane, but also moved to the other lane, + so that the result of vpshufb can have the two V2TImode halves + swapped. 
*/ + m128 = GEN_INT (-128); + for (i = 0; i < nelt; ++i) + { + unsigned j, e = d->perm[i] & (nelt / 2 - 1); + unsigned which = ((d->perm[i] ^ i) & (nelt / 2)); + + for (j = 0; j < eltsz; ++j) + { + rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j); + rperm[!which][(i * eltsz + j) ^ (which ^ (nelt / 2))] = m128; + } + } + + vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1])); + vperm = force_reg (V32QImode, vperm); + + h = gen_reg_rtx (V32QImode); + op = gen_lowpart (V32QImode, d->op0); + emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm)); + + /* Swap the 128-byte lanes of h into hp. */ + hp = gen_reg_rtx (V32QImode); + op = gen_lowpart (V4DImode, h); + emit_insn (gen_avx2_permv4di_1 (gen_lowpart (V4DImode, hp), op, + const2_rtx, GEN_INT (3), const0_rtx, + const1_rtx)); + + vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0])); + vperm = force_reg (V32QImode, vperm); + + l = gen_reg_rtx (V32QImode); + op = gen_lowpart (V32QImode, d->op0); + emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm)); + + op = gen_lowpart (V32QImode, d->target); + emit_insn (gen_iorv32qi3 (op, l, hp)); + + return true; +} + +/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even + and extract-odd permutations of two V32QImode and V16QImode operand + with two vpshufb insns, vpor and vpermq. We should have already + failed all two or three instruction sequences. */ + +static bool +expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d) +{ + rtx rperm[2][32], vperm, l, h, ior, op, m128; + unsigned int i, nelt, eltsz; + + if (!TARGET_AVX2 + || d->op0 == d->op1 + || (d->vmode != V32QImode && d->vmode != V16HImode)) + return false; + + for (i = 0; i < d->nelt; ++i) + if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2)) + return false; + + if (d->testing_p) + return true; + + nelt = d->nelt; + eltsz = GET_MODE_SIZE (GET_MODE_INNER (d->vmode)); + + /* Generate two permutation masks. In the first permutation mask + the first quarter will contain indexes for the first half + of the op0, the second quarter will contain bit 7 set, third quarter + will contain indexes for the second half of the op0 and the + last quarter bit 7 set. In the second permutation mask + the first quarter will contain bit 7 set, the second quarter + indexes for the first half of the op1, the third quarter bit 7 set + and last quarter indexes for the second half of the op1. + I.e. the first mask e.g. for V32QImode extract even will be: + 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128 + (all values masked with 0xf except for -128) and second mask + for extract even will be + -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */ + m128 = GEN_INT (-128); + for (i = 0; i < nelt; ++i) + { + unsigned j, e = d->perm[i] & (nelt / 2 - 1); + unsigned which = d->perm[i] >= nelt; + unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 
24 : 0; + + for (j = 0; j < eltsz; ++j) + { + rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j); + rperm[1 - which][(i * eltsz + j) ^ xorv] = m128; + } + } + + vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0])); + vperm = force_reg (V32QImode, vperm); + + l = gen_reg_rtx (V32QImode); + op = gen_lowpart (V32QImode, d->op0); + emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm)); + + vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1])); + vperm = force_reg (V32QImode, vperm); + + h = gen_reg_rtx (V32QImode); + op = gen_lowpart (V32QImode, d->op0); + emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm)); + + ior = gen_reg_rtx (V32QImode); + emit_insn (gen_iorv32qi3 (ior, l, h)); + + /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */ + op = gen_lowpart (V4DImode, d->target); + ior = gen_lowpart (V4DImode, ior); + emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx, + const1_rtx, GEN_INT (3))); + + return true; +} + /* A subroutine of ix86_expand_vec_perm_builtin_1. Implement extract-even and extract-odd permutations. */ @@ -35265,6 +35717,61 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd) } break; + case V16HImode: + case V32QImode: + return expand_vec_perm_vpshufb2_vpermq_even_odd (d); + + case V4DImode: + t1 = gen_reg_rtx (V4DImode); + t2 = gen_reg_rtx (V4DImode); + + /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }. */ + emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20))); + emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31))); + + /* Now an vpunpck[lh]qdq will produce the result required. */ + if (odd) + t3 = gen_avx2_interleave_highv4di (d->target, t1, t2); + else + t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2); + emit_insn (t3); + break; + + case V8SImode: + t1 = gen_reg_rtx (V8SImode); + t2 = gen_reg_rtx (V8SImode); + + /* Shuffle the lanes around into + { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }. */ + emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t1), + gen_lowpart (V4DImode, d->op0), + gen_lowpart (V4DImode, d->op1), + GEN_INT (0x20))); + emit_insn (gen_avx2_permv2ti (gen_lowpart (V4DImode, t2), + gen_lowpart (V4DImode, d->op0), + gen_lowpart (V4DImode, d->op1), + GEN_INT (0x31))); + + /* Swap the 2nd and 3rd position in each lane into + { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }. */ + emit_insn (gen_avx2_pshufdv3 (t1, t1, + GEN_INT (2 * 2 + 1 * 16 + 3 * 64))); + emit_insn (gen_avx2_pshufdv3 (t2, t2, + GEN_INT (2 * 2 + 1 * 16 + 3 * 64))); + + /* Now an vpunpck[lh]qdq will produce + { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }. */ + if (odd) + t3 = gen_avx2_interleave_highv4di (gen_lowpart (V4DImode, d->target), + gen_lowpart (V4DImode, t1), + gen_lowpart (V4DImode, t2)); + else + t3 = gen_avx2_interleave_lowv4di (gen_lowpart (V4DImode, d->target), + gen_lowpart (V4DImode, t1), + gen_lowpart (V4DImode, t2)); + emit_insn (t3); + break; + default: gcc_unreachable (); } @@ -35399,6 +35906,14 @@ ix86_expand_vec_perm_builtin_1 (struct expand_vec_perm_d *d) if (expand_vec_perm_pshufb2 (d)) return true; + /* Try sequences of four instructions. */ + + if (expand_vec_perm_vpshufb2_vpermq (d)) + return true; + + if (expand_vec_perm_vpshufb2_vpermq_even_odd (d)) + return true; + /* ??? Look for narrow permutations whose element orderings would allow the promotion to a wider mode. 
*/ diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index b527ad29896475312b995ae77b7d1e6d6e058ea9..9c9508d278a22dffac78fb29b9a11b0cf876af30 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -235,7 +235,6 @@ UNSPEC_VPERMSI UNSPEC_VPERMDF UNSPEC_VPERMSF - UNSPEC_VPERMDI UNSPEC_VPERMTI UNSPEC_GATHER diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index b916edabb0dfcc4a5f0f95735e2426d1c4b37be4..12331516cd6a92bec01069b4f5062b9a9924f8ac 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -4330,10 +4330,10 @@ ;; Modes handled by vec_extract_even/odd pattern. (define_mode_iterator VEC_EXTRACT_EVENODD_MODE - [(V16QI "TARGET_SSE2") - (V8HI "TARGET_SSE2") - (V4SI "TARGET_SSE2") - (V2DI "TARGET_SSE2") + [(V32QI "TARGET_AVX2") (V16QI "TARGET_SSE2") + (V16HI "TARGET_AVX2") (V8HI "TARGET_SSE2") + (V8SI "TARGET_AVX2") (V4SI "TARGET_SSE2") + (V4DI "TARGET_AVX2") (V2DI "TARGET_SSE2") (V8SF "TARGET_AVX") V4SF (V4DF "TARGET_AVX") (V2DF "TARGET_SSE2")]) @@ -6196,11 +6196,9 @@ DONE; }) -;; ??? Irritatingly, the 256-bit VPSHUFB only shuffles within the 128-bit -;; lanes. For now, we don't try to support V32QI or V16HImode. So we -;; don't want to use VI_AVX2. (define_mode_iterator VEC_PERM_AVX2 [V16QI V8HI V4SI V2DI V4SF V2DF + (V32QI "TARGET_AVX2") (V16HI "TARGET_AVX2") (V8SI "TARGET_AVX2") (V4DI "TARGET_AVX2") (V8SF "TARGET_AVX2") (V4DF "TARGET_AVX2")]) @@ -6431,8 +6429,8 @@ (define_expand "vec_pack_trunc_<mode>" [(match_operand:<ssepackmode> 0 "register_operand" "") - (match_operand:VI248_128 1 "register_operand" "") - (match_operand:VI248_128 2 "register_operand" "")] + (match_operand:VI248_AVX2 1 "register_operand" "") + (match_operand:VI248_AVX2 2 "register_operand" "")] "TARGET_SSE2" { rtx op1 = gen_lowpart (<ssepackmode>mode, operands[1]); @@ -6513,8 +6511,7 @@ (const_int 28) (const_int 60) (const_int 29) (const_int 61) (const_int 30) (const_int 62) - (const_int 31) (const_int 63) - (const_int 32) (const_int 64)])))] + (const_int 31) (const_int 63)])))] "TARGET_AVX2" "vpunpckhbw\t{%2, %1, %0|%0, %1, %2}" [(set_attr "type" "sselog") @@ -6559,7 +6556,6 @@ (const_int 5) (const_int 37) (const_int 6) (const_int 38) (const_int 7) (const_int 39) - (const_int 15) (const_int 47) (const_int 16) (const_int 48) (const_int 17) (const_int 49) (const_int 18) (const_int 50) @@ -6919,7 +6915,11 @@ GEN_INT ((mask >> 0) & 3), GEN_INT ((mask >> 2) & 3), GEN_INT ((mask >> 4) & 3), - GEN_INT ((mask >> 6) & 3))); + GEN_INT ((mask >> 6) & 3), + GEN_INT (((mask >> 0) & 3) + 4), + GEN_INT (((mask >> 2) & 3) + 4), + GEN_INT (((mask >> 4) & 3) + 4), + GEN_INT (((mask >> 6) & 3) + 4))); DONE; }) @@ -6931,11 +6931,15 @@ (match_operand 3 "const_0_to_3_operand" "") (match_operand 4 "const_0_to_3_operand" "") (match_operand 5 "const_0_to_3_operand" "") - (match_dup 2) - (match_dup 3) - (match_dup 4) - (match_dup 5)])))] - "TARGET_AVX2" + (match_operand 6 "const_4_to_7_operand" "") + (match_operand 7 "const_4_to_7_operand" "") + (match_operand 8 "const_4_to_7_operand" "") + (match_operand 9 "const_4_to_7_operand" "")])))] + "TARGET_AVX2 + && INTVAL (operands[2]) + 4 == INTVAL (operands[6]) + && INTVAL (operands[3]) + 4 == INTVAL (operands[7]) + && INTVAL (operands[4]) + 4 == INTVAL (operands[8]) + && INTVAL (operands[5]) + 4 == INTVAL (operands[9])" { int mask = 0; mask |= INTVAL (operands[2]) << 0; @@ -7002,7 +7006,11 @@ GEN_INT ((mask >> 0) & 3), GEN_INT ((mask >> 2) & 3), GEN_INT ((mask >> 4) & 3), - GEN_INT ((mask >> 6) & 3))); + GEN_INT ((mask >> 6) & 3), + GEN_INT 
(((mask >> 0) & 3) + 8), + GEN_INT (((mask >> 2) & 3) + 8), + GEN_INT (((mask >> 4) & 3) + 8), + GEN_INT (((mask >> 6) & 3) + 8))); DONE; }) @@ -7018,15 +7026,19 @@ (const_int 5) (const_int 6) (const_int 7) - (match_dup 2) - (match_dup 3) - (match_dup 4) - (match_dup 5) + (match_operand 6 "const_8_to_11_operand" "") + (match_operand 7 "const_8_to_11_operand" "") + (match_operand 8 "const_8_to_11_operand" "") + (match_operand 9 "const_8_to_11_operand" "") (const_int 12) (const_int 13) (const_int 14) (const_int 15)])))] - "TARGET_AVX2" + "TARGET_AVX2 + && INTVAL (operands[2]) + 8 == INTVAL (operands[6]) + && INTVAL (operands[3]) + 8 == INTVAL (operands[7]) + && INTVAL (operands[4]) + 8 == INTVAL (operands[8]) + && INTVAL (operands[5]) + 8 == INTVAL (operands[9])" { int mask = 0; mask |= INTVAL (operands[2]) << 0; @@ -7098,7 +7110,11 @@ GEN_INT (((mask >> 0) & 3) + 4), GEN_INT (((mask >> 2) & 3) + 4), GEN_INT (((mask >> 4) & 3) + 4), - GEN_INT (((mask >> 6) & 3) + 4))); + GEN_INT (((mask >> 6) & 3) + 4), + GEN_INT (((mask >> 0) & 3) + 12), + GEN_INT (((mask >> 2) & 3) + 12), + GEN_INT (((mask >> 4) & 3) + 12), + GEN_INT (((mask >> 6) & 3) + 12))); DONE; }) @@ -7118,11 +7134,15 @@ (const_int 9) (const_int 10) (const_int 11) - (match_dup 2) - (match_dup 3) - (match_dup 4) - (match_dup 5)])))] - "TARGET_AVX2" + (match_operand 6 "const_12_to_15_operand" "") + (match_operand 7 "const_12_to_15_operand" "") + (match_operand 8 "const_12_to_15_operand" "") + (match_operand 9 "const_12_to_15_operand" "")])))] + "TARGET_AVX2 + && INTVAL (operands[2]) + 8 == INTVAL (operands[6]) + && INTVAL (operands[3]) + 8 == INTVAL (operands[7]) + && INTVAL (operands[4]) + 8 == INTVAL (operands[8]) + && INTVAL (operands[5]) + 8 == INTVAL (operands[9])" { int mask = 0; mask |= (INTVAL (operands[2]) - 4) << 0; @@ -11526,14 +11546,39 @@ (set_attr "prefix" "vex") (set_attr "mode" "OI")]) -(define_insn "avx2_permv4di" +(define_expand "avx2_permv4di" + [(match_operand:V4DI 0 "register_operand" "") + (match_operand:V4DI 1 "nonimmediate_operand" "") + (match_operand:SI 2 "const_0_to_255_operand" "")] + "TARGET_AVX2" +{ + int mask = INTVAL (operands[2]); + emit_insn (gen_avx2_permv4di_1 (operands[0], operands[1], + GEN_INT ((mask >> 0) & 3), + GEN_INT ((mask >> 2) & 3), + GEN_INT ((mask >> 4) & 3), + GEN_INT ((mask >> 6) & 3))); + DONE; +}) + +(define_insn "avx2_permv4di_1" [(set (match_operand:V4DI 0 "register_operand" "=x") - (unspec:V4DI - [(match_operand:V4DI 1 "register_operand" "xm") - (match_operand:SI 2 "const_0_to_255_operand" "n")] - UNSPEC_VPERMDI))] + (vec_select:V4DI + (match_operand:V4DI 1 "nonimmediate_operand" "xm") + (parallel [(match_operand 2 "const_0_to_3_operand" "") + (match_operand 3 "const_0_to_3_operand" "") + (match_operand 4 "const_0_to_3_operand" "") + (match_operand 5 "const_0_to_3_operand" "")])))] "TARGET_AVX2" - "vpermq\t{%2, %1, %0|%0, %1, %2}" +{ + int mask = 0; + mask |= INTVAL (operands[2]) << 0; + mask |= INTVAL (operands[3]) << 2; + mask |= INTVAL (operands[4]) << 4; + mask |= INTVAL (operands[5]) << 6; + operands[2] = GEN_INT (mask); + return "vpermq\t{%2, %1, %0|%0, %1, %2}"; +} [(set_attr "type" "sselog") (set_attr "prefix" "vex") (set_attr "mode" "OI")]) @@ -11542,7 +11587,7 @@ [(set (match_operand:V4DI 0 "register_operand" "=x") (unspec:V4DI [(match_operand:V4DI 1 "register_operand" "x") - (match_operand:V4DI 2 "register_operand" "xm") + (match_operand:V4DI 2 "nonimmediate_operand" "xm") (match_operand:SI 3 "const_0_to_255_operand" "n")] UNSPEC_VPERMTI))] "TARGET_AVX2"
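
Illustrative note (not part of the patch): the heart of the V32QImode path in ix86_expand_vec_perm is that vpshufb only shuffles within each 128-bit lane, so a variable byte permutation is built from two vpshufb, a vpermq lane swap and a vpor. The sketch below models that idea with plain AVX2 intrinsics (compile with -mavx2); the helper name is invented, it assumes IDX already holds canonical indices 0..31, and it swaps lanes of the source rather than of the mask/result as the emitted RTL does.

#include <immintrin.h>

/* Hypothetical helper, not in the patch: dst[i] = src[idx[i]] for 32 bytes,
   assuming every byte of IDX is a canonical index 0..31.  */
__m256i
shuffle_v32qi_var (__m256i src, __m256i idx)
{
  /* Bit 4 of (index ^ position) is set iff the requested byte lives in
     the other 128-bit lane than destination position i.  */
  const __m256i pos = _mm256_setr_epi8 (0, 1, 2, 3, 4, 5, 6, 7,
					8, 9, 10, 11, 12, 13, 14, 15,
					16, 17, 18, 19, 20, 21, 22, 23,
					24, 25, 26, 27, 28, 29, 30, 31);
  __m256i cross = _mm256_and_si256 (_mm256_xor_si256 (idx, pos),
				    _mm256_set1_epi8 (0x10));
  /* Shift that bit up into the MSB, which is vpshufb's "write zero" bit.
     Shifting 16-bit lanes is safe here because only bit 4 can be set.  */
  cross = _mm256_slli_epi16 (cross, 3);		/* 0x80 where cross-lane */
  __m256i same = _mm256_xor_si256 (cross, _mm256_set1_epi8 (-128));
						/* 0x80 where same-lane */

  /* First vpshufb: bytes coming from the destination's own lane;
     cross-lane positions are zeroed via the MSB of the control byte.  */
  __m256i in_lane = _mm256_shuffle_epi8 (src, _mm256_or_si256 (idx, cross));
  /* Second vpshufb on a lane-swapped copy (vpermq, imm 0x4e) supplies the
     cross-lane bytes; the in-lane positions are zeroed instead.  */
  __m256i swapped = _mm256_permute4x64_epi64 (src, 0x4e);
  __m256i off_lane = _mm256_shuffle_epi8 (swapped,
					  _mm256_or_si256 (idx, same));
  /* vpor merges the two partial results.  */
  return _mm256_or_si256 (in_lane, off_lane);
}

The two-operand path and expand_vec_perm_vpshufb2_vpermq follow the same pattern, just with the pair of shuffles repeated per operand and the results merged afterwards.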
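
A similar standalone sketch (invented function names, plain intrinsics rather than the RTL the compiler emits) of the V8SImode branch added to expand_vec_perm_even_odd_1: vperm2i128 gathers the matching 128-bit lanes of the two operands, vpshufd reorders each lane to 0 2 1 3, and vpunpcklqdq/vpunpckhqdq then deliver the even resp. odd dwords.

#include <immintrin.h>

/* Hypothetical helpers: even/odd 32-bit elements of the concatenation
   of A and B.  */
__m256i
extract_even_epi32 (__m256i a, __m256i b)   /* { a0 a2 a4 a6 b0 b2 b4 b6 } */
{
  __m256i lo = _mm256_permute2x128_si256 (a, b, 0x20); /* a0..a3 b0..b3 */
  __m256i hi = _mm256_permute2x128_si256 (a, b, 0x31); /* a4..a7 b4..b7 */
  lo = _mm256_shuffle_epi32 (lo, 0xd8);		/* 0 2 1 3 in each lane */
  hi = _mm256_shuffle_epi32 (hi, 0xd8);
  return _mm256_unpacklo_epi64 (lo, hi);	/* vpunpcklqdq */
}

__m256i
extract_odd_epi32 (__m256i a, __m256i b)    /* { a1 a3 a5 a7 b1 b3 b5 b7 } */
{
  __m256i lo = _mm256_permute2x128_si256 (a, b, 0x20);
  __m256i hi = _mm256_permute2x128_si256 (a, b, 0x31);
  lo = _mm256_shuffle_epi32 (lo, 0xd8);
  hi = _mm256_shuffle_epi32 (hi, 0xd8);
  return _mm256_unpackhi_epi64 (lo, hi);	/* vpunpckhqdq */
}

The V4DImode branch in the patch is the same idea without the vpshufd step.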
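
Finally, a scalar model (names invented for the example) of the instruction-selection ladder the patch adds to expand_vec_perm_blend for V32QImode. PERM is assumed to already describe a valid blend, i.e. perm[i] is either i or i + 32.

/* Which AVX2 blend form can implement a 32-byte blend permutation?  */
enum v32qi_blend_insn { VPBLENDD, VPBLENDW, VPBLENDVB };

enum v32qi_blend_insn
choose_v32qi_blend (const unsigned char perm[32])
{
  int i;

  /* If bytes do not move in pairs, only vpblendvb (vector mask) works.  */
  for (i = 0; i < 32; i += 2)
    if (perm[i] + 1 != perm[i + 1])
      return VPBLENDVB;

  /* Bytes moving in quadruplets allow the dword-immediate vpblendd.  */
  for (i = 0; i < 32; i += 4)
    if (perm[i] + 2 != perm[i + 2])
      break;
  if (i == 32)
    return VPBLENDD;

  /* vpblendw has a single 8-bit immediate that is applied to both
     128-bit lanes, so both lanes must blend the same word positions.  */
  for (i = 0; i < 16; i += 2)
    if (perm[i] + 16 != perm[i + 16])
      return VPBLENDVB;

  return VPBLENDW;
}

The V16HImode case applies the same ladder one step up: word pairs allow vpblendd, identical selections in both lane halves allow vpblendw, and anything else falls back to a V32QImode vpblendvb mask.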