diff --git a/gcc/config/aarch64/aarch64-modes.def b/gcc/config/aarch64/aarch64-modes.def index af972e8f72b0d5568afdc26e9d18dc2f747f107e..f304992e3edd47b5e451d2926766cf1298f55d23 100644 --- a/gcc/config/aarch64/aarch64-modes.def +++ b/gcc/config/aarch64/aarch64-modes.def @@ -136,11 +136,13 @@ ADJUST_NUNITS (VNx2QI, aarch64_sve_vg); ADJUST_NUNITS (VNx2HI, aarch64_sve_vg); ADJUST_NUNITS (VNx2SI, aarch64_sve_vg); ADJUST_NUNITS (VNx2HF, aarch64_sve_vg); +ADJUST_NUNITS (VNx2BF, aarch64_sve_vg); ADJUST_NUNITS (VNx2SF, aarch64_sve_vg); ADJUST_NUNITS (VNx4QI, aarch64_sve_vg * 2); ADJUST_NUNITS (VNx4HI, aarch64_sve_vg * 2); ADJUST_NUNITS (VNx4HF, aarch64_sve_vg * 2); +ADJUST_NUNITS (VNx4BF, aarch64_sve_vg * 2); ADJUST_NUNITS (VNx8QI, aarch64_sve_vg * 4); @@ -151,7 +153,9 @@ ADJUST_ALIGNMENT (VNx8QI, 1); ADJUST_ALIGNMENT (VNx2HI, 2); ADJUST_ALIGNMENT (VNx4HI, 2); ADJUST_ALIGNMENT (VNx2HF, 2); +ADJUST_ALIGNMENT (VNx2BF, 2); ADJUST_ALIGNMENT (VNx4HF, 2); +ADJUST_ALIGNMENT (VNx4BF, 2); ADJUST_ALIGNMENT (VNx2SI, 4); ADJUST_ALIGNMENT (VNx2SF, 4); diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index 31a8c5a5aefc24b36c5115157cde0482b7a7927b..4b0a1ebe9e1dd8bcbf683c5c136d9458b61dd943 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -3009,6 +3009,22 @@ "<sve_int_op>\t%0.<Vetype>, %1/m, %2.<Vetype>" ) +;; Another way of expressing the REVB, REVH and REVW patterns, with this +;; form being easier for permutes. The predicate mode determines the number +;; of lanes and the data mode decides the granularity of the reversal within +;; each lane. +(define_insn "@aarch64_sve_revbhw_<SVE_ALL:mode><PRED_HSD:mode>" + [(set (match_operand:SVE_ALL 0 "register_operand" "=w") + (unspec:SVE_ALL + [(match_operand:PRED_HSD 1 "register_operand" "Upl") + (unspec:SVE_ALL + [(match_operand:SVE_ALL 2 "register_operand" "w")] + UNSPEC_REVBHW)] + UNSPEC_PRED_X))] + "TARGET_SVE && <PRED_HSD:elem_bits> > <SVE_ALL:container_bits>" + "rev<SVE_ALL:Vcwtype>\t%0.<PRED_HSD:Vetype>, %1/m, %2.<PRED_HSD:Vetype>" +) + ;; Predicated integer unary operations with merging. (define_insn "@cond_<optab><mode>" [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w, ?&w") @@ -8273,14 +8289,14 @@ ;; Duplicate one element of a vector. (define_insn "@aarch64_sve_dup_lane<mode>" - [(set (match_operand:SVE_FULL 0 "register_operand" "=w") - (vec_duplicate:SVE_FULL + [(set (match_operand:SVE_ALL 0 "register_operand" "=w") + (vec_duplicate:SVE_ALL (vec_select:<VEL> - (match_operand:SVE_FULL 1 "register_operand" "w") + (match_operand:SVE_ALL 1 "register_operand" "w") (parallel [(match_operand:SI 2 "const_int_operand")]))))] "TARGET_SVE - && IN_RANGE (INTVAL (operands[2]) * GET_MODE_SIZE (<VEL>mode), 0, 63)" - "dup\t%0.<Vetype>, %1.<Vetype>[%2]" + && IN_RANGE (INTVAL (operands[2]) * <container_bits> / 8, 0, 63)" + "dup\t%0.<Vctype>, %1.<Vctype>[%2]" ) ;; Use DUP.Q to duplicate a 128-bit segment of a register. @@ -8321,17 +8337,18 @@ ;; Reverse the order of elements within a full vector. 
(define_insn "@aarch64_sve_rev<mode>" - [(set (match_operand:SVE_FULL 0 "register_operand" "=w") - (unspec:SVE_FULL - [(match_operand:SVE_FULL 1 "register_operand" "w")] + [(set (match_operand:SVE_ALL 0 "register_operand" "=w") + (unspec:SVE_ALL + [(match_operand:SVE_ALL 1 "register_operand" "w")] UNSPEC_REV))] "TARGET_SVE" - "rev\t%0.<Vetype>, %1.<Vetype>") + "rev\t%0.<Vctype>, %1.<Vctype>") ;; ------------------------------------------------------------------------- ;; ---- [INT,FP] Special-purpose binary permutes ;; ------------------------------------------------------------------------- ;; Includes: +;; - EXT ;; - SPLICE ;; - TRN1 ;; - TRN2 @@ -8359,13 +8376,13 @@ ;; Permutes that take half the elements from one vector and half the ;; elements from the other. (define_insn "@aarch64_sve_<perm_insn><mode>" - [(set (match_operand:SVE_FULL 0 "register_operand" "=w") - (unspec:SVE_FULL - [(match_operand:SVE_FULL 1 "register_operand" "w") - (match_operand:SVE_FULL 2 "register_operand" "w")] + [(set (match_operand:SVE_ALL 0 "register_operand" "=w") + (unspec:SVE_ALL + [(match_operand:SVE_ALL 1 "register_operand" "w") + (match_operand:SVE_ALL 2 "register_operand" "w")] PERMUTE))] "TARGET_SVE" - "<perm_insn>\t%0.<Vetype>, %1.<Vetype>, %2.<Vetype>" + "<perm_insn>\t%0.<Vctype>, %1.<Vctype>, %2.<Vctype>" ) ;; Apply PERMUTE to 128-bit sequences. The behavior of these patterns @@ -8383,16 +8400,16 @@ ;; Concatenate two vectors and extract a subvector. Note that the ;; immediate (third) operand is the lane index not the byte index. (define_insn "@aarch64_sve_ext<mode>" - [(set (match_operand:SVE_FULL 0 "register_operand" "=w, ?&w") - (unspec:SVE_FULL - [(match_operand:SVE_FULL 1 "register_operand" "0, w") - (match_operand:SVE_FULL 2 "register_operand" "w, w") + [(set (match_operand:SVE_ALL 0 "register_operand" "=w, ?&w") + (unspec:SVE_ALL + [(match_operand:SVE_ALL 1 "register_operand" "0, w") + (match_operand:SVE_ALL 2 "register_operand" "w, w") (match_operand:SI 3 "const_int_operand")] UNSPEC_EXT))] "TARGET_SVE - && IN_RANGE (INTVAL (operands[3]) * GET_MODE_SIZE (<VEL>mode), 0, 255)" + && IN_RANGE (INTVAL (operands[3]) * <container_bits> / 8, 0, 255)" { - operands[3] = GEN_INT (INTVAL (operands[3]) * GET_MODE_SIZE (<VEL>mode)); + operands[3] = GEN_INT (INTVAL (operands[3]) * <container_bits> / 8); return (which_alternative == 0 ? "ext\\t%0.b, %0.b, %2.b, #%3" : "movprfx\t%0, %1\;ext\\t%0.b, %0.b, %2.b, #%3"); diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 0ae6c8b53f6e7ae629bafc2ec033a440012cbe42..97cb68980e975dfb2c0c0c0a05f9153beb64a2ad 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -2226,6 +2226,9 @@ aarch64_classify_vector_mode (machine_mode mode) /* Partial SVE HF vectors. */ case E_VNx2HFmode: case E_VNx4HFmode: + /* Partial SVE BF vectors. */ + case E_VNx2BFmode: + case E_VNx4BFmode: /* Partial SVE SF vector. */ case E_VNx2SFmode: return TARGET_SVE ? 
VEC_SVE_DATA | VEC_PARTIAL : 0; @@ -20468,18 +20471,21 @@ aarch64_evpc_rev_local (struct expand_vec_perm_d *d) || !diff) return false; - size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode); - if (size == 8) + if (d->vec_flags & VEC_SVE_DATA) + size = (diff + 1) * aarch64_sve_container_bits (d->vmode); + else + size = (diff + 1) * GET_MODE_UNIT_BITSIZE (d->vmode); + if (size == 64) { unspec = UNSPEC_REV64; pred_mode = VNx2BImode; } - else if (size == 4) + else if (size == 32) { unspec = UNSPEC_REV32; pred_mode = VNx4BImode; } - else if (size == 2) + else if (size == 16) { unspec = UNSPEC_REV16; pred_mode = VNx8BImode; @@ -20496,28 +20502,11 @@ aarch64_evpc_rev_local (struct expand_vec_perm_d *d) if (d->testing_p) return true; - if (d->vec_flags == VEC_SVE_DATA) - { - machine_mode int_mode = aarch64_sve_int_mode (pred_mode); - rtx target = gen_reg_rtx (int_mode); - if (BYTES_BIG_ENDIAN) - /* The act of taking a subreg between INT_MODE and d->vmode - is itself a reversing operation on big-endian targets; - see the comment at the head of aarch64-sve.md for details. - First reinterpret OP0 as INT_MODE without using a subreg - and without changing the contents. */ - emit_insn (gen_aarch64_sve_reinterpret (int_mode, target, d->op0)); - else - { - /* For SVE we use REV[BHW] unspecs derived from the element size - of v->mode and vector modes whose elements have SIZE bytes. - This ensures that the vector modes match the predicate modes. */ - int unspec = aarch64_sve_rev_unspec (d->vmode); - rtx pred = aarch64_ptrue_reg (pred_mode); - emit_insn (gen_aarch64_pred (unspec, int_mode, target, pred, - gen_lowpart (int_mode, d->op0))); - } - emit_move_insn (d->target, gen_lowpart (d->vmode, target)); + if (d->vec_flags & VEC_SVE_DATA) + { + rtx pred = aarch64_ptrue_reg (pred_mode); + emit_insn (gen_aarch64_sve_revbhw (d->vmode, pred_mode, + d->target, pred, d->op0)); return true; } rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec); @@ -20562,7 +20551,8 @@ aarch64_evpc_dup (struct expand_vec_perm_d *d) || !d->perm[0].is_constant (&elt)) return false; - if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode)) + if ((d->vec_flags & VEC_SVE_DATA) + && elt * (aarch64_sve_container_bits (vmode) / 8) >= 64) return false; /* Success! */ @@ -20782,6 +20772,7 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) if ((d->vec_flags == VEC_ADVSIMD || d->vec_flags == VEC_SVE_DATA + || d->vec_flags == (VEC_SVE_DATA | VEC_PARTIAL) || d->vec_flags == VEC_SVE_PRED) && known_gt (nelt, 1)) { diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 054fd8515c6ebf136da699e2993f6ebb348c3b1a..fb1426b7752890848cb49722ef7442d96cb1408b 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -400,7 +400,7 @@ (define_mode_iterator SVE_ALL [VNx16QI VNx8QI VNx4QI VNx2QI VNx8HI VNx4HI VNx2HI VNx8HF VNx4HF VNx2HF - VNx8BF + VNx8BF VNx4BF VNx2BF VNx4SI VNx2SI VNx4SF VNx2SF VNx2DI @@ -418,11 +418,13 @@ VNx2DI]) ;; SVE modes with 2 or 4 elements. -(define_mode_iterator SVE_24 [VNx2QI VNx2HI VNx2HF VNx2SI VNx2SF VNx2DI VNx2DF - VNx4QI VNx4HI VNx4HF VNx4SI VNx4SF]) +(define_mode_iterator SVE_24 [VNx2QI VNx2HI VNx2HF VNx2BF VNx2SI VNx2SF + VNx2DI VNx2DF + VNx4QI VNx4HI VNx4HF VNx4BF VNx4SI VNx4SF]) ;; SVE modes with 2 elements. 
-(define_mode_iterator SVE_2 [VNx2QI VNx2HI VNx2HF VNx2SI VNx2SF VNx2DI VNx2DF]) +(define_mode_iterator SVE_2 [VNx2QI VNx2HI VNx2HF VNx2BF + VNx2SI VNx2SF VNx2DI VNx2DF]) ;; SVE integer modes with 2 elements, excluding the widest element. (define_mode_iterator SVE_2BHSI [VNx2QI VNx2HI VNx2SI]) @@ -431,7 +433,7 @@ (define_mode_iterator SVE_2HSDI [VNx2HI VNx2SI VNx2DI]) ;; SVE modes with 4 elements. -(define_mode_iterator SVE_4 [VNx4QI VNx4HI VNx4HF VNx4SI VNx4SF]) +(define_mode_iterator SVE_4 [VNx4QI VNx4HI VNx4HF VNx4BF VNx4SI VNx4SF]) ;; SVE integer modes with 4 elements, excluding the widest element. (define_mode_iterator SVE_4BHI [VNx4QI VNx4HI]) @@ -621,6 +623,7 @@ UNSPEC_REVB ; Used in aarch64-sve.md. UNSPEC_REVH ; Used in aarch64-sve.md. UNSPEC_REVW ; Used in aarch64-sve.md. + UNSPEC_REVBHW ; Used in aarch64-sve.md. UNSPEC_SMUL_HIGHPART ; Used in aarch64-sve.md. UNSPEC_UMUL_HIGHPART ; Used in aarch64-sve.md. UNSPEC_FMLA ; Used in aarch64-sve.md. @@ -968,6 +971,16 @@ (VNx4SI "32") (VNx2DI "64") (VNx8HF "16") (VNx4SF "32") (VNx2DF "64")]) +;; The number of bits in a vector container. +(define_mode_attr container_bits [(VNx16QI "8") + (VNx8HI "16") (VNx8QI "16") (VNx8HF "16") + (VNx8BF "16") + (VNx4SI "32") (VNx4HI "32") (VNx4QI "32") + (VNx4SF "32") (VNx4HF "32") (VNx4BF "32") + (VNx2DI "64") (VNx2SI "64") (VNx2HI "64") + (VNx2QI "64") (VNx2DF "64") (VNx2SF "64") + (VNx2HF "64") (VNx2BF "64")]) + ;; Attribute to describe constants acceptable in logical operations (define_mode_attr lconst [(SI "K") (DI "L")]) @@ -1029,7 +1042,7 @@ (VNx16QI "b") (VNx8QI "b") (VNx4QI "b") (VNx2QI "b") (VNx8HI "h") (VNx4HI "h") (VNx2HI "h") (VNx8HF "h") (VNx4HF "h") (VNx2HF "h") - (VNx8BF "h") + (VNx8BF "h") (VNx4BF "h") (VNx2BF "h") (VNx4SI "s") (VNx2SI "s") (VNx4SF "s") (VNx2SF "s") (VNx2DI "d") @@ -1047,7 +1060,7 @@ (define_mode_attr Vesize [(VNx16QI "b") (VNx8QI "b") (VNx4QI "b") (VNx2QI "b") (VNx8HI "h") (VNx4HI "h") (VNx2HI "h") (VNx8HF "h") (VNx4HF "h") (VNx2HF "h") - (VNx8BF "h") + (VNx8BF "h") (VNx4BF "h") (VNx2BF "h") (VNx4SI "w") (VNx2SI "w") (VNx4SF "w") (VNx2SF "w") (VNx2DI "d") @@ -1066,12 +1079,23 @@ (define_mode_attr Vctype [(VNx16QI "b") (VNx8QI "h") (VNx4QI "s") (VNx2QI "d") (VNx8HI "h") (VNx4HI "s") (VNx2HI "d") (VNx8HF "h") (VNx4HF "s") (VNx2HF "d") - (VNx8BF "h") + (VNx8BF "h") (VNx4BF "s") (VNx2BF "d") (VNx4SI "s") (VNx2SI "d") (VNx4SF "s") (VNx2SF "d") (VNx2DI "d") (VNx2DF "d")]) +;; The instruction mnemonic suffix for an SVE mode's element container, +;; i.e. the Vewtype of full SVE modes that have the same number of elements. +(define_mode_attr Vcwtype [(VNx16QI "b") (VNx8QI "h") (VNx4QI "w") (VNx2QI "d") + (VNx8HI "h") (VNx4HI "w") (VNx2HI "d") + (VNx8HF "h") (VNx4HF "w") (VNx2HF "d") + (VNx8BF "h") (VNx4BF "w") (VNx2BF "d") + (VNx4SI "w") (VNx2SI "d") + (VNx4SF "w") (VNx2SF "d") + (VNx2DI "d") + (VNx2DF "d")]) + ;; Vetype is used everywhere in scheduling type and assembly output, ;; sometimes they are not the same, for example HF modes on some ;; instructions. 
stype is defined to represent scheduling type @@ -1107,7 +1131,7 @@ (VNx16QI "QI") (VNx8QI "QI") (VNx4QI "QI") (VNx2QI "QI") (VNx8HI "HI") (VNx4HI "HI") (VNx2HI "HI") (VNx8HF "HF") (VNx4HF "HF") (VNx2HF "HF") - (VNx8BF "BF") + (VNx8BF "BF") (VNx4BF "BF") (VNx2BF "BF") (VNx4SI "SI") (VNx2SI "SI") (VNx4SF "SF") (VNx2SF "SF") (VNx2DI "DI") @@ -1127,7 +1151,7 @@ (VNx16QI "qi") (VNx8QI "qi") (VNx4QI "qi") (VNx2QI "qi") (VNx8HI "hi") (VNx4HI "hi") (VNx2HI "hi") (VNx8HF "hf") (VNx4HF "hf") (VNx2HF "hf") - (VNx8BF "bf") + (VNx8BF "bf") (VNx4BF "bf") (VNx2BF "bf") (VNx4SI "si") (VNx2SI "si") (VNx4SF "sf") (VNx2SF "sf") (VNx2DI "di") @@ -1310,7 +1334,7 @@ (VNx16QI "w") (VNx8QI "w") (VNx4QI "w") (VNx2QI "w") (VNx8HI "w") (VNx4HI "w") (VNx2HI "w") (VNx8HF "w") (VNx4HF "w") (VNx2HF "w") - (VNx8BF "w") + (VNx8BF "w") (VNx4BF "w") (VNx2BF "w") (VNx4SI "w") (VNx2SI "w") (VNx4SF "w") (VNx2SF "w") (VNx2DI "x") @@ -1380,6 +1404,8 @@ (VNx2DI "VNx2DI") (VNx8HF "VNx8HI") (VNx4HF "VNx4SI") (VNx2HF "VNx2DI") + (VNx8BF "VNx8HI") (VNx4BF "VNx4SI") + (VNx2BF "VNx2DI") (VNx4SF "VNx4SI") (VNx2SF "VNx2DI") (VNx2DF "VNx2DI")]) @@ -1392,6 +1418,8 @@ (VNx2DI "vnx2di") (VNx8HF "vnx8hi") (VNx4HF "vnx4si") (VNx2HF "vnx2di") + (VNx8BF "vnx8hi") (VNx4BF "vnx4si") + (VNx2BF "vnx2di") (VNx4SF "vnx4si") (VNx2SF "vnx2di") (VNx2DF "vnx2di")]) @@ -1617,7 +1645,7 @@ (VNx4QI "VNx4BI") (VNx2QI "VNx2BI") (VNx8HI "VNx8BI") (VNx4HI "VNx4BI") (VNx2HI "VNx2BI") (VNx8HF "VNx8BI") (VNx4HF "VNx4BI") (VNx2HF "VNx2BI") - (VNx8BF "VNx8BI") + (VNx8BF "VNx8BI") (VNx4BF "VNx4BI") (VNx2BF "VNx2BI") (VNx4SI "VNx4BI") (VNx2SI "VNx2BI") (VNx4SF "VNx4BI") (VNx2SF "VNx2BI") (VNx2DI "VNx2BI") @@ -1643,7 +1671,7 @@ (VNx4QI "vnx4bi") (VNx2QI "vnx2bi") (VNx8HI "vnx8bi") (VNx4HI "vnx4bi") (VNx2HI "vnx2bi") (VNx8HF "vnx8bi") (VNx4HF "vnx4bi") (VNx2HF "vnx2bi") - (VNx8BF "vnx8bi") + (VNx8BF "vnx8bi") (VNx4BF "vnx4bi") (VNx2BF "vnx2bi") (VNx4SI "vnx4bi") (VNx2SI "vnx2bi") (VNx4SF "vnx4bi") (VNx2SF "vnx2bi") (VNx2DI "vnx2bi") diff --git a/gcc/testsuite/gcc.target/aarch64/sve/dup_lane_2.c b/gcc/testsuite/gcc.target/aarch64/sve/dup_lane_2.c new file mode 100644 index 0000000000000000000000000000000000000000..3d74ff98e6d61789df05827eef89628730b898e3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/dup_lane_2.c @@ -0,0 +1,331 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +typedef unsigned char v128qi __attribute__((vector_size(128))); +typedef unsigned char v64qi __attribute__((vector_size(64))); +typedef unsigned char v32qi __attribute__((vector_size(32))); +typedef unsigned short v64hi __attribute__((vector_size(128))); +typedef unsigned short v32hi __attribute__((vector_size(64))); +typedef _Float16 v64hf __attribute__((vector_size(128))); +typedef _Float16 v32hf __attribute__((vector_size(64))); +typedef __bf16 v64bf __attribute__((vector_size(128))); +typedef __bf16 v32bf __attribute__((vector_size(64))); +typedef unsigned int v32si __attribute__((vector_size(128))); +typedef float v32sf __attribute__((vector_size(128))); + +#define PERM0(B) B, B +#define PERM1(B) PERM0 (B), PERM0 (B) +#define PERM2(B) PERM1 (B), PERM1 (B) +#define PERM3(B) PERM2 (B), PERM2 (B) +#define PERM4(B) PERM3 (B), PERM3 (B) +#define PERM5(B) PERM4 (B), PERM4 (B) +#define PERM6(B) PERM5 (B), PERM5 (B) + +/* +** qi_dup_h_1: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** dup (z[0-9]+)\.h, \2\.h\[1\] +** st1b \3\.h, 
\1, \[x8\] +** ret +*/ +v128qi +qi_dup_h_1 (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (1) }); +} + +/* +** qi_dup_h_31: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** dup (z[0-9]+)\.h, \2\.h\[31\] +** st1b \3\.h, \1, \[x8\] +** ret +*/ +v128qi +qi_dup_h_31 (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (31) }); +} + +/* +** qi_dup_s_1: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** dup (z[0-9]+)\.s, \2\.s\[1\] +** st1b \3\.s, \1, \[x8\] +** ret +*/ +v64qi +qi_dup_s_1 (v64qi x) +{ + return __builtin_shuffle (x, x, (v64qi) { PERM5 (1) }); +} + +/* +** qi_dup_s_15: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** dup (z[0-9]+)\.s, \2\.s\[15\] +** st1b \3\.s, \1, \[x8\] +** ret +*/ +v64qi +qi_dup_s_15 (v64qi x) +{ + return __builtin_shuffle (x, x, (v64qi) { PERM5 (15) }); +} + +/* +** qi_dup_d_1: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.d, \1/z, \[x0\] +** dup (z[0-9]+)\.d, \2\.d\[1\] +** st1b \3\.d, \1, \[x8\] +** ret +*/ +v32qi +qi_dup_d_1 (v32qi x) +{ + return __builtin_shuffle (x, x, (v32qi) { PERM4 (1) }); +} + +/* +** qi_dup_d_7: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.d, \1/z, \[x0\] +** dup (z[0-9]+)\.d, \2\.d\[7\] +** st1b \3\.d, \1, \[x8\] +** ret +*/ +v32qi +qi_dup_d_7 (v32qi x) +{ + return __builtin_shuffle (x, x, (v32qi) { PERM4 (7) }); +} + +/* +** hi_dup_s_1: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** dup (z[0-9]+)\.s, \2\.s\[1\] +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64hi +hi_dup_s_1 (v64hi x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) }); +} + +/* +** hi_dup_s_15: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** dup (z[0-9]+)\.s, \2\.s\[15\] +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64hi +hi_dup_s_15 (v64hi x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (15) }); +} + +/* +** hf_dup_s_1: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** dup (z[0-9]+)\.s, \2\.s\[1\] +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64hf +hf_dup_s_1 (v64hf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) }); +} + +/* +** hf_dup_s_11: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** dup (z[0-9]+)\.s, \2\.s\[11\] +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64hf +hf_dup_s_11 (v64hf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (11) }); +} + +/* +** bf_dup_s_1: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** dup (z[0-9]+)\.s, \2\.s\[1\] +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64bf +bf_dup_s_1 (v64bf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) }); +} + +/* +** bf_dup_s_13: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** dup (z[0-9]+)\.s, \2\.s\[13\] +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64bf +bf_dup_s_13 (v64bf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (13) }); +} + +/* +** hi_dup_d_1: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** dup (z[0-9]+)\.d, \2\.d\[1\] +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32hi +hi_dup_d_1 (v32hi x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) }); +} + +/* +** hi_dup_d_7: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** dup (z[0-9]+)\.d, \2\.d\[7\] +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32hi +hi_dup_d_7 (v32hi x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (7) }); +} + +/* +** hf_dup_d_1: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** dup (z[0-9]+)\.d, \2\.d\[1\] +** st1h \3\.d, 
\1, \[x8\] +** ret +*/ +v32hf +hf_dup_d_1 (v32hf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) }); +} + +/* +** hf_dup_d_5: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** dup (z[0-9]+)\.d, \2\.d\[5\] +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32hf +hf_dup_d_5 (v32hf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (5) }); +} + +/* +** bf_dup_d_1: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** dup (z[0-9]+)\.d, \2\.d\[1\] +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32bf +bf_dup_d_1 (v32bf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) }); +} + +/* +** bf_dup_d_6: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** dup (z[0-9]+)\.d, \2\.d\[6\] +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32bf +bf_dup_d_6 (v32bf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (6) }); +} + +/* +** si_dup_d_1: +** ptrue (p[0-7])\.b, vl256 +** ld1w (z[0-9]+)\.d, \1/z, \[x0\] +** dup (z[0-9]+)\.d, \2\.d\[1\] +** st1w \3\.d, \1, \[x8\] +** ret +*/ +v32si +si_dup_d_1 (v32si x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (1) }); +} + +/* +** si_dup_d_7: +** ptrue (p[0-7])\.b, vl256 +** ld1w (z[0-9]+)\.d, \1/z, \[x0\] +** dup (z[0-9]+)\.d, \2\.d\[7\] +** st1w \3\.d, \1, \[x8\] +** ret +*/ +v32si +si_dup_d_7 (v32si x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (7) }); +} + +/* +** sf_dup_d_1: +** ptrue (p[0-7])\.b, vl256 +** ld1w (z[0-9]+)\.d, \1/z, \[x0\] +** dup (z[0-9]+)\.d, \2\.d\[1\] +** st1w \3\.d, \1, \[x8\] +** ret +*/ +v32sf +sf_dup_d_1 (v32sf x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (1) }); +} + +/* +** sf_dup_d_7: +** ptrue (p[0-7])\.b, vl256 +** ld1w (z[0-9]+)\.d, \1/z, \[x0\] +** dup (z[0-9]+)\.d, \2\.d\[7\] +** st1w \3\.d, \1, \[x8\] +** ret +*/ +v32sf +sf_dup_d_7 (v32sf x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (7) }); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/dup_lane_3.c b/gcc/testsuite/gcc.target/aarch64/sve/dup_lane_3.c new file mode 100644 index 0000000000000000000000000000000000000000..50f73a1aa23afb74bd7b9bc9b4520ad92613e247 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/dup_lane_3.c @@ -0,0 +1,90 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */ + +typedef unsigned char v128qi __attribute__((vector_size(128))); +typedef unsigned char v64qi __attribute__((vector_size(64))); +typedef unsigned char v32qi __attribute__((vector_size(32))); +typedef unsigned short v64hi __attribute__((vector_size(128))); +typedef unsigned short v32hi __attribute__((vector_size(64))); +typedef _Float16 v64hf __attribute__((vector_size(128))); +typedef _Float16 v32hf __attribute__((vector_size(64))); +typedef __bf16 v64bf __attribute__((vector_size(128))); +typedef __bf16 v32bf __attribute__((vector_size(64))); +typedef unsigned int v32si __attribute__((vector_size(128))); +typedef float v32sf __attribute__((vector_size(128))); + +#define PERM0(B) B, B +#define PERM1(B) PERM0 (B), PERM0 (B) +#define PERM2(B) PERM1 (B), PERM1 (B) +#define PERM3(B) PERM2 (B), PERM2 (B) +#define PERM4(B) PERM3 (B), PERM3 (B) +#define PERM5(B) PERM4 (B), PERM4 (B) +#define PERM6(B) PERM5 (B), PERM5 (B) + +v128qi +qi_dup_h_32 (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (32) }); +} + +v64qi +qi_dup_s_16 (v64qi x) +{ + return __builtin_shuffle (x, x, (v64qi) { PERM5 (16) }); +} + +v32qi +qi_dup_d_8 (v32qi x) +{ + return __builtin_shuffle (x, x, (v32qi) { PERM4 (8) }); +} + 
+v64hi +hi_dup_s_16 (v64hi x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (16) }); +} + +v64hf +hf_dup_s_16 (v64hf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (16) }); +} + +v64bf +bf_dup_s_16 (v64bf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (16) }); +} + +v32hi +hi_dup_d_8 (v32hi x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (8) }); +} + +v32hf +hf_dup_d_8 (v32hf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (8) }); +} + +v32bf +bf_dup_d_8 (v32bf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (8) }); +} + +v32si +si_dup_d_8 (v32si x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (8) }); +} + +v32sf +sf_dup_d_8 (v32sf x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (8) }); +} + +/* { dg-final { scan-assembler-not {\tdup\tz} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/ext_4.c b/gcc/testsuite/gcc.target/aarch64/sve/ext_4.c new file mode 100644 index 0000000000000000000000000000000000000000..4637b5cdc7a5b9b956521379e679ff0e9f7b5edc --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/ext_4.c @@ -0,0 +1,353 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +typedef unsigned char v128qi __attribute__((vector_size(128))); +typedef unsigned char v64qi __attribute__((vector_size(64))); +typedef unsigned char v32qi __attribute__((vector_size(32))); +typedef unsigned short v64hi __attribute__((vector_size(128))); +typedef unsigned short v32hi __attribute__((vector_size(64))); +typedef _Float16 v64hf __attribute__((vector_size(128))); +typedef _Float16 v32hf __attribute__((vector_size(64))); +typedef __bf16 v64bf __attribute__((vector_size(128))); +typedef __bf16 v32bf __attribute__((vector_size(64))); +typedef unsigned int v32si __attribute__((vector_size(128))); +typedef float v32sf __attribute__((vector_size(128))); + +#define PERM0(B) B, B + 1 +#define PERM1(B) PERM0 (B), PERM0 (B + 2) +#define PERM2(B) PERM1 (B), PERM1 (B + 4) +#define PERM3(B) PERM2 (B), PERM2 (B + 8) +#define PERM4(B) PERM3 (B), PERM3 (B + 16) +#define PERM5(B) PERM4 (B), PERM4 (B + 32) +#define PERM6(B) PERM5 (B), PERM5 (B + 64) + +/* +** qi_ext_h_1: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** ext \2\.b, \2\.b, \2\.b, #2 +** st1b \2\.h, \1, \[x8\] +** ret +*/ +v128qi +qi_ext_h_1 (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (1) }); +} + +/* +** qi_ext_h_1_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1b (z[0-9]+)\.h, \1/z, \[x1\] +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** ext \3\.b, \3\.b, \2\.b, #2 +** st1b \3\.h, \1, \[x8\] +** | +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** ld1b (z[0-9]+)\.h, \1/z, \[x1\] +** ext \4\.b, \4\.b, \5\.b, #2 +** st1b \4\.h, \1, \[x8\] +** ) +** ret +*/ +v128qi +qi_ext_h_1_two_op (v128qi x, v128qi y) +{ + return __builtin_shuffle (x, y, (v128qi) { PERM6 (1) }); +} + +/* +** qi_ext_h_127: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** ext \2\.b, \2\.b, \2\.b, #254 +** st1b \2\.h, \1, \[x8\] +** ret +*/ +v128qi +qi_ext_h_127 (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (127) }); +} + +/* +** qi_ext_s_1: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** ext \2\.b, \2\.b, \2\.b, #4 +** st1b \2\.s, \1, \[x8\] +** ret +*/ +v64qi +qi_ext_s_1 (v64qi x) +{ + return __builtin_shuffle (x, x, (v64qi) { PERM5 (1) }); +} + +/* +** qi_ext_s_63: +** ptrue (p[0-7])\.b, vl256 +** 
ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** ext \2\.b, \2\.b, \2\.b, #252 +** st1b \2\.s, \1, \[x8\] +** ret +*/ +v64qi +qi_ext_s_63 (v64qi x) +{ + return __builtin_shuffle (x, x, (v64qi) { PERM5 (63) }); +} + +/* +** qi_ext_d_1: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.d, \1/z, \[x0\] +** ext \2\.b, \2\.b, \2\.b, #8 +** st1b \2\.d, \1, \[x8\] +** ret +*/ +v32qi +qi_ext_d_1 (v32qi x) +{ + return __builtin_shuffle (x, x, (v32qi) { PERM4 (1) }); +} + +/* +** qi_ext_d_31: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.d, \1/z, \[x0\] +** ext \2\.b, \2\.b, \2\.b, #248 +** st1b \2\.d, \1, \[x8\] +** ret +*/ +v32qi +qi_ext_d_31 (v32qi x) +{ + return __builtin_shuffle (x, x, (v32qi) { PERM4 (31) }); +} + +/* +** hi_ext_s_1: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** ext \2\.b, \2\.b, \2\.b, #4 +** st1h \2\.s, \1, \[x8\] +** ret +*/ +v64hi +hi_ext_s_1 (v64hi x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) }); +} + +/* +** hi_ext_s_63: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** ext \2\.b, \2\.b, \2\.b, #252 +** st1h \2\.s, \1, \[x8\] +** ret +*/ +v64hi +hi_ext_s_63 (v64hi x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (63) }); +} + +/* +** hf_ext_s_1: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** ext \2\.b, \2\.b, \2\.b, #4 +** st1h \2\.s, \1, \[x8\] +** ret +*/ +v64hf +hf_ext_s_1 (v64hf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) }); +} + +/* +** hf_ext_s_60: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** ext \2\.b, \2\.b, \2\.b, #240 +** st1h \2\.s, \1, \[x8\] +** ret +*/ +v64hf +hf_ext_s_60 (v64hf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (60) }); +} + +/* +** bf_ext_s_1: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** ext \2\.b, \2\.b, \2\.b, #4 +** st1h \2\.s, \1, \[x8\] +** ret +*/ +v64bf +bf_ext_s_1 (v64bf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) }); +} + +/* +** bf_ext_s_40: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** ext \2\.b, \2\.b, \2\.b, #160 +** st1h \2\.s, \1, \[x8\] +** ret +*/ +v64bf +bf_ext_s_40 (v64bf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (40) }); +} + +/* +** hi_ext_d_1: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** ext \2\.b, \2\.b, \2\.b, #8 +** st1h \2\.d, \1, \[x8\] +** ret +*/ +v32hi +hi_ext_d_1 (v32hi x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) }); +} + +/* +** hi_ext_d_31: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** ext \2\.b, \2\.b, \2\.b, #248 +** st1h \2\.d, \1, \[x8\] +** ret +*/ +v32hi +hi_ext_d_31 (v32hi x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (31) }); +} + +/* +** hf_ext_d_1: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** ext \2\.b, \2\.b, \2\.b, #8 +** st1h \2\.d, \1, \[x8\] +** ret +*/ +v32hf +hf_ext_d_1 (v32hf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) }); +} + +/* +** hf_ext_d_18: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** ext \2\.b, \2\.b, \2\.b, #144 +** st1h \2\.d, \1, \[x8\] +** ret +*/ +v32hf +hf_ext_d_18 (v32hf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (18) }); +} + +/* +** bf_ext_d_1: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** ext \2\.b, \2\.b, \2\.b, #8 +** st1h \2\.d, \1, \[x8\] +** ret +*/ +v32bf +bf_ext_d_1 (v32bf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) }); +} + +/* +** bf_ext_d_7: +** ptrue (p[0-7])\.b, vl256 +** ld1h 
(z[0-9]+)\.d, \1/z, \[x0\] +** ext \2\.b, \2\.b, \2\.b, #56 +** st1h \2\.d, \1, \[x8\] +** ret +*/ +v32bf +bf_ext_d_7 (v32bf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (7) }); +} + +/* +** si_ext_d_1: +** ptrue (p[0-7])\.b, vl256 +** ld1w (z[0-9]+)\.d, \1/z, \[x0\] +** ext \2\.b, \2\.b, \2\.b, #8 +** st1w \2\.d, \1, \[x8\] +** ret +*/ +v32si +si_ext_d_1 (v32si x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (1) }); +} + +/* +** si_ext_d_31: +** ptrue (p[0-7])\.b, vl256 +** ld1w (z[0-9]+)\.d, \1/z, \[x0\] +** ext \2\.b, \2\.b, \2\.b, #248 +** st1w \2\.d, \1, \[x8\] +** ret +*/ +v32si +si_ext_d_31 (v32si x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (31) }); +} + +/* +** sf_ext_d_1: +** ptrue (p[0-7])\.b, vl256 +** ld1w (z[0-9]+)\.d, \1/z, \[x0\] +** ext \2\.b, \2\.b, \2\.b, #8 +** st1w \2\.d, \1, \[x8\] +** ret +*/ +v32sf +sf_ext_d_1 (v32sf x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (1) }); +} + +/* +** sf_ext_d_31: +** ptrue (p[0-7])\.b, vl256 +** ld1w (z[0-9]+)\.d, \1/z, \[x0\] +** ext \2\.b, \2\.b, \2\.b, #248 +** st1w \2\.d, \1, \[x8\] +** ret +*/ +v32sf +sf_ext_d_31 (v32sf x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (31) }); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/rev_2.c b/gcc/testsuite/gcc.target/aarch64/sve/rev_2.c new file mode 100644 index 0000000000000000000000000000000000000000..417da37500ad498995c043ab8fcf8df337c74591 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/rev_2.c @@ -0,0 +1,177 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +typedef unsigned char v128qi __attribute__((vector_size(128))); +typedef unsigned char v64qi __attribute__((vector_size(64))); +typedef unsigned char v32qi __attribute__((vector_size(32))); +typedef unsigned short v64hi __attribute__((vector_size(128))); +typedef unsigned short v32hi __attribute__((vector_size(64))); +typedef _Float16 v64hf __attribute__((vector_size(128))); +typedef _Float16 v32hf __attribute__((vector_size(64))); +typedef __bf16 v64bf __attribute__((vector_size(128))); +typedef __bf16 v32bf __attribute__((vector_size(64))); +typedef unsigned int v32si __attribute__((vector_size(128))); +typedef float v32sf __attribute__((vector_size(128))); + +#define PERM0(B) B, B - 1 +#define PERM1(B) PERM0 (B), PERM0 (B - 2) +#define PERM2(B) PERM1 (B), PERM1 (B - 4) +#define PERM3(B) PERM2 (B), PERM2 (B - 8) +#define PERM4(B) PERM3 (B), PERM3 (B - 16) +#define PERM5(B) PERM4 (B), PERM4 (B - 32) +#define PERM6(B) PERM5 (B), PERM5 (B - 64) + +/* +** qi_rev_h: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** rev (z[0-9]+)\.h, \2\.h +** st1b \3\.h, \1, \[x8\] +** ret +*/ +v128qi +qi_rev_h (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (127) }); +} + +/* +** qi_rev_s: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** rev (z[0-9]+)\.s, \2\.s +** st1b \3\.s, \1, \[x8\] +** ret +*/ +v64qi +qi_rev_s (v64qi x) +{ + return __builtin_shuffle (x, x, (v64qi) { PERM5 (63) }); +} + +/* +** qi_rev_d: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.d, \1/z, \[x0\] +** rev (z[0-9]+)\.d, \2\.d +** st1b \3\.d, \1, \[x8\] +** ret +*/ +v32qi +qi_rev_d (v32qi x) +{ + return __builtin_shuffle (x, x, (v32qi) { PERM4 (31) }); +} + +/* +** hi_rev_s: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** rev (z[0-9]+)\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64hi 
+hi_rev_s (v64hi x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (63) }); +} + +/* +** hf_rev_s: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** rev (z[0-9]+)\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64hf +hf_rev_s (v64hf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (63) }); +} + +/* +** bf_rev_s: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** rev (z[0-9]+)\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64bf +bf_rev_s (v64bf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (63) }); +} + +/* +** hi_rev_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** rev (z[0-9]+)\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32hi +hi_rev_d (v32hi x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (31) }); +} + +/* +** hf_rev_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** rev (z[0-9]+)\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32hf +hf_rev_d (v32hf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (31) }); +} + +/* +** bf_rev_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** rev (z[0-9]+)\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32bf +bf_rev_d (v32bf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (31) }); +} + +/* +** si_rev_d: +** ptrue (p[0-7])\.b, vl256 +** ld1w (z[0-9]+)\.d, \1/z, \[x0\] +** rev (z[0-9]+)\.d, \2\.d +** st1w \3\.d, \1, \[x8\] +** ret +*/ +v32si +si_rev_d (v32si x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (31) }); +} + +/* +** sf_rev_d: +** ptrue (p[0-7])\.b, vl256 +** ld1w (z[0-9]+)\.d, \1/z, \[x0\] +** rev (z[0-9]+)\.d, \2\.d +** st1w \3\.d, \1, \[x8\] +** ret +*/ +v32sf +sf_rev_d (v32sf x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (31) }); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/revhw_1.c b/gcc/testsuite/gcc.target/aarch64/sve/revhw_1.c new file mode 100644 index 0000000000000000000000000000000000000000..62de8127584f1d1913ff6491f50cae4127873e40 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/revhw_1.c @@ -0,0 +1,127 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +typedef unsigned char v128qi __attribute__((vector_size(128))); +typedef unsigned char v64qi __attribute__((vector_size(64))); +typedef unsigned short v64hi __attribute__((vector_size(128))); +typedef _Float16 v64hf __attribute__((vector_size(128))); +typedef __bf16 v64bf __attribute__((vector_size(128))); + +#define PERM0(B) B + 1, B +#define PERM1(B) PERM0 (B), PERM0 (B + 2) +#define PERM2(B) PERM1 (B), PERM1 (B + 4) +#define PERM3(B) PERM2 (B), PERM2 (B + 8) +#define PERM4(B) PERM3 (B), PERM3 (B + 16) +#define PERM5(B) PERM4 (B), PERM4 (B + 32) +#define PERM6(B) PERM5 (B), PERM5 (B + 64) + +/* +** qi_revh_s: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** revh (z[0-9]+)\.s, \1/m, \2\.s +** st1b \3\.h, \1, \[x8\] +** ret +*/ +v128qi +qi_revh_s (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (0) }); +} + +/* +** qi_revw_d: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** revw (z[0-9]+)\.d, \1/m, \2\.d +** st1b \3\.s, \1, \[x8\] +** ret +*/ +v64qi +qi_revw_d (v64qi x) +{ + return __builtin_shuffle (x, x, (v64qi) { PERM5 (0) }); +} + +/* +** hi_revw_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** revw (z[0-9]+)\.d, \1/m, \2\.d +** st1h \3\.s, \1, \[x8\] +** ret +*/ 
+v64hi +hi_revw_d (v64hi x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) }); +} + +/* +** hf_revw_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** revw (z[0-9]+)\.d, \1/m, \2\.d +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64hf +hf_revw_d (v64hf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) }); +} + +/* +** bf_revw_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** revw (z[0-9]+)\.d, \1/m, \2\.d +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64bf +bf_revw_d (v64bf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) }); +} + +#undef PERM1 +#define PERM1(B) PERM0 (B + 2), PERM0 (B) + +/* +** qi_revh_d: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** revh (z[0-9]+)\.d, \1/m, \2\.d +** st1b \3\.h, \1, \[x8\] +** ret +*/ +v128qi +qi_revh_d (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (0) }); +} + +v64qi +qi_revw_q (v64qi x) +{ + return __builtin_shuffle (x, x, (v64qi) { PERM5 (0) }); +} + +v64hi +hi_revw_q (v64hi x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) }); +} + +#undef PERM2 +#define PERM2(B) PERM0 (B + 4), PERM0 (B) + +v128qi +qi_revh_q (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (0) }); +} + +/* { dg-final { scan-assembler-times {\trev.\t} 6 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/revhw_2.c b/gcc/testsuite/gcc.target/aarch64/sve/revhw_2.c new file mode 100644 index 0000000000000000000000000000000000000000..7634d01b2c42176b22a5f5ebe44ef625a429193a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/revhw_2.c @@ -0,0 +1,127 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O -msve-vector-bits=2048 -mbig-endian --save-temps" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +typedef unsigned char v128qi __attribute__((vector_size(128))); +typedef unsigned char v64qi __attribute__((vector_size(64))); +typedef unsigned short v64hi __attribute__((vector_size(128))); +typedef _Float16 v64hf __attribute__((vector_size(128))); +typedef __bf16 v64bf __attribute__((vector_size(128))); + +#define PERM0(B) B + 1, B +#define PERM1(B) PERM0 (B), PERM0 (B + 2) +#define PERM2(B) PERM1 (B), PERM1 (B + 4) +#define PERM3(B) PERM2 (B), PERM2 (B + 8) +#define PERM4(B) PERM3 (B), PERM3 (B + 16) +#define PERM5(B) PERM4 (B), PERM4 (B + 32) +#define PERM6(B) PERM5 (B), PERM5 (B + 64) + +/* +** qi_revh_s: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** revh (z[0-9]+)\.s, \1/m, \2\.s +** st1b \3\.h, \1, \[x8\] +** ret +*/ +v128qi +qi_revh_s (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (0) }); +} + +/* +** qi_revw_d: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** revw (z[0-9]+)\.d, \1/m, \2\.d +** st1b \3\.s, \1, \[x8\] +** ret +*/ +v64qi +qi_revw_d (v64qi x) +{ + return __builtin_shuffle (x, x, (v64qi) { PERM5 (0) }); +} + +/* +** hi_revw_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** revw (z[0-9]+)\.d, \1/m, \2\.d +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64hi +hi_revw_d (v64hi x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) }); +} + +/* +** hf_revw_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** revw (z[0-9]+)\.d, \1/m, \2\.d +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64hf +hf_revw_d (v64hf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) }); +} + +/* +** bf_revw_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** revw (z[0-9]+)\.d, \1/m, \2\.d +** st1h \3\.s, 
\1, \[x8\] +** ret +*/ +v64bf +bf_revw_d (v64bf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) }); +} + +#undef PERM1 +#define PERM1(B) PERM0 (B + 2), PERM0 (B) + +/* +** qi_revh_d: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** revh (z[0-9]+)\.d, \1/m, \2\.d +** st1b \3\.h, \1, \[x8\] +** ret +*/ +v128qi +qi_revh_d (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (0) }); +} + +v64qi +qi_revw_q (v64qi x) +{ + return __builtin_shuffle (x, x, (v64qi) { PERM5 (0) }); +} + +v64hi +hi_revw_q (v64hi x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) }); +} + +#undef PERM2 +#define PERM2(B) PERM0 (B + 4), PERM0 (B) + +v128qi +qi_revh_q (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (0) }); +} + +/* { dg-final { scan-assembler-times {\trev.\t} 6 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_perm_8.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_perm_8.c new file mode 100644 index 0000000000000000000000000000000000000000..fe25000b0bf89a26d0c6328e15daa3f099b18ebd --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_perm_8.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +void +f (short *restrict s, signed char *restrict c) +{ + for (int i = 0; i < 8; i += 2) + { + s[i] = c[i]; + s[i + 1] = c[i]; + } +} + +/* Ideally this would use LD1SB, but currently we use LD1B and + sign-extend it after the permute. */ +/* { dg-final { scan-assembler {\tptrue\tp[0-7]\.h, vl6\n} } } */ +/* { dg-final { scan-assembler {\tld1s?b\tz[0-9]+\.h} } } */ +/* { dg-final { scan-assembler {\ttrn1\tz[0-9]+\.h,} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/trn1_2.c b/gcc/testsuite/gcc.target/aarch64/sve/trn1_2.c new file mode 100644 index 0000000000000000000000000000000000000000..df059ddbc8d98715a58a2d805c5f1ff694510d75 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/trn1_2.c @@ -0,0 +1,403 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +typedef unsigned char v128qi __attribute__((vector_size(128))); +typedef unsigned char v64qi __attribute__((vector_size(64))); +typedef unsigned char v32qi __attribute__((vector_size(32))); +typedef unsigned short v64hi __attribute__((vector_size(128))); +typedef unsigned short v32hi __attribute__((vector_size(64))); +typedef _Float16 v64hf __attribute__((vector_size(128))); +typedef _Float16 v32hf __attribute__((vector_size(64))); +typedef __bf16 v64bf __attribute__((vector_size(128))); +typedef __bf16 v32bf __attribute__((vector_size(64))); +typedef unsigned int v32si __attribute__((vector_size(128))); +typedef float v32sf __attribute__((vector_size(128))); + +#define PERM0(B, C) B, B + C +#define PERM1(B, C) PERM0 (B, C), PERM0 (B + 2, C) +#define PERM2(B, C) PERM1 (B, C), PERM1 (B + 4, C) +#define PERM3(B, C) PERM2 (B, C), PERM2 (B + 8, C) +#define PERM4(B, C) PERM3 (B, C), PERM3 (B + 16, C) +#define PERM5(B, C) PERM4 (B, C), PERM4 (B + 32, C) +#define PERM6(B, C) PERM5 (B, C), PERM5 (B + 64, C) + +/* +** qi_trn1_h_a: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** trn1 (z[0-9]+)\.h, \2\.h, \2\.h +** st1b \3\.h, \1, \[x8\] +** ret +*/ +v128qi +qi_trn1_h_a (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (0, 0) }); +} + +/* +** qi_trn1_h_b: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** trn1 (z[0-9]+)\.h, \2\.h, \2\.h +** st1b 
\3\.h, \1, \[x8\] +** ret +*/ +v128qi +qi_trn1_h_b (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (0, 128) }); +} + +/* +** qi_trn1_h_c: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** trn1 (z[0-9]+)\.h, \2\.h, \2\.h +** st1b \3\.h, \1, \[x8\] +** ret +*/ +v128qi +qi_trn1_h_c (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (128, 0) }); +} + +/* +** qi_trn1_h_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1b (z[0-9]+)\.h, \1/z, \[x1\] +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** trn1 \3\.h, \3\.h, \2\.h +** st1b \3\.h, \1, \[x8\] +** | +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** ld1b (z[0-9]+)\.h, \1/z, \[x1\] +** trn1 \4\.h, \4\.h, \5\.h +** st1b \4\.h, \1, \[x8\] +** ) +** ret +*/ +v128qi +qi_trn1_h_two_op (v128qi x, v128qi y) +{ + return __builtin_shuffle (x, y, (v128qi) { PERM6 (0, 128) }); +} + +/* +** qi_trn1_s: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** trn1 (z[0-9]+)\.s, \2\.s, \2\.s +** st1b \3\.s, \1, \[x8\] +** ret +*/ +v64qi +qi_trn1_s (v64qi x) +{ + return __builtin_shuffle (x, x, (v64qi) { PERM5 (0, 64) }); +} + +/* +** qi_trn1_s_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1b (z[0-9]+)\.s, \1/z, \[x1\] +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** trn1 \3\.s, \3\.s, \2\.s +** st1b \3\.s, \1, \[x8\] +** | +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** ld1b (z[0-9]+)\.s, \1/z, \[x1\] +** trn1 \4\.s, \4\.s, \5\.s +** st1b \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64qi +qi_trn1_s_two_op (v64qi x, v64qi y) +{ + return __builtin_shuffle (x, y, (v64qi) { PERM5 (0, 64) }); +} + +/* +** qi_trn1_d: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.d, \1/z, \[x0\] +** trn1 (z[0-9]+)\.d, \2\.d, \2\.d +** st1b \3\.d, \1, \[x8\] +** ret +*/ +v32qi +qi_trn1_d (v32qi x) +{ + return __builtin_shuffle (x, x, (v32qi) { PERM4 (0, 32) }); +} + +/* +** qi_trn1_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1b (z[0-9]+)\.d, \1/z, \[x1\] +** ld1b (z[0-9]+)\.d, \1/z, \[x0\] +** trn1 \3\.d, \3\.d, \2\.d +** st1b \3\.d, \1, \[x8\] +** | +** ld1b (z[0-9]+)\.d, \1/z, \[x0\] +** ld1b (z[0-9]+)\.d, \1/z, \[x1\] +** trn1 \4\.d, \4\.d, \5\.d +** st1b \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32qi +qi_trn1_d_two_op (v32qi x, v32qi y) +{ + return __builtin_shuffle (x, y, (v32qi) { PERM4 (0, 32) }); +} + +/* +** hi_trn1_s: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** trn1 (z[0-9]+)\.s, \2\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64hi +hi_trn1_s (v64hi x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (0, 64) }); +} + +/* +** hi_trn1_s_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** trn1 \3\.s, \3\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** trn1 \4\.s, \4\.s, \5\.s +** st1h \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64hi +hi_trn1_s_two_op (v64hi x, v64hi y) +{ + return __builtin_shuffle (x, y, (v64hi) { PERM5 (0, 64) }); +} + +/* +** hf_trn1_s: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** trn1 (z[0-9]+)\.s, \2\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64hf +hf_trn1_s (v64hf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (0, 64) }); +} + +/* +** hf_trn1_s_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** trn1 \3\.s, \3\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** trn1 \4\.s, \4\.s, \5\.s +** 
st1h \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64hf +hf_trn1_s_two_op (v64hf x, v64hf y) +{ + return __builtin_shuffle (x, y, (v64hi) { PERM5 (0, 64) }); +} + +/* +** bf_trn1_s: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** trn1 (z[0-9]+)\.s, \2\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64bf +bf_trn1_s (v64bf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (0, 64) }); +} + +/* +** bf_trn1_s_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** trn1 \3\.s, \3\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** trn1 \4\.s, \4\.s, \5\.s +** st1h \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64bf +bf_trn1_s_two_op (v64bf x, v64bf y) +{ + return __builtin_shuffle (x, y, (v64hi) { PERM5 (0, 64) }); +} + +/* +** hi_trn1_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** trn1 (z[0-9]+)\.d, \2\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32hi +hi_trn1_d (v32hi x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (0, 32) }); +} + +/* +** hi_trn1_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** trn1 \3\.d, \3\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** trn1 \4\.d, \4\.d, \5\.d +** st1h \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32hi +hi_trn1_d_two_op (v32hi x, v32hi y) +{ + return __builtin_shuffle (x, y, (v32hi) { PERM4 (0, 32) }); +} + +/* +** hf_trn1_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** trn1 (z[0-9]+)\.d, \2\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32hf +hf_trn1_d (v32hf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (0, 32) }); +} + +/* +** hf_trn1_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** trn1 \3\.d, \3\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** trn1 \4\.d, \4\.d, \5\.d +** st1h \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32hf +hf_trn1_d_two_op (v32hf x, v32hf y) +{ + return __builtin_shuffle (x, y, (v32hi) { PERM4 (0, 32) }); +} + +/* +** bf_trn1_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** trn1 (z[0-9]+)\.d, \2\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32bf +bf_trn1_d (v32bf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (0, 32) }); +} + +/* +** bf_trn1_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** trn1 \3\.d, \3\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** trn1 \4\.d, \4\.d, \5\.d +** st1h \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32bf +bf_trn1_d_two_op (v32bf x, v32bf y) +{ + return __builtin_shuffle (x, y, (v32hi) { PERM4 (0, 32) }); +} + +/* +** si_trn1_d: +** ptrue (p[0-7])\.b, vl256 +** ld1w (z[0-9]+)\.d, \1/z, \[x0\] +** trn1 (z[0-9]+)\.d, \2\.d, \2\.d +** st1w \3\.d, \1, \[x8\] +** ret +*/ +v32si +si_trn1_d (v32si x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (0, 32) }); +} + +/* +** sf_trn1_d: +** ptrue (p[0-7])\.b, vl256 +** ld1w (z[0-9]+)\.d, \1/z, \[x0\] +** trn1 (z[0-9]+)\.d, \2\.d, \2\.d +** st1w \3\.d, \1, \[x8\] +** ret +*/ +v32sf +sf_trn1_d (v32sf x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (0, 32) }); +} diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/trn2_2.c b/gcc/testsuite/gcc.target/aarch64/sve/trn2_2.c new file mode 100644 index 0000000000000000000000000000000000000000..290ce8e980ce320f00274268ab0688683294f569 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/trn2_2.c @@ -0,0 +1,403 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +typedef unsigned char v128qi __attribute__((vector_size(128))); +typedef unsigned char v64qi __attribute__((vector_size(64))); +typedef unsigned char v32qi __attribute__((vector_size(32))); +typedef unsigned short v64hi __attribute__((vector_size(128))); +typedef unsigned short v32hi __attribute__((vector_size(64))); +typedef _Float16 v64hf __attribute__((vector_size(128))); +typedef _Float16 v32hf __attribute__((vector_size(64))); +typedef __bf16 v64bf __attribute__((vector_size(128))); +typedef __bf16 v32bf __attribute__((vector_size(64))); +typedef unsigned int v32si __attribute__((vector_size(128))); +typedef float v32sf __attribute__((vector_size(128))); + +#define PERM0(B, C) B, B + C +#define PERM1(B, C) PERM0 (B, C), PERM0 (B + 2, C) +#define PERM2(B, C) PERM1 (B, C), PERM1 (B + 4, C) +#define PERM3(B, C) PERM2 (B, C), PERM2 (B + 8, C) +#define PERM4(B, C) PERM3 (B, C), PERM3 (B + 16, C) +#define PERM5(B, C) PERM4 (B, C), PERM4 (B + 32, C) +#define PERM6(B, C) PERM5 (B, C), PERM5 (B + 64, C) + +/* +** qi_trn2_h_a: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** trn2 (z[0-9]+)\.h, \2\.h, \2\.h +** st1b \3\.h, \1, \[x8\] +** ret +*/ +v128qi +qi_trn2_h_a (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (1, 0) }); +} + +/* +** qi_trn2_h_b: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** trn2 (z[0-9]+)\.h, \2\.h, \2\.h +** st1b \3\.h, \1, \[x8\] +** ret +*/ +v128qi +qi_trn2_h_b (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (1, 128) }); +} + +/* +** qi_trn2_h_c: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** trn2 (z[0-9]+)\.h, \2\.h, \2\.h +** st1b \3\.h, \1, \[x8\] +** ret +*/ +v128qi +qi_trn2_h_c (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (1, 0) }); +} + +/* +** qi_trn2_h_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1b (z[0-9]+)\.h, \1/z, \[x1\] +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** trn2 \3\.h, \3\.h, \2\.h +** st1b \3\.h, \1, \[x8\] +** | +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** ld1b (z[0-9]+)\.h, \1/z, \[x1\] +** trn2 \4\.h, \4\.h, \5\.h +** st1b \4\.h, \1, \[x8\] +** ) +** ret +*/ +v128qi +qi_trn2_h_two_op (v128qi x, v128qi y) +{ + return __builtin_shuffle (x, y, (v128qi) { PERM6 (1, 128) }); +} + +/* +** qi_trn2_s: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** trn2 (z[0-9]+)\.s, \2\.s, \2\.s +** st1b \3\.s, \1, \[x8\] +** ret +*/ +v64qi +qi_trn2_s (v64qi x) +{ + return __builtin_shuffle (x, x, (v64qi) { PERM5 (1, 64) }); +} + +/* +** qi_trn2_s_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1b (z[0-9]+)\.s, \1/z, \[x1\] +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** trn2 \3\.s, \3\.s, \2\.s +** st1b \3\.s, \1, \[x8\] +** | +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** ld1b (z[0-9]+)\.s, \1/z, \[x1\] +** trn2 \4\.s, \4\.s, \5\.s +** st1b \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64qi +qi_trn2_s_two_op (v64qi x, v64qi y) +{ + return __builtin_shuffle (x, y, (v64qi) { PERM5 (1, 64) }); +} + +/* +** qi_trn2_d: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.d, \1/z, \[x0\] +** trn2 
(z[0-9]+)\.d, \2\.d, \2\.d +** st1b \3\.d, \1, \[x8\] +** ret +*/ +v32qi +qi_trn2_d (v32qi x) +{ + return __builtin_shuffle (x, x, (v32qi) { PERM4 (1, 32) }); +} + +/* +** qi_trn2_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1b (z[0-9]+)\.d, \1/z, \[x1\] +** ld1b (z[0-9]+)\.d, \1/z, \[x0\] +** trn2 \3\.d, \3\.d, \2\.d +** st1b \3\.d, \1, \[x8\] +** | +** ld1b (z[0-9]+)\.d, \1/z, \[x0\] +** ld1b (z[0-9]+)\.d, \1/z, \[x1\] +** trn2 \4\.d, \4\.d, \5\.d +** st1b \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32qi +qi_trn2_d_two_op (v32qi x, v32qi y) +{ + return __builtin_shuffle (x, y, (v32qi) { PERM4 (1, 32) }); +} + +/* +** hi_trn2_s: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** trn2 (z[0-9]+)\.s, \2\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64hi +hi_trn2_s (v64hi x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (1, 64) }); +} + +/* +** hi_trn2_s_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** trn2 \3\.s, \3\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** trn2 \4\.s, \4\.s, \5\.s +** st1h \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64hi +hi_trn2_s_two_op (v64hi x, v64hi y) +{ + return __builtin_shuffle (x, y, (v64hi) { PERM5 (1, 64) }); +} + +/* +** hf_trn2_s: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** trn2 (z[0-9]+)\.s, \2\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64hf +hf_trn2_s (v64hf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (1, 64) }); +} + +/* +** hf_trn2_s_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** trn2 \3\.s, \3\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** trn2 \4\.s, \4\.s, \5\.s +** st1h \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64hf +hf_trn2_s_two_op (v64hf x, v64hf y) +{ + return __builtin_shuffle (x, y, (v64hi) { PERM5 (1, 64) }); +} + +/* +** bf_trn2_s: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** trn2 (z[0-9]+)\.s, \2\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64bf +bf_trn2_s (v64bf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (1, 64) }); +} + +/* +** bf_trn2_s_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** trn2 \3\.s, \3\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** trn2 \4\.s, \4\.s, \5\.s +** st1h \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64bf +bf_trn2_s_two_op (v64bf x, v64bf y) +{ + return __builtin_shuffle (x, y, (v64hi) { PERM5 (1, 64) }); +} + +/* +** hi_trn2_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** trn2 (z[0-9]+)\.d, \2\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32hi +hi_trn2_d (v32hi x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (1, 32) }); +} + +/* +** hi_trn2_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** trn2 \3\.d, \3\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** trn2 \4\.d, \4\.d, \5\.d +** st1h \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32hi +hi_trn2_d_two_op (v32hi x, v32hi y) +{ + return __builtin_shuffle (x, y, (v32hi) { PERM4 (1, 32) }); +} + +/* +** hf_trn2_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** 
trn2 (z[0-9]+)\.d, \2\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32hf +hf_trn2_d (v32hf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (1, 32) }); +} + +/* +** hf_trn2_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** trn2 \3\.d, \3\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** trn2 \4\.d, \4\.d, \5\.d +** st1h \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32hf +hf_trn2_d_two_op (v32hf x, v32hf y) +{ + return __builtin_shuffle (x, y, (v32hi) { PERM4 (1, 32) }); +} + +/* +** bf_trn2_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** trn2 (z[0-9]+)\.d, \2\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32bf +bf_trn2_d (v32bf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (1, 32) }); +} + +/* +** bf_trn2_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** trn2 \3\.d, \3\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** trn2 \4\.d, \4\.d, \5\.d +** st1h \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32bf +bf_trn2_d_two_op (v32bf x, v32bf y) +{ + return __builtin_shuffle (x, y, (v32hi) { PERM4 (1, 32) }); +} + +/* +** si_trn2_d: +** ptrue (p[0-7])\.b, vl256 +** ld1w (z[0-9]+)\.d, \1/z, \[x0\] +** trn2 (z[0-9]+)\.d, \2\.d, \2\.d +** st1w \3\.d, \1, \[x8\] +** ret +*/ +v32si +si_trn2_d (v32si x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (1, 32) }); +} + +/* +** sf_trn2_d: +** ptrue (p[0-7])\.b, vl256 +** ld1w (z[0-9]+)\.d, \1/z, \[x0\] +** trn2 (z[0-9]+)\.d, \2\.d, \2\.d +** st1w \3\.d, \1, \[x8\] +** ret +*/ +v32sf +sf_trn2_d (v32sf x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (1, 32) }); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/uzp1_2.c b/gcc/testsuite/gcc.target/aarch64/sve/uzp1_2.c new file mode 100644 index 0000000000000000000000000000000000000000..e2f2692c7cfeb76275bd52fd8c85c1699c466140 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/uzp1_2.c @@ -0,0 +1,375 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +typedef unsigned char v128qi __attribute__((vector_size(128))); +typedef unsigned char v64qi __attribute__((vector_size(64))); +typedef unsigned char v32qi __attribute__((vector_size(32))); +typedef unsigned short v64hi __attribute__((vector_size(128))); +typedef unsigned short v32hi __attribute__((vector_size(64))); +typedef _Float16 v64hf __attribute__((vector_size(128))); +typedef _Float16 v32hf __attribute__((vector_size(64))); +typedef __bf16 v64bf __attribute__((vector_size(128))); +typedef __bf16 v32bf __attribute__((vector_size(64))); +typedef unsigned int v32si __attribute__((vector_size(128))); +typedef float v32sf __attribute__((vector_size(128))); + +#define PERM0(B) B, B + 2 +#define PERM1(B) PERM0 (B), PERM0 (B + 4) +#define PERM2(B) PERM1 (B), PERM1 (B + 8) +#define PERM3(B) PERM2 (B), PERM2 (B + 16) +#define PERM4(B) PERM3 (B), PERM3 (B + 32) +#define PERM5(B) PERM4 (B), PERM4 (B + 64) +#define PERM6(B) PERM5 (B), PERM5 (B + 128) + +/* +** qi_uzp1_h: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** uzp1 (z[0-9]+)\.h, \2\.h, \2\.h +** st1b \3\.h, \1, \[x8\] +** ret +*/ +v128qi +qi_uzp1_h (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (0) }); 
+} + +/* +** qi_uzp1_h_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1b (z[0-9]+)\.h, \1/z, \[x1\] +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** uzp1 \3\.h, \3\.h, \2\.h +** st1b \3\.h, \1, \[x8\] +** | +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** ld1b (z[0-9]+)\.h, \1/z, \[x1\] +** uzp1 \4\.h, \4\.h, \5\.h +** st1b \4\.h, \1, \[x8\] +** ) +** ret +*/ +v128qi +qi_uzp1_h_two_op (v128qi x, v128qi y) +{ + return __builtin_shuffle (x, y, (v128qi) { PERM6 (0) }); +} + +/* +** qi_uzp1_s: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** uzp1 (z[0-9]+)\.s, \2\.s, \2\.s +** st1b \3\.s, \1, \[x8\] +** ret +*/ +v64qi +qi_uzp1_s (v64qi x) +{ + return __builtin_shuffle (x, x, (v64qi) { PERM5 (0) }); +} + +/* +** qi_uzp1_s_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1b (z[0-9]+)\.s, \1/z, \[x1\] +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** uzp1 \3\.s, \3\.s, \2\.s +** st1b \3\.s, \1, \[x8\] +** | +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** ld1b (z[0-9]+)\.s, \1/z, \[x1\] +** uzp1 \4\.s, \4\.s, \5\.s +** st1b \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64qi +qi_uzp1_s_two_op (v64qi x, v64qi y) +{ + return __builtin_shuffle (x, y, (v64qi) { PERM5 (0) }); +} + +/* +** qi_uzp1_d: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.d, \1/z, \[x0\] +** uzp1 (z[0-9]+)\.d, \2\.d, \2\.d +** st1b \3\.d, \1, \[x8\] +** ret +*/ +v32qi +qi_uzp1_d (v32qi x) +{ + return __builtin_shuffle (x, x, (v32qi) { PERM4 (0) }); +} + +/* +** qi_uzp1_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1b (z[0-9]+)\.d, \1/z, \[x1\] +** ld1b (z[0-9]+)\.d, \1/z, \[x0\] +** uzp1 \3\.d, \3\.d, \2\.d +** st1b \3\.d, \1, \[x8\] +** | +** ld1b (z[0-9]+)\.d, \1/z, \[x0\] +** ld1b (z[0-9]+)\.d, \1/z, \[x1\] +** uzp1 \4\.d, \4\.d, \5\.d +** st1b \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32qi +qi_uzp1_d_two_op (v32qi x, v32qi y) +{ + return __builtin_shuffle (x, y, (v32qi) { PERM4 (0) }); +} + +/* +** hi_uzp1_s: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** uzp1 (z[0-9]+)\.s, \2\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64hi +hi_uzp1_s (v64hi x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) }); +} + +/* +** hi_uzp1_s_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** uzp1 \3\.s, \3\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** uzp1 \4\.s, \4\.s, \5\.s +** st1h \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64hi +hi_uzp1_s_two_op (v64hi x, v64hi y) +{ + return __builtin_shuffle (x, y, (v64hi) { PERM5 (0) }); +} + +/* +** hf_uzp1_s: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** uzp1 (z[0-9]+)\.s, \2\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64hf +hf_uzp1_s (v64hf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) }); +} + +/* +** hf_uzp1_s_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** uzp1 \3\.s, \3\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** uzp1 \4\.s, \4\.s, \5\.s +** st1h \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64hf +hf_uzp1_s_two_op (v64hf x, v64hf y) +{ + return __builtin_shuffle (x, y, (v64hi) { PERM5 (0) }); +} + +/* +** bf_uzp1_s: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** uzp1 (z[0-9]+)\.s, \2\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64bf +bf_uzp1_s (v64bf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) }); +} + +/* +** bf_uzp1_s_two_op: 
+** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** uzp1 \3\.s, \3\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** uzp1 \4\.s, \4\.s, \5\.s +** st1h \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64bf +bf_uzp1_s_two_op (v64bf x, v64bf y) +{ + return __builtin_shuffle (x, y, (v64hi) { PERM5 (0) }); +} + +/* +** hi_uzp1_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** uzp1 (z[0-9]+)\.d, \2\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32hi +hi_uzp1_d (v32hi x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (0) }); +} + +/* +** hi_uzp1_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** uzp1 \3\.d, \3\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** uzp1 \4\.d, \4\.d, \5\.d +** st1h \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32hi +hi_uzp1_d_two_op (v32hi x, v32hi y) +{ + return __builtin_shuffle (x, y, (v32hi) { PERM4 (0) }); +} + +/* +** hf_uzp1_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** uzp1 (z[0-9]+)\.d, \2\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32hf +hf_uzp1_d (v32hf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (0) }); +} + +/* +** hf_uzp1_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** uzp1 \3\.d, \3\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** uzp1 \4\.d, \4\.d, \5\.d +** st1h \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32hf +hf_uzp1_d_two_op (v32hf x, v32hf y) +{ + return __builtin_shuffle (x, y, (v32hi) { PERM4 (0) }); +} + +/* +** bf_uzp1_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** uzp1 (z[0-9]+)\.d, \2\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32bf +bf_uzp1_d (v32bf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (0) }); +} + +/* +** bf_uzp1_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** uzp1 \3\.d, \3\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** uzp1 \4\.d, \4\.d, \5\.d +** st1h \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32bf +bf_uzp1_d_two_op (v32bf x, v32bf y) +{ + return __builtin_shuffle (x, y, (v32hi) { PERM4 (0) }); +} + +/* +** si_uzp1_d: +** ptrue (p[0-7])\.b, vl256 +** ld1w (z[0-9]+)\.d, \1/z, \[x0\] +** uzp1 (z[0-9]+)\.d, \2\.d, \2\.d +** st1w \3\.d, \1, \[x8\] +** ret +*/ +v32si +si_uzp1_d (v32si x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (0) }); +} + +/* +** sf_uzp1_d: +** ptrue (p[0-7])\.b, vl256 +** ld1w (z[0-9]+)\.d, \1/z, \[x0\] +** uzp1 (z[0-9]+)\.d, \2\.d, \2\.d +** st1w \3\.d, \1, \[x8\] +** ret +*/ +v32sf +sf_uzp1_d (v32sf x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (0) }); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/uzp2_2.c b/gcc/testsuite/gcc.target/aarch64/sve/uzp2_2.c new file mode 100644 index 0000000000000000000000000000000000000000..0d8eda567cf142c22ac50163a66be4678246208b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/uzp2_2.c @@ -0,0 +1,375 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +typedef unsigned 
char v128qi __attribute__((vector_size(128))); +typedef unsigned char v64qi __attribute__((vector_size(64))); +typedef unsigned char v32qi __attribute__((vector_size(32))); +typedef unsigned short v64hi __attribute__((vector_size(128))); +typedef unsigned short v32hi __attribute__((vector_size(64))); +typedef _Float16 v64hf __attribute__((vector_size(128))); +typedef _Float16 v32hf __attribute__((vector_size(64))); +typedef __bf16 v64bf __attribute__((vector_size(128))); +typedef __bf16 v32bf __attribute__((vector_size(64))); +typedef unsigned int v32si __attribute__((vector_size(128))); +typedef float v32sf __attribute__((vector_size(128))); + +#define PERM0(B) B, B + 2 +#define PERM1(B) PERM0 (B), PERM0 (B + 4) +#define PERM2(B) PERM1 (B), PERM1 (B + 8) +#define PERM3(B) PERM2 (B), PERM2 (B + 16) +#define PERM4(B) PERM3 (B), PERM3 (B + 32) +#define PERM5(B) PERM4 (B), PERM4 (B + 64) +#define PERM6(B) PERM5 (B), PERM5 (B + 128) + +/* +** qi_uzp2_h: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** uzp2 (z[0-9]+)\.h, \2\.h, \2\.h +** st1b \3\.h, \1, \[x8\] +** ret +*/ +v128qi +qi_uzp2_h (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (1) }); +} + +/* +** qi_uzp2_h_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1b (z[0-9]+)\.h, \1/z, \[x1\] +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** uzp2 \3\.h, \3\.h, \2\.h +** st1b \3\.h, \1, \[x8\] +** | +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** ld1b (z[0-9]+)\.h, \1/z, \[x1\] +** uzp2 \4\.h, \4\.h, \5\.h +** st1b \4\.h, \1, \[x8\] +** ) +** ret +*/ +v128qi +qi_uzp2_h_two_op (v128qi x, v128qi y) +{ + return __builtin_shuffle (x, y, (v128qi) { PERM6 (1) }); +} + +/* +** qi_uzp2_s: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** uzp2 (z[0-9]+)\.s, \2\.s, \2\.s +** st1b \3\.s, \1, \[x8\] +** ret +*/ +v64qi +qi_uzp2_s (v64qi x) +{ + return __builtin_shuffle (x, x, (v64qi) { PERM5 (1) }); +} + +/* +** qi_uzp2_s_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1b (z[0-9]+)\.s, \1/z, \[x1\] +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** uzp2 \3\.s, \3\.s, \2\.s +** st1b \3\.s, \1, \[x8\] +** | +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** ld1b (z[0-9]+)\.s, \1/z, \[x1\] +** uzp2 \4\.s, \4\.s, \5\.s +** st1b \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64qi +qi_uzp2_s_two_op (v64qi x, v64qi y) +{ + return __builtin_shuffle (x, y, (v64qi) { PERM5 (1) }); +} + +/* +** qi_uzp2_d: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.d, \1/z, \[x0\] +** uzp2 (z[0-9]+)\.d, \2\.d, \2\.d +** st1b \3\.d, \1, \[x8\] +** ret +*/ +v32qi +qi_uzp2_d (v32qi x) +{ + return __builtin_shuffle (x, x, (v32qi) { PERM4 (1) }); +} + +/* +** qi_uzp2_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1b (z[0-9]+)\.d, \1/z, \[x1\] +** ld1b (z[0-9]+)\.d, \1/z, \[x0\] +** uzp2 \3\.d, \3\.d, \2\.d +** st1b \3\.d, \1, \[x8\] +** | +** ld1b (z[0-9]+)\.d, \1/z, \[x0\] +** ld1b (z[0-9]+)\.d, \1/z, \[x1\] +** uzp2 \4\.d, \4\.d, \5\.d +** st1b \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32qi +qi_uzp2_d_two_op (v32qi x, v32qi y) +{ + return __builtin_shuffle (x, y, (v32qi) { PERM4 (1) }); +} + +/* +** hi_uzp2_s: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** uzp2 (z[0-9]+)\.s, \2\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64hi +hi_uzp2_s (v64hi x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) }); +} + +/* +** hi_uzp2_s_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** uzp2 \3\.s, \3\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** 
ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** uzp2 \4\.s, \4\.s, \5\.s +** st1h \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64hi +hi_uzp2_s_two_op (v64hi x, v64hi y) +{ + return __builtin_shuffle (x, y, (v64hi) { PERM5 (1) }); +} + +/* +** hf_uzp2_s: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** uzp2 (z[0-9]+)\.s, \2\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64hf +hf_uzp2_s (v64hf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) }); +} + +/* +** hf_uzp2_s_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** uzp2 \3\.s, \3\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** uzp2 \4\.s, \4\.s, \5\.s +** st1h \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64hf +hf_uzp2_s_two_op (v64hf x, v64hf y) +{ + return __builtin_shuffle (x, y, (v64hi) { PERM5 (1) }); +} + +/* +** bf_uzp2_s: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** uzp2 (z[0-9]+)\.s, \2\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64bf +bf_uzp2_s (v64bf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) }); +} + +/* +** bf_uzp2_s_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** uzp2 \3\.s, \3\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** uzp2 \4\.s, \4\.s, \5\.s +** st1h \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64bf +bf_uzp2_s_two_op (v64bf x, v64bf y) +{ + return __builtin_shuffle (x, y, (v64hi) { PERM5 (1) }); +} + +/* +** hi_uzp2_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** uzp2 (z[0-9]+)\.d, \2\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32hi +hi_uzp2_d (v32hi x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) }); +} + +/* +** hi_uzp2_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** uzp2 \3\.d, \3\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** uzp2 \4\.d, \4\.d, \5\.d +** st1h \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32hi +hi_uzp2_d_two_op (v32hi x, v32hi y) +{ + return __builtin_shuffle (x, y, (v32hi) { PERM4 (1) }); +} + +/* +** hf_uzp2_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** uzp2 (z[0-9]+)\.d, \2\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32hf +hf_uzp2_d (v32hf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) }); +} + +/* +** hf_uzp2_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** uzp2 \3\.d, \3\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** uzp2 \4\.d, \4\.d, \5\.d +** st1h \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32hf +hf_uzp2_d_two_op (v32hf x, v32hf y) +{ + return __builtin_shuffle (x, y, (v32hi) { PERM4 (1) }); +} + +/* +** bf_uzp2_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** uzp2 (z[0-9]+)\.d, \2\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32bf +bf_uzp2_d (v32bf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) }); +} + +/* +** bf_uzp2_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** uzp2 \3\.d, \3\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] 
+** uzp2 \4\.d, \4\.d, \5\.d +** st1h \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32bf +bf_uzp2_d_two_op (v32bf x, v32bf y) +{ + return __builtin_shuffle (x, y, (v32hi) { PERM4 (1) }); +} + +/* +** si_uzp2_d: +** ptrue (p[0-7])\.b, vl256 +** ld1w (z[0-9]+)\.d, \1/z, \[x0\] +** uzp2 (z[0-9]+)\.d, \2\.d, \2\.d +** st1w \3\.d, \1, \[x8\] +** ret +*/ +v32si +si_uzp2_d (v32si x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (1) }); +} + +/* +** sf_uzp2_d: +** ptrue (p[0-7])\.b, vl256 +** ld1w (z[0-9]+)\.d, \1/z, \[x0\] +** uzp2 (z[0-9]+)\.d, \2\.d, \2\.d +** st1w \3\.d, \1, \[x8\] +** ret +*/ +v32sf +sf_uzp2_d (v32sf x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (1) }); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/zip1_2.c b/gcc/testsuite/gcc.target/aarch64/sve/zip1_2.c new file mode 100644 index 0000000000000000000000000000000000000000..395b96f5f0d649f6768a64fac9644e88ba439161 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/zip1_2.c @@ -0,0 +1,403 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +typedef unsigned char v128qi __attribute__((vector_size(128))); +typedef unsigned char v64qi __attribute__((vector_size(64))); +typedef unsigned char v32qi __attribute__((vector_size(32))); +typedef unsigned short v64hi __attribute__((vector_size(128))); +typedef unsigned short v32hi __attribute__((vector_size(64))); +typedef _Float16 v64hf __attribute__((vector_size(128))); +typedef _Float16 v32hf __attribute__((vector_size(64))); +typedef __bf16 v64bf __attribute__((vector_size(128))); +typedef __bf16 v32bf __attribute__((vector_size(64))); +typedef unsigned int v32si __attribute__((vector_size(128))); +typedef float v32sf __attribute__((vector_size(128))); + +#define PERM0(B, C) B, B + C +#define PERM1(B, C) PERM0 (B, C), PERM0 (B + 1, C) +#define PERM2(B, C) PERM1 (B, C), PERM1 (B + 2, C) +#define PERM3(B, C) PERM2 (B, C), PERM2 (B + 4, C) +#define PERM4(B, C) PERM3 (B, C), PERM3 (B + 8, C) +#define PERM5(B, C) PERM4 (B, C), PERM4 (B + 16, C) +#define PERM6(B, C) PERM5 (B, C), PERM5 (B + 32, C) + +/* +** qi_zip1_h_a: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** zip1 (z[0-9]+)\.h, \2\.h, \2\.h +** st1b \3\.h, \1, \[x8\] +** ret +*/ +v128qi +qi_zip1_h_a (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (0, 0) }); +} + +/* +** qi_zip1_h_b: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** zip1 (z[0-9]+)\.h, \2\.h, \2\.h +** st1b \3\.h, \1, \[x8\] +** ret +*/ +v128qi +qi_zip1_h_b (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (0, 128) }); +} + +/* +** qi_zip1_h_c: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** zip1 (z[0-9]+)\.h, \2\.h, \2\.h +** st1b \3\.h, \1, \[x8\] +** ret +*/ +v128qi +qi_zip1_h_c (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (128, 0) }); +} + +/* +** qi_zip1_h_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1b (z[0-9]+)\.h, \1/z, \[x1\] +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** zip1 \3\.h, \3\.h, \2\.h +** st1b \3\.h, \1, \[x8\] +** | +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** ld1b (z[0-9]+)\.h, \1/z, \[x1\] +** zip1 \4\.h, \4\.h, \5\.h +** st1b \4\.h, \1, \[x8\] +** ) +** ret +*/ +v128qi +qi_zip1_h_two_op (v128qi x, v128qi y) +{ + return __builtin_shuffle (x, y, (v128qi) { PERM6 (0, 128) }); +} + +/* +** qi_zip1_s: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** zip1 
(z[0-9]+)\.s, \2\.s, \2\.s +** st1b \3\.s, \1, \[x8\] +** ret +*/ +v64qi +qi_zip1_s (v64qi x) +{ + return __builtin_shuffle (x, x, (v64qi) { PERM5 (0, 64) }); +} + +/* +** qi_zip1_s_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1b (z[0-9]+)\.s, \1/z, \[x1\] +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** zip1 \3\.s, \3\.s, \2\.s +** st1b \3\.s, \1, \[x8\] +** | +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** ld1b (z[0-9]+)\.s, \1/z, \[x1\] +** zip1 \4\.s, \4\.s, \5\.s +** st1b \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64qi +qi_zip1_s_two_op (v64qi x, v64qi y) +{ + return __builtin_shuffle (x, y, (v64qi) { PERM5 (0, 64) }); +} + +/* +** qi_zip1_d: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.d, \1/z, \[x0\] +** zip1 (z[0-9]+)\.d, \2\.d, \2\.d +** st1b \3\.d, \1, \[x8\] +** ret +*/ +v32qi +qi_zip1_d (v32qi x) +{ + return __builtin_shuffle (x, x, (v32qi) { PERM4 (0, 32) }); +} + +/* +** qi_zip1_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1b (z[0-9]+)\.d, \1/z, \[x1\] +** ld1b (z[0-9]+)\.d, \1/z, \[x0\] +** zip1 \3\.d, \3\.d, \2\.d +** st1b \3\.d, \1, \[x8\] +** | +** ld1b (z[0-9]+)\.d, \1/z, \[x0\] +** ld1b (z[0-9]+)\.d, \1/z, \[x1\] +** zip1 \4\.d, \4\.d, \5\.d +** st1b \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32qi +qi_zip1_d_two_op (v32qi x, v32qi y) +{ + return __builtin_shuffle (x, y, (v32qi) { PERM4 (0, 32) }); +} + +/* +** hi_zip1_s: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** zip1 (z[0-9]+)\.s, \2\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64hi +hi_zip1_s (v64hi x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (0, 64) }); +} + +/* +** hi_zip1_s_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** zip1 \3\.s, \3\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** zip1 \4\.s, \4\.s, \5\.s +** st1h \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64hi +hi_zip1_s_two_op (v64hi x, v64hi y) +{ + return __builtin_shuffle (x, y, (v64hi) { PERM5 (0, 64) }); +} + +/* +** hf_zip1_s: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** zip1 (z[0-9]+)\.s, \2\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64hf +hf_zip1_s (v64hf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (0, 64) }); +} + +/* +** hf_zip1_s_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** zip1 \3\.s, \3\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** zip1 \4\.s, \4\.s, \5\.s +** st1h \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64hf +hf_zip1_s_two_op (v64hf x, v64hf y) +{ + return __builtin_shuffle (x, y, (v64hi) { PERM5 (0, 64) }); +} + +/* +** bf_zip1_s: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** zip1 (z[0-9]+)\.s, \2\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64bf +bf_zip1_s (v64bf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (0, 64) }); +} + +/* +** bf_zip1_s_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** zip1 \3\.s, \3\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** zip1 \4\.s, \4\.s, \5\.s +** st1h \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64bf +bf_zip1_s_two_op (v64bf x, v64bf y) +{ + return __builtin_shuffle (x, y, (v64hi) { PERM5 (0, 64) }); +} + +/* +** hi_zip1_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** 
zip1 (z[0-9]+)\.d, \2\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32hi +hi_zip1_d (v32hi x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (0, 32) }); +} + +/* +** hi_zip1_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** zip1 \3\.d, \3\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** zip1 \4\.d, \4\.d, \5\.d +** st1h \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32hi +hi_zip1_d_two_op (v32hi x, v32hi y) +{ + return __builtin_shuffle (x, y, (v32hi) { PERM4 (0, 32) }); +} + +/* +** hf_zip1_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** zip1 (z[0-9]+)\.d, \2\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32hf +hf_zip1_d (v32hf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (0, 32) }); +} + +/* +** hf_zip1_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** zip1 \3\.d, \3\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** zip1 \4\.d, \4\.d, \5\.d +** st1h \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32hf +hf_zip1_d_two_op (v32hf x, v32hf y) +{ + return __builtin_shuffle (x, y, (v32hi) { PERM4 (0, 32) }); +} + +/* +** bf_zip1_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** zip1 (z[0-9]+)\.d, \2\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32bf +bf_zip1_d (v32bf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (0, 32) }); +} + +/* +** bf_zip1_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** zip1 \3\.d, \3\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** zip1 \4\.d, \4\.d, \5\.d +** st1h \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32bf +bf_zip1_d_two_op (v32bf x, v32bf y) +{ + return __builtin_shuffle (x, y, (v32hi) { PERM4 (0, 32) }); +} + +/* +** si_zip1_d: +** ptrue (p[0-7])\.b, vl256 +** ld1w (z[0-9]+)\.d, \1/z, \[x0\] +** zip1 (z[0-9]+)\.d, \2\.d, \2\.d +** st1w \3\.d, \1, \[x8\] +** ret +*/ +v32si +si_zip1_d (v32si x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (0, 32) }); +} + +/* +** sf_zip1_d: +** ptrue (p[0-7])\.b, vl256 +** ld1w (z[0-9]+)\.d, \1/z, \[x0\] +** zip1 (z[0-9]+)\.d, \2\.d, \2\.d +** st1w \3\.d, \1, \[x8\] +** ret +*/ +v32sf +sf_zip1_d (v32sf x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (0, 32) }); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/zip2_2.c b/gcc/testsuite/gcc.target/aarch64/sve/zip2_2.c new file mode 100644 index 0000000000000000000000000000000000000000..9158ace156447314530af8cb358c92431d00a815 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/zip2_2.c @@ -0,0 +1,403 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +typedef unsigned char v128qi __attribute__((vector_size(128))); +typedef unsigned char v64qi __attribute__((vector_size(64))); +typedef unsigned char v32qi __attribute__((vector_size(32))); +typedef unsigned short v64hi __attribute__((vector_size(128))); +typedef unsigned short v32hi __attribute__((vector_size(64))); +typedef _Float16 v64hf __attribute__((vector_size(128))); +typedef _Float16 v32hf __attribute__((vector_size(64))); +typedef __bf16 v64bf __attribute__((vector_size(128))); +typedef 
__bf16 v32bf __attribute__((vector_size(64))); +typedef unsigned int v32si __attribute__((vector_size(128))); +typedef float v32sf __attribute__((vector_size(128))); + +#define PERM0(B, C) B, B + C +#define PERM1(B, C) PERM0 (B, C), PERM0 (B + 1, C) +#define PERM2(B, C) PERM1 (B, C), PERM1 (B + 2, C) +#define PERM3(B, C) PERM2 (B, C), PERM2 (B + 4, C) +#define PERM4(B, C) PERM3 (B, C), PERM3 (B + 8, C) +#define PERM5(B, C) PERM4 (B, C), PERM4 (B + 16, C) +#define PERM6(B, C) PERM5 (B, C), PERM5 (B + 32, C) + +/* +** qi_zip2_h_a: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** zip2 (z[0-9]+)\.h, \2\.h, \2\.h +** st1b \3\.h, \1, \[x8\] +** ret +*/ +v128qi +qi_zip2_h_a (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (64, 128) }); +} + +/* +** qi_zip2_h_b: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** zip2 (z[0-9]+)\.h, \2\.h, \2\.h +** st1b \3\.h, \1, \[x8\] +** ret +*/ +v128qi +qi_zip2_h_b (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (64, 128) }); +} + +/* +** qi_zip2_h_c: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** zip2 (z[0-9]+)\.h, \2\.h, \2\.h +** st1b \3\.h, \1, \[x8\] +** ret +*/ +v128qi +qi_zip2_h_c (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (192, 0) }); +} + +/* +** qi_zip2_h_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1b (z[0-9]+)\.h, \1/z, \[x1\] +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** zip2 \3\.h, \3\.h, \2\.h +** st1b \3\.h, \1, \[x8\] +** | +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** ld1b (z[0-9]+)\.h, \1/z, \[x1\] +** zip2 \4\.h, \4\.h, \5\.h +** st1b \4\.h, \1, \[x8\] +** ) +** ret +*/ +v128qi +qi_zip2_h_two_op (v128qi x, v128qi y) +{ + return __builtin_shuffle (x, y, (v128qi) { PERM6 (64, 128) }); +} + +/* +** qi_zip2_s: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** zip2 (z[0-9]+)\.s, \2\.s, \2\.s +** st1b \3\.s, \1, \[x8\] +** ret +*/ +v64qi +qi_zip2_s (v64qi x) +{ + return __builtin_shuffle (x, x, (v64qi) { PERM5 (32, 64) }); +} + +/* +** qi_zip2_s_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1b (z[0-9]+)\.s, \1/z, \[x1\] +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** zip2 \3\.s, \3\.s, \2\.s +** st1b \3\.s, \1, \[x8\] +** | +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** ld1b (z[0-9]+)\.s, \1/z, \[x1\] +** zip2 \4\.s, \4\.s, \5\.s +** st1b \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64qi +qi_zip2_s_two_op (v64qi x, v64qi y) +{ + return __builtin_shuffle (x, y, (v64qi) { PERM5 (32, 64) }); +} + +/* +** qi_zip2_d: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.d, \1/z, \[x0\] +** zip2 (z[0-9]+)\.d, \2\.d, \2\.d +** st1b \3\.d, \1, \[x8\] +** ret +*/ +v32qi +qi_zip2_d (v32qi x) +{ + return __builtin_shuffle (x, x, (v32qi) { PERM4 (16, 32) }); +} + +/* +** qi_zip2_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1b (z[0-9]+)\.d, \1/z, \[x1\] +** ld1b (z[0-9]+)\.d, \1/z, \[x0\] +** zip2 \3\.d, \3\.d, \2\.d +** st1b \3\.d, \1, \[x8\] +** | +** ld1b (z[0-9]+)\.d, \1/z, \[x0\] +** ld1b (z[0-9]+)\.d, \1/z, \[x1\] +** zip2 \4\.d, \4\.d, \5\.d +** st1b \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32qi +qi_zip2_d_two_op (v32qi x, v32qi y) +{ + return __builtin_shuffle (x, y, (v32qi) { PERM4 (16, 32) }); +} + +/* +** hi_zip2_s: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** zip2 (z[0-9]+)\.s, \2\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64hi +hi_zip2_s (v64hi x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (32, 64) }); +} + +/* +** hi_zip2_s_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] 
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** zip2 \3\.s, \3\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** zip2 \4\.s, \4\.s, \5\.s +** st1h \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64hi +hi_zip2_s_two_op (v64hi x, v64hi y) +{ + return __builtin_shuffle (x, y, (v64hi) { PERM5 (32, 64) }); +} + +/* +** hf_zip2_s: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** zip2 (z[0-9]+)\.s, \2\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64hf +hf_zip2_s (v64hf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (32, 64) }); +} + +/* +** hf_zip2_s_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** zip2 \3\.s, \3\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** zip2 \4\.s, \4\.s, \5\.s +** st1h \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64hf +hf_zip2_s_two_op (v64hf x, v64hf y) +{ + return __builtin_shuffle (x, y, (v64hi) { PERM5 (32, 64) }); +} + +/* +** bf_zip2_s: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** zip2 (z[0-9]+)\.s, \2\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64bf +bf_zip2_s (v64bf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (32, 64) }); +} + +/* +** bf_zip2_s_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** zip2 \3\.s, \3\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** zip2 \4\.s, \4\.s, \5\.s +** st1h \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64bf +bf_zip2_s_two_op (v64bf x, v64bf y) +{ + return __builtin_shuffle (x, y, (v64hi) { PERM5 (32, 64) }); +} + +/* +** hi_zip2_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** zip2 (z[0-9]+)\.d, \2\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32hi +hi_zip2_d (v32hi x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (16, 32) }); +} + +/* +** hi_zip2_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** zip2 \3\.d, \3\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** zip2 \4\.d, \4\.d, \5\.d +** st1h \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32hi +hi_zip2_d_two_op (v32hi x, v32hi y) +{ + return __builtin_shuffle (x, y, (v32hi) { PERM4 (16, 32) }); +} + +/* +** hf_zip2_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** zip2 (z[0-9]+)\.d, \2\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32hf +hf_zip2_d (v32hf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (16, 32) }); +} + +/* +** hf_zip2_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** zip2 \3\.d, \3\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** zip2 \4\.d, \4\.d, \5\.d +** st1h \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32hf +hf_zip2_d_two_op (v32hf x, v32hf y) +{ + return __builtin_shuffle (x, y, (v32hi) { PERM4 (16, 32) }); +} + +/* +** bf_zip2_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** zip2 (z[0-9]+)\.d, \2\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32bf +bf_zip2_d (v32bf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (16, 32) }); +} + +/* +** bf_zip2_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h 
(z[0-9]+)\.d, \1/z, \[x1\] +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** zip2 \3\.d, \3\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** zip2 \4\.d, \4\.d, \5\.d +** st1h \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32bf +bf_zip2_d_two_op (v32bf x, v32bf y) +{ + return __builtin_shuffle (x, y, (v32hi) { PERM4 (16, 32) }); +} + +/* +** si_zip2_d: +** ptrue (p[0-7])\.b, vl256 +** ld1w (z[0-9]+)\.d, \1/z, \[x0\] +** zip2 (z[0-9]+)\.d, \2\.d, \2\.d +** st1w \3\.d, \1, \[x8\] +** ret +*/ +v32si +si_zip2_d (v32si x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (16, 32) }); +} + +/* +** sf_zip2_d: +** ptrue (p[0-7])\.b, vl256 +** ld1w (z[0-9]+)\.d, \1/z, \[x0\] +** zip2 (z[0-9]+)\.d, \2\.d, \2\.d +** st1w \3\.d, \1, \[x8\] +** ret +*/ +v32sf +sf_zip2_d (v32sf x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (16, 32) }); +}
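
For reviewers unfamiliar with the index-vector trick used throughout these tests, below is a minimal host-side sketch (not part of the patch, and independent of SVE) of how the recursive PERM macros in zip1_2.c/zip2_2.c build the __builtin_shuffle selectors.  It reuses the same two-argument macro scheme at a reduced depth (PERM3, i.e. 16 result elements instead of 128) and checks the expansion against the usual definition of ZIP1/ZIP2, which interleave the low or high halves of the two concatenated inputs.  The 16-element width and the expectation formulas are illustrative choices only.

/* Host-side sketch only: expand the zip-style PERM macros at depth 3 and
   print the selector indices they produce, together with the indices that
   the ZIP1/ZIP2 definition would pick (X supplies indices 0-15 and Y
   indices 16-31, as __builtin_shuffle sees the two operands).  */

#include <stdio.h>

#define PERM0(B, C) B, B + C
#define PERM1(B, C) PERM0 (B, C), PERM0 (B + 1, C)
#define PERM2(B, C) PERM1 (B, C), PERM1 (B + 2, C)
#define PERM3(B, C) PERM2 (B, C), PERM2 (B + 4, C)

int
main (void)
{
  /* 16 result elements for two 16-element inputs X and Y.  */
  int zip1[] = { PERM3 (0, 16) };	/* x[0], y[0], x[1], y[1], ...  */
  int zip2[] = { PERM3 (8, 16) };	/* x[8], y[8], x[9], y[9], ...  */
  int n = sizeof (zip1) / sizeof (zip1[0]);

  for (int i = 0; i < n; i++)
    {
      /* ZIP1 interleaves the low halves, ZIP2 the high halves.  */
      int expect1 = (i & 1 ? 16 : 0) + i / 2;
      int expect2 = (i & 1 ? 16 : 0) + 8 + i / 2;
      printf ("%2d: zip1 %2d (want %2d)  zip2 %2d (want %2d)\n",
	      i, zip1[i], expect1, zip2[i], expect2);
    }
  return 0;
}

The one-operand variants in the tests (for example qi_zip1_h_a and qi_zip1_h_c) encode the same permutation with indices drawn entirely from the first or entirely from the second copy of the input, which is why they are expected to match the same single zip1 instruction as the two-operand form.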