diff --git a/gcc/config/aarch64/aarch64-modes.def b/gcc/config/aarch64/aarch64-modes.def index af972e8f72b0d5568afdc26e9d18dc2f747f107e..f304992e3edd47b5e451d2926766cf1298f55d23 100644 --- a/gcc/config/aarch64/aarch64-modes.def +++ b/gcc/config/aarch64/aarch64-modes.def @@ -136,11 +136,13 @@ ADJUST_NUNITS (VNx2QI, aarch64_sve_vg); ADJUST_NUNITS (VNx2HI, aarch64_sve_vg); ADJUST_NUNITS (VNx2SI, aarch64_sve_vg); ADJUST_NUNITS (VNx2HF, aarch64_sve_vg); +ADJUST_NUNITS (VNx2BF, aarch64_sve_vg); ADJUST_NUNITS (VNx2SF, aarch64_sve_vg); ADJUST_NUNITS (VNx4QI, aarch64_sve_vg * 2); ADJUST_NUNITS (VNx4HI, aarch64_sve_vg * 2); ADJUST_NUNITS (VNx4HF, aarch64_sve_vg * 2); +ADJUST_NUNITS (VNx4BF, aarch64_sve_vg * 2); ADJUST_NUNITS (VNx8QI, aarch64_sve_vg * 4); @@ -151,7 +153,9 @@ ADJUST_ALIGNMENT (VNx8QI, 1); ADJUST_ALIGNMENT (VNx2HI, 2); ADJUST_ALIGNMENT (VNx4HI, 2); ADJUST_ALIGNMENT (VNx2HF, 2); +ADJUST_ALIGNMENT (VNx2BF, 2); ADJUST_ALIGNMENT (VNx4HF, 2); +ADJUST_ALIGNMENT (VNx4BF, 2); ADJUST_ALIGNMENT (VNx2SI, 4); ADJUST_ALIGNMENT (VNx2SF, 4); diff --git a/gcc/config/aarch64/aarch64-sve.md b/gcc/config/aarch64/aarch64-sve.md index 31a8c5a5aefc24b36c5115157cde0482b7a7927b..4b0a1ebe9e1dd8bcbf683c5c136d9458b61dd943 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -3009,6 +3009,22 @@ "<sve_int_op>\t%0.<Vetype>, %1/m, %2.<Vetype>" ) +;; Another way of expressing the REVB, REVH and REVW patterns, with this +;; form being easier for permutes. The predicate mode determines the number +;; of lanes and the data mode decides the granularity of the reversal within +;; each lane. +(define_insn "@aarch64_sve_revbhw_<SVE_ALL:mode><PRED_HSD:mode>" + [(set (match_operand:SVE_ALL 0 "register_operand" "=w") + (unspec:SVE_ALL + [(match_operand:PRED_HSD 1 "register_operand" "Upl") + (unspec:SVE_ALL + [(match_operand:SVE_ALL 2 "register_operand" "w")] + UNSPEC_REVBHW)] + UNSPEC_PRED_X))] + "TARGET_SVE && <PRED_HSD:elem_bits> > <SVE_ALL:container_bits>" + "rev<SVE_ALL:Vcwtype>\t%0.<PRED_HSD:Vetype>, %1/m, %2.<PRED_HSD:Vetype>" +) + ;; Predicated integer unary operations with merging. (define_insn "@cond_<optab><mode>" [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w, ?&w") @@ -8273,14 +8289,14 @@ ;; Duplicate one element of a vector. (define_insn "@aarch64_sve_dup_lane<mode>" - [(set (match_operand:SVE_FULL 0 "register_operand" "=w") - (vec_duplicate:SVE_FULL + [(set (match_operand:SVE_ALL 0 "register_operand" "=w") + (vec_duplicate:SVE_ALL (vec_select:<VEL> - (match_operand:SVE_FULL 1 "register_operand" "w") + (match_operand:SVE_ALL 1 "register_operand" "w") (parallel [(match_operand:SI 2 "const_int_operand")]))))] "TARGET_SVE - && IN_RANGE (INTVAL (operands[2]) * GET_MODE_SIZE (<VEL>mode), 0, 63)" - "dup\t%0.<Vetype>, %1.<Vetype>[%2]" + && IN_RANGE (INTVAL (operands[2]) * <container_bits> / 8, 0, 63)" + "dup\t%0.<Vctype>, %1.<Vctype>[%2]" ) ;; Use DUP.Q to duplicate a 128-bit segment of a register. @@ -8321,17 +8337,18 @@ ;; Reverse the order of elements within a full vector. 
(define_insn "@aarch64_sve_rev<mode>" - [(set (match_operand:SVE_FULL 0 "register_operand" "=w") - (unspec:SVE_FULL - [(match_operand:SVE_FULL 1 "register_operand" "w")] + [(set (match_operand:SVE_ALL 0 "register_operand" "=w") + (unspec:SVE_ALL + [(match_operand:SVE_ALL 1 "register_operand" "w")] UNSPEC_REV))] "TARGET_SVE" - "rev\t%0.<Vetype>, %1.<Vetype>") + "rev\t%0.<Vctype>, %1.<Vctype>") ;; ------------------------------------------------------------------------- ;; ---- [INT,FP] Special-purpose binary permutes ;; ------------------------------------------------------------------------- ;; Includes: +;; - EXT ;; - SPLICE ;; - TRN1 ;; - TRN2 @@ -8359,13 +8376,13 @@ ;; Permutes that take half the elements from one vector and half the ;; elements from the other. (define_insn "@aarch64_sve_<perm_insn><mode>" - [(set (match_operand:SVE_FULL 0 "register_operand" "=w") - (unspec:SVE_FULL - [(match_operand:SVE_FULL 1 "register_operand" "w") - (match_operand:SVE_FULL 2 "register_operand" "w")] + [(set (match_operand:SVE_ALL 0 "register_operand" "=w") + (unspec:SVE_ALL + [(match_operand:SVE_ALL 1 "register_operand" "w") + (match_operand:SVE_ALL 2 "register_operand" "w")] PERMUTE))] "TARGET_SVE" - "<perm_insn>\t%0.<Vetype>, %1.<Vetype>, %2.<Vetype>" + "<perm_insn>\t%0.<Vctype>, %1.<Vctype>, %2.<Vctype>" ) ;; Apply PERMUTE to 128-bit sequences. The behavior of these patterns @@ -8383,16 +8400,16 @@ ;; Concatenate two vectors and extract a subvector. Note that the ;; immediate (third) operand is the lane index not the byte index. (define_insn "@aarch64_sve_ext<mode>" - [(set (match_operand:SVE_FULL 0 "register_operand" "=w, ?&w") - (unspec:SVE_FULL - [(match_operand:SVE_FULL 1 "register_operand" "0, w") - (match_operand:SVE_FULL 2 "register_operand" "w, w") + [(set (match_operand:SVE_ALL 0 "register_operand" "=w, ?&w") + (unspec:SVE_ALL + [(match_operand:SVE_ALL 1 "register_operand" "0, w") + (match_operand:SVE_ALL 2 "register_operand" "w, w") (match_operand:SI 3 "const_int_operand")] UNSPEC_EXT))] "TARGET_SVE - && IN_RANGE (INTVAL (operands[3]) * GET_MODE_SIZE (<VEL>mode), 0, 255)" + && IN_RANGE (INTVAL (operands[3]) * <container_bits> / 8, 0, 255)" { - operands[3] = GEN_INT (INTVAL (operands[3]) * GET_MODE_SIZE (<VEL>mode)); + operands[3] = GEN_INT (INTVAL (operands[3]) * <container_bits> / 8); return (which_alternative == 0 ? "ext\\t%0.b, %0.b, %2.b, #%3" : "movprfx\t%0, %1\;ext\\t%0.b, %0.b, %2.b, #%3"); diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c index 0ae6c8b53f6e7ae629bafc2ec033a440012cbe42..97cb68980e975dfb2c0c0c0a05f9153beb64a2ad 100644 --- a/gcc/config/aarch64/aarch64.c +++ b/gcc/config/aarch64/aarch64.c @@ -2226,6 +2226,9 @@ aarch64_classify_vector_mode (machine_mode mode) /* Partial SVE HF vectors. */ case E_VNx2HFmode: case E_VNx4HFmode: + /* Partial SVE BF vectors. */ + case E_VNx2BFmode: + case E_VNx4BFmode: /* Partial SVE SF vector. */ case E_VNx2SFmode: return TARGET_SVE ? 
VEC_SVE_DATA | VEC_PARTIAL : 0; @@ -20468,18 +20471,21 @@ aarch64_evpc_rev_local (struct expand_vec_perm_d *d) || !diff) return false; - size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode); - if (size == 8) + if (d->vec_flags & VEC_SVE_DATA) + size = (diff + 1) * aarch64_sve_container_bits (d->vmode); + else + size = (diff + 1) * GET_MODE_UNIT_BITSIZE (d->vmode); + if (size == 64) { unspec = UNSPEC_REV64; pred_mode = VNx2BImode; } - else if (size == 4) + else if (size == 32) { unspec = UNSPEC_REV32; pred_mode = VNx4BImode; } - else if (size == 2) + else if (size == 16) { unspec = UNSPEC_REV16; pred_mode = VNx8BImode; @@ -20496,28 +20502,11 @@ aarch64_evpc_rev_local (struct expand_vec_perm_d *d) if (d->testing_p) return true; - if (d->vec_flags == VEC_SVE_DATA) - { - machine_mode int_mode = aarch64_sve_int_mode (pred_mode); - rtx target = gen_reg_rtx (int_mode); - if (BYTES_BIG_ENDIAN) - /* The act of taking a subreg between INT_MODE and d->vmode - is itself a reversing operation on big-endian targets; - see the comment at the head of aarch64-sve.md for details. - First reinterpret OP0 as INT_MODE without using a subreg - and without changing the contents. */ - emit_insn (gen_aarch64_sve_reinterpret (int_mode, target, d->op0)); - else - { - /* For SVE we use REV[BHW] unspecs derived from the element size - of v->mode and vector modes whose elements have SIZE bytes. - This ensures that the vector modes match the predicate modes. */ - int unspec = aarch64_sve_rev_unspec (d->vmode); - rtx pred = aarch64_ptrue_reg (pred_mode); - emit_insn (gen_aarch64_pred (unspec, int_mode, target, pred, - gen_lowpart (int_mode, d->op0))); - } - emit_move_insn (d->target, gen_lowpart (d->vmode, target)); + if (d->vec_flags & VEC_SVE_DATA) + { + rtx pred = aarch64_ptrue_reg (pred_mode); + emit_insn (gen_aarch64_sve_revbhw (d->vmode, pred_mode, + d->target, pred, d->op0)); return true; } rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec); @@ -20562,7 +20551,8 @@ aarch64_evpc_dup (struct expand_vec_perm_d *d) || !d->perm[0].is_constant (&elt)) return false; - if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode)) + if ((d->vec_flags & VEC_SVE_DATA) + && elt * (aarch64_sve_container_bits (vmode) / 8) >= 64) return false; /* Success! */ @@ -20782,6 +20772,7 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) if ((d->vec_flags == VEC_ADVSIMD || d->vec_flags == VEC_SVE_DATA + || d->vec_flags == (VEC_SVE_DATA | VEC_PARTIAL) || d->vec_flags == VEC_SVE_PRED) && known_gt (nelt, 1)) { diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index 054fd8515c6ebf136da699e2993f6ebb348c3b1a..fb1426b7752890848cb49722ef7442d96cb1408b 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -400,7 +400,7 @@ (define_mode_iterator SVE_ALL [VNx16QI VNx8QI VNx4QI VNx2QI VNx8HI VNx4HI VNx2HI VNx8HF VNx4HF VNx2HF - VNx8BF + VNx8BF VNx4BF VNx2BF VNx4SI VNx2SI VNx4SF VNx2SF VNx2DI @@ -418,11 +418,13 @@ VNx2DI]) ;; SVE modes with 2 or 4 elements. -(define_mode_iterator SVE_24 [VNx2QI VNx2HI VNx2HF VNx2SI VNx2SF VNx2DI VNx2DF - VNx4QI VNx4HI VNx4HF VNx4SI VNx4SF]) +(define_mode_iterator SVE_24 [VNx2QI VNx2HI VNx2HF VNx2BF VNx2SI VNx2SF + VNx2DI VNx2DF + VNx4QI VNx4HI VNx4HF VNx4BF VNx4SI VNx4SF]) ;; SVE modes with 2 elements. 
-(define_mode_iterator SVE_2 [VNx2QI VNx2HI VNx2HF VNx2SI VNx2SF VNx2DI VNx2DF]) +(define_mode_iterator SVE_2 [VNx2QI VNx2HI VNx2HF VNx2BF + VNx2SI VNx2SF VNx2DI VNx2DF]) ;; SVE integer modes with 2 elements, excluding the widest element. (define_mode_iterator SVE_2BHSI [VNx2QI VNx2HI VNx2SI]) @@ -431,7 +433,7 @@ (define_mode_iterator SVE_2HSDI [VNx2HI VNx2SI VNx2DI]) ;; SVE modes with 4 elements. -(define_mode_iterator SVE_4 [VNx4QI VNx4HI VNx4HF VNx4SI VNx4SF]) +(define_mode_iterator SVE_4 [VNx4QI VNx4HI VNx4HF VNx4BF VNx4SI VNx4SF]) ;; SVE integer modes with 4 elements, excluding the widest element. (define_mode_iterator SVE_4BHI [VNx4QI VNx4HI]) @@ -621,6 +623,7 @@ UNSPEC_REVB ; Used in aarch64-sve.md. UNSPEC_REVH ; Used in aarch64-sve.md. UNSPEC_REVW ; Used in aarch64-sve.md. + UNSPEC_REVBHW ; Used in aarch64-sve.md. UNSPEC_SMUL_HIGHPART ; Used in aarch64-sve.md. UNSPEC_UMUL_HIGHPART ; Used in aarch64-sve.md. UNSPEC_FMLA ; Used in aarch64-sve.md. @@ -968,6 +971,16 @@ (VNx4SI "32") (VNx2DI "64") (VNx8HF "16") (VNx4SF "32") (VNx2DF "64")]) +;; The number of bits in a vector container. +(define_mode_attr container_bits [(VNx16QI "8") + (VNx8HI "16") (VNx8QI "16") (VNx8HF "16") + (VNx8BF "16") + (VNx4SI "32") (VNx4HI "32") (VNx4QI "32") + (VNx4SF "32") (VNx4HF "32") (VNx4BF "32") + (VNx2DI "64") (VNx2SI "64") (VNx2HI "64") + (VNx2QI "64") (VNx2DF "64") (VNx2SF "64") + (VNx2HF "64") (VNx2BF "64")]) + ;; Attribute to describe constants acceptable in logical operations (define_mode_attr lconst [(SI "K") (DI "L")]) @@ -1029,7 +1042,7 @@ (VNx16QI "b") (VNx8QI "b") (VNx4QI "b") (VNx2QI "b") (VNx8HI "h") (VNx4HI "h") (VNx2HI "h") (VNx8HF "h") (VNx4HF "h") (VNx2HF "h") - (VNx8BF "h") + (VNx8BF "h") (VNx4BF "h") (VNx2BF "h") (VNx4SI "s") (VNx2SI "s") (VNx4SF "s") (VNx2SF "s") (VNx2DI "d") @@ -1047,7 +1060,7 @@ (define_mode_attr Vesize [(VNx16QI "b") (VNx8QI "b") (VNx4QI "b") (VNx2QI "b") (VNx8HI "h") (VNx4HI "h") (VNx2HI "h") (VNx8HF "h") (VNx4HF "h") (VNx2HF "h") - (VNx8BF "h") + (VNx8BF "h") (VNx4BF "h") (VNx2BF "h") (VNx4SI "w") (VNx2SI "w") (VNx4SF "w") (VNx2SF "w") (VNx2DI "d") @@ -1066,12 +1079,23 @@ (define_mode_attr Vctype [(VNx16QI "b") (VNx8QI "h") (VNx4QI "s") (VNx2QI "d") (VNx8HI "h") (VNx4HI "s") (VNx2HI "d") (VNx8HF "h") (VNx4HF "s") (VNx2HF "d") - (VNx8BF "h") + (VNx8BF "h") (VNx4BF "s") (VNx2BF "d") (VNx4SI "s") (VNx2SI "d") (VNx4SF "s") (VNx2SF "d") (VNx2DI "d") (VNx2DF "d")]) +;; The instruction mnemonic suffix for an SVE mode's element container, +;; i.e. the Vewtype of full SVE modes that have the same number of elements. +(define_mode_attr Vcwtype [(VNx16QI "b") (VNx8QI "h") (VNx4QI "w") (VNx2QI "d") + (VNx8HI "h") (VNx4HI "w") (VNx2HI "d") + (VNx8HF "h") (VNx4HF "w") (VNx2HF "d") + (VNx8BF "h") (VNx4BF "w") (VNx2BF "d") + (VNx4SI "w") (VNx2SI "d") + (VNx4SF "w") (VNx2SF "d") + (VNx2DI "d") + (VNx2DF "d")]) + ;; Vetype is used everywhere in scheduling type and assembly output, ;; sometimes they are not the same, for example HF modes on some ;; instructions. 
stype is defined to represent scheduling type @@ -1107,7 +1131,7 @@ (VNx16QI "QI") (VNx8QI "QI") (VNx4QI "QI") (VNx2QI "QI") (VNx8HI "HI") (VNx4HI "HI") (VNx2HI "HI") (VNx8HF "HF") (VNx4HF "HF") (VNx2HF "HF") - (VNx8BF "BF") + (VNx8BF "BF") (VNx4BF "BF") (VNx2BF "BF") (VNx4SI "SI") (VNx2SI "SI") (VNx4SF "SF") (VNx2SF "SF") (VNx2DI "DI") @@ -1127,7 +1151,7 @@ (VNx16QI "qi") (VNx8QI "qi") (VNx4QI "qi") (VNx2QI "qi") (VNx8HI "hi") (VNx4HI "hi") (VNx2HI "hi") (VNx8HF "hf") (VNx4HF "hf") (VNx2HF "hf") - (VNx8BF "bf") + (VNx8BF "bf") (VNx4BF "bf") (VNx2BF "bf") (VNx4SI "si") (VNx2SI "si") (VNx4SF "sf") (VNx2SF "sf") (VNx2DI "di") @@ -1310,7 +1334,7 @@ (VNx16QI "w") (VNx8QI "w") (VNx4QI "w") (VNx2QI "w") (VNx8HI "w") (VNx4HI "w") (VNx2HI "w") (VNx8HF "w") (VNx4HF "w") (VNx2HF "w") - (VNx8BF "w") + (VNx8BF "w") (VNx4BF "w") (VNx2BF "w") (VNx4SI "w") (VNx2SI "w") (VNx4SF "w") (VNx2SF "w") (VNx2DI "x") @@ -1380,6 +1404,8 @@ (VNx2DI "VNx2DI") (VNx8HF "VNx8HI") (VNx4HF "VNx4SI") (VNx2HF "VNx2DI") + (VNx8BF "VNx8HI") (VNx4BF "VNx4SI") + (VNx2BF "VNx2DI") (VNx4SF "VNx4SI") (VNx2SF "VNx2DI") (VNx2DF "VNx2DI")]) @@ -1392,6 +1418,8 @@ (VNx2DI "vnx2di") (VNx8HF "vnx8hi") (VNx4HF "vnx4si") (VNx2HF "vnx2di") + (VNx8BF "vnx8hi") (VNx4BF "vnx4si") + (VNx2BF "vnx2di") (VNx4SF "vnx4si") (VNx2SF "vnx2di") (VNx2DF "vnx2di")]) @@ -1617,7 +1645,7 @@ (VNx4QI "VNx4BI") (VNx2QI "VNx2BI") (VNx8HI "VNx8BI") (VNx4HI "VNx4BI") (VNx2HI "VNx2BI") (VNx8HF "VNx8BI") (VNx4HF "VNx4BI") (VNx2HF "VNx2BI") - (VNx8BF "VNx8BI") + (VNx8BF "VNx8BI") (VNx4BF "VNx4BI") (VNx2BF "VNx2BI") (VNx4SI "VNx4BI") (VNx2SI "VNx2BI") (VNx4SF "VNx4BI") (VNx2SF "VNx2BI") (VNx2DI "VNx2BI") @@ -1643,7 +1671,7 @@ (VNx4QI "vnx4bi") (VNx2QI "vnx2bi") (VNx8HI "vnx8bi") (VNx4HI "vnx4bi") (VNx2HI "vnx2bi") (VNx8HF "vnx8bi") (VNx4HF "vnx4bi") (VNx2HF "vnx2bi") - (VNx8BF "vnx8bi") + (VNx8BF "vnx8bi") (VNx4BF "vnx4bi") (VNx2BF "vnx2bi") (VNx4SI "vnx4bi") (VNx2SI "vnx2bi") (VNx4SF "vnx4bi") (VNx2SF "vnx2bi") (VNx2DI "vnx2bi") diff --git a/gcc/testsuite/gcc.target/aarch64/sve/dup_lane_2.c b/gcc/testsuite/gcc.target/aarch64/sve/dup_lane_2.c new file mode 100644 index 0000000000000000000000000000000000000000..3d74ff98e6d61789df05827eef89628730b898e3 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/dup_lane_2.c @@ -0,0 +1,331 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +typedef unsigned char v128qi __attribute__((vector_size(128))); +typedef unsigned char v64qi __attribute__((vector_size(64))); +typedef unsigned char v32qi __attribute__((vector_size(32))); +typedef unsigned short v64hi __attribute__((vector_size(128))); +typedef unsigned short v32hi __attribute__((vector_size(64))); +typedef _Float16 v64hf __attribute__((vector_size(128))); +typedef _Float16 v32hf __attribute__((vector_size(64))); +typedef __bf16 v64bf __attribute__((vector_size(128))); +typedef __bf16 v32bf __attribute__((vector_size(64))); +typedef unsigned int v32si __attribute__((vector_size(128))); +typedef float v32sf __attribute__((vector_size(128))); + +#define PERM0(B) B, B +#define PERM1(B) PERM0 (B), PERM0 (B) +#define PERM2(B) PERM1 (B), PERM1 (B) +#define PERM3(B) PERM2 (B), PERM2 (B) +#define PERM4(B) PERM3 (B), PERM3 (B) +#define PERM5(B) PERM4 (B), PERM4 (B) +#define PERM6(B) PERM5 (B), PERM5 (B) + +/* +** qi_dup_h_1: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** dup (z[0-9]+)\.h, \2\.h\[1\] +** st1b \3\.h, 
\1, \[x8\] +** ret +*/ +v128qi +qi_dup_h_1 (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (1) }); +} + +/* +** qi_dup_h_31: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** dup (z[0-9]+)\.h, \2\.h\[31\] +** st1b \3\.h, \1, \[x8\] +** ret +*/ +v128qi +qi_dup_h_31 (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (31) }); +} + +/* +** qi_dup_s_1: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** dup (z[0-9]+)\.s, \2\.s\[1\] +** st1b \3\.s, \1, \[x8\] +** ret +*/ +v64qi +qi_dup_s_1 (v64qi x) +{ + return __builtin_shuffle (x, x, (v64qi) { PERM5 (1) }); +} + +/* +** qi_dup_s_15: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** dup (z[0-9]+)\.s, \2\.s\[15\] +** st1b \3\.s, \1, \[x8\] +** ret +*/ +v64qi +qi_dup_s_15 (v64qi x) +{ + return __builtin_shuffle (x, x, (v64qi) { PERM5 (15) }); +} + +/* +** qi_dup_d_1: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.d, \1/z, \[x0\] +** dup (z[0-9]+)\.d, \2\.d\[1\] +** st1b \3\.d, \1, \[x8\] +** ret +*/ +v32qi +qi_dup_d_1 (v32qi x) +{ + return __builtin_shuffle (x, x, (v32qi) { PERM4 (1) }); +} + +/* +** qi_dup_d_7: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.d, \1/z, \[x0\] +** dup (z[0-9]+)\.d, \2\.d\[7\] +** st1b \3\.d, \1, \[x8\] +** ret +*/ +v32qi +qi_dup_d_7 (v32qi x) +{ + return __builtin_shuffle (x, x, (v32qi) { PERM4 (7) }); +} + +/* +** hi_dup_s_1: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** dup (z[0-9]+)\.s, \2\.s\[1\] +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64hi +hi_dup_s_1 (v64hi x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) }); +} + +/* +** hi_dup_s_15: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** dup (z[0-9]+)\.s, \2\.s\[15\] +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64hi +hi_dup_s_15 (v64hi x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (15) }); +} + +/* +** hf_dup_s_1: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** dup (z[0-9]+)\.s, \2\.s\[1\] +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64hf +hf_dup_s_1 (v64hf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) }); +} + +/* +** hf_dup_s_11: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** dup (z[0-9]+)\.s, \2\.s\[11\] +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64hf +hf_dup_s_11 (v64hf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (11) }); +} + +/* +** bf_dup_s_1: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** dup (z[0-9]+)\.s, \2\.s\[1\] +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64bf +bf_dup_s_1 (v64bf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) }); +} + +/* +** bf_dup_s_13: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** dup (z[0-9]+)\.s, \2\.s\[13\] +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64bf +bf_dup_s_13 (v64bf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (13) }); +} + +/* +** hi_dup_d_1: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** dup (z[0-9]+)\.d, \2\.d\[1\] +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32hi +hi_dup_d_1 (v32hi x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) }); +} + +/* +** hi_dup_d_7: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** dup (z[0-9]+)\.d, \2\.d\[7\] +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32hi +hi_dup_d_7 (v32hi x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (7) }); +} + +/* +** hf_dup_d_1: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** dup (z[0-9]+)\.d, \2\.d\[1\] +** st1h \3\.d, 
\1, \[x8\] +** ret +*/ +v32hf +hf_dup_d_1 (v32hf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) }); +} + +/* +** hf_dup_d_5: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** dup (z[0-9]+)\.d, \2\.d\[5\] +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32hf +hf_dup_d_5 (v32hf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (5) }); +} + +/* +** bf_dup_d_1: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** dup (z[0-9]+)\.d, \2\.d\[1\] +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32bf +bf_dup_d_1 (v32bf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) }); +} + +/* +** bf_dup_d_6: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** dup (z[0-9]+)\.d, \2\.d\[6\] +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32bf +bf_dup_d_6 (v32bf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (6) }); +} + +/* +** si_dup_d_1: +** ptrue (p[0-7])\.b, vl256 +** ld1w (z[0-9]+)\.d, \1/z, \[x0\] +** dup (z[0-9]+)\.d, \2\.d\[1\] +** st1w \3\.d, \1, \[x8\] +** ret +*/ +v32si +si_dup_d_1 (v32si x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (1) }); +} + +/* +** si_dup_d_7: +** ptrue (p[0-7])\.b, vl256 +** ld1w (z[0-9]+)\.d, \1/z, \[x0\] +** dup (z[0-9]+)\.d, \2\.d\[7\] +** st1w \3\.d, \1, \[x8\] +** ret +*/ +v32si +si_dup_d_7 (v32si x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (7) }); +} + +/* +** sf_dup_d_1: +** ptrue (p[0-7])\.b, vl256 +** ld1w (z[0-9]+)\.d, \1/z, \[x0\] +** dup (z[0-9]+)\.d, \2\.d\[1\] +** st1w \3\.d, \1, \[x8\] +** ret +*/ +v32sf +sf_dup_d_1 (v32sf x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (1) }); +} + +/* +** sf_dup_d_7: +** ptrue (p[0-7])\.b, vl256 +** ld1w (z[0-9]+)\.d, \1/z, \[x0\] +** dup (z[0-9]+)\.d, \2\.d\[7\] +** st1w \3\.d, \1, \[x8\] +** ret +*/ +v32sf +sf_dup_d_7 (v32sf x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (7) }); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/dup_lane_3.c b/gcc/testsuite/gcc.target/aarch64/sve/dup_lane_3.c new file mode 100644 index 0000000000000000000000000000000000000000..50f73a1aa23afb74bd7b9bc9b4520ad92613e247 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/dup_lane_3.c @@ -0,0 +1,90 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */ + +typedef unsigned char v128qi __attribute__((vector_size(128))); +typedef unsigned char v64qi __attribute__((vector_size(64))); +typedef unsigned char v32qi __attribute__((vector_size(32))); +typedef unsigned short v64hi __attribute__((vector_size(128))); +typedef unsigned short v32hi __attribute__((vector_size(64))); +typedef _Float16 v64hf __attribute__((vector_size(128))); +typedef _Float16 v32hf __attribute__((vector_size(64))); +typedef __bf16 v64bf __attribute__((vector_size(128))); +typedef __bf16 v32bf __attribute__((vector_size(64))); +typedef unsigned int v32si __attribute__((vector_size(128))); +typedef float v32sf __attribute__((vector_size(128))); + +#define PERM0(B) B, B +#define PERM1(B) PERM0 (B), PERM0 (B) +#define PERM2(B) PERM1 (B), PERM1 (B) +#define PERM3(B) PERM2 (B), PERM2 (B) +#define PERM4(B) PERM3 (B), PERM3 (B) +#define PERM5(B) PERM4 (B), PERM4 (B) +#define PERM6(B) PERM5 (B), PERM5 (B) + +v128qi +qi_dup_h_32 (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (32) }); +} + +v64qi +qi_dup_s_16 (v64qi x) +{ + return __builtin_shuffle (x, x, (v64qi) { PERM5 (16) }); +} + +v32qi +qi_dup_d_8 (v32qi x) +{ + return __builtin_shuffle (x, x, (v32qi) { PERM4 (8) }); +} + 
+v64hi +hi_dup_s_16 (v64hi x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (16) }); +} + +v64hf +hf_dup_s_16 (v64hf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (16) }); +} + +v64bf +bf_dup_s_16 (v64bf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (16) }); +} + +v32hi +hi_dup_d_8 (v32hi x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (8) }); +} + +v32hf +hf_dup_d_8 (v32hf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (8) }); +} + +v32bf +bf_dup_d_8 (v32bf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (8) }); +} + +v32si +si_dup_d_8 (v32si x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (8) }); +} + +v32sf +sf_dup_d_8 (v32sf x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (8) }); +} + +/* { dg-final { scan-assembler-not {\tdup\tz} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/ext_4.c b/gcc/testsuite/gcc.target/aarch64/sve/ext_4.c new file mode 100644 index 0000000000000000000000000000000000000000..4637b5cdc7a5b9b956521379e679ff0e9f7b5edc --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/ext_4.c @@ -0,0 +1,353 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +typedef unsigned char v128qi __attribute__((vector_size(128))); +typedef unsigned char v64qi __attribute__((vector_size(64))); +typedef unsigned char v32qi __attribute__((vector_size(32))); +typedef unsigned short v64hi __attribute__((vector_size(128))); +typedef unsigned short v32hi __attribute__((vector_size(64))); +typedef _Float16 v64hf __attribute__((vector_size(128))); +typedef _Float16 v32hf __attribute__((vector_size(64))); +typedef __bf16 v64bf __attribute__((vector_size(128))); +typedef __bf16 v32bf __attribute__((vector_size(64))); +typedef unsigned int v32si __attribute__((vector_size(128))); +typedef float v32sf __attribute__((vector_size(128))); + +#define PERM0(B) B, B + 1 +#define PERM1(B) PERM0 (B), PERM0 (B + 2) +#define PERM2(B) PERM1 (B), PERM1 (B + 4) +#define PERM3(B) PERM2 (B), PERM2 (B + 8) +#define PERM4(B) PERM3 (B), PERM3 (B + 16) +#define PERM5(B) PERM4 (B), PERM4 (B + 32) +#define PERM6(B) PERM5 (B), PERM5 (B + 64) + +/* +** qi_ext_h_1: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** ext \2\.b, \2\.b, \2\.b, #2 +** st1b \2\.h, \1, \[x8\] +** ret +*/ +v128qi +qi_ext_h_1 (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (1) }); +} + +/* +** qi_ext_h_1_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1b (z[0-9]+)\.h, \1/z, \[x1\] +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** ext \3\.b, \3\.b, \2\.b, #2 +** st1b \3\.h, \1, \[x8\] +** | +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** ld1b (z[0-9]+)\.h, \1/z, \[x1\] +** ext \4\.b, \4\.b, \5\.b, #2 +** st1b \4\.h, \1, \[x8\] +** ) +** ret +*/ +v128qi +qi_ext_h_1_two_op (v128qi x, v128qi y) +{ + return __builtin_shuffle (x, y, (v128qi) { PERM6 (1) }); +} + +/* +** qi_ext_h_127: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** ext \2\.b, \2\.b, \2\.b, #254 +** st1b \2\.h, \1, \[x8\] +** ret +*/ +v128qi +qi_ext_h_127 (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (127) }); +} + +/* +** qi_ext_s_1: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** ext \2\.b, \2\.b, \2\.b, #4 +** st1b \2\.s, \1, \[x8\] +** ret +*/ +v64qi +qi_ext_s_1 (v64qi x) +{ + return __builtin_shuffle (x, x, (v64qi) { PERM5 (1) }); +} + +/* +** qi_ext_s_63: +** ptrue (p[0-7])\.b, vl256 +** 
ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** ext \2\.b, \2\.b, \2\.b, #252 +** st1b \2\.s, \1, \[x8\] +** ret +*/ +v64qi +qi_ext_s_63 (v64qi x) +{ + return __builtin_shuffle (x, x, (v64qi) { PERM5 (63) }); +} + +/* +** qi_ext_d_1: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.d, \1/z, \[x0\] +** ext \2\.b, \2\.b, \2\.b, #8 +** st1b \2\.d, \1, \[x8\] +** ret +*/ +v32qi +qi_ext_d_1 (v32qi x) +{ + return __builtin_shuffle (x, x, (v32qi) { PERM4 (1) }); +} + +/* +** qi_ext_d_31: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.d, \1/z, \[x0\] +** ext \2\.b, \2\.b, \2\.b, #248 +** st1b \2\.d, \1, \[x8\] +** ret +*/ +v32qi +qi_ext_d_31 (v32qi x) +{ + return __builtin_shuffle (x, x, (v32qi) { PERM4 (31) }); +} + +/* +** hi_ext_s_1: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** ext \2\.b, \2\.b, \2\.b, #4 +** st1h \2\.s, \1, \[x8\] +** ret +*/ +v64hi +hi_ext_s_1 (v64hi x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) }); +} + +/* +** hi_ext_s_63: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** ext \2\.b, \2\.b, \2\.b, #252 +** st1h \2\.s, \1, \[x8\] +** ret +*/ +v64hi +hi_ext_s_63 (v64hi x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (63) }); +} + +/* +** hf_ext_s_1: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** ext \2\.b, \2\.b, \2\.b, #4 +** st1h \2\.s, \1, \[x8\] +** ret +*/ +v64hf +hf_ext_s_1 (v64hf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) }); +} + +/* +** hf_ext_s_60: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** ext \2\.b, \2\.b, \2\.b, #240 +** st1h \2\.s, \1, \[x8\] +** ret +*/ +v64hf +hf_ext_s_60 (v64hf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (60) }); +} + +/* +** bf_ext_s_1: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** ext \2\.b, \2\.b, \2\.b, #4 +** st1h \2\.s, \1, \[x8\] +** ret +*/ +v64bf +bf_ext_s_1 (v64bf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) }); +} + +/* +** bf_ext_s_40: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** ext \2\.b, \2\.b, \2\.b, #160 +** st1h \2\.s, \1, \[x8\] +** ret +*/ +v64bf +bf_ext_s_40 (v64bf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (40) }); +} + +/* +** hi_ext_d_1: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** ext \2\.b, \2\.b, \2\.b, #8 +** st1h \2\.d, \1, \[x8\] +** ret +*/ +v32hi +hi_ext_d_1 (v32hi x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) }); +} + +/* +** hi_ext_d_31: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** ext \2\.b, \2\.b, \2\.b, #248 +** st1h \2\.d, \1, \[x8\] +** ret +*/ +v32hi +hi_ext_d_31 (v32hi x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (31) }); +} + +/* +** hf_ext_d_1: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** ext \2\.b, \2\.b, \2\.b, #8 +** st1h \2\.d, \1, \[x8\] +** ret +*/ +v32hf +hf_ext_d_1 (v32hf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) }); +} + +/* +** hf_ext_d_18: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** ext \2\.b, \2\.b, \2\.b, #144 +** st1h \2\.d, \1, \[x8\] +** ret +*/ +v32hf +hf_ext_d_18 (v32hf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (18) }); +} + +/* +** bf_ext_d_1: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** ext \2\.b, \2\.b, \2\.b, #8 +** st1h \2\.d, \1, \[x8\] +** ret +*/ +v32bf +bf_ext_d_1 (v32bf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) }); +} + +/* +** bf_ext_d_7: +** ptrue (p[0-7])\.b, vl256 +** ld1h 
(z[0-9]+)\.d, \1/z, \[x0\] +** ext \2\.b, \2\.b, \2\.b, #56 +** st1h \2\.d, \1, \[x8\] +** ret +*/ +v32bf +bf_ext_d_7 (v32bf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (7) }); +} + +/* +** si_ext_d_1: +** ptrue (p[0-7])\.b, vl256 +** ld1w (z[0-9]+)\.d, \1/z, \[x0\] +** ext \2\.b, \2\.b, \2\.b, #8 +** st1w \2\.d, \1, \[x8\] +** ret +*/ +v32si +si_ext_d_1 (v32si x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (1) }); +} + +/* +** si_ext_d_31: +** ptrue (p[0-7])\.b, vl256 +** ld1w (z[0-9]+)\.d, \1/z, \[x0\] +** ext \2\.b, \2\.b, \2\.b, #248 +** st1w \2\.d, \1, \[x8\] +** ret +*/ +v32si +si_ext_d_31 (v32si x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (31) }); +} + +/* +** sf_ext_d_1: +** ptrue (p[0-7])\.b, vl256 +** ld1w (z[0-9]+)\.d, \1/z, \[x0\] +** ext \2\.b, \2\.b, \2\.b, #8 +** st1w \2\.d, \1, \[x8\] +** ret +*/ +v32sf +sf_ext_d_1 (v32sf x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (1) }); +} + +/* +** sf_ext_d_31: +** ptrue (p[0-7])\.b, vl256 +** ld1w (z[0-9]+)\.d, \1/z, \[x0\] +** ext \2\.b, \2\.b, \2\.b, #248 +** st1w \2\.d, \1, \[x8\] +** ret +*/ +v32sf +sf_ext_d_31 (v32sf x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (31) }); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/rev_2.c b/gcc/testsuite/gcc.target/aarch64/sve/rev_2.c new file mode 100644 index 0000000000000000000000000000000000000000..417da37500ad498995c043ab8fcf8df337c74591 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/rev_2.c @@ -0,0 +1,177 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +typedef unsigned char v128qi __attribute__((vector_size(128))); +typedef unsigned char v64qi __attribute__((vector_size(64))); +typedef unsigned char v32qi __attribute__((vector_size(32))); +typedef unsigned short v64hi __attribute__((vector_size(128))); +typedef unsigned short v32hi __attribute__((vector_size(64))); +typedef _Float16 v64hf __attribute__((vector_size(128))); +typedef _Float16 v32hf __attribute__((vector_size(64))); +typedef __bf16 v64bf __attribute__((vector_size(128))); +typedef __bf16 v32bf __attribute__((vector_size(64))); +typedef unsigned int v32si __attribute__((vector_size(128))); +typedef float v32sf __attribute__((vector_size(128))); + +#define PERM0(B) B, B - 1 +#define PERM1(B) PERM0 (B), PERM0 (B - 2) +#define PERM2(B) PERM1 (B), PERM1 (B - 4) +#define PERM3(B) PERM2 (B), PERM2 (B - 8) +#define PERM4(B) PERM3 (B), PERM3 (B - 16) +#define PERM5(B) PERM4 (B), PERM4 (B - 32) +#define PERM6(B) PERM5 (B), PERM5 (B - 64) + +/* +** qi_rev_h: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** rev (z[0-9]+)\.h, \2\.h +** st1b \3\.h, \1, \[x8\] +** ret +*/ +v128qi +qi_rev_h (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (127) }); +} + +/* +** qi_rev_s: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** rev (z[0-9]+)\.s, \2\.s +** st1b \3\.s, \1, \[x8\] +** ret +*/ +v64qi +qi_rev_s (v64qi x) +{ + return __builtin_shuffle (x, x, (v64qi) { PERM5 (63) }); +} + +/* +** qi_rev_d: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.d, \1/z, \[x0\] +** rev (z[0-9]+)\.d, \2\.d +** st1b \3\.d, \1, \[x8\] +** ret +*/ +v32qi +qi_rev_d (v32qi x) +{ + return __builtin_shuffle (x, x, (v32qi) { PERM4 (31) }); +} + +/* +** hi_rev_s: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** rev (z[0-9]+)\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64hi 
+hi_rev_s (v64hi x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (63) }); +} + +/* +** hf_rev_s: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** rev (z[0-9]+)\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64hf +hf_rev_s (v64hf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (63) }); +} + +/* +** bf_rev_s: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** rev (z[0-9]+)\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64bf +bf_rev_s (v64bf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (63) }); +} + +/* +** hi_rev_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** rev (z[0-9]+)\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32hi +hi_rev_d (v32hi x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (31) }); +} + +/* +** hf_rev_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** rev (z[0-9]+)\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32hf +hf_rev_d (v32hf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (31) }); +} + +/* +** bf_rev_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** rev (z[0-9]+)\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32bf +bf_rev_d (v32bf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (31) }); +} + +/* +** si_rev_d: +** ptrue (p[0-7])\.b, vl256 +** ld1w (z[0-9]+)\.d, \1/z, \[x0\] +** rev (z[0-9]+)\.d, \2\.d +** st1w \3\.d, \1, \[x8\] +** ret +*/ +v32si +si_rev_d (v32si x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (31) }); +} + +/* +** sf_rev_d: +** ptrue (p[0-7])\.b, vl256 +** ld1w (z[0-9]+)\.d, \1/z, \[x0\] +** rev (z[0-9]+)\.d, \2\.d +** st1w \3\.d, \1, \[x8\] +** ret +*/ +v32sf +sf_rev_d (v32sf x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (31) }); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/revhw_1.c b/gcc/testsuite/gcc.target/aarch64/sve/revhw_1.c new file mode 100644 index 0000000000000000000000000000000000000000..62de8127584f1d1913ff6491f50cae4127873e40 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/revhw_1.c @@ -0,0 +1,127 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +typedef unsigned char v128qi __attribute__((vector_size(128))); +typedef unsigned char v64qi __attribute__((vector_size(64))); +typedef unsigned short v64hi __attribute__((vector_size(128))); +typedef _Float16 v64hf __attribute__((vector_size(128))); +typedef __bf16 v64bf __attribute__((vector_size(128))); + +#define PERM0(B) B + 1, B +#define PERM1(B) PERM0 (B), PERM0 (B + 2) +#define PERM2(B) PERM1 (B), PERM1 (B + 4) +#define PERM3(B) PERM2 (B), PERM2 (B + 8) +#define PERM4(B) PERM3 (B), PERM3 (B + 16) +#define PERM5(B) PERM4 (B), PERM4 (B + 32) +#define PERM6(B) PERM5 (B), PERM5 (B + 64) + +/* +** qi_revh_s: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** revh (z[0-9]+)\.s, \1/m, \2\.s +** st1b \3\.h, \1, \[x8\] +** ret +*/ +v128qi +qi_revh_s (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (0) }); +} + +/* +** qi_revw_d: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** revw (z[0-9]+)\.d, \1/m, \2\.d +** st1b \3\.s, \1, \[x8\] +** ret +*/ +v64qi +qi_revw_d (v64qi x) +{ + return __builtin_shuffle (x, x, (v64qi) { PERM5 (0) }); +} + +/* +** hi_revw_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** revw (z[0-9]+)\.d, \1/m, \2\.d +** st1h \3\.s, \1, \[x8\] +** ret +*/ 
+v64hi +hi_revw_d (v64hi x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) }); +} + +/* +** hf_revw_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** revw (z[0-9]+)\.d, \1/m, \2\.d +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64hf +hf_revw_d (v64hf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) }); +} + +/* +** bf_revw_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** revw (z[0-9]+)\.d, \1/m, \2\.d +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64bf +bf_revw_d (v64bf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) }); +} + +#undef PERM1 +#define PERM1(B) PERM0 (B + 2), PERM0 (B) + +/* +** qi_revh_d: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** revh (z[0-9]+)\.d, \1/m, \2\.d +** st1b \3\.h, \1, \[x8\] +** ret +*/ +v128qi +qi_revh_d (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (0) }); +} + +v64qi +qi_revw_q (v64qi x) +{ + return __builtin_shuffle (x, x, (v64qi) { PERM5 (0) }); +} + +v64hi +hi_revw_q (v64hi x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) }); +} + +#undef PERM2 +#define PERM2(B) PERM0 (B + 4), PERM0 (B) + +v128qi +qi_revh_q (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (0) }); +} + +/* { dg-final { scan-assembler-times {\trev.\t} 6 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/revhw_2.c b/gcc/testsuite/gcc.target/aarch64/sve/revhw_2.c new file mode 100644 index 0000000000000000000000000000000000000000..7634d01b2c42176b22a5f5ebe44ef625a429193a --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/revhw_2.c @@ -0,0 +1,127 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O -msve-vector-bits=2048 -mbig-endian --save-temps" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +typedef unsigned char v128qi __attribute__((vector_size(128))); +typedef unsigned char v64qi __attribute__((vector_size(64))); +typedef unsigned short v64hi __attribute__((vector_size(128))); +typedef _Float16 v64hf __attribute__((vector_size(128))); +typedef __bf16 v64bf __attribute__((vector_size(128))); + +#define PERM0(B) B + 1, B +#define PERM1(B) PERM0 (B), PERM0 (B + 2) +#define PERM2(B) PERM1 (B), PERM1 (B + 4) +#define PERM3(B) PERM2 (B), PERM2 (B + 8) +#define PERM4(B) PERM3 (B), PERM3 (B + 16) +#define PERM5(B) PERM4 (B), PERM4 (B + 32) +#define PERM6(B) PERM5 (B), PERM5 (B + 64) + +/* +** qi_revh_s: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** revh (z[0-9]+)\.s, \1/m, \2\.s +** st1b \3\.h, \1, \[x8\] +** ret +*/ +v128qi +qi_revh_s (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (0) }); +} + +/* +** qi_revw_d: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** revw (z[0-9]+)\.d, \1/m, \2\.d +** st1b \3\.s, \1, \[x8\] +** ret +*/ +v64qi +qi_revw_d (v64qi x) +{ + return __builtin_shuffle (x, x, (v64qi) { PERM5 (0) }); +} + +/* +** hi_revw_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** revw (z[0-9]+)\.d, \1/m, \2\.d +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64hi +hi_revw_d (v64hi x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) }); +} + +/* +** hf_revw_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** revw (z[0-9]+)\.d, \1/m, \2\.d +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64hf +hf_revw_d (v64hf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) }); +} + +/* +** bf_revw_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** revw (z[0-9]+)\.d, \1/m, \2\.d +** st1h \3\.s, 
\1, \[x8\] +** ret +*/ +v64bf +bf_revw_d (v64bf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) }); +} + +#undef PERM1 +#define PERM1(B) PERM0 (B + 2), PERM0 (B) + +/* +** qi_revh_d: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** revh (z[0-9]+)\.d, \1/m, \2\.d +** st1b \3\.h, \1, \[x8\] +** ret +*/ +v128qi +qi_revh_d (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (0) }); +} + +v64qi +qi_revw_q (v64qi x) +{ + return __builtin_shuffle (x, x, (v64qi) { PERM5 (0) }); +} + +v64hi +hi_revw_q (v64hi x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) }); +} + +#undef PERM2 +#define PERM2(B) PERM0 (B + 4), PERM0 (B) + +v128qi +qi_revh_q (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (0) }); +} + +/* { dg-final { scan-assembler-times {\trev.\t} 6 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/slp_perm_8.c b/gcc/testsuite/gcc.target/aarch64/sve/slp_perm_8.c new file mode 100644 index 0000000000000000000000000000000000000000..fe25000b0bf89a26d0c6328e15daa3f099b18ebd --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/slp_perm_8.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +void +f (short *restrict s, signed char *restrict c) +{ + for (int i = 0; i < 8; i += 2) + { + s[i] = c[i]; + s[i + 1] = c[i]; + } +} + +/* Ideally this would use LD1SB, but currently we use LD1B and + sign-extend it after the permute. */ +/* { dg-final { scan-assembler {\tptrue\tp[0-7]\.h, vl6\n} } } */ +/* { dg-final { scan-assembler {\tld1s?b\tz[0-9]+\.h} } } */ +/* { dg-final { scan-assembler {\ttrn1\tz[0-9]+\.h,} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/trn1_2.c b/gcc/testsuite/gcc.target/aarch64/sve/trn1_2.c new file mode 100644 index 0000000000000000000000000000000000000000..df059ddbc8d98715a58a2d805c5f1ff694510d75 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/trn1_2.c @@ -0,0 +1,403 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +typedef unsigned char v128qi __attribute__((vector_size(128))); +typedef unsigned char v64qi __attribute__((vector_size(64))); +typedef unsigned char v32qi __attribute__((vector_size(32))); +typedef unsigned short v64hi __attribute__((vector_size(128))); +typedef unsigned short v32hi __attribute__((vector_size(64))); +typedef _Float16 v64hf __attribute__((vector_size(128))); +typedef _Float16 v32hf __attribute__((vector_size(64))); +typedef __bf16 v64bf __attribute__((vector_size(128))); +typedef __bf16 v32bf __attribute__((vector_size(64))); +typedef unsigned int v32si __attribute__((vector_size(128))); +typedef float v32sf __attribute__((vector_size(128))); + +#define PERM0(B, C) B, B + C +#define PERM1(B, C) PERM0 (B, C), PERM0 (B + 2, C) +#define PERM2(B, C) PERM1 (B, C), PERM1 (B + 4, C) +#define PERM3(B, C) PERM2 (B, C), PERM2 (B + 8, C) +#define PERM4(B, C) PERM3 (B, C), PERM3 (B + 16, C) +#define PERM5(B, C) PERM4 (B, C), PERM4 (B + 32, C) +#define PERM6(B, C) PERM5 (B, C), PERM5 (B + 64, C) + +/* +** qi_trn1_h_a: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** trn1 (z[0-9]+)\.h, \2\.h, \2\.h +** st1b \3\.h, \1, \[x8\] +** ret +*/ +v128qi +qi_trn1_h_a (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (0, 0) }); +} + +/* +** qi_trn1_h_b: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** trn1 (z[0-9]+)\.h, \2\.h, \2\.h +** st1b 
\3\.h, \1, \[x8\] +** ret +*/ +v128qi +qi_trn1_h_b (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (0, 128) }); +} + +/* +** qi_trn1_h_c: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** trn1 (z[0-9]+)\.h, \2\.h, \2\.h +** st1b \3\.h, \1, \[x8\] +** ret +*/ +v128qi +qi_trn1_h_c (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (128, 0) }); +} + +/* +** qi_trn1_h_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1b (z[0-9]+)\.h, \1/z, \[x1\] +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** trn1 \3\.h, \3\.h, \2\.h +** st1b \3\.h, \1, \[x8\] +** | +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** ld1b (z[0-9]+)\.h, \1/z, \[x1\] +** trn1 \4\.h, \4\.h, \5\.h +** st1b \4\.h, \1, \[x8\] +** ) +** ret +*/ +v128qi +qi_trn1_h_two_op (v128qi x, v128qi y) +{ + return __builtin_shuffle (x, y, (v128qi) { PERM6 (0, 128) }); +} + +/* +** qi_trn1_s: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** trn1 (z[0-9]+)\.s, \2\.s, \2\.s +** st1b \3\.s, \1, \[x8\] +** ret +*/ +v64qi +qi_trn1_s (v64qi x) +{ + return __builtin_shuffle (x, x, (v64qi) { PERM5 (0, 64) }); +} + +/* +** qi_trn1_s_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1b (z[0-9]+)\.s, \1/z, \[x1\] +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** trn1 \3\.s, \3\.s, \2\.s +** st1b \3\.s, \1, \[x8\] +** | +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** ld1b (z[0-9]+)\.s, \1/z, \[x1\] +** trn1 \4\.s, \4\.s, \5\.s +** st1b \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64qi +qi_trn1_s_two_op (v64qi x, v64qi y) +{ + return __builtin_shuffle (x, y, (v64qi) { PERM5 (0, 64) }); +} + +/* +** qi_trn1_d: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.d, \1/z, \[x0\] +** trn1 (z[0-9]+)\.d, \2\.d, \2\.d +** st1b \3\.d, \1, \[x8\] +** ret +*/ +v32qi +qi_trn1_d (v32qi x) +{ + return __builtin_shuffle (x, x, (v32qi) { PERM4 (0, 32) }); +} + +/* +** qi_trn1_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1b (z[0-9]+)\.d, \1/z, \[x1\] +** ld1b (z[0-9]+)\.d, \1/z, \[x0\] +** trn1 \3\.d, \3\.d, \2\.d +** st1b \3\.d, \1, \[x8\] +** | +** ld1b (z[0-9]+)\.d, \1/z, \[x0\] +** ld1b (z[0-9]+)\.d, \1/z, \[x1\] +** trn1 \4\.d, \4\.d, \5\.d +** st1b \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32qi +qi_trn1_d_two_op (v32qi x, v32qi y) +{ + return __builtin_shuffle (x, y, (v32qi) { PERM4 (0, 32) }); +} + +/* +** hi_trn1_s: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** trn1 (z[0-9]+)\.s, \2\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64hi +hi_trn1_s (v64hi x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (0, 64) }); +} + +/* +** hi_trn1_s_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** trn1 \3\.s, \3\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** trn1 \4\.s, \4\.s, \5\.s +** st1h \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64hi +hi_trn1_s_two_op (v64hi x, v64hi y) +{ + return __builtin_shuffle (x, y, (v64hi) { PERM5 (0, 64) }); +} + +/* +** hf_trn1_s: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** trn1 (z[0-9]+)\.s, \2\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64hf +hf_trn1_s (v64hf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (0, 64) }); +} + +/* +** hf_trn1_s_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** trn1 \3\.s, \3\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** trn1 \4\.s, \4\.s, \5\.s +** 
st1h \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64hf +hf_trn1_s_two_op (v64hf x, v64hf y) +{ + return __builtin_shuffle (x, y, (v64hi) { PERM5 (0, 64) }); +} + +/* +** bf_trn1_s: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** trn1 (z[0-9]+)\.s, \2\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64bf +bf_trn1_s (v64bf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (0, 64) }); +} + +/* +** bf_trn1_s_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** trn1 \3\.s, \3\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** trn1 \4\.s, \4\.s, \5\.s +** st1h \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64bf +bf_trn1_s_two_op (v64bf x, v64bf y) +{ + return __builtin_shuffle (x, y, (v64hi) { PERM5 (0, 64) }); +} + +/* +** hi_trn1_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** trn1 (z[0-9]+)\.d, \2\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32hi +hi_trn1_d (v32hi x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (0, 32) }); +} + +/* +** hi_trn1_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** trn1 \3\.d, \3\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** trn1 \4\.d, \4\.d, \5\.d +** st1h \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32hi +hi_trn1_d_two_op (v32hi x, v32hi y) +{ + return __builtin_shuffle (x, y, (v32hi) { PERM4 (0, 32) }); +} + +/* +** hf_trn1_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** trn1 (z[0-9]+)\.d, \2\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32hf +hf_trn1_d (v32hf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (0, 32) }); +} + +/* +** hf_trn1_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** trn1 \3\.d, \3\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** trn1 \4\.d, \4\.d, \5\.d +** st1h \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32hf +hf_trn1_d_two_op (v32hf x, v32hf y) +{ + return __builtin_shuffle (x, y, (v32hi) { PERM4 (0, 32) }); +} + +/* +** bf_trn1_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** trn1 (z[0-9]+)\.d, \2\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32bf +bf_trn1_d (v32bf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (0, 32) }); +} + +/* +** bf_trn1_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** trn1 \3\.d, \3\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** trn1 \4\.d, \4\.d, \5\.d +** st1h \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32bf +bf_trn1_d_two_op (v32bf x, v32bf y) +{ + return __builtin_shuffle (x, y, (v32hi) { PERM4 (0, 32) }); +} + +/* +** si_trn1_d: +** ptrue (p[0-7])\.b, vl256 +** ld1w (z[0-9]+)\.d, \1/z, \[x0\] +** trn1 (z[0-9]+)\.d, \2\.d, \2\.d +** st1w \3\.d, \1, \[x8\] +** ret +*/ +v32si +si_trn1_d (v32si x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (0, 32) }); +} + +/* +** sf_trn1_d: +** ptrue (p[0-7])\.b, vl256 +** ld1w (z[0-9]+)\.d, \1/z, \[x0\] +** trn1 (z[0-9]+)\.d, \2\.d, \2\.d +** st1w \3\.d, \1, \[x8\] +** ret +*/ +v32sf +sf_trn1_d (v32sf x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (0, 32) }); +} diff --git 
a/gcc/testsuite/gcc.target/aarch64/sve/trn2_2.c b/gcc/testsuite/gcc.target/aarch64/sve/trn2_2.c new file mode 100644 index 0000000000000000000000000000000000000000..290ce8e980ce320f00274268ab0688683294f569 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/trn2_2.c @@ -0,0 +1,403 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +typedef unsigned char v128qi __attribute__((vector_size(128))); +typedef unsigned char v64qi __attribute__((vector_size(64))); +typedef unsigned char v32qi __attribute__((vector_size(32))); +typedef unsigned short v64hi __attribute__((vector_size(128))); +typedef unsigned short v32hi __attribute__((vector_size(64))); +typedef _Float16 v64hf __attribute__((vector_size(128))); +typedef _Float16 v32hf __attribute__((vector_size(64))); +typedef __bf16 v64bf __attribute__((vector_size(128))); +typedef __bf16 v32bf __attribute__((vector_size(64))); +typedef unsigned int v32si __attribute__((vector_size(128))); +typedef float v32sf __attribute__((vector_size(128))); + +#define PERM0(B, C) B, B + C +#define PERM1(B, C) PERM0 (B, C), PERM0 (B + 2, C) +#define PERM2(B, C) PERM1 (B, C), PERM1 (B + 4, C) +#define PERM3(B, C) PERM2 (B, C), PERM2 (B + 8, C) +#define PERM4(B, C) PERM3 (B, C), PERM3 (B + 16, C) +#define PERM5(B, C) PERM4 (B, C), PERM4 (B + 32, C) +#define PERM6(B, C) PERM5 (B, C), PERM5 (B + 64, C) + +/* +** qi_trn2_h_a: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** trn2 (z[0-9]+)\.h, \2\.h, \2\.h +** st1b \3\.h, \1, \[x8\] +** ret +*/ +v128qi +qi_trn2_h_a (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (1, 0) }); +} + +/* +** qi_trn2_h_b: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** trn2 (z[0-9]+)\.h, \2\.h, \2\.h +** st1b \3\.h, \1, \[x8\] +** ret +*/ +v128qi +qi_trn2_h_b (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (1, 128) }); +} + +/* +** qi_trn2_h_c: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** trn2 (z[0-9]+)\.h, \2\.h, \2\.h +** st1b \3\.h, \1, \[x8\] +** ret +*/ +v128qi +qi_trn2_h_c (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (1, 0) }); +} + +/* +** qi_trn2_h_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1b (z[0-9]+)\.h, \1/z, \[x1\] +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** trn2 \3\.h, \3\.h, \2\.h +** st1b \3\.h, \1, \[x8\] +** | +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** ld1b (z[0-9]+)\.h, \1/z, \[x1\] +** trn2 \4\.h, \4\.h, \5\.h +** st1b \4\.h, \1, \[x8\] +** ) +** ret +*/ +v128qi +qi_trn2_h_two_op (v128qi x, v128qi y) +{ + return __builtin_shuffle (x, y, (v128qi) { PERM6 (1, 128) }); +} + +/* +** qi_trn2_s: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** trn2 (z[0-9]+)\.s, \2\.s, \2\.s +** st1b \3\.s, \1, \[x8\] +** ret +*/ +v64qi +qi_trn2_s (v64qi x) +{ + return __builtin_shuffle (x, x, (v64qi) { PERM5 (1, 64) }); +} + +/* +** qi_trn2_s_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1b (z[0-9]+)\.s, \1/z, \[x1\] +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** trn2 \3\.s, \3\.s, \2\.s +** st1b \3\.s, \1, \[x8\] +** | +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** ld1b (z[0-9]+)\.s, \1/z, \[x1\] +** trn2 \4\.s, \4\.s, \5\.s +** st1b \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64qi +qi_trn2_s_two_op (v64qi x, v64qi y) +{ + return __builtin_shuffle (x, y, (v64qi) { PERM5 (1, 64) }); +} + +/* +** qi_trn2_d: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.d, \1/z, \[x0\] +** trn2 
(z[0-9]+)\.d, \2\.d, \2\.d +** st1b \3\.d, \1, \[x8\] +** ret +*/ +v32qi +qi_trn2_d (v32qi x) +{ + return __builtin_shuffle (x, x, (v32qi) { PERM4 (1, 32) }); +} + +/* +** qi_trn2_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1b (z[0-9]+)\.d, \1/z, \[x1\] +** ld1b (z[0-9]+)\.d, \1/z, \[x0\] +** trn2 \3\.d, \3\.d, \2\.d +** st1b \3\.d, \1, \[x8\] +** | +** ld1b (z[0-9]+)\.d, \1/z, \[x0\] +** ld1b (z[0-9]+)\.d, \1/z, \[x1\] +** trn2 \4\.d, \4\.d, \5\.d +** st1b \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32qi +qi_trn2_d_two_op (v32qi x, v32qi y) +{ + return __builtin_shuffle (x, y, (v32qi) { PERM4 (1, 32) }); +} + +/* +** hi_trn2_s: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** trn2 (z[0-9]+)\.s, \2\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64hi +hi_trn2_s (v64hi x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (1, 64) }); +} + +/* +** hi_trn2_s_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** trn2 \3\.s, \3\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** trn2 \4\.s, \4\.s, \5\.s +** st1h \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64hi +hi_trn2_s_two_op (v64hi x, v64hi y) +{ + return __builtin_shuffle (x, y, (v64hi) { PERM5 (1, 64) }); +} + +/* +** hf_trn2_s: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** trn2 (z[0-9]+)\.s, \2\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64hf +hf_trn2_s (v64hf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (1, 64) }); +} + +/* +** hf_trn2_s_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** trn2 \3\.s, \3\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** trn2 \4\.s, \4\.s, \5\.s +** st1h \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64hf +hf_trn2_s_two_op (v64hf x, v64hf y) +{ + return __builtin_shuffle (x, y, (v64hi) { PERM5 (1, 64) }); +} + +/* +** bf_trn2_s: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** trn2 (z[0-9]+)\.s, \2\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64bf +bf_trn2_s (v64bf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (1, 64) }); +} + +/* +** bf_trn2_s_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** trn2 \3\.s, \3\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** trn2 \4\.s, \4\.s, \5\.s +** st1h \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64bf +bf_trn2_s_two_op (v64bf x, v64bf y) +{ + return __builtin_shuffle (x, y, (v64hi) { PERM5 (1, 64) }); +} + +/* +** hi_trn2_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** trn2 (z[0-9]+)\.d, \2\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32hi +hi_trn2_d (v32hi x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (1, 32) }); +} + +/* +** hi_trn2_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** trn2 \3\.d, \3\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** trn2 \4\.d, \4\.d, \5\.d +** st1h \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32hi +hi_trn2_d_two_op (v32hi x, v32hi y) +{ + return __builtin_shuffle (x, y, (v32hi) { PERM4 (1, 32) }); +} + +/* +** hf_trn2_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** 
trn2 (z[0-9]+)\.d, \2\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32hf +hf_trn2_d (v32hf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (1, 32) }); +} + +/* +** hf_trn2_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** trn2 \3\.d, \3\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** trn2 \4\.d, \4\.d, \5\.d +** st1h \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32hf +hf_trn2_d_two_op (v32hf x, v32hf y) +{ + return __builtin_shuffle (x, y, (v32hi) { PERM4 (1, 32) }); +} + +/* +** bf_trn2_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** trn2 (z[0-9]+)\.d, \2\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32bf +bf_trn2_d (v32bf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (1, 32) }); +} + +/* +** bf_trn2_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** trn2 \3\.d, \3\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** trn2 \4\.d, \4\.d, \5\.d +** st1h \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32bf +bf_trn2_d_two_op (v32bf x, v32bf y) +{ + return __builtin_shuffle (x, y, (v32hi) { PERM4 (1, 32) }); +} + +/* +** si_trn2_d: +** ptrue (p[0-7])\.b, vl256 +** ld1w (z[0-9]+)\.d, \1/z, \[x0\] +** trn2 (z[0-9]+)\.d, \2\.d, \2\.d +** st1w \3\.d, \1, \[x8\] +** ret +*/ +v32si +si_trn2_d (v32si x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (1, 32) }); +} + +/* +** sf_trn2_d: +** ptrue (p[0-7])\.b, vl256 +** ld1w (z[0-9]+)\.d, \1/z, \[x0\] +** trn2 (z[0-9]+)\.d, \2\.d, \2\.d +** st1w \3\.d, \1, \[x8\] +** ret +*/ +v32sf +sf_trn2_d (v32sf x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (1, 32) }); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/uzp1_2.c b/gcc/testsuite/gcc.target/aarch64/sve/uzp1_2.c new file mode 100644 index 0000000000000000000000000000000000000000..e2f2692c7cfeb76275bd52fd8c85c1699c466140 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/uzp1_2.c @@ -0,0 +1,375 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +typedef unsigned char v128qi __attribute__((vector_size(128))); +typedef unsigned char v64qi __attribute__((vector_size(64))); +typedef unsigned char v32qi __attribute__((vector_size(32))); +typedef unsigned short v64hi __attribute__((vector_size(128))); +typedef unsigned short v32hi __attribute__((vector_size(64))); +typedef _Float16 v64hf __attribute__((vector_size(128))); +typedef _Float16 v32hf __attribute__((vector_size(64))); +typedef __bf16 v64bf __attribute__((vector_size(128))); +typedef __bf16 v32bf __attribute__((vector_size(64))); +typedef unsigned int v32si __attribute__((vector_size(128))); +typedef float v32sf __attribute__((vector_size(128))); + +#define PERM0(B) B, B + 2 +#define PERM1(B) PERM0 (B), PERM0 (B + 4) +#define PERM2(B) PERM1 (B), PERM1 (B + 8) +#define PERM3(B) PERM2 (B), PERM2 (B + 16) +#define PERM4(B) PERM3 (B), PERM3 (B + 32) +#define PERM5(B) PERM4 (B), PERM4 (B + 64) +#define PERM6(B) PERM5 (B), PERM5 (B + 128) + +/* +** qi_uzp1_h: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** uzp1 (z[0-9]+)\.h, \2\.h, \2\.h +** st1b \3\.h, \1, \[x8\] +** ret +*/ +v128qi +qi_uzp1_h (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (0) }); 
+} + +/* +** qi_uzp1_h_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1b (z[0-9]+)\.h, \1/z, \[x1\] +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** uzp1 \3\.h, \3\.h, \2\.h +** st1b \3\.h, \1, \[x8\] +** | +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** ld1b (z[0-9]+)\.h, \1/z, \[x1\] +** uzp1 \4\.h, \4\.h, \5\.h +** st1b \4\.h, \1, \[x8\] +** ) +** ret +*/ +v128qi +qi_uzp1_h_two_op (v128qi x, v128qi y) +{ + return __builtin_shuffle (x, y, (v128qi) { PERM6 (0) }); +} + +/* +** qi_uzp1_s: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** uzp1 (z[0-9]+)\.s, \2\.s, \2\.s +** st1b \3\.s, \1, \[x8\] +** ret +*/ +v64qi +qi_uzp1_s (v64qi x) +{ + return __builtin_shuffle (x, x, (v64qi) { PERM5 (0) }); +} + +/* +** qi_uzp1_s_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1b (z[0-9]+)\.s, \1/z, \[x1\] +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** uzp1 \3\.s, \3\.s, \2\.s +** st1b \3\.s, \1, \[x8\] +** | +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** ld1b (z[0-9]+)\.s, \1/z, \[x1\] +** uzp1 \4\.s, \4\.s, \5\.s +** st1b \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64qi +qi_uzp1_s_two_op (v64qi x, v64qi y) +{ + return __builtin_shuffle (x, y, (v64qi) { PERM5 (0) }); +} + +/* +** qi_uzp1_d: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.d, \1/z, \[x0\] +** uzp1 (z[0-9]+)\.d, \2\.d, \2\.d +** st1b \3\.d, \1, \[x8\] +** ret +*/ +v32qi +qi_uzp1_d (v32qi x) +{ + return __builtin_shuffle (x, x, (v32qi) { PERM4 (0) }); +} + +/* +** qi_uzp1_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1b (z[0-9]+)\.d, \1/z, \[x1\] +** ld1b (z[0-9]+)\.d, \1/z, \[x0\] +** uzp1 \3\.d, \3\.d, \2\.d +** st1b \3\.d, \1, \[x8\] +** | +** ld1b (z[0-9]+)\.d, \1/z, \[x0\] +** ld1b (z[0-9]+)\.d, \1/z, \[x1\] +** uzp1 \4\.d, \4\.d, \5\.d +** st1b \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32qi +qi_uzp1_d_two_op (v32qi x, v32qi y) +{ + return __builtin_shuffle (x, y, (v32qi) { PERM4 (0) }); +} + +/* +** hi_uzp1_s: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** uzp1 (z[0-9]+)\.s, \2\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64hi +hi_uzp1_s (v64hi x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) }); +} + +/* +** hi_uzp1_s_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** uzp1 \3\.s, \3\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** uzp1 \4\.s, \4\.s, \5\.s +** st1h \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64hi +hi_uzp1_s_two_op (v64hi x, v64hi y) +{ + return __builtin_shuffle (x, y, (v64hi) { PERM5 (0) }); +} + +/* +** hf_uzp1_s: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** uzp1 (z[0-9]+)\.s, \2\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64hf +hf_uzp1_s (v64hf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) }); +} + +/* +** hf_uzp1_s_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** uzp1 \3\.s, \3\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** uzp1 \4\.s, \4\.s, \5\.s +** st1h \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64hf +hf_uzp1_s_two_op (v64hf x, v64hf y) +{ + return __builtin_shuffle (x, y, (v64hi) { PERM5 (0) }); +} + +/* +** bf_uzp1_s: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** uzp1 (z[0-9]+)\.s, \2\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64bf +bf_uzp1_s (v64bf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (0) }); +} + +/* +** bf_uzp1_s_two_op: 
+** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** uzp1 \3\.s, \3\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** uzp1 \4\.s, \4\.s, \5\.s +** st1h \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64bf +bf_uzp1_s_two_op (v64bf x, v64bf y) +{ + return __builtin_shuffle (x, y, (v64hi) { PERM5 (0) }); +} + +/* +** hi_uzp1_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** uzp1 (z[0-9]+)\.d, \2\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32hi +hi_uzp1_d (v32hi x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (0) }); +} + +/* +** hi_uzp1_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** uzp1 \3\.d, \3\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** uzp1 \4\.d, \4\.d, \5\.d +** st1h \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32hi +hi_uzp1_d_two_op (v32hi x, v32hi y) +{ + return __builtin_shuffle (x, y, (v32hi) { PERM4 (0) }); +} + +/* +** hf_uzp1_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** uzp1 (z[0-9]+)\.d, \2\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32hf +hf_uzp1_d (v32hf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (0) }); +} + +/* +** hf_uzp1_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** uzp1 \3\.d, \3\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** uzp1 \4\.d, \4\.d, \5\.d +** st1h \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32hf +hf_uzp1_d_two_op (v32hf x, v32hf y) +{ + return __builtin_shuffle (x, y, (v32hi) { PERM4 (0) }); +} + +/* +** bf_uzp1_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** uzp1 (z[0-9]+)\.d, \2\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32bf +bf_uzp1_d (v32bf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (0) }); +} + +/* +** bf_uzp1_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** uzp1 \3\.d, \3\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** uzp1 \4\.d, \4\.d, \5\.d +** st1h \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32bf +bf_uzp1_d_two_op (v32bf x, v32bf y) +{ + return __builtin_shuffle (x, y, (v32hi) { PERM4 (0) }); +} + +/* +** si_uzp1_d: +** ptrue (p[0-7])\.b, vl256 +** ld1w (z[0-9]+)\.d, \1/z, \[x0\] +** uzp1 (z[0-9]+)\.d, \2\.d, \2\.d +** st1w \3\.d, \1, \[x8\] +** ret +*/ +v32si +si_uzp1_d (v32si x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (0) }); +} + +/* +** sf_uzp1_d: +** ptrue (p[0-7])\.b, vl256 +** ld1w (z[0-9]+)\.d, \1/z, \[x0\] +** uzp1 (z[0-9]+)\.d, \2\.d, \2\.d +** st1w \3\.d, \1, \[x8\] +** ret +*/ +v32sf +sf_uzp1_d (v32sf x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (0) }); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/uzp2_2.c b/gcc/testsuite/gcc.target/aarch64/sve/uzp2_2.c new file mode 100644 index 0000000000000000000000000000000000000000..0d8eda567cf142c22ac50163a66be4678246208b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/uzp2_2.c @@ -0,0 +1,375 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +typedef unsigned 
char v128qi __attribute__((vector_size(128))); +typedef unsigned char v64qi __attribute__((vector_size(64))); +typedef unsigned char v32qi __attribute__((vector_size(32))); +typedef unsigned short v64hi __attribute__((vector_size(128))); +typedef unsigned short v32hi __attribute__((vector_size(64))); +typedef _Float16 v64hf __attribute__((vector_size(128))); +typedef _Float16 v32hf __attribute__((vector_size(64))); +typedef __bf16 v64bf __attribute__((vector_size(128))); +typedef __bf16 v32bf __attribute__((vector_size(64))); +typedef unsigned int v32si __attribute__((vector_size(128))); +typedef float v32sf __attribute__((vector_size(128))); + +#define PERM0(B) B, B + 2 +#define PERM1(B) PERM0 (B), PERM0 (B + 4) +#define PERM2(B) PERM1 (B), PERM1 (B + 8) +#define PERM3(B) PERM2 (B), PERM2 (B + 16) +#define PERM4(B) PERM3 (B), PERM3 (B + 32) +#define PERM5(B) PERM4 (B), PERM4 (B + 64) +#define PERM6(B) PERM5 (B), PERM5 (B + 128) + +/* +** qi_uzp2_h: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** uzp2 (z[0-9]+)\.h, \2\.h, \2\.h +** st1b \3\.h, \1, \[x8\] +** ret +*/ +v128qi +qi_uzp2_h (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (1) }); +} + +/* +** qi_uzp2_h_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1b (z[0-9]+)\.h, \1/z, \[x1\] +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** uzp2 \3\.h, \3\.h, \2\.h +** st1b \3\.h, \1, \[x8\] +** | +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** ld1b (z[0-9]+)\.h, \1/z, \[x1\] +** uzp2 \4\.h, \4\.h, \5\.h +** st1b \4\.h, \1, \[x8\] +** ) +** ret +*/ +v128qi +qi_uzp2_h_two_op (v128qi x, v128qi y) +{ + return __builtin_shuffle (x, y, (v128qi) { PERM6 (1) }); +} + +/* +** qi_uzp2_s: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** uzp2 (z[0-9]+)\.s, \2\.s, \2\.s +** st1b \3\.s, \1, \[x8\] +** ret +*/ +v64qi +qi_uzp2_s (v64qi x) +{ + return __builtin_shuffle (x, x, (v64qi) { PERM5 (1) }); +} + +/* +** qi_uzp2_s_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1b (z[0-9]+)\.s, \1/z, \[x1\] +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** uzp2 \3\.s, \3\.s, \2\.s +** st1b \3\.s, \1, \[x8\] +** | +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** ld1b (z[0-9]+)\.s, \1/z, \[x1\] +** uzp2 \4\.s, \4\.s, \5\.s +** st1b \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64qi +qi_uzp2_s_two_op (v64qi x, v64qi y) +{ + return __builtin_shuffle (x, y, (v64qi) { PERM5 (1) }); +} + +/* +** qi_uzp2_d: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.d, \1/z, \[x0\] +** uzp2 (z[0-9]+)\.d, \2\.d, \2\.d +** st1b \3\.d, \1, \[x8\] +** ret +*/ +v32qi +qi_uzp2_d (v32qi x) +{ + return __builtin_shuffle (x, x, (v32qi) { PERM4 (1) }); +} + +/* +** qi_uzp2_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1b (z[0-9]+)\.d, \1/z, \[x1\] +** ld1b (z[0-9]+)\.d, \1/z, \[x0\] +** uzp2 \3\.d, \3\.d, \2\.d +** st1b \3\.d, \1, \[x8\] +** | +** ld1b (z[0-9]+)\.d, \1/z, \[x0\] +** ld1b (z[0-9]+)\.d, \1/z, \[x1\] +** uzp2 \4\.d, \4\.d, \5\.d +** st1b \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32qi +qi_uzp2_d_two_op (v32qi x, v32qi y) +{ + return __builtin_shuffle (x, y, (v32qi) { PERM4 (1) }); +} + +/* +** hi_uzp2_s: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** uzp2 (z[0-9]+)\.s, \2\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64hi +hi_uzp2_s (v64hi x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) }); +} + +/* +** hi_uzp2_s_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** uzp2 \3\.s, \3\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** 
ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** uzp2 \4\.s, \4\.s, \5\.s +** st1h \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64hi +hi_uzp2_s_two_op (v64hi x, v64hi y) +{ + return __builtin_shuffle (x, y, (v64hi) { PERM5 (1) }); +} + +/* +** hf_uzp2_s: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** uzp2 (z[0-9]+)\.s, \2\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64hf +hf_uzp2_s (v64hf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) }); +} + +/* +** hf_uzp2_s_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** uzp2 \3\.s, \3\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** uzp2 \4\.s, \4\.s, \5\.s +** st1h \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64hf +hf_uzp2_s_two_op (v64hf x, v64hf y) +{ + return __builtin_shuffle (x, y, (v64hi) { PERM5 (1) }); +} + +/* +** bf_uzp2_s: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** uzp2 (z[0-9]+)\.s, \2\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64bf +bf_uzp2_s (v64bf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (1) }); +} + +/* +** bf_uzp2_s_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** uzp2 \3\.s, \3\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** uzp2 \4\.s, \4\.s, \5\.s +** st1h \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64bf +bf_uzp2_s_two_op (v64bf x, v64bf y) +{ + return __builtin_shuffle (x, y, (v64hi) { PERM5 (1) }); +} + +/* +** hi_uzp2_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** uzp2 (z[0-9]+)\.d, \2\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32hi +hi_uzp2_d (v32hi x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) }); +} + +/* +** hi_uzp2_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** uzp2 \3\.d, \3\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** uzp2 \4\.d, \4\.d, \5\.d +** st1h \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32hi +hi_uzp2_d_two_op (v32hi x, v32hi y) +{ + return __builtin_shuffle (x, y, (v32hi) { PERM4 (1) }); +} + +/* +** hf_uzp2_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** uzp2 (z[0-9]+)\.d, \2\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32hf +hf_uzp2_d (v32hf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) }); +} + +/* +** hf_uzp2_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** uzp2 \3\.d, \3\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** uzp2 \4\.d, \4\.d, \5\.d +** st1h \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32hf +hf_uzp2_d_two_op (v32hf x, v32hf y) +{ + return __builtin_shuffle (x, y, (v32hi) { PERM4 (1) }); +} + +/* +** bf_uzp2_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** uzp2 (z[0-9]+)\.d, \2\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32bf +bf_uzp2_d (v32bf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (1) }); +} + +/* +** bf_uzp2_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** uzp2 \3\.d, \3\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] 
+** uzp2 \4\.d, \4\.d, \5\.d +** st1h \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32bf +bf_uzp2_d_two_op (v32bf x, v32bf y) +{ + return __builtin_shuffle (x, y, (v32hi) { PERM4 (1) }); +} + +/* +** si_uzp2_d: +** ptrue (p[0-7])\.b, vl256 +** ld1w (z[0-9]+)\.d, \1/z, \[x0\] +** uzp2 (z[0-9]+)\.d, \2\.d, \2\.d +** st1w \3\.d, \1, \[x8\] +** ret +*/ +v32si +si_uzp2_d (v32si x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (1) }); +} + +/* +** sf_uzp2_d: +** ptrue (p[0-7])\.b, vl256 +** ld1w (z[0-9]+)\.d, \1/z, \[x0\] +** uzp2 (z[0-9]+)\.d, \2\.d, \2\.d +** st1w \3\.d, \1, \[x8\] +** ret +*/ +v32sf +sf_uzp2_d (v32sf x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (1) }); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/zip1_2.c b/gcc/testsuite/gcc.target/aarch64/sve/zip1_2.c new file mode 100644 index 0000000000000000000000000000000000000000..395b96f5f0d649f6768a64fac9644e88ba439161 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/zip1_2.c @@ -0,0 +1,403 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +typedef unsigned char v128qi __attribute__((vector_size(128))); +typedef unsigned char v64qi __attribute__((vector_size(64))); +typedef unsigned char v32qi __attribute__((vector_size(32))); +typedef unsigned short v64hi __attribute__((vector_size(128))); +typedef unsigned short v32hi __attribute__((vector_size(64))); +typedef _Float16 v64hf __attribute__((vector_size(128))); +typedef _Float16 v32hf __attribute__((vector_size(64))); +typedef __bf16 v64bf __attribute__((vector_size(128))); +typedef __bf16 v32bf __attribute__((vector_size(64))); +typedef unsigned int v32si __attribute__((vector_size(128))); +typedef float v32sf __attribute__((vector_size(128))); + +#define PERM0(B, C) B, B + C +#define PERM1(B, C) PERM0 (B, C), PERM0 (B + 1, C) +#define PERM2(B, C) PERM1 (B, C), PERM1 (B + 2, C) +#define PERM3(B, C) PERM2 (B, C), PERM2 (B + 4, C) +#define PERM4(B, C) PERM3 (B, C), PERM3 (B + 8, C) +#define PERM5(B, C) PERM4 (B, C), PERM4 (B + 16, C) +#define PERM6(B, C) PERM5 (B, C), PERM5 (B + 32, C) + +/* +** qi_zip1_h_a: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** zip1 (z[0-9]+)\.h, \2\.h, \2\.h +** st1b \3\.h, \1, \[x8\] +** ret +*/ +v128qi +qi_zip1_h_a (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (0, 0) }); +} + +/* +** qi_zip1_h_b: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** zip1 (z[0-9]+)\.h, \2\.h, \2\.h +** st1b \3\.h, \1, \[x8\] +** ret +*/ +v128qi +qi_zip1_h_b (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (0, 128) }); +} + +/* +** qi_zip1_h_c: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** zip1 (z[0-9]+)\.h, \2\.h, \2\.h +** st1b \3\.h, \1, \[x8\] +** ret +*/ +v128qi +qi_zip1_h_c (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (128, 0) }); +} + +/* +** qi_zip1_h_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1b (z[0-9]+)\.h, \1/z, \[x1\] +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** zip1 \3\.h, \3\.h, \2\.h +** st1b \3\.h, \1, \[x8\] +** | +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** ld1b (z[0-9]+)\.h, \1/z, \[x1\] +** zip1 \4\.h, \4\.h, \5\.h +** st1b \4\.h, \1, \[x8\] +** ) +** ret +*/ +v128qi +qi_zip1_h_two_op (v128qi x, v128qi y) +{ + return __builtin_shuffle (x, y, (v128qi) { PERM6 (0, 128) }); +} + +/* +** qi_zip1_s: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** zip1 
(z[0-9]+)\.s, \2\.s, \2\.s +** st1b \3\.s, \1, \[x8\] +** ret +*/ +v64qi +qi_zip1_s (v64qi x) +{ + return __builtin_shuffle (x, x, (v64qi) { PERM5 (0, 64) }); +} + +/* +** qi_zip1_s_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1b (z[0-9]+)\.s, \1/z, \[x1\] +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** zip1 \3\.s, \3\.s, \2\.s +** st1b \3\.s, \1, \[x8\] +** | +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** ld1b (z[0-9]+)\.s, \1/z, \[x1\] +** zip1 \4\.s, \4\.s, \5\.s +** st1b \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64qi +qi_zip1_s_two_op (v64qi x, v64qi y) +{ + return __builtin_shuffle (x, y, (v64qi) { PERM5 (0, 64) }); +} + +/* +** qi_zip1_d: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.d, \1/z, \[x0\] +** zip1 (z[0-9]+)\.d, \2\.d, \2\.d +** st1b \3\.d, \1, \[x8\] +** ret +*/ +v32qi +qi_zip1_d (v32qi x) +{ + return __builtin_shuffle (x, x, (v32qi) { PERM4 (0, 32) }); +} + +/* +** qi_zip1_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1b (z[0-9]+)\.d, \1/z, \[x1\] +** ld1b (z[0-9]+)\.d, \1/z, \[x0\] +** zip1 \3\.d, \3\.d, \2\.d +** st1b \3\.d, \1, \[x8\] +** | +** ld1b (z[0-9]+)\.d, \1/z, \[x0\] +** ld1b (z[0-9]+)\.d, \1/z, \[x1\] +** zip1 \4\.d, \4\.d, \5\.d +** st1b \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32qi +qi_zip1_d_two_op (v32qi x, v32qi y) +{ + return __builtin_shuffle (x, y, (v32qi) { PERM4 (0, 32) }); +} + +/* +** hi_zip1_s: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** zip1 (z[0-9]+)\.s, \2\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64hi +hi_zip1_s (v64hi x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (0, 64) }); +} + +/* +** hi_zip1_s_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** zip1 \3\.s, \3\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** zip1 \4\.s, \4\.s, \5\.s +** st1h \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64hi +hi_zip1_s_two_op (v64hi x, v64hi y) +{ + return __builtin_shuffle (x, y, (v64hi) { PERM5 (0, 64) }); +} + +/* +** hf_zip1_s: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** zip1 (z[0-9]+)\.s, \2\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64hf +hf_zip1_s (v64hf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (0, 64) }); +} + +/* +** hf_zip1_s_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** zip1 \3\.s, \3\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** zip1 \4\.s, \4\.s, \5\.s +** st1h \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64hf +hf_zip1_s_two_op (v64hf x, v64hf y) +{ + return __builtin_shuffle (x, y, (v64hi) { PERM5 (0, 64) }); +} + +/* +** bf_zip1_s: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** zip1 (z[0-9]+)\.s, \2\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64bf +bf_zip1_s (v64bf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (0, 64) }); +} + +/* +** bf_zip1_s_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** zip1 \3\.s, \3\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** zip1 \4\.s, \4\.s, \5\.s +** st1h \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64bf +bf_zip1_s_two_op (v64bf x, v64bf y) +{ + return __builtin_shuffle (x, y, (v64hi) { PERM5 (0, 64) }); +} + +/* +** hi_zip1_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** 
zip1 (z[0-9]+)\.d, \2\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32hi +hi_zip1_d (v32hi x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (0, 32) }); +} + +/* +** hi_zip1_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** zip1 \3\.d, \3\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** zip1 \4\.d, \4\.d, \5\.d +** st1h \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32hi +hi_zip1_d_two_op (v32hi x, v32hi y) +{ + return __builtin_shuffle (x, y, (v32hi) { PERM4 (0, 32) }); +} + +/* +** hf_zip1_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** zip1 (z[0-9]+)\.d, \2\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32hf +hf_zip1_d (v32hf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (0, 32) }); +} + +/* +** hf_zip1_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** zip1 \3\.d, \3\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** zip1 \4\.d, \4\.d, \5\.d +** st1h \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32hf +hf_zip1_d_two_op (v32hf x, v32hf y) +{ + return __builtin_shuffle (x, y, (v32hi) { PERM4 (0, 32) }); +} + +/* +** bf_zip1_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** zip1 (z[0-9]+)\.d, \2\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32bf +bf_zip1_d (v32bf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (0, 32) }); +} + +/* +** bf_zip1_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** zip1 \3\.d, \3\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** zip1 \4\.d, \4\.d, \5\.d +** st1h \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32bf +bf_zip1_d_two_op (v32bf x, v32bf y) +{ + return __builtin_shuffle (x, y, (v32hi) { PERM4 (0, 32) }); +} + +/* +** si_zip1_d: +** ptrue (p[0-7])\.b, vl256 +** ld1w (z[0-9]+)\.d, \1/z, \[x0\] +** zip1 (z[0-9]+)\.d, \2\.d, \2\.d +** st1w \3\.d, \1, \[x8\] +** ret +*/ +v32si +si_zip1_d (v32si x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (0, 32) }); +} + +/* +** sf_zip1_d: +** ptrue (p[0-7])\.b, vl256 +** ld1w (z[0-9]+)\.d, \1/z, \[x0\] +** zip1 (z[0-9]+)\.d, \2\.d, \2\.d +** st1w \3\.d, \1, \[x8\] +** ret +*/ +v32sf +sf_zip1_d (v32sf x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (0, 32) }); +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/zip2_2.c b/gcc/testsuite/gcc.target/aarch64/sve/zip2_2.c new file mode 100644 index 0000000000000000000000000000000000000000..9158ace156447314530af8cb358c92431d00a815 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/zip2_2.c @@ -0,0 +1,403 @@ +/* { dg-do assemble { target aarch64_asm_sve_ok } } */ +/* { dg-options "-O -msve-vector-bits=2048 -mlittle-endian --save-temps" } */ +/* { dg-final { check-function-bodies "**" "" } } */ + +typedef unsigned char v128qi __attribute__((vector_size(128))); +typedef unsigned char v64qi __attribute__((vector_size(64))); +typedef unsigned char v32qi __attribute__((vector_size(32))); +typedef unsigned short v64hi __attribute__((vector_size(128))); +typedef unsigned short v32hi __attribute__((vector_size(64))); +typedef _Float16 v64hf __attribute__((vector_size(128))); +typedef _Float16 v32hf __attribute__((vector_size(64))); +typedef __bf16 v64bf __attribute__((vector_size(128))); +typedef 
__bf16 v32bf __attribute__((vector_size(64))); +typedef unsigned int v32si __attribute__((vector_size(128))); +typedef float v32sf __attribute__((vector_size(128))); + +#define PERM0(B, C) B, B + C +#define PERM1(B, C) PERM0 (B, C), PERM0 (B + 1, C) +#define PERM2(B, C) PERM1 (B, C), PERM1 (B + 2, C) +#define PERM3(B, C) PERM2 (B, C), PERM2 (B + 4, C) +#define PERM4(B, C) PERM3 (B, C), PERM3 (B + 8, C) +#define PERM5(B, C) PERM4 (B, C), PERM4 (B + 16, C) +#define PERM6(B, C) PERM5 (B, C), PERM5 (B + 32, C) + +/* +** qi_zip2_h_a: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** zip2 (z[0-9]+)\.h, \2\.h, \2\.h +** st1b \3\.h, \1, \[x8\] +** ret +*/ +v128qi +qi_zip2_h_a (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (64, 128) }); +} + +/* +** qi_zip2_h_b: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** zip2 (z[0-9]+)\.h, \2\.h, \2\.h +** st1b \3\.h, \1, \[x8\] +** ret +*/ +v128qi +qi_zip2_h_b (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (64, 128) }); +} + +/* +** qi_zip2_h_c: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** zip2 (z[0-9]+)\.h, \2\.h, \2\.h +** st1b \3\.h, \1, \[x8\] +** ret +*/ +v128qi +qi_zip2_h_c (v128qi x) +{ + return __builtin_shuffle (x, x, (v128qi) { PERM6 (192, 0) }); +} + +/* +** qi_zip2_h_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1b (z[0-9]+)\.h, \1/z, \[x1\] +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** zip2 \3\.h, \3\.h, \2\.h +** st1b \3\.h, \1, \[x8\] +** | +** ld1b (z[0-9]+)\.h, \1/z, \[x0\] +** ld1b (z[0-9]+)\.h, \1/z, \[x1\] +** zip2 \4\.h, \4\.h, \5\.h +** st1b \4\.h, \1, \[x8\] +** ) +** ret +*/ +v128qi +qi_zip2_h_two_op (v128qi x, v128qi y) +{ + return __builtin_shuffle (x, y, (v128qi) { PERM6 (64, 128) }); +} + +/* +** qi_zip2_s: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** zip2 (z[0-9]+)\.s, \2\.s, \2\.s +** st1b \3\.s, \1, \[x8\] +** ret +*/ +v64qi +qi_zip2_s (v64qi x) +{ + return __builtin_shuffle (x, x, (v64qi) { PERM5 (32, 64) }); +} + +/* +** qi_zip2_s_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1b (z[0-9]+)\.s, \1/z, \[x1\] +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** zip2 \3\.s, \3\.s, \2\.s +** st1b \3\.s, \1, \[x8\] +** | +** ld1b (z[0-9]+)\.s, \1/z, \[x0\] +** ld1b (z[0-9]+)\.s, \1/z, \[x1\] +** zip2 \4\.s, \4\.s, \5\.s +** st1b \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64qi +qi_zip2_s_two_op (v64qi x, v64qi y) +{ + return __builtin_shuffle (x, y, (v64qi) { PERM5 (32, 64) }); +} + +/* +** qi_zip2_d: +** ptrue (p[0-7])\.b, vl256 +** ld1b (z[0-9]+)\.d, \1/z, \[x0\] +** zip2 (z[0-9]+)\.d, \2\.d, \2\.d +** st1b \3\.d, \1, \[x8\] +** ret +*/ +v32qi +qi_zip2_d (v32qi x) +{ + return __builtin_shuffle (x, x, (v32qi) { PERM4 (16, 32) }); +} + +/* +** qi_zip2_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1b (z[0-9]+)\.d, \1/z, \[x1\] +** ld1b (z[0-9]+)\.d, \1/z, \[x0\] +** zip2 \3\.d, \3\.d, \2\.d +** st1b \3\.d, \1, \[x8\] +** | +** ld1b (z[0-9]+)\.d, \1/z, \[x0\] +** ld1b (z[0-9]+)\.d, \1/z, \[x1\] +** zip2 \4\.d, \4\.d, \5\.d +** st1b \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32qi +qi_zip2_d_two_op (v32qi x, v32qi y) +{ + return __builtin_shuffle (x, y, (v32qi) { PERM4 (16, 32) }); +} + +/* +** hi_zip2_s: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** zip2 (z[0-9]+)\.s, \2\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64hi +hi_zip2_s (v64hi x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (32, 64) }); +} + +/* +** hi_zip2_s_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] 
+** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** zip2 \3\.s, \3\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** zip2 \4\.s, \4\.s, \5\.s +** st1h \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64hi +hi_zip2_s_two_op (v64hi x, v64hi y) +{ + return __builtin_shuffle (x, y, (v64hi) { PERM5 (32, 64) }); +} + +/* +** hf_zip2_s: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** zip2 (z[0-9]+)\.s, \2\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64hf +hf_zip2_s (v64hf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (32, 64) }); +} + +/* +** hf_zip2_s_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** zip2 \3\.s, \3\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** zip2 \4\.s, \4\.s, \5\.s +** st1h \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64hf +hf_zip2_s_two_op (v64hf x, v64hf y) +{ + return __builtin_shuffle (x, y, (v64hi) { PERM5 (32, 64) }); +} + +/* +** bf_zip2_s: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** zip2 (z[0-9]+)\.s, \2\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** ret +*/ +v64bf +bf_zip2_s (v64bf x) +{ + return __builtin_shuffle (x, x, (v64hi) { PERM5 (32, 64) }); +} + +/* +** bf_zip2_s_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** zip2 \3\.s, \3\.s, \2\.s +** st1h \3\.s, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.s, \1/z, \[x0\] +** ld1h (z[0-9]+)\.s, \1/z, \[x1\] +** zip2 \4\.s, \4\.s, \5\.s +** st1h \4\.s, \1, \[x8\] +** ) +** ret +*/ +v64bf +bf_zip2_s_two_op (v64bf x, v64bf y) +{ + return __builtin_shuffle (x, y, (v64hi) { PERM5 (32, 64) }); +} + +/* +** hi_zip2_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** zip2 (z[0-9]+)\.d, \2\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32hi +hi_zip2_d (v32hi x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (16, 32) }); +} + +/* +** hi_zip2_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** zip2 \3\.d, \3\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** zip2 \4\.d, \4\.d, \5\.d +** st1h \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32hi +hi_zip2_d_two_op (v32hi x, v32hi y) +{ + return __builtin_shuffle (x, y, (v32hi) { PERM4 (16, 32) }); +} + +/* +** hf_zip2_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** zip2 (z[0-9]+)\.d, \2\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32hf +hf_zip2_d (v32hf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (16, 32) }); +} + +/* +** hf_zip2_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** zip2 \3\.d, \3\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** zip2 \4\.d, \4\.d, \5\.d +** st1h \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32hf +hf_zip2_d_two_op (v32hf x, v32hf y) +{ + return __builtin_shuffle (x, y, (v32hi) { PERM4 (16, 32) }); +} + +/* +** bf_zip2_d: +** ptrue (p[0-7])\.b, vl256 +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** zip2 (z[0-9]+)\.d, \2\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** ret +*/ +v32bf +bf_zip2_d (v32bf x) +{ + return __builtin_shuffle (x, x, (v32hi) { PERM4 (16, 32) }); +} + +/* +** bf_zip2_d_two_op: +** ptrue (p[0-7])\.b, vl256 +** ( +** ld1h 
(z[0-9]+)\.d, \1/z, \[x1\] +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** zip2 \3\.d, \3\.d, \2\.d +** st1h \3\.d, \1, \[x8\] +** | +** ld1h (z[0-9]+)\.d, \1/z, \[x0\] +** ld1h (z[0-9]+)\.d, \1/z, \[x1\] +** zip2 \4\.d, \4\.d, \5\.d +** st1h \4\.d, \1, \[x8\] +** ) +** ret +*/ +v32bf +bf_zip2_d_two_op (v32bf x, v32bf y) +{ + return __builtin_shuffle (x, y, (v32hi) { PERM4 (16, 32) }); +} + +/* +** si_zip2_d: +** ptrue (p[0-7])\.b, vl256 +** ld1w (z[0-9]+)\.d, \1/z, \[x0\] +** zip2 (z[0-9]+)\.d, \2\.d, \2\.d +** st1w \3\.d, \1, \[x8\] +** ret +*/ +v32si +si_zip2_d (v32si x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (16, 32) }); +} + +/* +** sf_zip2_d: +** ptrue (p[0-7])\.b, vl256 +** ld1w (z[0-9]+)\.d, \1/z, \[x0\] +** zip2 (z[0-9]+)\.d, \2\.d, \2\.d +** st1w \3\.d, \1, \[x8\] +** ret +*/ +v32sf +sf_zip2_d (v32sf x) +{ + return __builtin_shuffle (x, x, (v32si) { PERM4 (16, 32) }); +}
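
For reviewers unfamiliar with the index-vector trick used throughout these tests, below is a minimal host-side sketch (not part of the patch, and independent of SVE) of how the recursive PERM macros in zip1_2.c/zip2_2.c build the __builtin_shuffle selectors.  It reuses the same two-argument macro scheme at a reduced depth (PERM3, i.e. 16 result elements instead of 128) and checks the expansion against the usual definition of ZIP1/ZIP2, which interleave the low or high halves of the two concatenated inputs.  The 16-element width and the expectation formulas are illustrative choices only.

/* Host-side sketch only: expand the zip-style PERM macros at depth 3 and
   print the selector indices they produce, together with the indices that
   the ZIP1/ZIP2 definition would pick (X supplies indices 0-15 and Y
   indices 16-31, as __builtin_shuffle sees the two operands).  */

#include <stdio.h>

#define PERM0(B, C) B, B + C
#define PERM1(B, C) PERM0 (B, C), PERM0 (B + 1, C)
#define PERM2(B, C) PERM1 (B, C), PERM1 (B + 2, C)
#define PERM3(B, C) PERM2 (B, C), PERM2 (B + 4, C)

int
main (void)
{
  /* 16 result elements for two 16-element inputs X and Y.  */
  int zip1[] = { PERM3 (0, 16) };	/* x[0], y[0], x[1], y[1], ...  */
  int zip2[] = { PERM3 (8, 16) };	/* x[8], y[8], x[9], y[9], ...  */
  int n = sizeof (zip1) / sizeof (zip1[0]);

  for (int i = 0; i < n; i++)
    {
      /* ZIP1 interleaves the low halves, ZIP2 the high halves.  */
      int expect1 = (i & 1 ? 16 : 0) + i / 2;
      int expect2 = (i & 1 ? 16 : 0) + 8 + i / 2;
      printf ("%2d: zip1 %2d (want %2d)  zip2 %2d (want %2d)\n",
	      i, zip1[i], expect1, zip2[i], expect2);
    }
  return 0;
}

The one-operand variants in the tests (for example qi_zip1_h_a and qi_zip1_h_c) encode the same permutation with indices drawn entirely from the first or entirely from the second copy of the input, which is why they are expected to match the same single zip1 instruction as the two-operand form.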