Skip to content
Snippets Groups Projects
  • Juzhe-Zhong's avatar
    c4ac073d
    RISC-V: Make known NITERS loop be aware of dynamic lmul cost model liveness information · c4ac073d
    Juzhe-Zhong authored
    Consider the following case:
    
    int f[12][100];
    
    void bad1(int v1, int v2)
    {
      for (int r = 0; r < 100; r += 4)
        {
          int i = r + 1;
          f[0][r] = f[1][r] * (f[2][r]) - f[1][i] * (f[2][i]);
          f[0][i] = f[1][r] * (f[2][i]) + f[1][i] * (f[2][r]);
          f[0][r+2] = f[1][r+2] * (f[2][r+2]) - f[1][i+2] * (f[2][i+2]);
          f[0][i+2] = f[1][r+2] * (f[2][i+2]) + f[1][i+2] * (f[2][r+2]);
        }
    }
    
    Before this patch, we blindly pick LMUL = 8 VLS:
    
            lui     a4,%hi(f)
            addi    a4,a4,%lo(f)
            addi    sp,sp,-592
            addi    a3,a4,800
            lui     a5,%hi(.LANCHOR0)
            vl8re32.v       v24,0(a3)
            addi    a5,a5,%lo(.LANCHOR0)
            addi    a1,a4,400
            addi    a3,sp,140
            vl8re32.v       v16,0(a1)
            vl4re16.v       v4,0(a5)
            addi    a7,a5,192
            vs4r.v  v4,0(a3)
            addi    t0,a5,64
            addi    a3,sp,336
            li      t2,32
            addi    a2,a5,128
            vsetvli a5,zero,e32,m8,ta,ma
            vrgatherei16.vv v8,v16,v4
            vmul.vv v8,v8,v24
            vl8re32.v       v0,0(a7)
            vs8r.v  v8,0(a3)
            vmsltu.vx       v8,v0,t2
            addi    a3,sp,12
            addi    t2,sp,204
            vsm.v   v8,0(t2)
            vl4re16.v       v4,0(t0)
            vl4re16.v       v0,0(a2)
            vs4r.v  v4,0(a3)
            addi    t0,sp,336
            vrgatherei16.vv v8,v24,v4
            addi    a3,sp,208
            vrgatherei16.vv v24,v16,v0
            vs4r.v  v0,0(a3)
            vmul.vv v8,v8,v24
            vlm.v   v0,0(t2)
            vl8re32.v       v24,0(t0)
            addi    a3,sp,208
            vsub.vv v16,v24,v8
            addi    t6,a4,528
            vadd.vv v8,v24,v8
            addi    t5,a4,928
            vmerge.vvm      v8,v8,v16,v0
            addi    t3,a4,128
            vs8r.v  v8,0(a4)
            addi    t4,a4,1056
            addi    t1,a4,656
            addi    a0,a4,256
            addi    a6,a4,1184
            addi    a1,a4,784
            addi    a7,a4,384
            addi    a4,sp,140
            vl4re16.v       v0,0(a3)
            vl8re32.v       v24,0(t6)
            vl4re16.v       v4,0(a4)
            vrgatherei16.vv v16,v24,v0
            addi    a3,sp,12
            vs8r.v  v16,0(t0)
            vl8re32.v       v8,0(t5)
            vrgatherei16.vv v16,v24,v4
            vl4re16.v       v4,0(a3)
            vrgatherei16.vv v24,v8,v4
            vmul.vv v16,v16,v8
            vl8re32.v       v8,0(t0)
            vmul.vv v8,v8,v24
            vsub.vv v24,v16,v8
            vlm.v   v0,0(t2)
            addi    a3,sp,208
            vadd.vv v8,v8,v16
            vl8re32.v       v16,0(t4)
            vmerge.vvm      v8,v8,v24,v0
            vrgatherei16.vv v24,v16,v4
            vs8r.v  v24,0(t0)
            vl4re16.v       v28,0(a3)
            addi    a3,sp,464
            vs8r.v  v8,0(t3)
            vl8re32.v       v8,0(t1)
            vrgatherei16.vv v0,v8,v28
            vs8r.v  v0,0(a3)
            addi    a3,sp,140
            vl4re16.v       v24,0(a3)
            addi    a3,sp,464
            vrgatherei16.vv v0,v8,v24
            vl8re32.v       v24,0(t0)
            vmv8r.v v8,v0
            vl8re32.v       v0,0(a3)
            vmul.vv v8,v8,v16
            vmul.vv v24,v24,v0
            vsub.vv v16,v8,v24
            vadd.vv v8,v8,v24
            vsetivli        zero,4,e32,m8,ta,ma
            vle32.v v24,0(a6)
            vsetvli a4,zero,e32,m8,ta,ma
            addi    a4,sp,12
            vlm.v   v0,0(t2)
            vmerge.vvm      v8,v8,v16,v0
            vl4re16.v       v16,0(a4)
            vrgatherei16.vv v0,v24,v16
            vsetivli        zero,4,e32,m8,ta,ma
            vs8r.v  v0,0(a4)
            addi    a4,sp,208
            vl4re16.v       v0,0(a4)
            vs8r.v  v8,0(a0)
            vle32.v v16,0(a1)
            vsetvli a5,zero,e32,m8,ta,ma
            vrgatherei16.vv v8,v16,v0
            vs8r.v  v8,0(a4)
            addi    a4,sp,140
            vl4re16.v       v4,0(a4)
            addi    a5,sp,12
            vrgatherei16.vv v8,v16,v4
            vl8re32.v       v0,0(a5)
            vsetivli        zero,4,e32,m8,ta,ma
            addi    a5,sp,208
            vmv8r.v v16,v8
            vl8re32.v       v8,0(a5)
            vmul.vv v24,v24,v16
            vmul.vv v8,v0,v8
            vsub.vv v16,v24,v8
            vadd.vv v8,v8,v24
            vsetvli a5,zero,e8,m2,ta,ma
            vlm.v   v0,0(t2)
            vsetivli        zero,4,e32,m8,ta,ma
            vmerge.vvm      v8,v8,v16,v0
            vse32.v v8,0(a7)
            addi    sp,sp,592
            jr      ra
    
    This patch makes loops with known NITERS aware of the liveness estimation. After this patch, LMUL = 4 is chosen:
    
    	lui	a5,%hi(f)
    	addi	a5,a5,%lo(f)
    	addi	a3,a5,400
    	addi	a4,a5,800
    	vsetivli	zero,8,e32,m2,ta,ma
    	vlseg4e32.v	v16,(a3)
    	vlseg4e32.v	v8,(a4)
    	vmul.vv	v2,v8,v16
    	addi	a3,a5,528
    	vmv.v.v	v24,v10
    	vnmsub.vv	v24,v18,v2
    	addi	a4,a5,928
    	vmul.vv	v2,v12,v22
    	vmul.vv	v6,v8,v18
    	vmv.v.v	v30,v2
    	vmacc.vv	v30,v14,v20
    	vmv.v.v	v26,v6
    	vmacc.vv	v26,v10,v16
    	vmul.vv	v4,v12,v20
    	vmv.v.v	v28,v14
    	vnmsub.vv	v28,v22,v4
    	vsseg4e32.v	v24,(a5)
    	vlseg4e32.v	v16,(a3)
    	vlseg4e32.v	v8,(a4)
    	vmul.vv	v2,v8,v16
    	addi	a6,a5,128
    	vmv.v.v	v24,v10
    	vnmsub.vv	v24,v18,v2
    	addi	a0,a5,656
    	vmul.vv	v2,v12,v22
    	addi	a1,a5,1056
    	vmv.v.v	v30,v2
    	vmacc.vv	v30,v14,v20
    	vmul.vv	v6,v8,v18
    	vmul.vv	v4,v12,v20
    	vmv.v.v	v26,v6
    	vmacc.vv	v26,v10,v16
    	vmv.v.v	v28,v14
    	vnmsub.vv	v28,v22,v4
    	vsseg4e32.v	v24,(a6)
    	vlseg4e32.v	v16,(a0)
    	vlseg4e32.v	v8,(a1)
    	vmul.vv	v2,v8,v16
    	addi	a2,a5,256
    	vmv.v.v	v24,v10
    	vnmsub.vv	v24,v18,v2
    	addi	a3,a5,784
    	vmul.vv	v2,v12,v22
    	addi	a4,a5,1184
    	vmv.v.v	v30,v2
    	vmacc.vv	v30,v14,v20
    	vmul.vv	v6,v8,v18
    	vmul.vv	v4,v12,v20
    	vmv.v.v	v26,v6
    	vmacc.vv	v26,v10,v16
    	vmv.v.v	v28,v14
    	vnmsub.vv	v28,v22,v4
    	addi	a5,a5,384
    	vsseg4e32.v	v24,(a2)
    	vsetivli	zero,1,e32,m2,ta,ma
    	vlseg4e32.v	v16,(a3)
    	vlseg4e32.v	v8,(a4)
    	vmul.vv	v2,v16,v8
    	vmul.vv	v6,v18,v8
    	vmv.v.v	v24,v18
    	vnmsub.vv	v24,v10,v2
    	vmul.vv	v4,v20,v12
    	vmul.vv	v2,v22,v12
    	vmv.v.v	v26,v6
    	vmacc.vv	v26,v16,v10
    	vmv.v.v	v28,v22
    	vnmsub.vv	v28,v14,v4
    	vmv.v.v	v30,v2
    	vmacc.vv	v30,v20,v14
    	vsseg4e32.v	v24,(a5)
    	ret
    
    Tested on both RV32 and RV64 with no regressions.
    
    	PR target/113112
    
    gcc/ChangeLog:
    
    	* config/riscv/riscv-vector-costs.cc (is_gimple_assign_or_call): New function.
    	(get_first_lane_point): Ditto.
    	(get_last_lane_point): Ditto.
    	(max_number_of_live_regs): Refine live point dump.
    	(compute_estimated_lmul): Make known NITERS loop be aware of liveness.
    	(costs::better_main_loop_than_p): Ditto.
    	* config/riscv/riscv-vector-costs.h (struct stmt_point): Add new member.
    
    gcc/testsuite/ChangeLog:
    
    	* gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c:
    	* gcc.dg/vect/costmodel/riscv/rvv/pr113112-3.c: New test.
    c4ac073d
    History
    RISC-V: Make known NITERS loop be aware of dynamic lmul cost model liveness information
    Juzhe-Zhong authored
    Consider the following case:
    
    int f[12][100];
    
    void bad1(int v1, int v2)
    {
      for (int r = 0; r < 100; r += 4)
        {
          int i = r + 1;
          f[0][r] = f[1][r] * (f[2][r]) - f[1][i] * (f[2][i]);
          f[0][i] = f[1][r] * (f[2][i]) + f[1][i] * (f[2][r]);
          f[0][r+2] = f[1][r+2] * (f[2][r+2]) - f[1][i+2] * (f[2][i+2]);
          f[0][i+2] = f[1][r+2] * (f[2][i+2]) + f[1][i+2] * (f[2][r+2]);
        }
    }
    
    Before this patch, we blindly pick LMUL = 8 VLS:
    
            lui     a4,%hi(f)
            addi    a4,a4,%lo(f)
            addi    sp,sp,-592
            addi    a3,a4,800
            lui     a5,%hi(.LANCHOR0)
            vl8re32.v       v24,0(a3)
            addi    a5,a5,%lo(.LANCHOR0)
            addi    a1,a4,400
            addi    a3,sp,140
            vl8re32.v       v16,0(a1)
            vl4re16.v       v4,0(a5)
            addi    a7,a5,192
            vs4r.v  v4,0(a3)
            addi    t0,a5,64
            addi    a3,sp,336
            li      t2,32
            addi    a2,a5,128
            vsetvli a5,zero,e32,m8,ta,ma
            vrgatherei16.vv v8,v16,v4
            vmul.vv v8,v8,v24
            vl8re32.v       v0,0(a7)
            vs8r.v  v8,0(a3)
            vmsltu.vx       v8,v0,t2
            addi    a3,sp,12
            addi    t2,sp,204
            vsm.v   v8,0(t2)
            vl4re16.v       v4,0(t0)
            vl4re16.v       v0,0(a2)
            vs4r.v  v4,0(a3)
            addi    t0,sp,336
            vrgatherei16.vv v8,v24,v4
            addi    a3,sp,208
            vrgatherei16.vv v24,v16,v0
            vs4r.v  v0,0(a3)
            vmul.vv v8,v8,v24
            vlm.v   v0,0(t2)
            vl8re32.v       v24,0(t0)
            addi    a3,sp,208
            vsub.vv v16,v24,v8
            addi    t6,a4,528
            vadd.vv v8,v24,v8
            addi    t5,a4,928
            vmerge.vvm      v8,v8,v16,v0
            addi    t3,a4,128
            vs8r.v  v8,0(a4)
            addi    t4,a4,1056
            addi    t1,a4,656
            addi    a0,a4,256
            addi    a6,a4,1184
            addi    a1,a4,784
            addi    a7,a4,384
            addi    a4,sp,140
            vl4re16.v       v0,0(a3)
            vl8re32.v       v24,0(t6)
            vl4re16.v       v4,0(a4)
            vrgatherei16.vv v16,v24,v0
            addi    a3,sp,12
            vs8r.v  v16,0(t0)
            vl8re32.v       v8,0(t5)
            vrgatherei16.vv v16,v24,v4
            vl4re16.v       v4,0(a3)
            vrgatherei16.vv v24,v8,v4
            vmul.vv v16,v16,v8
            vl8re32.v       v8,0(t0)
            vmul.vv v8,v8,v24
            vsub.vv v24,v16,v8
            vlm.v   v0,0(t2)
            addi    a3,sp,208
            vadd.vv v8,v8,v16
            vl8re32.v       v16,0(t4)
            vmerge.vvm      v8,v8,v24,v0
            vrgatherei16.vv v24,v16,v4
            vs8r.v  v24,0(t0)
            vl4re16.v       v28,0(a3)
            addi    a3,sp,464
            vs8r.v  v8,0(t3)
            vl8re32.v       v8,0(t1)
            vrgatherei16.vv v0,v8,v28
            vs8r.v  v0,0(a3)
            addi    a3,sp,140
            vl4re16.v       v24,0(a3)
            addi    a3,sp,464
            vrgatherei16.vv v0,v8,v24
            vl8re32.v       v24,0(t0)
            vmv8r.v v8,v0
            vl8re32.v       v0,0(a3)
            vmul.vv v8,v8,v16
            vmul.vv v24,v24,v0
            vsub.vv v16,v8,v24
            vadd.vv v8,v8,v24
            vsetivli        zero,4,e32,m8,ta,ma
            vle32.v v24,0(a6)
            vsetvli a4,zero,e32,m8,ta,ma
            addi    a4,sp,12
            vlm.v   v0,0(t2)
            vmerge.vvm      v8,v8,v16,v0
            vl4re16.v       v16,0(a4)
            vrgatherei16.vv v0,v24,v16
            vsetivli        zero,4,e32,m8,ta,ma
            vs8r.v  v0,0(a4)
            addi    a4,sp,208
            vl4re16.v       v0,0(a4)
            vs8r.v  v8,0(a0)
            vle32.v v16,0(a1)
            vsetvli a5,zero,e32,m8,ta,ma
            vrgatherei16.vv v8,v16,v0
            vs8r.v  v8,0(a4)
            addi    a4,sp,140
            vl4re16.v       v4,0(a4)
            addi    a5,sp,12
            vrgatherei16.vv v8,v16,v4
            vl8re32.v       v0,0(a5)
            vsetivli        zero,4,e32,m8,ta,ma
            addi    a5,sp,208
            vmv8r.v v16,v8
            vl8re32.v       v8,0(a5)
            vmul.vv v24,v24,v16
            vmul.vv v8,v0,v8
            vsub.vv v16,v24,v8
            vadd.vv v8,v8,v24
            vsetvli a5,zero,e8,m2,ta,ma
            vlm.v   v0,0(t2)
            vsetivli        zero,4,e32,m8,ta,ma
            vmerge.vvm      v8,v8,v16,v0
            vse32.v v8,0(a7)
            addi    sp,sp,592
            jr      ra
    
    This patch makes loops with known NITERS aware of the liveness estimation. After this patch, LMUL = 4 is chosen:
    
    	lui	a5,%hi(f)
    	addi	a5,a5,%lo(f)
    	addi	a3,a5,400
    	addi	a4,a5,800
    	vsetivli	zero,8,e32,m2,ta,ma
    	vlseg4e32.v	v16,(a3)
    	vlseg4e32.v	v8,(a4)
    	vmul.vv	v2,v8,v16
    	addi	a3,a5,528
    	vmv.v.v	v24,v10
    	vnmsub.vv	v24,v18,v2
    	addi	a4,a5,928
    	vmul.vv	v2,v12,v22
    	vmul.vv	v6,v8,v18
    	vmv.v.v	v30,v2
    	vmacc.vv	v30,v14,v20
    	vmv.v.v	v26,v6
    	vmacc.vv	v26,v10,v16
    	vmul.vv	v4,v12,v20
    	vmv.v.v	v28,v14
    	vnmsub.vv	v28,v22,v4
    	vsseg4e32.v	v24,(a5)
    	vlseg4e32.v	v16,(a3)
    	vlseg4e32.v	v8,(a4)
    	vmul.vv	v2,v8,v16
    	addi	a6,a5,128
    	vmv.v.v	v24,v10
    	vnmsub.vv	v24,v18,v2
    	addi	a0,a5,656
    	vmul.vv	v2,v12,v22
    	addi	a1,a5,1056
    	vmv.v.v	v30,v2
    	vmacc.vv	v30,v14,v20
    	vmul.vv	v6,v8,v18
    	vmul.vv	v4,v12,v20
    	vmv.v.v	v26,v6
    	vmacc.vv	v26,v10,v16
    	vmv.v.v	v28,v14
    	vnmsub.vv	v28,v22,v4
    	vsseg4e32.v	v24,(a6)
    	vlseg4e32.v	v16,(a0)
    	vlseg4e32.v	v8,(a1)
    	vmul.vv	v2,v8,v16
    	addi	a2,a5,256
    	vmv.v.v	v24,v10
    	vnmsub.vv	v24,v18,v2
    	addi	a3,a5,784
    	vmul.vv	v2,v12,v22
    	addi	a4,a5,1184
    	vmv.v.v	v30,v2
    	vmacc.vv	v30,v14,v20
    	vmul.vv	v6,v8,v18
    	vmul.vv	v4,v12,v20
    	vmv.v.v	v26,v6
    	vmacc.vv	v26,v10,v16
    	vmv.v.v	v28,v14
    	vnmsub.vv	v28,v22,v4
    	addi	a5,a5,384
    	vsseg4e32.v	v24,(a2)
    	vsetivli	zero,1,e32,m2,ta,ma
    	vlseg4e32.v	v16,(a3)
    	vlseg4e32.v	v8,(a4)
    	vmul.vv	v2,v16,v8
    	vmul.vv	v6,v18,v8
    	vmv.v.v	v24,v18
    	vnmsub.vv	v24,v10,v2
    	vmul.vv	v4,v20,v12
    	vmul.vv	v2,v22,v12
    	vmv.v.v	v26,v6
    	vmacc.vv	v26,v16,v10
    	vmv.v.v	v28,v22
    	vnmsub.vv	v28,v14,v4
    	vmv.v.v	v30,v2
    	vmacc.vv	v30,v20,v14
    	vsseg4e32.v	v24,(a5)
    	ret
    
    Tested on both RV32 and RV64 with no regressions.
    
    	PR target/113112
    
    gcc/ChangeLog:
    
    	* config/riscv/riscv-vector-costs.cc (is_gimple_assign_or_call): New function.
    	(get_first_lane_point): Ditto.
    	(get_last_lane_point): Ditto.
    	(max_number_of_live_regs): Refine live point dump.
    	(compute_estimated_lmul): Make known NITERS loop be aware of liveness.
    	(costs::better_main_loop_than_p): Ditto.
    	* config/riscv/riscv-vector-costs.h (struct stmt_point): Add new member.
    
    gcc/testsuite/ChangeLog:
    
    	* gcc.dg/vect/costmodel/riscv/rvv/pr113112-1.c:
    	* gcc.dg/vect/costmodel/riscv/rvv/pr113112-3.c: New test.