diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index eabb3248ea0085409ad7dc7a18e852f967f5252a..c1ec92ffb15093e58782807671652ac8e4292b61 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -430,6 +430,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS] #define TARGET_FUSE_ALU_AND_BRANCH \ ix86_tune_features[X86_TUNE_FUSE_ALU_AND_BRANCH] +#define TARGET_FUSE_MOV_AND_ALU \ + ix86_tune_features[X86_TUNE_FUSE_MOV_AND_ALU] #define TARGET_OPT_AGU ix86_tune_features[X86_TUNE_OPT_AGU] #define TARGET_AVOID_LEA_FOR_ADDR \ ix86_tune_features[X86_TUNE_AVOID_LEA_FOR_ADDR] diff --git a/gcc/config/i386/x86-tune-sched.cc b/gcc/config/i386/x86-tune-sched.cc index d77298b0e34dc403600078da62ed510d524fbb95..c6d5426ae8d383e693fa80745ad8b685d954c780 100644 --- a/gcc/config/i386/x86-tune-sched.cc +++ b/gcc/config/i386/x86-tune-sched.cc @@ -67,7 +67,6 @@ ix86_issue_rate (void) case PROCESSOR_ZNVER2: case PROCESSOR_ZNVER3: case PROCESSOR_ZNVER4: - case PROCESSOR_ZNVER5: case PROCESSOR_CORE2: case PROCESSOR_NEHALEM: case PROCESSOR_SANDYBRIDGE: @@ -91,6 +90,13 @@ ix86_issue_rate (void) return 5; case PROCESSOR_SAPPHIRERAPIDS: + /* For znver5 decoder can handle 4 or 8 instructions per cycle, + op cache 12 instruction/cycle, dispatch 8 instructions + integer rename 8 instructions and Fp 6 instructions. + + The scheduler, without understanding out of order nature of the CPU + is unlikely going to be able to fill all of these. */ + case PROCESSOR_ZNVER5: return 6; default: @@ -434,6 +440,8 @@ ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost, enum attr_unit unit = get_attr_unit (insn); int loadcost; + /* TODO: On znver5 complex addressing modes have + greater latency. */ if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN) loadcost = 4; else @@ -563,6 +571,60 @@ ix86_macro_fusion_p () return TARGET_FUSE_CMP_AND_BRANCH; } +static bool +ix86_fuse_mov_alu_p (rtx_insn *mov, rtx_insn *alu) +{ + /* Validate mov: + - It should be reg-reg move with opcode 0x89 or 0x8B. */ + rtx set1 = PATTERN (mov); + if (GET_CODE (set1) != SET + || !GENERAL_REG_P (SET_SRC (set1)) + || !GENERAL_REG_P (SET_DEST (set1))) + return false; + rtx reg = SET_DEST (set1); + /* - it should have 0x89 or 0x8B opcode. */ + if (!INTEGRAL_MODE_P (GET_MODE (reg)) + || GET_MODE_SIZE (GET_MODE (reg)) < 2 + || GET_MODE_SIZE (GET_MODE (reg)) > 8) + return false; + /* Validate ALU. */ + if (GET_CODE (PATTERN (alu)) != PARALLEL) + return false; + rtx set2 = XVECEXP (PATTERN (alu), 0, 0); + if (GET_CODE (set2) != SET) + return false; + /* Match one of: + ADD ADC AND XOR OR SUB SBB INC DEC NOT SAL SHL SHR SAR + We also may add insn attribute to handle some of sporadic + case we output those with different RTX expressions. */ + + if (GET_CODE (SET_SRC (set2)) != PLUS + && GET_CODE (SET_SRC (set2)) != MINUS + && GET_CODE (SET_SRC (set2)) != XOR + && GET_CODE (SET_SRC (set2)) != AND + && GET_CODE (SET_SRC (set2)) != IOR + && GET_CODE (SET_SRC (set2)) != NOT + && GET_CODE (SET_SRC (set2)) != ASHIFT + && GET_CODE (SET_SRC (set2)) != ASHIFTRT + && GET_CODE (SET_SRC (set2)) != LSHIFTRT) + return false; + rtx op0 = XEXP (SET_SRC (set2), 0); + rtx op1 = GET_CODE (SET_SRC (set2)) != NOT ? XEXP (SET_SRC (set2), 1) : NULL; + /* One of operands should be register. */ + if (op1 && (!REG_P (op0) || REGNO (op0) != REGNO (reg))) + std::swap (op0, op1); + if (!REG_P (op0) || REGNO (op1) != REGNO (reg)) + return false; + if (op1 + && !REG_P (op1) + && !x86_64_immediate_operand (op1, VOIDmode)) + return false; + /* Only one of two paramters must be move destination. */ + if (op1 && REG_P (op1) && REGNO (op1) == REGNO (reg)) + return false; + return true; +} + /* Check whether current microarchitecture support macro fusion for insn pair "CONDGEN + CONDJMP". Refer to "Intel Architectures Optimization Reference Manual". */ @@ -570,6 +632,9 @@ ix86_macro_fusion_p () bool ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp) { + if (TARGET_FUSE_MOV_AND_ALU + && ix86_fuse_mov_alu_p (condgen, condjmp)) + return true; rtx src, dest; enum rtx_code ccode; rtx compare_set = NULL_RTX, test_if, cond; diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index ed26136faee52c9ea0f54fdb7f039091a0c791f9..d7e2ad7fd250b71a494feb6e3c910b9910276b0e 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -143,10 +143,17 @@ DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS, "fuse_cmp_and_branch_soflags", /* X86_TUNE_FUSE_ALU_AND_BRANCH: Fuse alu with a subsequent conditional jump instruction when the alu instruction produces the CCFLAG consumed by - the conditional jump instruction. */ + the conditional jump instruction. + + TODO: znver5 supports fusing with SUB, ADD, INC, DEC, OR, AND, + There is also limitation for immediate and displacement supported. */ DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch", - m_SANDYBRIDGE | m_CORE_AVX2 | m_ZHAOXIN | m_GENERIC) + m_SANDYBRIDGE | m_CORE_AVX2 | m_ZHAOXIN | m_GENERIC | m_ZNVER5) +/* X86_TUNE_FUSE_MOV_AND_ALU: mov and alu in case mov is reg-reg mov + and the destination is used by alu. alu must be one of + ADD, ADC, AND, XOR, OR, SUB, SBB, INC, DEC, NOT, SAL, SHL, SHR, SAR. */ +DEF_TUNE (X86_TUNE_FUSE_MOV_AND_ALU, "fuse_mov_and_alu", m_ZNVER5) /*****************************************************************************/ /* Function prologue, epilogue and function calling sequences. */