From 3a579e0930abe3ed91977a71284021399339860c Mon Sep 17 00:00:00 2001 From: Vladimir Yakovlev <vladimir.b.yakovlev@intel.com> Date: Tue, 15 Jan 2013 10:07:08 +0000 Subject: [PATCH] i386-c.c (ix86_target_macros_internal): New case. * config/i386/i386-c.c (ix86_target_macros_internal): New case. (ix86_target_macros_internal): Likewise. * config/i386/i386.c (m_CORE2I7): Removed. (m_HASWELL): New macro. (m_CORE_ALL): Likewise. (initial_ix86_tune_features): m_CORE2I7 is replaced by m_CORE_ALL. (initial_ix86_arch_features): Likewise. (processor_target_table): Initializations for Core avx2. (cpu_names): New names "core-avx2". (ix86_option_override_internal): Changed PROCESSOR_COREI7 by PROCESSOR_HASWELL. (ix86_issue_rate): New case. (ia32_multipass_dfa_lookahead): Likewise. (ix86_sched_init_global): Likewise. * config/i386/i386.h (TARGET_HASWELL): New macro. (target_cpu_default): New TARGET_CPU_DEFAULT_haswell. (processor_type): New PROCESSOR_HASWELL. From-SVN: r195191 --- gcc/ChangeLog | 22 ++++++++++++++ gcc/config/i386/i386-c.c | 7 +++++ gcc/config/i386/i386.c | 63 ++++++++++++++++++++++------------------ gcc/config/i386/i386.h | 3 ++ 4 files changed, 67 insertions(+), 28 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index ea0e4fa01a07..efe553bd04b7 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,25 @@ +2013-01-15 Vladimir Yakovlev <vladimir.b.yakovlev@intel.com> + + * config/i386/i386-c.c (ix86_target_macros_internal): New case. + (ix86_target_macros_internal): Likewise. + + * config/i386/i386.c (m_CORE2I7): Removed. + (m_HASWELL): New macro. + (m_CORE_ALL): Likewise. + (initial_ix86_tune_features): m_CORE2I7 is replaced by m_CORE_ALL. + (initial_ix86_arch_features): Likewise. + (processor_target_table): Initializations for Core avx2. + (cpu_names): New names "core-avx2". + (ix86_option_override_internal): Changed PROCESSOR_COREI7 by + PROCESSOR_HASWELL. + (ix86_issue_rate): New case. + (ia32_multipass_dfa_lookahead): Likewise. 
+ (ix86_sched_init_global): Likewise. + + * config/i386/i386.h (TARGET_HASWELL): New macro. + (target_cpu_default): New TARGET_CPU_DEFAULT_haswell. + (processor_type): New PROCESSOR_HASWELL. + 2013-01-15 Jakub Jelinek <jakub@redhat.com> PR tree-optimization/55955 diff --git a/gcc/config/i386/i386-c.c b/gcc/config/i386/i386-c.c index 49545d16f62e..51fec844bdf0 100644 --- a/gcc/config/i386/i386-c.c +++ b/gcc/config/i386/i386-c.c @@ -141,6 +141,10 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag, def_or_undef (parse_in, "__corei7"); def_or_undef (parse_in, "__corei7__"); break; + case PROCESSOR_HASWELL: + def_or_undef (parse_in, "__core_avx2"); + def_or_undef (parse_in, "__core_avx2__"); + break; case PROCESSOR_ATOM: def_or_undef (parse_in, "__atom"); def_or_undef (parse_in, "__atom__"); @@ -231,6 +235,9 @@ ix86_target_macros_internal (HOST_WIDE_INT isa_flag, case PROCESSOR_COREI7: def_or_undef (parse_in, "__tune_corei7__"); break; + case PROCESSOR_HASWELL: + def_or_undef (parse_in, "__tune_core_avx2__"); + break; case PROCESSOR_ATOM: def_or_undef (parse_in, "__tune_atom__"); break; diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 4f778c1c8559..0e98a1b56756 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -1730,7 +1730,8 @@ const struct processor_costs *ix86_cost = &pentium_cost; #define m_P4_NOCONA (m_PENT4 | m_NOCONA) #define m_CORE2 (1<<PROCESSOR_CORE2) #define m_COREI7 (1<<PROCESSOR_COREI7) -#define m_CORE2I7 (m_CORE2 | m_COREI7) +#define m_HASWELL (1<<PROCESSOR_HASWELL) +#define m_CORE_ALL (m_CORE2 | m_COREI7 | m_HASWELL) #define m_ATOM (1<<PROCESSOR_ATOM) #define m_GEODE (1<<PROCESSOR_GEODE) @@ -1766,16 +1767,16 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = { negatively, so enabling for Generic64 seems like good code size tradeoff. We can't enable it for 32bit generic because it does not work well with PPro base chips. 
*/ - m_386 | m_CORE2I7 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64, + m_386 | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC64, /* X86_TUNE_PUSH_MEMORY */ - m_386 | m_P4_NOCONA | m_CORE2I7 | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC, + m_386 | m_P4_NOCONA | m_CORE_ALL | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC, /* X86_TUNE_ZERO_EXTEND_WITH_AND */ m_486 | m_PENT, /* X86_TUNE_UNROLL_STRLEN */ - m_486 | m_PENT | m_PPRO | m_ATOM | m_CORE2I7 | m_K6 | m_AMD_MULTIPLE | m_GENERIC, + m_486 | m_PENT | m_PPRO | m_ATOM | m_CORE_ALL | m_K6 | m_AMD_MULTIPLE | m_GENERIC, /* X86_TUNE_BRANCH_PREDICTION_HINTS: Branch hints were put in P4 based on simulation result. But after P4 was made, no performance benefit @@ -1787,11 +1788,11 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = { ~m_386, /* X86_TUNE_USE_SAHF */ - m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC, + m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_K6_GEODE | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC, /* X86_TUNE_MOVX: Enable to zero extend integer registers to avoid partial dependencies. */ - m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC, + m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_GENERIC, /* X86_TUNE_PARTIAL_REG_STALL: We probably ought to watch for partial register stalls on Generic32 compilation setting as well. However @@ -1804,17 +1805,17 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = { m_PPRO, /* X86_TUNE_PARTIAL_FLAG_REG_STALL */ - m_CORE2I7 | m_GENERIC, + m_CORE_ALL | m_GENERIC, /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall * on 16-bit immediate moves into memory on Core2 and Corei7. 
*/ - m_CORE2I7 | m_GENERIC, + m_CORE_ALL | m_GENERIC, /* X86_TUNE_USE_HIMODE_FIOP */ m_386 | m_486 | m_K6_GEODE, /* X86_TUNE_USE_SIMODE_FIOP */ - ~(m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC), + ~(m_PENT | m_PPRO | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC), /* X86_TUNE_USE_MOV0 */ m_K6, @@ -1835,7 +1836,7 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = { ~(m_PENT | m_PPRO), /* X86_TUNE_PROMOTE_QIMODE */ - m_386 | m_486 | m_PENT | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC, + m_386 | m_486 | m_PENT | m_CORE_ALL | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC, /* X86_TUNE_FAST_PREFIX */ ~(m_386 | m_486 | m_PENT), @@ -1876,10 +1877,10 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = { /* X86_TUNE_INTEGER_DFMODE_MOVES: Enable if integer moves are preferred for DFmode copies */ - ~(m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_ATOM | m_GENERIC), + ~(m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_GEODE | m_AMD_MULTIPLE | m_ATOM | m_GENERIC), /* X86_TUNE_PARTIAL_REG_DEPENDENCY */ - m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC, + m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC, /* X86_TUNE_SSE_PARTIAL_REG_DEPENDENCY: In the Generic model we have a conflict here in between PPro/Pentium4 based chips that thread 128bit @@ -1890,7 +1891,7 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = { shows that disabling this option on P4 brings over 20% SPECfp regression, while enabling it on K8 brings roughly 2.4% regression that can be partly masked by careful scheduling of moves. 
*/ - m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMDFAM10 | m_BDVER | m_GENERIC, + m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMDFAM10 | m_BDVER | m_GENERIC, /* X86_TUNE_SSE_UNALIGNED_LOAD_OPTIMAL */ m_COREI7 | m_AMDFAM10 | m_BDVER | m_BTVER, @@ -1914,7 +1915,7 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = { m_PPRO | m_P4_NOCONA, /* X86_TUNE_MEMORY_MISMATCH_STALL */ - m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC, + m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC, /* X86_TUNE_PROLOGUE_USING_MOVE */ m_PPRO | m_ATHLON_K8, @@ -1936,28 +1937,28 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = { /* X86_TUNE_FOUR_JUMP_LIMIT: Some CPU cores are not able to predict more than 4 branch instructions in the 16 byte window. */ - m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC, + m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC, /* X86_TUNE_SCHEDULE */ - m_PENT | m_PPRO | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC, + m_PENT | m_PPRO | m_CORE_ALL | m_ATOM | m_K6_GEODE | m_AMD_MULTIPLE | m_GENERIC, /* X86_TUNE_USE_BT */ - m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC, + m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC, /* X86_TUNE_USE_INCDEC */ - ~(m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_GENERIC), + ~(m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_GENERIC), /* X86_TUNE_PAD_RETURNS */ - m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC, + m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC, /* X86_TUNE_PAD_SHORT_FUNCTION: Pad short funtion. 
*/ m_ATOM, /* X86_TUNE_EXT_80387_CONSTANTS */ - m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC, + m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_K6_GEODE | m_ATHLON_K8 | m_GENERIC, /* X86_TUNE_AVOID_VECTOR_DECODE */ - m_CORE2I7 | m_K8 | m_GENERIC64, + m_CORE_ALL | m_K8 | m_GENERIC64, /* X86_TUNE_PROMOTE_HIMODE_IMUL: Modern CPUs have same latency for HImode and SImode multiply, but 386 and 486 do HImode multiply faster. */ @@ -1965,11 +1966,11 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = { /* X86_TUNE_SLOW_IMUL_IMM32_MEM: Imul of 32-bit constant and memory is vector path on AMD machines. */ - m_CORE2I7 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64, + m_CORE_ALL | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64, /* X86_TUNE_SLOW_IMUL_IMM8: Imul of 8-bit constant is vector path on AMD machines. */ - m_CORE2I7 | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64, + m_CORE_ALL | m_K8 | m_AMDFAM10 | m_BDVER | m_BTVER | m_GENERIC64, /* X86_TUNE_MOVE_M1_VIA_OR: On pentiums, it is faster to load -1 via OR than a MOV. */ @@ -1986,7 +1987,7 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = { /* X86_TUNE_USE_VECTOR_FP_CONVERTS: Prefer vector packed SSE conversion from FP to FP. */ - m_CORE2I7 | m_AMDFAM10 | m_GENERIC, + m_CORE_ALL | m_AMDFAM10 | m_GENERIC, /* X86_TUNE_USE_VECTOR_CONVERTS: Prefer vector packed SSE conversion from integer to FP. */ @@ -2024,7 +2025,7 @@ static unsigned int initial_ix86_tune_features[X86_TUNE_LAST] = { /* X86_TUNE_GENERAL_REGS_SSE_SPILL: Try to spill general regs to SSE regs instead of memory. */ - m_COREI7 | m_CORE2I7, + m_CORE_ALL, /* X86_TUNE_AVOID_MEM_OPND_FOR_CMOVE: Try to avoid memory operands for a conditional move. 
*/ @@ -2054,10 +2055,10 @@ static unsigned int initial_ix86_arch_features[X86_ARCH_LAST] = { }; static const unsigned int x86_accumulate_outgoing_args - = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE2I7 | m_AMD_MULTIPLE | m_GENERIC; + = m_PPRO | m_P4_NOCONA | m_ATOM | m_CORE_ALL | m_AMD_MULTIPLE | m_GENERIC; static const unsigned int x86_arch_always_fancy_math_387 - = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE2I7 | m_ATOM | m_AMD_MULTIPLE | m_GENERIC; + = m_PENT | m_PPRO | m_P4_NOCONA | m_CORE_ALL | m_ATOM | m_AMD_MULTIPLE | m_GENERIC; static const unsigned int x86_avx256_split_unaligned_load = m_COREI7 | m_GENERIC; @@ -2432,6 +2433,8 @@ static const struct ptt processor_target_table[PROCESSOR_max] = {&core_cost, 16, 10, 16, 10, 16}, /* Core i7 */ {&core_cost, 16, 10, 16, 10, 16}, + /* Core avx2 */ + {&core_cost, 16, 10, 16, 10, 16}, {&generic32_cost, 16, 7, 16, 7, 16}, {&generic64_cost, 16, 10, 16, 10, 16}, {&amdfam10_cost, 32, 24, 32, 7, 32}, @@ -2459,6 +2462,7 @@ static const char *const cpu_names[TARGET_CPU_DEFAULT_max] = "nocona", "core2", "corei7", + "core-avx2", "atom", "geode", "k6", @@ -2910,7 +2914,7 @@ ix86_option_override_internal (bool main_args_p) | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE | PTA_RDRND | PTA_F16C | PTA_FXSR | PTA_XSAVE | PTA_XSAVEOPT}, - {"core-avx2", PROCESSOR_COREI7, CPU_COREI7, + {"core-avx2", PROCESSOR_HASWELL, CPU_COREI7, PTA_64BIT | PTA_MMX | PTA_SSE | PTA_SSE2 | PTA_SSE3 | PTA_SSSE3 | PTA_SSE4_1 | PTA_SSE4_2 | PTA_AVX | PTA_AVX2 | PTA_CX16 | PTA_POPCNT | PTA_AES | PTA_PCLMUL | PTA_FSGSBASE @@ -24048,6 +24052,7 @@ ix86_issue_rate (void) case PROCESSOR_PENTIUM4: case PROCESSOR_CORE2: case PROCESSOR_COREI7: + case PROCESSOR_HASWELL: case PROCESSOR_ATHLON: case PROCESSOR_K8: case PROCESSOR_AMDFAM10: @@ -24304,6 +24309,7 @@ ia32_multipass_dfa_lookahead (void) case PROCESSOR_CORE2: case PROCESSOR_COREI7: + case PROCESSOR_HASWELL: case PROCESSOR_ATOM: /* Generally, we want 
haifa-sched:max_issue() to look ahead as far as many instructions can be executed on a cycle, i.e., @@ -24848,6 +24854,7 @@ ix86_sched_init_global (FILE *dump ATTRIBUTE_UNUSED, { case PROCESSOR_CORE2: case PROCESSOR_COREI7: + case PROCESSOR_HASWELL: /* Do not perform multipass scheduling for pre-reload schedule to save compile time. */ if (reload_completed) diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index f86e346f6640..af293b428b36 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -246,6 +246,7 @@ extern const struct processor_costs ix86_size_cost; #define TARGET_NOCONA (ix86_tune == PROCESSOR_NOCONA) #define TARGET_CORE2 (ix86_tune == PROCESSOR_CORE2) #define TARGET_COREI7 (ix86_tune == PROCESSOR_COREI7) +#define TARGET_HASWELL (ix86_tune == PROCESSOR_HASWELL) #define TARGET_GENERIC32 (ix86_tune == PROCESSOR_GENERIC32) #define TARGET_GENERIC64 (ix86_tune == PROCESSOR_GENERIC64) #define TARGET_GENERIC (TARGET_GENERIC32 || TARGET_GENERIC64) @@ -604,6 +605,7 @@ enum target_cpu_default TARGET_CPU_DEFAULT_nocona, TARGET_CPU_DEFAULT_core2, TARGET_CPU_DEFAULT_corei7, + TARGET_CPU_DEFAULT_haswell, TARGET_CPU_DEFAULT_atom, TARGET_CPU_DEFAULT_geode, @@ -2096,6 +2098,7 @@ enum processor_type PROCESSOR_NOCONA, PROCESSOR_CORE2, PROCESSOR_COREI7, + PROCESSOR_HASWELL, PROCESSOR_GENERIC32, PROCESSOR_GENERIC64, PROCESSOR_AMDFAM10, -- GitLab