diff --git a/gcc/gimple-ssa-store-merging.cc b/gcc/gimple-ssa-store-merging.cc index df7afd2fd78a4502a3606d2de6d8c12bce14e6dc..9cb574fa315fc353071663acf80d48858ea84a22 100644 --- a/gcc/gimple-ssa-store-merging.cc +++ b/gcc/gimple-ssa-store-merging.cc @@ -893,6 +893,37 @@ find_bswap_or_nop_finalize (struct symbolic_number *n, uint64_t *cmpxchg, n->range *= BITS_PER_UNIT; } +/* Helper function for find_bswap_or_nop. Return true if N is a nop or a + (possibly masked) bswap, and on success set *BSWAP and *MASK. */ +static bool +is_bswap_or_nop_p (uint64_t n, uint64_t cmpxchg, + uint64_t cmpnop, uint64_t* mask, + bool* bswap) +{ + *mask = ~(uint64_t) 0; + if (n == cmpnop) + *bswap = false; + else if (n == cmpxchg) + *bswap = true; + else + { + int set = 0; + for (uint64_t msk = MARKER_MASK; msk; msk <<= BITS_PER_MARKER) + if ((n & msk) == 0) + *mask &= ~msk; + else if ((n & msk) == (cmpxchg & msk)) + set++; + else + return false; + + if (set < 2) + return false; + *bswap = true; + } + return true; +} + + /* Check if STMT completes a bswap implementation or a read in a given endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP accordingly. It also sets N to represent the kind of operations @@ -903,7 +934,7 @@ find_bswap_or_nop_finalize (struct symbolic_number *n, uint64_t *cmpxchg, gimple * find_bswap_or_nop (gimple *stmt, struct symbolic_number *n, bool *bswap, - bool *cast64_to_32, uint64_t *mask) + bool *cast64_to_32, uint64_t *mask, uint64_t* l_rotate) { tree type_size = TYPE_SIZE_UNIT (TREE_TYPE (gimple_get_lhs (stmt))); if (!tree_fits_uhwi_p (type_size)) @@ -984,29 +1015,57 @@ find_bswap_or_nop (gimple *stmt, struct symbolic_number *n, bool *bswap, } uint64_t cmpxchg, cmpnop; + uint64_t orig_range = n->range * BITS_PER_UNIT; find_bswap_or_nop_finalize (n, &cmpxchg, &cmpnop, cast64_to_32); /* A complete byte swap should make the symbolic number to start with the largest digit in the highest order byte. Unchanged symbolic number indicates a read with same endianness as target architecture. 
*/ - *mask = ~(uint64_t) 0; - if (n->n == cmpnop) - *bswap = false; - else if (n->n == cmpxchg) - *bswap = true; - else + *l_rotate = 0; + uint64_t tmp_n = n->n; + if (!is_bswap_or_nop_p (tmp_n, cmpxchg, cmpnop, mask, bswap)) { - int set = 0; - for (uint64_t msk = MARKER_MASK; msk; msk <<= BITS_PER_MARKER) - if ((n->n & msk) == 0) - *mask &= ~msk; - else if ((n->n & msk) == (cmpxchg & msk)) - set++; - else - return NULL; - if (set < 2) + /* Try bswap + lrotate. */ + /* TODO: handle cast64_to_32 and big/little_endian memory + source when rsize < range. */ + if (n->range == orig_range + && ((orig_range == 32 + && optab_handler (rotl_optab, SImode) != CODE_FOR_nothing) + || (orig_range == 64 + && optab_handler (rotl_optab, DImode) != CODE_FOR_nothing)) + && (tmp_n & MARKER_MASK) < orig_range / BITS_PER_UNIT) + { + uint64_t range = (orig_range / BITS_PER_UNIT) * BITS_PER_MARKER; + uint64_t count = (tmp_n & MARKER_MASK) * BITS_PER_MARKER; + /* i.e. handle 0x203040506070800 when lower byte is zero. */ + if (!count) + { + for (uint64_t i = 1; i != range / BITS_PER_MARKER; i++) + { + count = (tmp_n >> i * BITS_PER_MARKER) & MARKER_MASK; + if (count) + { + /* Count should be meaningful, not 0xff. */ + if (count <= range / BITS_PER_MARKER) + { + count = (count + i) * BITS_PER_MARKER % range; + break; + } + else + return NULL; + } + } + } + tmp_n = tmp_n >> count | tmp_n << (range - count); + if (orig_range == 32) + tmp_n &= (1ULL << 32) - 1; + if (!is_bswap_or_nop_p (tmp_n, cmpxchg, cmpnop, mask, bswap)) + return NULL; + *l_rotate = count / BITS_PER_MARKER * BITS_PER_UNIT; + gcc_assert (*bswap); + } + else return NULL; - *bswap = true; } /* Useless bit manipulation performed by code. 
*/ @@ -1099,10 +1158,10 @@ bswap_view_convert (gimple_stmt_iterator *gsi, tree type, tree val, tree bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl, tree bswap_type, tree load_type, struct symbolic_number *n, - bool bswap, uint64_t mask) + bool bswap, uint64_t mask, uint64_t l_rotate) { tree src, tmp, tgt = NULL_TREE; - gimple *bswap_stmt, *mask_stmt = NULL; + gimple *bswap_stmt, *mask_stmt = NULL, *rotl_stmt = NULL; tree_code conv_code = NOP_EXPR; gimple *cur_stmt = gsi_stmt (gsi); @@ -1332,6 +1391,16 @@ bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl, tmp = tgt; } + if (l_rotate) + { + tree m = build_int_cst (bswap_type, l_rotate); + tmp = make_temp_ssa_name (bswap_type, NULL, + mask_stmt ? "bswapmaskdst" : "bswapdst"); + gimple_set_lhs (mask_stmt ? mask_stmt : bswap_stmt, tmp); + rotl_stmt = gimple_build_assign (tgt, LROTATE_EXPR, tmp, m); + tmp = tgt; + } + /* Convert the result if necessary. */ if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type)) { @@ -1344,7 +1413,8 @@ bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl, gsi_insert_after (&gsi2, convert_stmt, GSI_SAME_STMT); } - gimple_set_lhs (mask_stmt ? mask_stmt : bswap_stmt, tmp); + gimple_set_lhs (rotl_stmt ? rotl_stmt + : mask_stmt ? 
mask_stmt : bswap_stmt, tmp); if (dump_file) { @@ -1361,6 +1431,8 @@ bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl, if (cur_stmt) { + if (rotl_stmt) + gsi_insert_after (&gsi, rotl_stmt, GSI_SAME_STMT); if (mask_stmt) gsi_insert_after (&gsi, mask_stmt, GSI_SAME_STMT); gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT); @@ -1371,6 +1443,8 @@ bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl, gsi_insert_before (&gsi, bswap_stmt, GSI_SAME_STMT); if (mask_stmt) gsi_insert_before (&gsi, mask_stmt, GSI_SAME_STMT); + if (rotl_stmt) + gsi_insert_after (&gsi, rotl_stmt, GSI_SAME_STMT); } return tgt; } @@ -1432,9 +1506,9 @@ maybe_optimize_vector_constructor (gimple *cur_stmt) } bool cast64_to_32; - uint64_t mask; + uint64_t mask, l_rotate; gimple *ins_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap, - &cast64_to_32, &mask); + &cast64_to_32, &mask, &l_rotate); if (!ins_stmt || n.range != (unsigned HOST_WIDE_INT) sz || cast64_to_32 @@ -1447,7 +1521,8 @@ maybe_optimize_vector_constructor (gimple *cur_stmt) memset (&nop_stats, 0, sizeof (nop_stats)); memset (&bswap_stats, 0, sizeof (bswap_stats)); return bswap_replace (gsi_for_stmt (cur_stmt), ins_stmt, fndecl, - bswap_type, load_type, &n, bswap, mask) != NULL_TREE; + bswap_type, load_type, &n, bswap, mask, + l_rotate) != NULL_TREE; } /* Find manual byte swap implementations as well as load in a given @@ -1502,7 +1577,7 @@ pass_optimize_bswap::execute (function *fun) enum tree_code code; struct symbolic_number n; bool bswap, cast64_to_32; - uint64_t mask; + uint64_t mask, l_rotate; /* This gsi_prev (&gsi) is not part of the for loop because cur_stmt might be moved to a different basic block by bswap_replace and gsi @@ -1542,7 +1617,7 @@ pass_optimize_bswap::execute (function *fun) } ins_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap, - &cast64_to_32, &mask); + &cast64_to_32, &mask, &l_rotate); if (!ins_stmt) continue; @@ -1579,7 +1654,8 @@ pass_optimize_bswap::execute (function *fun) 
continue; if (bswap_replace (gsi_for_stmt (cur_stmt), ins_stmt, fndecl, - bswap_type, load_type, &n, bswap, mask)) + bswap_type, load_type, &n, bswap, mask, + l_rotate)) changed = true; } } @@ -4271,7 +4347,7 @@ imm_store_chain_info::output_merged_store (merged_store_group *group) } bswap_res = bswap_replace (gsi_start (seq), ins_stmt, fndecl, bswap_type, load_type, n, bswap, - ~(uint64_t) 0); + ~(uint64_t) 0, 0); gcc_assert (bswap_res); } diff --git a/gcc/testsuite/gcc.target/i386/pr108938-1.c b/gcc/testsuite/gcc.target/i386/pr108938-1.c new file mode 100644 index 0000000000000000000000000000000000000000..67f245d1441e016d213e2ac1e707d8c13f36abbd --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr108938-1.c @@ -0,0 +1,79 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mno-movbe" } */ +/* { dg-final { scan-assembler-times "bswap\[ \t\]+" 6 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "bswap\[ \t\]+" 9 { target ia32 } } } */ + +#include<stdint.h> + +uint64_t +__attribute__((noipa)) +swap_rotate_64 (uint64_t x) +{ + return ((((uint64_t)(x) & (uint64_t)0x00000000000000ffULL) << 0) | + (((uint64_t)(x) & (uint64_t)0x000000000000ff00ULL) << 48) | + (((uint64_t)(x) & (uint64_t)0x0000000000ff0000ULL) << 32) | + (((uint64_t)(x) & (uint64_t)0x00000000ff000000ULL) << 16) | + (((uint64_t)(x) & (uint64_t)0x000000ff00000000ULL) >> 0) | + (((uint64_t)(x) & (uint64_t)0x0000ff0000000000ULL) >> 16) | + (((uint64_t)(x) & (uint64_t)0x00ff000000000000ULL) >> 32) | + (((uint64_t)(x) & (uint64_t)0xff00000000000000ULL) >> 48)); +} + +uint64_t +__attribute__((noipa)) +swap_rotate_64_mask_1 (uint64_t x) +{ + return ((((uint64_t)(0) & (uint64_t)0x00000000000000ffULL) << 0) | + (((uint64_t)(x) & (uint64_t)0x000000000000ff00ULL) << 48) | + (((uint64_t)(x) & (uint64_t)0x0000000000ff0000ULL) << 32) | + (((uint64_t)(x) & (uint64_t)0x00000000ff000000ULL) << 16) | + (((uint64_t)(x) & (uint64_t)0x000000ff00000000ULL) >> 0) | + (((uint64_t)(x) & 
(uint64_t)0x0000ff0000000000ULL) >> 16) | + (((uint64_t)(x) & (uint64_t)0x00ff000000000000ULL) >> 32) | + (((uint64_t)(x) & (uint64_t)0xff00000000000000ULL) >> 48)); +} + +uint64_t +__attribute__((noipa)) +swap_rotate_64_mask_2 (uint64_t x) +{ + return ((((uint64_t)(x) & (uint64_t)0x00000000000000ffULL) << 0) | + (((uint64_t)(x) & (uint64_t)0x000000000000ff00ULL) << 48) | + (((uint64_t)(x) & (uint64_t)0x0000000000ff0000ULL) << 32) | + (((uint64_t)(x) & (uint64_t)0x00000000ff000000ULL) << 16) | + (((uint64_t)(x) & (uint64_t)0x000000ff00000000ULL) >> 0) | + (((uint64_t)(x) & (uint64_t)0x0000ff0000000000ULL) >> 16) | + (((uint64_t)(x) & (uint64_t)0x00ff000000000000ULL) >> 32) | + (((uint64_t)(0) & (uint64_t)0xff00000000000000ULL) >> 48)); +} + + +uint32_t +__attribute__((noipa)) +swap_rotate_32 (uint32_t x) +{ + return ((((uint32_t)(x) & (uint32_t)0x00000000000000ffULL) << 8) | + (((uint32_t)(x) & (uint32_t)0x000000000000ff00ULL) >> 8) | + (((uint32_t)(x) & (uint32_t)0x0000000000ff0000ULL) << 8) | + (((uint32_t)(x) & (uint32_t)0x00000000ff000000ULL) >> 8)); +} + +uint32_t +__attribute__((noipa)) +swap_rotate_32_mask_1 (uint32_t x) +{ + return ((((uint32_t)(0) & (uint32_t)0x00000000000000ffULL) << 8) | + (((uint32_t)(x) & (uint32_t)0x000000000000ff00ULL) >> 8) | + (((uint32_t)(x) & (uint32_t)0x0000000000ff0000ULL) << 8) | + (((uint32_t)(x) & (uint32_t)0x00000000ff000000ULL) >> 8)); +} + +uint32_t +__attribute__((noipa)) +swap_rotate_32_mask_2 (uint32_t x) +{ + return ((((uint32_t)(x) & (uint32_t)0x00000000000000ffULL) << 8) | + (((uint32_t)(0) & (uint32_t)0x000000000000ff00ULL) >> 8) | + (((uint32_t)(x) & (uint32_t)0x0000000000ff0000ULL) << 8) | + (((uint32_t)(x) & (uint32_t)0x00000000ff000000ULL) >> 8)); +} diff --git a/gcc/testsuite/gcc.target/i386/pr108938-2.c b/gcc/testsuite/gcc.target/i386/pr108938-2.c new file mode 100644 index 0000000000000000000000000000000000000000..47a2c89f1c4e6bed31028df91c8bc822b4bf6c70 --- /dev/null +++ 
b/gcc/testsuite/gcc.target/i386/pr108938-2.c @@ -0,0 +1,35 @@ +/* { dg-do run } */ +/* { dg-options "-O2" } */ + +#include "pr108938-1.c" + +int main () +{ + uint64_t a = 0x0807060504030201ULL; + uint64_t res = swap_rotate_64 (a); + if (res != 0x0203040506070801ULL) + __builtin_abort (); + + res = swap_rotate_64_mask_1 (a); + if (res != 0x0203040506070800ULL) + __builtin_abort (); + + res = swap_rotate_64_mask_2 (a); + if (res != 0x0203040506070001ULL) + __builtin_abort (); + + uint32_t b = 0x04030201; + uint32_t res2 = swap_rotate_32 (b); + if (res2 != 0x03040102) + __builtin_abort (); + + res2 = swap_rotate_32_mask_1 (b); + if (res2 != 0x03040002) + __builtin_abort (); + + res2 = swap_rotate_32_mask_2 (b); + if (res2 != 0x03040100) + __builtin_abort (); + + return 0; +} diff --git a/gcc/testsuite/gcc.target/i386/pr108938-3.c b/gcc/testsuite/gcc.target/i386/pr108938-3.c new file mode 100644 index 0000000000000000000000000000000000000000..32ac544c7edc8b30a34eac849cad4e70f5393f85 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr108938-3.c @@ -0,0 +1,26 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize -mno-movbe" } */ +/* { dg-final { scan-assembler-times "bswap\[\t ]+" 2 { target { ! 
ia32 } } } } */ +/* { dg-final { scan-assembler-times "bswap\[\t ]+" 3 { target ia32 } } } */ + +void +foo1 (char* a, unsigned int* __restrict b) +{ + a[0] = b[0] >> 24; + a[1] = b[0] >> 16; + a[2] = b[0] >> 8; + a[3] = b[0]; + a[4] = b[1] >> 24; + a[5] = b[1] >> 16; + a[6] = b[1] >> 8; + a[7] = b[1]; +} + +void +foo2 (char* a, short* __restrict b) +{ + a[0] = b[0] >> 8; + a[1] = b[0]; + a[2] = b[1] >> 8; + a[3] = b[1]; +} diff --git a/gcc/testsuite/gcc.target/i386/pr108938-load-1.c b/gcc/testsuite/gcc.target/i386/pr108938-load-1.c new file mode 100644 index 0000000000000000000000000000000000000000..50d3a505c81deb385821b514f3ba9522e04cf952 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr108938-load-1.c @@ -0,0 +1,69 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mno-movbe" } */ +/* { dg-final { scan-assembler-times "bswap\[ \t\]+" 5 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "bswap\[ \t\]+" 8 { target ia32 } } } */ + +#include<stdint.h> + +uint64_t +__attribute__((noipa)) +swap_rotate_64 (unsigned char* x) +{ + return ((uint64_t)(x[0]) << 0 | + (uint64_t)(x[1]) << 56 | + (uint64_t)(x[2]) << 48 | + (uint64_t)(x[3]) << 40 | + (uint64_t)(x[4]) << 32 | + (uint64_t)(x[5]) << 24 | + (uint64_t)(x[6]) << 16 | + (uint64_t)(x[7]) << 8); +} + +uint64_t +__attribute__((noipa)) +swap_rotate_64_mask_1 (unsigned char* x) +{ + return ((uint64_t)(x[0]) << 24 | + (uint64_t)(x[1]) << 16 | + (uint64_t)(x[2]) << 8 | + (uint64_t)(0) << 0 | + (uint64_t)(x[4]) << 56 | + (uint64_t)(x[5]) << 48 | + (uint64_t)(x[6]) << 40 | + (uint64_t)(x[7]) << 32); +} + +uint64_t +__attribute__((noipa)) +swap_rotate_64_mask_2 (unsigned char* x) +{ + return ((uint64_t)(x[0]) << 0 | + (uint64_t)(x[1]) << 56 | + (uint64_t)(x[2]) << 48 | + (uint64_t)(0) << 40 | + (uint64_t)(x[4]) << 32 | + (uint64_t)(x[5]) << 24 | + (uint64_t)(x[6]) << 16 | + (uint64_t)(x[7]) << 8); +} + + +uint32_t +__attribute__((noipa)) +swap_rotate_32 (unsigned char* x) +{ + return ((uint64_t)(x[0]) << 8 | + 
(uint64_t)(x[1]) << 0 | + (uint64_t)(x[2]) << 24 | + (uint64_t)(x[3]) << 16); +} + +uint32_t +__attribute__((noipa)) +swap_rotate_32_mask_1 (unsigned char* x) +{ + return ((uint64_t)(x[0]) << 8 | + (uint64_t)(0) << 0 | + (uint64_t)(x[2]) << 24 | + (uint64_t)(x[3]) << 16); +} diff --git a/gcc/testsuite/gcc.target/i386/pr108938-load-2.c b/gcc/testsuite/gcc.target/i386/pr108938-load-2.c new file mode 100644 index 0000000000000000000000000000000000000000..51a8102f9950e420fb2c679750269dc3762c7e0b --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr108938-load-2.c @@ -0,0 +1,30 @@ +/* { dg-do run } */ +/* { dg-options "-O2" } */ + +#include "pr108938-load-1.c" + +int main () +{ + unsigned char a[8] = {1, 2, 3, 4, 5, 6, 7, 8}; + uint64_t res = swap_rotate_64 (a); + if (res != 0x0203040506070801ULL) + __builtin_abort (); + + res = swap_rotate_64_mask_1 (a); + if (res != 0x0506070801020300ULL) + __builtin_abort (); + + res = swap_rotate_64_mask_2 (a); + if (res != 0x0203000506070801ULL) + __builtin_abort (); + + uint32_t res2 = swap_rotate_32 (a); + if (res2 != 0x03040102) + __builtin_abort (); + + res2 = swap_rotate_32_mask_1 (a); + if (res2 != 0x03040100) + __builtin_abort (); + + return 0; +}