From 99a6c1065de2db04d0f56f4b2cc89acecf21b72e Mon Sep 17 00:00:00 2001
From: Roger Sayle <roger@nextmovesoftware.com>
Date: Tue, 24 Oct 2023 16:45:08 +0100
Subject: [PATCH] i386: Fine tune STV register conversion costs for -Os.

The eagle-eyed may have spotted that my recent testcases for DImode shifts
on x86_64 included -mno-stv in the dg-options.  This is because the
Scalar-To-Vector (STV) pass currently transforms these shifts to use
SSE vector operations, producing larger code even with -Os.  The issue
is that the compute_convert_gain currently underestimates the size of
instructions required for interunit moves, which is corrected with the
patch below.

For the simple test case:

unsigned long long shl1(unsigned long long x) { return x << 1; }

without this patch, GCC -m32 -Os -mavx2 currently generates:

shl1:	push   %ebp		 // 1 byte
	mov    %esp,%ebp	 // 2 bytes
	vmovq  0x8(%ebp),%xmm0	 // 5 bytes
	pop    %ebp		 // 1 byte
	vpaddq %xmm0,%xmm0,%xmm0 // 4 bytes
	vmovd  %xmm0,%eax	 // 4 bytes
	vpextrd $0x1,%xmm0,%edx  // 6 bytes
	ret			 // 1 byte  = 24 bytes total

with this patch, we now generate the shorter

shl1:	push   %ebp		// 1 byte
	mov    %esp,%ebp	// 2 bytes
	mov    0x8(%ebp),%eax	// 3 bytes
	mov    0xc(%ebp),%edx	// 3 bytes
	pop    %ebp		// 1 byte
	add    %eax,%eax	// 2 bytes
	adc    %edx,%edx	// 2 bytes
	ret			// 1 byte  = 15 bytes total

Benchmarking using CSiBE, shows that this patch saves 1361 bytes
when compiling with -m32 -Os, and saves 172 bytes when compiling
with -Os.

2023-10-24  Roger Sayle  <roger@nextmovesoftware.com>

gcc/ChangeLog
	* config/i386/i386-features.cc (compute_convert_gain): Provide
	more accurate values (sizes) for inter-unit moves with -Os.
---
 gcc/config/i386/i386-features.cc | 32 +++++++++++++++++++++++++++-----
 1 file changed, 27 insertions(+), 5 deletions(-)

diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
index cead3970cfce..6fac67e9a285 100644
--- a/gcc/config/i386/i386-features.cc
+++ b/gcc/config/i386/i386-features.cc
@@ -752,11 +752,33 @@ general_scalar_chain::compute_convert_gain ()
     fprintf (dump_file, "  Instruction conversion gain: %d\n", gain);
 
   /* Cost the integer to sse and sse to integer moves.  */
-  cost += n_sse_to_integer * ix86_cost->sse_to_integer;
-  /* ???  integer_to_sse but we only have that in the RA cost table.
-     Assume sse_to_integer/integer_to_sse are the same which they
-     are at the moment.  */
-  cost += n_integer_to_sse * ix86_cost->sse_to_integer;
+  if (!optimize_function_for_size_p (cfun))
+    {
+      cost += n_sse_to_integer * ix86_cost->sse_to_integer;
+      /* ???  integer_to_sse but we only have that in the RA cost table.
+              Assume sse_to_integer/integer_to_sse are the same which they
+	      are at the moment.  */
+      cost += n_integer_to_sse * ix86_cost->sse_to_integer;
+    }
+  else if (TARGET_64BIT || smode == SImode)
+    {
+      cost += n_sse_to_integer * COSTS_N_BYTES (4);
+      cost += n_integer_to_sse * COSTS_N_BYTES (4);
+    }
+  else if (TARGET_SSE4_1)
+    {
+      /* vmovd (4 bytes) + vpextrd (6 bytes).  */
+      cost += n_sse_to_integer * COSTS_N_BYTES (10);
+      /* vmovd (4 bytes) + vpinsrd (6 bytes).  */
+      cost += n_integer_to_sse * COSTS_N_BYTES (10);
+    }
+  else
+    {
+      /* movd (4 bytes) + psrlq (5 bytes) + movd (4 bytes).  */
+      cost += n_sse_to_integer * COSTS_N_BYTES (13);
+      /* movd (4 bytes) + movd (4 bytes) + unpckldq (4 bytes).  */
+      cost += n_integer_to_sse * COSTS_N_BYTES (12);
+    }
 
   if (dump_file)
     fprintf (dump_file, "  Registers conversion cost: %d\n", cost);
-- 
GitLab