From 38900247f3880d6eca2e364a000e5898f8deae64 Mon Sep 17 00:00:00 2001
From: Prathamesh Kulkarni <prathameshk@nvidia.com>
Date: Wed, 7 Aug 2024 23:45:38 +0530
Subject: [PATCH] Partially support streaming of poly_int for offloading.

When offloading is enabled, the patch streams out the host's
NUM_POLY_INT_COEFFS and changes streaming-in of poly_int as follows:

if (host_num_poly_int_coeffs <= NUM_POLY_INT_COEFFS)
{
  for (i = 0; i < host_num_poly_int_coeffs; i++)
    poly_int.coeffs[i] = stream_in coeff;
  for (; i < NUM_POLY_INT_COEFFS; i++)
    poly_int.coeffs[i] = 0;
}
else
{
  for (i = 0; i < NUM_POLY_INT_COEFFS; i++)
    poly_int.coeffs[i] = stream_in coeff;

  /* Ensure that degree of poly_int <= accel NUM_POLY_INT_COEFFS.  */
  for (; i < host_num_poly_int_coeffs; i++)
    {
      val = stream_in coeff;
      if (val != 0)
	error ();
    }
}

gcc/ChangeLog:
	PR ipa/96265
	PR ipa/111937
	* data-streamer-in.cc (streamer_read_poly_uint64): Remove code for
	streaming, and call poly_int_read_common instead.
	(streamer_read_poly_int64): Likewise.
	* data-streamer.cc (host_num_poly_int_coeffs): New variable, defined
	only if ACCEL_COMPILER is defined.
	* data-streamer.h (host_num_poly_int_coeffs): Declare.
	(poly_int_read_common): New function template.
	(bp_unpack_poly_value): Remove code for streaming and call
	poly_int_read_common instead.
	* lto-streamer-in.cc (lto_input_mode_table): Stream-in host
	NUM_POLY_INT_COEFFS into host_num_poly_int_coeffs if ACCEL_COMPILER
	is defined.
	* lto-streamer-out.cc (lto_write_mode_table): Stream out
	NUM_POLY_INT_COEFFS if offloading is enabled.
	* poly-int.h (MAX_NUM_POLY_INT_COEFFS_BITS): New macro.
	* tree-streamer-in.cc (lto_input_ts_poly_tree_pointers): Adjust
	streaming-in of poly_int.

Signed-off-by: Prathamesh Kulkarni <prathameshk@nvidia.com>
---
 gcc/data-streamer-in.cc | 12 ++++------
 gcc/data-streamer.cc    |  8 +++++++
 gcc/data-streamer.h     | 49 +++++++++++++++++++++++++++++++++++++----
 gcc/lto-streamer-in.cc  |  5 +++++
 gcc/lto-streamer-out.cc |  3 +++
 gcc/poly-int.h          |  4 ++++
 gcc/tree-streamer-in.cc | 33 +++++++++++++++++++++++++--
 7 files changed, 100 insertions(+), 14 deletions(-)

diff --git a/gcc/data-streamer-in.cc b/gcc/data-streamer-in.cc
index 7dce2928ef03..07dbc5e2bc36 100644
--- a/gcc/data-streamer-in.cc
+++ b/gcc/data-streamer-in.cc
@@ -182,10 +182,8 @@ streamer_read_hwi (class lto_input_block *ib)
 poly_uint64
 streamer_read_poly_uint64 (class lto_input_block *ib)
 {
-  poly_uint64 res;
-  for (unsigned int i = 0; i < NUM_POLY_INT_COEFFS; ++i)
-    res.coeffs[i] = streamer_read_uhwi (ib);
-  return res;
+  using coeff_type = poly_int_traits<poly_uint64>::coeff_type;
+  return poly_int_read_common<coeff_type> (streamer_read_uhwi, ib);
 }
 
 /* Read a poly_int64 from IB.  */
@@ -193,10 +191,8 @@ streamer_read_poly_uint64 (class lto_input_block *ib)
 poly_int64
 streamer_read_poly_int64 (class lto_input_block *ib)
 {
-  poly_int64 res;
-  for (unsigned int i = 0; i < NUM_POLY_INT_COEFFS; ++i)
-    res.coeffs[i] = streamer_read_hwi (ib);
-  return res;
+  using coeff_type = poly_int_traits<poly_int64>::coeff_type;
+  return poly_int_read_common<coeff_type> (streamer_read_hwi, ib);
 }
 
 /* Read gcov_type value from IB.  */
diff --git a/gcc/data-streamer.cc b/gcc/data-streamer.cc
index 346b294c72ac..896413e8d2b6 100644
--- a/gcc/data-streamer.cc
+++ b/gcc/data-streamer.cc
@@ -28,6 +28,14 @@ along with GCC; see the file COPYING3.  If not see
 #include "cgraph.h"
 #include "data-streamer.h"
 
+/* For offloading: when streaming out, the host's NUM_POLY_INT_COEFFS is
+   stored at the beginning of the mode table.  When streaming in, that value
+   is read into host_num_poly_int_coeffs.  */
+
+#ifdef ACCEL_COMPILER
+unsigned host_num_poly_int_coeffs = 0;
+#endif
+
 /* Pack WORK into BP in a variant of uleb format.  */
 
 void
diff --git a/gcc/data-streamer.h b/gcc/data-streamer.h
index 6a2596134ceb..b3dc4b984767 100644
--- a/gcc/data-streamer.h
+++ b/gcc/data-streamer.h
@@ -50,6 +50,7 @@ void bp_pack_real_value (struct bitpack_d *, const REAL_VALUE_TYPE *);
 void bp_unpack_real_value (struct bitpack_d *, REAL_VALUE_TYPE *);
 unsigned HOST_WIDE_INT bp_unpack_var_len_unsigned (struct bitpack_d *);
 HOST_WIDE_INT bp_unpack_var_len_int (struct bitpack_d *);
+extern unsigned host_num_poly_int_coeffs;
 
 /* In data-streamer-out.cc  */
 void streamer_write_zero (struct output_block *);
@@ -194,15 +195,55 @@ bp_unpack_value (struct bitpack_d *bp, unsigned nbits)
   return val & mask;
 }
 
+/* Read a poly_int, calling READ_COEFF (ARGS...) for each streamed coeff.  */
+
+template<typename C, typename F, typename ...Args>
+poly_int<NUM_POLY_INT_COEFFS, C>
+poly_int_read_common (F read_coeff, Args ...args)
+{
+  poly_int<NUM_POLY_INT_COEFFS, C> x;
+  unsigned i;
+
+#ifdef ACCEL_COMPILER
+  /* host_num_poly_int_coeffs stays zero until it has been streamed in.  */
+  const unsigned num_poly_int_coeffs = host_num_poly_int_coeffs;
+  gcc_assert (host_num_poly_int_coeffs > 0);
+#else
+  const unsigned num_poly_int_coeffs = NUM_POLY_INT_COEFFS;
+#endif
+
+  if (num_poly_int_coeffs <= NUM_POLY_INT_COEFFS)
+    {
+      for (i = 0; i < num_poly_int_coeffs; i++)
+	x.coeffs[i] = read_coeff (args...);
+      for (; i < NUM_POLY_INT_COEFFS; i++)
+	x.coeffs[i] = 0;
+    }
+  else
+    {
+      for (i = 0; i < NUM_POLY_INT_COEFFS; i++)
+	x.coeffs[i] = read_coeff (args...);
+
+      /* Host coefficients beyond NUM_POLY_INT_COEFFS must all be zero.  */
+      for (; i < num_poly_int_coeffs; i++)
+	{
+	  C val = read_coeff (args...);
+	  if (val != 0)
+	    fatal_error (input_location,
+			 "degree of %<poly_int%> exceeds "
+			 "%<NUM_POLY_INT_COEFFS%> (%d)",
+			 NUM_POLY_INT_COEFFS);
+	}
+    }
+  return x;
+}
+
 /* Unpacks a polynomial value from the bit-packing context BP in which each
    coefficient has NBITS bits.  */
 inline poly_int<NUM_POLY_INT_COEFFS, bitpack_word_t>
 bp_unpack_poly_value (struct bitpack_d *bp, unsigned nbits)
 {
-  poly_int<NUM_POLY_INT_COEFFS, bitpack_word_t> x;
-  for (int i = 0; i < NUM_POLY_INT_COEFFS; ++i)
-    x.coeffs[i] = bp_unpack_value (bp, nbits);
-  return x;
+  return poly_int_read_common<bitpack_word_t> (bp_unpack_value, bp, nbits);
 }
 
 
diff --git a/gcc/lto-streamer-in.cc b/gcc/lto-streamer-in.cc
index 2e592be80823..cbf6041fd685 100644
--- a/gcc/lto-streamer-in.cc
+++ b/gcc/lto-streamer-in.cc
@@ -2013,6 +2013,11 @@ lto_input_mode_table (struct lto_file_decl_data *file_data)
 				header->string_size, vNULL);
   bitpack_d bp = streamer_read_bitpack (&ib);
 
+#ifdef ACCEL_COMPILER
+  host_num_poly_int_coeffs
+    = bp_unpack_value (&bp, MAX_NUM_POLY_INT_COEFFS_BITS);
+#endif
+
   unsigned mode_bits = bp_unpack_value (&bp, 5);
   unsigned char *table = ggc_cleared_vec_alloc<unsigned char> (1 << mode_bits);
 
diff --git a/gcc/lto-streamer-out.cc b/gcc/lto-streamer-out.cc
index c329ac8af958..523d6dad221e 100644
--- a/gcc/lto-streamer-out.cc
+++ b/gcc/lto-streamer-out.cc
@@ -3192,6 +3192,9 @@ lto_write_mode_table (void)
   ob = create_output_block (LTO_section_mode_table);
   bitpack_d bp = bitpack_create (ob->main_stream);
 
+  if (lto_stream_offload_p)
+    bp_pack_value (&bp, NUM_POLY_INT_COEFFS, MAX_NUM_POLY_INT_COEFFS_BITS);
+
   /* Ensure that for GET_MODE_INNER (m) != m we have
      also the inner mode marked.  */
   for (int i = 0; i < (int) MAX_MACHINE_MODE; i++)
diff --git a/gcc/poly-int.h b/gcc/poly-int.h
index e3f8d4df7164..947081659616 100644
--- a/gcc/poly-int.h
+++ b/gcc/poly-int.h
@@ -354,6 +354,10 @@ struct poly_result<T1, T2, 2>
    ? (void) ((RES).coeffs[I] = VALUE) \
    : (void) ((RES).coeffs[I].~C (), new (&(RES).coeffs[I]) C (VALUE)))
 
+/* Number of bits used to stream NUM_POLY_INT_COEFFS; must be able to hold
+   the largest NUM_POLY_INT_COEFFS defined by any target.  */
+#define MAX_NUM_POLY_INT_COEFFS_BITS	2
+
 /* poly_int_full and poly_int_hungry are used internally within poly_int
    for delegated initializers.  poly_int_full indicates that a parameter
    pack has enough elements to initialize every coefficient.  poly_int_hungry
diff --git a/gcc/tree-streamer-in.cc b/gcc/tree-streamer-in.cc
index c248a74f7a1a..40029437199c 100644
--- a/gcc/tree-streamer-in.cc
+++ b/gcc/tree-streamer-in.cc
@@ -671,8 +671,37 @@ static void
 lto_input_ts_poly_tree_pointers (class lto_input_block *ib,
 				 class data_in *data_in, tree expr)
 {
-  for (unsigned int i = 0; i < NUM_POLY_INT_COEFFS; ++i)
-    POLY_INT_CST_COEFF (expr, i) = stream_read_tree_ref (ib, data_in);
+#ifdef ACCEL_COMPILER
+  /* host_num_poly_int_coeffs stays zero until it has been streamed in.  */
+  const unsigned num_poly_int_coeffs = host_num_poly_int_coeffs;
+  gcc_assert (num_poly_int_coeffs > 0);
+#else
+  const unsigned num_poly_int_coeffs = NUM_POLY_INT_COEFFS;
+#endif
+
+  unsigned i;
+  if (num_poly_int_coeffs <= NUM_POLY_INT_COEFFS)
+    {
+      for (i = 0; i < num_poly_int_coeffs; i++)
+	POLY_INT_CST_COEFF (expr, i) = stream_read_tree_ref (ib, data_in);
+
+      tree coeff_type = TREE_TYPE (POLY_INT_CST_COEFF (expr, 0));
+      for (; i < NUM_POLY_INT_COEFFS; i++)
+	POLY_INT_CST_COEFF (expr, i) = build_zero_cst (coeff_type);
+    }
+  else
+    {
+      for (i = 0; i < NUM_POLY_INT_COEFFS; i++)
+	POLY_INT_CST_COEFF (expr, i) = stream_read_tree_ref (ib, data_in);
+      for (; i < num_poly_int_coeffs; i++)
+	{
+	  tree val = stream_read_tree_ref (ib, data_in);
+	  if (!integer_zerop (val))
+	    fatal_error (input_location,
+			 "degree of %<poly_int%> exceeds "
+			 "%<NUM_POLY_INT_COEFFS%> (%d)", NUM_POLY_INT_COEFFS);
+	}
+    }
 }
 
 
-- 
GitLab