From b4557008c45c0a44ae848c71d31ce4ed6316d043 Mon Sep 17 00:00:00 2001
From: Chung-Lin Tang <cltang@codesourcery.com>
Date: Thu, 26 May 2016 13:28:25 +0000
Subject: [PATCH] oacc-plugin.h (GOMP_PLUGIN_async_unmap_vars): Add int
 parameter.

2016-05-26  Chung-Lin Tang  <cltang@codesourcery.com>

	libgomp/
	* oacc-plugin.h (GOMP_PLUGIN_async_unmap_vars): Add int parameter.
	* oacc-plugin.c (GOMP_PLUGIN_async_unmap_vars): Add 'int async'
	parameter, use to set async stream around call to gomp_unmap_vars,
	call gomp_unmap_vars() with 'do_copyfrom' set to true.
	* plugin/plugin-nvptx.c (struct ptx_event): Add 'int val' field.
	(event_gc): Adjust event handling loop, collect PTX_EVT_ASYNC_CLEANUP
	events and call GOMP_PLUGIN_async_unmap_vars() for each of them.
	(event_add): Add int parameter, initialize 'val' field when
	adding new ptx_event struct.
	(nvptx_exec): Adjust event_add() call arguments.
	(nvptx_host2dev): Likewise.
	(nvptx_dev2host): Likewise.
	(nvptx_wait_async): Likewise.
	(nvptx_wait_all_async): Likewise.
	(GOMP_OFFLOAD_openacc_register_async_cleanup): Add async parameter,
	pass to event_add() call.
	* oacc-host.c (host_openacc_register_async_cleanup): Add 'int async'
	parameter.
	* oacc-mem.c (gomp_acc_remove_pointer): Adjust async case to
	call openacc.register_async_cleanup_func() hook.
	* oacc-parallel.c (GOACC_parallel_keyed): Likewise.
	* target.c (gomp_copy_from_async): Delete function.
	(gomp_map_vars): Remove async_refcount.
	(gomp_unmap_vars): Likewise.
	(gomp_load_image_to_device): Likewise.
	(omp_target_associate_ptr): Likewise.
	* libgomp.h (struct splay_tree_key_s): Remove async_refcount.
	(acc_dispatch_t.register_async_cleanup_func): Add int parameter.
	(gomp_copy_from_async): Remove.

From-SVN: r236772
---
 libgomp/ChangeLog             | 32 +++++++++++++++++++++++
 libgomp/libgomp.h             |  5 +---
 libgomp/oacc-host.c           |  3 ++-
 libgomp/oacc-mem.c            |  5 +---
 libgomp/oacc-parallel.c       |  5 +---
 libgomp/oacc-plugin.c         |  7 +++--
 libgomp/oacc-plugin.h         |  2 +-
 libgomp/plugin/plugin-nvptx.c | 41 ++++++++++++++++++++++-------
 libgomp/target.c              | 49 +++--------------------------------
 9 files changed, 77 insertions(+), 72 deletions(-)

diff --git a/libgomp/ChangeLog b/libgomp/ChangeLog
index e2496ff5833f..b4ae304dabb9 100644
--- a/libgomp/ChangeLog
+++ b/libgomp/ChangeLog
@@ -1,3 +1,35 @@
+2016-05-26  Chung-Lin Tang  <cltang@codesourcery.com>
+
+	* oacc-plugin.h (GOMP_PLUGIN_async_unmap_vars): Add int parameter.
+	* oacc-plugin.c (GOMP_PLUGIN_async_unmap_vars): Add 'int async'
+	parameter, use to set async stream around call to gomp_unmap_vars,
+	call gomp_unmap_vars() with 'do_copyfrom' set to true.
+	* plugin/plugin-nvptx.c (struct ptx_event): Add 'int val' field.
+	(event_gc): Adjust event handling loop, collect PTX_EVT_ASYNC_CLEANUP
+	events and call GOMP_PLUGIN_async_unmap_vars() for each of them.
+	(event_add): Add int parameter, initialize 'val' field when
+	adding new ptx_event struct.
+	(nvptx_exec): Adjust event_add() call arguments.
+	(nvptx_host2dev): Likewise.
+	(nvptx_dev2host): Likewise.
+	(nvptx_wait_async): Likewise.
+	(nvptx_wait_all_async): Likewise.
+	(GOMP_OFFLOAD_openacc_register_async_cleanup): Add async parameter,
+	pass to event_add() call.
+	* oacc-host.c (host_openacc_register_async_cleanup): Add 'int async'
+	parameter.
+	* oacc-mem.c (gomp_acc_remove_pointer): Adjust async case to
+	call openacc.register_async_cleanup_func() hook.
+	* oacc-parallel.c (GOACC_parallel_keyed): Likewise.
+	* target.c (gomp_copy_from_async): Delete function.
+	(gomp_map_vars): Remove async_refcount.
+	(gomp_unmap_vars): Likewise.
+	(gomp_load_image_to_device): Likewise.
+	(omp_target_associate_ptr): Likewise.
+	* libgomp.h (struct splay_tree_key_s): Remove async_refcount.
+	(acc_dispatch_t.register_async_cleanup_func): Add int parameter.
+	(gomp_copy_from_async): Remove.
+
 2016-05-26  Chung-Lin Tang  <cltang@codesourcery.com>
 
 	* target.c (gomp_device_copy): New function.
diff --git a/libgomp/libgomp.h b/libgomp/libgomp.h
index f0c048b151bc..7b2671ba49dd 100644
--- a/libgomp/libgomp.h
+++ b/libgomp/libgomp.h
@@ -835,8 +835,6 @@ struct splay_tree_key_s {
   uintptr_t tgt_offset;
   /* Reference count.  */
   uintptr_t refcount;
-  /* Asynchronous reference count.  */
-  uintptr_t async_refcount;
   /* Pointer to the original mapping of "omp declare target link" object.  */
   splay_tree_key link_key;
 };
@@ -872,7 +870,7 @@ typedef struct acc_dispatch_t
 		     unsigned *, void *);
 
   /* Async cleanup callback registration.  */
-  void (*register_async_cleanup_func) (void *);
+  void (*register_async_cleanup_func) (void *, int);
 
   /* Asynchronous routines.  */
   int (*async_test_func) (int);
@@ -977,7 +975,6 @@ extern struct target_mem_desc *gomp_map_vars (struct gomp_device_descr *,
 					      size_t, void **, void **,
 					      size_t *, void *, bool,
 					      enum gomp_map_vars_kind);
-extern void gomp_copy_from_async (struct target_mem_desc *);
 extern void gomp_unmap_vars (struct target_mem_desc *, bool);
 extern void gomp_init_device (struct gomp_device_descr *);
 extern void gomp_free_memmap (struct splay_tree_s *);
diff --git a/libgomp/oacc-host.c b/libgomp/oacc-host.c
index a24899c7f60c..fd3a672e4a55 100644
--- a/libgomp/oacc-host.c
+++ b/libgomp/oacc-host.c
@@ -148,7 +148,8 @@ host_openacc_exec (void (*fn) (void *),
 }
 
 static void
-host_openacc_register_async_cleanup (void *targ_mem_desc __attribute__ ((unused)))
+host_openacc_register_async_cleanup (void *targ_mem_desc __attribute__ ((unused)),
+				     int async __attribute__ ((unused)))
 {
 }
 
diff --git a/libgomp/oacc-mem.c b/libgomp/oacc-mem.c
index 2aaa0d295cbe..bd4b62b006e8 100644
--- a/libgomp/oacc-mem.c
+++ b/libgomp/oacc-mem.c
@@ -704,10 +704,7 @@ gomp_acc_remove_pointer (void *h, bool force_copyfrom, int async, int mapnum)
   if (async < acc_async_noval)
     gomp_unmap_vars (t, true);
   else
-    {
-      gomp_copy_from_async (t);
-      acc_dev->openacc.register_async_cleanup_func (t);
-    }
+    t->device_descr->openacc.register_async_cleanup_func (t, async);
 
   gomp_debug (0, "  %s: mappings restored\n", __FUNCTION__);
 }
diff --git a/libgomp/oacc-parallel.c b/libgomp/oacc-parallel.c
index 1fdb01d927b8..ecdd75adcb81 100644
--- a/libgomp/oacc-parallel.c
+++ b/libgomp/oacc-parallel.c
@@ -186,10 +186,7 @@ GOACC_parallel_keyed (int device, void (*fn) (void *),
   if (async < acc_async_noval)
     gomp_unmap_vars (tgt, true);
   else
-    {
-      gomp_copy_from_async (tgt);
-      acc_dev->openacc.register_async_cleanup_func (tgt);
-    }
+    tgt->device_descr->openacc.register_async_cleanup_func (tgt, async);
 
   acc_dev->openacc.async_set_async_func (acc_async_sync);
 }
diff --git a/libgomp/oacc-plugin.c b/libgomp/oacc-plugin.c
index 54d8840b27df..889d86c8c7a8 100644
--- a/libgomp/oacc-plugin.c
+++ b/libgomp/oacc-plugin.c
@@ -31,11 +31,14 @@
 #include "oacc-int.h"
 
 void
-GOMP_PLUGIN_async_unmap_vars (void *ptr)
+GOMP_PLUGIN_async_unmap_vars (void *ptr, int async)
 {
   struct target_mem_desc *tgt = ptr;
+  struct gomp_device_descr *devicep = tgt->device_descr;
 
-  gomp_unmap_vars (tgt, false);
+  devicep->openacc.async_set_async_func (async);
+  gomp_unmap_vars (tgt, true);
+  devicep->openacc.async_set_async_func (acc_async_sync);
 }
 
 /* Return the target-specific part of the TLS data for the current thread.  */
diff --git a/libgomp/oacc-plugin.h b/libgomp/oacc-plugin.h
index d2e4fbff01fa..57fced5a6e45 100644
--- a/libgomp/oacc-plugin.h
+++ b/libgomp/oacc-plugin.h
@@ -27,7 +27,7 @@
 #ifndef OACC_PLUGIN_H
 #define OACC_PLUGIN_H 1
 
-extern void GOMP_PLUGIN_async_unmap_vars (void *);
+extern void GOMP_PLUGIN_async_unmap_vars (void *, int);
 extern void *GOMP_PLUGIN_acc_thread (void);
 
 #endif
diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c
index 2b6a888cbd2d..327500c01aa7 100644
--- a/libgomp/plugin/plugin-nvptx.c
+++ b/libgomp/plugin/plugin-nvptx.c
@@ -329,6 +329,7 @@ struct ptx_event
   int type;
   void *addr;
   int ord;
+  int val;
 
   struct ptx_event *next;
 };
@@ -789,6 +790,7 @@ static void
 event_gc (bool memmap_lockable)
 {
   struct ptx_event *ptx_event = ptx_events;
+  struct ptx_event *async_cleanups = NULL;
   struct nvptx_thread *nvthd = nvptx_thread ();
 
   pthread_mutex_lock (&ptx_event_lock);
@@ -806,6 +808,7 @@ event_gc (bool memmap_lockable)
       r = cuEventQuery (*e->evt);
       if (r == CUDA_SUCCESS)
 	{
+	  bool append_async = false;
 	  CUevent *te;
 
 	  te = e->evt;
@@ -830,7 +833,7 @@ event_gc (bool memmap_lockable)
 		if (!memmap_lockable)
 		  continue;
 
-		GOMP_PLUGIN_async_unmap_vars (e->addr);
+		append_async = true;
 	      }
 	      break;
 	    }
@@ -838,6 +841,7 @@ event_gc (bool memmap_lockable)
 	  cuEventDestroy (*te);
 	  free ((void *)te);
 
+	  /* Unlink 'e' from ptx_events list.  */
 	  if (ptx_events == e)
 	    ptx_events = ptx_events->next;
 	  else
@@ -848,15 +852,31 @@ event_gc (bool memmap_lockable)
 	      e_->next = e_->next->next;
 	    }
 
-	  free (e);
+	  if (append_async)
+	    {
+	      e->next = async_cleanups;
+	      async_cleanups = e;
+	    }
+	  else
+	    free (e);
 	}
     }
 
   pthread_mutex_unlock (&ptx_event_lock);
+
+  /* We have to do these here, after ptx_event_lock is released.  */
+  while (async_cleanups)
+    {
+      struct ptx_event *e = async_cleanups;
+      async_cleanups = async_cleanups->next;
+
+      GOMP_PLUGIN_async_unmap_vars (e->addr, e->val);
+      free (e);
+    }
 }
 
 static void
-event_add (enum ptx_event_type type, CUevent *e, void *h)
+event_add (enum ptx_event_type type, CUevent *e, void *h, int val)
 {
   struct ptx_event *ptx_event;
   struct nvptx_thread *nvthd = nvptx_thread ();
@@ -869,6 +889,7 @@ event_add (enum ptx_event_type type, CUevent *e, void *h)
   ptx_event->evt = e;
   ptx_event->addr = h;
   ptx_event->ord = nvthd->ptx_dev->ord;
+  ptx_event->val = val;
 
   pthread_mutex_lock (&ptx_event_lock);
 
@@ -975,7 +996,7 @@ nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
 
       CUDA_CALL_ASSERT (cuEventRecord, *e, dev_str->stream);
 
-      event_add (PTX_EVT_KNL, e, (void *)dev_str);
+      event_add (PTX_EVT_KNL, e, (void *)dev_str, 0);
     }
 #else
   r = cuCtxSynchronize ();
@@ -1071,7 +1092,7 @@ nvptx_host2dev (void *d, const void *h, size_t s)
       CUDA_CALL (cuMemcpyHtoDAsync,
 		 (CUdeviceptr) d, h, s, nvthd->current_stream->stream);
       CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
-      event_add (PTX_EVT_MEM, e, (void *)h);
+      event_add (PTX_EVT_MEM, e, (void *)h, 0);
     }
   else
 #endif
@@ -1127,7 +1148,7 @@ nvptx_dev2host (void *h, const void *d, size_t s)
       CUDA_CALL (cuMemcpyDtoHAsync,
 		 h, (CUdeviceptr) d, s, nvthd->current_stream->stream);
       CUDA_CALL (cuEventRecord, *e, nvthd->current_stream->stream);
-      event_add (PTX_EVT_MEM, e, (void *)h);
+      event_add (PTX_EVT_MEM, e, (void *)h, 0);
     }
   else
 #endif
@@ -1240,7 +1261,7 @@ nvptx_wait_async (int async1, int async2)
 
   CUDA_CALL_ASSERT (cuEventRecord, *e, s1->stream);
 
-  event_add (PTX_EVT_SYNC, e, NULL);
+  event_add (PTX_EVT_SYNC, e, NULL, 0);
 
   CUDA_CALL_ASSERT (cuStreamWaitEvent, s2->stream, *e, 0);
 }
@@ -1313,7 +1334,7 @@ nvptx_wait_all_async (int async)
       /* Record an event on the waited-for stream.  */
       CUDA_CALL_ASSERT (cuEventRecord, *e, other_stream->stream);
 
-      event_add (PTX_EVT_SYNC, e, NULL);
+      event_add (PTX_EVT_SYNC, e, NULL, 0);
 
       CUDA_CALL_ASSERT (cuStreamWaitEvent, waiting_stream->stream, *e, 0);
    }
@@ -1646,14 +1667,14 @@ GOMP_OFFLOAD_openacc_parallel (void (*fn) (void *), size_t mapnum,
 }
 
 void
-GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc)
+GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc, int async)
 {
   struct nvptx_thread *nvthd = nvptx_thread ();
   CUevent *e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));
 
   CUDA_CALL_ASSERT (cuEventCreate, e, CU_EVENT_DISABLE_TIMING);
   CUDA_CALL_ASSERT (cuEventRecord, *e, nvthd->current_stream->stream);
-  event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc);
+  event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc, async);
 }
 
 int
diff --git a/libgomp/target.c b/libgomp/target.c
index 5a86fc077e6e..48b9ab8e0767 100644
--- a/libgomp/target.c
+++ b/libgomp/target.c
@@ -707,7 +707,6 @@ gomp_map_vars (struct gomp_device_descr *devicep, size_t mapnum,
 		tgt->list[i].offset = 0;
 		tgt->list[i].length = k->host_end - k->host_start;
 		k->refcount = 1;
-		k->async_refcount = 0;
 		tgt->refcount++;
 		array->left = NULL;
 		array->right = NULL;
@@ -854,43 +853,9 @@ gomp_unmap_tgt (struct target_mem_desc *tgt)
   free (tgt);
 }
 
-/* Decrease the refcount for a set of mapped variables, and queue asychronous
-   copies from the device back to the host after any work that has been issued.
-   Because the regions are still "live", increment an asynchronous reference
-   count to indicate that they should not be unmapped from host-side data
-   structures until the asynchronous copy has completed.  */
-
-attribute_hidden void
-gomp_copy_from_async (struct target_mem_desc *tgt)
-{
-  struct gomp_device_descr *devicep = tgt->device_descr;
-  size_t i;
-
-  gomp_mutex_lock (&devicep->lock);
-
-  for (i = 0; i < tgt->list_count; i++)
-    if (tgt->list[i].key == NULL)
-      ;
-    else if (tgt->list[i].key->refcount > 1)
-      {
-	tgt->list[i].key->refcount--;
-	tgt->list[i].key->async_refcount++;
-      }
-    else
-      {
-	splay_tree_key k = tgt->list[i].key;
-	if (tgt->list[i].copy_from)
-	  gomp_copy_dev2host (devicep, (void *) k->host_start,
-			      (void *) (k->tgt->tgt_start + k->tgt_offset),
-			      k->host_end - k->host_start);
-      }
-
-  gomp_mutex_unlock (&devicep->lock);
-}
-
 /* Unmap variables described by TGT.  If DO_COPYFROM is true, copy relevant
    variables back from device to host: if it is false, it is assumed that this
-   has been done already, i.e. by gomp_copy_from_async above.  */
+   has been done already.  */
 
 attribute_hidden void
 gomp_unmap_vars (struct target_mem_desc *tgt, bool do_copyfrom)
@@ -924,13 +889,8 @@ gomp_unmap_vars (struct target_mem_desc *tgt, bool do_copyfrom)
 	k->refcount--;
       else if (k->refcount == 1)
 	{
-	  if (k->async_refcount > 0)
-	    k->async_refcount--;
-	  else
-	    {
-	      k->refcount--;
-	      do_unmap = true;
-	    }
+	  k->refcount--;
+	  do_unmap = true;
 	}
 
       if ((do_unmap && do_copyfrom && tgt->list[i].copy_from)
@@ -1076,7 +1036,6 @@ gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version,
       k->tgt = tgt;
       k->tgt_offset = target_table[i].start;
       k->refcount = REFCOUNT_INFINITY;
-      k->async_refcount = 0;
       k->link_key = NULL;
       array->left = NULL;
       array->right = NULL;
@@ -1109,7 +1068,6 @@ gomp_load_image_to_device (struct gomp_device_descr *devicep, unsigned version,
       k->tgt = tgt;
       k->tgt_offset = target_var->start;
       k->refcount = target_size & link_bit ? REFCOUNT_LINK : REFCOUNT_INFINITY;
-      k->async_refcount = 0;
       k->link_key = NULL;
       array->left = NULL;
       array->right = NULL;
@@ -2332,7 +2290,6 @@ omp_target_associate_ptr (void *host_ptr, void *device_ptr, size_t size,
       k->tgt = tgt;
       k->tgt_offset = (uintptr_t) device_ptr + device_offset;
       k->refcount = REFCOUNT_INFINITY;
-      k->async_refcount = 0;
       array->left = NULL;
       array->right = NULL;
       splay_tree_insert (&devicep->mem_map, array);
-- 
GitLab