diff --git a/include/cuda/cuda.h b/include/cuda/cuda.h index 09c3c2b8dbee048173780765adc90e63abb2fe74..94fc64a488d943ece763bbaa277daa3a96182f10 100644 --- a/include/cuda/cuda.h +++ b/include/cuda/cuda.h @@ -147,7 +147,7 @@ typedef struct { size_t dstXInBytes, dstY; CUmemorytype dstMemoryType; - const void *dstHost; + void *dstHost; CUdeviceptr dstDevice; CUarray dstArray; size_t dstPitch; @@ -162,16 +162,16 @@ typedef struct { const void *srcHost; CUdeviceptr srcDevice; CUarray srcArray; - void *dummy; + void *reserved0; size_t srcPitch, srcHeight; size_t dstXInBytes, dstY, dstZ; size_t dstLOD; CUmemorytype dstMemoryType; - const void *dstHost; + void *dstHost; CUdeviceptr dstDevice; CUarray dstArray; - void *dummy2; + void *reserved1; size_t dstPitch, dstHeight; size_t WidthInBytes, Height, Depth; @@ -190,7 +190,7 @@ typedef struct { size_t dstXInBytes, dstY, dstZ; size_t dstLOD; CUmemorytype dstMemoryType; - const void *dstHost; + void *dstHost; CUdeviceptr dstDevice; CUarray dstArray; CUcontext dstContext; @@ -246,6 +246,8 @@ CUresult cuMemAlloc (CUdeviceptr *, size_t); CUresult cuMemAllocHost (void **, size_t); CUresult cuMemHostAlloc (void **, size_t, unsigned int); CUresult cuMemcpy (CUdeviceptr, CUdeviceptr, size_t); +CUresult cuMemcpyPeer (CUdeviceptr, CUcontext, CUdeviceptr, CUcontext, size_t); +CUresult cuMemcpyPeerAsync (CUdeviceptr, CUcontext, CUdeviceptr, CUcontext, size_t, CUstream); #define cuMemcpyDtoDAsync cuMemcpyDtoDAsync_v2 CUresult cuMemcpyDtoDAsync (CUdeviceptr, CUdeviceptr, size_t, CUstream); #define cuMemcpyDtoH cuMemcpyDtoH_v2 diff --git a/libgomp/plugin/plugin-nvptx.c b/libgomp/plugin/plugin-nvptx.c index 9cdc55cac6b077cce65c8ff011369d3eb895f28a..00d4241ae02bf3f56a85b924a4b2288b66f8e4d5 100644 --- a/libgomp/plugin/plugin-nvptx.c +++ b/libgomp/plugin/plugin-nvptx.c @@ -1794,6 +1794,8 @@ GOMP_OFFLOAD_memcpy2d (int dst_ord, int src_ord, size_t dim1_size, /* TODO: Consider using CU_MEMORYTYPE_UNIFIED if supported. */ CUDA_MEMCPY2D data; + + memset (&data, 0, sizeof (data)); data.WidthInBytes = dim1_size; data.Height = dim0_len; @@ -1855,6 +1857,8 @@ GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size, /* TODO: Consider using CU_MEMORYTYPE_UNIFIED if supported. */ CUDA_MEMCPY3D data; + + memset (&data, 0, sizeof (data)); data.WidthInBytes = dim2_size; data.Height = dim1_len; data.Depth = dim0_len; @@ -1874,7 +1878,6 @@ GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size, data.dstXInBytes = dst_offset2_size; data.dstY = dst_offset1_len; data.dstZ = dst_offset0_len; - data.dstLOD = 0; if (src_ord == -1) { @@ -1891,7 +1894,6 @@ GOMP_OFFLOAD_memcpy3d (int dst_ord, int src_ord, size_t dim2_size, data.srcXInBytes = src_offset2_size; data.srcY = src_offset1_len; data.srcZ = src_offset0_len; - data.srcLOD = 0; CUDA_CALL (cuMemcpy3D, &data); return true; diff --git a/libgomp/target.c b/libgomp/target.c index 5cf2e8dce3733ab5af34ef508df269894f37d472..cd4cc1b01ca778d1e94df13a2057479bde31ddb9 100644 --- a/libgomp/target.c +++ b/libgomp/target.c @@ -4540,33 +4540,22 @@ omp_target_memcpy_rect_worker (void *dst, const void *src, size_t element_size, || __builtin_mul_overflow (element_size, dst_offsets[0], &dst_off) || __builtin_mul_overflow (element_size, src_offsets[0], &src_off)) return EINVAL; - if (src_devicep != NULL && src_devicep == dst_devicep) - ret = src_devicep->dev2dev_func (src_devicep->target_id, - (char *) dst + dst_off, - (const char *) src + src_off, - length); - else if (src_devicep != NULL - && (dst_devicep == NULL - || (dst_devicep->capabilities - & GOMP_OFFLOAD_CAP_SHARED_MEM))) - ret = src_devicep->dev2host_func (src_devicep->target_id, + if (dst_devicep == NULL && src_devicep == NULL) + { + memcpy ((char *) dst + dst_off, (const char *) src + src_off, + length); + ret = 1; + } + else if (src_devicep == NULL) + ret = dst_devicep->host2dev_func (dst_devicep->target_id, (char *) dst + dst_off, (const char *) src + src_off, length); - else if (dst_devicep != NULL - && (src_devicep == NULL - || (src_devicep->capabilities - & GOMP_OFFLOAD_CAP_SHARED_MEM))) - ret = dst_devicep->host2dev_func (dst_devicep->target_id, + else if (dst_devicep == NULL) + ret = src_devicep->dev2host_func (src_devicep->target_id, (char *) dst + dst_off, (const char *) src + src_off, length); - else if (dst_devicep == NULL && src_devicep == NULL) - { - memcpy ((char *) dst + dst_off, (const char *) src + src_off, - length); - ret = 1; - } else if (src_devicep == dst_devicep) ret = src_devicep->dev2dev_func (src_devicep->target_id, (char *) dst + dst_off, @@ -4584,7 +4573,8 @@ omp_target_memcpy_rect_worker (void *dst, const void *src, size_t element_size, else if (*tmp_size < length) { *tmp_size = length; - *tmp = realloc (*tmp, length); + free (*tmp); + *tmp = malloc (length); if (*tmp == NULL) return ENOMEM; } @@ -4599,7 +4589,7 @@ omp_target_memcpy_rect_worker (void *dst, const void *src, size_t element_size, return ret ? 0 : EINVAL; } - /* host->device, device->host and same-device device->device. */ + /* host->device, device->host and intra device. */ if (num_dims == 2 && ((src_devicep && src_devicep == dst_devicep @@ -4711,16 +4701,8 @@ omp_target_memcpy_rect_copy (void *dst, const void *src, bool lock_src; bool lock_dst; - lock_src = (src_devicep - && (!dst_devicep - || src_devicep == dst_devicep - || !(src_devicep->capabilities - & GOMP_OFFLOAD_CAP_SHARED_MEM))); - lock_dst = (dst_devicep - && (!lock_src - || (src_devicep != dst_devicep - && !(dst_devicep->capabilities - & GOMP_OFFLOAD_CAP_SHARED_MEM)))); + lock_src = src_devicep != NULL; + lock_dst = dst_devicep != NULL && src_devicep != dst_devicep; if (lock_src) gomp_mutex_lock (&src_devicep->lock); if (lock_dst) @@ -5076,8 +5058,8 @@ gomp_load_plugin_for_device (struct gomp_device_descr *device, DLSYM (free); DLSYM (dev2host); DLSYM (host2dev); - DLSYM (memcpy2d); - DLSYM (memcpy3d); + DLSYM_OPT (memcpy2d, memcpy2d); + DLSYM_OPT (memcpy3d, memcpy3d); device->capabilities = device->get_caps_func (); if (device->capabilities & GOMP_OFFLOAD_CAP_OPENMP_400) {