cuda::memcpy_async
Defined in header <cuda/barrier>:
// (1)
template <typename Shape, cuda::thread_scope Scope, typename CompletionFunction>
__host__ __device__
void cuda::memcpy_async(void* destination, void const* source, Shape size,
cuda::barrier<Scope, CompletionFunction>& barrier);
// (2)
template <typename Group,
typename Shape, cuda::thread_scope Scope, typename CompletionFunction>
__host__ __device__
void cuda::memcpy_async(Group const& group,
void* destination, void const* source, Shape size,
cuda::barrier<Scope, CompletionFunction>& barrier);
Defined in header <cuda/pipeline>:
// (3)
template <typename Shape, cuda::thread_scope Scope>
__host__ __device__
void cuda::memcpy_async(void* destination, void const* source, Shape size,
cuda::pipeline<Scope>& pipeline);
// (4)
template <typename Group, typename Shape, cuda::thread_scope Scope>
__host__ __device__
void cuda::memcpy_async(Group const& group,
void* destination, void const* source, Shape size,
cuda::pipeline<Scope>& pipeline);
Defined in header <cuda/annotated_ptr>:
// (5)
template <typename Dst, typename Src, typename SrcProperty, typename Shape, typename Sync>
__host__ __device__
void memcpy_async(Dst* dst, cuda::annotated_ptr<Src, SrcProperty> src, Shape size, Sync& sync);
// (6)
template<typename Dst, typename DstProperty, typename Src, typename SrcProperty, typename Shape, typename Sync>
__host__ __device__
void memcpy_async(cuda::annotated_ptr<Dst, DstProperty> dst, cuda::annotated_ptr<Src, SrcProperty> src, Shape size, Sync& sync);
// (7)
template<typename Group, typename Dst, typename Src, typename SrcProperty, typename Shape, typename Sync>
__host__ __device__
void memcpy_async(Group const& group, Dst* dst, cuda::annotated_ptr<Src, SrcProperty> src, Shape size, Sync& sync);
// (8)
template<typename Group, typename Dst, typename DstProperty, typename Src, typename SrcProperty, typename Shape, typename Sync>
__host__ __device__
void memcpy_async(Group const& group, cuda::annotated_ptr<Dst, DstProperty> dst, cuda::annotated_ptr<Src, SrcProperty> src, Shape size, Sync& sync);
cuda::memcpy_async asynchronously copies size bytes from the
memory location pointed to by source to the memory location pointed
to by destination. Both objects are reinterpreted as arrays of
unsigned char.
Binds the asynchronous copy completion to
cuda::barrierand issues the copy in the current thread.Binds the asynchronous copy completion to
cuda::barrierand cooperatively issues the copy across all threads ingroup.Binds the asynchronous copy completion to
cuda::pipelineand issues the copy in the current thread.Binds the asynchronous copy completion to
cuda::pipelineand cooperatively issues the copy across all threads ingroup.5-8: convenience wrappers using
cuda::annotated_ptrwhereSyncis eithercuda::barrierorcuda::pipeline.
Notes
cuda::memcpy_async have similar constraints to std::memcpy,
namely:
If the objects overlap, the behavior is undefined.
If either
destinationorsourceis an invalid or null pointer, the behavior is undefined (even ifcountis zero).If the objects are potentially-overlapping the behavior is undefined.
If the objects are not of TriviallyCopyable type the program is ill-formed, no diagnostic required.
If Shape is cuda::aligned_size_t,
sourceanddestinationare both required to be aligned oncuda::aligned_size_t::align, else the behavior is undefined.If
cuda::pipelineis in a quitted state (see cuda::pipeline::quit), the behavior is undefined.For cooperative variants, if the parameters are not the same across all threads in
group, the behavior is undefined.
Template Parameters
|
A type satisfying the [Group] concept. |
|
Either cuda::std::size_t or cuda::aligned_size_t. |
Parameters
|
The group of threads. |
|
Pointer to the memory location to copy to. |
|
Pointer to the memory location to copy from. |
|
The number of bytes to copy. |
|
The barrier object used to wait on the copy completion. |
|
The pipeline object used to wait on the copy completion. |
Examples
#include <cuda/barrier>
__global__ void example_kernel(char* dst, char* src) {
cuda::barrier<cuda::thread_scope_system> bar;
init(&bar, 1);
cuda::memcpy_async(dst, src, 1, bar);
cuda::memcpy_async(dst + 1, src + 8, 1, bar);
cuda::memcpy_async(dst + 2, src + 16, 1, bar);
cuda::memcpy_async(dst + 3, src + 24, 1, bar);
bar.arrive_and_wait();
}