7#ifndef HEFFTE_BACKEND_ONEAPI_H
8#define HEFFTE_BACKEND_ONEAPI_H
10#include "heffte_r2r_executor.h"
12#ifdef Heffte_ENABLE_ONEAPI
14#include "heffte_backend_vector.h"
16#include <sycl/sycl.hpp>
17#include "oneapi/mkl.hpp"
18#include "oneapi/mkl/dfti.hpp"
20#ifdef Heffte_ENABLE_MAGMA
52 return sycl::queue{ sycl::gpu_selector_v };
53 }
catch(sycl::exception
const&){
54 return sycl::queue{ sycl::cpu_selector_v };
75 template<
typename precision_type,
typename index>
83 template<
typename precision_type,
typename index>
90 template<
typename scalar_type,
typename index>
99 template<
typename scalar_type,
typename index>
107 template<
typename scalar_type,
typename index>
115 template<
typename scalar_type,
typename index>
117 index buff_line_stride, index buff_plane_stride,
int map0,
int map1,
int map2,
126 template<
typename precision>
129 template<
typename precision>
132 template<
typename precision>
135 template<
typename precision>
148 template<
typename precision>
151 template<
typename precision>
154 template<
typename precision>
157 template<
typename precision>
199 sycl::queue&
stream()
const{
return _stream; }
226 template<
typename scalar_type>
233 template<
typename scalar_type>
235 if (pntr ==
nullptr)
return;
236 sycl::free(pntr, stream);
239 template<
typename scalar_type>
244 template<
typename scalar_type>
249 template<
typename scalar_type>
254 template<
typename scalar_type>
259 template<
typename scalar_type>
264 template<
typename scalar_type>
279 template<
typename T>
using container = heffte::gpu::device_vector<T, data_manipulator<tag::gpu>>;
290 template<
typename T>
using container = heffte::gpu::device_vector<T, data_manipulator<tag::gpu>>;
301 template<
typename T>
using container = heffte::gpu::device_vector<T, data_manipulator<tag::gpu>>;
366 if (!events.empty()){
373 std::vector<sycl::event> events;
392 template<
typename index>
400 block_stride(
box.osize(0) *
box.osize(1)),
401 total_size(
box.count()),
402 embed({0,
static_cast<MKL_LONG>(stride), 0}),
403 init_cplan(
false), init_zplan(
false),
404 cplan(size), zplan(size)
407 template<
typename index>
411 blocks(1), block_stride(0), total_size(
box.count()),
413 cplan({size, size2}), zplan({size, size2})
415 int odir1 = box.find_order(dir1);
416 int odir2 = box.find_order(dir2);
418 if (std::min(odir1, odir2) == 0 and std::max(odir1, odir2) == 1){
421 embed = {0,
static_cast<MKL_LONG
>(stride),
static_cast<MKL_LONG
>(size)};
422 howmanyffts = box.size[2];
423 }
else if (std::min(odir1, odir2) == 1 and std::max(odir1, odir2) == 2){
424 stride = box.size[0];
426 embed = {0,
static_cast<MKL_LONG
>(stride),
static_cast<MKL_LONG
>(size) *
static_cast<MKL_LONG
>(stride)};
427 howmanyffts = box.size[0];
431 embed = {0,
static_cast<MKL_LONG
>(stride),
static_cast<MKL_LONG
>(box.size[1]) *
static_cast<MKL_LONG
>(box.size[0])};
432 howmanyffts = box.size[1];
436 template<
typename index>
439 size(
box.size[0]), size2(
box.size[1]), howmanyffts(
box.size[2]),
441 blocks(1), block_stride(0), total_size(
box.count()),
443 cplan({howmanyffts, size2, size}), zplan({howmanyffts, size2, size})
447 void forward(std::complex<float>
data[], std::complex<float>*)
const override{
448 if (
not init_cplan) make_plan(cplan);
449 for(
int i=0;
i<blocks;
i++)
450 chainer.
keep(oneapi::mkl::dft::compute_forward(cplan,
data +
i * block_stride, chainer.
get_events()));
454 void backward(std::complex<float>
data[], std::complex<float>*)
const override{
455 if (
not init_cplan) make_plan(cplan);
456 for(
int i=0;
i<blocks;
i++)
457 chainer.
keep(oneapi::mkl::dft::compute_backward(cplan,
data +
i * block_stride, chainer.
get_events()));
461 void forward(std::complex<double>
data[], std::complex<double>*)
const override{
462 if (
not init_zplan) make_plan(zplan);
463 for(
int i=0;
i<blocks;
i++)
464 chainer.
keep(oneapi::mkl::dft::compute_forward(zplan,
data +
i * block_stride, chainer.
get_events()));
468 void backward(std::complex<double>
data[], std::complex<double>*)
const override{
469 if (
not init_zplan) make_plan(zplan);
470 for(
int i=0;
i<blocks;
i++)
471 chainer.
keep(oneapi::mkl::dft::compute_backward(zplan,
data +
i * block_stride, chainer.
get_events()));
476 void forward(
float const indata[], std::complex<float>
outdata[], std::complex<float> *workspace)
const override{
486 void forward(
double const indata[], std::complex<double>
outdata[], std::complex<double> *workspace)
const override{
497 int box_size()
const override{
return total_size; }
503 template<
typename onemkl_plan_type>
506 plan.set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, 1);
507 plan.set_value(oneapi::mkl::dft::config_param::PLACEMENT,
DFTI_INPLACE);
508 }
else if (size2 == 0){
509 plan.set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, (MKL_LONG) howmanyffts);
510 plan.set_value(oneapi::mkl::dft::config_param::PLACEMENT, DFTI_INPLACE);
511 plan.set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, embed.data());
512 plan.set_value(oneapi::mkl::dft::config_param::BWD_STRIDES, embed.data());
513 plan.set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, (MKL_LONG) dist);
514 plan.set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, (MKL_LONG) dist);
516 plan.set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, (MKL_LONG) howmanyffts);
517 plan.set_value(oneapi::mkl::dft::config_param::PLACEMENT, DFTI_INPLACE);
518 plan.set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, embed.data());
519 plan.set_value(oneapi::mkl::dft::config_param::BWD_STRIDES, embed.data());
520 plan.set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, (MKL_LONG) dist);
521 plan.set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, (MKL_LONG) dist);
527 if (std::is_same<oneapi::mkl::dft::descriptor<oneapi::mkl::dft::precision::SINGLE, oneapi::mkl::dft::domain::COMPLEX>, onemkl_plan_type>::value)
534 int size, size2, howmanyffts, stride, dist, blocks, block_stride, total_size;
535 std::array<MKL_LONG, 3> embed;
537 mutable event_chainer chainer;
538 mutable bool init_cplan, init_zplan;
539 mutable oneapi::mkl::dft::descriptor<oneapi::mkl::dft::precision::SINGLE, oneapi::mkl::dft::domain::COMPLEX> cplan;
540 mutable oneapi::mkl::dft::descriptor<oneapi::mkl::dft::precision::DOUBLE, oneapi::mkl::dft::domain::COMPLEX> zplan;
562 template<
typename index>
571 rblock_stride(
box.osize(0) *
box.osize(1)),
572 cblock_stride(
box.osize(0) * (
box.osize(1)/2 + 1)),
576 splan(size), dplan(size)
581 if (
not init_splan) make_plan(splan);
582 for(
int i=0;
i<blocks;
i++)
583 chainer.
keep(oneapi::mkl::dft::compute_forward(splan,
const_cast<float*
>(
indata +
i * rblock_stride),
reinterpret_cast<float*
>(
outdata +
i * cblock_stride), chainer.
get_events()));
588 if (
not init_splan) make_plan(splan);
589 for(
int i=0;
i<blocks;
i++)
590 chainer.
keep(oneapi::mkl::dft::compute_backward(splan,
reinterpret_cast<float*
>(
const_cast<std::complex<float>*
>(
indata +
i * cblock_stride)),
outdata +
i * rblock_stride, chainer.
get_events()));
595 if (
not init_dplan) make_plan(dplan);
596 for(
int i=0;
i<blocks;
i++)
597 chainer.
keep(oneapi::mkl::dft::compute_forward(dplan,
const_cast<double*
>(
indata +
i * rblock_stride),
reinterpret_cast<double*
>(
outdata +
i * cblock_stride), chainer.
get_events()));
602 if (
not init_dplan) make_plan(dplan);
603 for(
int i=0;
i<blocks;
i++)
604 chainer.
keep(oneapi::mkl::dft::compute_backward(dplan,
reinterpret_cast<double*
>(
const_cast<std::complex<double>*
>(
indata +
i * cblock_stride)),
outdata +
i * rblock_stride, chainer.
get_events()));
617 template<
typename onemkl_plan_type>
619 plan.set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, (
MKL_LONG) howmanyffts);
620 plan.set_value(oneapi::mkl::dft::config_param::PLACEMENT,
DFTI_NOT_INPLACE);
623 plan.set_value(oneapi::mkl::dft::config_param::FWD_STRIDES, slstride);
624 plan.set_value(oneapi::mkl::dft::config_param::BWD_STRIDES, slstride);
625 plan.set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, (MKL_LONG) rdist);
626 plan.set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, (MKL_LONG) cdist);
629 if (std::is_same<oneapi::mkl::dft::descriptor<oneapi::mkl::dft::precision::SINGLE, oneapi::mkl::dft::domain::REAL>, onemkl_plan_type>::value)
638 int size, howmanyffts, stride, blocks;
639 int rdist, cdist, rblock_stride, cblock_stride, rsize, csize;
640 mutable event_chainer chainer;
641 mutable bool init_splan, init_dplan;
642 mutable oneapi::mkl::dft::descriptor<oneapi::mkl::dft::precision::SINGLE, oneapi::mkl::dft::domain::REAL> splan;
643 mutable oneapi::mkl::dft::descriptor<oneapi::mkl::dft::precision::DOUBLE, oneapi::mkl::dft::domain::REAL> dplan;
687 template<
typename scalar_type,
typename index>
692 template<
typename scalar_type,
typename index>
702template<>
struct transpose_packer<tag::gpu>{
704 template<
typename scalar_type,
typename index>
709 template<
typename scalar_type,
typename index>
711 oapi::transpose_unpack<scalar_type>(stream, plan.size[0], plan.size[1], plan.size[2], plan.line_stride, plan.plane_stride,
712 plan.buff_line_stride, plan.buff_plane_stride, plan.map[0], plan.map[1], plan.map[2],
buffer,
data);
720namespace data_scaling {
724 template<
typename scalar_type,
typename index>
725 static void apply(sycl::queue &stream, index num_entries, scalar_type *data,
double scale_factor){
726 oapi::scale_data(stream,
static_cast<long long>(num_entries), data, scale_factor);
731 template<
typename precision_type,
typename index>
732 static void apply(sycl::queue &stream, index num_entries, std::complex<precision_type> *data,
double scale_factor){
733 apply<precision_type>(stream, 2*num_entries,
reinterpret_cast<precision_type*
>(data), scale_factor);
743 static const bool use_reorder =
false;
751 static const bool use_reorder =
true;
759 static const bool use_reorder =
true;
Helper class to chain a series of compute calls with sycl::event.
Definition heffte_backend_oneapi.h:335
const std::vector< sycl::event > & get_events() const
Getter for the vector of events.
Definition heffte_backend_oneapi.h:344
void keep(sycl::event newEvent)
Replace any stored event with the newEvent.
Definition heffte_backend_oneapi.h:354
void wait()
Wait on the vector of events and then empty it.
Definition heffte_backend_oneapi.h:364
event_chainer()
Constructor.
Definition heffte_backend_oneapi.h:338
Base class for all backend executors.
Definition heffte_common.h:561
virtual int complex_size() const
Return the size of the complex-box (r2c executors).
Definition heffte_common.h:594
virtual void backward(float[], float *) const
Backward r2r, single precision.
Definition heffte_common.h:570
virtual void forward(float[], float *) const
Forward r2r, single precision.
Definition heffte_common.h:566
Wrapper to oneMKL API for real-to-complex transform with shortening of the data.
Definition heffte_backend_oneapi.h:551
void forward(float const indata[], std::complex< float > outdata[], std::complex< float > *) const override
Forward transform, single precision.
Definition heffte_backend_oneapi.h:580
void forward(double const indata[], std::complex< double > outdata[], std::complex< double > *) const override
Forward transform, double precision.
Definition heffte_backend_oneapi.h:594
void backward(std::complex< float > indata[], float outdata[], std::complex< float > *) const override
Backward transform, single precision.
Definition heffte_backend_oneapi.h:587
onemkl_executor_r2c(sycl::queue &inq, box3d< index > const box, int dimension)
Constructor defines the box and the dimension of reduction.
Definition heffte_backend_oneapi.h:563
size_t workspace_size() const override
Return the size of the needed workspace.
Definition heffte_backend_oneapi.h:613
int box_size() const override
Returns the size of the box with real data.
Definition heffte_backend_oneapi.h:609
int complex_size() const override
Returns the size of the box with complex coefficients.
Definition heffte_backend_oneapi.h:611
void backward(std::complex< double > indata[], double outdata[], std::complex< double > *) const override
Backward transform, double precision.
Definition heffte_backend_oneapi.h:601
Wrapper around the oneMKL API.
Definition heffte_backend_oneapi.h:383
int box_size() const override
Returns the size of the box.
Definition heffte_backend_oneapi.h:497
void backward(std::complex< float > indata[], float outdata[], std::complex< float > *workspace) const override
Performs backward float-complex transform and truncates the complex part of the result.
Definition heffte_backend_oneapi.h:481
void backward(std::complex< double > data[], std::complex< double > *) const override
Backward fft, double-complex case.
Definition heffte_backend_oneapi.h:468
onemkl_executor(sycl::queue &inq, box3d< index > const box, int dimension)
Constructor, specifies the box and dimension.
Definition heffte_backend_oneapi.h:393
void forward(std::complex< float > data[], std::complex< float > *) const override
Forward fft, float-complex case.
Definition heffte_backend_oneapi.h:447
void backward(std::complex< double > indata[], double outdata[], std::complex< double > *workspace) const override
Performs backward double-complex transform and truncates the complex part of the result.
Definition heffte_backend_oneapi.h:491
size_t workspace_size() const override
Return the size of the needed workspace.
Definition heffte_backend_oneapi.h:499
onemkl_executor(sycl::queue &inq, box3d< index > const box)
Merges two FFTs into one.
Definition heffte_backend_oneapi.h:437
onemkl_executor(sycl::queue &inq, box3d< index > const box, int dir1, int dir2)
Merges two FFTs into one.
Definition heffte_backend_oneapi.h:408
void forward(double const indata[], std::complex< double > outdata[], std::complex< double > *workspace) const override
Converts the real data to complex and performs double-complex forward transform.
Definition heffte_backend_oneapi.h:486
void forward(float const indata[], std::complex< float > outdata[], std::complex< float > *workspace) const override
Converts the real data to complex and performs float-complex forward transform.
Definition heffte_backend_oneapi.h:476
void backward(std::complex< float > data[], std::complex< float > *) const override
Backward fft, float-complex case.
Definition heffte_backend_oneapi.h:454
void forward(std::complex< double > data[], std::complex< double > *) const override
Forward fft, double-complex case.
Definition heffte_backend_oneapi.h:461
int fft1d_get_howmany(box3d< index > const box, int const dimension)
Return the number of 1-D ffts contained in the box in the given dimension.
Definition heffte_geometry.h:159
int fft1d_get_stride(box3d< index > const box, int const dimension)
Return the stride of the 1-D ffts contained in the box in the given dimension.
Definition heffte_geometry.h:169
@ backward
Inverse DFT transform.
@ forward
Forward DFT transform.
void apply(cudaStream_t stream, index num_entries, scalar_type *data, double scale_factor)
Simply multiply the num_entries in the data by the scale_factor.
Definition heffte_backend_cuda.h:837
void convert(sycl::queue &stream, index num_entries, precision_type const source[], std::complex< precision_type > destination[])
Convert real numbers to complex when both are located on the GPU device.
void direct_unpack(sycl::queue &stream, index nfast, index nmid, index nslow, index line_stride, index plane_stide, scalar_type const source[], scalar_type destination[])
Performs a direct-unpack operation for data sitting on the GPU device.
void scale_data(sycl::queue &stream, index num_entries, scalar_type *data, double scale_factor)
Scales real data (double or float) by the scaling factor.
sycl::queue internal_sycl_queue
Default queue to use in case the user does not provide one.
void direct_pack(sycl::queue &stream, index nfast, index nmid, index nslow, index line_stride, index plane_stide, scalar_type const source[], scalar_type destination[])
Performs a direct-pack operation for data sitting on the GPU device.
void transpose_unpack(sycl::queue &stream, index nfast, index nmid, index nslow, index line_stride, index plane_stide, index buff_line_stride, index buff_plane_stride, int map0, int map1, int map2, scalar_type const source[], scalar_type destination[])
Performs a transpose-unpack operation for data sitting on the GPU device.
sycl::queue make_sycl_queue()
Creates a new SYCL queue, try to use the GPU but if an issue is encountered then default to the CPU.
Definition heffte_backend_oneapi.h:50
Namespace containing all HeFFTe methods and classes.
Definition heffte_backend_cuda.h:38
heffte::gpu::device_vector< T, data_manipulator< tag::gpu > > container
The data is managed by the oneAPI vector container.
Definition heffte_backend_oneapi.h:279
heffte::gpu::device_vector< T, data_manipulator< tag::gpu > > container
The data is managed by the oneAPI vector container.
Definition heffte_backend_oneapi.h:290
heffte::gpu::device_vector< T, data_manipulator< tag::gpu > > container
The data is managed by the oneAPI vector container.
Definition heffte_backend_oneapi.h:301
Defines the container for the temporary buffers.
Definition heffte_common.h:237
Type-tag for the cuFFT backend.
Definition heffte_common.h:162
cudaStream_t stream_type
The stream type for the device.
Definition heffte_backend_cuda.h:232
static void free(sycl::queue &stream, scalar_type *pntr)
Free memory.
Definition heffte_backend_oneapi.h:234
static scalar_type * allocate(sycl::queue &stream, size_t num_entries)
Allocate memory.
Definition heffte_backend_oneapi.h:227
static void copy_n(sycl::queue &stream, scalar_type const source[], size_t num_entries, scalar_type destination[])
Equivalent to std::copy_n() but using CUDA arrays.
Definition heffte_backend_oneapi.h:240
static void copy_n(sycl::queue &stream, scalar_type const source[], size_t num_entries, std::complex< scalar_type > destination[])
Copy-convert real-to-complex.
Definition heffte_backend_oneapi.h:250
static void copy_device_to_host(sycl::queue &stream, scalar_type const source[], size_t num_entries, scalar_type destination[])
Copy the date from the device to the host.
Definition heffte_backend_oneapi.h:255
static void copy_n(sycl::queue &stream, std::complex< scalar_type > const source[], size_t num_entries, scalar_type destination[])
Copy-convert complex-to-real.
Definition heffte_backend_oneapi.h:245
static void copy_device_to_device(sycl::queue &stream, scalar_type const source[], size_t num_entries, scalar_type destination[])
Copy the date from the device to the device.
Definition heffte_backend_oneapi.h:260
static void copy_host_to_device(sycl::queue &stream, scalar_type const source[], size_t num_entries, scalar_type destination[])
Copy the date from the host to the device.
Definition heffte_backend_oneapi.h:265
Common data-transfer operations, must be specializes for each location (cpu/gpu).
Definition heffte_common.h:59
Defines inverse mapping from the location tag to a default backend tag.
Definition heffte_common.h:430
The CUDA backend uses a CUDA stream.
Definition heffte_backend_cuda.h:202
device_instance(sycl::queue &new_stream)
Constructor assigning the queue.
Definition heffte_backend_oneapi.h:193
void synchronize_device() const
Syncs the execution with the queue.
Definition heffte_backend_oneapi.h:201
std::reference_wrapper< sycl::queue > _stream
The sycl::queue, either user provided or created by heFFTe.
Definition heffte_backend_oneapi.h:203
device_instance(std::reference_wrapper< sycl::queue > &new_stream)
Constructor assigning from an existing wrapper.
Definition heffte_backend_oneapi.h:195
sycl::queue & stream()
Returns the nullptr.
Definition heffte_backend_oneapi.h:197
cudaStream_t stream_type
The type for the internal stream.
Definition heffte_backend_cuda.h:214
sycl::queue & stream() const
Returns the nullptr.
Definition heffte_backend_oneapi.h:199
device_instance()
Empty constructor.
Definition heffte_backend_oneapi.h:191
Holds the auxiliary variables needed by each backend.
Definition heffte_common.h:408
Allows to define whether a specific backend interface has been enabled.
Definition heffte_common.h:226
Type-tag for the Cosine Transform using the oneMKL backend.
Definition heffte_common.h:210
Type-tag for the Sine Transform using the oneMKL backend.
Definition heffte_common.h:215
Type-tag for the oneMKL backend.
Definition heffte_common.h:205
Defines a set of default plan options for a given backend.
Definition heffte_common.h:761
void unpack(sycl::queue &stream, pack_plan_3d< index > const &plan, scalar_type const buffer[], scalar_type data[]) const
Execute the planned unpack operation.
Definition heffte_backend_oneapi.h:693
void pack(sycl::queue &stream, pack_plan_3d< index > const &plan, scalar_type const data[], scalar_type buffer[]) const
Execute the planned pack operation.
Definition heffte_backend_oneapi.h:688
Defines the direct packer without implementation, use the specializations to get the CPU or GPU imple...
Definition heffte_pack3d.h:83
Implementation of Cosine Transform pre-post processing methods using CUDA.
Definition heffte_backend_oneapi.h:124
static void pre_backward(sycl::queue &, int length, precision const input[], std::complex< precision > fft_signal[])
Pre-process in the inverse transform.
static void post_forward(sycl::queue &, int length, std::complex< precision > const fft_result[], precision result[])
Post-process in the forward transform.
static void pre_forward(sycl::queue &, int length, precision const input[], precision fft_signal[])
Pre-process in the forward transform.
static int compute_extended_length(int length)
Computes the length of the extended signal.
Definition heffte_backend_oneapi.h:138
static void post_backward(sycl::queue &, int length, precision const fft_result[], precision result[])
Post-process in the inverse transform.
Implementation of Cosine Transform pre-post processing methods using CUDA.
Definition heffte_backend_oneapi.h:146
static void pre_backward(sycl::queue &, int length, precision const input[], std::complex< precision > fft_signal[])
Pre-process in the inverse transform.
static int compute_extended_length(int length)
Computes the length of the extended signal.
Definition heffte_backend_oneapi.h:160
static void post_backward(sycl::queue &, int length, precision const fft_result[], precision result[])
Post-process in the inverse transform.
static void pre_forward(sycl::queue &, int length, precision const input[], precision fft_signal[])
Pre-process in the forward transform.
static void post_forward(sycl::queue &, int length, std::complex< precision > const fft_result[], precision result[])
Post-process in the forward transform.
Indicates the structure that will be used by the fft backend.
Definition heffte_common.h:663
Wrapper around cufftHandle plans, set for float or double complex.
Definition heffte_backend_cuda.h:346
Indicates the use of gpu backend and that all input/output data and arrays will be bound to the gpu d...
Definition heffte_common.h:45
void pack(sycl::queue &stream, pack_plan_3d< index > const &plan, scalar_type const data[], scalar_type buffer[]) const
Execute the planned pack operation.
Definition heffte_backend_oneapi.h:705
void unpack(sycl::queue &stream, pack_plan_3d< index > const &plan, scalar_type const buffer[], scalar_type data[]) const
Execute the planned transpose-unpack operation.
Definition heffte_backend_oneapi.h:710