Eigen-unsupported  3.4.90 (git rev 67eeba6e720c5745abc77ae6c92ce0a44aa7b7ae)
TensorConvolutionSycl.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli    Codeplay Software Ltd.
// Ralph Potter  Codeplay Software Ltd.
// Luke Iwanski  Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H

#include "./InternalHeaderCheck.h"

namespace Eigen {
enum class convolution_type { CONV1D, CONV2D, CONV3D };

template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
          typename Kernel_accessor, typename Buffer_accessor, convolution_type Conv_Dim>
struct EigenConvolutionKernel;

template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
          typename Kernel_accessor, typename Buffer_accessor>
struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
                              Buffer_accessor, convolution_type::CONV1D> {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      Local_accessor;
  Local_accessor local_acc;
  Evaluator device_evaluator;
  Kernel_accessor kernel_filter;
  Buffer_accessor buffer_acc;
  internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper;
  const size_t kernelSize;
  const cl::sycl::range<2> input_range;
  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
                         Buffer_accessor buffer_acc_,
                         internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper_,
                         const size_t kernelSize_, const cl::sycl::range<2> input_range_)
      : local_acc(local_acc_),
        device_evaluator(device_evaluator_),
        kernel_filter(kernel_filter_),
        buffer_acc(buffer_acc_),
        indexMapper(indexMapper_),
        kernelSize(kernelSize_),
        input_range(input_range_) {}

  template <typename BooleanDim2>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) {
    return (boolean_check[0] && boolean_check[1]);
  }
  void operator()(cl::sycl::nd_item<2> itemID) {
    auto buffer_ptr = buffer_acc.get_pointer();
    auto kernel_ptr = kernel_filter.get_pointer();
    // number of input elements that must be staged in shared (local) memory for each plane
    const size_t num_input = (itemID.get_local_range()[0] + kernelSize - 1);
    const size_t plane_kernel_offset = itemID.get_local_id(1) * num_input;
    const size_t input_offset = itemID.get_group(0) * itemID.get_local_range()[0];
    const size_t plane_tensor_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(1));
    for (size_t i = itemID.get_local_id(0); i < num_input; i += itemID.get_local_range()[0]) {
      const size_t local_index = i + plane_kernel_offset;
      const size_t tensor_index =
          plane_tensor_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + input_offset);

      local_acc[local_index] =
          (((i + input_offset) < (input_range[0] + kernelSize - 1)) && itemID.get_global_id(1) < input_range[1])
              ? device_evaluator.coeff(tensor_index)
              : CoeffReturnType(0);
    }

    itemID.barrier(cl::sycl::access::fence_space::local_space);

    // calculate the convolution; output start in x
    const size_t first_output_start = itemID.get_group(0) * (itemID.get_local_range()[0]);
    if (boundary_check(itemID.get_global_id() < input_range)) {
      CoeffReturnType result = static_cast<CoeffReturnType>(0);
      const size_t index = plane_kernel_offset + itemID.get_local_id(0);
      for (size_t k = 0; k < kernelSize; ++k) {
        result += (local_acc[k + index] * kernel_ptr[k]);
      }
      const size_t tensor_index =
          indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(1)) +
          indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + first_output_start);
      buffer_ptr[tensor_index] = result;
    }
  }
};
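
// Editorial sketch (not part of the upstream file): the CONV1D kernel above assigns one plane to
// each work-item along dimension 1 and stages local_range[0] + kernelSize - 1 inputs per plane in
// the local accessor before the barrier. The helper below only illustrates the resulting
// local-memory footprint and mirrors the local_memory_size expression used in executeEval()
// further down; the name and the example numbers are assumptions made for illustration.
constexpr size_t sycl_conv1d_local_mem_elements(size_t local_range0, size_t local_range1, size_t kernel_size) {
  // One row of (local_range0 + kernel_size - 1) staged inputs per plane handled by the work-group,
  // e.g. local_range = {64, 4} and kernel_size = 5 give (64 + 5 - 1) * 4 = 272 elements.
  return (local_range0 + kernel_size - 1) * local_range1;
}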

template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
          typename Kernel_accessor, typename Buffer_accessor>
struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
                              Buffer_accessor, convolution_type::CONV2D> {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      Local_accessor;
  Local_accessor local_acc;
  Evaluator device_evaluator;
  Kernel_accessor kernel_filter;
  Buffer_accessor buffer_acc;
  internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper;
  const cl::sycl::range<2> kernel_size;
  const cl::sycl::range<3> input_range;
  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
                         Buffer_accessor buffer_acc_,
                         internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper_,
                         const cl::sycl::range<2> kernel_size_, const cl::sycl::range<3> input_range_)
      : local_acc(local_acc_),
        device_evaluator(device_evaluator_),
        kernel_filter(kernel_filter_),
        buffer_acc(buffer_acc_),
        indexMapper(indexMapper_),
        kernel_size(kernel_size_),
        input_range(input_range_) {}
  template <typename BooleanDim3>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
    return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
  }
  void operator()(cl::sycl::nd_item<3> itemID) {
    auto buffer_ptr = buffer_acc.get_pointer();
    auto kernel_ptr = kernel_filter.get_pointer();
    // number of input elements that must be staged in shared (local) memory for each plane
    const auto num_input = cl::sycl::range<2>{
        (cl::sycl::range<2>(itemID.get_local_range()[0], itemID.get_local_range()[1]) + kernel_size - 1)};

    const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(2));
    const size_t plane_kernel_offset = itemID.get_local_id(2) * num_input[1];

    const auto input_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
                                                 itemID.get_group(1) * itemID.get_local_range()[1]};

    // fill the local memory
    bool in_range_dim2 = itemID.get_global_id(2) < input_range[2];
    for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
      const size_t local_input_offset = num_input[0] * (j + plane_kernel_offset);
      bool in_range_dim1 = ((j + input_offset[1]) < (input_range[1] + kernel_size[1] - 1));
      for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
        const size_t local_index = i + local_input_offset;
        const size_t tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
                                                             i + input_offset[0], j + input_offset[1]);
        local_acc[local_index] =
            (((i + input_offset[0]) < (input_range[0] + kernel_size[0] - 1)) && in_range_dim1 && in_range_dim2)
                ? device_evaluator.coeff(tensor_index)
                : CoeffReturnType(0);
      }
    }

    itemID.barrier(cl::sycl::access::fence_space::local_space);

    // output offset start for each thread
    const auto output_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
                                                  itemID.get_group(1) * itemID.get_local_range()[1]};

    if (boundary_check(itemID.get_global_id() < input_range)) {
      CoeffReturnType result = static_cast<CoeffReturnType>(0);

      for (size_t j = 0; j < kernel_size[1]; j++) {
        size_t kernel_offset = kernel_size[0] * j;
        const size_t index =
            (num_input[0] * (plane_kernel_offset + j + itemID.get_local_id(1))) + itemID.get_local_id(0);
        for (size_t i = 0; i < kernel_size[0]; i++) {
          result += (local_acc[i + index] * kernel_ptr[i + kernel_offset]);
        }
      }
      const size_t tensor_index =
          indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(2)) +
          indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + output_offset[0],
                                                             itemID.get_local_id(1) + output_offset[1]);

      buffer_ptr[tensor_index] = result;
    }
  }
};
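
// Editorial sketch (not part of the upstream file): the CONV2D kernel stages a
// (local_range[0] + kernel_size[0] - 1) x (local_range[1] + kernel_size[1] - 1) tile per plane,
// with local_range[2] planes resident per work-group. The helper below mirrors the
// local_memory_size expression computed in executeEval() for the 2D case; the name and the
// example numbers are illustrative assumptions.
constexpr size_t sycl_conv2d_local_mem_elements(size_t local_range0, size_t local_range1, size_t local_range2,
                                                size_t kernel_size0, size_t kernel_size1) {
  // e.g. local_range = {16, 16, 1} and a 3x3 kernel give (16 + 2) * (16 + 2) * 1 = 324 elements.
  return (local_range0 + kernel_size0 - 1) * (local_range1 + kernel_size1 - 1) * local_range2;
}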

template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
          typename Kernel_accessor, typename Buffer_accessor>
struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
                              Buffer_accessor, convolution_type::CONV3D> {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      Local_accessor;
  Local_accessor local_acc;
  Evaluator device_evaluator;
  Kernel_accessor kernel_filter;
  Buffer_accessor buffer_acc;
  internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper;
  const cl::sycl::range<3> kernel_size;
  const cl::sycl::range<3> input_range;
  const size_t numP;

  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
                         Buffer_accessor buffer_acc_,
                         internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper_,
                         const cl::sycl::range<3> kernel_size_, const cl::sycl::range<3> input_range_,
                         const size_t numP_)
      : local_acc(local_acc_),
        device_evaluator(device_evaluator_),
        kernel_filter(kernel_filter_),
        buffer_acc(buffer_acc_),
        indexMapper(indexMapper_),
        kernel_size(kernel_size_),
        input_range(input_range_),
        numP(numP_) {}
  template <typename BooleanDim3>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
    return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
  }
  void operator()(cl::sycl::nd_item<3> itemID) {
    auto buffer_ptr = buffer_acc.get_pointer();
    auto kernel_ptr = kernel_filter.get_pointer();
    const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1};

    const auto input_offset = cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range()};

    const auto output_offset =
        cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range() + itemID.get_local_id()};

    for (size_t p = 0; p < numP; p++) {
      const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
      for (size_t k = itemID.get_local_id(2); k < num_input[2]; k += itemID.get_local_range()[2]) {
        size_t local_index_dim2 = num_input[0] * num_input[1] * k;
        bool cond_k_dim = (k + input_offset[2] < (input_range[2] + kernel_size[2] - 1));
        for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
          bool cond_j_dim = cond_k_dim && (j + input_offset[1] < (input_range[1] + kernel_size[1] - 1));
          size_t local_index_dim1 = (num_input[0] * j) + local_index_dim2;
          for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
            bool conds = cond_j_dim && (i + input_offset[0] < (input_range[0] + kernel_size[0] - 1));
            const size_t local_index = local_index_dim1 + i;
            const size_t tensor_index =
                plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
                                         i + input_offset[0], j + input_offset[1], k + input_offset[2]);
            local_acc[local_index] = conds ? device_evaluator.coeff(tensor_index) : CoeffReturnType(0);
          }
        }
      }
      itemID.barrier(cl::sycl::access::fence_space::local_space);

      // calculate the convolution

      if (boundary_check(itemID.get_global_id() < input_range)) {
        CoeffReturnType result = static_cast<CoeffReturnType>(0);
        for (size_t k = 0; k < kernel_size[2]; k++) {
          for (size_t j = 0; j < kernel_size[1]; j++) {
            for (size_t i = 0; i < kernel_size[0]; i++) {
              const size_t kernel_index = i + kernel_size[0] * (j + kernel_size[1] * k);
              const size_t local_index =
                  ((i + itemID.get_local_id(0)) +
                   num_input[0] * ((j + itemID.get_local_id(1)) + num_input[1] * (k + itemID.get_local_id(2))));

              result += (local_acc[local_index] * kernel_ptr[kernel_index]);
            }
          }
        }
        const size_t tensor_index =
            indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p) +
            indexMapper.mapGpuOutputKernelToTensorOutputOffset(output_offset[0], output_offset[1], output_offset[2]);
        buffer_ptr[tensor_index] = result;
      }

      itemID.barrier(cl::sycl::access::fence_space::local_space);
    }
  }
};
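
// Editorial note (not part of the upstream file): unlike the 1D and 2D kernels, the CONV3D kernel
// above uses all three nd_range dimensions for the spatial extents, so it cannot map the plane
// dimension onto the launch grid. Each work-group therefore walks the numP planes sequentially,
// with one barrier after filling local memory and a second one before the next plane reuses it.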

template <typename Indices, typename InputArgType, typename KernelArgType>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Eigen::SyclDevice> {
  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;

  static constexpr int NumDims =
      internal::array_size<typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions>::value;
  static constexpr int NumKernelDims = internal::array_size<Indices>::value;
  typedef typename XprType::Index Index;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions KernelDimensions;
  typedef const Eigen::SyclDevice Device;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Eigen::SyclDevice>::type PacketReturnType;
  typedef typename InputArgType::Scalar Scalar;
  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
  typedef StorageMemory<CoeffReturnType, Eigen::SyclDevice> Storage;
  typedef typename Storage::Type EvaluatorPointerType;
  typedef StorageMemory<const CoeffReturnType, Eigen::SyclDevice> KernelStorage;

  static constexpr int Layout = TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout;
  enum {
    IsAligned = TensorEvaluator<InputArgType, Eigen::SyclDevice>::IsAligned &
                TensorEvaluator<KernelArgType, Eigen::SyclDevice>::IsAligned,
    PacketAccess = false,
    BlockAccess = false,
    PreferBlockAccess = false,
    CoordAccess = false,  // to be implemented
    RawAccess = false
  };

  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
  typedef internal::TensorBlockNotImplemented TensorBlock;
  //===--------------------------------------------------------------------===//

  TensorEvaluator(const XprType &op, const Eigen::SyclDevice &device)
      : m_inputImpl(op.inputExpression(), device),
        m_kernelArg(op.kernelExpression()),
        m_kernelImpl(op.kernelExpression(), device),
        m_indices(op.indices()),
        m_buf(NULL),
        m_kernel(NULL),
        m_local_kernel(false),
        m_device(device) {
    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout) ==
                         static_cast<int>(TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Layout)),
                        YOU_MADE_A_PROGRAMMING_MISTAKE);

    const typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions &input_dims = m_inputImpl.dimensions();
    const typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions &kernel_dims =
        m_kernelImpl.dimensions();

    m_dimensions = m_inputImpl.dimensions();
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = op.indices()[i];
      const Index input_dim = input_dims[index];
      const Index kernel_dim = kernel_dims[i];
      const Index result_dim = input_dim - kernel_dim + 1;
      m_dimensions[index] = result_dim;
    }
  }
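
  // Editorial note (not part of the upstream file): the constructor above computes the usual
  // "valid" convolution size, result_dim = input_dim - kernel_dim + 1; for example an input
  // extent of 10 convolved with a kernel extent of 3 yields 10 - 3 + 1 = 8 along that dimension.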

  EIGEN_DEVICE_FUNC const Dimensions &dimensions() const { return m_dimensions; }

  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
    preloadKernel();
    m_inputImpl.evalSubExprsIfNeeded(NULL);
    if (data) {
      executeEval(data);
      return false;
    } else {
      m_buf = (EvaluatorPointerType)m_device.get(
          (Scalar *)m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar)));
      executeEval(m_buf);
      return true;
    }
  }

  EIGEN_STRONG_INLINE void cleanup() {
    m_inputImpl.cleanup();
    if (m_buf) {
      m_device.deallocate_temp(m_buf);
      m_buf = NULL;
    }
    if (m_local_kernel) {
      m_device.deallocate_temp(m_kernel);
      m_local_kernel = false;
    }
    m_kernel = NULL;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device &device() const { return m_device; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_buf; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
    // Don't make a local copy of the kernel unless we have to (i.e. it's an
    // expression that needs to be evaluated)
    typename KernelStorage::Type in_place = m_kernelImpl.data();
    if (in_place) {
      m_kernel = in_place;
      m_local_kernel = false;
    } else {
      ptrdiff_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
      EvaluatorPointerType local = (EvaluatorPointerType)m_device.get((Scalar *)m_device.allocate_temp(kernel_sz));
      typedef TensorEvalToOp<const KernelArgType> EvalTo;
      EvalTo evalToTmp(m_device.get(local), m_kernelArg);
      const bool PacketAccess = internal::IsVectorizable<Eigen::SyclDevice, KernelArgType>::value;
      internal::TensorExecutor<const EvalTo, Eigen::SyclDevice, PacketAccess>::run(evalToTmp, m_device);
      m_kernel = local;
      m_local_kernel = true;
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(EvaluatorPointerType data) const {
    typedef TensorEvaluator<InputArgType, Eigen::SyclDevice> InputEvaluator;
    typedef typename InputEvaluator::Dimensions InputDims;
    switch (NumKernelDims) {
      case 1: {
        const size_t numX = dimensions()[m_indices[0]];
        const size_t numP = dimensions().TotalSize() / numX;
        const auto input_dim = std::array<size_t, 2>{numX, numP};
        auto global_range = cl::sycl::range<2>{};
        auto local_range = cl::sycl::range<2>{};
        const size_t kernel_size = m_kernelImpl.dimensions().TotalSize();

        m_device.parallel_for_setup(input_dim, global_range, local_range);
        const size_t local_memory_size = (local_range[0] + kernel_size - 1) * (local_range[1]);
        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
        const array<Index, 1> indices{{m_indices[0]}};
        const array<Index, 1> kernel_dims{{m_kernelImpl.dimensions()[0]}};
        internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);

        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV1D>
            ConvKernel;

        m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
            m_inputImpl, m_kernel, data, cl::sycl::nd_range<2>(global_range, local_range), local_memory_size,
            indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1]));
        break;
      }

      case 2: {
        auto kernel_index = std::array<size_t, 2>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1,
                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0};
        auto kernel_size = cl::sycl::range<2>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
                                              (size_t)m_kernelImpl.dimensions()[kernel_index[1]]};
        const size_t numX = dimensions()[m_indices[kernel_index[0]]];
        const size_t numY = dimensions()[m_indices[kernel_index[1]]];
        const size_t numP = dimensions().TotalSize() / (numX * numY);
        auto input_dim = std::array<size_t, 3>{numX, numY, numP};

        auto global_range = cl::sycl::range<3>{};
        auto local_range = cl::sycl::range<3>{};

        m_device.parallel_for_setup(input_dim, global_range, local_range);

        const size_t local_memory_size =
            (local_range[0] + kernel_size[0] - 1) * (local_range[1] + kernel_size[1] - 1) * local_range[2];
        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
        const array<Index, 2> indices{{m_indices[kernel_index[0]], m_indices[kernel_index[1]]}};
        const array<Index, 2> kernel_dims{
            {m_kernelImpl.dimensions()[kernel_index[0]], m_kernelImpl.dimensions()[kernel_index[1]]}};
        internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV2D>
            ConvKernel;
        m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
            m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
            indexMapper, kernel_size, cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]});
        break;
      }

      case 3: {
        auto kernel_index = std::array<size_t, 3>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2,
                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1,
                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0};

        auto kernel_size = cl::sycl::range<3>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
                                              (size_t)m_kernelImpl.dimensions()[kernel_index[1]],
                                              (size_t)m_kernelImpl.dimensions()[kernel_index[2]]};

        const size_t numX = dimensions()[m_indices[kernel_index[0]]];
        const size_t numY = dimensions()[m_indices[kernel_index[1]]];
        const size_t numZ = dimensions()[m_indices[kernel_index[2]]];
        auto input_dim = std::array<size_t, 3>{numX, numY, numZ};
        const size_t numP = dimensions().TotalSize() / (numX * numY * numZ);

        const array<Index, 3> indices{
            {m_indices[kernel_index[0]], m_indices[kernel_index[1]], m_indices[kernel_index[2]]}};
        const array<Index, 3> kernel_dims{{m_kernelImpl.dimensions()[kernel_index[0]],
                                           m_kernelImpl.dimensions()[kernel_index[1]],
                                           m_kernelImpl.dimensions()[kernel_index[2]]}};

        internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);

        auto global_range = cl::sycl::range<3>{};
        auto local_range = cl::sycl::range<3>{};

        m_device.parallel_for_setup(input_dim, global_range, local_range);
        auto local_memory_range = (local_range + kernel_size - 1);
        const size_t local_memory_size = local_memory_range[0] * local_memory_range[1] * local_memory_range[2];

        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV3D>
            ConvKernel;
        m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
            m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
            indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP);
        break;
      }

      default: {
        EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3),
                            THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
      }
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
    eigen_assert(m_buf != NULL);
    eigen_assert(index < m_dimensions.TotalSize());
    return m_buf[index];
  }

  template <int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const {
    eigen_assert(m_buf != NULL);
    eigen_assert(index < m_dimensions.TotalSize());
    return internal::ploadt<PacketReturnType, LoadMode>(m_buf + index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
    // model.
    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
    // We ignore the use of fused multiply-add.
    const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
    const double firstIndex_compute_cost =
        NumDims *
        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
           kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
                          TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
  }

  // binding placeholder accessors to a command group handler for SYCL
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
    m_kernelImpl.bind(cgh);
    m_inputImpl.bind(cgh);
    m_buf.bind(cgh);
    m_kernel.bind(cgh);
  }

 private:
  // No assignment (copies are needed by the kernels)
  TensorEvaluator &operator=(const TensorEvaluator &);
  TensorEvaluator<InputArgType, Eigen::SyclDevice> m_inputImpl;
  KernelArgType m_kernelArg;
  TensorEvaluator<KernelArgType, Eigen::SyclDevice> m_kernelImpl;
  Indices m_indices;
  Dimensions m_dimensions;
  EvaluatorPointerType m_buf;
  typename KernelStorage::Type m_kernel;
  bool m_local_kernel;
  const Eigen::SyclDevice EIGEN_DEVICE_REF m_device;
};  // end TensorEvaluator specialization for Eigen::SyclDevice

}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
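
// Illustrative usage (editorial sketch, not part of the upstream file): the evaluator above is
// selected when a tensor convolution expression is assigned on a SYCL device. The snippet below
// follows the pattern used by Eigen's SYCL tensor tests; the device selector, buffer sizes and
// variable names are assumptions made for illustration.
//
//   Eigen::QueueInterface queue_interface(cl::sycl::default_selector_v);
//   Eigen::SyclDevice sycl_device(&queue_interface);
//
//   float *d_in  = static_cast<float *>(sycl_device.allocate(128 * 128 * sizeof(float)));
//   float *d_ker = static_cast<float *>(sycl_device.allocate(3 * 3 * sizeof(float)));
//   float *d_out = static_cast<float *>(sycl_device.allocate(126 * 126 * sizeof(float)));
//
//   Eigen::TensorMap<Eigen::Tensor<float, 2>> in(d_in, 128, 128), ker(d_ker, 3, 3), out(d_out, 126, 126);
//   Eigen::array<Eigen::Index, 2> dims{{0, 1}};
//   out.device(sycl_device) = in.convolve(ker, dims);  // dispatches to the CONV2D kernel above
//
//   sycl_device.deallocate(d_in);
//   sycl_device.deallocate(d_ker);
//   sycl_device.deallocate(d_out);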