Eigen-unsupported  3.4.90 (git rev 67eeba6e720c5745abc77ae6c92ce0a44aa7b7ae)
TensorConvolution.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
11 #define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
12 
13 #include "./InternalHeaderCheck.h"
14 
15 namespace Eigen {
16 
24 namespace internal {
25 
26 template <typename Index, typename InputDims, int NumKernelDims, int Layout>
27 class IndexMapper {
28  public:
29  IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,
30  const array<Index, NumKernelDims>& indices) {
31 
32  array<Index, NumDims> dimensions = input_dims;
33  for (int i = 0; i < NumKernelDims; ++i) {
34  const Index index = indices[i];
35  const Index input_dim = input_dims[index];
36  const Index kernel_dim = kernel_dims[i];
37  const Index result_dim = input_dim - kernel_dim + 1;
38  dimensions[index] = result_dim;
39  }
40 
41  array<Index, NumDims> inputStrides;
42  array<Index, NumDims> outputStrides;
43  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
44  inputStrides[0] = 1;
45  outputStrides[0] = 1;
46  for (int i = 1; i < NumDims; ++i) {
47  inputStrides[i] = inputStrides[i-1] * input_dims[i-1];
48  outputStrides[i] = outputStrides[i-1] * dimensions[i-1];
49  }
50  } else {
51  inputStrides[NumDims - 1] = 1;
52  outputStrides[NumDims - 1] = 1;
53  for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) {
54  inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
55  outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1];
56  }
57  }
58 
59  array<Index, NumDims> gpuInputDimensions;
60  array<Index, NumDims> gpuOutputDimensions;
61  array<Index, NumDims> tmp = dimensions;
62  array<Index, NumDims> ordering;
63  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
64  ? 0
65  : NumDims - NumKernelDims;
66  for (int i = 0; i < NumKernelDims; ++i) {
67  const Index index = i + offset;
68  ordering[index] = indices[i];
69  tmp[indices[i]] = -1;
70  gpuInputDimensions[index] = input_dims[indices[i]];
71  gpuOutputDimensions[index] = dimensions[indices[i]];
72  }
73 
74  int written = static_cast<int>(Layout) == static_cast<int>(ColMajor)
75  ? NumKernelDims
76  : 0;
77  for (int i = 0; i < NumDims; ++i) {
78  if (tmp[i] >= 0) {
79  ordering[written] = i;
80  gpuInputDimensions[written] = input_dims[i];
81  gpuOutputDimensions[written] = dimensions[i];
82  ++written;
83  }
84  }
85 
86  for (int i = 0; i < NumDims; ++i) {
87  m_inputStrides[i] = inputStrides[ordering[i]];
88  m_outputStrides[i] = outputStrides[ordering[i]];
89  }
90 
91  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
92  for (int i = 0; i < NumDims; ++i) {
93  if (i > NumKernelDims) {
94  m_gpuInputStrides[i] =
95  m_gpuInputStrides[i - 1] * gpuInputDimensions[i - 1];
96  m_gpuOutputStrides[i] =
97  m_gpuOutputStrides[i - 1] * gpuOutputDimensions[i - 1];
98  } else {
99  m_gpuInputStrides[i] = 1;
100  m_gpuOutputStrides[i] = 1;
101  }
102  }
103  } else {
104  for (int i = NumDims - 1; i >= 0; --i) {
105  if (static_cast<size_t>(i + 1) < offset) {
106  m_gpuInputStrides[i] =
107  m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1];
108  m_gpuOutputStrides[i] =
109  m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1];
110  } else {
111  m_gpuInputStrides[i] = 1;
112  m_gpuOutputStrides[i] = 1;
113  }
114  }
115  }
116  }
117 
118  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputPlaneToTensorInputOffset(Index p) const {
119  Index inputIndex = 0;
120  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
121  for (int d = NumDims - 1; d > NumKernelDims; --d) {
122  const Index idx = p / m_gpuInputStrides[d];
123  inputIndex += idx * m_inputStrides[d];
124  p -= idx * m_gpuInputStrides[d];
125  }
126  inputIndex += p * m_inputStrides[NumKernelDims];
127  } else {
128  std::ptrdiff_t limit = 0;
129  if (NumKernelDims < NumDims) {
130  limit = NumDims - NumKernelDims - 1;
131  }
132  for (int d = 0; d < limit; ++d) {
133  const Index idx = p / m_gpuInputStrides[d];
134  inputIndex += idx * m_inputStrides[d];
135  p -= idx * m_gpuInputStrides[d];
136  }
137  inputIndex += p * m_inputStrides[limit];
138  }
139  return inputIndex;
140  }
141 
142  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputPlaneToTensorOutputOffset(Index p) const {
143  Index outputIndex = 0;
144  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
145  for (int d = NumDims - 1; d > NumKernelDims; --d) {
146  const Index idx = p / m_gpuOutputStrides[d];
147  outputIndex += idx * m_outputStrides[d];
148  p -= idx * m_gpuOutputStrides[d];
149  }
150  outputIndex += p * m_outputStrides[NumKernelDims];
151  } else {
152  std::ptrdiff_t limit = 0;
153  if (NumKernelDims < NumDims) {
154  limit = NumDims - NumKernelDims - 1;
155  }
156  for (int d = 0; d < limit; ++d) {
157  const Index idx = p / m_gpuOutputStrides[d];
158  outputIndex += idx * m_outputStrides[d];
159  p -= idx * m_gpuOutputStrides[d];
160  }
161  outputIndex += p * m_outputStrides[limit];
162  }
163  return outputIndex;
164  }
165 
166  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i) const {
167  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
168  ? 0
169  : NumDims - NumKernelDims;
170  return i * m_inputStrides[offset];
171  }
172 
173  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i) const {
174  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
175  ? 0
176  : NumDims - NumKernelDims;
177  return i * m_outputStrides[offset];
178  }
179 
180  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j) const {
181  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
182  ? 0
183  : NumDims - NumKernelDims;
184  return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
185  }
186 
187  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j) const {
188  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
189  ? 0
190  : NumDims - NumKernelDims;
191  return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
192  }
193 
194  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
195  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
196  ? 0
197  : NumDims - NumKernelDims;
198  return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] +
199  k * m_inputStrides[offset + 2];
200  }
201 
202  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
203  const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
204  ? 0
205  : NumDims - NumKernelDims;
206  return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] +
207  k * m_outputStrides[offset + 2];
208  }
209 
210  private:
211  static constexpr int NumDims = internal::array_size<InputDims>::value;
212  array<Index, NumDims> m_inputStrides;
213  array<Index, NumDims> m_outputStrides;
214  array<Index, NumDims> m_gpuInputStrides;
215  array<Index, NumDims> m_gpuOutputStrides;
216 };
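// [Editor's note: illustrative sketch, not part of the upstream Eigen source.]
// IndexMapper separates the NumKernelDims convolved dimensions from the
// remaining "plane" dimensions and converts GPU-side (plane index, kernel-dim
// offset) pairs back into linear offsets into the original input and output
// tensors. A minimal ColMajor example, assuming a rank-3 input of size 5x7x3
// convolved along dimension 0 with a kernel of length 2 (so the output is 4x7x3):
//
//   Eigen::array<Eigen::Index, 3> in_dims{{5, 7, 3}};
//   Eigen::array<Eigen::Index, 1> k_dims{{2}};
//   Eigen::array<Eigen::Index, 1> conv_dims{{0}};
//   Eigen::internal::IndexMapper<Eigen::Index, Eigen::array<Eigen::Index, 3>,
//                                1, Eigen::ColMajor>
//       mapper(in_dims, k_dims, conv_dims);
//   // p enumerates the 7*3 = 21 (y,z) planes; i indexes the convolved dimension.
//   Eigen::Index p = 10, i = 3;
//   Eigen::Index in_off  = mapper.mapGpuInputPlaneToTensorInputOffset(p) +
//                          mapper.mapGpuInputKernelToTensorInputOffset(i);
//   Eigen::Index out_off = mapper.mapGpuOutputPlaneToTensorOutputOffset(p) +
//                          mapper.mapGpuOutputKernelToTensorOutputOffset(i);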
217 
218 
219 
220 template<typename Dimensions, typename InputXprType, typename KernelXprType>
221 struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >
222 {
223  // Type promotion to handle the case where the types of the lhs and the rhs are different.
224  typedef typename promote_storage_type<typename InputXprType::Scalar,
225  typename KernelXprType::Scalar>::ret Scalar;
226  typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind,
227  typename traits<KernelXprType>::StorageKind>::ret StorageKind;
228  typedef typename promote_index_type<typename traits<InputXprType>::Index,
229  typename traits<KernelXprType>::Index>::type Index;
230  typedef typename InputXprType::Nested LhsNested;
231  typedef typename KernelXprType::Nested RhsNested;
232  typedef std::remove_reference_t<LhsNested> LhsNested_;
233  typedef std::remove_reference_t<RhsNested> RhsNested_;
234  static constexpr int NumDimensions = traits<InputXprType>::NumDimensions;
235  static constexpr int Layout = traits<InputXprType>::Layout;
236  typedef std::conditional_t<Pointer_type_promotion<typename InputXprType::Scalar, Scalar>::val,
237  typename traits<InputXprType>::PointerType, typename traits<KernelXprType>::PointerType> PointerType;
238 
239  enum {
240  Flags = 0
241  };
242 };
243 
244 template<typename Dimensions, typename InputXprType, typename KernelXprType>
245 struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense>
246 {
247  typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type;
248 };
249 
250 template<typename Dimensions, typename InputXprType, typename KernelXprType>
251 struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1, typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type>
252 {
253  typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type;
254 };
255 
256 } // end namespace internal
257 
258 
259 
260 template<typename Indices, typename InputXprType, typename KernelXprType>
261 class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors>
262 {
263  public:
264  typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar;
265  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
266  typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType,
267  typename KernelXprType::CoeffReturnType>::ret CoeffReturnType;
268  typedef typename Eigen::internal::nested<TensorConvolutionOp>::type Nested;
269  typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind;
270  typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index;
271 
272  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims)
273  : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}
274 
275  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
276  const Indices& indices() const { return m_indices; }
277 
279  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
280  const internal::remove_all_t<typename InputXprType::Nested>&
281  inputExpression() const { return m_input_xpr; }
282 
283  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
284  const internal::remove_all_t<typename KernelXprType::Nested>&
285  kernelExpression() const { return m_kernel_xpr; }
286 
287  protected:
288  typename InputXprType::Nested m_input_xpr;
289  typename KernelXprType::Nested m_kernel_xpr;
290  const Indices m_indices;
291 };
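// [Editor's note: illustrative sketch, not part of the upstream Eigen source.]
// TensorConvolutionOp is normally created through TensorBase::convolve()
// rather than constructed directly. A minimal usage example, assuming
// <unsupported/Eigen/CXX11/Tensor> has been included:
//
//   Eigen::Tensor<float, 3> input(20, 30, 50);
//   Eigen::Tensor<float, 2> kernel(3, 3);
//   input.setRandom();
//   kernel.setRandom();
//   Eigen::array<Eigen::Index, 2> dims{{0, 1}};  // convolve dimensions 0 and 1
//   Eigen::Tensor<float, 3> output = input.convolve(kernel, dims);
//   // output has dimensions (20-3+1, 30-3+1, 50) = (18, 28, 50).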
292 
293 
294 template<typename Indices, typename InputArgType, typename KernelArgType, typename Device>
295 struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
296 {
297  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
298 
299  static constexpr int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value;
300  static constexpr int NumKernelDims = internal::array_size<Indices>::value;
301  typedef typename XprType::Index Index;
302  typedef DSizes<Index, NumDims> Dimensions;
303 
304  typedef typename XprType::Scalar Scalar;
305  typedef typename XprType::CoeffReturnType CoeffReturnType;
306  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
307  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
308  typedef StorageMemory<Scalar, Device> Storage;
309  typedef typename Storage::Type EvaluatorPointerType;
310 
311  static constexpr int Layout = TensorEvaluator<InputArgType, Device>::Layout;
312  enum {
313  IsAligned = int(TensorEvaluator<InputArgType, Device>::IsAligned) & int(TensorEvaluator<KernelArgType, Device>::IsAligned),
314  PacketAccess = int(TensorEvaluator<InputArgType, Device>::PacketAccess) & int(TensorEvaluator<KernelArgType, Device>::PacketAccess),
315  BlockAccess = false,
316  PreferBlockAccess = false,
317  CoordAccess = false, // to be implemented
318  RawAccess = false
319  };
320 
321  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
322  typedef internal::TensorBlockNotImplemented TensorBlock;
323  //===--------------------------------------------------------------------===//
324 
325  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
326  : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
327  {
328  EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
329 
330  const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
331  const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
332 
333  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
334  m_inputStride[0] = 1;
335  for (int i = 1; i < NumDims; ++i) {
336  m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];
337  }
338  } else {
339  m_inputStride[NumDims - 1] = 1;
340  for (int i = NumDims - 2; i >= 0; --i) {
341  m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];
342  }
343  }
344 
345  m_dimensions = m_inputImpl.dimensions();
346  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
347  for (int i = 0; i < NumKernelDims; ++i) {
348  const Index index = op.indices()[i];
349  const Index input_dim = input_dims[index];
350  const Index kernel_dim = kernel_dims[i];
351  const Index result_dim = input_dim - kernel_dim + 1;
352  m_dimensions[index] = result_dim;
353  if (i > 0) {
354  m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];
355  } else {
356  m_kernelStride[0] = 1;
357  }
358  m_indexStride[i] = m_inputStride[index];
359  }
360 
361  m_outputStride[0] = 1;
362  for (int i = 1; i < NumDims; ++i) {
363  m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];
364  }
365  } else {
366  for (int i = NumKernelDims - 1; i >= 0; --i) {
367  const Index index = op.indices()[i];
368  const Index input_dim = input_dims[index];
369  const Index kernel_dim = kernel_dims[i];
370  const Index result_dim = input_dim - kernel_dim + 1;
371  m_dimensions[index] = result_dim;
372  if (i < NumKernelDims - 1) {
373  m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];
374  } else {
375  m_kernelStride[NumKernelDims - 1] = 1;
376  }
377  m_indexStride[i] = m_inputStride[index];
378  }
379 
380  m_outputStride[NumDims - 1] = 1;
381  for (int i = NumDims - 2; i >= 0; --i) {
382  m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];
383  }
384  }
385  }
386 
387  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
388 
389  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
390  m_inputImpl.evalSubExprsIfNeeded(NULL);
391  preloadKernel();
392  return true;
393  }
394  EIGEN_STRONG_INLINE void cleanup() {
395  m_inputImpl.cleanup();
396  if (m_local_kernel) {
397  m_device.deallocate((void*)m_kernel);
398  m_local_kernel = false;
399  }
400  m_kernel = NULL;
401  }
402 
403  void evalTo(typename XprType::Scalar* buffer) {
404  evalSubExprsIfNeeded(NULL);
405  for (int i = 0; i < dimensions().TotalSize(); ++i) {
406  buffer[i] += coeff(i);
407  }
408  cleanup();
409  }
410 
411  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
412  {
413  CoeffReturnType result = CoeffReturnType(0);
414  convolve(firstInput(index), 0, NumKernelDims-1, result);
415  return result;
416  }
417 
418  template<int LoadMode>
419  EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const
420  {
421  Index indices[2] = {index, index+PacketSize-1};
422  Index startInputs[2] = {0, 0};
423  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
424  for (int i = NumDims - 1; i > 0; --i) {
425  const Index idx0 = indices[0] / m_outputStride[i];
426  const Index idx1 = indices[1] / m_outputStride[i];
427  startInputs[0] += idx0 * m_inputStride[i];
428  startInputs[1] += idx1 * m_inputStride[i];
429  indices[0] -= idx0 * m_outputStride[i];
430  indices[1] -= idx1 * m_outputStride[i];
431  }
432  } else {
433  for (int i = 0; i < NumDims - 1; ++i) {
434  const Index idx0 = indices[0] / m_outputStride[i];
435  const Index idx1 = indices[1] / m_outputStride[i];
436  startInputs[0] += idx0 * m_inputStride[i];
437  startInputs[1] += idx1 * m_inputStride[i];
438  indices[0] -= idx0 * m_outputStride[i];
439  indices[1] -= idx1 * m_outputStride[i];
440  }
441  }
442  startInputs[0] += indices[0];
443  startInputs[1] += indices[1];
444 
445  if (startInputs[1]-startInputs[0] == PacketSize-1) {
446  PacketReturnType result = internal::pset1<PacketReturnType>(0);
447  convolvePacket(startInputs[0], 0, NumKernelDims-1, result);
448  return result;
449  } else {
450  EIGEN_ALIGN_MAX Scalar data[PacketSize];
451  data[0] = Scalar(0);
452  convolve(startInputs[0], 0, NumKernelDims-1, data[0]);
453  for (int i = 1; i < PacketSize-1; ++i) {
454  data[i] = Scalar(0);
455  convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]);
456  }
457  data[PacketSize-1] = Scalar(0);
458  convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]);
459  return internal::pload<PacketReturnType>(data);
460  }
461  }
462 
463  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
464  costPerCoeff(bool vectorized) const {
465  const double kernel_size = m_kernelImpl.dimensions().TotalSize();
466  // We ignore the use of fused multiply-add.
467  const double convolve_compute_cost =
468  TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
469  const double firstIndex_compute_cost =
470  NumDims *
471  (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
472  TensorOpCost::DivCost<Index>());
473  return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
474  kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
475  m_kernelImpl.costPerCoeff(vectorized) +
476  TensorOpCost(0, 0, convolve_compute_cost, vectorized,
477  PacketSize));
478  }
479 
480  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
481 
482  private:
483  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
484  Index startInput = 0;
485  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
486  for (int i = NumDims - 1; i > 0; --i) {
487  const Index idx = index / m_outputStride[i];
488  startInput += idx * m_inputStride[i];
489  index -= idx * m_outputStride[i];
490  }
491  } else {
492  for (int i = 0; i < NumDims - 1; ++i) {
493  const Index idx = index / m_outputStride[i];
494  startInput += idx * m_inputStride[i];
495  index -= idx * m_outputStride[i];
496  }
497  }
498  startInput += index;
499  return startInput;
500  }
501 
502  EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {
503  for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
504  const Index input = firstIndex + j * m_indexStride[DimIndex];
505  const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
506  if (DimIndex > 0) {
507  convolve(input, kernel, DimIndex-1, accum);
508  } else {
509  accum += m_inputImpl.coeff(input) * m_kernel[kernel];
510  }
511  }
512  }
513 
514  template <typename Packet>
515  EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const {
516  for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
517  const Index input = firstIndex + j * m_indexStride[DimIndex];
518  const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
519  if (DimIndex > 0) {
520  convolvePacket(input, kernel, DimIndex-1, accum);
521  } else {
522  accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input), internal::pset1<Packet>(m_kernel[kernel]), accum);
523  }
524  }
525  }
526 
527  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
528  // Don't make a local copy of the kernel unless we have to (i.e. it's an
529  // expression that needs to be evaluated)
530  const Scalar* in_place = m_kernelImpl.data();
531  if (in_place) {
532  m_kernel = in_place;
533  m_local_kernel = false;
534  } else {
535  size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
536  Scalar* local = (Scalar*)m_device.allocate_temp(kernel_sz);
537  typedef TensorEvalToOp<const KernelArgType> EvalTo;
538  EvalTo evalToTmp(local, m_kernelArg);
539  const bool Vectorize = internal::IsVectorizable<Device, KernelArgType>::value;
540  internal::TensorExecutor<const EvalTo, Device, Vectorize>::run(evalToTmp, m_device);
541 
542  m_kernel = local;
543  m_local_kernel = true;
544  }
545  }
546 
547  array<Index, NumDims> m_inputStride;
548  array<Index, NumDims> m_outputStride;
549 
550  array<Index, NumKernelDims> m_indexStride;
551  array<Index, NumKernelDims> m_kernelStride;
552  TensorEvaluator<InputArgType, Device> m_inputImpl;
553  TensorEvaluator<KernelArgType, Device> m_kernelImpl;
554  Dimensions m_dimensions;
555 
556  KernelArgType m_kernelArg;
557  const Scalar* m_kernel;
558  bool m_local_kernel;
559  const Device EIGEN_DEVICE_REF m_device;
560 };
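// [Editor's note: illustrative sketch, not part of the upstream Eigen source.]
// The evaluator above computes a "valid" convolution: each convolved dimension
// shrinks by kernel_dim - 1 and the kernel is not reversed (so the result is a
// cross-correlation in signal-processing terms). A minimal 1D check on the
// default device:
//
//   Eigen::Tensor<float, 1> in(4);
//   in.setValues({1.f, 2.f, 3.f, 4.f});
//   Eigen::Tensor<float, 1> k(2);
//   k.setValues({10.f, 1.f});
//   Eigen::array<Eigen::Index, 1> dims{{0}};
//   Eigen::Tensor<float, 1> out = in.convolve(k, dims);
//   // out(i) == in(i) * k(0) + in(i + 1) * k(1), i.e. out == {12, 23, 34}.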
561 
562 
563 
564 
565 // Use an optimized implementation of the evaluation code for GPUs whenever possible.
566 #if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
567 
568 template <int StaticKernelSize>
569 struct GetKernelSize {
570  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const {
571  return StaticKernelSize;
572  }
573 };
574 template <>
575 struct GetKernelSize<Dynamic> {
576  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const {
577  return kernelSize;
578  }
579 };
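// [Editor's note: illustrative sketch, not part of the upstream Eigen source.]
// GetKernelSize lets the GPU kernels below be instantiated either with a
// compile-time kernel length (so the #pragma unroll loops over the kernel can
// be fully unrolled) or with Dynamic, in which case the runtime argument wins:
//
//   int a = GetKernelSize<4>()(/*kernelSize=*/7);        // a == 4 (static size)
//   int b = GetKernelSize<Dynamic>()(/*kernelSize=*/7);  // b == 7 (runtime size)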
580 
581 template <typename InputEvaluator, typename Index, typename InputDims,
582  int StaticKernelSize>
583 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel1D(
584  InputEvaluator eval,
585  const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout>
586  indexMapper,
587  const float* __restrict kernel, const int numPlanes, const int numX,
588  const int maxX, const int kernelSize, float* buffer) {
589 #if defined(EIGEN_HIPCC)
590  HIP_DYNAMIC_SHARED(float, s)
591 #else
592  extern __shared__ float s[];
593 #endif
594 
595  const int first_x = blockIdx.x * maxX;
596  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
597  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
598  const int num_x_output = last_x - first_x + 1;
599 
600  const int first_plane = blockIdx.y * blockDim.y;
601  const int plane_stride = blockDim.y * gridDim.y;
602 
603  for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
604  // Load inputs to shared memory
605  const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
606  const int plane_kernel_offset = threadIdx.y * num_x_input;
607  #pragma unroll
608  for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
609  const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x);
610  s[i + plane_kernel_offset] = eval.coeff(tensor_index);
611  }
612 
613  __syncthreads();
614 
615  // Compute the convolution
616  const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
617 
618  #pragma unroll
619  for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
620  const int kernel_offset = plane_kernel_offset + i;
621  float result = 0.0f;
622  #pragma unroll
623  for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
624  result += s[k + kernel_offset] * kernel[k];
625  }
626  const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x);
627  buffer[tensor_index] = result;
628  }
629  __syncthreads();
630  }
631 };
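// [Editor's note: illustrative sketch, not part of the upstream Eigen source.]
// Shared-memory tiling used above: a block that produces up to maxX outputs of
// one plane must load num_x_input = num_x_output + kernelSize - 1 inputs,
// because every output reads kernelSize consecutive inputs. For example, with
// maxX = 128 and kernelSize = 7 each plane row stages up to 134 floats, so a
// block handling block_size.y planes needs
//   block_size.y * (maxX + kernelSize - 1) * sizeof(float)
// bytes of shared memory, which is the shared_mem value computed in
// executeEval() further down.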
632 
633 template <typename InputEvaluator, typename Index, typename InputDims,
634  int StaticKernelSizeX, int StaticKernelSizeY>
635 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel2D(
636  InputEvaluator eval,
637  const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout>
638  indexMapper,
639  const float* __restrict kernel, const int numPlanes, const int numX,
640  const int maxX, const int numY, const int maxY, const int kernelSizeX,
641  const int kernelSizeY, float* buffer) {
642 #if defined(EIGEN_HIPCC)
643  HIP_DYNAMIC_SHARED(float, s)
644 #else
645  extern __shared__ float s[];
646 #endif
647 
648  const int first_x = blockIdx.x * maxX;
649  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
650  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);
651  const int num_x_output = last_x - first_x + 1;
652 
653  const int first_y = blockIdx.y * maxY;
654  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
655  const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);
656  const int num_y_output = last_y - first_y + 1;
657 
658  const int first_plane = blockIdx.z * blockDim.z;
659  const int plane_stride = blockDim.z * gridDim.z;
660 
661  for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {
662 
663  const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
664  const int plane_kernel_offset = threadIdx.z * num_y_input;
665 
666  // Load inputs to shared memory
667  #pragma unroll
668  for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
669  const int input_offset = num_x_input * (j + plane_kernel_offset);
670  #pragma unroll
671  for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
672  const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y);
673  s[i + input_offset] = eval.coeff(tensor_index);
674  }
675  }
676 
677  __syncthreads();
678 
679  // Convolution
680  const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
681 
682  #pragma unroll
683  for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
684  #pragma unroll
685  for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
686  float result = 0.0f;
687  #pragma unroll
688  for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) {
689  const int kernel_offset = kernelSizeX * l;
690  const int input_offset = i + num_x_input * (j + l + plane_kernel_offset);
691  #pragma unroll
692  for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) {
693  result += s[k + input_offset] * kernel[k + kernel_offset];
694  }
695  }
696  const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
697  buffer[tensor_index] = result;
698  }
699  }
700 
701  __syncthreads();
702  }
703 };
704 
705 template <typename InputEvaluator, typename Index, typename InputDims>
706 __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D(
707  InputEvaluator eval,
708  const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout>
709  indexMapper,
710  const float* __restrict kernel, const size_t numPlanes, const size_t numX,
711  const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ,
712  const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
713  const size_t kernelSizeZ, float* buffer) {
714 #if defined(EIGEN_HIPCC)
715  HIP_DYNAMIC_SHARED(float, s)
716 #else
717  extern __shared__ float s[];
718 #endif
719 
720  // Load inputs to shared memory
721  const int first_x = blockIdx.x * maxX;
722  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
723  const int num_x_input = last_x - first_x + kernelSizeX;
724 
725  const int first_y = blockIdx.y * maxY;
726  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
727  const int num_y_input = last_y - first_y + kernelSizeY;
728 
729  const int first_z = blockIdx.z * maxZ;
730  const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
731  const int num_z_input = last_z - first_z + kernelSizeZ;
732 
733  for (int p = 0; p < numPlanes; ++p) {
734 
735  const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
736  const int plane_kernel_offset = 0;
737 
738  for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
739  for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
740  for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
741  const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
742  s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
743  }
744  }
745  }
746 
747  __syncthreads();
748 
749  // Convolution
750  const int num_z_output = last_z - first_z + 1;
751  const int num_y_output = last_y - first_y + 1;
752  const int num_x_output = last_x - first_x + 1;
753  const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
754 
755  for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
756  for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
757  for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
758  float result = 0.0f;
759  for (int n = 0; n < kernelSizeZ; ++n) {
760  for (int m = 0; m < kernelSizeY; ++m) {
761  for (int l = 0; l < kernelSizeX; ++l) {
762  result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)];
763  }
764  }
765  }
766  const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
767  buffer[tensor_index] = result;
768  }
769  }
770  }
771  __syncthreads();
772  }
773 };
774 
775 
776 
777 template<typename Indices, typename InputArgType, typename KernelArgType>
778 struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice>
779 {
780  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
781 
782  static constexpr int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value;
783  static constexpr int NumKernelDims = internal::array_size<Indices>::value;
784  typedef typename XprType::Index Index;
785  typedef DSizes<Index, NumDims> Dimensions;
786  typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions;
787 
788  static constexpr int Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout;
789  enum {
790  IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
791  PacketAccess = false,
792  BlockAccess = false,
793  PreferBlockAccess = false,
794  CoordAccess = false, // to be implemented
795  RawAccess = false
796  };
797 
798  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
799  typedef internal::TensorBlockNotImplemented TensorBlock;
800  //===--------------------------------------------------------------------===//
801 
802  TensorEvaluator(const XprType& op, const GpuDevice& device)
803  : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
804  {
805  EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
806 
807  const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
808  const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
809 
810  m_dimensions = m_inputImpl.dimensions();
811  for (int i = 0; i < NumKernelDims; ++i) {
812  const Index index = op.indices()[i];
813  const Index input_dim = input_dims[index];
814  const Index kernel_dim = kernel_dims[i];
815  const Index result_dim = input_dim - kernel_dim + 1;
816  m_dimensions[index] = result_dim;
817  }
818  }
819 
820  typedef typename XprType::CoeffReturnType CoeffReturnType;
821  typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
822  typedef typename InputArgType::Scalar Scalar;
823  static constexpr int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
824 
825  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
826 
827  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
828  preloadKernel();
829  m_inputImpl.evalSubExprsIfNeeded(NULL);
830  if (data) {
831  executeEval(data);
832  return false;
833  } else {
834  m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
835  executeEval(m_buf);
836  return true;
837  }
838  }
839 
840  EIGEN_STRONG_INLINE void cleanup() {
841  m_inputImpl.cleanup();
842  if (m_buf) {
843  m_device.deallocate(m_buf);
844  m_buf = NULL;
845  }
846  if (m_local_kernel) {
847  m_device.deallocate((void*)m_kernel);
848  m_local_kernel = false;
849  }
850  m_kernel = NULL;
851  }
852 
853  EIGEN_STRONG_INLINE void preloadKernel() {
854  // Don't make a local copy of the kernel unless we have to (i.e. it's an
855  // expression that needs to be evaluated)
856  const Scalar* in_place = m_kernelImpl.data();
857  if (in_place) {
858  m_kernel = in_place;
859  m_local_kernel = false;
860  } else {
861  size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
862  Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
863  typedef TensorEvalToOp<const KernelArgType> EvalTo;
864  EvalTo evalToTmp(local, m_kernelArg);
865  const bool PacketAccess = internal::IsVectorizable<GpuDevice, KernelArgType>::value;
866  internal::TensorExecutor<const EvalTo, GpuDevice, PacketAccess>::run(evalToTmp, m_device);
867 
868  m_kernel = local;
869  m_local_kernel = true;
870  }
871  }
872 
873  static unsigned int ceil(unsigned int num, unsigned int denom) {
874  const unsigned int rounded_toward_zero = num / denom;
875  if (num > rounded_toward_zero * denom) {
876  return rounded_toward_zero + 1;
877  }
878  return rounded_toward_zero;
879  }
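// [Editor's note: illustrative sketch, not part of the upstream Eigen source.]
// ceil() is integer ceiling division, used below to count how many blocks are
// needed to cover a dimension: ceil(10, 4) == 3, ceil(12, 4) == 3, and
// ceil(130, 64) == 3 (three blocks of up to 64 outputs cover numX == 130).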
880 
881  void executeEval(Scalar* data) const {
882  typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;
883 
884  const int maxSharedMem = m_device.sharedMemPerBlock();
885  const int maxThreadsPerBlock = m_device.maxGpuThreadsPerBlock();
886  const int maxBlocksPerProcessor = m_device.maxGpuThreadsPerMultiProcessor() / maxThreadsPerBlock;
887  const int numMultiProcessors = m_device.getNumGpuMultiProcessors();
888  const int warpSize = 32;
889 
890  switch (NumKernelDims) {
891  case 1: {
892  const int kernel_size = m_kernelImpl.dimensions().TotalSize();
893 
894  const int numX = dimensions()[m_indices[0]];
895  const int numP = dimensions().TotalSize() / numX;
896  int maxX;
897  dim3 block_size;
898 
899  const int single_stride_dim =
900  static_cast<int>(Layout) == static_cast<int>(ColMajor)
901  ? 0
902  : m_inputImpl.dimensions().rank() - 1;
903  if (m_indices[0] == single_stride_dim) {
904  // Maximize the reuse
905  const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
906  maxX = numext::mini<int>(inner_dim, numX);
907  const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
908  block_size.x = numext::mini(maxThreadsPerBlock, maxX);
909  block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
910  }
911  else {
912  // Read as much as possible along the innermost dimension, that is, the plane
913  const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar));
914  const int maxP = numext::mini<int>(inner_dim, numP);
915  maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);
916 
917  block_size.x = numext::mini(warpSize, maxX);
918  block_size.y = numext::mini<int>(maxThreadsPerBlock/block_size.x, maxP);
919  }
920 
921  const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
922  gpu_assert(shared_mem <= maxSharedMem);
923 
924  const int num_x_blocks = ceil(numX, maxX);
925  const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
926  const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);
927 
928  dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y)));
929 
930 
931  //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
932 
933  const array<Index, 1> indices(m_indices[0]);
934  const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]);
935  internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(
936  m_inputImpl.dimensions(), kernel_dims, indices);
937  switch(kernel_size) {
938  case 4: {
939  LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
940  break;
941  }
942  case 7: {
943  LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
944  break;
945  }
946  default: {
947  LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
948  }
949  }
950  break;
951  }
952 
953  case 2: {
954  const int idxX =
955  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
956  const int idxY =
957  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
958  const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
959  const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
960 
961  const int numX = dimensions()[m_indices[idxX]];
962  const int numY = dimensions()[m_indices[idxY]];
963  const int numP = dimensions().TotalSize() / (numX*numY);
964 
965  const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));
966 
967  // Snap maxX to warp size
968  int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
969  const int maxX = numext::mini<int>(inner_dim, numX);
970  const int maxY = numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
971  const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);
972 
973  dim3 block_size;
974  block_size.x = numext::mini(1024, maxX);
975  block_size.y = numext::mini<int>(1024/block_size.x, maxY);
976  block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxP);
977 
978  const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
979  gpu_assert(shared_mem <= maxSharedMem);
980 
981  const int num_x_blocks = ceil(numX, maxX);
982  const int num_y_blocks = ceil(numY, maxY);
983  const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
984  const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);
985 
986  dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z)));
987 
988 
989  //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
990 
991  const array<Index, 2> indices(m_indices[idxX], m_indices[idxY]);
992  const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[idxX],
993  m_kernelImpl.dimensions()[idxY]);
994  internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(
995  m_inputImpl.dimensions(), kernel_dims, indices);
996  switch (kernel_size_x) {
997  case 4: {
998  switch (kernel_size_y) {
999  case 7: {
1000  LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
1001  break;
1002  }
1003  default: {
1004  LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
1005  break;
1006  }
1007  }
1008  break;
1009  }
1010  case 7: {
1011  switch (kernel_size_y) {
1012  case 4: {
1013  LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
1014  break;
1015  }
1016  default: {
1017  LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
1018  break;
1019  }
1020  }
1021  break;
1022  }
1023  default: {
1024  LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
1025  break;
1026  }
1027  }
1028  break;
1029  }
1030 
1031  case 3: {
1032  const int idxX =
1033  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
1034  const int idxY =
1035  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
1036  const int idxZ =
1037  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;
1038 
1039  const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
1040  const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
1041  const int kernel_size_z = m_kernelImpl.dimensions()[idxZ];
1042 
1043  const int numX = dimensions()[m_indices[idxX]];
1044  const int numY = dimensions()[m_indices[idxY]];
1045  const int numZ = dimensions()[m_indices[idxZ]];
1046  const int numP = dimensions().TotalSize() / (numX*numY*numZ);
1047 
1048  const int maxX = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX));
1049  const int maxY = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY));
1050  const int maxZ = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ));
1051 
1052  dim3 block_size;
1053  block_size.x = numext::mini(32, maxX);
1054  block_size.y = numext::mini(32, maxY);
1055  block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxZ);
1056  dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));
1057 
1058  const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
1059  gpu_assert(shared_mem <= maxSharedMem);
1060 
1061  //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
1062  const array<Index, 3> indices(m_indices[idxX], m_indices[idxY],
1063  m_indices[idxZ]);
1064  const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[idxX],
1065  m_kernelImpl.dimensions()[idxY],
1066  m_kernelImpl.dimensions()[idxZ]);
1067  internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(
1068  m_inputImpl.dimensions(), kernel_dims, indices);
1069 
1070  LAUNCH_GPU_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
1071  break;
1072  }
1073 
1074  default: {
1075  EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
1076  }
1077  }
1078  }
1079 
1080  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
1081  {
1082  eigen_assert(m_buf);
1083  eigen_assert(index < m_dimensions.TotalSize());
1084  return m_buf[index];
1085  }
1086 
1087  template<int LoadMode>
1088  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const
1089  {
1090  eigen_assert(m_buf);
1091  eigen_assert(index < m_dimensions.TotalSize());
1092  return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index);
1093  }
1094 
1095  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
1096  costPerCoeff(bool vectorized) const {
1097  // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
1098  // model.
1099  const double kernel_size = m_kernelImpl.dimensions().TotalSize();
1100  // We ignore the use of fused multiply-add.
1101  const double convolve_compute_cost =
1102  TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
1103  const double firstIndex_compute_cost =
1104  NumDims *
1105  (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
1106  TensorOpCost::DivCost<Index>());
1107  return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
1108  kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
1109  m_kernelImpl.costPerCoeff(vectorized) +
1110  TensorOpCost(0, 0, convolve_compute_cost, vectorized,
1111  PacketSize));
1112  }
1113 
1114  private:
1115  // No assignment (copies are needed by the kernels)
1116  TensorEvaluator& operator = (const TensorEvaluator&);
1117 
1118  TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
1119  TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
1120  KernelArgType m_kernelArg;
1121  Indices m_indices;
1122  Dimensions m_dimensions;
1123  Scalar* m_buf;
1124  const Scalar* m_kernel;
1125  bool m_local_kernel;
1126 
1127  const GpuDevice& m_device;
1128 };
1129 #endif
1130 
1131 
1132 } // end namespace Eigen
1133 
1134 #endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H