#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H

#include "./InternalHeaderCheck.h"

namespace Eigen {

/** \class TensorConvolution
 * \ingroup CXX11_Tensor_Module
 *
 * \brief Tensor convolution class.
 */
namespace internal {

// Maps the plane/kernel coordinates used by the GPU convolution kernels back
// to linear offsets into the input and output tensors.
template <typename Index, typename InputDims, int NumKernelDims, int Layout>
class IndexMapper {
 public:
  IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,
              const array<Index, NumKernelDims>& indices) {
    array<Index, NumDims> dimensions = input_dims;
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = indices[i];
      const Index input_dim = input_dims[index];
      const Index kernel_dim = kernel_dims[i];
      const Index result_dim = input_dim - kernel_dim + 1;
      dimensions[index] = result_dim;
    }

    array<Index, NumDims> inputStrides;
    array<Index, NumDims> outputStrides;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      inputStrides[0] = 1;
      outputStrides[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        inputStrides[i] = inputStrides[i - 1] * input_dims[i - 1];
        outputStrides[i] = outputStrides[i - 1] * dimensions[i - 1];
      }
    } else {
      inputStrides[NumDims - 1] = 1;
      outputStrides[NumDims - 1] = 1;
      for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) {
        inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
        outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1];
      }
    }
    array<Index, NumDims> gpuInputDimensions;
    array<Index, NumDims> gpuOutputDimensions;
    array<Index, NumDims> tmp = dimensions;
    array<Index, NumDims> ordering;
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = i + offset;
      ordering[index] = indices[i];
      tmp[indices[i]] = -1;
      gpuInputDimensions[index] = input_dims[indices[i]];
      gpuOutputDimensions[index] = dimensions[indices[i]];
    }

    int written = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? NumKernelDims : 0;
    for (int i = 0; i < NumDims; ++i) {
      if (tmp[i] >= 0) {
        ordering[written] = i;
        gpuInputDimensions[written] = input_dims[i];
        gpuOutputDimensions[written] = dimensions[i];
        ++written;
      }
    }
    for (int i = 0; i < NumDims; ++i) {
      m_inputStrides[i] = inputStrides[ordering[i]];
      m_outputStrides[i] = outputStrides[ordering[i]];
    }

    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < NumDims; ++i) {
        if (i > NumKernelDims) {
          m_gpuInputStrides[i] = m_gpuInputStrides[i - 1] * gpuInputDimensions[i - 1];
          m_gpuOutputStrides[i] = m_gpuOutputStrides[i - 1] * gpuOutputDimensions[i - 1];
        } else {
          m_gpuInputStrides[i] = 1;
          m_gpuOutputStrides[i] = 1;
        }
      }
    } else {
      for (int i = NumDims - 1; i >= 0; --i) {
        if (static_cast<size_t>(i + 1) < offset) {
          m_gpuInputStrides[i] = m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1];
          m_gpuOutputStrides[i] = m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1];
        } else {
          m_gpuInputStrides[i] = 1;
          m_gpuOutputStrides[i] = 1;
        }
      }
    }
  }
  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputPlaneToTensorInputOffset(Index p) const {
    Index inputIndex = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int d = NumDims - 1; d > NumKernelDims; --d) {
        const Index idx = p / m_gpuInputStrides[d];
        inputIndex += idx * m_inputStrides[d];
        p -= idx * m_gpuInputStrides[d];
      }
      if (NumKernelDims < NumDims) {
        inputIndex += p * m_inputStrides[NumKernelDims];
      }
    } else {
      std::ptrdiff_t limit = 0;
      if (NumKernelDims < NumDims) {
        limit = NumDims - NumKernelDims - 1;
      }
      for (int d = 0; d < limit; ++d) {
        const Index idx = p / m_gpuInputStrides[d];
        inputIndex += idx * m_inputStrides[d];
        p -= idx * m_gpuInputStrides[d];
      }
      inputIndex += p * m_inputStrides[limit];
    }
    return inputIndex;
  }
  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputPlaneToTensorOutputOffset(Index p) const {
    Index outputIndex = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int d = NumDims - 1; d > NumKernelDims; --d) {
        const Index idx = p / m_gpuOutputStrides[d];
        outputIndex += idx * m_outputStrides[d];
        p -= idx * m_gpuOutputStrides[d];
      }
      if (NumKernelDims < NumDims) {
        outputIndex += p * m_outputStrides[NumKernelDims];
      }
    } else {
      std::ptrdiff_t limit = 0;
      if (NumKernelDims < NumDims) {
        limit = NumDims - NumKernelDims - 1;
      }
      for (int d = 0; d < limit; ++d) {
        const Index idx = p / m_gpuOutputStrides[d];
        outputIndex += idx * m_outputStrides[d];
        p -= idx * m_gpuOutputStrides[d];
      }
      outputIndex += p * m_outputStrides[limit];
    }
    return outputIndex;
  }
  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
    return i * m_inputStrides[offset];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
    return i * m_outputStrides[offset];
  }
  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
    return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
    return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
  }
  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
    return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] + k * m_inputStrides[offset + 2];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
    return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] + k * m_outputStrides[offset + 2];
  }
 private:
  static constexpr int NumDims = internal::array_size<InputDims>::value;
  array<Index, NumDims> m_inputStrides;
  array<Index, NumDims> m_outputStrides;
  array<Index, NumDims> m_gpuInputStrides;
  array<Index, NumDims> m_gpuOutputStrides;
};
template <typename Dimensions, typename InputXprType, typename KernelXprType>
struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> > {
  // Type promotion to handle the case where the types of the lhs and the rhs are different.
  typedef typename promote_storage_type<typename InputXprType::Scalar,
                                        typename KernelXprType::Scalar>::ret Scalar;
  typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind,
                                        typename traits<KernelXprType>::StorageKind>::ret StorageKind;
  typedef typename promote_index_type<typename traits<InputXprType>::Index,
                                      typename traits<KernelXprType>::Index>::type Index;
  typedef typename InputXprType::Nested LhsNested;
  typedef typename KernelXprType::Nested RhsNested;
  typedef std::remove_reference_t<LhsNested> LhsNested_;
  typedef std::remove_reference_t<RhsNested> RhsNested_;
  static constexpr int NumDimensions = traits<InputXprType>::NumDimensions;
  static constexpr int Layout = traits<InputXprType>::Layout;
  typedef std::conditional_t<Pointer_type_promotion<typename InputXprType::Scalar, Scalar>::val,
                             typename traits<InputXprType>::PointerType,
                             typename traits<KernelXprType>::PointerType> PointerType;

  enum { Flags = 0 };
};
template <typename Dimensions, typename InputXprType, typename KernelXprType>
struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense> {
  typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type;
};
template <typename Dimensions, typename InputXprType, typename KernelXprType>
struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1,
              typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type> {
  typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type;
};

}  // end namespace internal
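
// Usage sketch (illustrative, not part of the implementation): a
// TensorConvolutionOp is normally created via Tensor::convolve(), e.g.
//
//   Eigen::Tensor<float, 3> input(20, 30, 50);
//   Eigen::Tensor<float, 2> kernel(3, 3);
//   Eigen::array<ptrdiff_t, 2> dims{1, 2};  // convolve along dimensions 1 and 2
//   Eigen::Tensor<float, 3> output = input.convolve(kernel, dims);
//   // output dimensions: 20 x 28 x 48, i.e. input_dim - kernel_dim + 1
//   // along each convolved dimension.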
template <typename Indices, typename InputXprType, typename KernelXprType>
class TensorConvolutionOp
    : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors> {
 public:
  typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar;
  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
  typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType,
                                                  typename KernelXprType::CoeffReturnType>::ret CoeffReturnType;
  typedef typename Eigen::internal::nested<TensorConvolutionOp>::type Nested;
  typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind;
  typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel,
                                                            const Indices& dims)
      : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Indices& indices() const { return m_indices; }

  /** \returns the nested expressions */
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const internal::remove_all_t<typename InputXprType::Nested>&
  inputExpression() const {
    return m_input_xpr;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const internal::remove_all_t<typename KernelXprType::Nested>&
  kernelExpression() const {
    return m_kernel_xpr;
  }

 protected:
  typename InputXprType::Nested m_input_xpr;
  typename KernelXprType::Nested m_kernel_xpr;
  const Indices m_indices;
};
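
// Note on evaluation (summary of the evaluators below, not additional API):
// on a generic Device the convolution is evaluated lazily, coefficient by
// coefficient, through the recursive convolve()/convolvePacket() helpers; on a
// GpuDevice the result is precomputed into a temporary buffer (m_buf) by the
// dedicated shared-memory kernels, and coeff()/packet() simply read from it.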
// Eval as rvalue
template <typename Indices, typename InputArgType, typename KernelArgType, typename Device>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device> {
  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;

  static constexpr int NumDims =
      internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value;
  static constexpr int NumKernelDims = internal::array_size<Indices>::value;
  typedef typename XprType::Index Index;
  typedef DSizes<Index, NumDims> Dimensions;

  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
  typedef StorageMemory<Scalar, Device> Storage;
  typedef typename Storage::Type EvaluatorPointerType;

  static constexpr int Layout = TensorEvaluator<InputArgType, Device>::Layout;
  enum {
    IsAligned = int(TensorEvaluator<InputArgType, Device>::IsAligned) &
                int(TensorEvaluator<KernelArgType, Device>::IsAligned),
    PacketAccess = int(TensorEvaluator<InputArgType, Device>::PacketAccess) &
                   int(TensorEvaluator<KernelArgType, Device>::PacketAccess),
    BlockAccess = false,
    PreferBlockAccess = false,
    CoordAccess = false,  // to be implemented
    RawAccess = false
  };

  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
  typedef internal::TensorBlockNotImplemented TensorBlock;
  //===--------------------------------------------------------------------===//
  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
      : m_inputImpl(op.inputExpression(), device),
        m_kernelImpl(op.kernelExpression(), device),
        m_kernelArg(op.kernelExpression()),
        m_kernel(NULL),
        m_local_kernel(false),
        m_device(device) {
    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) ==
                         static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)),
                        YOU_MADE_A_PROGRAMMING_MISTAKE);

    const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
    const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();

    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      m_inputStride[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];
      }
    } else {
      m_inputStride[NumDims - 1] = 1;
      for (int i = NumDims - 2; i >= 0; --i) {
        m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];
      }
    }
    m_dimensions = m_inputImpl.dimensions();
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < NumKernelDims; ++i) {
        const Index index = op.indices()[i];
        const Index input_dim = input_dims[index];
        const Index kernel_dim = kernel_dims[i];
        const Index result_dim = input_dim - kernel_dim + 1;
        m_dimensions[index] = result_dim;
        if (i > 0) {
          m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];
        } else {
          m_kernelStride[0] = 1;
        }
        m_indexStride[i] = m_inputStride[index];
      }

      m_outputStride[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];
      }
    } else {
      for (int i = NumKernelDims - 1; i >= 0; --i) {
        const Index index = op.indices()[i];
        const Index input_dim = input_dims[index];
        const Index kernel_dim = kernel_dims[i];
        const Index result_dim = input_dim - kernel_dim + 1;
        m_dimensions[index] = result_dim;
        if (i < NumKernelDims - 1) {
          m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];
        } else {
          m_kernelStride[NumKernelDims - 1] = 1;
        }
        m_indexStride[i] = m_inputStride[index];
      }

      m_outputStride[NumDims - 1] = 1;
      for (int i = NumDims - 2; i >= 0; --i) {
        m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];
      }
    }
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
    m_inputImpl.evalSubExprsIfNeeded(NULL);
    preloadKernel();
    return true;
  }
  EIGEN_STRONG_INLINE void cleanup() {
    m_inputImpl.cleanup();
    if (m_local_kernel) {
      m_device.deallocate((void*)m_kernel);
      m_local_kernel = false;
    }
    m_kernel = NULL;
  }

  void evalTo(typename XprType::Scalar* buffer) {
    evalSubExprsIfNeeded(NULL);
    for (int i = 0; i < dimensions().TotalSize(); ++i) {
      buffer[i] += coeff(i);
    }
    cleanup();
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
    CoeffReturnType result = CoeffReturnType(0);
    convolve(firstInput(index), 0, NumKernelDims - 1, result);
    return result;
  }
  template <int LoadMode>
  EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const {
    Index indices[2] = {index, index + PacketSize - 1};
    Index startInputs[2] = {0, 0};
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx0 = indices[0] / m_outputStride[i];
        const Index idx1 = indices[1] / m_outputStride[i];
        startInputs[0] += idx0 * m_inputStride[i];
        startInputs[1] += idx1 * m_inputStride[i];
        indices[0] -= idx0 * m_outputStride[i];
        indices[1] -= idx1 * m_outputStride[i];
      }
    } else {
      for (int i = 0; i < NumDims - 1; ++i) {
        const Index idx0 = indices[0] / m_outputStride[i];
        const Index idx1 = indices[1] / m_outputStride[i];
        startInputs[0] += idx0 * m_inputStride[i];
        startInputs[1] += idx1 * m_inputStride[i];
        indices[0] -= idx0 * m_outputStride[i];
        indices[1] -= idx1 * m_outputStride[i];
      }
    }
    startInputs[0] += indices[0];
    startInputs[1] += indices[1];

    if (startInputs[1] - startInputs[0] == PacketSize - 1) {
      // The whole packet maps to a contiguous span of inputs: convolve it in one go.
      PacketReturnType result = internal::pset1<PacketReturnType>(0);
      convolvePacket(startInputs[0], 0, NumKernelDims - 1, result);
      return result;
    } else {
      // Fall back to scalar convolutions and assemble the packet from a local buffer.
      EIGEN_ALIGN_MAX Scalar data[PacketSize];
      data[0] = Scalar(0);
      convolve(startInputs[0], 0, NumKernelDims - 1, data[0]);
      for (int i = 1; i < PacketSize - 1; ++i) {
        data[i] = Scalar(0);
        convolve(firstInput(index + i), 0, NumKernelDims - 1, data[i]);
      }
      data[PacketSize - 1] = Scalar(0);
      convolve(startInputs[1], 0, NumKernelDims - 1, data[PacketSize - 1]);
      return internal::pload<PacketReturnType>(data);
    }
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
    // We ignore the use of fused multiply-add.
    const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
    const double firstIndex_compute_cost =
        NumDims *
        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
           kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
                          TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
  }

  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }

 private:
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
    Index startInput = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx = index / m_outputStride[i];
        startInput += idx * m_inputStride[i];
        index -= idx * m_outputStride[i];
      }
    } else {
      for (int i = 0; i < NumDims - 1; ++i) {
        const Index idx = index / m_outputStride[i];
        startInput += idx * m_inputStride[i];
        index -= idx * m_outputStride[i];
      }
    }
    startInput += index;
    return startInput;
  }
  EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {
    for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
      const Index input = firstIndex + j * m_indexStride[DimIndex];
      const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
      if (DimIndex > 0) {
        convolve(input, kernel, DimIndex - 1, accum);
      } else {
        accum += m_inputImpl.coeff(input) * m_kernel[kernel];
      }
    }
  }
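
  // Illustration (not an extra code path): for a 2-D kernel the recursion above
  // expands to the familiar double loop, roughly
  //
  //   for (j1 = 0; j1 < kernel_dim1; ++j1)
  //     for (j0 = 0; j0 < kernel_dim0; ++j0)
  //       accum += input(first + j0 * stride0 + j1 * stride1) * kernel[j0 + j1 * kstride1];
  //
  // with the strides taken from m_indexStride and m_kernelStride.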
  template <typename Packet>
  EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const {
    for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
      const Index input = firstIndex + j * m_indexStride[DimIndex];
      const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
      if (DimIndex > 0) {
        convolvePacket(input, kernel, DimIndex - 1, accum);
      } else {
        accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input),
                                        internal::pset1<Packet>(m_kernel[kernel]), accum);
      }
    }
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
    // Don't make a local copy of the kernel unless we have to (i.e. it's an
    // expression that needs to be evaluated).
    const Scalar* in_place = m_kernelImpl.data();
    if (in_place) {
      m_kernel = in_place;
      m_local_kernel = false;
    } else {
      size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
      Scalar* local = (Scalar*)m_device.allocate_temp(kernel_sz);
      typedef TensorEvalToOp<const KernelArgType> EvalTo;
      EvalTo evalToTmp(local, m_kernelArg);
      const bool Vectorize = internal::IsVectorizable<Device, KernelArgType>::value;
      internal::TensorExecutor<const EvalTo, Device, Vectorize>::run(evalToTmp, m_device);

      m_kernel = local;
      m_local_kernel = true;
    }
  }
  array<Index, NumDims> m_inputStride;
  array<Index, NumDims> m_outputStride;

  array<Index, NumKernelDims> m_indexStride;
  array<Index, NumKernelDims> m_kernelStride;
  TensorEvaluator<InputArgType, Device> m_inputImpl;
  TensorEvaluator<KernelArgType, Device> m_kernelImpl;
  Dimensions m_dimensions;

  KernelArgType m_kernelArg;
  const Scalar* m_kernel;
  bool m_local_kernel;
  const Device EIGEN_DEVICE_REF m_device;
};
// Use an optimized implementation of the evaluation code for GPUs whenever possible.
#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)

template <int StaticKernelSize>
struct GetKernelSize {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(const int /*kernelSize*/) const { return StaticKernelSize; }
};
template <>
struct GetKernelSize<Dynamic> {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(const int kernelSize) const { return kernelSize; }
};
template <typename InputEvaluator, typename Index, typename InputDims, int StaticKernelSize>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel1D(
    InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout> indexMapper,
    const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int kernelSize,
    float* buffer) {
#if defined(EIGEN_HIPCC)
  HIP_DYNAMIC_SHARED(float, s)
#else
  extern __shared__ float s[];
#endif

  const int first_x = blockIdx.x * maxX;
  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
  const int num_x_output = last_x - first_x + 1;

  const int first_plane = blockIdx.y * blockDim.y;
  const int plane_stride = blockDim.y * gridDim.y;

  for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
    // Load inputs to shared memory
    const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
    const int plane_kernel_offset = threadIdx.y * num_x_input;
    #pragma unroll
    for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
      const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + first_x);
      s[i + plane_kernel_offset] = eval.coeff(tensor_index);
    }

    __syncthreads();

    // Compute the convolution
    const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);

    #pragma unroll
    for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
      const int kernel_offset = plane_kernel_offset + i;
      float result = 0.0f;
      #pragma unroll
      for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
        result += s[k + kernel_offset] * kernel[k];
      }
      const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i + first_x);
      buffer[tensor_index] = result;
    }
    __syncthreads();
  }
}
template <typename InputEvaluator, typename Index, typename InputDims, int StaticKernelSizeX, int StaticKernelSizeY>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel2D(
    InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout> indexMapper,
    const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int numY,
    const int maxY, const int kernelSizeX, const int kernelSizeY, float* buffer) {
#if defined(EIGEN_HIPCC)
  HIP_DYNAMIC_SHARED(float, s)
#else
  extern __shared__ float s[];
#endif

  const int first_x = blockIdx.x * maxX;
  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);
  const int num_x_output = last_x - first_x + 1;

  const int first_y = blockIdx.y * maxY;
  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
  const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);
  const int num_y_output = last_y - first_y + 1;

  const int first_plane = blockIdx.z * blockDim.z;
  const int plane_stride = blockDim.z * gridDim.z;

  for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {
    const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
    const int plane_kernel_offset = threadIdx.z * num_y_input;

    // Load inputs to shared memory
    #pragma unroll
    for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
      const int input_offset = num_x_input * (j + plane_kernel_offset);
      #pragma unroll
      for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
        const int tensor_index =
            plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + first_x, j + first_y);
        s[i + input_offset] = eval.coeff(tensor_index);
      }
    }

    __syncthreads();

    // Convolution
    const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);

    #pragma unroll
    for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
      #pragma unroll
      for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
        float result = 0.0f;
        #pragma unroll
        for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) {
          const int kernel_offset = kernelSizeX * l;
          const int input_offset = i + num_x_input * (j + l + plane_kernel_offset);
          #pragma unroll
          for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) {
            result += s[k + input_offset] * kernel[k + kernel_offset];
          }
        }
        const int tensor_index =
            plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i + first_x, j + first_y);
        buffer[tensor_index] = result;
      }
    }

    __syncthreads();
  }
}
template <typename InputEvaluator, typename Index, typename InputDims>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D(
    InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout> indexMapper,
    const float* __restrict kernel, const size_t numPlanes, const size_t numX, const size_t maxX, const size_t numY,
    const size_t maxY, const size_t numZ, const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
    const size_t kernelSizeZ, float* buffer) {
#if defined(EIGEN_HIPCC)
  HIP_DYNAMIC_SHARED(float, s)
#else
  extern __shared__ float s[];
#endif

  // Load inputs to shared memory
  const int first_x = blockIdx.x * maxX;
  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
  const int num_x_input = last_x - first_x + kernelSizeX;

  const int first_y = blockIdx.y * maxY;
  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
  const int num_y_input = last_y - first_y + kernelSizeY;

  const int first_z = blockIdx.z * maxZ;
  const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
  const int num_z_input = last_z - first_z + kernelSizeZ;

  for (int p = 0; p < numPlanes; ++p) {
    const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
    const int plane_kernel_offset = 0;

    for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
      for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
        for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
          const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
                                                            i + first_x, j + first_y, k + first_z);
          s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
        }
      }
    }

    __syncthreads();

    // Convolution
    const int num_z_output = last_z - first_z + 1;
    const int num_y_output = last_y - first_y + 1;
    const int num_x_output = last_x - first_x + 1;
    const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);

    for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
      for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
        for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
          float result = 0.0f;
          for (int n = 0; n < kernelSizeZ; ++n) {
            for (int m = 0; m < kernelSizeY; ++m) {
              for (int l = 0; l < kernelSizeX; ++l) {
                result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] *
                          kernel[l + kernelSizeX * (m + kernelSizeY * n)];
              }
            }
          }
          const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(
                                                             i + first_x, j + first_y, k + first_z);
          buffer[tensor_index] = result;
        }
      }
    }
    __syncthreads();
  }
}
template <typename Indices, typename InputArgType, typename KernelArgType>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice> {
  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;

  static constexpr int NumDims =
      internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value;
  static constexpr int NumKernelDims = internal::array_size<Indices>::value;
  typedef typename XprType::Index Index;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions;

  static constexpr int Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout;
  enum {
    IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned &
                TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
    PacketAccess = false,
    BlockAccess = false,
    PreferBlockAccess = false,
    CoordAccess = false,  // to be implemented
    RawAccess = false
  };

  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
  typedef internal::TensorBlockNotImplemented TensorBlock;
  //===--------------------------------------------------------------------===//

  TensorEvaluator(const XprType& op, const GpuDevice& device)
      : m_inputImpl(op.inputExpression(), device),
        m_kernelImpl(op.kernelExpression(), device),
        m_kernelArg(op.kernelExpression()),
        m_indices(op.indices()),
        m_buf(NULL),
        m_kernel(NULL),
        m_local_kernel(false),
        m_device(device) {
    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) ==
                         static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)),
                        YOU_MADE_A_PROGRAMMING_MISTAKE);

    const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
    const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();

    m_dimensions = m_inputImpl.dimensions();
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = op.indices()[i];
      const Index input_dim = input_dims[index];
      const Index kernel_dim = kernel_dims[i];
      const Index result_dim = input_dim - kernel_dim + 1;
      m_dimensions[index] = result_dim;
    }
  }

  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
  typedef typename InputArgType::Scalar Scalar;
  static constexpr int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
    preloadKernel();
    m_inputImpl.evalSubExprsIfNeeded(NULL);
    if (data) {
      executeEval(data);
      return false;
    } else {
      m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
      executeEval(m_buf);
      return true;
    }
  }

  EIGEN_STRONG_INLINE void cleanup() {
    m_inputImpl.cleanup();
    if (m_buf) {
      m_device.deallocate(m_buf);
      m_buf = NULL;
    }
    if (m_local_kernel) {
      m_device.deallocate((void*)m_kernel);
      m_local_kernel = false;
    }
    m_kernel = NULL;
  }

  EIGEN_STRONG_INLINE void preloadKernel() {
    // Don't make a local copy of the kernel unless we have to (i.e. it's an
    // expression that needs to be evaluated).
    const Scalar* in_place = m_kernelImpl.data();
    if (in_place) {
      m_kernel = in_place;
      m_local_kernel = false;
    } else {
      size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
      Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
      typedef TensorEvalToOp<const KernelArgType> EvalTo;
      EvalTo evalToTmp(local, m_kernelArg);
      const bool PacketAccess = internal::IsVectorizable<GpuDevice, KernelArgType>::value;
      internal::TensorExecutor<const EvalTo, GpuDevice, PacketAccess>::run(evalToTmp, m_device);

      m_kernel = local;
      m_local_kernel = true;
    }
  }
  static unsigned int ceil(unsigned int num, unsigned int denom) {
    const unsigned int rounded_toward_zero = num / denom;
    if (num > rounded_toward_zero * denom) {
      return rounded_toward_zero + 1;
    }
    return rounded_toward_zero;
  }
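  // For example, ceil(10, 3) == 4 and ceil(9, 3) == 3: the quotient is rounded
  // up whenever the division leaves a remainder.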
  void executeEval(Scalar* data) const {
    typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;

    const int maxSharedMem = m_device.sharedMemPerBlock();
    const int maxThreadsPerBlock = m_device.maxGpuThreadsPerBlock();
    const int maxBlocksPerProcessor = m_device.maxGpuThreadsPerMultiProcessor() / maxThreadsPerBlock;
    const int numMultiProcessors = m_device.getNumGpuMultiProcessors();
    const int warpSize = 32;

    switch (NumKernelDims) {
      case 1: {
        const int kernel_size = m_kernelImpl.dimensions().TotalSize();

        const int numX = dimensions()[m_indices[0]];
        const int numP = dimensions().TotalSize() / numX;
        int maxX;
        dim3 block_size;

        const int single_stride_dim = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                                          ? 0
                                          : m_inputImpl.dimensions().rank() - 1;
        if (m_indices[0] == single_stride_dim) {
          // Maximize the reuse of the loaded data.
          const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
          maxX = numext::mini<int>(inner_dim, numX);
          const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
          block_size.x = numext::mini(maxThreadsPerBlock, maxX);
          block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
        } else {
          // Read as much as possible alongside the inner most dimension, that is the plane.
          const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar));
          const int maxP = numext::mini<int>(inner_dim, numP);
          maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);

          block_size.x = numext::mini(warpSize, maxX);
          block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
        }

        const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
        gpu_assert(shared_mem <= maxSharedMem);

        const int num_x_blocks = ceil(numX, maxX);
        const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
        const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);

        dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y)));

        const array<Index, 1> indices(m_indices[0]);
        const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]);
        internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
        switch (kernel_size) {
          case 4: {
            LAUNCH_GPU_KERNEL(
                (EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks,
                block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
            break;
          }
          case 7: {
            LAUNCH_GPU_KERNEL(
                (EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks,
                block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
            break;
          }
          default: {
            LAUNCH_GPU_KERNEL(
                (EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>),
                num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX,
                kernel_size, data);
          }
        }
        break;
      }
      case 2: {
        const int idxX = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
        const int idxY = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
        const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
        const int kernel_size_y = m_kernelImpl.dimensions()[idxY];

        const int numX = dimensions()[m_indices[idxX]];
        const int numY = dimensions()[m_indices[idxY]];
        const int numP = dimensions().TotalSize() / (numX * numY);

        const float scaling_factor =
            sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));

        // Snap maxX to warp size.
        int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
        const int maxX = numext::mini<int>(inner_dim, numX);
        const int maxY =
            numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
        const int maxP = numext::mini<int>(
            maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);

        dim3 block_size;
        block_size.x = numext::mini(1024, maxX);
        block_size.y = numext::mini<int>(1024 / block_size.x, maxY);
        block_size.z = numext::mini<int>(1024 / (block_size.x * block_size.y), maxP);

        const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
        gpu_assert(shared_mem <= maxSharedMem);

        const int num_x_blocks = ceil(numX, maxX);
        const int num_y_blocks = ceil(numY, maxY);
        const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
        const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);

        dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z)));

        const array<Index, 2> indices(m_indices[idxX], m_indices[idxY]);
        const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[idxX], m_kernelImpl.dimensions()[idxY]);
        internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
        switch (kernel_size_x) {
          case 4: {
            switch (kernel_size_y) {
              case 7: {
                LAUNCH_GPU_KERNEL(
                    (EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>),
                    num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX,
                    numY, maxY, 4, 7, data);
                break;
              }
              default: {
                LAUNCH_GPU_KERNEL(
                    (EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>),
                    num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX,
                    numY, maxY, 4, kernel_size_y, data);
                break;
              }
            }
            break;
          }
          case 7: {
            switch (kernel_size_y) {
              case 4: {
                LAUNCH_GPU_KERNEL(
                    (EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>),
                    num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX,
                    numY, maxY, 7, 4, data);
                break;
              }
              default: {
                LAUNCH_GPU_KERNEL(
                    (EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>),
                    num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX,
                    numY, maxY, 7, kernel_size_y, data);
                break;
              }
            }
            break;
          }
          default: {
            LAUNCH_GPU_KERNEL(
                (EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic,
                                          Dynamic>),
                num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX,
                numY, maxY, kernel_size_x, kernel_size_y, data);
            break;
          }
        }
        break;
      }
      case 3: {
        const int idxX = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
        const int idxY = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
        const int idxZ = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;

        const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
        const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
        const int kernel_size_z = m_kernelImpl.dimensions()[idxZ];

        const int numX = dimensions()[m_indices[idxX]];
        const int numY = dimensions()[m_indices[idxY]];
        const int numZ = dimensions()[m_indices[idxZ]];
        const int numP = dimensions().TotalSize() / (numX * numY * numZ);

        const int maxX = numext::mini<int>(
            128, numext::mini<int>(
                     maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX));
        const int maxY = numext::mini<int>(
            128, numext::mini<int>(
                     maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1,
                     numY));
        const int maxZ = numext::mini<int>(
            128, numext::mini<int>(
                     maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) -
                         kernel_size_z + 1,
                     numZ));

        dim3 block_size;
        block_size.x = numext::mini(32, maxX);
        block_size.y = numext::mini(32, maxY);
        block_size.z = numext::mini<int>(1024 / (block_size.x * block_size.y), maxZ);
        dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));

        const int shared_mem =
            (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
        gpu_assert(shared_mem <= maxSharedMem);

        const array<Index, 3> indices(m_indices[idxX], m_indices[idxY], m_indices[idxZ]);
        const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[idxX], m_kernelImpl.dimensions()[idxY],
                                          m_kernelImpl.dimensions()[idxZ]);
        internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);

        LAUNCH_GPU_KERNEL(
            (EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks,
            block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ,
            maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
        break;
      }

      default: {
        EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3),
                            THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
      }
    }
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
    eigen_assert(m_buf);
    eigen_assert(index < m_dimensions.TotalSize());
    return m_buf[index];
  }

  template <int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const {
    eigen_assert(m_buf);
    eigen_assert(index < m_dimensions.TotalSize());
    return internal::ploadt<PacketReturnType, LoadMode>(m_buf + index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    // For now this is just a copy of the CPU cost model.
    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
    // We ignore the use of fused multiply-add.
    const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
    const double firstIndex_compute_cost =
        NumDims *
        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
           kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
                          TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
  }
 private:
  // No assignment (copies are needed by the kernels).
  TensorEvaluator& operator=(const TensorEvaluator&);

  TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
  TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
  KernelArgType m_kernelArg;
  Indices m_indices;
  Dimensions m_dimensions;
  Scalar* m_buf;
  const Scalar* m_kernel;
  bool m_local_kernel;

  const GpuDevice& m_device;
};
#endif

}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H