Eigen-unsupported  3.4.90 (git rev 67eeba6e720c5745abc77ae6c92ce0a44aa7b7ae)
TensorExecutor.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H

#include "./InternalHeaderCheck.h"

namespace Eigen {

namespace internal {

// TODO(ezhulenev): Add specializations for all other types of Tensor ops.

template <typename Expression>
struct ExpressionHasTensorBroadcastingOp {
  enum { value = false };
};

template <typename LhsXprType, typename RhsXprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorAssignOp<LhsXprType, RhsXprType> > {
  enum { value = ExpressionHasTensorBroadcastingOp<RhsXprType>::value };
};

template <typename UnaryOp, typename XprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorCwiseUnaryOp<UnaryOp, XprType> > {
  enum { value = ExpressionHasTensorBroadcastingOp<XprType>::value };
};

template <typename BinaryOp, typename LhsXprType, typename RhsXprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> > {
  enum {
    value = ExpressionHasTensorBroadcastingOp<LhsXprType>::value ||
            ExpressionHasTensorBroadcastingOp<RhsXprType>::value
  };
};

template <typename Broadcast, typename XprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorBroadcastingOp<Broadcast, XprType> > {
  enum { value = true };
};

// -------------------------------------------------------------------------- //

template <typename Expression, typename Device, bool Vectorizable,
          TiledEvaluation Tiling>
class TensorExecutor {
 public:
  typedef typename Expression::Index StorageIndex;

  // Including `unsupported/Eigen/CXX11/Tensor` in different translation units
  // with/without `EIGEN_USE_THREADS` or `EIGEN_USE_GPU` is a potential ODR
  // violation. If this template is instantiated with a non-default device, it
  // means that this header file was included without defining
  // `EIGEN_USE_THREADS`, `EIGEN_USE_GPU` or `EIGEN_USE_SYCL`.
  static_assert(std::is_same<Device, DefaultDevice>::value,
                "Default executor instantiated with non-default device. "
                "You must #define EIGEN_USE_THREADS, EIGEN_USE_GPU or "
                "EIGEN_USE_SYCL before including Eigen headers.");

  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const Device& device = Device()) {
    TensorEvaluator<Expression, Device> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      const StorageIndex size = array_prod(evaluator.dimensions());
      for (StorageIndex i = 0; i < size; ++i) {
        evaluator.evalScalar(i);
      }
    }
    evaluator.cleanup();
  }
};
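
// Illustrative usage sketch (not part of the original header): the executor is
// selected implicitly when a tensor expression is assigned. With no device
// macro defined, the DefaultDevice paths in this file evaluate the expression
// on the calling thread, e.g.
//
//   #include <unsupported/Eigen/CXX11/Tensor>
//   Eigen::Tensor<float, 2> a(256, 256), b(256, 256), c(256, 256);
//   a.setRandom();
//   b.setRandom();
//   c = a + b;  // builds a TensorAssignOp and dispatches to TensorExecutor
//
// Defining EIGEN_USE_THREADS, EIGEN_USE_GPU or EIGEN_USE_SYCL before the
// include enables the device-specific specializations defined further below.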

template <typename Expression, typename Device, typename DoneCallback,
          bool Vectorizable, TiledEvaluation Tiling>
class TensorAsyncExecutor {};

template <typename Expression>
class TensorExecutor<Expression, DefaultDevice, /*Vectorizable=*/true,
                     /*Tiling=*/TiledEvaluation::Off> {
 public:
  typedef typename Expression::Index StorageIndex;

  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE void run(
      const Expression& expr, const DefaultDevice& device = DefaultDevice()) {
    TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      const StorageIndex size = array_prod(evaluator.dimensions());
      const int PacketSize = unpacket_traits<typename TensorEvaluator<
          Expression, DefaultDevice>::PacketReturnType>::size;

      // Give the compiler a strong hint to unroll the loop, but don't insist
      // on unrolling: if evaluating the expression is expensive, the compiler
      // should not unroll the loop at the expense of inlining.
      const StorageIndex UnrolledSize =
          (size / (4 * PacketSize)) * 4 * PacketSize;
      for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) {
        for (StorageIndex j = 0; j < 4; j++) {
          evaluator.evalPacket(i + j * PacketSize);
        }
      }
      const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize;
      for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
        evaluator.evalPacket(i);
      }
      for (StorageIndex i = VectorizedSize; i < size; ++i) {
        evaluator.evalScalar(i);
      }
    }
    evaluator.cleanup();
  }
};
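
// Worked example for the loop structure above (numbers assumed for
// illustration only): with PacketSize = 8 and size = 100,
//   UnrolledSize   = (100 / 32) * 32 = 96, evaluated in unrolled groups of
//                    4 packets;
//   VectorizedSize = (100 / 8) * 8  = 96, so the middle packet loop has no
//                    extra work here;
//   coefficients 96..99 are handled by the scalar tail loop.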

template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, DefaultDevice, Vectorizable,
                     /*Tiling=*/TiledEvaluation::On> {
 public:
  typedef typename traits<Expression>::Scalar Scalar;
  typedef std::remove_const_t<Scalar> ScalarNoConst;

  typedef TensorEvaluator<Expression, DefaultDevice> Evaluator;
  typedef typename traits<Expression>::Index StorageIndex;

  static constexpr int NumDims = traits<Expression>::NumDimensions;

  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const DefaultDevice& device = DefaultDevice()) {
    typedef TensorBlockMapper<NumDims, Evaluator::Layout, StorageIndex>
        TensorBlockMapper;

    typedef internal::TensorBlockDescriptor<NumDims, StorageIndex>
        TensorBlockDesc;
    typedef internal::TensorBlockScratchAllocator<DefaultDevice>
        TensorBlockScratch;

    Evaluator evaluator(expr, device);

    // TODO(ezhulenev): Do not use tiling for small tensors?
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);

    if (needs_assign) {
      // Query the expression tree for the desired block size/shape.
      const TensorBlockResourceRequirements requirements =
          evaluator.getResourceRequirements();

      const TensorBlockMapper block_mapper(
          typename TensorBlockDesc::Dimensions(evaluator.dimensions()),
          requirements);

      // Share the scratch memory allocator between all blocks.
      TensorBlockScratch scratch(device);

      const StorageIndex total_block_count = block_mapper.blockCount();
      for (StorageIndex i = 0; i < total_block_count; ++i) {
        TensorBlockDesc desc = block_mapper.blockDescriptor(i);
        evaluator.evalBlock(desc, scratch);
        scratch.reset();
      }
    }
    evaluator.cleanup();
  }
};

#ifdef EIGEN_USE_THREADS

template <typename TensorBlockMapper>
struct TensorExecutorTilingContext {
  TensorExecutorTilingContext() = default;
  TensorExecutorTilingContext(const TensorBlockMapper& b_mapper,
                              const TensorOpCost& b_cost, size_t b_aligned_size)
      : block_mapper(b_mapper),
        cost(b_cost),
        aligned_blocksize(b_aligned_size) {}

  TensorBlockMapper block_mapper;  // navigate through blocks
  TensorOpCost cost;               // cost of computing a single block
  size_t aligned_blocksize;        // block size after memory alignment
};

// Computes block evaluation parameters and allocates a temporary memory buffer
// for blocks. See TensorExecutor/TensorAsyncExecutor (Tiling=On) below.
template <typename Evaluator, typename TensorBlockMapper, bool Vectorizable>
TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext(
    const Evaluator& evaluator) {
  // Query the expression tree for the desired block size/shape.
  TensorBlockResourceRequirements requirements =
      evaluator.getResourceRequirements();

  // Update the target block size based on the cost model.
  double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(
      1, requirements.cost_per_coeff);
  requirements.size = static_cast<size_t>(1.0 / taskSize);

  TensorBlockMapper block_mapper(
      typename TensorBlockMapper::Dimensions(evaluator.dimensions()),
      requirements);

  size_t block_size = block_mapper.blockTotalSize();
  const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
  const size_t aligned_blocksize =
      align *
      divup<size_t>(block_size * sizeof(typename Evaluator::Scalar), align);

  return {block_mapper, requirements.cost_per_coeff * block_size,
          aligned_blocksize};
}
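
// Worked example for the scratch sizing above (numbers assumed for
// illustration only): for a block of 1000 float coefficients and
// EIGEN_MAX_ALIGN_BYTES = 64, the aligned block size is
//   64 * divup(1000 * 4, 64) = 64 * 63 = 4032 bytes,
// i.e. the raw block footprint rounded up to a multiple of the alignment.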

template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EvalRange {
  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
                  const StorageIndex lastIdx) {
    Evaluator evaluator = *evaluator_in;
    eigen_assert(lastIdx >= firstIdx);
    for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
      evaluator.evalScalar(i);
    }
  }

  static StorageIndex alignBlockSize(StorageIndex size) { return size; }
};

template <typename Evaluator, typename StorageIndex>
struct EvalRange<Evaluator, StorageIndex, /*Vectorizable*/ true> {
  static constexpr int PacketSize =
      unpacket_traits<typename Evaluator::PacketReturnType>::size;

  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
                  const StorageIndex lastIdx) {
    Evaluator evaluator = *evaluator_in;
    eigen_assert(lastIdx >= firstIdx);
    StorageIndex i = firstIdx;
    if (lastIdx - firstIdx >= PacketSize) {
      eigen_assert(firstIdx % PacketSize == 0);
      StorageIndex last_chunk_offset = lastIdx - 4 * PacketSize;
      // Give the compiler a strong hint to unroll the loop, but don't insist
      // on unrolling: if evaluating the expression is expensive, the compiler
      // should not unroll the loop at the expense of inlining.
      for (; i <= last_chunk_offset; i += 4 * PacketSize) {
        for (StorageIndex j = 0; j < 4; j++) {
          evaluator.evalPacket(i + j * PacketSize);
        }
      }
      last_chunk_offset = lastIdx - PacketSize;
      for (; i <= last_chunk_offset; i += PacketSize) {
        evaluator.evalPacket(i);
      }
    }
    for (; i < lastIdx; ++i) {
      evaluator.evalScalar(i);
    }
  }

  static StorageIndex alignBlockSize(StorageIndex size) {
    // Align the block size to the packet size and account for the unrolling
    // in run() above.
    if (size >= 16 * PacketSize) {
      return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
    }
    // Aligning to 4 * PacketSize would increase the block size by more than 25%.
    return (size + PacketSize - 1) & ~(PacketSize - 1);
  }
};
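
// Worked example for alignBlockSize above (numbers assumed for illustration
// only): with PacketSize = 4,
//   alignBlockSize(70) -> 70 >= 16 * 4, round up to a multiple of 16 -> 80;
//   alignBlockSize(30) -> 30 <  16 * 4, round up to a multiple of 4  -> 32.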

template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index StorageIndex;

  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const ThreadPoolDevice& device) {
    typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
    typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;

    Evaluator evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
    if (needs_assign) {
      const StorageIndex size = array_prod(evaluator.dimensions());
      device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
                         EvalRange::alignBlockSize,
                         [&evaluator](StorageIndex firstIdx, StorageIndex lastIdx) {
                           EvalRange::run(&evaluator, firstIdx, lastIdx);
                         });
    }
    evaluator.cleanup();
  }
};
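
// Illustrative usage sketch (not part of the original header): this
// specialization is reached by assigning through a ThreadPoolDevice, e.g.
//
//   #define EIGEN_USE_THREADS
//   #include <unsupported/Eigen/CXX11/Tensor>
//   Eigen::ThreadPool pool(4);                 // 4 worker threads
//   Eigen::ThreadPoolDevice device(&pool, 4);
//   c.device(device) = a + b;                  // EvalRange runs in parallel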

template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
                     /*Tiling=*/TiledEvaluation::On> {
 public:
  typedef typename traits<Expression>::Index IndexType;
  typedef typename traits<Expression>::Scalar Scalar;
  typedef std::remove_const_t<Scalar> ScalarNoConst;

  static constexpr int NumDims = traits<Expression>::NumDimensions;

  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
  typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;

  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
      TensorBlockScratch;

  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const ThreadPoolDevice& device) {
    Evaluator evaluator(expr, device);

    const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
    if (needs_assign) {
      const TilingContext tiling =
          internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper,
                                                   Vectorizable>(evaluator);

      auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx,
                                                       IndexType lastBlockIdx) {
        TensorBlockScratch scratch(device);

        for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
             ++block_idx) {
          TensorBlockDesc desc = tiling.block_mapper.blockDescriptor(block_idx);
          evaluator.evalBlock(desc, scratch);
          scratch.reset();
        }
      };

      // Evaluate small expressions directly as a single block.
      if (tiling.block_mapper.blockCount() == 1) {
        TensorBlockScratch scratch(device);
        TensorBlockDesc desc(0, tiling.block_mapper.blockDimensions());
        evaluator.evalBlock(desc, scratch);
      } else {
        device.parallelFor(tiling.block_mapper.blockCount(), tiling.cost,
                           eval_block);
      }
    }
    evaluator.cleanup();
  }
};

template <typename Expression, typename DoneCallback, bool Vectorizable,
          TiledEvaluation Tiling>
class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
                          Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index StorageIndex;
  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;

  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
                                           const ThreadPoolDevice& device,
                                           DoneCallback done) {
    TensorAsyncExecutorContext* const ctx =
        new TensorAsyncExecutorContext(expr, device, std::move(done));

    const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void {
      if (!need_assign) {
        delete ctx;
        return;
      }

      typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
      const StorageIndex size = array_prod(ctx->evaluator.dimensions());
      device.parallelForAsync(
          size, ctx->evaluator.costPerCoeff(Vectorizable),
          EvalRange::alignBlockSize,
          [ctx](StorageIndex firstIdx, StorageIndex lastIdx) {
            EvalRange::run(&ctx->evaluator, firstIdx, lastIdx);
          },
          [ctx]() { delete ctx; });
    };

    ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
  }

 private:
  struct TensorAsyncExecutorContext {
    TensorAsyncExecutorContext(const Expression& expr,
                               const ThreadPoolDevice& thread_pool,
                               DoneCallback done)
        : evaluator(expr, thread_pool), on_done(std::move(done)) {}

    ~TensorAsyncExecutorContext() {
      evaluator.cleanup();
      on_done();
    }

    Evaluator evaluator;

   private:
    DoneCallback on_done;
  };
};
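
// Illustrative usage sketch (an assumption based on the asynchronous device
// API, not part of the original header): the async executors are reached via
// the callback overload of device(), e.g.
//
//   Eigen::Barrier done_barrier(1);
//   c.device(device, [&done_barrier]() { done_barrier.Notify(); }) = a + b;
//   done_barrier.Wait();  // the done callback fires after evaluation finishes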

template <typename Expression, typename DoneCallback, bool Vectorizable>
class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
                          Vectorizable, /*Tileable*/ TiledEvaluation::On> {
 public:
  typedef typename traits<Expression>::Index IndexType;
  typedef typename traits<Expression>::Scalar Scalar;
  typedef std::remove_const_t<Scalar> ScalarNoConst;

  static constexpr int NumDims = traits<Expression>::NumDimensions;

  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
  typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;

  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
      TensorBlockScratch;

  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
                                           const ThreadPoolDevice& device,
                                           DoneCallback done) {
    TensorAsyncExecutorContext* const ctx =
        new TensorAsyncExecutorContext(expr, device, std::move(done));

    const auto on_eval_subexprs = [ctx](bool need_assign) -> void {
      if (!need_assign) {
        delete ctx;
        return;
      }

      ctx->tiling = internal::GetTensorExecutorTilingContext<
          Evaluator, BlockMapper, Vectorizable>(ctx->evaluator);

      auto eval_block = [ctx](IndexType firstBlockIdx, IndexType lastBlockIdx) {
        TensorBlockScratch scratch(ctx->device);

        for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
             ++block_idx) {
          TensorBlockDesc desc =
              ctx->tiling.block_mapper.blockDescriptor(block_idx);
          ctx->evaluator.evalBlock(desc, scratch);
          scratch.reset();
        }
      };

      // Evaluate small expressions directly as a single block.
      if (ctx->tiling.block_mapper.blockCount() == 1) {
        TensorBlockScratch scratch(ctx->device);
        TensorBlockDesc desc(0, ctx->tiling.block_mapper.blockDimensions());
        ctx->evaluator.evalBlock(desc, scratch);
        delete ctx;
      } else {
        ctx->device.parallelForAsync(ctx->tiling.block_mapper.blockCount(),
                                     ctx->tiling.cost, eval_block,
                                     [ctx]() { delete ctx; });
      }
    };

    ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
  }

 private:
  struct TensorAsyncExecutorContext {
    TensorAsyncExecutorContext(const Expression& expr,
                               const ThreadPoolDevice& thread_pool,
                               DoneCallback done)
        : device(thread_pool),
          evaluator(expr, thread_pool),
          on_done(std::move(done)) {}

    ~TensorAsyncExecutorContext() {
      evaluator.cleanup();
      on_done();
    }

    const ThreadPoolDevice& device;
    Evaluator evaluator;
    TilingContext tiling;

   private:
    DoneCallback on_done;
  };
};

#endif  // EIGEN_USE_THREADS

// GPU: the evaluation of the expression is offloaded to a GPU.
#if defined(EIGEN_USE_GPU)

template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index StorageIndex;
  static void run(const Expression& expr, const GpuDevice& device);
};

#if defined(EIGEN_GPUCC)
// Returns 1 if lhs + rhs would overflow, -1 if it would underflow, otherwise 0.
template <typename Index>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int sum_will_overflow(Index lhs,
                                                            Index rhs) {
  const Index highest = NumTraits<Index>::highest();
  const Index lowest = NumTraits<Index>::lowest();
  if (lhs > 0 && rhs > 0) {
    return lhs > highest - rhs ? 1 : 0;
  } else if (lhs < 0 && rhs < 0) {
    return lhs < lowest - rhs ? -1 : 0;
  } else {
    return 0;
  }
}

// Returns lhs + rhs, saturating to the highest/lowest representable value on
// overflow/underflow respectively.
template <typename Index>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index saturate_add(Index lhs, Index rhs) {
  const Index highest = NumTraits<Index>::highest();
  const Index lowest = NumTraits<Index>::lowest();
  int overflow = sum_will_overflow(lhs, rhs);
  return overflow == 1 ? highest : overflow == -1 ? lowest : lhs + rhs;
}

// A functor that adds step_size to a given index, saturating to avoid
// overflow/underflow. If overflow/underflow is not possible, regular addition
// is used (for efficiency).
template <typename Index>
struct SafeStep {
  // lastIdx is one past the end of the possible indexes.
  // step_size is the value that will be added to the given index when the
  // functor is called.
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SafeStep(Index lastIdx, Index step_size)
      : can_overflow_(sum_will_overflow(lastIdx, step_size)),
        step_size_(step_size) {}

  // Adds step_size to index, saturating on overflow (if overflow is possible).
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index operator()(Index index) const {
    return can_overflow_ ? saturate_add(index, step_size_) : index + step_size_;
  }

 private:
  const bool can_overflow_;
  const Index step_size_;
};
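
// Worked example (for illustration only): with Index = int,
// saturate_add(INT_MAX - 1, 4) returns INT_MAX instead of triggering signed
// overflow, so the grid-stride loops below simply stop once the index
// saturates at or beyond lastIdx.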

template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EigenMetaKernelEval {
  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void run(
      Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx,
      StorageIndex step_size) {
    SafeStep<StorageIndex> safe_step(lastIdx, step_size);
    for (StorageIndex i = firstIdx; i < lastIdx; i = safe_step(i)) {
      eval.evalScalar(i);
    }
  }
};

template <typename Evaluator, typename StorageIndex>
struct EigenMetaKernelEval<Evaluator, StorageIndex, true> {
  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void run(
      Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx,
      StorageIndex step_size) {
    const StorageIndex PacketSize =
        unpacket_traits<typename Evaluator::PacketReturnType>::size;
    const StorageIndex vectorized_size = (lastIdx / PacketSize) * PacketSize;
    const StorageIndex vectorized_step_size = step_size * PacketSize;

    SafeStep<StorageIndex> safe_vectorized_step(vectorized_size,
                                                vectorized_step_size);
    // Use the vector path.
    for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size;
         i = safe_vectorized_step(i)) {
      eval.evalPacket(i);
    }
    SafeStep<StorageIndex> safe_step(lastIdx, step_size);
    for (StorageIndex i = saturate_add(vectorized_size, firstIdx); i < lastIdx;
         i = safe_step(i)) {
      eval.evalScalar(i);
    }
  }
};

template <typename Evaluator, typename StorageIndex>
__global__ void __launch_bounds__(1024)
    EigenMetaKernel(Evaluator eval, StorageIndex size) {
  const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x;
  const StorageIndex step_size = blockDim.x * gridDim.x;

  const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
  EigenMetaKernelEval<Evaluator, StorageIndex, vectorizable>::run(
      eval, first_index, size, step_size);
}

/*static*/
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
EIGEN_STRONG_INLINE void
TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling>::run(
    const Expression& expr, const GpuDevice& device) {
  TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
  const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
  if (needs_assign) {
    const int block_size = device.maxGpuThreadsPerBlock();
    const int max_blocks =
        numext::mini<int64_t>(device.getNumGpuMultiProcessors() *
                                  device.maxGpuThreadsPerMultiProcessor(),
                              NumTraits<StorageIndex>::highest()) /
        block_size;
    const StorageIndex size = array_prod(evaluator.dimensions());
    // Create at least one block to ensure we won't crash when TensorFlow calls
    // with tensors of size 0.
    const int num_blocks = numext::maxi<int>(
        numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);

    LAUNCH_GPU_KERNEL(
        (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>),
        num_blocks, block_size, 0, device, evaluator, size);
  }
  evaluator.cleanup();
}
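
// Illustrative usage sketch (not part of the original header): with
// EIGEN_USE_GPU defined and the file compiled by a CUDA/HIP compiler, this
// path is reached via a GpuDevice, e.g.
//
//   Eigen::GpuStreamDevice stream;
//   Eigen::GpuDevice gpu_device(&stream);
//   c.device(gpu_device) = a + b;  // launches EigenMetaKernel on the stream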

#endif  // EIGEN_GPUCC
#endif  // EIGEN_USE_GPU

// SYCL Executor policy
#ifdef EIGEN_USE_SYCL

template <typename Evaluator>
struct ExecExprFunctorKernel {
  typedef typename Evaluator::Index Index;
  Evaluator evaluator;
  const Index range;
  template <typename Scratch>
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel(
      const Scratch, Evaluator evaluator_, const Index range_)
      : evaluator(evaluator_), range(range_) {}

  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()(
      cl::sycl::nd_item<1> itemID) {
    compute(itemID);
  }
  template <bool is_vec = Evaluator::PacketAccess>
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t<!is_vec> compute(
      const cl::sycl::nd_item<1>& itemID) {
    Index gId = static_cast<Index>(itemID.get_global_linear_id());
    Index total_threads = itemID.get_global_range(0);

    for (Index i = gId; i < range; i += total_threads) {
      evaluator.evalScalar(i);
    }
  }
  template <bool is_vec = Evaluator::PacketAccess>
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t<is_vec> compute(
      const cl::sycl::nd_item<1>& itemID) {
    const Index vectorizedRange =
        (range / Evaluator::PacketSize) * Evaluator::PacketSize;
    Index gId = static_cast<Index>(itemID.get_global_linear_id());
    const Index step = Evaluator::PacketSize * itemID.get_global_range(0);
    const Index start = Evaluator::PacketSize * gId;
    for (Index i = start; i < vectorizedRange; i += step) {
      evaluator.evalPacket(i);
    }
    gId += vectorizedRange;
    for (Index i = gId; i < range; i += itemID.get_global_range(0)) {
      evaluator.evalScalar(i);
    }
  }
};

template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index Index;
  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const Eigen::SyclDevice& dev) {
    typedef TensorEvaluator<Expression, Eigen::SyclDevice> Evaluator;
    Evaluator evaluator(expr, dev);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      Index range, GRange, tileSize;
      Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions());
      total_size = (total_size == 0) ? 1 : total_size;
      const int PacketSize =
          Eigen::PacketType<typename Evaluator::CoeffReturnType,
                            Eigen::SyclDevice>::size;
      Index vectorizable_threads = static_cast<Index>(total_size / PacketSize);
      dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange);
      range = total_size;

      dev.template nullary_kernel_launcher<
          typename Evaluator::CoeffReturnType,
          ExecExprFunctorKernel<Evaluator> >(
          evaluator,
          cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange),
                                cl::sycl::range<1>(tileSize)),
          Index(1), range);
    }
    evaluator.cleanup();
  }
};

#endif  // EIGEN_USE_SYCL

}  // end namespace internal

}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H