Eigen-unsupported  3.4.90 (git rev 67eeba6e720c5745abc77ae6c92ce0a44aa7b7ae)
TensorExecutor.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H

#include "./InternalHeaderCheck.h"

namespace Eigen {

namespace internal {

// TODO(ezhulenev): Add specializations for all other types of Tensor ops.

template <typename Expression>
struct ExpressionHasTensorBroadcastingOp {
  enum { value = false };
};

template <typename LhsXprType, typename RhsXprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorAssignOp<LhsXprType, RhsXprType> > {
  enum { value = ExpressionHasTensorBroadcastingOp<RhsXprType>::value };
};

template <typename UnaryOp, typename XprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorCwiseUnaryOp<UnaryOp, XprType> > {
  enum { value = ExpressionHasTensorBroadcastingOp<XprType>::value };
};

template <typename BinaryOp, typename LhsXprType, typename RhsXprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> > {
  enum {
    value = ExpressionHasTensorBroadcastingOp<LhsXprType>::value ||
            ExpressionHasTensorBroadcastingOp<RhsXprType>::value
  };
};

template <typename Broadcast, typename XprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorBroadcastingOp<Broadcast, XprType> > {
  enum { value = true };
};

// -------------------------------------------------------------------------- //

template <typename Expression, typename Device, bool Vectorizable,
          TiledEvaluation Tiling>
class TensorExecutor {
 public:
  typedef typename Expression::Index StorageIndex;

  // Including `unsupported/Eigen/CXX11/Tensor` in different translation units
  // with/without `EIGEN_USE_THREADS` or `EIGEN_USE_GPU` is a potential ODR
  // violation. If this template is instantiated with a non-default device, it
  // means that this header file was included without defining
  // `EIGEN_USE_THREADS`, `EIGEN_USE_GPU` or `EIGEN_USE_SYCL`.
  static_assert(std::is_same<Device, DefaultDevice>::value,
                "Default executor instantiated with non-default device. "
                "You must #define EIGEN_USE_THREADS, EIGEN_USE_GPU or "
                "EIGEN_USE_SYCL before including Eigen headers.");

  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const Device& device = Device()) {
    TensorEvaluator<Expression, Device> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      const StorageIndex size = array_prod(evaluator.dimensions());
      for (StorageIndex i = 0; i < size; ++i) {
        evaluator.evalScalar(i);
      }
    }
    evaluator.cleanup();
  }
};
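
// Illustrative usage sketch (not part of the original header): the executor is
// selected implicitly when a tensor expression is assigned. With no device
// macro defined, the DefaultDevice paths in this file evaluate the expression
// on the calling thread, e.g.
//
//   #include <unsupported/Eigen/CXX11/Tensor>
//   Eigen::Tensor<float, 2> a(256, 256), b(256, 256), c(256, 256);
//   a.setRandom();
//   b.setRandom();
//   c = a + b;  // builds a TensorAssignOp and dispatches to TensorExecutor
//
// Defining EIGEN_USE_THREADS, EIGEN_USE_GPU or EIGEN_USE_SYCL before the
// include enables the device-specific specializations defined further below.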

template <typename Expression, typename Device, typename DoneCallback,
          bool Vectorizable, TiledEvaluation Tiling>
class TensorAsyncExecutor {};

template <typename Expression>
class TensorExecutor<Expression, DefaultDevice, /*Vectorizable=*/true,
                     /*Tiling=*/TiledEvaluation::Off> {
 public:
  typedef typename Expression::Index StorageIndex;

  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE void run(
      const Expression& expr, const DefaultDevice& device = DefaultDevice()) {
    TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      const StorageIndex size = array_prod(evaluator.dimensions());
      const int PacketSize = unpacket_traits<typename TensorEvaluator<
          Expression, DefaultDevice>::PacketReturnType>::size;

      // Give the compiler a strong hint to unroll the loop, but don't insist
      // on unrolling: if evaluating the expression is expensive, the compiler
      // should not unroll the loop at the expense of inlining.
      const StorageIndex UnrolledSize =
          (size / (4 * PacketSize)) * 4 * PacketSize;
      for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) {
        for (StorageIndex j = 0; j < 4; j++) {
          evaluator.evalPacket(i + j * PacketSize);
        }
      }
      const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize;
      for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
        evaluator.evalPacket(i);
      }
      for (StorageIndex i = VectorizedSize; i < size; ++i) {
        evaluator.evalScalar(i);
      }
    }
    evaluator.cleanup();
  }
};
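
// Worked example for the loop structure above (numbers assumed for
// illustration only): with PacketSize = 8 and size = 100,
//   UnrolledSize   = (100 / 32) * 32 = 96, evaluated in unrolled groups of
//                    4 packets;
//   VectorizedSize = (100 / 8) * 8  = 96, so the middle packet loop has no
//                    extra work here;
//   coefficients 96..99 are handled by the scalar tail loop.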

template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, DefaultDevice, Vectorizable,
                     /*Tiling=*/TiledEvaluation::On> {
 public:
  typedef typename traits<Expression>::Scalar Scalar;
  typedef std::remove_const_t<Scalar> ScalarNoConst;

  typedef TensorEvaluator<Expression, DefaultDevice> Evaluator;
  typedef typename traits<Expression>::Index StorageIndex;

  static constexpr int NumDims = traits<Expression>::NumDimensions;

  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const DefaultDevice& device = DefaultDevice()) {
    typedef TensorBlockMapper<NumDims, Evaluator::Layout, StorageIndex>
        TensorBlockMapper;

    typedef internal::TensorBlockDescriptor<NumDims, StorageIndex>
        TensorBlockDesc;
    typedef internal::TensorBlockScratchAllocator<DefaultDevice>
        TensorBlockScratch;

    Evaluator evaluator(expr, device);

    // TODO(ezhulenev): Do not use tiling for small tensors?
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);

    if (needs_assign) {
      // Query the expression tree for the desired block size/shape.
      const TensorBlockResourceRequirements requirements =
          evaluator.getResourceRequirements();

      const TensorBlockMapper block_mapper(
          typename TensorBlockDesc::Dimensions(evaluator.dimensions()),
          requirements);

      // Share the scratch memory allocator between all blocks.
      TensorBlockScratch scratch(device);

      const StorageIndex total_block_count = block_mapper.blockCount();
      for (StorageIndex i = 0; i < total_block_count; ++i) {
        TensorBlockDesc desc = block_mapper.blockDescriptor(i);
        evaluator.evalBlock(desc, scratch);
        scratch.reset();
      }
    }
    evaluator.cleanup();
  }
};

#ifdef EIGEN_USE_THREADS

template <typename TensorBlockMapper>
struct TensorExecutorTilingContext {
  TensorExecutorTilingContext() = default;
  TensorExecutorTilingContext(const TensorBlockMapper& b_mapper,
                              const TensorOpCost& b_cost, size_t b_aligned_size)
      : block_mapper(b_mapper),
        cost(b_cost),
        aligned_blocksize(b_aligned_size) {}

  TensorBlockMapper block_mapper;  // navigate through blocks
  TensorOpCost cost;               // cost of computing a single block
  size_t aligned_blocksize;        // block size after memory alignment
};

// Computes block evaluation parameters and allocates a temporary memory buffer
// for blocks. See TensorExecutor/TensorAsyncExecutor (Tiling=On) below.
template <typename Evaluator, typename TensorBlockMapper, bool Vectorizable>
TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext(
    const Evaluator& evaluator) {
  // Query the expression tree for the desired block size/shape.
  TensorBlockResourceRequirements requirements =
      evaluator.getResourceRequirements();

  // Update the target block size based on the cost model.
  double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(
      1, requirements.cost_per_coeff);
  requirements.size = static_cast<size_t>(1.0 / taskSize);

  TensorBlockMapper block_mapper(
      typename TensorBlockMapper::Dimensions(evaluator.dimensions()),
      requirements);

  size_t block_size = block_mapper.blockTotalSize();
  const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
  const size_t aligned_blocksize =
      align *
      divup<size_t>(block_size * sizeof(typename Evaluator::Scalar), align);

  return {block_mapper, requirements.cost_per_coeff * block_size,
          aligned_blocksize};
}
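
// Worked example for the scratch sizing above (numbers assumed for
// illustration only): for a block of 1000 float coefficients and
// EIGEN_MAX_ALIGN_BYTES = 64, the aligned block size is
//   64 * divup(1000 * 4, 64) = 64 * 63 = 4032 bytes,
// i.e. the raw block footprint rounded up to a multiple of the alignment.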

template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EvalRange {
  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
                  const StorageIndex lastIdx) {
    Evaluator evaluator = *evaluator_in;
    eigen_assert(lastIdx >= firstIdx);
    for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
      evaluator.evalScalar(i);
    }
  }

  static StorageIndex alignBlockSize(StorageIndex size) { return size; }
};

template <typename Evaluator, typename StorageIndex>
struct EvalRange<Evaluator, StorageIndex, /*Vectorizable*/ true> {
  static constexpr int PacketSize =
      unpacket_traits<typename Evaluator::PacketReturnType>::size;

  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
                  const StorageIndex lastIdx) {
    Evaluator evaluator = *evaluator_in;
    eigen_assert(lastIdx >= firstIdx);
    StorageIndex i = firstIdx;
    if (lastIdx - firstIdx >= PacketSize) {
      eigen_assert(firstIdx % PacketSize == 0);
      StorageIndex last_chunk_offset = lastIdx - 4 * PacketSize;
      // Give the compiler a strong hint to unroll the loop, but don't insist
      // on unrolling: if evaluating the expression is expensive, the compiler
      // should not unroll the loop at the expense of inlining.
      for (; i <= last_chunk_offset; i += 4 * PacketSize) {
        for (StorageIndex j = 0; j < 4; j++) {
          evaluator.evalPacket(i + j * PacketSize);
        }
      }
      last_chunk_offset = lastIdx - PacketSize;
      for (; i <= last_chunk_offset; i += PacketSize) {
        evaluator.evalPacket(i);
      }
    }
    for (; i < lastIdx; ++i) {
      evaluator.evalScalar(i);
    }
  }

  static StorageIndex alignBlockSize(StorageIndex size) {
    // Align the block size to the packet size and account for the unrolling
    // in run() above.
    if (size >= 16 * PacketSize) {
      return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
    }
    // Aligning to 4 * PacketSize would increase the block size by more than 25%.
    return (size + PacketSize - 1) & ~(PacketSize - 1);
  }
};
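
// Worked example for alignBlockSize above (numbers assumed for illustration
// only): with PacketSize = 4,
//   alignBlockSize(70) -> 70 >= 16 * 4, round up to a multiple of 16 -> 80;
//   alignBlockSize(30) -> 30 <  16 * 4, round up to a multiple of 4  -> 32.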

template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index StorageIndex;

  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const ThreadPoolDevice& device) {
    typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
    typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;

    Evaluator evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
    if (needs_assign) {
      const StorageIndex size = array_prod(evaluator.dimensions());
      device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
                         EvalRange::alignBlockSize,
                         [&evaluator](StorageIndex firstIdx, StorageIndex lastIdx) {
                           EvalRange::run(&evaluator, firstIdx, lastIdx);
                         });
    }
    evaluator.cleanup();
  }
};
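
// Illustrative usage sketch (not part of the original header): this
// specialization is reached by assigning through a ThreadPoolDevice, e.g.
//
//   #define EIGEN_USE_THREADS
//   #include <unsupported/Eigen/CXX11/Tensor>
//   Eigen::ThreadPool pool(4);                 // 4 worker threads
//   Eigen::ThreadPoolDevice device(&pool, 4);
//   c.device(device) = a + b;                  // EvalRange runs in parallel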

template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
                     /*Tiling=*/TiledEvaluation::On> {
 public:
  typedef typename traits<Expression>::Index IndexType;
  typedef typename traits<Expression>::Scalar Scalar;
  typedef std::remove_const_t<Scalar> ScalarNoConst;

  static constexpr int NumDims = traits<Expression>::NumDimensions;

  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
  typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;

  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
      TensorBlockScratch;

  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const ThreadPoolDevice& device) {
    Evaluator evaluator(expr, device);

    const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
    if (needs_assign) {
      const TilingContext tiling =
          internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper,
                                                   Vectorizable>(evaluator);

      auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx,
                                                       IndexType lastBlockIdx) {
        TensorBlockScratch scratch(device);

        for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
             ++block_idx) {
          TensorBlockDesc desc = tiling.block_mapper.blockDescriptor(block_idx);
          evaluator.evalBlock(desc, scratch);
          scratch.reset();
        }
      };

      // Evaluate small expressions directly as a single block.
      if (tiling.block_mapper.blockCount() == 1) {
        TensorBlockScratch scratch(device);
        TensorBlockDesc desc(0, tiling.block_mapper.blockDimensions());
        evaluator.evalBlock(desc, scratch);
      } else {
        device.parallelFor(tiling.block_mapper.blockCount(), tiling.cost,
                           eval_block);
      }
    }
    evaluator.cleanup();
  }
};

template <typename Expression, typename DoneCallback, bool Vectorizable,
          TiledEvaluation Tiling>
class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
                          Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index StorageIndex;
  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;

  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
                                           const ThreadPoolDevice& device,
                                           DoneCallback done) {
    TensorAsyncExecutorContext* const ctx =
        new TensorAsyncExecutorContext(expr, device, std::move(done));

    const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void {
      if (!need_assign) {
        delete ctx;
        return;
      }

      typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
      const StorageIndex size = array_prod(ctx->evaluator.dimensions());
      device.parallelForAsync(
          size, ctx->evaluator.costPerCoeff(Vectorizable),
          EvalRange::alignBlockSize,
          [ctx](StorageIndex firstIdx, StorageIndex lastIdx) {
            EvalRange::run(&ctx->evaluator, firstIdx, lastIdx);
          },
          [ctx]() { delete ctx; });
    };

    ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
  }

 private:
  struct TensorAsyncExecutorContext {
    TensorAsyncExecutorContext(const Expression& expr,
                               const ThreadPoolDevice& thread_pool,
                               DoneCallback done)
        : evaluator(expr, thread_pool), on_done(std::move(done)) {}

    ~TensorAsyncExecutorContext() {
      evaluator.cleanup();
      on_done();
    }

    Evaluator evaluator;

   private:
    DoneCallback on_done;
  };
};
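
// Illustrative usage sketch (an assumption based on the asynchronous device
// API, not part of the original header): the async executors are reached via
// the callback overload of device(), e.g.
//
//   Eigen::Barrier done_barrier(1);
//   c.device(device, [&done_barrier]() { done_barrier.Notify(); }) = a + b;
//   done_barrier.Wait();  // the done callback fires after evaluation finishes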

template <typename Expression, typename DoneCallback, bool Vectorizable>
class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
                          Vectorizable, /*Tileable*/ TiledEvaluation::On> {
 public:
  typedef typename traits<Expression>::Index IndexType;
  typedef typename traits<Expression>::Scalar Scalar;
  typedef std::remove_const_t<Scalar> ScalarNoConst;

  static constexpr int NumDims = traits<Expression>::NumDimensions;

  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
  typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;

  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
      TensorBlockScratch;

  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
                                           const ThreadPoolDevice& device,
                                           DoneCallback done) {
    TensorAsyncExecutorContext* const ctx =
        new TensorAsyncExecutorContext(expr, device, std::move(done));

    const auto on_eval_subexprs = [ctx](bool need_assign) -> void {
      if (!need_assign) {
        delete ctx;
        return;
      }

      ctx->tiling = internal::GetTensorExecutorTilingContext<
          Evaluator, BlockMapper, Vectorizable>(ctx->evaluator);

      auto eval_block = [ctx](IndexType firstBlockIdx, IndexType lastBlockIdx) {
        TensorBlockScratch scratch(ctx->device);

        for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
             ++block_idx) {
          TensorBlockDesc desc =
              ctx->tiling.block_mapper.blockDescriptor(block_idx);
          ctx->evaluator.evalBlock(desc, scratch);
          scratch.reset();
        }
      };

      // Evaluate small expressions directly as a single block.
      if (ctx->tiling.block_mapper.blockCount() == 1) {
        TensorBlockScratch scratch(ctx->device);
        TensorBlockDesc desc(0, ctx->tiling.block_mapper.blockDimensions());
        ctx->evaluator.evalBlock(desc, scratch);
        delete ctx;
      } else {
        ctx->device.parallelForAsync(ctx->tiling.block_mapper.blockCount(),
                                     ctx->tiling.cost, eval_block,
                                     [ctx]() { delete ctx; });
      }
    };

    ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
  }

 private:
  struct TensorAsyncExecutorContext {
    TensorAsyncExecutorContext(const Expression& expr,
                               const ThreadPoolDevice& thread_pool,
                               DoneCallback done)
        : device(thread_pool),
          evaluator(expr, thread_pool),
          on_done(std::move(done)) {}

    ~TensorAsyncExecutorContext() {
      evaluator.cleanup();
      on_done();
    }

    const ThreadPoolDevice& device;
    Evaluator evaluator;
    TilingContext tiling;

   private:
    DoneCallback on_done;
  };
};

#endif  // EIGEN_USE_THREADS

// GPU: the evaluation of the expression is offloaded to a GPU.
#if defined(EIGEN_USE_GPU)

template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index StorageIndex;
  static void run(const Expression& expr, const GpuDevice& device);
};

#if defined(EIGEN_GPUCC)
// Returns 1 if lhs + rhs would overflow, -1 if it would underflow, otherwise 0.
template <typename Index>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int sum_will_overflow(Index lhs,
                                                            Index rhs) {
  const Index highest = NumTraits<Index>::highest();
  const Index lowest = NumTraits<Index>::lowest();
  if (lhs > 0 && rhs > 0) {
    return lhs > highest - rhs ? 1 : 0;
  } else if (lhs < 0 && rhs < 0) {
    return lhs < lowest - rhs ? -1 : 0;
  } else {
    return 0;
  }
}

// Returns lhs + rhs, saturating to the highest/lowest representable value on
// overflow/underflow respectively.
template <typename Index>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index saturate_add(Index lhs, Index rhs) {
  const Index highest = NumTraits<Index>::highest();
  const Index lowest = NumTraits<Index>::lowest();
  int overflow = sum_will_overflow(lhs, rhs);
  return overflow == 1 ? highest : overflow == -1 ? lowest : lhs + rhs;
}

// A functor that adds step_size to a given index, saturating to avoid
// overflow/underflow. If overflow/underflow is not possible, regular addition
// is used (for efficiency).
template <typename Index>
struct SafeStep {
  // lastIdx is one past the end of the possible indexes.
  // step_size is the value that will be added to the given index when the
  // functor is called.
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SafeStep(Index lastIdx, Index step_size)
      : can_overflow_(sum_will_overflow(lastIdx, step_size)),
        step_size_(step_size) {}

  // Adds step_size to index, saturating on overflow (if overflow is possible).
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index operator()(Index index) const {
    return can_overflow_ ? saturate_add(index, step_size_) : index + step_size_;
  }

 private:
  const bool can_overflow_;
  const Index step_size_;
};
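
// Worked example (for illustration only): with Index = int,
// saturate_add(INT_MAX - 1, 4) returns INT_MAX instead of triggering signed
// overflow, so the grid-stride loops below simply stop once the index
// saturates at or beyond lastIdx.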

template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EigenMetaKernelEval {
  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void run(
      Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx,
      StorageIndex step_size) {
    SafeStep<StorageIndex> safe_step(lastIdx, step_size);
    for (StorageIndex i = firstIdx; i < lastIdx; i = safe_step(i)) {
      eval.evalScalar(i);
    }
  }
};

template <typename Evaluator, typename StorageIndex>
struct EigenMetaKernelEval<Evaluator, StorageIndex, true> {
  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void run(
      Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx,
      StorageIndex step_size) {
    const StorageIndex PacketSize =
        unpacket_traits<typename Evaluator::PacketReturnType>::size;
    const StorageIndex vectorized_size = (lastIdx / PacketSize) * PacketSize;
    const StorageIndex vectorized_step_size = step_size * PacketSize;

    SafeStep<StorageIndex> safe_vectorized_step(vectorized_size,
                                                vectorized_step_size);
    // Use the vector path.
    for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size;
         i = safe_vectorized_step(i)) {
      eval.evalPacket(i);
    }
    SafeStep<StorageIndex> safe_step(lastIdx, step_size);
    for (StorageIndex i = saturate_add(vectorized_size, firstIdx); i < lastIdx;
         i = safe_step(i)) {
      eval.evalScalar(i);
    }
  }
};

template <typename Evaluator, typename StorageIndex>
__global__ void __launch_bounds__(1024)
    EigenMetaKernel(Evaluator eval, StorageIndex size) {
  const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x;
  const StorageIndex step_size = blockDim.x * gridDim.x;

  const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
  EigenMetaKernelEval<Evaluator, StorageIndex, vectorizable>::run(
      eval, first_index, size, step_size);
}

/*static*/
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
EIGEN_STRONG_INLINE void
TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling>::run(
    const Expression& expr, const GpuDevice& device) {
  TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
  const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
  if (needs_assign) {
    const int block_size = device.maxGpuThreadsPerBlock();
    const int max_blocks =
        numext::mini<int64_t>(device.getNumGpuMultiProcessors() *
                                  device.maxGpuThreadsPerMultiProcessor(),
                              NumTraits<StorageIndex>::highest()) /
        block_size;
    const StorageIndex size = array_prod(evaluator.dimensions());
    // Create at least one block to ensure we won't crash when TensorFlow calls
    // with tensors of size 0.
    const int num_blocks = numext::maxi<int>(
        numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);

    LAUNCH_GPU_KERNEL(
        (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>),
        num_blocks, block_size, 0, device, evaluator, size);
  }
  evaluator.cleanup();
}
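
// Illustrative usage sketch (not part of the original header): with
// EIGEN_USE_GPU defined and the file compiled by a CUDA/HIP compiler, this
// path is reached via a GpuDevice, e.g.
//
//   Eigen::GpuStreamDevice stream;
//   Eigen::GpuDevice gpu_device(&stream);
//   c.device(gpu_device) = a + b;  // launches EigenMetaKernel on the stream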

#endif  // EIGEN_GPUCC
#endif  // EIGEN_USE_GPU

// SYCL Executor policy
#ifdef EIGEN_USE_SYCL

template <typename Evaluator>
struct ExecExprFunctorKernel {
  typedef typename Evaluator::Index Index;
  Evaluator evaluator;
  const Index range;
  template <typename Scratch>
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel(
      const Scratch, Evaluator evaluator_, const Index range_)
      : evaluator(evaluator_), range(range_) {}

  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()(
      cl::sycl::nd_item<1> itemID) {
    compute(itemID);
  }
  template <bool is_vec = Evaluator::PacketAccess>
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t<!is_vec> compute(
      const cl::sycl::nd_item<1>& itemID) {
    Index gId = static_cast<Index>(itemID.get_global_linear_id());
    Index total_threads = itemID.get_global_range(0);

    for (Index i = gId; i < range; i += total_threads) {
      evaluator.evalScalar(i);
    }
  }
  template <bool is_vec = Evaluator::PacketAccess>
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t<is_vec> compute(
      const cl::sycl::nd_item<1>& itemID) {
    const Index vectorizedRange =
        (range / Evaluator::PacketSize) * Evaluator::PacketSize;
    Index gId = static_cast<Index>(itemID.get_global_linear_id());
    const Index step = Evaluator::PacketSize * itemID.get_global_range(0);
    const Index start = Evaluator::PacketSize * gId;
    for (Index i = start; i < vectorizedRange; i += step) {
      evaluator.evalPacket(i);
    }
    gId += vectorizedRange;
    for (Index i = gId; i < range; i += itemID.get_global_range(0)) {
      evaluator.evalScalar(i);
    }
  }
};

template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index Index;
  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const Eigen::SyclDevice& dev) {
    typedef TensorEvaluator<Expression, Eigen::SyclDevice> Evaluator;
    Evaluator evaluator(expr, dev);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      Index range, GRange, tileSize;
      Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions());
      total_size = (total_size == 0) ? 1 : total_size;
      const int PacketSize =
          Eigen::PacketType<typename Evaluator::CoeffReturnType,
                            Eigen::SyclDevice>::size;
      Index vectorizable_threads = static_cast<Index>(total_size / PacketSize);
      dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange);
      range = total_size;

      dev.template nullary_kernel_launcher<
          typename Evaluator::CoeffReturnType,
          ExecExprFunctorKernel<Evaluator> >(
          evaluator,
          cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange),
                                cl::sycl::range<1>(tileSize)),
          Index(1), range);
    }
    evaluator.cleanup();
  }
};

#endif  // EIGEN_USE_SYCL

}  // end namespace internal

}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H