#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
#include "./InternalHeaderCheck.h"

namespace Eigen {
namespace internal {
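// Evaluating a TensorBroadcastingOp via the coefficient or packet path is
// expensive. If an expression contains at least one broadcast op and supports
// block evaluation, tiled evaluation is preferred even for small tensors.
// TODO(ezhulenev): Add specializations for all other types of Tensor ops.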
template <typename Expression>
struct ExpressionHasTensorBroadcastingOp {
  enum { value = false };
};
template <typename LhsXprType, typename RhsXprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorAssignOp<LhsXprType, RhsXprType> > {
  enum { value = ExpressionHasTensorBroadcastingOp<RhsXprType>::value };
};
template <typename UnaryOp, typename XprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorCwiseUnaryOp<UnaryOp, XprType> > {
  enum { value = ExpressionHasTensorBroadcastingOp<XprType>::value };
};
template <typename BinaryOp, typename LhsXprType, typename RhsXprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> > {
  enum {
    value = ExpressionHasTensorBroadcastingOp<LhsXprType>::value ||
            ExpressionHasTensorBroadcastingOp<RhsXprType>::value
  };
};
template <typename Broadcast, typename XprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorBroadcastingOp<Broadcast, XprType> > {
  enum { value = true };
};
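// -------------------------------------------------------------------------- //

/**
 * Default strategy: the expression is evaluated sequentially with a single cpu
 * thread, without vectorization and block evaluation.
 *
 * A rough usage sketch (the executor is normally invoked internally by tensor
 * assignment; the names below are purely illustrative):
 *
 *   Tensor<float, 2> dst(3, 3), src(3, 3);
 *   src.setRandom();
 *   auto expr = src.square();
 *   using Assign = TensorAssignOp<decltype(dst), const decltype(expr)>;
 *   internal::TensorExecutor<const Assign, DefaultDevice,
 *                            false, TiledEvaluation::Off>::run(
 *       Assign(dst, expr), DefaultDevice());
 */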
template <typename Expression, typename Device, bool Vectorizable,
          TiledEvaluation Tiling>
class TensorExecutor {
 public:
  typedef typename Expression::Index StorageIndex;

  // Including `unsupported/Eigen/CXX11/Tensor` in different translation units
  // with and without `EIGEN_USE_THREADS` or `EIGEN_USE_GPU` is a potential ODR
  // violation; instantiating the default executor with a non-default device
  // means the device-specific macro was not defined before including Eigen.
  static_assert(std::is_same<Device, DefaultDevice>::value,
                "Default executor instantiated with non-default device. "
                "You must #define EIGEN_USE_THREADS, EIGEN_USE_GPU or "
                "EIGEN_USE_SYCL before including Eigen headers.");

  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const Device& device = Device()) {
    TensorEvaluator<Expression, Device> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      const StorageIndex size = array_prod(evaluator.dimensions());
      for (StorageIndex i = 0; i < size; ++i) {
        evaluator.evalScalar(i);
      }
    }
    evaluator.cleanup();
  }
};
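/**
 * Default async execution strategy is not implemented: it is currently only
 * available for ThreadPoolDevice (see the specializations below).
 */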
template <typename Expression, typename Device, typename DoneCallback,
          bool Vectorizable, TiledEvaluation Tiling>
class TensorAsyncExecutor {};
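/**
 * Process all the data with a single cpu thread, using vectorized instructions.
 */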
template <typename Expression>
class TensorExecutor<Expression, DefaultDevice, /*Vectorizable=*/true,
                     /*Tiling=*/TiledEvaluation::Off> {
 public:
  typedef typename Expression::Index StorageIndex;

  static EIGEN_STRONG_INLINE void run(
      const Expression& expr, const DefaultDevice& device = DefaultDevice()) {
    TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      const StorageIndex size = array_prod(evaluator.dimensions());
      const int PacketSize =
          unpacket_traits<typename TensorEvaluator<
              Expression, DefaultDevice>::PacketReturnType>::size;

      // Give the compiler a strong chance to unroll the loop, without
      // insisting on it: if the functor is expensive, unrolling is not worth
      // losing inlining.
      const StorageIndex UnrolledSize =
          (size / (4 * PacketSize)) * 4 * PacketSize;
      for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) {
        for (StorageIndex j = 0; j < 4; j++) {
          evaluator.evalPacket(i + j * PacketSize);
        }
      }
      const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize;
      for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
        evaluator.evalPacket(i);
      }
      for (StorageIndex i = VectorizedSize; i < size; ++i) {
        evaluator.evalScalar(i);
      }
    }
    evaluator.cleanup();
  }
};
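/**
 * Process all the data with a single cpu thread, using blocks of data. By
 * sizing a block to fit in L1 cache we get better cache performance.
 */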
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, DefaultDevice, Vectorizable,
                     /*Tiling=*/TiledEvaluation::On> {
 public:
  typedef typename traits<Expression>::Scalar Scalar;
  typedef std::remove_const_t<Scalar> ScalarNoConst;

  typedef TensorEvaluator<Expression, DefaultDevice> Evaluator;
  typedef typename traits<Expression>::Index StorageIndex;

  static constexpr int NumDims = traits<Expression>::NumDimensions;

  static EIGEN_STRONG_INLINE void run(
      const Expression& expr, const DefaultDevice& device = DefaultDevice()) {
    typedef TensorBlockMapper<NumDims, Evaluator::Layout, StorageIndex>
        TensorBlockMapper;
    typedef internal::TensorBlockDescriptor<NumDims, StorageIndex>
        TensorBlockDesc;
    typedef internal::TensorBlockScratchAllocator<DefaultDevice>
        TensorBlockScratch;

    Evaluator evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);

    if (needs_assign) {
      // Query the expression tree for its preferred block size/shape.
      const TensorBlockResourceRequirements requirements =
          evaluator.getResourceRequirements();

      const TensorBlockMapper block_mapper(
          typename TensorBlockDesc::Dimensions(evaluator.dimensions()),
          requirements);

      // The scratch allocator is shared between all blocks.
      TensorBlockScratch scratch(device);

      const StorageIndex total_block_count = block_mapper.blockCount();
      for (StorageIndex i = 0; i < total_block_count; ++i) {
        TensorBlockDesc desc = block_mapper.blockDescriptor(i);
        evaluator.evalBlock(desc, scratch);
        scratch.reset();
      }
    }
    evaluator.cleanup();
  }
};
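/**
 * Multicore strategy: the index space is partitioned and each partition is
 * executed on a single core.
 *
 * (1) TensorExecutor submits work to the ThreadPoolDevice managed thread pool
 *     and blocks the caller thread until all tasks are finished.
 *
 * (2) TensorAsyncExecutor is a non-blocking version that submits work to the
 *     same thread pool, returns immediately, and calls the 'done' callback
 *     once all tasks are finished.
 */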
#ifdef EIGEN_USE_THREADS
template <typename TensorBlockMapper>
struct TensorExecutorTilingContext {
  TensorExecutorTilingContext() = default;
  TensorExecutorTilingContext(const TensorBlockMapper& b_mapper,
                              const TensorOpCost& b_cost, size_t b_aligned_size)
      : block_mapper(b_mapper),
        cost(b_cost),
        aligned_blocksize(b_aligned_size) {}

  TensorBlockMapper block_mapper;  // navigate through blocks
  TensorOpCost cost;               // cost of computing a single block
  size_t aligned_blocksize;        // block size after memory alignment
};
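// Computes block evaluation parameters and the temporary (aligned) buffer size
// needed for block-based evaluation. Used by the tiled TensorExecutor and
// TensorAsyncExecutor specializations below.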
template <typename Evaluator, typename TensorBlockMapper, bool Vectorizable>
TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext(
    const Evaluator& evaluator) {
  // Query the expression tree for its preferred block size/shape.
  TensorBlockResourceRequirements requirements =
      evaluator.getResourceRequirements();

  // Update the target block size based on the cost model.
  double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(
      1, requirements.cost_per_coeff);
  requirements.size = static_cast<size_t>(1.0 / taskSize);

  TensorBlockMapper block_mapper(
      typename TensorBlockMapper::Dimensions(evaluator.dimensions()),
      requirements);

  size_t block_size = block_mapper.blockTotalSize();
  const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
  const size_t aligned_blocksize =
      align *
      divup<size_t>(block_size * sizeof(typename Evaluator::Scalar), align);

  return {block_mapper, requirements.cost_per_coeff * block_size,
          aligned_blocksize};
}
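// Evaluates a contiguous range [firstIdx, lastIdx) of coefficients; this
// generic version goes through the scalar path.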
template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EvalRange {
  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
                  const StorageIndex lastIdx) {
    Evaluator evaluator = *evaluator_in;
    eigen_assert(lastIdx >= firstIdx);
    for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
      evaluator.evalScalar(i);
    }
  }

  static StorageIndex alignBlockSize(StorageIndex size) { return size; }
};
template <typename Evaluator, typename StorageIndex>
struct EvalRange<Evaluator, StorageIndex, /*Vectorizable=*/true> {
  static constexpr int PacketSize =
      unpacket_traits<typename Evaluator::PacketReturnType>::size;

  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
                  const StorageIndex lastIdx) {
    Evaluator evaluator = *evaluator_in;
    eigen_assert(lastIdx >= firstIdx);
    StorageIndex i = firstIdx;
    if (lastIdx - firstIdx >= PacketSize) {
      eigen_assert(firstIdx % PacketSize == 0);
      StorageIndex last_chunk_offset = lastIdx - 4 * PacketSize;
      // Give the compiler a strong chance to unroll the loop, without
      // insisting on it.
      for (; i <= last_chunk_offset; i += 4 * PacketSize) {
        for (StorageIndex j = 0; j < 4; j++) {
          evaluator.evalPacket(i + j * PacketSize);
        }
      }
      last_chunk_offset = lastIdx - PacketSize;
      for (; i <= last_chunk_offset; i += PacketSize) {
        evaluator.evalPacket(i);
      }
    }
    for (; i < lastIdx; ++i) {
      evaluator.evalScalar(i);
    }
  }

  static StorageIndex alignBlockSize(StorageIndex size) {
    // Align the block on packet boundaries; prefer 4 * PacketSize for large
    // blocks to match the unrolled loop above.
    if (size >= 16 * PacketSize) {
      return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
    }
    // Aligning to 4 * PacketSize would increase the block size by more than 25%.
    return (size + PacketSize - 1) & ~(PacketSize - 1);
  }
};
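/**
 * Multicore (blocking) strategy: the index space is partitioned with
 * parallelFor and each partition is evaluated by EvalRange on a single core.
 */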
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index StorageIndex;

  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const ThreadPoolDevice& device) {
    typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
    typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;

    Evaluator evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
    if (needs_assign) {
      const StorageIndex size = array_prod(evaluator.dimensions());
      device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
                         EvalRange::alignBlockSize,
                         [&evaluator](StorageIndex firstIdx, StorageIndex lastIdx) {
                           EvalRange::run(&evaluator, firstIdx, lastIdx);
                         });
    }
    evaluator.cleanup();
  }
};
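/**
 * Multicore tiled strategy: the block space is partitioned with parallelFor
 * and each block is evaluated on a single core with its own scratch allocator.
 */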
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
                     /*Tiling=*/TiledEvaluation::On> {
 public:
  typedef typename traits<Expression>::Index IndexType;
  typedef typename traits<Expression>::Scalar Scalar;
  typedef std::remove_const_t<Scalar> ScalarNoConst;

  static constexpr int NumDims = traits<Expression>::NumDimensions;

  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
  typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;

  typedef internal::TensorBlockDescriptor<NumDims, IndexType>
      TensorBlockDesc;
  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
      TensorBlockScratch;

  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const ThreadPoolDevice& device) {
    Evaluator evaluator(expr, device);

    const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
    if (needs_assign) {
      const TilingContext tiling =
          internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper,
                                                   Vectorizable>(evaluator);

      auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx,
                                                       IndexType lastBlockIdx) {
        TensorBlockScratch scratch(device);

        for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
             ++block_idx) {
          TensorBlockDesc desc = tiling.block_mapper.blockDescriptor(block_idx);
          evaluator.evalBlock(desc, scratch);
          scratch.reset();
        }
      };

      // Evaluate small expressions directly as a single block.
      if (tiling.block_mapper.blockCount() == 1) {
        TensorBlockScratch scratch(device);
        TensorBlockDesc desc(0, tiling.block_mapper.blockDimensions());
        evaluator.evalBlock(desc, scratch);
      } else {
        device.parallelFor(tiling.block_mapper.blockCount(), tiling.cost,
                           eval_block);
      }
    }
    evaluator.cleanup();
  }
};
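/**
 * Non-blocking (asynchronous) multicore strategy: work is submitted to the
 * ThreadPoolDevice thread pool and the heap-allocated context deletes itself
 * (invoking the 'done' callback) when the last task completes.
 */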
template <typename Expression, typename DoneCallback, bool Vectorizable,
          TiledEvaluation Tiling>
class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
                          Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index StorageIndex;
  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;

  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
                                           const ThreadPoolDevice& device,
                                           DoneCallback done) {
    TensorAsyncExecutorContext* const ctx =
        new TensorAsyncExecutorContext(expr, device, std::move(done));

    const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void {
      if (!need_assign) {
        delete ctx;
        return;
      }

      typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
      const StorageIndex size = array_prod(ctx->evaluator.dimensions());
      device.parallelForAsync(
          size, ctx->evaluator.costPerCoeff(Vectorizable),
          EvalRange::alignBlockSize,
          [ctx](StorageIndex firstIdx, StorageIndex lastIdx) {
            EvalRange::run(&ctx->evaluator, firstIdx, lastIdx);
          },
          [ctx]() { delete ctx; });
    };

    ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
  }

 private:
  struct TensorAsyncExecutorContext {
    TensorAsyncExecutorContext(const Expression& expr,
                               const ThreadPoolDevice& thread_pool,
                               DoneCallback done)
        : evaluator(expr, thread_pool), on_done(std::move(done)) {}

    ~TensorAsyncExecutorContext() {
      evaluator.cleanup();
      on_done();
    }

    Evaluator evaluator;

   private:
    DoneCallback on_done;
  };
};
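/**
 * Non-blocking tiled strategy: same as above, but the expression is evaluated
 * block by block, using the tiling context computed on a worker thread.
 */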
template <typename Expression, typename DoneCallback, bool Vectorizable>
class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
                          Vectorizable, /*Tiling=*/TiledEvaluation::On> {
 public:
  typedef typename traits<Expression>::Index IndexType;
  typedef typename traits<Expression>::Scalar Scalar;
  typedef std::remove_const_t<Scalar> ScalarNoConst;

  static constexpr int NumDims = traits<Expression>::NumDimensions;

  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
  typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;

  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
      TensorBlockScratch;

  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
                                           const ThreadPoolDevice& device,
                                           DoneCallback done) {
    TensorAsyncExecutorContext* const ctx =
        new TensorAsyncExecutorContext(expr, device, std::move(done));

    const auto on_eval_subexprs = [ctx](bool need_assign) -> void {
      if (!need_assign) {
        delete ctx;
        return;
      }

      ctx->tiling = internal::GetTensorExecutorTilingContext<
          Evaluator, BlockMapper, Vectorizable>(ctx->evaluator);

      auto eval_block = [ctx](IndexType firstBlockIdx, IndexType lastBlockIdx) {
        TensorBlockScratch scratch(ctx->device);

        for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
             ++block_idx) {
          TensorBlockDesc desc =
              ctx->tiling.block_mapper.blockDescriptor(block_idx);
          ctx->evaluator.evalBlock(desc, scratch);
          scratch.reset();
        }
      };

      // Evaluate small expressions directly as a single block.
      if (ctx->tiling.block_mapper.blockCount() == 1) {
        TensorBlockScratch scratch(ctx->device);
        TensorBlockDesc desc(0, ctx->tiling.block_mapper.blockDimensions());
        ctx->evaluator.evalBlock(desc, scratch);
        delete ctx;
      } else {
        ctx->device.parallelForAsync(ctx->tiling.block_mapper.blockCount(),
                                     ctx->tiling.cost, eval_block,
                                     [ctx]() { delete ctx; });
      }
    };

    ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
  }

 private:
  struct TensorAsyncExecutorContext {
    TensorAsyncExecutorContext(const Expression& expr,
                               const ThreadPoolDevice& thread_pool,
                               DoneCallback done)
        : device(thread_pool),
          evaluator(expr, thread_pool),
          on_done(std::move(done)) {}

    ~TensorAsyncExecutorContext() {
      evaluator.cleanup();
      on_done();
    }

    const ThreadPoolDevice& device;
    Evaluator evaluator;
    TilingContext tiling;

   private:
    DoneCallback on_done;
  };
};

#endif  // EIGEN_USE_THREADS
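// GPU: the evaluation of the expression is offloaded to a GPU.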
#if defined(EIGEN_USE_GPU)
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index StorageIndex;
  static void run(const Expression& expr, const GpuDevice& device);
};
#if defined(EIGEN_GPUCC)
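// Returns 1 if lhs + rhs would overflow, -1 if it would underflow, otherwise 0.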
template <typename Index>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int sum_will_overflow(Index lhs,
                                                            Index rhs) {
  const Index highest = NumTraits<Index>::highest();
  const Index lowest = NumTraits<Index>::lowest();
  if (lhs > 0 && rhs > 0) {
    return lhs > highest - rhs ? 1 : 0;
  } else if (lhs < 0 && rhs < 0) {
    return lhs < lowest - rhs ? -1 : 0;
  } else {
    return 0;
  }
}
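// Returns lhs + rhs, saturating to the highest/lowest representable value on
// overflow/underflow respectively.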
template <typename Index>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index saturate_add(Index lhs, Index rhs) {
  const Index highest = NumTraits<Index>::highest();
  const Index lowest = NumTraits<Index>::lowest();
  int overflow = sum_will_overflow(lhs, rhs);
  return overflow == 1 ? highest : overflow == -1 ? lowest : lhs + rhs;
}
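// A functor that adds step_size to a given index, saturating to avoid overflow.
// When overflow is not possible, the saturation branch is skipped entirely.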
template <typename Index>
struct SafeStep {
  // lastIdx is one past the end of the valid indexes; step_size is added to
  // the given index each time the functor is called.
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SafeStep(Index lastIdx, Index step_size)
      : can_overflow_(sum_will_overflow(lastIdx, step_size)),
        step_size_(step_size) {}

  // Adds step_size to index, saturating on overflow (when overflow is possible).
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index operator()(Index index) const {
    return can_overflow_ ? saturate_add(index, step_size_) : index + step_size_;
  }

 private:
  const bool can_overflow_;
  const Index step_size_;
};
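// Evaluates the expression on the GPU coefficient by coefficient (generic,
// non-vectorized path).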
template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EigenMetaKernelEval {
  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
  void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) {
    SafeStep<StorageIndex> safe_step(lastIdx, step_size);
    for (StorageIndex i = firstIdx; i < lastIdx; i = safe_step(i)) {
      eval.evalScalar(i);
    }
  }
};
template <typename Evaluator, typename StorageIndex>
struct EigenMetaKernelEval<Evaluator, StorageIndex, true> {
  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
  void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) {
    const StorageIndex PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
    const StorageIndex vectorized_size = (lastIdx / PacketSize) * PacketSize;
    const StorageIndex vectorized_step_size = step_size * PacketSize;

    // Use the packet (vector) path for the bulk of the range.
    SafeStep<StorageIndex> safe_vectorized_step(vectorized_size,
                                                vectorized_step_size);
    for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size;
         i = safe_vectorized_step(i)) {
      eval.evalPacket(i);
    }
    // Evaluate the remaining coefficients one by one.
    SafeStep<StorageIndex> safe_step(lastIdx, step_size);
    for (StorageIndex i = saturate_add(vectorized_size, firstIdx); i < lastIdx;
         i = safe_step(i)) {
      eval.evalScalar(i);
    }
  }
};
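// The GPU kernel: each thread strides over the index space with a grid-stride
// loop, dispatching to the packet or scalar evaluation path above.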
template <typename Evaluator, typename StorageIndex>
__global__ void
__launch_bounds__(1024)
EigenMetaKernel(Evaluator eval, StorageIndex size) {

  const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x;
  const StorageIndex step_size = blockDim.x * gridDim.x;

  const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
  EigenMetaKernelEval<Evaluator, StorageIndex, vectorizable>::run(eval, first_index, size, step_size);
}
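// Host-side launch of EigenMetaKernel for the GpuDevice executor declared above.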
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling>::run(
    const Expression& expr, const GpuDevice& device) {
  TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
  const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
  if (needs_assign) {
    const int block_size = device.maxGpuThreadsPerBlock();
    const int max_blocks =
        numext::mini<int64_t>(device.getNumGpuMultiProcessors() *
                                  device.maxGpuThreadsPerMultiProcessor(),
                              NumTraits<StorageIndex>::highest()) /
        block_size;
    const StorageIndex size = array_prod(evaluator.dimensions());
    // Launch at least one block so that tensors of size 0 do not crash.
    const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);

    LAUNCH_GPU_KERNEL(
        (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>),
        num_blocks, block_size, 0, device, evaluator, size);
  }
  evaluator.cleanup();
}

#endif  // EIGEN_GPUCC
#endif  // EIGEN_USE_GPU
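// SYCL: the evaluation of the expression is offloaded to a SYCL device.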
#ifdef EIGEN_USE_SYCL
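// Kernel functor evaluating the expression on a SYCL device, with separate
// packet and scalar paths selected at compile time via PacketAccess.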
template <typename Evaluator>
struct ExecExprFunctorKernel {
  typedef typename Evaluator::Index Index;
  Evaluator evaluator;
  const Index range;
  template <typename Scratch>
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel(
      const Scratch, Evaluator evaluator_, const Index range_)
      : evaluator(evaluator_), range(range_) {}

  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()(
      cl::sycl::nd_item<1> itemID) {
    compute(itemID);
  }

  // Scalar path: each work-item strides over the range coefficient by coefficient.
  template <bool is_vec = Evaluator::PacketAccess>
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t<!is_vec>
  compute(const cl::sycl::nd_item<1>& itemID) {
    Index gId = static_cast<Index>(itemID.get_global_linear_id());
    Index total_threads = itemID.get_global_range(0);

    for (Index i = gId; i < range; i += total_threads) {
      evaluator.evalScalar(i);
    }
  }

  // Packet path: evaluate full packets first, then the scalar tail.
  template <bool is_vec = Evaluator::PacketAccess>
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t<is_vec>
  compute(const cl::sycl::nd_item<1>& itemID) {
    const Index vectorizedRange =
        (range / Evaluator::PacketSize) * Evaluator::PacketSize;
    Index gId = static_cast<Index>(itemID.get_global_linear_id());
    const Index step = Evaluator::PacketSize * itemID.get_global_range(0);
    const Index start = Evaluator::PacketSize * gId;
    for (Index i = start; i < vectorizedRange; i += step) {
      evaluator.evalPacket(i);
    }
    gId += vectorizedRange;
    for (Index i = gId; i < range; i += itemID.get_global_range(0)) {
      evaluator.evalScalar(i);
    }
  }
};
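// SYCL executor: sets up the work-group geometry and launches
// ExecExprFunctorKernel over the flattened index space.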
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index Index;
  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const Eigen::SyclDevice& dev) {
    typedef Eigen::TensorEvaluator<Expression, Eigen::SyclDevice> Evaluator;
    Evaluator evaluator(expr, dev);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      Index range, GRange, tileSize;
      Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions());
      total_size = (total_size == 0) ? 1 : total_size;
      const int PacketSize =
          Eigen::PacketType<typename Evaluator::CoeffReturnType,
                            Eigen::SyclDevice>::size;
      Index vectorizable_threads = static_cast<Index>(total_size / PacketSize);
      dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange);
      range = total_size;

      dev.template nullary_kernel_launcher<
             typename Evaluator::CoeffReturnType,
             ExecExprFunctorKernel<Evaluator> >(
             evaluator,
             cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange),
                                   cl::sycl::range<1>(tileSize)),
             Index(1), range);
    }
    evaluator.cleanup();
  }
};

#endif  // EIGEN_USE_SYCL

}  // end namespace internal
}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H