#ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
#define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H

#include "./InternalHeaderCheck.h"

namespace Eigen {
namespace internal {

// Forward declaration for templates defined below.
template <typename Scalar, typename IndexType, int NumDims, int Layout>
class TensorBlockIO;

// Helper function to compute strides for a densely stored buffer of given
// dimensions.
template <int Layout, typename IndexType, int NumDims>
EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
    const DSizes<IndexType, NumDims>& dimensions) {
  DSizes<IndexType, NumDims> strides;
  if (NumDims == 0) return strides;

  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
    strides[0] = 1;
    for (int i = 1; i < NumDims; ++i) {
      strides[i] = strides[i - 1] * dimensions[i - 1];
    }
  } else {
    strides[NumDims - 1] = 1;
    for (int i = NumDims - 2; i >= 0; --i) {
      strides[i] = strides[i + 1] * dimensions[i + 1];
    }
  }

  return strides;
}

template <int Layout, typename IndexType, size_t NumDims>
EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
    const Eigen::array<IndexType, NumDims>& dimensions) {
  return strides<Layout>(DSizes<IndexType, NumDims>(dimensions));
}

template <int Layout, std::ptrdiff_t... Indices>
EIGEN_STRONG_INLINE DSizes<std::ptrdiff_t, sizeof...(Indices)> strides(
    const Sizes<Indices...>& sizes) {
  return strides<Layout>(DSizes<std::ptrdiff_t, sizeof...(Indices)>(sizes));
}
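// Illustrative usage sketch (not part of Eigen): computing column-major
// strides of a 4x5x6 shape with the helper above. The names `dims` and `s`
// are assumptions for the example only.
//
//   Eigen::DSizes<Eigen::Index, 3> dims(4, 5, 6);
//   Eigen::DSizes<Eigen::Index, 3> s =
//       Eigen::internal::strides<Eigen::ColMajor>(dims);
//   // s == {1, 4, 20}: the stride of a dimension is the product of all the
//   // dimensions that are "inner" to it in the chosen layout.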
// Tensor block shape type defines the shape preference for the blocks
// extracted from the larger tensor.
//
// Example: blocks of 100 elements from a 100x100 tensor:
//  - kUniformAllDims: 100 blocks of size 10x10
//  - kSkewedInnerDims: 100 blocks of size 100x1 (or 1x100 depending on a
//    column or row major layout)
enum class TensorBlockShapeType { kUniformAllDims, kSkewedInnerDims };

struct TensorBlockResourceRequirements {
  TensorBlockShapeType shape_type;  // target block shape
  size_t size;                      // target block size
  TensorOpCost cost_per_coeff;      // cost of computing a single block element

#ifdef EIGEN_HIPCC
  // For HIPCC we need to explicitly declare as a "device fun" the constructor
  // which is implicitly invoked in the "merge" / "any" static functions below.
  EIGEN_DEVICE_FUNC
  TensorBlockResourceRequirements(TensorBlockShapeType shape_type_,
                                  size_t size_, TensorOpCost cost_)
      : shape_type(shape_type_), size(size_), cost_per_coeff(cost_) {}
#endif

  template <typename Scalar>
  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(
      TensorBlockShapeType shape_type, size_t size_in_bytes,
      TensorOpCost cost) {
    const size_t size = numext::maxi(size_t(1), size_in_bytes / sizeof(Scalar));
    return {shape_type, size, cost};
  }

  template <typename Scalar>
  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(
      TensorBlockShapeType shape_type, size_t size_in_bytes) {
    // This default cost per coefficient is valid for most materialized tensor
    // block evaluation implementations: they typically just read coefficients
    // from the underlying tensor storage and write them to the tensor block
    // buffer with a linear access pattern.
    return withShapeAndSize<Scalar>(shape_type, size_in_bytes,
                                    {/*bytes_loaded=*/sizeof(Scalar),
                                     /*bytes_stored=*/sizeof(Scalar),
                                     /*compute_cycles=*/0});
  }

  template <typename Scalar>
  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements skewed(
      size_t size_in_bytes) {
    return withShapeAndSize<Scalar>(TensorBlockShapeType::kSkewedInnerDims,
                                    size_in_bytes);
  }

  template <typename Scalar>
  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements uniform(
      size_t size_in_bytes) {
    return withShapeAndSize<Scalar>(TensorBlockShapeType::kUniformAllDims,
                                    size_in_bytes);
  }

  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE TensorBlockResourceRequirements merge(
      const TensorBlockResourceRequirements& lhs,
      const TensorBlockResourceRequirements& rhs) {
    return {merge(lhs.shape_type, rhs.shape_type),           // shape_type
            merge(lhs.size, rhs.size),                       // size
            merge(lhs.cost_per_coeff, rhs.cost_per_coeff)};  // cost_per_coeff
  }

  EIGEN_DEVICE_FUNC TensorBlockResourceRequirements& addCostPerCoeff(
      TensorOpCost cost) {
    cost_per_coeff += cost;
    return *this;
  }

  // Resource requirements returned from expressions that do not have any block
  // evaluation preference (e.g. default tensor expression with raw buffer
  // access).
  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE TensorBlockResourceRequirements any() {
    return {TensorBlockShapeType::kUniformAllDims, 1, {0, 0, 0}};
  }

 private:
  using Requirements = TensorBlockResourceRequirements;

  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE size_t merge(size_t lhs_size, size_t rhs_size) {
    return numext::maxi(lhs_size, rhs_size);
  }

  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE TensorBlockShapeType merge(
      TensorBlockShapeType lhs, TensorBlockShapeType rhs) {
    return (lhs == TensorBlockShapeType::kSkewedInnerDims ||
            rhs == TensorBlockShapeType::kSkewedInnerDims)
               ? TensorBlockShapeType::kSkewedInnerDims
               : TensorBlockShapeType::kUniformAllDims;
  }

  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE TensorOpCost merge(TensorOpCost lhs_cost,
                                                TensorOpCost rhs_cost) {
    return lhs_cost + rhs_cost;
  }
};
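// Illustrative usage sketch (not part of Eigen): requesting inner-dim skewed
// float blocks of roughly 64 KB and merging them with the requirements of
// another sub-expression. `rhs_req` is an assumed name for the example only.
//
//   using Req = Eigen::internal::TensorBlockResourceRequirements;
//   Req lhs_req = Req::skewed<float>(/*size_in_bytes=*/64 * 1024);
//   Req merged  = Req::merge(lhs_req, rhs_req);
//   // merged.size is the max of both sizes; a skewed shape preference wins.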
// TensorBlockDescriptor specifies a block offset within a tensor and the block
// sizes along each of the tensor dimensions.
template <int NumDims, typename IndexType = Eigen::Index>
class TensorBlockDescriptor {
 public:
  typedef DSizes<IndexType, NumDims> Dimensions;

  // If the expression on the left of a Tensor assignment already has a memory
  // buffer, we might evaluate the root expression directly into the final
  // output memory. The pointer type of the underlying storage is erased,
  // because passing the Scalar type through all the expression evaluation
  // layers would require too many templates.
  class DestinationBuffer {
   public:
    enum DestinationBufferKind : int {
      // Destination buffer is not defined (`m_data` == NULL).
      kEmpty,

      // The tensor block defined by the owning descriptor fits contiguously
      // into the destination buffer.
      kContiguous,

      // Destination buffer strides do not match the strides of the
      // contiguously stored block.
      kStrided
    };

    template <typename Scalar>
    Scalar* data() const {
      eigen_assert(m_data_type_size == sizeof(Scalar));
      return static_cast<Scalar*>(m_data);
    }

    const Dimensions& strides() const { return m_strides; }
    const DestinationBufferKind& kind() const { return m_kind; }

   private:
    friend class TensorBlockDescriptor<NumDims, IndexType>;

    DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {}

    template <typename Scalar>
    DestinationBuffer(Scalar* data, const Dimensions& strides,
                      DestinationBufferKind kind)
        : m_data(static_cast<void*>(data)),
          m_data_type_size(sizeof(Scalar)),
          m_strides(strides),
          m_kind(kind) {}

    template <int Layout, typename Scalar>
    static DestinationBuffer make(const TensorBlockDescriptor& desc,
                                  Scalar* data, const Dimensions& strides) {
      return DestinationBuffer(data, strides, kind<Layout>(desc, strides));
    }

    template <int Layout>
    static DestinationBufferKind kind(const TensorBlockDescriptor& desc,
                                      const Dimensions& strides) {
      const Dimensions& desc_dims = desc.dimensions();
      const Dimensions& desc_strides = internal::strides<Layout>(desc_dims);
      for (int i = 0; i < NumDims; ++i) {
        if (desc_dims[i] == 1) continue;
        if (desc_strides[i] != strides[i]) return kStrided;
      }
      return kContiguous;
    }

    // Storage pointer is type erased to reduce template bloat, but we still
    // keep the size of the underlying element type for error checking.
    void* m_data;
    size_t m_data_type_size;

    // Destination buffer dimensions always match the dimensions of the tensor
    // block descriptor it belongs to, however strides might be different.
    Dimensions m_strides;

    DestinationBufferKind m_kind;
  };

  TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions,
                        const DestinationBuffer& destination)
      : m_offset(offset),
        m_dimensions(dimensions),
        m_destination(destination) {}

  TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions)
      : m_offset(offset),
        m_dimensions(dimensions),
        m_destination(DestinationBuffer()) {}

  IndexType offset() const { return m_offset; }
  const Dimensions& dimensions() const { return m_dimensions; }
  IndexType dimension(int index) const { return m_dimensions[index]; }
  IndexType size() const { return array_prod<IndexType>(m_dimensions); }

  const DestinationBuffer& destination() const { return m_destination; }

  template <int Layout, typename Scalar>
  void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides) {
    eigen_assert(dst_base != NULL);
    m_destination =
        DestinationBuffer::template make<Layout>(*this, dst_base, dst_strides);
  }

  template <int Layout, typename Scalar, typename DstStridesIndexType>
  void AddDestinationBuffer(
      Scalar* dst_base,
      const DSizes<DstStridesIndexType, NumDims>& dst_strides) {
    // DSizes constructor will do index type promotion if it's safe.
    AddDestinationBuffer<Layout>(dst_base, Dimensions(dst_strides));
  }

  TensorBlockDescriptor& DropDestinationBuffer() {
    m_destination.m_data = NULL;
    m_destination.m_kind = DestinationBuffer::kEmpty;
    return *this;
  }

  bool HasDestinationBuffer() const {
    return m_destination.kind() != DestinationBuffer::kEmpty;
  }

  // Returns a copy of `*this` with an updated offset.
  TensorBlockDescriptor WithOffset(IndexType offset) const {
    return TensorBlockDescriptor(offset, m_dimensions, m_destination);
  }

 private:
  // Offset and dimensions are immutable after construction. A block descriptor
  // can only be mutated by adding or dropping the destination buffer.
  const IndexType m_offset;
  const Dimensions m_dimensions;
  DestinationBuffer m_destination;
};
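// Illustrative usage sketch (not part of Eigen): describing a 2x3 block that
// starts at linear offset 10, then attaching a destination buffer so the block
// can be materialized directly into it. `out` (a float*) is an assumed name
// for the example only.
//
//   Eigen::internal::TensorBlockDescriptor<2> desc(
//       /*offset=*/10, Eigen::DSizes<Eigen::Index, 2>(2, 3));
//   desc.AddDestinationBuffer<Eigen::ColMajor>(
//       out, Eigen::internal::strides<Eigen::ColMajor>(desc.dimensions()));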
// TensorBlockMapper is responsible for iterating over the blocks of a tensor.
template <int NumDims, int Layout, typename IndexType = Eigen::Index>
class TensorBlockMapper {
  typedef TensorBlockDescriptor<NumDims, IndexType> BlockDescriptor;

 public:
  typedef DSizes<IndexType, NumDims> Dimensions;

  TensorBlockMapper() = default;
  TensorBlockMapper(const DSizes<IndexType, NumDims>& dimensions,
                    const TensorBlockResourceRequirements& requirements)
      : m_tensor_dimensions(dimensions), m_requirements(requirements) {
    // Compute block dimensions and the total number of blocks.
    InitializeBlockDimensions();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockCount() const {
    return m_total_block_count;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockTotalSize() const {
    return m_block_dimensions.TotalSize();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes<IndexType, NumDims>&
  blockDimensions() const {
    return m_block_dimensions;
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockDescriptor
  blockDescriptor(IndexType block_index) const {
    static const bool isColMajor = Layout == static_cast<int>(ColMajor);

    IndexType offset = 0;
    DSizes<IndexType, NumDims> dimensions;

    if (NumDims == 0) return BlockDescriptor(offset, dimensions);

    // Iterate outer -> inner dimensions.
    for (int i = NumDims - 1; i >= 0; --i) {
      const int dim = isColMajor ? i : NumDims - i - 1;

      const IndexType idx = block_index / m_block_strides[dim];
      block_index -= idx * m_block_strides[dim];

      const IndexType coord = idx * m_block_dimensions[dim];
      dimensions[dim] = numext::mini(m_tensor_dimensions[dim] - coord,
                                     m_block_dimensions[dim]);
      offset += coord * m_tensor_strides[dim];
    }

    return {offset, dimensions};
  }

 private:
  void InitializeBlockDimensions() {
    // Requested block shape and size.
    const TensorBlockShapeType shape_type = m_requirements.shape_type;
    IndexType target_block_size =
        numext::maxi<IndexType>(1, static_cast<IndexType>(m_requirements.size));

    IndexType tensor_size = m_tensor_dimensions.TotalSize();

    // Corner case: one of the dimensions is zero. The logic below is too
    // complex to handle this case on a general basis, just use unit block size.
    if (tensor_size == 0) {
      for (int i = 0; i < NumDims; ++i) {
        m_block_dimensions[i] = 1;
      }
      m_total_block_count = 0;
      return;
    }

    // If the tensor fits into the target block size, evaluate it as a single
    // block.
    if (tensor_size <= target_block_size) {
      m_block_dimensions = m_tensor_dimensions;
      m_total_block_count = 1;
      // The only valid block index is `0`, so we do not need real strides for
      // the tensor or the blocks (see blockDescriptor).
      for (int i = 0; i < NumDims; ++i) {
        m_tensor_strides[i] = 0;
        m_block_strides[i] = 1;
      }
      return;
    }

    static const bool isColMajor = Layout == static_cast<int>(ColMajor);

    // Block shape skewed towards the inner dimension.
    if (shape_type == TensorBlockShapeType::kSkewedInnerDims) {
      IndexType coeff_to_allocate = target_block_size;

      for (int i = 0; i < NumDims; ++i) {
        const int dim = isColMajor ? i : NumDims - i - 1;
        m_block_dimensions[dim] =
            numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]);
        coeff_to_allocate = divup(
            coeff_to_allocate,
            numext::maxi(static_cast<IndexType>(1), m_block_dimensions[dim]));
      }
      eigen_assert(coeff_to_allocate == 1);

    } else if (shape_type == TensorBlockShapeType::kUniformAllDims) {
      // The tensor does not fit within the 'target_block_size' budget:
      // calculate block dimension sizes based on a "square" dimension target.
      const IndexType dim_size_target = convert_index<IndexType>(
          std::pow(static_cast<float>(target_block_size),
                   1.0f / static_cast<float>(m_block_dimensions.rank())));

      for (int i = 0; i < NumDims; ++i) {
        m_block_dimensions[i] =
            numext::mini(dim_size_target, m_tensor_dimensions[i]);
      }

      // Add any un-allocated coefficients to inner dimension(s).
      IndexType total_size = m_block_dimensions.TotalSize();
      for (int i = 0; i < NumDims; ++i) {
        const int dim = isColMajor ? i : NumDims - i - 1;

        if (m_block_dimensions[dim] < m_tensor_dimensions[dim]) {
          const IndexType total_size_other_dims =
              total_size / m_block_dimensions[dim];
          const IndexType alloc_avail =
              divup<IndexType>(target_block_size, total_size_other_dims);
          if (alloc_avail == m_block_dimensions[dim]) {
            // Insufficient excess coefficients to allocate.
            break;
          }
          m_block_dimensions[dim] =
              numext::mini(m_tensor_dimensions[dim], alloc_avail);
          total_size = total_size_other_dims * m_block_dimensions[dim];
        }
      }

    } else {
      eigen_assert(false);  // unknown block shape
    }

    eigen_assert(m_block_dimensions.TotalSize() >=
                 numext::mini<IndexType>(target_block_size,
                                         m_tensor_dimensions.TotalSize()));

    // Calculate block counts by dimension and the total block count.
    DSizes<IndexType, NumDims> block_count;
    for (int i = 0; i < NumDims; ++i) {
      block_count[i] = divup(m_tensor_dimensions[i], m_block_dimensions[i]);
    }
    m_total_block_count = array_prod(block_count);

    // Calculate block strides (used for enumerating blocks).
    m_tensor_strides = strides<Layout>(m_tensor_dimensions);
    m_block_strides = strides<Layout>(block_count);
  }

  DSizes<IndexType, NumDims> m_tensor_dimensions;
  TensorBlockResourceRequirements m_requirements;

  DSizes<IndexType, NumDims> m_block_dimensions;
  IndexType m_total_block_count;

  DSizes<IndexType, NumDims> m_tensor_strides;
  DSizes<IndexType, NumDims> m_block_strides;
};
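// Illustrative usage sketch (not part of Eigen): splitting a 100x100 float
// tensor into roughly 10x10 blocks and walking over all of them.
//
//   using Mapper = Eigen::internal::TensorBlockMapper<2, Eigen::ColMajor>;
//   Mapper mapper(
//       Eigen::DSizes<Eigen::Index, 2>(100, 100),
//       Eigen::internal::TensorBlockResourceRequirements::uniform<float>(
//           /*size_in_bytes=*/100 * sizeof(float)));
//   for (Eigen::Index i = 0; i < mapper.blockCount(); ++i) {
//     auto desc = mapper.blockDescriptor(i);  // offset + per-block dimensions
//   }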
// TensorBlockScratchAllocator is responsible for allocating temporary buffers
// for block evaluation. Because expression traversal order is deterministic,
// all temporary allocations happen in the same order for every block, so the
// buffers allocated for the first block can be reused for all later blocks.
template <typename Device>
class TensorBlockScratchAllocator {
 public:
  explicit TensorBlockScratchAllocator(const Device& device)
      : m_device(device), m_allocation_index(0) {}

  ~TensorBlockScratchAllocator() {
    for (size_t i = 0; i < m_allocations.size(); ++i) {
      m_device.deallocate(m_allocations[i].ptr);
    }
  }

  void* allocate(size_t size) {
    // TODO(ezhulenev): Remove when replaced with inlined vector.
    if (m_allocations.capacity() == 0) m_allocations.reserve(8);

    // Check if we already have an allocation at the current index.
    const int num_allocations = static_cast<int>(m_allocations.size());
    const bool has_allocation = m_allocation_index < num_allocations;

    // The allocation index can't be larger than the number of allocations.
    eigen_assert(m_allocation_index <= num_allocations);

    // If the current allocation can't fit the requested size, deallocate it
    // and replace it with a larger allocation.
    if (has_allocation && m_allocations[m_allocation_index].size < size) {
      m_device.deallocate(m_allocations[m_allocation_index].ptr);
      m_allocations[m_allocation_index].ptr = m_device.allocate(size);
      m_allocations[m_allocation_index].size = size;
    }

    // Make a new allocation if we don't have an existing one.
    if (!has_allocation) {
      Allocation allocation;
      allocation.ptr = m_device.allocate(size);
      allocation.size = size;
      m_allocations.push_back(allocation);
    }

    eigen_assert(m_allocations[m_allocation_index].ptr != NULL);
    eigen_assert(m_allocations[m_allocation_index].size >= size);

    return m_allocations[m_allocation_index++].ptr;
  }

  void reset() { m_allocation_index = 0; }

 private:
  struct Allocation {
    void* ptr;
    size_t size;
  };

  const Device& m_device;
  int m_allocation_index;
  // TODO(ezhulenev): This should be an inlined vector.
  std::vector<Allocation> m_allocations;
};
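// Illustrative usage sketch (not part of Eigen): reusing scratch memory across
// block evaluations on the default device.
//
//   Eigen::DefaultDevice device;
//   Eigen::internal::TensorBlockScratchAllocator<Eigen::DefaultDevice>
//       scratch(device);
//   void* buf = scratch.allocate(1024);   // first block: real allocation
//   scratch.reset();                      // start of next block evaluation
//   void* buf2 = scratch.allocate(1024);  // reuses the existing buffer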
// TensorBlockKind represents all possible block kinds that can be produced by
// the block evaluation of a tensor expression.
enum TensorBlockKind {
  // Tensor block that is a lazy expression that must be assigned to a
  // destination buffer.
  kExpr,

  // Tensor block that is a view into a memory buffer owned by an underlying
  // Tensor expression (e.g. a view into a Tensor buffer).
  kView,

  // Tensor block that was materialized in a scratch memory buffer, allocated
  // with TensorBlockScratchAllocator. This block must be copied to a
  // destination, similar to a block of `kExpr` type.
  kMaterializedInScratch,

  // Tensor block that was materialized directly into the final output memory
  // buffer, e.g. when the left side of an assignment is a Tensor.
  kMaterializedInOutput
};

// TensorBlockNotImplemented should be used to define the TensorBlock typedef
// in TensorEvaluators that do not support block evaluation.
class TensorBlockNotImplemented {
 public:
  typedef void XprType;
};

// XprScalar extracts the Scalar type from an Eigen expression (if the
// expression type is not void). It's required to define lazy block expressions
// for argument types that do not support block evaluation.
template <typename XprType>
struct XprScalar {
  typedef typename XprType::Scalar type;
};
template <>
struct XprScalar<void> {
  typedef void type;
};
// TensorMaterializedBlock is a fully evaluated block of the original tensor,
// and XprType is just a TensorMap over the data. This block type is typically
// used to materialize blocks of tensor expressions that can't be efficiently
// represented as lazy Tensor expressions with fast coeff/packet operations.
template <typename Scalar, int NumDims, int Layout,
          typename IndexType = Eigen::Index>
class TensorMaterializedBlock {
 public:
  typedef DSizes<IndexType, NumDims> Dimensions;
  typedef TensorMap<const Tensor<Scalar, NumDims, Layout> > XprType;

  TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data,
                          const Dimensions& dimensions, bool valid_expr = true)
      : m_kind(kind),
        m_data(data),
        m_dimensions(dimensions),
        m_expr(m_data, m_dimensions),
        m_valid_expr(valid_expr) {
    eigen_assert(m_kind == internal::TensorBlockKind::kView ||
                 m_kind == internal::TensorBlockKind::kMaterializedInScratch ||
                 m_kind == internal::TensorBlockKind::kMaterializedInOutput);
  }

  TensorBlockKind kind() const { return m_kind; }

  const XprType& expr() const {
    eigen_assert(m_valid_expr);
    return m_expr;
  }

  const Scalar* data() const { return m_data; }
  void cleanup() {}

  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;

  // TensorMaterializedBlock can be backed by different types of storage:
  //
  //   (1) Contiguous block of memory allocated with the scratch allocator.
  //   (2) Contiguous block of memory reused from the tensor block descriptor
  //       destination buffer.
  //   (3) Strided block of memory reused from the tensor block descriptor
  //       destination buffer.
  class Storage {
   public:
    Scalar* data() const { return m_data; }
    const Dimensions& dimensions() const { return m_dimensions; }
    const Dimensions& strides() const { return m_strides; }

    TensorMaterializedBlock AsTensorMaterializedBlock() const {
      return TensorMaterializedBlock(
          m_materialized_in_output
              ? internal::TensorBlockKind::kMaterializedInOutput
              : internal::TensorBlockKind::kMaterializedInScratch,
          m_data, m_dimensions, !m_strided_storage);
    }

   private:
    friend class TensorMaterializedBlock<Scalar, NumDims, Layout, IndexType>;

    Storage(Scalar* data, const Dimensions& dimensions,
            const Dimensions& strides, bool materialized_in_output,
            bool strided_storage)
        : m_data(data),
          m_dimensions(dimensions),
          m_strides(strides),
          m_materialized_in_output(materialized_in_output),
          m_strided_storage(strided_storage) {}

    Scalar* m_data;
    Dimensions m_dimensions;
    Dimensions m_strides;
    bool m_materialized_in_output;
    bool m_strided_storage;
  };

  // Creates a storage for the materialized block: either reuses the block
  // descriptor destination buffer, or allocates a new buffer with the scratch
  // allocator.
  template <typename TensorBlockScratch>
  EIGEN_STRONG_INLINE static Storage prepareStorage(
      TensorBlockDesc& desc, TensorBlockScratch& scratch,
      bool allow_strided_storage = false) {
    // Try to reuse the destination as an output block buffer.
    typedef typename TensorBlockDesc::DestinationBuffer DestinationBuffer;

    if (desc.destination().kind() == DestinationBuffer::kContiguous) {
      Scalar* buffer = desc.destination().template data<Scalar>();
      desc.DropDestinationBuffer();
      return Storage(buffer, desc.dimensions(),
                     internal::strides<Layout>(desc.dimensions()),
                     /*materialized_in_output=*/true,
                     /*strided_storage=*/false);

    } else if (desc.destination().kind() == DestinationBuffer::kStrided &&
               allow_strided_storage) {
      Scalar* buffer = desc.destination().template data<Scalar>();
      desc.DropDestinationBuffer();
      return Storage(buffer, desc.dimensions(), desc.destination().strides(),
                     /*materialized_in_output=*/true,
                     /*strided_storage=*/true);

    } else {
      void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
      return Storage(static_cast<Scalar*>(mem), desc.dimensions(),
                     internal::strides<Layout>(desc.dimensions()),
                     /*materialized_in_output=*/false,
                     /*strided_storage=*/false);
    }
  }

  // Creates a materialized block for the given descriptor from a memory
  // buffer.
  template <typename DataDimensions, typename TensorBlockScratch>
  EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize(
      const Scalar* data, const DataDimensions& data_dims,
      TensorBlockDesc& desc, TensorBlockScratch& scratch) {
    eigen_assert(array_size<DataDimensions>::value == desc.dimensions().size());

    // If the block dimensions cover a contiguous range of the underlying
    // memory, we can skip the block buffer allocation and construct a block
    // directly from the existing `data` memory buffer.
    static const bool is_col_major = Layout == ColMajor;

    // Find out how many inner dimensions have a matching size.
    int num_matching_inner_dims = 0;
    for (int i = 0; i < NumDims; ++i) {
      int dim = is_col_major ? i : NumDims - i - 1;
      if (data_dims[dim] != desc.dimensions()[dim]) break;
      ++num_matching_inner_dims;
    }

    // All remaining outer dimensions must be of size `1`.
    bool can_use_direct_access = true;
    for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) {
      int dim = is_col_major ? i : NumDims - i - 1;
      if (desc.dimension(dim) != 1) {
        can_use_direct_access = false;
        break;
      }
    }

    if (can_use_direct_access) {
      const Scalar* block_start = data + desc.offset();
      return TensorMaterializedBlock(internal::TensorBlockKind::kView,
                                     block_start, desc.dimensions());

    } else {
      // Reuse the destination buffer or allocate a new buffer with the
      // scratch allocator.
      const Storage storage = prepareStorage(desc, scratch);

      typedef internal::TensorBlockIO<Scalar, IndexType, NumDims, Layout>
          TensorBlockIO;
      typedef typename TensorBlockIO::Dst TensorBlockIODst;
      typedef typename TensorBlockIO::Src TensorBlockIOSrc;

      TensorBlockIOSrc src(internal::strides<Layout>(Dimensions(data_dims)),
                           data, desc.offset());
      TensorBlockIODst dst(storage.dimensions(), storage.strides(),
                           storage.data());

      TensorBlockIO::Copy(dst, src);
      return storage.AsTensorMaterializedBlock();
    }
  }

 private:
  TensorBlockKind m_kind;
  const Scalar* m_data;
  Dimensions m_dimensions;
  XprType m_expr;
  bool m_valid_expr;
};
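// Illustrative usage sketch (not part of Eigen): materializing a block
// described by `desc` from a raw buffer. `data`, `data_dims`, `desc` and
// `scratch` are assumed names for the example only. If the block maps onto
// contiguous memory, the result is a cheap kView; otherwise the data is copied
// into scratch (or destination) storage.
//
//   auto block =
//       Eigen::internal::TensorMaterializedBlock<float, 2, Eigen::ColMajor>::
//           materialize(data, data_dims, desc, scratch);
//   const float* block_data = block.data();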
// TensorCwiseUnaryBlock is a lazy tensor expression block that applies the
// UnaryOp functor to the block produced by the underlying Tensor expression.
template <typename UnaryOp, typename ArgTensorBlock>
class TensorCwiseUnaryBlock {
  static constexpr bool NoArgBlockAccess =
      internal::is_void<typename ArgTensorBlock::XprType>::value;

 public:
  typedef std::conditional_t<
      NoArgBlockAccess, void,
      TensorCwiseUnaryOp<UnaryOp, const typename ArgTensorBlock::XprType> >
      XprType;

  typedef typename XprScalar<XprType>::type Scalar;

  TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor)
      : m_arg_block(arg_block), m_functor(functor) {}

  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }

  XprType expr() const { return XprType(m_arg_block.expr(), m_functor); }
  const Scalar* data() const { return NULL; }
  void cleanup() { m_arg_block.cleanup(); }

 private:
  ArgTensorBlock m_arg_block;
  UnaryOp m_functor;
};
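// Illustrative usage sketch (not part of Eigen's public API): wrapping an
// already materialized argument block in a lazy abs() block. `arg_block` is an
// assumed name for the example only.
//
//   using ArgBlock =
//       Eigen::internal::TensorMaterializedBlock<float, 2, Eigen::ColMajor>;
//   Eigen::internal::TensorCwiseUnaryBlock<
//       Eigen::internal::scalar_abs_op<float>, ArgBlock>
//       abs_block(arg_block, Eigen::internal::scalar_abs_op<float>());
//   auto expr = abs_block.expr();  // lazy TensorCwiseUnaryOp over arg_block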
// TensorCwiseBinaryBlock is a lazy tensor expression block that applies the
// BinaryOp functor to the blocks produced by the underlying Tensor expressions.
template <typename BinaryOp, typename LhsTensorBlock, typename RhsTensorBlock>
class TensorCwiseBinaryBlock {
  static constexpr bool NoArgBlockAccess =
      internal::is_void<typename LhsTensorBlock::XprType>::value ||
      internal::is_void<typename RhsTensorBlock::XprType>::value;

 public:
  typedef std::conditional_t<
      NoArgBlockAccess, void,
      TensorCwiseBinaryOp<BinaryOp, const typename LhsTensorBlock::XprType,
                          const typename RhsTensorBlock::XprType> >
      XprType;

  typedef typename XprScalar<XprType>::type Scalar;

  TensorCwiseBinaryBlock(const LhsTensorBlock& left_block,
                         const RhsTensorBlock& right_block,
                         const BinaryOp& functor)
      : m_left_block(left_block),
        m_right_block(right_block),
        m_functor(functor) {}

  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }

  XprType expr() const {
    return XprType(m_left_block.expr(), m_right_block.expr(), m_functor);
  }

  const Scalar* data() const { return NULL; }

  void cleanup() {
    m_left_block.cleanup();
    m_right_block.cleanup();
  }

 private:
  LhsTensorBlock m_left_block;
  RhsTensorBlock m_right_block;
  BinaryOp m_functor;
};
// TensorUnaryExprBlock is a lazy tensor expression block that can construct
// an arbitrary tensor expression from a block of the underlying type (a
// generalization of TensorCwiseUnaryBlock to arbitrary expressions).
template <typename BlockFactory, typename ArgTensorBlock>
class TensorUnaryExprBlock {
  typedef typename ArgTensorBlock::XprType ArgXprType;
  static constexpr bool NoArgBlockAccess = internal::is_void<ArgXprType>::value;

 public:
  typedef std::conditional_t<
      NoArgBlockAccess, void,
      typename BlockFactory::template XprType<ArgXprType>::type> XprType;

  typedef typename XprScalar<XprType>::type Scalar;

  TensorUnaryExprBlock(const ArgTensorBlock& arg_block,
                       const BlockFactory& factory)
      : m_arg_block(arg_block), m_factory(factory) {}

  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
  XprType expr() const { return m_factory.expr(m_arg_block.expr()); }
  const Scalar* data() const { return NULL; }
  void cleanup() { m_arg_block.cleanup(); }

 private:
  ArgTensorBlock m_arg_block;
  BlockFactory m_factory;
};
// TensorTernaryExprBlock is a lazy tensor expression block that can construct
// an arbitrary tensor expression from three blocks of the underlying type.
template <typename BlockFactory, typename Arg1TensorBlock,
          typename Arg2TensorBlock, typename Arg3TensorBlock>
class TensorTernaryExprBlock {
  typedef typename Arg1TensorBlock::XprType Arg1XprType;
  typedef typename Arg2TensorBlock::XprType Arg2XprType;
  typedef typename Arg3TensorBlock::XprType Arg3XprType;

  static constexpr bool NoArgBlockAccess =
      internal::is_void<Arg1XprType>::value ||
      internal::is_void<Arg2XprType>::value ||
      internal::is_void<Arg3XprType>::value;

 public:
  typedef std::conditional_t<
      NoArgBlockAccess, void,
      typename BlockFactory::template XprType<Arg1XprType, Arg2XprType,
                                              Arg3XprType>::type> XprType;

  typedef typename XprScalar<XprType>::type Scalar;

  TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block,
                         const Arg2TensorBlock& arg2_block,
                         const Arg3TensorBlock& arg3_block,
                         const BlockFactory& factory)
      : m_arg1_block(arg1_block),
        m_arg2_block(arg2_block),
        m_arg3_block(arg3_block),
        m_factory(factory) {}

  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
  XprType expr() const {
    return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(),
                          m_arg3_block.expr());
  }
  const Scalar* data() const { return NULL; }
  void cleanup() {
    m_arg1_block.cleanup();
    m_arg2_block.cleanup();
    m_arg3_block.cleanup();
  }

 private:
  Arg1TensorBlock m_arg1_block;
  Arg2TensorBlock m_arg2_block;
  Arg3TensorBlock m_arg3_block;
  BlockFactory m_factory;
};
// StridedLinearBufferCopy provides a method to copy data between two linear
// buffers with different strides, with optimized paths for the common cases.
template <typename Scalar, typename IndexType>
class StridedLinearBufferCopy {
  typedef typename packet_traits<Scalar>::type Packet;
  enum {
    Vectorizable = packet_traits<Scalar>::Vectorizable,
    PacketSize = packet_traits<Scalar>::size
  };

 public:
  // Specifying the copy kind statically selects a specialized loop below.
  enum class Kind {
    Linear = 0,       // src_stride == 1 && dst_stride == 1
    Scatter = 1,      // src_stride == 1 && dst_stride != 1
    FillLinear = 2,   // src_stride == 0 && dst_stride == 1
    FillScatter = 3,  // src_stride == 0 && dst_stride != 1
    Gather = 4,       // dst_stride == 1
    Random = 5        // everything else
  };

  struct Dst {
    Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {}

    IndexType offset;
    IndexType stride;
    Scalar* data;
  };

  struct Src {
    Src(IndexType o, IndexType s, const Scalar* d)
        : offset(o), stride(s), data(d) {}

    IndexType offset;
    IndexType stride;
    const Scalar* data;
  };

  template <typename StridedLinearBufferCopy::Kind kind>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst,
                                                        const Src& src,
                                                        const size_t count) {
    Run<kind>(count, dst.offset, dst.stride, dst.data, src.offset, src.stride,
              src.data);
  }

 private:
  template <typename StridedLinearBufferCopy::Kind kind>
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
      const IndexType count, const IndexType dst_offset,
      const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data,
      const IndexType src_offset, const IndexType src_stride,
      const Scalar* EIGEN_RESTRICT src_data) {
    const Scalar* src = &src_data[src_offset];
    Scalar* dst = &dst_data[dst_offset];

    if (!Vectorizable) {
      for (Index i = 0; i < count; ++i) {
        dst[i * dst_stride] = src[i * src_stride];
      }
      return;
    }

    const IndexType vectorized_size = count - PacketSize;
    IndexType i = 0;

    if (kind == StridedLinearBufferCopy::Kind::Linear) {
      // Linear copy from `src` to `dst`.
      const IndexType unrolled_size = count - 4 * PacketSize;
      eigen_assert(src_stride == 1 && dst_stride == 1);
      for (; i <= unrolled_size; i += 4 * PacketSize) {
        for (int j = 0; j < 4; ++j) {
          Packet p = ploadu<Packet>(src + i + j * PacketSize);
          pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
        }
      }
      for (; i <= vectorized_size; i += PacketSize) {
        Packet p = ploadu<Packet>(src + i);
        pstoreu<Scalar, Packet>(dst + i, p);
      }
      for (; i < count; ++i) {
        dst[i] = src[i];
      }

    } else if (kind == StridedLinearBufferCopy::Kind::Scatter) {
      // Scatter from `src` to `dst`.
      eigen_assert(src_stride == 1 && dst_stride != 1);
      for (; i <= vectorized_size; i += PacketSize) {
        Packet p = ploadu<Packet>(src + i);
        pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
      }
      for (; i < count; ++i) {
        dst[i * dst_stride] = src[i];
      }

    } else if (kind == StridedLinearBufferCopy::Kind::FillLinear) {
      // Fill `dst` with the value at `*src`.
      eigen_assert(src_stride == 0 && dst_stride == 1);
      const IndexType unrolled_size = count - 4 * PacketSize;
      Packet p = pload1<Packet>(src);
      for (; i <= unrolled_size; i += 4 * PacketSize) {
        for (int j = 0; j < 4; ++j) {
          pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
        }
      }
      for (; i <= vectorized_size; i += PacketSize) {
        pstoreu<Scalar, Packet>(dst + i, p);
      }
      for (; i < count; ++i) {
        dst[i] = *src;
      }

    } else if (kind == StridedLinearBufferCopy::Kind::FillScatter) {
      // Scatter the value at `*src` into `dst`.
      eigen_assert(src_stride == 0 && dst_stride != 1);
      Packet p = pload1<Packet>(src);
      for (; i <= vectorized_size; i += PacketSize) {
        pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
      }
      for (; i < count; ++i) {
        dst[i * dst_stride] = *src;
      }

    } else if (kind == StridedLinearBufferCopy::Kind::Gather) {
      // Gather from `src` into `dst`.
      eigen_assert(dst_stride == 1);
      for (; i <= vectorized_size; i += PacketSize) {
        Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
        pstoreu<Scalar, Packet>(dst + i, p);
      }
      for (; i < count; ++i) {
        dst[i] = src[i * src_stride];
      }

    } else if (kind == StridedLinearBufferCopy::Kind::Random) {
      // Random (both strides are non-trivial).
      for (; i < count; ++i) {
        dst[i * dst_stride] = src[i * src_stride];
      }

    } else {
      eigen_assert(false);
    }
  }
};
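// Illustrative usage sketch (not part of Eigen): broadcasting a single float
// into a strided destination (stride 2) with the FillScatter specialization.
// `dst_ptr` and `n` are assumed names for the example only.
//
//   using Copy = Eigen::internal::StridedLinearBufferCopy<float, Eigen::Index>;
//   float value = 1.f;
//   Copy::Run<Copy::Kind::FillScatter>(
//       Copy::Dst(/*offset=*/0, /*stride=*/2, dst_ptr),
//       Copy::Src(/*offset=*/0, /*stride=*/0, &value),
//       /*count=*/n);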
// TensorBlockIO copies data from the `src` tensor block to the `dst` tensor
// block. It's possible to specify a dst->src dimension mapping for the copy.
template <typename Scalar, typename IndexType, int NumDims, int Layout>
class TensorBlockIO {
  static constexpr bool IsColMajor = (Layout == ColMajor);

  typedef StridedLinearBufferCopy<Scalar, IndexType> LinCopy;

 public:
  typedef DSizes<IndexType, NumDims> Dimensions;
  typedef DSizes<int, NumDims> DimensionsMap;

  struct Dst {
    Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst,
        IndexType dst_offset = 0)
        : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {}

    Dimensions dims;
    Dimensions strides;
    Scalar* data;
    IndexType offset;
  };

  struct Src {
    Src(const Dimensions& src_strides, const Scalar* src,
        IndexType src_offset = 0)
        : strides(src_strides), data(src), offset(src_offset) {}

    Dimensions strides;
    const Scalar* data;
    IndexType offset;
  };

  // Copies data to `dst` from `src`, using the provided dst->src dimension
  // map: src_dimension_index = dst_to_src_dim_map[dst_dimension_index].
  // Returns the number of copied elements.
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType Copy(
      const Dst& dst, const Src& src,
      const DimensionsMap& dst_to_src_dim_map) {
    // Copy a single scalar value from `src` to `dst`.
    if (NumDims == 0) {
      *(dst.data + dst.offset) = *(src.data + src.offset);
      return 1;
    }

    // Both `dst` and `src` must have a contiguous innermost dimension. We also
    // accept the special case of stride '0', because it's used as a trick to
    // implement broadcasting.
    {
      int inner_dim = IsColMajor ? 0 : NumDims - 1;
      EIGEN_UNUSED_VARIABLE(inner_dim);
      eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0);
      eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0);
    }

    // Give a shorter name to `dst_to_src_dim_map`.
    const DimensionsMap& dim_map = dst_to_src_dim_map;

    // Do not squeeze reordered inner dimensions.
    int num_squeezable_dims = NumSqueezableInnerDims(dim_map);

    // Find the innermost dimension in the dst whose size is not 1. This is the
    // effective inner dimension.
    int num_size_one_inner_dims = 0;
    for (int i = 0; i < num_squeezable_dims; ++i) {
      const int dst_dim = IsColMajor ? i : NumDims - i - 1;
      if (dst.dims[dst_dim] != 1) break;
      num_size_one_inner_dims++;
    }

    // If all dimensions are of size 1, just copy a scalar from `src` to `dst`.
    if (num_size_one_inner_dims == NumDims) {
      *(dst.data + dst.offset) = *(src.data + src.offset);
      return 1;
    }

    // Outermost dimension in the dst with `stride == 1` (contiguous in memory).
    const int dst_stride1_dim = IsColMajor
                                    ? num_size_one_inner_dims
                                    : NumDims - num_size_one_inner_dims - 1;

    // Dimension in the src that corresponds to the dst innermost dimension.
    const int src_dim_for_dst_stride1_dim =
        NumDims == 0 ? 1 : dim_map[dst_stride1_dim];

    // Size of the innermost dimension (length of contiguous blocks of memory).
    IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim];

    // Squeeze multiple inner dims into one if they are contiguous in `dst` and
    // `src` memory, so we can do fewer linear copy calls.
    for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) {
      const int dst_dim = IsColMajor ? i : NumDims - i - 1;
      const IndexType dst_stride = dst.strides[dst_dim];
      const IndexType src_stride = src.strides[dim_map[dst_dim]];
      if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) {
        dst_inner_dim_size *= dst.dims[dst_dim];
        ++num_size_one_inner_dims;
      } else {
        break;
      }
    }

    // Setup strides to read data from `src` and write to `dst`.
    IndexType input_offset = src.offset;
    IndexType output_offset = dst.offset;
    IndexType input_stride =
        NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim];
    IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim];

    const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
    array<BlockIteratorState, at_least_1_dim> it;

    // Initialize the block iterator state. Squeeze away any dimension of size 1.
    int idx = 0;  // currently initialized iterator state index
    for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
      const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2;
      if (dst.dims[dst_dim] == 1) continue;

      it[idx].size = dst.dims[dst_dim];
      it[idx].input_stride = src.strides[dim_map[dst_dim]];
      it[idx].output_stride = dst.strides[dst_dim];

      it[idx].input_span = it[idx].input_stride * (it[idx].size - 1);
      it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);

      idx++;
    }

    // Iterate copying data from `src` to `dst`.
    const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize();

#define COPY_INNER_DIM(KIND)                                           \
  IndexType num_copied = 0;                                            \
  for (num_copied = 0; num_copied < block_total_size;                  \
       num_copied += dst_inner_dim_size) {                             \
    LinCopy::template Run<KIND>(                                       \
        typename LinCopy::Dst(output_offset, output_stride, dst.data), \
        typename LinCopy::Src(input_offset, input_stride, src.data),   \
        dst_inner_dim_size);                                           \
                                                                       \
    for (int j = 0; j < idx; ++j) {                                    \
      if (++it[j].count < it[j].size) {                                \
        input_offset += it[j].input_stride;                            \
        output_offset += it[j].output_stride;                          \
        break;                                                         \
      }                                                                \
      it[j].count = 0;                                                 \
      input_offset -= it[j].input_span;                                \
      output_offset -= it[j].output_span;                              \
    }                                                                  \
  }                                                                    \
  return num_copied;

    if (input_stride == 1 && output_stride == 1) {
      COPY_INNER_DIM(LinCopy::Kind::Linear);
    } else if (input_stride == 1 && output_stride != 1) {
      COPY_INNER_DIM(LinCopy::Kind::Scatter);
    } else if (input_stride == 0 && output_stride == 1) {
      COPY_INNER_DIM(LinCopy::Kind::FillLinear);
    } else if (input_stride == 0 && output_stride != 1) {
      COPY_INNER_DIM(LinCopy::Kind::FillScatter);
    } else if (output_stride == 1) {
      COPY_INNER_DIM(LinCopy::Kind::Gather);
    } else {
      COPY_INNER_DIM(LinCopy::Kind::Random);
    }

#undef COPY_INNER_DIM
  }

  // Copy from `src` to `dst` with an identity src->dst dimension map. Returns
  // the number of copied elements.
  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexType Copy(const Dst& dst,
                                                              const Src& src) {
    DimensionsMap dst_to_src_map;
    for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i;
    return Copy(dst, src, dst_to_src_map);
  }

 private:
  struct BlockIteratorState {
    BlockIteratorState()
        : size(0),
          count(0),
          input_stride(0),
          output_stride(0),
          input_span(0),
          output_span(0) {}

    IndexType size;
    IndexType count;
    IndexType input_stride;
    IndexType output_stride;
    IndexType input_span;
    IndexType output_span;
  };

  // Compute how many inner dimensions it's allowed to squeeze when doing IO
  // between two tensor blocks. It's safe to squeeze inner dimensions only if
  // they are not reordered.
  static int NumSqueezableInnerDims(const DimensionsMap& dim_map) {
    int num_squeezable_dims = 0;
    for (int i = 0; i < NumDims; ++i) {
      const int dim = IsColMajor ? i : NumDims - i - 1;
      if (dim_map[dim] != dim) break;
      num_squeezable_dims++;
    }
    return num_squeezable_dims;
  }
};
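// Illustrative usage sketch (not part of Eigen): copying a 2-D block out of a
// column-major source buffer into a contiguous destination buffer. The names
// `block_dims`, `src_dims`, `dst_ptr`, `src_ptr` and `block_offset` are
// assumptions for the example only.
//
//   using BlockIO = Eigen::internal::TensorBlockIO<float, Eigen::Index, 2,
//                                                  Eigen::ColMajor>;
//   BlockIO::Dst dst(block_dims,
//                    Eigen::internal::strides<Eigen::ColMajor>(block_dims),
//                    dst_ptr);
//   BlockIO::Src src(Eigen::internal::strides<Eigen::ColMajor>(src_dims),
//                    src_ptr, /*src_offset=*/block_offset);
//   BlockIO::Copy(dst, src);  // returns the number of copied elements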
// TensorBlockAssignment assigns a block expression of type `TensorBlockExpr`
// to a tensor block backed by the memory buffer described by `Target`. The
// innermost dimension of the target must have stride '1' (contiguous in
// memory); if dimensions are reordered, materialize the expression first and
// use TensorBlockIO with a custom dimension map instead.
template <typename Scalar, int NumDims, typename TensorBlockExpr,
          typename IndexType = Eigen::Index>
class TensorBlockAssignment {
  // We will use the coeff/packet path to evaluate block expressions.
  typedef TensorEvaluator<const TensorBlockExpr, DefaultDevice>
      TensorBlockEvaluator;

  typedef DSizes<IndexType, NumDims> Dimensions;

  enum {
    Vectorizable = packet_traits<Scalar>::Vectorizable,
    PacketSize = packet_traits<Scalar>::size
  };

  template <bool Vectorizable, typename Evaluator>
  struct InnerDimAssign {
    EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
                                        const Evaluator& eval,
                                        IndexType eval_offset) {
      for (IndexType i = 0; i < count; ++i) {
        target[i] = eval.coeff(eval_offset + i);
      }
    }
  };

  template <typename Evaluator>
  struct InnerDimAssign<true, Evaluator> {
    EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
                                        const Evaluator& eval,
                                        IndexType eval_offset) {
      typedef typename packet_traits<Scalar>::type Packet;

      const IndexType unrolled_size = count - 4 * PacketSize;
      const IndexType vectorized_size = count - PacketSize;
      IndexType i = 0;

      for (; i <= unrolled_size; i += 4 * PacketSize) {
        for (int j = 0; j < 4; ++j) {
          const IndexType idx = eval_offset + i + j * PacketSize;
          Packet p = eval.template packet<Unaligned>(idx);
          pstoreu<Scalar>(target + i + j * PacketSize, p);
        }
      }

      for (; i <= vectorized_size; i += PacketSize) {
        Packet p = eval.template packet<Unaligned>(eval_offset + i);
        pstoreu<Scalar>(target + i, p);
      }

      for (; i < count; ++i) {
        target[i] = eval.coeff(eval_offset + i);
      }
    }
  };

 public:
  struct Target {
    Target(const Dimensions& target_dims, const Dimensions& target_strides,
           Scalar* target_data, IndexType target_offset = 0)
        : dims(target_dims),
          strides(target_strides),
          data(target_data),
          offset(target_offset) {}

    Dimensions dims;
    Dimensions strides;
    Scalar* data;
    IndexType offset;
  };

  static Target target(const Dimensions& target_dims,
                       const Dimensions& target_strides, Scalar* target_data,
                       IndexType target_offset = 0) {
    return Target(target_dims, target_strides, target_data, target_offset);
  }

  template <typename TargetDimsIndexType, typename TargetStridesIndexType>
  static Target target(
      const DSizes<TargetDimsIndexType, NumDims>& target_dims,
      const DSizes<TargetStridesIndexType, NumDims>& target_strides,
      Scalar* target_data, IndexType target_offset = 0) {
    // DSizes constructor will do index type promotion if it's safe.
    return Target(Dimensions(target_dims), Dimensions(target_strides),
                  target_data, target_offset);
  }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
      const Target& target, const TensorBlockExpr& expr) {
    // Prepare the evaluator for the block expression.
    DefaultDevice default_device;
    TensorBlockEvaluator eval(expr, default_device);

    // Tensor block expression dimensions should match destination dimensions.
    eigen_assert(dimensions_match(target.dims, eval.dimensions()));

    static const int Layout = TensorBlockEvaluator::Layout;
    static const bool is_col_major = Layout == ColMajor;

    // Initialize the output inner dimension size based on the layout.
    const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize();
    const int inner_dim_idx = is_col_major ? 0 : NumDims - 1;
    IndexType output_inner_dim_size = target.dims[inner_dim_idx];

    // Target inner dimension stride must be '1'.
    eigen_assert(target.strides[inner_dim_idx] == 1);

    // Squeeze multiple inner dims into one if they are contiguous in `target`.
    IndexType num_squeezed_dims = 0;
    for (Index i = 1; i < NumDims; ++i) {
      const Index dim = is_col_major ? i : NumDims - i - 1;
      const IndexType target_stride = target.strides[dim];

      if (output_inner_dim_size == target_stride) {
        output_inner_dim_size *= target.dims[dim];
        num_squeezed_dims++;
      } else {
        break;
      }
    }

    // Initialize the output block iterator state. Dimensions in this array are
    // always in inner_most -> outer_most order (col major layout).
    array<BlockIteratorState, NumDims> it;

    int idx = 0;  // currently initialized iterator state index
    for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) {
      const Index dim = is_col_major ? i + 1 : NumDims - i - 2;

      it[idx].count = 0;
      it[idx].size = target.dims[dim];
      it[idx].output_stride = target.strides[dim];
      it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
      idx++;
    }

    // We read the block expression from the beginning, and start writing data
    // to `target` at the given offset.
    IndexType input_offset = 0;
    IndexType output_offset = target.offset;

    // Iterate copying data from `eval` to `target`.
    for (IndexType i = 0; i < output_size; i += output_inner_dim_size) {
      // Assign to `target` at the current offset.
      InnerDimAssign<Vectorizable && TensorBlockEvaluator::PacketAccess,
                     TensorBlockEvaluator>::Run(target.data + output_offset,
                                                output_inner_dim_size, eval,
                                                input_offset);

      // Move the input offset forward by the number of assigned coefficients.
      input_offset += output_inner_dim_size;

      // Update the index.
      for (int j = 0; j < idx; ++j) {
        if (++it[j].count < it[j].size) {
          output_offset += it[j].output_stride;
          break;
        }
        it[j].count = 0;
        output_offset -= it[j].output_span;
      }
    }
  }

 private:
  struct BlockIteratorState {
    BlockIteratorState()
        : count(0), size(0), output_stride(0), output_span(0) {}

    IndexType count;
    IndexType size;
    IndexType output_stride;
    IndexType output_span;
  };
};
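// Illustrative usage sketch (not part of Eigen): evaluating a block expression
// into a contiguous float buffer. `BlockExpr` stands for the expression's type
// and `block_expr`, `dims`, `dst_ptr` are assumed names for the example only.
//
//   using Assign = Eigen::internal::TensorBlockAssignment<float, 2, BlockExpr>;
//   Assign::Run(
//       Assign::target(dims, Eigen::internal::strides<Eigen::ColMajor>(dims),
//                      dst_ptr),
//       block_expr);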
}  // namespace internal
}  // namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H