#ifndef EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H
#define EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H

#include "./InternalHeaderCheck.h"

namespace Eigen {

namespace internal {
template<typename Shuffle, typename XprType>
struct traits<TensorShufflingOp<Shuffle, XprType> > : public traits<XprType>
{
  typedef typename XprType::Scalar Scalar;
  typedef traits<XprType> XprTraits;
  typedef typename XprTraits::StorageKind StorageKind;
  typedef typename XprTraits::Index Index;
  typedef typename XprType::Nested Nested;
  typedef std::remove_reference_t<Nested> Nested_;
  static constexpr int NumDimensions = XprTraits::NumDimensions;
  static constexpr int Layout = XprTraits::Layout;
  typedef typename XprTraits::PointerType PointerType;
};
template<typename Shuffle, typename XprType>
struct eval<TensorShufflingOp<Shuffle, XprType>, Eigen::Dense>
{
  typedef const TensorShufflingOp<Shuffle, XprType>& type;
};
template<typename Shuffle, typename XprType>
struct nested<TensorShufflingOp<Shuffle, XprType>, 1, typename eval<TensorShufflingOp<Shuffle, XprType> >::type>
{
  typedef TensorShufflingOp<Shuffle, XprType> type;
};

}  // end namespace internal
template<typename Shuffle, typename XprType>
class TensorShufflingOp : public TensorBase<TensorShufflingOp<Shuffle, XprType> >
{
  public:
    typedef TensorBase<TensorShufflingOp<Shuffle, XprType> > Base;
    typedef typename Eigen::internal::traits<TensorShufflingOp>::Scalar Scalar;
    typedef typename XprType::CoeffReturnType CoeffReturnType;
    typedef typename Eigen::internal::nested<TensorShufflingOp>::type Nested;
    typedef typename Eigen::internal::traits<TensorShufflingOp>::StorageKind StorageKind;
    typedef typename Eigen::internal::traits<TensorShufflingOp>::Index Index;
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp(const XprType& expr, const Shuffle& shfl)
        : m_xpr(expr), m_shuffle(shfl) {}

    EIGEN_DEVICE_FUNC
    const Shuffle& shufflePermutation() const { return m_shuffle; }

    EIGEN_DEVICE_FUNC
    const internal::remove_all_t<typename XprType::Nested>&
    expression() const { return m_xpr; }

    EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorShufflingOp)
  protected:
    typename XprType::Nested m_xpr;
    const Shuffle m_shuffle;
};
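// A minimal usage sketch (illustrative, not part of the original header):
// shuffling permutes the order of a tensor's dimensions. Assuming a 3-d float
// tensor `input` of size (20, 30, 50), the shuffle {1, 2, 0} produces an
// output of size (30, 50, 20), e.g. output(3, 7, 11) == input(11, 3, 7).
//
//   Eigen::Tensor<float, 3> input(20, 30, 50);
//   input.setRandom();
//   Eigen::array<int, 3> shuffling{{1, 2, 0}};
//   Eigen::Tensor<float, 3> output = input.shuffle(shuffling);
//   // output.dimension(0) == 30, output.dimension(1) == 50, output.dimension(2) == 20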
// Eval as rvalue
template<typename Shuffle, typename ArgType, typename Device>
struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
{
  typedef TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> Self;
  typedef TensorShufflingOp<Shuffle, ArgType> XprType;
  typedef typename XprType::Index Index;
  static constexpr int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
  typedef StorageMemory<CoeffReturnType, Device> Storage;
  typedef typename Storage::Type EvaluatorPointerType;
  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
  enum {
    IsAligned         = false,
    PacketAccess      = (PacketType<CoeffReturnType, Device>::size > 1),
    BlockAccess       = TensorEvaluator<ArgType, Device>::RawAccess,
    PreferBlockAccess = true,
    CoordAccess       = false,  // to be implemented
    RawAccess         = false
  };
  typedef std::remove_const_t<Scalar> ScalarNoConst;

  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;

  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
                                                     Layout, Index>
      TensorBlock;
  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
      : m_device(device),
        m_impl(op.expression(), device)
  {
    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
    const Shuffle& shuffle = op.shufflePermutation();
    m_is_identity = true;
    for (int i = 0; i < NumDims; ++i) {
      m_shuffle[i] = static_cast<int>(shuffle[i]);
      m_dimensions[i] = input_dims[shuffle[i]];
      m_inverseShuffle[shuffle[i]] = i;
      if (m_is_identity && shuffle[i] != i) {
        m_is_identity = false;
      }
    }
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      m_unshuffledInputStrides[0] = 1;
      m_outputStrides[0] = 1;

      for (int i = 1; i < NumDims; ++i) {
        m_unshuffledInputStrides[i] =
            m_unshuffledInputStrides[i - 1] * input_dims[i - 1];
        m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(
            m_outputStrides[i] > 0 ? m_outputStrides[i] : Index(1));
      }
    } else {
      m_unshuffledInputStrides[NumDims - 1] = 1;
      m_outputStrides[NumDims - 1] = 1;
      for (int i = NumDims - 2; i >= 0; --i) {
        m_unshuffledInputStrides[i] =
            m_unshuffledInputStrides[i + 1] * input_dims[i + 1];
        m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(
            m_outputStrides[i] > 0 ? m_outputStrides[i] : Index(1));
      }
    }
    for (int i = 0; i < NumDims; ++i) {
      m_inputStrides[i] = m_unshuffledInputStrides[shuffle[i]];
    }
  }
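  // Illustrative example (added here, not from the original source): with
  // ColMajor layout, input dimensions (2, 3, 4) and shuffle {2, 0, 1}, the
  // constructor above computes
  //   m_dimensions             = (4, 2, 3)   // input_dims[shuffle[i]]
  //   m_unshuffledInputStrides = (1, 2, 6)   // strides of the input tensor
  //   m_outputStrides          = (1, 4, 8)   // strides of the shuffled output
  //   m_inputStrides           = (6, 1, 2)   // input stride of each output dim
  // so stepping along output dimension i advances the input by m_inputStrides[i].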
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
    m_impl.evalSubExprsIfNeeded(NULL);
    return true;
  }
#ifdef EIGEN_USE_THREADS
  template <typename EvalSubExprsCallback>
  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
      EvaluatorPointerType, EvalSubExprsCallback done) {
    m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
  }
#endif  // EIGEN_USE_THREADS
  EIGEN_STRONG_INLINE void cleanup() {
    m_impl.cleanup();
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    if (m_is_identity) {
      return m_impl.coeff(index);
    } else {
      return m_impl.coeff(srcCoeff(index));
    }
  }
  template <int LoadMode, typename Self, bool ImplPacketAccess>
  struct PacketLoader {
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    static PacketReturnType Run(const Self& self, Index index) {
      EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
      for (int i = 0; i < PacketSize; ++i) {
        values[i] = self.coeff(index + i);
      }
      PacketReturnType rslt = internal::pload<PacketReturnType>(values);
      return rslt;
    }
  };
  template <int LoadMode, typename Self>
  struct PacketLoader<LoadMode, Self, true> {
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    static PacketReturnType Run(const Self& self, Index index) {
      if (self.m_is_identity) {
        return self.m_impl.template packet<LoadMode>(index);
      } else {
        EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
        for (int i = 0; i < PacketSize; ++i) {
          values[i] = self.coeff(index + i);
        }
        PacketReturnType rslt = internal::pload<PacketReturnType>(values);
        return rslt;
      }
    }
  };
  template <int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
  {
    eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
    return PacketLoader<LoadMode, Self, TensorEvaluator<ArgType, Device>::PacketAccess>::Run(*this, index);
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  internal::TensorBlockResourceRequirements getResourceRequirements() const {
    static const int inner_dim =
        Layout == static_cast<int>(ColMajor) ? 0 : NumDims - 1;

    const size_t target_size = m_device.firstLevelCacheSize();
    const bool inner_dim_shuffled = m_shuffle[inner_dim] != inner_dim;

    // A shuffled inner dimension leads to effectively random memory accesses,
    // which the default cost model (bytes loaded/stored) does not capture, so
    // an explicit per-coefficient cost is added in that case.
    using BlockRequirements = internal::TensorBlockResourceRequirements;
    if (inner_dim_shuffled) {
      return BlockRequirements::uniform<Scalar>(target_size)
          .addCostPerCoeff({0, 0, NumDims * 28});
    } else {
      return BlockRequirements::skewed<Scalar>(target_size);
    }
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
        bool root_of_expr_ast = false) const {
    assert(m_impl.data() != NULL);

    typedef internal::TensorBlockIO<ScalarNoConst, Index, NumDims, Layout>
        TensorBlockIO;
    typedef typename TensorBlockIO::Dst TensorBlockIODst;
    typedef typename TensorBlockIO::Src TensorBlockIOSrc;

    const typename TensorBlock::Storage block_storage =
        TensorBlock::prepareStorage(desc, scratch, root_of_expr_ast);

    typename TensorBlockIO::Dimensions input_strides(m_unshuffledInputStrides);
    TensorBlockIOSrc src(input_strides, m_impl.data(), srcCoeff(desc.offset()));

    TensorBlockIODst dst(block_storage.dimensions(), block_storage.strides(),
                         block_storage.data());

    typename TensorBlockIO::DimensionsMap dst_to_src_dim_map(m_shuffle);
    TensorBlockIO::Copy(dst, src, dst_to_src_dim_map);

    return block_storage.AsTensorMaterializedBlock();
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    const double compute_cost = m_is_identity ? TensorOpCost::AddCost<Index>() :
                                NumDims * (2 * TensorOpCost::AddCost<Index>() +
                                           2 * TensorOpCost::MulCost<Index>() +
                                           TensorOpCost::DivCost<Index>());
    return m_impl.costPerCoeff(vectorized) +
           TensorOpCost(0, 0, compute_cost, m_is_identity /* vectorized */, PacketSize);
  }
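  // Note (added for illustration): the non-identity branch of compute_cost
  // mirrors srcCoeff(), which per output dimension performs roughly one
  // integer division, two multiplications and two additions.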
  EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
  // binding placeholder accessors to a command group handler for SYCL
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
    m_impl.bind(cgh);
  }
#endif

 protected:
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index GetBlockOutputIndex(
      Index input_index,
      const DSizes<Index, NumDims>& input_block_strides,
      const DSizes<Index, NumDims>& output_block_strides,
      const DSizes<internal::TensorIntDivisor<Index>, NumDims>& fast_input_block_strides) const {
    Index output_index = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx = input_index / fast_input_block_strides[i];
        output_index += idx * output_block_strides[m_inverseShuffle[i]];
        input_index -= idx * input_block_strides[i];
      }
      return output_index + input_index *
             output_block_strides[m_inverseShuffle[0]];
    } else {
      for (int i = 0; i < NumDims - 1; ++i) {
        const Index idx = input_index / fast_input_block_strides[i];
        output_index += idx * output_block_strides[m_inverseShuffle[i]];
        input_index -= idx * input_block_strides[i];
      }
      return output_index + input_index *
             output_block_strides[m_inverseShuffle[NumDims - 1]];
    }
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const {
    Index inputIndex = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx = index / m_fastOutputStrides[i];
        inputIndex += idx * m_inputStrides[i];
        index -= idx * m_outputStrides[i];
      }
      return inputIndex + index * m_inputStrides[0];
    } else {
      for (int i = 0; i < NumDims - 1; ++i) {
        const Index idx = index / m_fastOutputStrides[i];
        inputIndex += idx * m_inputStrides[i];
        index -= idx * m_outputStrides[i];
      }
      return inputIndex + index * m_inputStrides[NumDims - 1];
    }
  }
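  // Illustrative example (added here, not from the original source): continuing
  // the ColMajor case sketched after the constructor (output dims (4, 2, 3),
  // m_outputStrides = (1, 4, 8), m_inputStrides = (6, 1, 2)), output index 10
  // has output coordinates (2, 0, 1), so srcCoeff(10) = 2*6 + 0*1 + 1*2 = 14,
  // i.e. input coordinates (0, 1, 2) under the shuffle {2, 0, 1}.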
  Dimensions m_dimensions;
  bool m_is_identity;
  array<int, NumDims> m_shuffle;
  array<Index, NumDims> m_inverseShuffle;
  array<Index, NumDims> m_outputStrides;
  array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
  array<Index, NumDims> m_inputStrides;
  array<Index, NumDims> m_unshuffledInputStrides;

  const Device EIGEN_DEVICE_REF m_device;
  TensorEvaluator<ArgType, Device> m_impl;
};
// Eval as lvalue
template<typename Shuffle, typename ArgType, typename Device>
struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
    : public TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
{
  typedef TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> Base;

  typedef TensorShufflingOp<Shuffle, ArgType> XprType;
  typedef typename XprType::Index Index;
  static constexpr int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
  enum {
    IsAligned         = false,
    PacketAccess      = (PacketType<CoeffReturnType, Device>::size > 1),
    BlockAccess       = TensorEvaluator<ArgType, Device>::RawAccess,
    PreferBlockAccess = true,
    CoordAccess       = false,  // to be implemented
    RawAccess         = false
  };

  typedef std::remove_const_t<Scalar> ScalarNoConst;

  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
      : Base(op, device)
  { }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
  {
    return this->m_impl.coeffRef(this->srcCoeff(index));
  }
  template <int StoreMode> EIGEN_STRONG_INLINE
  void writePacket(Index index, const PacketReturnType& x)
  {
    EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
    internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
    for (int i = 0; i < PacketSize; ++i) {
      this->coeffRef(index + i) = values[i];
    }
  }
  template <typename TensorBlock>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
      const TensorBlockDesc& desc, const TensorBlock& block) {
    eigen_assert(this->m_impl.data() != NULL);

    typedef internal::TensorBlockIO<ScalarNoConst, Index, NumDims, Layout>
        TensorBlockIO;
    typedef typename TensorBlockIO::Dst TensorBlockIODst;
    typedef typename TensorBlockIO::Src TensorBlockIOSrc;

    const Scalar* block_buffer = block.data();

    // If the block does not already expose a contiguous buffer, materialize
    // the block expression into a temporary allocation first.
    void* mem = NULL;
    if (block_buffer == NULL) {
      mem = this->m_device.allocate(desc.size() * sizeof(Scalar));
      ScalarNoConst* buf = static_cast<ScalarNoConst*>(mem);
      typedef internal::TensorBlockAssignment<
          ScalarNoConst, NumDims, typename TensorBlock::XprType, Index>
          TensorBlockAssignment;

      TensorBlockAssignment::Run(
          TensorBlockAssignment::target(
              desc.dimensions(), internal::strides<Layout>(desc.dimensions()),
              buf),
          block.expr());

      block_buffer = buf;
    }

    TensorBlockIOSrc src(internal::strides<Layout>(desc.dimensions()),
                         block_buffer);
    typename TensorBlockIO::Dimensions output_strides(
        this->m_unshuffledInputStrides);
    typename TensorBlockIO::Dimensions output_dimensions;
    for (int i = 0; i < NumDims; ++i) {
      output_dimensions[this->m_shuffle[i]] = desc.dimension(i);
    }
    TensorBlockIODst dst(output_dimensions, output_strides, this->m_impl.data(),
                         this->srcCoeff(desc.offset()));

    // Map the block dimensions back to the input (unshuffled) order.
    typename TensorBlockIO::DimensionsMap dst_to_src_dim_map;
    for (int i = 0; i < NumDims; ++i) {
      dst_to_src_dim_map[i] = static_cast<int>(this->m_inverseShuffle[i]);
    }
    TensorBlockIO::Copy(dst, src, dst_to_src_dim_map);

    // Deallocate the temporary buffer used for the block materialization.
    if (mem != NULL) this->m_device.deallocate(mem);
  }
};
} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H