Eigen-unsupported  3.4.90 (git rev 67eeba6e720c5745abc77ae6c92ce0a44aa7b7ae)
TensorShuffling.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H
#define EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H

#include "./InternalHeaderCheck.h"

namespace Eigen {

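/** \class TensorShuffling
  * \ingroup CXX11_Tensor_Module
  *
  * \brief Tensor shuffling class.
  *
  * Permutes the dimensions of a tensor expression: the i-th output dimension
  * is the shuffle[i]-th input dimension.
  *
  * Illustrative usage (a minimal sketch, not part of this header; it only
  * assumes the public Tensor API from unsupported/Eigen/CXX11/Tensor):
  *
  * \code
  * Eigen::Tensor<float, 3> input(2, 3, 4);
  * input.setRandom();
  * Eigen::array<int, 3> shuffle{{2, 0, 1}};
  * // output(i, j, k) == input(j, k, i); output has dimensions (4, 2, 3).
  * Eigen::Tensor<float, 3> output = input.shuffle(shuffle);
  * \endcode
  */
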
namespace internal {
template<typename Shuffle, typename XprType>
struct traits<TensorShufflingOp<Shuffle, XprType> > : public traits<XprType>
{
  typedef typename XprType::Scalar Scalar;
  typedef traits<XprType> XprTraits;
  typedef typename XprTraits::StorageKind StorageKind;
  typedef typename XprTraits::Index Index;
  typedef typename XprType::Nested Nested;
  typedef std::remove_reference_t<Nested> Nested_;
  static constexpr int NumDimensions = XprTraits::NumDimensions;
  static constexpr int Layout = XprTraits::Layout;
  typedef typename XprTraits::PointerType PointerType;
};

template<typename Shuffle, typename XprType>
struct eval<TensorShufflingOp<Shuffle, XprType>, Eigen::Dense>
{
  typedef const TensorShufflingOp<Shuffle, XprType>& type;
};

template<typename Shuffle, typename XprType>
struct nested<TensorShufflingOp<Shuffle, XprType>, 1, typename eval<TensorShufflingOp<Shuffle, XprType> >::type>
{
  typedef TensorShufflingOp<Shuffle, XprType> type;
};

}  // end namespace internal


template<typename Shuffle, typename XprType>
class TensorShufflingOp : public TensorBase<TensorShufflingOp<Shuffle, XprType> >
{
  public:
    typedef TensorBase<TensorShufflingOp<Shuffle, XprType> > Base;
    typedef typename Eigen::internal::traits<TensorShufflingOp>::Scalar Scalar;
    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
    typedef typename XprType::CoeffReturnType CoeffReturnType;
    typedef typename Eigen::internal::nested<TensorShufflingOp>::type Nested;
    typedef typename Eigen::internal::traits<TensorShufflingOp>::StorageKind StorageKind;
    typedef typename Eigen::internal::traits<TensorShufflingOp>::Index Index;

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp(const XprType& expr, const Shuffle& shfl)
        : m_xpr(expr), m_shuffle(shfl) {}

    EIGEN_DEVICE_FUNC
    const Shuffle& shufflePermutation() const { return m_shuffle; }

    EIGEN_DEVICE_FUNC
    const internal::remove_all_t<typename XprType::Nested>&
    expression() const { return m_xpr; }

    EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorShufflingOp)

  protected:
    typename XprType::Nested m_xpr;
    const Shuffle m_shuffle;
};


// Eval as rvalue
template<typename Shuffle, typename ArgType, typename Device>
struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
{
  typedef TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> Self;
  typedef TensorShufflingOp<Shuffle, ArgType> XprType;
  typedef typename XprType::Index Index;
  static constexpr int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
  typedef StorageMemory<CoeffReturnType, Device> Storage;
  typedef typename Storage::Type EvaluatorPointerType;

  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
  enum {
    IsAligned = false,
    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
    BlockAccess = TensorEvaluator<ArgType, Device>::RawAccess,
    PreferBlockAccess = true,
    CoordAccess = false,  // to be implemented
    RawAccess = false
  };

  typedef std::remove_const_t<Scalar> ScalarNoConst;

  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;

  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
                                                     Layout, Index>
      TensorBlock;
  //===--------------------------------------------------------------------===//

  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
      : m_device(device),
        m_impl(op.expression(), device)
  {
    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
    const Shuffle& shuffle = op.shufflePermutation();
    m_is_identity = true;
    for (int i = 0; i < NumDims; ++i) {
      m_shuffle[i] = static_cast<int>(shuffle[i]);
      m_dimensions[i] = input_dims[shuffle[i]];
      m_inverseShuffle[shuffle[i]] = i;
      if (m_is_identity && shuffle[i] != i) {
        m_is_identity = false;
      }
    }

    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      m_unshuffledInputStrides[0] = 1;
      m_outputStrides[0] = 1;

      for (int i = 1; i < NumDims; ++i) {
        m_unshuffledInputStrides[i] =
            m_unshuffledInputStrides[i - 1] * input_dims[i - 1];
        m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(
            m_outputStrides[i] > 0 ? m_outputStrides[i] : Index(1));
      }
    } else {
      m_unshuffledInputStrides[NumDims - 1] = 1;
      m_outputStrides[NumDims - 1] = 1;
      for (int i = NumDims - 2; i >= 0; --i) {
        m_unshuffledInputStrides[i] =
            m_unshuffledInputStrides[i + 1] * input_dims[i + 1];
        m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(
            m_outputStrides[i] > 0 ? m_outputStrides[i] : Index(1));
      }
    }

    for (int i = 0; i < NumDims; ++i) {
      m_inputStrides[i] = m_unshuffledInputStrides[shuffle[i]];
    }
  }
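
  // Worked example of the stride setup above (illustrative only, assuming
  // ColMajor layout): for a 2x3 input and shuffle {1, 0}, the output is 3x2,
  // m_unshuffledInputStrides = {1, 2}, m_outputStrides = {1, 3}, and
  // m_inputStrides = {m_unshuffledInputStrides[1], m_unshuffledInputStrides[0]}
  //                = {2, 1}.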

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
    m_impl.evalSubExprsIfNeeded(NULL);
    return true;
  }

#ifdef EIGEN_USE_THREADS
  template <typename EvalSubExprsCallback>
  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
      EvaluatorPointerType, EvalSubExprsCallback done) {
    m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
  }
#endif  // EIGEN_USE_THREADS

  EIGEN_STRONG_INLINE void cleanup() {
    m_impl.cleanup();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    if (m_is_identity) {
      return m_impl.coeff(index);
    } else {
      return m_impl.coeff(srcCoeff(index));
    }
  }

  template <int LoadMode, typename Self, bool ImplPacketAccess>
  struct PacketLoader {
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    static PacketReturnType Run(const Self& self, Index index) {
      EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
      EIGEN_UNROLL_LOOP
      for (int i = 0; i < PacketSize; ++i) {
        values[i] = self.coeff(index + i);
      }
      PacketReturnType rslt = internal::pload<PacketReturnType>(values);
      return rslt;
    }
  };

  template<int LoadMode, typename Self>
  struct PacketLoader<LoadMode, Self, true> {
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    static PacketReturnType Run(const Self& self, Index index) {
      if (self.m_is_identity) {
        return self.m_impl.template packet<LoadMode>(index);
      } else {
        EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
        EIGEN_UNROLL_LOOP
        for (int i = 0; i < PacketSize; ++i) {
          values[i] = self.coeff(index + i);
        }
        PacketReturnType rslt = internal::pload<PacketReturnType>(values);
        return rslt;
      }
    }
  };

  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
  {
    eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
    return PacketLoader<LoadMode, Self, TensorEvaluator<ArgType, Device>::PacketAccess>::Run(*this, index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  internal::TensorBlockResourceRequirements getResourceRequirements() const {
    static const int inner_dim =
        Layout == static_cast<int>(ColMajor) ? 0 : NumDims - 1;

    const size_t target_size = m_device.firstLevelCacheSize();
    const bool inner_dim_shuffled = m_shuffle[inner_dim] != inner_dim;

    // A shuffled inner dimension leads to random memory accesses, which the
    // default cost model (bytes loaded/stored) does not capture, so we add
    // this cost explicitly. The number of cycles was picked based on
    // benchmarks.
    // TODO(ezhulenev): This number was picked based on very questionable
    // benchmarks; add benchmarks that are representative of real workloads.
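    // As an illustration of the arithmetic only: for a rank-3 shuffle with a
    // shuffled inner dimension, the branch below adds 3 * 28 = 84 cycles of
    // compute cost per coefficient.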
    using BlockRequirements = internal::TensorBlockResourceRequirements;
    if (inner_dim_shuffled) {
      return BlockRequirements::uniform<Scalar>(target_size)
          .addCostPerCoeff({0, 0, NumDims * 28});
    } else {
      return BlockRequirements::skewed<Scalar>(target_size);
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
        bool root_of_expr_ast = false) const {
    assert(m_impl.data() != NULL);

    typedef internal::TensorBlockIO<ScalarNoConst, Index, NumDims, Layout>
        TensorBlockIO;
    typedef typename TensorBlockIO::Dst TensorBlockIODst;
    typedef typename TensorBlockIO::Src TensorBlockIOSrc;

    const typename TensorBlock::Storage block_storage =
        TensorBlock::prepareStorage(
            desc, scratch, /*allow_strided_storage=*/root_of_expr_ast);

    typename TensorBlockIO::Dimensions input_strides(m_unshuffledInputStrides);
    TensorBlockIOSrc src(input_strides, m_impl.data(), srcCoeff(desc.offset()));

    TensorBlockIODst dst(block_storage.dimensions(), block_storage.strides(),
                         block_storage.data());

    typename TensorBlockIO::DimensionsMap dst_to_src_dim_map(m_shuffle);
    TensorBlockIO::Copy(dst, src, dst_to_src_dim_map);

    return block_storage.AsTensorMaterializedBlock();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    const double compute_cost = m_is_identity ? TensorOpCost::AddCost<Index>() :
                                NumDims * (2 * TensorOpCost::AddCost<Index>() +
                                           2 * TensorOpCost::MulCost<Index>() +
                                           TensorOpCost::DivCost<Index>());
    return m_impl.costPerCoeff(vectorized) +
           TensorOpCost(0, 0, compute_cost, m_is_identity /* vectorized */, PacketSize);
  }

  EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }

#ifdef EIGEN_USE_SYCL
  // binding placeholder accessors to a command group handler for SYCL
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
    m_impl.bind(cgh);
  }
#endif
 protected:
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index GetBlockOutputIndex(
      Index input_index,
      const DSizes<Index, NumDims>& input_block_strides,
      const DSizes<Index, NumDims>& output_block_strides,
      const DSizes<internal::TensorIntDivisor<Index>, NumDims>& fast_input_block_strides) const {
    Index output_index = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx = input_index / fast_input_block_strides[i];
        output_index += idx * output_block_strides[m_inverseShuffle[i]];
        input_index -= idx * input_block_strides[i];
      }
      return output_index + input_index *
             output_block_strides[m_inverseShuffle[0]];
    } else {
      for (int i = 0; i < NumDims - 1; ++i) {
        const Index idx = input_index / fast_input_block_strides[i];
        output_index += idx * output_block_strides[m_inverseShuffle[i]];
        input_index -= idx * input_block_strides[i];
      }
      return output_index + input_index *
             output_block_strides[m_inverseShuffle[NumDims - 1]];
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const {
    Index inputIndex = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx = index / m_fastOutputStrides[i];
        inputIndex += idx * m_inputStrides[i];
        index -= idx * m_outputStrides[i];
      }
      return inputIndex + index * m_inputStrides[0];
    } else {
      for (int i = 0; i < NumDims - 1; ++i) {
        const Index idx = index / m_fastOutputStrides[i];
        inputIndex += idx * m_inputStrides[i];
        index -= idx * m_outputStrides[i];
      }
      return inputIndex + index * m_inputStrides[NumDims - 1];
    }
  }
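
  // Worked example for srcCoeff (illustrative only, continuing the ColMajor
  // 2x3 input with shuffle {1, 0} from the constructor comment): output
  // index 4 has output coordinates (1, 1); the loop computes idx = 4 / 3 = 1,
  // inputIndex = 1 * m_inputStrides[1] = 1, index = 4 - 3 = 1, and returns
  // 1 + 1 * m_inputStrides[0] = 3, i.e. input coordinates (1, 1).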

  Dimensions m_dimensions;
  bool m_is_identity;
  array<int, NumDims> m_shuffle;
  array<Index, NumDims> m_inverseShuffle;  // TODO(ezhulenev): Make it int type.
  array<Index, NumDims> m_outputStrides;
  array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
  array<Index, NumDims> m_inputStrides;
  array<Index, NumDims> m_unshuffledInputStrides;

  const Device EIGEN_DEVICE_REF m_device;
  TensorEvaluator<ArgType, Device> m_impl;
};


// Eval as lvalue
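// The lvalue evaluator makes a shuffled view assignable, e.g. (a minimal
// usage sketch, not part of this header):
//
//   Eigen::Tensor<float, 2> input(20, 30);
//   Eigen::Tensor<float, 2> output(30, 20);
//   Eigen::array<int, 2> shuffle{{1, 0}};
//   output.shuffle(shuffle) = input;  // writes through the TensorShufflingOp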
template<typename Shuffle, typename ArgType, typename Device>
struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
    : public TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
{
  typedef TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> Base;

  typedef TensorShufflingOp<Shuffle, ArgType> XprType;
  typedef typename XprType::Index Index;
  static constexpr int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;

  enum {
    IsAligned = false,
    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
    BlockAccess = TensorEvaluator<ArgType, Device>::RawAccess,
    PreferBlockAccess = true,
    RawAccess = false
  };

  typedef std::remove_const_t<Scalar> ScalarNoConst;

  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
  //===--------------------------------------------------------------------===//

  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
      : Base(op, device)
  { }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
  {
    return this->m_impl.coeffRef(this->srcCoeff(index));
  }

  template <int StoreMode> EIGEN_STRONG_INLINE
  void writePacket(Index index, const PacketReturnType& x)
  {
    EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
    internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
    EIGEN_UNROLL_LOOP
    for (int i = 0; i < PacketSize; ++i) {
      this->coeffRef(index + i) = values[i];
    }
  }

  template <typename TensorBlock>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
      const TensorBlockDesc& desc, const TensorBlock& block) {
    eigen_assert(this->m_impl.data() != NULL);

    typedef internal::TensorBlockIO<ScalarNoConst, Index, NumDims, Layout>
        TensorBlockIO;
    typedef typename TensorBlockIO::Dst TensorBlockIODst;
    typedef typename TensorBlockIO::Src TensorBlockIOSrc;

    const Scalar* block_buffer = block.data();

    // TODO(ezhulenev): TensorBlockIO should be able to read from any Eigen
    // expression with coefficient and packet access as `src`.
    void* mem = NULL;
    if (block_buffer == NULL) {
      mem = this->m_device.allocate(desc.size() * sizeof(Scalar));
      ScalarNoConst* buf = static_cast<ScalarNoConst*>(mem);

      typedef internal::TensorBlockAssignment<
          ScalarNoConst, NumDims, typename TensorBlock::XprType, Index>
          TensorBlockAssignment;

      TensorBlockAssignment::Run(
          TensorBlockAssignment::target(
              desc.dimensions(), internal::strides<Layout>(desc.dimensions()),
              buf),
          block.expr());

      block_buffer = buf;
    }

    // Read from block.
    TensorBlockIOSrc src(internal::strides<Layout>(desc.dimensions()),
                         block_buffer);

    // Write to the output buffer.
    typename TensorBlockIO::Dimensions output_strides(
        this->m_unshuffledInputStrides);
    typename TensorBlockIO::Dimensions output_dimensions;
    for (int i = 0; i < NumDims; ++i) {
      output_dimensions[this->m_shuffle[i]] = desc.dimension(i);
    }
    TensorBlockIODst dst(output_dimensions, output_strides, this->m_impl.data(),
                         this->srcCoeff(desc.offset()));

    // Reorder dimensions according to the shuffle.
    typename TensorBlockIO::DimensionsMap dst_to_src_dim_map;
    for (int i = 0; i < NumDims; ++i) {
      dst_to_src_dim_map[i] = static_cast<int>(this->m_inverseShuffle[i]);
    }
    TensorBlockIO::Copy(dst, src, dst_to_src_dim_map);

    // Deallocate temporary buffer used for the block materialization.
    if (mem != NULL) this->m_device.deallocate(mem);
  }
};


}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H