Eigen-unsupported  3.4.90 (git rev 67eeba6e720c5745abc77ae6c92ce0a44aa7b7ae)
TensorBroadcasting.h
1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5 //
6 // This Source Code Form is subject to the terms of the Mozilla
7 // Public License v. 2.0. If a copy of the MPL was not distributed
8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 
10 #ifndef EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H
11 #define EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H
12 
13 #include "./InternalHeaderCheck.h"
14 
15 namespace Eigen {
16 
24 namespace internal {
25 template<typename Broadcast, typename XprType>
26 struct traits<TensorBroadcastingOp<Broadcast, XprType> > : public traits<XprType>
27 {
28  typedef typename XprType::Scalar Scalar;
29  typedef traits<XprType> XprTraits;
30  typedef typename XprTraits::StorageKind StorageKind;
31  typedef typename XprTraits::Index Index;
32  typedef typename XprType::Nested Nested;
33  typedef std::remove_reference_t<Nested> Nested_;
34  static constexpr int NumDimensions = XprTraits::NumDimensions;
35  static constexpr int Layout = XprTraits::Layout;
36  typedef typename XprTraits::PointerType PointerType;
37 };
38 
39 template<typename Broadcast, typename XprType>
40 struct eval<TensorBroadcastingOp<Broadcast, XprType>, Eigen::Dense>
41 {
42  typedef const TensorBroadcastingOp<Broadcast, XprType> EIGEN_DEVICE_REF type;
43 };
44 
45 template<typename Broadcast, typename XprType>
46 struct nested<TensorBroadcastingOp<Broadcast, XprType>, 1, typename eval<TensorBroadcastingOp<Broadcast, XprType> >::type>
47 {
48  typedef TensorBroadcastingOp<Broadcast, XprType> type;
49 };
50 
51 template <typename Dims>
52 struct is_input_scalar {
53  static const bool value = false;
54 };
55 template <>
56 struct is_input_scalar<Sizes<> > {
57  static const bool value = true;
58 };
59 #ifndef EIGEN_EMULATE_CXX11_META_H
60 template <typename std::ptrdiff_t... Indices>
61 struct is_input_scalar<Sizes<Indices...> > {
62  static const bool value = (Sizes<Indices...>::total_size == 1);
63 };
64 #endif
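// A minimal sketch of what this trait evaluates to for a few fixed-size shapes
// (assuming the non-emulated Sizes<> path above):
//
//   static_assert(is_input_scalar<Sizes<> >::value, "rank-0 input");
//   static_assert(is_input_scalar<Sizes<1, 1> >::value, "statically 1x1 input");
//   static_assert(!is_input_scalar<Sizes<2, 3> >::value, "2x3 input is not a scalar");
//
// Dynamically sized dimensions (e.g. DSizes) fall back to the primary template
// and are never treated as scalar inputs.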
65 
66 } // end namespace internal
67 
68 
69 
70 template<typename Broadcast, typename XprType>
71 class TensorBroadcastingOp : public TensorBase<TensorBroadcastingOp<Broadcast, XprType>, ReadOnlyAccessors>
72 {
73  public:
74  typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Scalar Scalar;
75  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
76  typedef typename XprType::CoeffReturnType CoeffReturnType;
77  typedef typename Eigen::internal::nested<TensorBroadcastingOp>::type Nested;
78  typedef typename Eigen::internal::traits<TensorBroadcastingOp>::StorageKind StorageKind;
79  typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Index Index;
80 
81  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBroadcastingOp(const XprType& expr, const Broadcast& broadcast)
82  : m_xpr(expr), m_broadcast(broadcast) {}
83 
84  EIGEN_DEVICE_FUNC
85  const Broadcast& broadcast() const { return m_broadcast; }
86 
87  EIGEN_DEVICE_FUNC
88  const internal::remove_all_t<typename XprType::Nested>&
89  expression() const { return m_xpr; }
90 
91  protected:
92  typename XprType::Nested m_xpr;
93  const Broadcast m_broadcast;
94 };
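// In user code this expression is normally created through TensorBase::broadcast().
// A minimal usage sketch (assuming <unsupported/Eigen/CXX11/Tensor> is included):
// each output dimension is the input dimension times the corresponding broadcast
// factor, so a 2x3 tensor broadcast by {2, 3} yields a 4x9 tensor that tiles the
// input:
//
//   Eigen::Tensor<float, 2> in(2, 3);
//   in.setRandom();
//   Eigen::array<Eigen::Index, 2> bcast = {2, 3};
//   Eigen::Tensor<float, 2> out = in.broadcast(bcast);  // out is 4x9
//   // out(i, j) == in(i % 2, j % 3)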
95 
96 
97 // Eval as rvalue
98 template<typename Broadcast, typename ArgType, typename Device>
99 struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
100 {
101  typedef TensorBroadcastingOp<Broadcast, ArgType> XprType;
102  typedef typename XprType::Index Index;
103  static constexpr int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
104  typedef DSizes<Index, NumDims> Dimensions;
105  typedef typename XprType::Scalar Scalar;
106  typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
107  typedef typename XprType::CoeffReturnType CoeffReturnType;
108  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
109  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
110  protected:  // All non-static fields must have the same access control; otherwise the TensorEvaluator won't be standard layout.
111  bool isCopy, nByOne, oneByN;
112  public:
113  typedef StorageMemory<CoeffReturnType, Device> Storage;
114  typedef typename Storage::Type EvaluatorPointerType;
115 
116  enum {
117  IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
118  PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
119  BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
120  PreferBlockAccess = true,
121  RawAccess = false
122  };
123  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
124 
125  typedef std::remove_const_t<Scalar> ScalarNoConst;
126 
127  // We do block-based broadcasting using a trick with 2x the tensor rank and
128  // 0 strides. See the block method implementation for details.
129  typedef DSizes<Index, 2 * NumDims> BroadcastDimensions;
130 
131  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
132  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
133  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
134 
135  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
136  ArgTensorBlock;
137 
138  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
139  Layout, Index>
140  TensorBlock;
141  //===--------------------------------------------------------------------===//
142 
143  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
144  : isCopy(false), nByOne(false), oneByN(false),
145  m_device(device), m_broadcast(op.broadcast()), m_impl(op.expression(), device)
146  {
147 
148  // The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar
149  // and store the result in a scalar. Instead one should first reshape the scalar into an N-D
150  // tensor (N >= 1) with a single element and then broadcast.
151  EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
152  const InputDimensions& input_dims = m_impl.dimensions();
153  isCopy = true;
154  for (int i = 0; i < NumDims; ++i) {
155  eigen_assert(input_dims[i] > 0);
156  m_dimensions[i] = input_dims[i] * m_broadcast[i];
157  if (m_broadcast[i] != 1) {
158  isCopy = false;
159  }
160  }
161 
162  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
163  m_inputStrides[0] = 1;
164  m_outputStrides[0] = 1;
165  for (int i = 1; i < NumDims; ++i) {
166  m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
167  m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
168  }
169  } else {
170  m_inputStrides[NumDims-1] = 1;
171  m_outputStrides[NumDims-1] = 1;
172  for (int i = NumDims-2; i >= 0; --i) {
173  m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
174  m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1];
175  }
176  }
177 
178  if (input_dims[0] == 1) {
179  oneByN = true;
180  for (int i = 1; i < NumDims; ++i) {
181  if (m_broadcast[i] != 1) {
182  oneByN = false;
183  break;
184  }
185  }
186  } else if (input_dims[NumDims-1] == 1) {
187  nByOne = true;
188  for (int i = 0; i < NumDims-1; ++i) {
189  if (m_broadcast[i] != 1) {
190  nByOne = false;
191  break;
192  }
193  }
194  }
195 
196  // Handle special formats like NCHW, whose input shape is '[1, N..., 1]' and
197  // broadcast shape is '[N, 1..., N]'.
198  if (!oneByN && !nByOne) {
199  if (input_dims[0] == 1 && input_dims[NumDims-1] == 1 && NumDims > 2) {
200  nByOne = true;
201  oneByN = true;
202  for (int i = 1; i < NumDims-1; ++i) {
203  if (m_broadcast[i] != 1) {
204  nByOne = false;
205  oneByN = false;
206  break;
207  }
208  }
209  }
210  }
211  }
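// As a worked example of the setup above (col-major, NumDims == 2): for
// input_dims = [2, 3] and m_broadcast = [2, 3] the constructor computes
//   m_dimensions    = [4, 9]   (input_dims[i] * m_broadcast[i])
//   m_inputStrides  = [1, 2]   (strides of the 2x3 input)
//   m_outputStrides = [1, 4]   (strides of the 4x9 output)
// and isCopy, oneByN and nByOne all stay false, since both broadcast factors
// differ from 1 and neither input dimension equals 1.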
212 
213  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
214 
215  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
216  m_impl.evalSubExprsIfNeeded(NULL);
217  return true;
218  }
219 
220 #ifdef EIGEN_USE_THREADS
221  template <typename EvalSubExprsCallback>
222  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
223  EvaluatorPointerType, EvalSubExprsCallback done) {
224  m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
225  }
226 #endif // EIGEN_USE_THREADS
227 
228  EIGEN_STRONG_INLINE void cleanup() {
229  m_impl.cleanup();
230  }
231 
232  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const
233  {
234  if (internal::is_input_scalar<internal::remove_all_t<InputDimensions>>::value) {
235  return m_impl.coeff(0);
236  }
237 
238  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
239  if (isCopy) {
240  return m_impl.coeff(index);
241  } else {
242  return coeffColMajor(index);
243  }
244  } else {
245  if (isCopy) {
246  return m_impl.coeff(index);
247  } else {
248  return coeffRowMajor(index);
249  }
250  }
251  }
252 
253  // TODO: attempt to speed this up. The integer divisions and modulo are slow
254  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexColMajor(Index index) const {
255  Index inputIndex = 0;
256  EIGEN_UNROLL_LOOP
257  for (int i = NumDims - 1; i > 0; --i) {
258  const Index idx = index / m_outputStrides[i];
259  if (internal::index_statically_eq<Broadcast>(i, 1)) {
260  eigen_assert(idx < m_impl.dimensions()[i]);
261  inputIndex += idx * m_inputStrides[i];
262  } else {
263  if (internal::index_statically_eq<InputDimensions>(i, 1)) {
264  eigen_assert(idx % m_impl.dimensions()[i] == 0);
265  } else {
266  inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
267  }
268  }
269  index -= idx * m_outputStrides[i];
270  }
271  if (internal::index_statically_eq<Broadcast>(0, 1)) {
272  eigen_assert(index < m_impl.dimensions()[0]);
273  inputIndex += index;
274  } else {
275  if (internal::index_statically_eq<InputDimensions>(0, 1)) {
276  eigen_assert(index % m_impl.dimensions()[0] == 0);
277  } else {
278  inputIndex += (index % m_impl.dimensions()[0]);
279  }
280  }
281  return inputIndex;
282  }
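// Continuing the 2x3 -> 4x9 col-major example: for output index 7 (row 3,
// column 1 of the 4x9 result) the loop handles dimension 1 first,
//   idx = 7 / m_outputStrides[1] = 1,  inputIndex += (1 % 3) * m_inputStrides[1] = 2,
//   index -= 1 * 4 = 3,
// and the innermost dimension then contributes 3 % 2 = 1, so inputIndex == 3,
// i.e. coefficient (1, 1) of the input, matching out(3, 1) == in(3 % 2, 1 % 3).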
283 
284  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const
285  {
286  return m_impl.coeff(indexColMajor(index));
287  }
288 
289  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexRowMajor(Index index) const {
290  Index inputIndex = 0;
291  EIGEN_UNROLL_LOOP
292  for (int i = 0; i < NumDims - 1; ++i) {
293  const Index idx = index / m_outputStrides[i];
294  if (internal::index_statically_eq<Broadcast>(i, 1)) {
295  eigen_assert(idx < m_impl.dimensions()[i]);
296  inputIndex += idx * m_inputStrides[i];
297  } else {
298  if (internal::index_statically_eq<InputDimensions>(i, 1)) {
299  eigen_assert(idx % m_impl.dimensions()[i] == 0);
300  } else {
301  inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
302  }
303  }
304  index -= idx * m_outputStrides[i];
305  }
306  if (internal::index_statically_eq<Broadcast>(NumDims - 1, 1)) {
307  eigen_assert(index < m_impl.dimensions()[NumDims - 1]);
308  inputIndex += index;
309  } else {
310  if (internal::index_statically_eq<InputDimensions>(NumDims - 1, 1)) {
311  eigen_assert(index % m_impl.dimensions()[NumDims - 1] == 0);
312  } else {
313  inputIndex += (index % m_impl.dimensions()[NumDims - 1]);
314  }
315  }
316  return inputIndex;
317  }
318 
319  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const
320  {
321  return m_impl.coeff(indexRowMajor(index));
322  }
323 
324  template<int LoadMode>
325  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const
326  {
327  if (internal::is_input_scalar<internal::remove_all_t<InputDimensions>>::value) {
328  return internal::pset1<PacketReturnType>(m_impl.coeff(0));
329  }
330 
331  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
332  if (isCopy) {
333  #ifdef EIGEN_GPU_COMPILE_PHASE
334  // See PR 437: on NVIDIA P100 and K20m we observed a 3-4x speed-up by enforcing
335  // unaligned loads here. The reason is unclear though.
336  return m_impl.template packet<Unaligned>(index);
337  #else
338  return m_impl.template packet<LoadMode>(index);
339  #endif
340  } else if (oneByN && !nByOne) {
341  return packetNByOne<LoadMode>(index);
342  } else if (!oneByN && nByOne) {
343  return packetOneByN<LoadMode>(index);
344  } else if (oneByN && nByOne) {
345  return packetOneByNByOne<LoadMode>(index);
346  } else {
347  return packetColMajor<LoadMode>(index);
348  }
349  } else {
350  if (isCopy) {
351  #ifdef EIGEN_GPU_COMPILE_PHASE
352  // See above.
353  return m_impl.template packet<Unaligned>(index);
354  #else
355  return m_impl.template packet<LoadMode>(index);
356  #endif
357  } else if (oneByN && !nByOne) {
358  return packetOneByN<LoadMode>(index);
359  } else if (!oneByN && nByOne) {
360  return packetNByOne<LoadMode>(index);
361  } else if (oneByN && nByOne) {
362  return packetOneByNByOne<LoadMode>(index);
363  } else {
364  return packetRowMajor<LoadMode>(index);
365  }
366  }
367  }
368 
369  template<int LoadMode>
370  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByNByOne
371  (Index index) const
372  {
373  eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
374 
375  EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
376  Index startDim, endDim;
377  Index inputIndex, outputOffset, batchedIndex;
378 
379  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
380  startDim = NumDims - 1;
381  endDim = 1;
382  } else {
383  startDim = 0;
384  endDim = NumDims - 2;
385  }
386 
387  batchedIndex = index % m_outputStrides[startDim];
388  inputIndex = batchedIndex / m_outputStrides[endDim];
389  outputOffset = batchedIndex % m_outputStrides[endDim];
390 
391  if (outputOffset + PacketSize <= m_outputStrides[endDim]) {
392  values[0] = m_impl.coeff(inputIndex);
393  return internal::pload1<PacketReturnType>(values);
394  } else {
395  EIGEN_UNROLL_LOOP
396  for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) {
397  if (outputOffset + cur < m_outputStrides[endDim]) {
398  values[i] = m_impl.coeff(inputIndex);
399  } else {
400  ++inputIndex;
401  inputIndex = (inputIndex == m_inputStrides[startDim] ? 0 : inputIndex);
402  values[i] = m_impl.coeff(inputIndex);
403  outputOffset = 0;
404  cur = 0;
405  }
406  }
407  return internal::pload<PacketReturnType>(values);
408  }
409  }
410 
411  template<int LoadMode>
412  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByN(Index index) const
413  {
414  // Consider the flattened tensor [v0, ..., vN]; this concatenates m_broadcast[dim]
415  // copies of it,
416  //   [v0, ..., vN, v0, ..., vN, ...]
417  // with dim == NumDims - 1 for col-major, dim == 0 for row-major.
418  eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
419 
420  // Size of flattened tensor.
421  const Index M = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ?
422  m_inputStrides[NumDims - 1] : m_inputStrides[0];
423  Index inputIndex = index % M;
424  if (inputIndex + PacketSize <= M) {
425  return m_impl.template packet<Unaligned>(inputIndex);
426  } else {
427  EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
428  EIGEN_UNROLL_LOOP
429  for (int i = 0; i < PacketSize; ++i) {
430  if (inputIndex > M - 1) {
431  inputIndex = 0;
432  }
433  values[i] = m_impl.coeff(inputIndex++);
434  }
435  return internal::pload<PacketReturnType>(values);
436  }
437  }
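// A small sketch of the gather path above: with a flattened input of size
// M == 3 ([v0, v1, v2]), PacketSize == 4 and index == 2, inputIndex starts at
// 2 % 3 == 2; since 2 + 4 > 3, the scalar loop wraps around and assembles the
// packet [v2, v0, v1, v2] before loading it with pload.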
438 
439  template<int LoadMode>
440  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetNByOne(Index index) const
441  {
442  // Consider the flattened tensor [v0, ..., vN]; this interleaves m_broadcast[dim]
443  // copies of each coefficient,
444  //   [v0, v0, ..., v1, v1, ..., vN, vN, ...]
445  // with dim == 0 for col-major, dim == NumDims - 1 for row-major.
446  eigen_assert(index + PacketSize-1 < dimensions().TotalSize());
447 
448  const Index M = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ?
449  m_broadcast[0] : m_broadcast[NumDims - 1];
450 
451  Index inputIndex = index / M;
452  Index outputOffset = index % M;
453  if (outputOffset + PacketSize <= M) {
454  return internal::pset1<PacketReturnType>(m_impl.coeff(inputIndex));
455  } else {
456  EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
457  EIGEN_UNROLL_LOOP
458  for (int i = 0; i < PacketSize; ++i) {
459  if (outputOffset < M) {
460  values[i] = m_impl.coeff(inputIndex);
461  ++outputOffset;
462  } else {
463  values[i] = m_impl.coeff(++inputIndex);
464  outputOffset = 1; // Next offset.
465  }
466  }
467  return internal::pload<PacketReturnType>(values);
468  }
469  }
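// A matching sketch for the interleaved case: with M == 3 copies of each input
// coefficient ([v0, v0, v0, v1, v1, v1, ...]), PacketSize == 4 and index == 2,
// inputIndex == 0 and outputOffset == 2, so the scalar loop assembles the
// packet [v0, v1, v1, v1].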
470 
471  // Ignore the LoadMode and always use unaligned loads since we can't guarantee
472  // the alignment at compile time.
473  template<int LoadMode>
474  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const
475  {
476  eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
477 
478  const Index originalIndex = index;
479 
480  Index inputIndex = 0;
481  EIGEN_UNROLL_LOOP
482  for (int i = NumDims - 1; i > 0; --i) {
483  const Index idx = index / m_outputStrides[i];
484  if (internal::index_statically_eq<Broadcast>(i, 1)) {
485  eigen_assert(idx < m_impl.dimensions()[i]);
486  inputIndex += idx * m_inputStrides[i];
487  } else {
488  if (internal::index_statically_eq<InputDimensions>(i, 1)) {
489  eigen_assert(idx % m_impl.dimensions()[i] == 0);
490  } else {
491  inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
492  }
493  }
494  index -= idx * m_outputStrides[i];
495  }
496  Index innermostLoc;
497  if (internal::index_statically_eq<Broadcast>(0, 1)) {
498  eigen_assert(index < m_impl.dimensions()[0]);
499  innermostLoc = index;
500  } else {
501  if (internal::index_statically_eq<InputDimensions>(0, 1)) {
502  eigen_assert(index % m_impl.dimensions()[0] == 0);
503  innermostLoc = 0;
504  } else {
505  innermostLoc = index % m_impl.dimensions()[0];
506  }
507  }
508  inputIndex += innermostLoc;
509 
510  // TODO: this could be extended to the second dimension if we're not
511  // broadcasting along the first dimension, and so on.
512  if (innermostLoc + PacketSize <= m_impl.dimensions()[0]) {
513  return m_impl.template packet<Unaligned>(inputIndex);
514  } else {
515  EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
516  values[0] = m_impl.coeff(inputIndex);
517  EIGEN_UNROLL_LOOP
518  for (int i = 1; i < PacketSize; ++i) {
519  if (innermostLoc + i < m_impl.dimensions()[0]) {
520  values[i] = m_impl.coeff(inputIndex+i);
521  } else {
522  values[i] = coeffColMajor(originalIndex+i);
523  }
524  }
525  PacketReturnType rslt = internal::pload<PacketReturnType>(values);
526  return rslt;
527  }
528  }
529 
530  template<int LoadMode>
531  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const
532  {
533  eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
534 
535  const Index originalIndex = index;
536 
537  Index inputIndex = 0;
538  EIGEN_UNROLL_LOOP
539  for (int i = 0; i < NumDims - 1; ++i) {
540  const Index idx = index / m_outputStrides[i];
541  if (internal::index_statically_eq<Broadcast>(i, 1)) {
542  eigen_assert(idx < m_impl.dimensions()[i]);
543  inputIndex += idx * m_inputStrides[i];
544  } else {
545  if (internal::index_statically_eq<InputDimensions>(i, 1)) {
546  eigen_assert(idx % m_impl.dimensions()[i] == 0);
547  } else {
548  inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
549  }
550  }
551  index -= idx * m_outputStrides[i];
552  }
553  Index innermostLoc;
554  if (internal::index_statically_eq<Broadcast>(NumDims-1, 1)) {
555  eigen_assert(index < m_impl.dimensions()[NumDims-1]);
556  innermostLoc = index;
557  } else {
558  if (internal::index_statically_eq<InputDimensions>(NumDims-1, 1)) {
559  eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0);
560  innermostLoc = 0;
561  } else {
562  innermostLoc = index % m_impl.dimensions()[NumDims-1];
563  }
564  }
565  inputIndex += innermostLoc;
566 
567  // TODO: this could be extended to the second dimension if we're not
568  // broadcasting along the first dimension, and so on.
569  if (innermostLoc + PacketSize <= m_impl.dimensions()[NumDims-1]) {
570  return m_impl.template packet<Unaligned>(inputIndex);
571  } else {
572  EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
573  values[0] = m_impl.coeff(inputIndex);
574  EIGEN_UNROLL_LOOP
575  for (int i = 1; i < PacketSize; ++i) {
576  if (innermostLoc + i < m_impl.dimensions()[NumDims-1]) {
577  values[i] = m_impl.coeff(inputIndex+i);
578  } else {
579  values[i] = coeffRowMajor(originalIndex+i);
580  }
581  }
582  PacketReturnType rslt = internal::pload<PacketReturnType>(values);
583  return rslt;
584  }
585  }
586 
587  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
588  costPerCoeff(bool vectorized) const {
589  double compute_cost = TensorOpCost::AddCost<Index>();
590  if (!isCopy && NumDims > 0) {
591  EIGEN_UNROLL_LOOP
592  for (int i = NumDims - 1; i > 0; --i) {
593  compute_cost += TensorOpCost::DivCost<Index>();
594  if (internal::index_statically_eq<Broadcast>(i, 1)) {
595  compute_cost +=
596  TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
597  } else {
598  if (!internal::index_statically_eq<InputDimensions>(i, 1)) {
599  compute_cost += TensorOpCost::MulCost<Index>() +
600  TensorOpCost::ModCost<Index>() +
601  TensorOpCost::AddCost<Index>();
602  }
603  }
604  compute_cost +=
605  TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
606  }
607  }
608  return m_impl.costPerCoeff(vectorized) +
609  TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
610  }
611 
612  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
613  internal::TensorBlockResourceRequirements getResourceRequirements() const {
614  // TODO(wuke): Targeting L1 size is 30% faster than targeting L{-1} on large
615  // tensors. But this might need further tuning.
616  const size_t target_size = m_device.firstLevelCacheSize();
617  return internal::TensorBlockResourceRequirements::merge(
618  m_impl.getResourceRequirements(),
619  internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size));
620  }
621 
622  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
623  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
624  bool /*root_of_expr_ast*/ = false) const {
625  BlockBroadcastingParams params = blockBroadcastingParams(desc);
626 
627  if (params.inner_dim_size == 0 || params.bcast_dim_size == 0) {
628  return emptyBlock();
629  }
630 
631  // Prepare storage for the materialized broadcasting result.
632  const typename TensorBlock::Storage block_storage =
633  TensorBlock::prepareStorage(desc, scratch);
634  ScalarNoConst* materialized_output = block_storage.data();
635 
636  // We may need to materialize input blocks.
637  size_t materialized_input_size = 0;
638  ScalarNoConst* materialized_input = NULL;
639 
640  // Initialize block broadcasting iterator state for the outer dimensions (outer
641  // with regard to the bcast dimension). Dimensions in this array are always in
642  // inner-most -> outer-most order (col-major layout).
643  array<BlockBroadcastingIteratorState, NumDims> it;
644  int idx = 0;
645 
646  for (int i = params.inner_dim_count + 1; i < NumDims; ++i) {
647  const Index dim = IsColMajor ? i : NumDims - 1 - i;
648  it[idx].size = params.output_dims[dim];
649  it[idx].count = 0;
650  it[idx].output_stride = m_outputStrides[dim];
651  it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
652  idx++;
653  }
654 
655  // Write output into the beginning of `materialized_output`.
656  Index output_offset = 0;
657 
658  // We will fill the output block by broadcasting along the bcast dim and
659  // iterating over the outer dimensions.
660  const Index output_size = NumDims == 0 ? 1 : params.output_dims.TotalSize();
661 
662  for (Index num_output_coeffs = 0; num_output_coeffs < output_size;) {
663  ScalarNoConst* bcast_output = materialized_output + num_output_coeffs;
664  Index bcast_offset = desc.offset() + output_offset;
665 
666  // Broadcast along the bcast dimension.
667  num_output_coeffs += BroadcastBlockAlongBcastDim(
668  params, bcast_offset, scratch, bcast_output, &materialized_input,
669  &materialized_input_size);
670 
671  // Switch to the next outer dimension.
672  for (int j = 0; j < idx; ++j) {
673  if (++it[j].count < it[j].size) {
674  output_offset += it[j].output_stride;
675  break;
676  }
677  it[j].count = 0;
678  output_offset -= it[j].output_span;
679  }
680  }
681 
682  return block_storage.AsTensorMaterializedBlock();
683  }
684 
685  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
686 
687  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
688 
689  Broadcast functor() const { return m_broadcast; }
690 #ifdef EIGEN_USE_SYCL
691  // Bind placeholder accessors to a SYCL command group handler.
692  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(
693  cl::sycl::handler& cgh) const {
694  m_impl.bind(cgh);
695  }
696 #endif
697  private:
698  static constexpr bool IsColMajor =
699  static_cast<int>(Layout) == static_cast<int>(ColMajor);
700 
701  // We will build the general-case block broadcasting on top of a broadcasting
702  // primitive that broadcasts only the inner dimension(s), up to the first
703  // dimension where the block size is smaller than the input size (called `bcast_dim`).
704  //
705  // Example:
706  // dim: 0 1 2 (ColMajor)
707  // input size: [9, 3, 6]
708  // block size: [9, 2, 6]
709  //
710  // We will compute broadcasted block by iterating over the outer dimensions
711  // before `bcast_dim` (only dimension `2` in this example) and computing
712  // broadcasts along the `bcast_dim` (dimension `1` in this example).
713 
714  // BlockBroadcastingParams holds precomputed parameters for broadcasting a
715  // single block along the broadcasting dimension. Sizes and strides along the
716  // `bcast_dim` might be invalid; they will be adjusted later in
717  // `BroadcastBlockAlongBcastDim`.
718  struct BlockBroadcastingParams {
719  Dimensions input_dims; // input expression dimensions
720  Dimensions output_dims; // output block sizes
721  Dimensions output_strides; // output block strides
722 
723  int inner_dim_count; // count inner dimensions matching in size
724  int bcast_dim; // broadcasting dimension index
725  Index bcast_dim_size; // broadcasting dimension size
726  Index inner_dim_size; // inner dimensions size
727 
728  // Block sizes and strides for the input block where all dimensions before
729  // `bcast_dim` are equal to `1`.
730  Dimensions input_block_sizes;
731  Dimensions input_block_strides;
732 
733  // Block sizes and strides for blocks with extra dimensions and strides `0`.
734  BroadcastDimensions bcast_block_sizes;
735  BroadcastDimensions bcast_block_strides;
736  BroadcastDimensions bcast_input_strides;
737  };
738 
739  struct BlockBroadcastingIteratorState {
740  Index size;
741  Index count;
742  Index output_stride;
743  Index output_span;
744  };
745 
746  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlockBroadcastingParams
747  blockBroadcastingParams(TensorBlockDesc& desc) const {
748  BlockBroadcastingParams params;
749 
750  params.input_dims = Dimensions(m_impl.dimensions());
751 
752  // Output block sizes and strides.
753  params.output_dims = desc.dimensions();
754  params.output_strides = internal::strides<Layout>(params.output_dims);
755 
756  // Find the broadcasting dimension (the first dimension with an output size smaller
757  // than the input size).
758  params.bcast_dim = 0;
759  params.bcast_dim_size = 1;
760  params.inner_dim_size = 1;
761 
762  // Count the number of inner dimensions that have the same size in the block
763  // and in the broadcast expression.
764  params.inner_dim_count = 0;
765 
766  for (int i = 0; i < NumDims; ++i) {
767  const int dim = IsColMajor ? i : NumDims - i - 1;
768 
769  if (params.output_dims[dim] == m_dimensions[dim]) {
770  params.inner_dim_size *= params.output_dims[dim];
771  ++params.inner_dim_count;
772  continue;
773  }
774 
775  // First non-matching dimension is the broadcasting dimension.
776  eigen_assert(params.output_dims[dim] < m_dimensions[dim]);
777  params.bcast_dim = dim;
778  params.bcast_dim_size = params.output_dims[dim];
779  break;
780  }
781 
782  // Calculate the input block size for looking into the input.
783  for (int i = 0; i < params.inner_dim_count; ++i) {
784  const int dim = IsColMajor ? i : NumDims - i - 1;
785  params.input_block_sizes[dim] = params.input_dims[dim];
786  }
787  for (int i = params.inner_dim_count; i < NumDims; ++i) {
788  const int dim = IsColMajor ? i : NumDims - i - 1;
789  params.input_block_sizes[dim] = 1;
790  }
791  params.input_block_strides =
792  internal::strides<Layout>(params.input_block_sizes);
793 
794  // Broadcast with the 0-stride trick: create one extra dim for each
795  // broadcast and set its input stride to 0.
796  //
797  // When ColMajor:
798  //
799  // - bcast_block_sizes:
800  // [d_0, b_0, d_1, b_1, ...]
801  //
802  // - bcast_block_strides:
803  // [output_block_strides[0], output_block_strides[0] * d_0,
804  // output_block_strides[1], output_block_strides[1] * d_1,
805  // ...]
806  //
807  // - bcast_input_strides:
808  // [input_block_strides[0], 0,
809  // input_block_strides[1], 0,
810  // ...].
811  //
812  for (int i = 0; i < params.inner_dim_count; ++i) {
813  const int dim = IsColMajor ? i : NumDims - i - 1;
814 
815  const int copy_dim = IsColMajor ? 2 * i : 2 * NumDims - 2 * i - 1;
816  const int broadcast_dim = IsColMajor ? copy_dim + 1 : copy_dim - 1;
817 
818  params.bcast_block_sizes[copy_dim] = params.input_dims[dim];
819  params.bcast_block_sizes[broadcast_dim] = m_broadcast[dim];
820  params.bcast_block_strides[copy_dim] = params.output_strides[dim];
821  params.bcast_block_strides[broadcast_dim] =
822  params.output_strides[dim] * params.input_dims[dim];
823  params.bcast_input_strides[copy_dim] = params.input_block_strides[dim];
824  params.bcast_input_strides[broadcast_dim] = 0;
825  }
826 
827  for (int i = 2 * params.inner_dim_count; i < 2 * NumDims; ++i) {
828  const int dim = IsColMajor ? i : 2 * NumDims - i - 1;
829  params.bcast_block_sizes[dim] = 1;
830  params.bcast_block_strides[dim] = 0;
831  params.bcast_input_strides[dim] = 0;
832  }
833 
834  return params;
835  }
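// To make the 0-stride trick concrete (continuing the 2x3 -> 4x9 col-major
// example): for an output block of dimensions [4, 2] only dimension 0 matches
// the full output, so inner_dim_count == 1, bcast_dim == 1 and
// bcast_dim_size == 2, and the function above produces
//   input_block_sizes   = [2, 1]        input_block_strides = [1, 2]
//   bcast_block_sizes   = [2, 2, 1, 1]
//   bcast_block_strides = [1, 2, 0, 0]
//   bcast_input_strides = [1, 0, 0, 0]
// i.e. the 2-element input column is read with stride 1 and written twice (the
// duplicated dimension has input stride 0), filling one 4-element output column
// of the block. Sizes and strides along bcast_dim are adjusted afterwards in
// BroadcastBlockAlongBcastDim.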
836 
837  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock emptyBlock() const {
838  DSizes<Index, NumDims> dimensions;
839  for (int i = 0; i < NumDims; ++i) dimensions[i] = 0;
840  return TensorBlock(internal::TensorBlockKind::kView, NULL, dimensions);
841  }
842 
843  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlockAlongBcastDim(
844  BlockBroadcastingParams params, Index bcast_offset,
845  TensorBlockScratch& scratch, ScalarNoConst* materialized_output,
846  ScalarNoConst** materialized_input,
847  size_t* materialized_input_size) const {
848  if (params.bcast_dim_size == 1) {
849  // We just need one block read using the values already set up above.
850  return BroadcastBlock(
851  params.input_block_sizes, params.input_block_strides,
852  params.bcast_block_sizes, params.bcast_block_strides,
853  params.bcast_input_strides, bcast_offset, 0, scratch,
854  materialized_output, materialized_input, materialized_input_size);
855 
856  } else if (params.input_dims[params.bcast_dim] == 1) {
857  // Broadcast bcast dimension (< NumDims) by bcast_dim_size.
858  const int broadcast_bcast_dim =
859  IsColMajor ? 2 * params.inner_dim_count + 1
860  : 2 * NumDims - 2 * params.inner_dim_count - 2;
861 
862  params.bcast_block_sizes[broadcast_bcast_dim] = params.bcast_dim_size;
863  params.bcast_input_strides[broadcast_bcast_dim] = 0;
864  params.bcast_block_strides[broadcast_bcast_dim] =
865  params.output_strides[params.bcast_dim];
866 
867  return BroadcastBlock(
868  params.input_block_sizes, params.input_block_strides,
869  params.bcast_block_sizes, params.bcast_block_strides,
870  params.bcast_input_strides, bcast_offset, 0, scratch,
871  materialized_output, materialized_input, materialized_input_size);
872 
873  } else {
874  // Keep track of the total number of coefficients written to the
875  // output block.
876  Index num_output_coeffs = 0;
877 
878  // The general case. Let's denote the output block as
879  //
880  // x[..., a:a+bcast_dim_size, :, ..., :]
881  //
882  // where a:a+bcast_dim_size is a slice on the bcast_dim dimension
883  // (< NumDims). We need to split the a:a+bcast_dim_size into possibly 3
884  // sub-blocks:
885  //
886  // (1) a:b, where b is the smallest multiple of
887  // input_dims[bcast_dim_start] in [a, a+bcast_dim_size].
888  //
889  // (2) b:c, where c is the largest multiple of input_dims[bcast_dim_start]
890  // in [a, a+bcast_dim_size].
891  //
892  // (3) c:a+bcast_dim_size.
893  //
894  // Or, when b and c do not exist, we just need to process the whole block
895  // together.
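// For instance (a sketch with input_dims[bcast_dim] == 3): if the output block
// covers bcast_dim indices [2, 9), then a == 2, b == 3 and c == 9, so the head
// a:b has size 1, the middle b:c holds two full copies of the input slice, and
// the tail c:a+bcast_dim_size is empty and skipped.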
896 
897  // Find a.
898  const Index bcast_dim_left_index =
899  bcast_offset / m_outputStrides[params.bcast_dim];
900 
901  // Find b and c.
902  const Index input_bcast_dim_size = params.input_dims[params.bcast_dim];
903 
904  // First multiple at or after a. This is b when it is <= bcast_dim_left_index +
905  // bcast_dim_size.
906  const Index first_multiple =
907  divup<Index>(bcast_dim_left_index, input_bcast_dim_size) *
908  input_bcast_dim_size;
909 
910  if (first_multiple <= bcast_dim_left_index + params.bcast_dim_size) {
911  // b exists, so does c. Find it.
912  const Index last_multiple =
913  (bcast_dim_left_index + params.bcast_dim_size) /
914  input_bcast_dim_size * input_bcast_dim_size;
915  const int copy_bcast_dim =
916  IsColMajor ? 2 * params.inner_dim_count
917  : 2 * NumDims - 2 * params.inner_dim_count - 1;
918  const int broadcast_bcast_dim =
919  IsColMajor ? 2 * params.inner_dim_count + 1
920  : 2 * NumDims - 2 * params.inner_dim_count - 2;
921 
922  if (first_multiple > bcast_dim_left_index) {
923  const Index head_size = first_multiple - bcast_dim_left_index;
924  params.input_block_sizes[params.bcast_dim] = head_size;
925  params.bcast_block_sizes[copy_bcast_dim] = head_size;
926  params.bcast_input_strides[copy_bcast_dim] =
927  params.input_block_strides[params.bcast_dim];
928  params.bcast_block_strides[copy_bcast_dim] =
929  params.output_strides[params.bcast_dim];
930  params.bcast_block_sizes[broadcast_bcast_dim] = 1;
931  params.bcast_input_strides[broadcast_bcast_dim] = 0;
932  params.bcast_block_strides[broadcast_bcast_dim] =
933  params.output_strides[params.bcast_dim] *
934  params.input_dims[params.bcast_dim];
935 
936  num_output_coeffs += BroadcastBlock(
937  params.input_block_sizes, params.input_block_strides,
938  params.bcast_block_sizes, params.bcast_block_strides,
939  params.bcast_input_strides, bcast_offset, 0, scratch,
940  materialized_output, materialized_input, materialized_input_size);
941  }
942  if (first_multiple < last_multiple) {
943  params.input_block_sizes[params.bcast_dim] = input_bcast_dim_size;
944  params.bcast_block_sizes[copy_bcast_dim] = input_bcast_dim_size;
945  params.bcast_input_strides[copy_bcast_dim] =
946  params.input_block_strides[params.bcast_dim];
947  params.bcast_block_strides[copy_bcast_dim] =
948  params.output_strides[params.bcast_dim];
949  params.bcast_block_sizes[broadcast_bcast_dim] =
950  (last_multiple - first_multiple) / input_bcast_dim_size;
951  params.bcast_input_strides[broadcast_bcast_dim] = 0;
952  params.bcast_block_strides[broadcast_bcast_dim] =
953  params.output_strides[params.bcast_dim] *
954  params.input_dims[params.bcast_dim];
955  const Index offset = (first_multiple - bcast_dim_left_index) *
956  m_outputStrides[params.bcast_dim];
957 
958  num_output_coeffs += BroadcastBlock(
959  params.input_block_sizes, params.input_block_strides,
960  params.bcast_block_sizes, params.bcast_block_strides,
961  params.bcast_input_strides, bcast_offset, offset, scratch,
962  materialized_output, materialized_input, materialized_input_size);
963  }
964  if (last_multiple < bcast_dim_left_index + params.bcast_dim_size) {
965  const Index tail_size =
966  bcast_dim_left_index + params.bcast_dim_size - last_multiple;
967  params.input_block_sizes[params.bcast_dim] = tail_size;
968  params.bcast_block_sizes[copy_bcast_dim] = tail_size;
969  params.bcast_input_strides[copy_bcast_dim] =
970  params.input_block_strides[params.bcast_dim];
971  params.bcast_block_strides[copy_bcast_dim] =
972  params.output_strides[params.bcast_dim];
973  params.bcast_block_sizes[broadcast_bcast_dim] = 1;
974  params.bcast_input_strides[broadcast_bcast_dim] = 0;
975  params.bcast_block_strides[broadcast_bcast_dim] =
976  params.output_strides[params.bcast_dim] *
977  params.input_dims[params.bcast_dim];
978  const Index offset = (last_multiple - bcast_dim_left_index) *
979  m_outputStrides[params.bcast_dim];
980 
981  num_output_coeffs += BroadcastBlock(
982  params.input_block_sizes, params.input_block_strides,
983  params.bcast_block_sizes, params.bcast_block_strides,
984  params.bcast_input_strides, bcast_offset, offset, scratch,
985  materialized_output, materialized_input, materialized_input_size);
986  }
987  } else {
988  // b and c do not exist.
989  const int copy_bcast_dim =
990  IsColMajor ? 2 * params.inner_dim_count
991  : 2 * NumDims - 2 * params.inner_dim_count - 1;
992  params.input_block_sizes[params.bcast_dim] = params.bcast_dim_size;
993  params.bcast_block_sizes[copy_bcast_dim] = params.bcast_dim_size;
994  params.bcast_input_strides[copy_bcast_dim] =
995  params.input_block_strides[params.bcast_dim];
996  params.bcast_block_strides[copy_bcast_dim] =
997  params.output_strides[params.bcast_dim];
998 
999  num_output_coeffs += BroadcastBlock(
1000  params.input_block_sizes, params.input_block_strides,
1001  params.bcast_block_sizes, params.bcast_block_strides,
1002  params.bcast_input_strides, bcast_offset, 0, scratch,
1003  materialized_output, materialized_input, materialized_input_size);
1004  }
1005 
1006  return num_output_coeffs;
1007  }
1008  }
1009 
1010  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlock(
1011  const Dimensions& input_block_sizes,
1012  const Dimensions& input_block_strides,
1013  const BroadcastDimensions& bcast_block_sizes,
1014  const BroadcastDimensions& bcast_block_strides,
1015  const BroadcastDimensions& bcast_input_strides, Index bcast_offset,
1016  Index offset, TensorBlockScratch& scratch,
1017  ScalarNoConst* materialized_output, ScalarNoConst** materialized_input,
1018  size_t* materialized_input_size) const {
1019  // ---------------------------------------------------------------------- //
1020  // Tensor block descriptor for reading block from the input.
1021  const Index input_offset = bcast_offset + offset;
1022  TensorBlockDesc input_desc(
1023  IsColMajor ? indexColMajor(input_offset) : indexRowMajor(input_offset),
1024  input_block_sizes);
1025 
1026  ArgTensorBlock input_block = m_impl.block(input_desc, scratch);
1027 
1028  // ---------------------------------------------------------------------- //
1029  // Materialize input block into a temporary memory buffer only if it's not
1030  // already available in the arg block.
1031  const ScalarNoConst* input_buffer = NULL;
1032 
1033  if (input_block.data() != NULL) {
1034  // Input block already has raw data, there is no need to materialize it.
1035  input_buffer = input_block.data();
1036 
1037  } else {
1038  // Otherwise we have to do block assignment into a temporary buffer.
1039 
1040  // Maybe reuse a previously allocated buffer, or allocate a new one with the
1041  // scratch allocator.
1042  const size_t input_total_size = input_block_sizes.TotalSize();
1043  if (*materialized_input == NULL ||
1044  *materialized_input_size < input_total_size) {
1045  *materialized_input_size = input_total_size;
1046  void* mem = scratch.allocate(*materialized_input_size * sizeof(Scalar));
1047  *materialized_input = static_cast<ScalarNoConst*>(mem);
1048  }
1049 
1050  typedef internal::TensorBlockAssignment<
1051  ScalarNoConst, NumDims, typename ArgTensorBlock::XprType, Index>
1052  TensorBlockAssignment;
1053 
1054  TensorBlockAssignment::Run(
1055  TensorBlockAssignment::target(input_block_sizes, input_block_strides,
1056  *materialized_input),
1057  input_block.expr());
1058 
1059  input_buffer = *materialized_input;
1060  }
1061 
1062  // ---------------------------------------------------------------------- //
1063  // Copy data from the materialized input block to the materialized output, using
1064  // the given broadcast strides (strides containing zeroes).
1065  typedef internal::TensorBlockIO<ScalarNoConst, Index, 2 * NumDims, Layout>
1066  TensorBlockIO;
1067 
1068  typename TensorBlockIO::Src src(bcast_input_strides, input_buffer);
1069  typename TensorBlockIO::Dst dst(bcast_block_sizes, bcast_block_strides,
1070  materialized_output + offset);
1071 
1072  return TensorBlockIO::Copy(dst, src);
1073  }
1074 
1075 protected:
1076  const Device EIGEN_DEVICE_REF m_device;
1077  const std::remove_reference_t<Broadcast> m_broadcast;
1078  Dimensions m_dimensions;
1079  array<Index, NumDims> m_outputStrides;
1080  array<Index, NumDims> m_inputStrides;
1081  TensorEvaluator<ArgType, Device> m_impl;
1082 };
1083 
1084 
1085 } // end namespace Eigen
1086 
1087 #endif // EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H