ADD: new track message, Entity class and Position class

Author: Henry Winkel
Date: 2022-12-20 17:20:35 +01:00
Parent: 469ecfb099
Commit: 98ebb563a8
2114 changed files with 482360 additions and 24 deletions


@@ -0,0 +1,11 @@
add_subdirectory(Eigen)
if(EIGEN_BUILD_DOC)
add_subdirectory(doc EXCLUDE_FROM_ALL)
endif()
if(BUILD_TESTING)
if(EIGEN_LEAVE_TEST_IN_ALL_TARGET)
add_subdirectory(test) # can't do EXCLUDE_FROM_ALL here, breaks CTest
else()
add_subdirectory(test EXCLUDE_FROM_ALL)
endif()
endif()


@@ -0,0 +1,159 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <g.gael@free.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_ADLOC_FORWARD
#define EIGEN_ADLOC_FORWARD
//--------------------------------------------------------------------------------
//
// This file provides support for adolc's adouble type in forward mode.
// ADOL-C is a C++ automatic differentiation library,
// see https://projects.coin-or.org/ADOL-C for more information.
//
// Note that the maximal number of directions is controlled by
// the preprocessor token NUMBER_DIRECTIONS. The default is 2.
//
//--------------------------------------------------------------------------------
#define ADOLC_TAPELESS
#ifndef NUMBER_DIRECTIONS
# define NUMBER_DIRECTIONS 2
#endif
#include <adolc/adtl.h>
// adolc defines some very stupid macros:
#if defined(malloc)
# undef malloc
#endif
#if defined(calloc)
# undef calloc
#endif
#if defined(realloc)
# undef realloc
#endif
#include "../../Eigen/Core"
namespace Eigen {
/**
* \defgroup AdolcForward_Module Adolc forward module
* This module provides support for adolc's adouble type in forward mode.
* ADOL-C is a C++ automatic differentiation library,
* see https://projects.coin-or.org/ADOL-C for more information.
* It mainly consists of:
* - a struct Eigen::NumTraits<adtl::adouble> specialization
* - overloads of internal::* math function for adtl::adouble type.
*
* Note that the maximal number of directions is controlled by
* the preprocessor token NUMBER_DIRECTIONS. The default is 2.
*
* \code
* #include <unsupported/Eigen/AdolcSupport>
* \endcode
*/
//@{
} // namespace Eigen
// Eigen requires a few additional functions which must be defined in the same namespace
// as the custom scalar type's own namespace
namespace adtl {
inline const adouble& conj(const adouble& x) { return x; }
inline const adouble& real(const adouble& x) { return x; }
inline adouble imag(const adouble&) { return 0.; }
inline adouble abs(const adouble& x) { return fabs(x); }
inline adouble abs2(const adouble& x) { return x*x; }
inline bool (isinf)(const adouble& x) { return (Eigen::numext::isinf)(x.getValue()); }
inline bool (isnan)(const adouble& x) { return (Eigen::numext::isnan)(x.getValue()); }
}
namespace Eigen {
template<> struct NumTraits<adtl::adouble>
: NumTraits<double>
{
typedef adtl::adouble Real;
typedef adtl::adouble NonInteger;
typedef adtl::adouble Nested;
enum {
IsComplex = 0,
IsInteger = 0,
IsSigned = 1,
RequireInitialization = 1,
ReadCost = 1,
AddCost = 1,
MulCost = 1
};
};
template<typename Functor> class AdolcForwardJacobian : public Functor
{
typedef adtl::adouble ActiveScalar;
public:
AdolcForwardJacobian() : Functor() {}
AdolcForwardJacobian(const Functor& f) : Functor(f) {}
// forward constructors
template<typename T0>
AdolcForwardJacobian(const T0& a0) : Functor(a0) {}
template<typename T0, typename T1>
AdolcForwardJacobian(const T0& a0, const T1& a1) : Functor(a0, a1) {}
template<typename T0, typename T1, typename T2>
AdolcForwardJacobian(const T0& a0, const T1& a1, const T2& a2) : Functor(a0, a1, a2) {}
typedef typename Functor::InputType InputType;
typedef typename Functor::ValueType ValueType;
typedef typename Functor::JacobianType JacobianType;
typedef Matrix<ActiveScalar, InputType::SizeAtCompileTime, 1> ActiveInput;
typedef Matrix<ActiveScalar, ValueType::SizeAtCompileTime, 1> ActiveValue;
void operator() (const InputType& x, ValueType* v, JacobianType* _jac) const
{
eigen_assert(v!=0);
if (!_jac)
{
Functor::operator()(x, v);
return;
}
JacobianType& jac = *_jac;
ActiveInput ax = x.template cast<ActiveScalar>();
ActiveValue av(jac.rows());
for (int j=0; j<jac.cols(); j++)
for (int i=0; i<jac.cols(); i++)
ax[i].setADValue(j, i==j ? 1 : 0);
Functor::operator()(ax, &av);
for (int i=0; i<jac.rows(); i++)
{
(*v)[i] = av[i].getValue();
for (int j=0; j<jac.cols(); j++)
jac.coeffRef(i,j) = av[i].getADValue(j);
}
}
protected:
};
//@}
}
#endif // EIGEN_ADLOC_FORWARD
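
For orientation, the AdolcForwardJacobian wrapper above casts the functor's input to adtl::adouble, seeds one derivative direction per input, and reads the Jacobian back from the AD values. The following is a minimal usage sketch, not part of this commit; it assumes ADOL-C is installed, and the functor name MyFunctor is illustrative only. Depending on the ADOL-C version, the number of tapeless directions may also need to be set at runtime via adtl::setNumDir.

// Hypothetical example (not part of the commit): differentiate a small R^2 -> R^2 map.
#include <unsupported/Eigen/AdolcSupport>
#include <iostream>

struct MyFunctor
{
  typedef Eigen::Vector2d InputType;
  typedef Eigen::Vector2d ValueType;
  typedef Eigen::Matrix2d JacobianType;

  // Must be callable on both double and adtl::adouble vectors,
  // hence the template on the vector types.
  template<typename InputVec, typename ValueVec>
  void operator()(const InputVec& x, ValueVec* v) const
  {
    (*v)[0] = x[0] * x[1];
    (*v)[1] = x[0] + x[1];
  }
};

int main()
{
  Eigen::Vector2d x(1.0, 2.0), y;
  Eigen::Matrix2d jac;
  Eigen::AdolcForwardJacobian<MyFunctor> f;
  f(x, &y, &jac);                       // y = f(x), jac = df/dx computed through adtl::adouble
  std::cout << jac << std::endl;
}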


@@ -0,0 +1,234 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2009 Gael Guennebaud <g.gael@free.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_ALIGNED_VECTOR3
#define EIGEN_ALIGNED_VECTOR3
#include "../../Eigen/Geometry"
#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
namespace Eigen {
/**
* \defgroup AlignedVector3_Module Aligned vector3 module
*
* \code
* #include <unsupported/Eigen/AlignedVector3>
* \endcode
*/
//@{
/** \class AlignedVector3
*
* \brief A vectorization friendly 3D vector
*
* This class represents a 3D vector internally using a 4D vector
* such that vectorization can be seamlessly enabled. Of course,
* the same result can be achieved by directly using a 4D vector.
* This class makes this process simpler.
*
*/
// TODO specialize Cwise
template<typename _Scalar> class AlignedVector3;
namespace internal {
template<typename _Scalar> struct traits<AlignedVector3<_Scalar> >
: traits<Matrix<_Scalar,3,1,0,4,1> >
{
};
}
template<typename _Scalar> class AlignedVector3
: public MatrixBase<AlignedVector3<_Scalar> >
{
typedef Matrix<_Scalar,4,1> CoeffType;
CoeffType m_coeffs;
public:
typedef MatrixBase<AlignedVector3<_Scalar> > Base;
EIGEN_DENSE_PUBLIC_INTERFACE(AlignedVector3)
using Base::operator*;
inline Index rows() const { return 3; }
inline Index cols() const { return 1; }
Scalar* data() { return m_coeffs.data(); }
const Scalar* data() const { return m_coeffs.data(); }
Index innerStride() const { return 1; }
Index outerStride() const { return 3; }
inline const Scalar& coeff(Index row, Index col) const
{ return m_coeffs.coeff(row, col); }
inline Scalar& coeffRef(Index row, Index col)
{ return m_coeffs.coeffRef(row, col); }
inline const Scalar& coeff(Index index) const
{ return m_coeffs.coeff(index); }
inline Scalar& coeffRef(Index index)
{ return m_coeffs.coeffRef(index);}
inline AlignedVector3()
{}
inline AlignedVector3(const Scalar& x, const Scalar& y, const Scalar& z)
: m_coeffs(x, y, z, Scalar(0))
{}
inline AlignedVector3(const AlignedVector3& other)
: Base(), m_coeffs(other.m_coeffs)
{}
template<typename XprType, int Size=XprType::SizeAtCompileTime>
struct generic_assign_selector {};
template<typename XprType> struct generic_assign_selector<XprType,4>
{
inline static void run(AlignedVector3& dest, const XprType& src)
{
dest.m_coeffs = src;
}
};
template<typename XprType> struct generic_assign_selector<XprType,3>
{
inline static void run(AlignedVector3& dest, const XprType& src)
{
dest.m_coeffs.template head<3>() = src;
dest.m_coeffs.w() = Scalar(0);
}
};
template<typename Derived>
inline AlignedVector3(const MatrixBase<Derived>& other)
{
generic_assign_selector<Derived>::run(*this,other.derived());
}
inline AlignedVector3& operator=(const AlignedVector3& other)
{ m_coeffs = other.m_coeffs; return *this; }
template <typename Derived>
inline AlignedVector3& operator=(const MatrixBase<Derived>& other)
{
generic_assign_selector<Derived>::run(*this,other.derived());
return *this;
}
inline AlignedVector3 operator+(const AlignedVector3& other) const
{ return AlignedVector3(m_coeffs + other.m_coeffs); }
inline AlignedVector3& operator+=(const AlignedVector3& other)
{ m_coeffs += other.m_coeffs; return *this; }
inline AlignedVector3 operator-(const AlignedVector3& other) const
{ return AlignedVector3(m_coeffs - other.m_coeffs); }
inline AlignedVector3 operator-() const
{ return AlignedVector3(-m_coeffs); }
inline AlignedVector3& operator-=(const AlignedVector3& other)
{ m_coeffs -= other.m_coeffs; return *this; }
inline AlignedVector3 operator*(const Scalar& s) const
{ return AlignedVector3(m_coeffs * s); }
inline friend AlignedVector3 operator*(const Scalar& s,const AlignedVector3& vec)
{ return AlignedVector3(s * vec.m_coeffs); }
inline AlignedVector3& operator*=(const Scalar& s)
{ m_coeffs *= s; return *this; }
inline AlignedVector3 operator/(const Scalar& s) const
{ return AlignedVector3(m_coeffs / s); }
inline AlignedVector3& operator/=(const Scalar& s)
{ m_coeffs /= s; return *this; }
inline Scalar dot(const AlignedVector3& other) const
{
eigen_assert(m_coeffs.w()==Scalar(0));
eigen_assert(other.m_coeffs.w()==Scalar(0));
return m_coeffs.dot(other.m_coeffs);
}
inline void normalize()
{
m_coeffs /= norm();
}
inline AlignedVector3 normalized() const
{
return AlignedVector3(m_coeffs / norm());
}
inline Scalar sum() const
{
eigen_assert(m_coeffs.w()==Scalar(0));
return m_coeffs.sum();
}
inline Scalar squaredNorm() const
{
eigen_assert(m_coeffs.w()==Scalar(0));
return m_coeffs.squaredNorm();
}
inline Scalar norm() const
{
using std::sqrt;
return sqrt(squaredNorm());
}
inline AlignedVector3 cross(const AlignedVector3& other) const
{
return AlignedVector3(m_coeffs.cross3(other.m_coeffs));
}
template<typename Derived>
inline bool isApprox(const MatrixBase<Derived>& other, const RealScalar& eps=NumTraits<Scalar>::dummy_precision()) const
{
return m_coeffs.template head<3>().isApprox(other,eps);
}
CoeffType& coeffs() { return m_coeffs; }
const CoeffType& coeffs() const { return m_coeffs; }
};
namespace internal {
template<typename _Scalar>
struct eval<AlignedVector3<_Scalar>, Dense>
{
typedef const AlignedVector3<_Scalar>& type;
};
template<typename Scalar>
struct evaluator<AlignedVector3<Scalar> >
: evaluator<Matrix<Scalar,4,1> >
{
typedef AlignedVector3<Scalar> XprType;
typedef evaluator<Matrix<Scalar,4,1> > Base;
evaluator(const XprType &m) : Base(m.coeffs()) {}
};
}
//@}
}
#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
#endif // EIGEN_ALIGNED_VECTOR3
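
A minimal usage sketch (illustrative, not part of this commit) showing that AlignedVector3 behaves like a plain 3-vector while storing a fourth, zero coefficient internally; all variable names are arbitrary.

#include <unsupported/Eigen/AlignedVector3>
#include <iostream>

int main()
{
  Eigen::AlignedVector3<float> a(1.f, 2.f, 3.f);
  Eigen::AlignedVector3<float> b(4.f, 5.f, 6.f);

  Eigen::AlignedVector3<float> c = a.cross(b) + 2.f * b;
  std::cout << c.dot(a) << " " << c.norm() << std::endl;

  Eigen::Vector3f v(1.f, 0.f, 0.f);
  Eigen::AlignedVector3<float> d = v;   // assignment from a plain 3-vector; w is reset to 0
  std::cout << d.sum() << std::endl;
}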


@@ -0,0 +1,30 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_ARPACKSUPPORT_MODULE_H
#define EIGEN_ARPACKSUPPORT_MODULE_H
#include "../../Eigen/Core"
/** \defgroup ArpackSupport_Module Arpack support module
*
* This module provides a wrapper for ARPACK, a library for sparse eigenvalue decomposition.
*
* \code
* #include <Eigen/ArpackSupport>
* \endcode
*/
#include "../../Eigen/SparseCholesky"
#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
#include "src/Eigenvalues/ArpackSelfAdjointEigenSolver.h"
#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
#endif // EIGEN_ARPACKSUPPORT_MODULE_H


@@ -0,0 +1,46 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2008-2009 Gael Guennebaud <g.gael@free.fr>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_AUTODIFF_MODULE
#define EIGEN_AUTODIFF_MODULE
namespace Eigen {
/**
* \defgroup AutoDiff_Module Auto Diff module
*
* This module features forward automatic differentiation via a simple
* templated scalar type wrapper AutoDiffScalar.
*
* Warning: this should NOT be confused with numerical differentiation, which
* is a different method and has its own module in Eigen: \ref NumericalDiff_Module.
*
* \code
* #include <unsupported/Eigen/AutoDiff>
* \endcode
*/
//@{
}
#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
#include "src/AutoDiff/AutoDiffScalar.h"
// #include "src/AutoDiff/AutoDiffVector.h"
#include "src/AutoDiff/AutoDiffJacobian.h"
#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
namespace Eigen {
//@}
}
#endif // EIGEN_AUTODIFF_MODULE
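
A minimal sketch of the AutoDiffScalar wrapper mentioned above (illustrative, not part of this commit); the typedef AD and the variable names are arbitrary. Each independent variable carries a derivative vector seeded with a unit entry, and arithmetic propagates the derivatives.

#include <unsupported/Eigen/AutoDiff>
#include <iostream>

int main()
{
  typedef Eigen::AutoDiffScalar<Eigen::Vector2d> AD;

  AD x(1.0, 2, 0);   // value 1.0, two derivative slots, this is variable #0
  AD y(2.0, 2, 1);   // value 2.0, two derivative slots, this is variable #1

  AD f = x * y + sin(x);   // sin() for AutoDiffScalar is found via ADL in namespace Eigen

  std::cout << "f     = " << f.value() << "\n"
            << "df/dx = " << f.derivatives()[0] << "\n"   // y + cos(x)
            << "df/dy = " << f.derivatives()[1] << "\n";  // x
}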


@@ -0,0 +1,95 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2009 Ilya Baran <ibaran@mit.edu>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_BVH_MODULE_H
#define EIGEN_BVH_MODULE_H
#include "../../Eigen/Core"
#include "../../Eigen/Geometry"
#include "../../Eigen/StdVector"
#include <algorithm>
#include <queue>
namespace Eigen {
/**
* \defgroup BVH_Module BVH module
* \brief This module provides generic bounding volume hierarchy algorithms
* and reference tree implementations.
*
*
* \code
* #include <unsupported/Eigen/BVH>
* \endcode
*
* A bounding volume hierarchy (BVH) can accelerate many geometric queries. This module provides a generic implementation
* of the two basic algorithms over a BVH: intersection of a query object against all objects in the hierarchy and minimization
* of a function over the objects in the hierarchy. It also provides intersection and minimization over a cartesian product of
* two BVHs. A BVH accelerates intersection by using the fact that if a query object does not intersect a volume, then it cannot
* intersect any object contained in that volume. Similarly, a BVH accelerates minimization because the minimum of a function
* over a volume is no greater than the minimum of a function over any object contained in it.
*
* Some sample queries that can be written in terms of intersection are:
* - Determine all points where a ray intersects a triangle mesh
* - Given a set of points, determine which are contained in a query sphere
* - Given a set of spheres, determine which contain the query point
* - Given a set of disks, determine if any is completely contained in a query rectangle (represent each 2D disk as a point \f$(x,y,r)\f$
* in 3D and represent the rectangle as a pyramid based on the original rectangle and shrinking in the \f$r\f$ direction)
* - Given a set of points, count how many pairs are \f$d\pm\epsilon\f$ apart (done by looking at the cartesian product of the set
* of points with itself)
*
* Some sample queries that can be written in terms of function minimization over a set of objects are:
* - Find the intersection between a ray and a triangle mesh closest to the ray origin (function is infinite off the ray)
* - Given a polyline and a query point, determine the closest point on the polyline to the query
* - Find the diameter of a point cloud (done by looking at the cartesian product and using negative distance as the function)
* - Determine how far two meshes are from colliding (this is also a cartesian product query)
*
* This implementation decouples the basic algorithms both from the type of hierarchy (and the types of the bounding volumes) and
* from the particulars of the query. To enable abstraction from the BVH, the BVH is required to implement a generic mechanism
* for traversal. To abstract from the query, the query is responsible for keeping track of results.
*
* To be used in the algorithms, a hierarchy must implement the following traversal mechanism (see KdBVH for a sample implementation): \code
typedef Volume //the type of bounding volume
typedef Object //the type of object in the hierarchy
typedef Index //a reference to a node in the hierarchy--typically an int or a pointer
typedef VolumeIterator //an iterator type over node children--returns Index
typedef ObjectIterator //an iterator over object (leaf) children--returns const Object &
Index getRootIndex() const //returns the index of the hierarchy root
const Volume &getVolume(Index index) const //returns the bounding volume of the node at given index
void getChildren(Index index, VolumeIterator &outVBegin, VolumeIterator &outVEnd,
ObjectIterator &outOBegin, ObjectIterator &outOEnd) const
//getChildren takes a node index and makes [outVBegin, outVEnd) range over its node children
//and [outOBegin, outOEnd) range over its object children
\endcode
*
* To use the hierarchy, call BVIntersect or BVMinimize, passing it a BVH (or two, for cartesian product) and a minimizer or intersector.
* For an intersection query on a single BVH, the intersector encapsulates the query and must provide two functions:
* \code
bool intersectVolume(const Volume &volume) //returns true if the query intersects the volume
bool intersectObject(const Object &object) //returns true if the intersection search should terminate immediately
\endcode
* The guarantee that BVIntersect provides is that intersectObject will be called on every object whose bounding volume
* intersects the query (but possibly on other objects too) unless the search is terminated prematurely. It is the
* responsibility of the intersectObject function to keep track of the results in whatever manner is appropriate.
* The cartesian product intersection and the BVMinimize queries are similar--see their individual documentation.
*
* The following is a simple but complete example for how to use the BVH to accelerate the search for a closest red-blue point pair:
* \include BVH_Example.cpp
* Output: \verbinclude BVH_Example.out
*/
}
//@{
#include "src/BVH/BVAlgorithms.h"
#include "src/BVH/KdBVH.h"
//@}
#endif // EIGEN_BVH_MODULE_H
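
To make the intersector interface above concrete, here is a hedged sketch (not part of this commit) of a range query over a KdBVH of points; the bounding_box overload, the SphereIntersector type, and all variable names are illustrative only.

#include <unsupported/Eigen/BVH>
#include <iostream>
#include <vector>
using namespace Eigen;

typedef AlignedBox<double, 3> Box3d;

// KdBVH needs to know how to bound a single object (here: a point).
namespace Eigen { Box3d bounding_box(const Vector3d &p) { return Box3d(p, p); } }

// Collects every stored point lying inside a query sphere.
struct SphereIntersector
{
  SphereIntersector(const Vector3d &c, double r) : center(c), radius(r) {}

  bool intersectVolume(const Box3d &box)    // prune subtrees whose box misses the sphere
  { return box.squaredExteriorDistance(center) <= radius * radius; }

  bool intersectObject(const Vector3d &p)   // record hits; return false to keep searching
  { if ((p - center).squaredNorm() <= radius * radius) hits.push_back(p); return false; }

  Vector3d center;
  double radius;
  std::vector<Vector3d, aligned_allocator<Vector3d> > hits;
};

int main()
{
  std::vector<Vector3d, aligned_allocator<Vector3d> > points;
  for (int i = 0; i < 100; ++i) points.push_back(Vector3d::Random());

  KdBVH<double, 3, Vector3d> tree(points.begin(), points.end());
  SphereIntersector query(Vector3d::Zero(), 0.5);
  BVIntersect(tree, query);
  std::cout << query.hits.size() << " points inside the sphere\n";
}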


@@ -0,0 +1,32 @@
set(Eigen_HEADERS
AdolcForward
AlignedVector3
ArpackSupport
AutoDiff
BVH
EulerAngles
FFT
IterativeSolvers
KroneckerProduct
LevenbergMarquardt
MatrixFunctions
MoreVectorization
MPRealSupport
NonLinearOptimization
NumericalDiff
OpenGLSupport
Polynomials
Skyline
SparseExtra
SpecialFunctions
Splines
)
install(FILES
${Eigen_HEADERS}
DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen COMPONENT Devel
)
install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen COMPONENT Devel FILES_MATCHING PATTERN "*.h")
add_subdirectory(CXX11)


@@ -0,0 +1,8 @@
set(Eigen_CXX11_HEADERS Tensor TensorSymmetry ThreadPool)
install(FILES
${Eigen_CXX11_HEADERS}
DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11 COMPONENT Devel
)
install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11 COMPONENT Devel FILES_MATCHING PATTERN "*.h")


@@ -0,0 +1,137 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
//#ifndef EIGEN_CXX11_TENSOR_MODULE
//#define EIGEN_CXX11_TENSOR_MODULE
#include "../../../Eigen/Core"
#if EIGEN_HAS_CXX11
#include "../SpecialFunctions"
#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h"
#include "src/util/CXX11Meta.h"
#include "src/util/MaxSizeVector.h"
/** \defgroup CXX11_Tensor_Module Tensor Module
*
* This module provides a Tensor class for storing arbitrarily indexed
* objects.
*
* \code
* #include <Eigen/CXX11/Tensor>
* \endcode
*
* Much of the documentation can be found \ref eigen_tensors "here".
*/
#include <atomic>
#include <chrono>
#include <cmath>
#include <cstddef>
#include <cstring>
#include <random>
#include <thread>
#if defined(EIGEN_USE_THREADS) || defined(EIGEN_USE_SYCL)
#include "ThreadPool"
#endif
#ifdef EIGEN_USE_GPU
#include <iostream>
#if defined(EIGEN_USE_HIP)
#include <hip/hip_runtime.h>
#else
#include <cuda_runtime.h>
#endif
#endif
#include "src/Tensor/TensorMacros.h"
#include "src/Tensor/TensorForwardDeclarations.h"
#include "src/Tensor/TensorMeta.h"
#include "src/Tensor/TensorFunctors.h"
#include "src/Tensor/TensorCostModel.h"
#include "src/Tensor/TensorDeviceDefault.h"
#include "src/Tensor/TensorDeviceThreadPool.h"
#include "src/Tensor/TensorDeviceGpu.h"
#ifndef gpu_assert
#define gpu_assert(x)
#endif
#include "src/Tensor/TensorDeviceSycl.h"
#include "src/Tensor/TensorIndexList.h"
#include "src/Tensor/TensorDimensionList.h"
#include "src/Tensor/TensorDimensions.h"
#include "src/Tensor/TensorInitializer.h"
#include "src/Tensor/TensorTraits.h"
#include "src/Tensor/TensorRandom.h"
#include "src/Tensor/TensorUInt128.h"
#include "src/Tensor/TensorIntDiv.h"
#include "src/Tensor/TensorGlobalFunctions.h"
#include "src/Tensor/TensorBase.h"
#include "src/Tensor/TensorBlock.h"
#include "src/Tensor/TensorEvaluator.h"
#include "src/Tensor/TensorExpr.h"
#include "src/Tensor/TensorReduction.h"
#include "src/Tensor/TensorReductionGpu.h"
#include "src/Tensor/TensorArgMax.h"
#include "src/Tensor/TensorConcatenation.h"
#include "src/Tensor/TensorContractionMapper.h"
#include "src/Tensor/TensorContractionBlocking.h"
#include "src/Tensor/TensorContraction.h"
#include "src/Tensor/TensorContractionThreadPool.h"
#include "src/Tensor/TensorContractionGpu.h"
#include "src/Tensor/TensorConversion.h"
#include "src/Tensor/TensorConvolution.h"
#include "src/Tensor/TensorFFT.h"
#include "src/Tensor/TensorPatch.h"
#include "src/Tensor/TensorImagePatch.h"
#include "src/Tensor/TensorVolumePatch.h"
#include "src/Tensor/TensorBroadcasting.h"
#include "src/Tensor/TensorChipping.h"
#include "src/Tensor/TensorInflation.h"
#include "src/Tensor/TensorLayoutSwap.h"
#include "src/Tensor/TensorMorphing.h"
#include "src/Tensor/TensorPadding.h"
#include "src/Tensor/TensorReverse.h"
#include "src/Tensor/TensorShuffling.h"
#include "src/Tensor/TensorStriding.h"
#include "src/Tensor/TensorCustomOp.h"
#include "src/Tensor/TensorEvalTo.h"
#include "src/Tensor/TensorForcedEval.h"
#include "src/Tensor/TensorGenerator.h"
#include "src/Tensor/TensorAssign.h"
#include "src/Tensor/TensorScan.h"
#include "src/Tensor/TensorTrace.h"
#ifdef EIGEN_USE_SYCL
#include "src/Tensor/TensorReductionSycl.h"
#include "src/Tensor/TensorConvolutionSycl.h"
#include "src/Tensor/TensorContractionSycl.h"
#include "src/Tensor/TensorScanSycl.h"
#endif
#include "src/Tensor/TensorExecutor.h"
#include "src/Tensor/TensorDevice.h"
#include "src/Tensor/TensorStorage.h"
#include "src/Tensor/Tensor.h"
#include "src/Tensor/TensorFixedSize.h"
#include "src/Tensor/TensorMap.h"
#include "src/Tensor/TensorRef.h"
#include "src/Tensor/TensorIO.h"
#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h"
#endif // EIGEN_HAS_CXX11
//#endif // EIGEN_CXX11_TENSOR_MODULE
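
As a quick orientation (illustrative, not part of this commit), the TensorMap header pulled in above lets existing memory be viewed as a tensor without copying; the buffer name and sizes below are arbitrary.

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main()
{
  // View an existing buffer as a 2x3x4 tensor (column-major by default), without copying.
  float data[24];
  for (int i = 0; i < 24; ++i) data[i] = static_cast<float>(i);

  Eigen::TensorMap<Eigen::Tensor<float, 3> > m(data, 2, 3, 4);
  std::cout << m(1, 2, 3) << std::endl;   // flat index 1 + 2*2 + 3*6 = 23
}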


@@ -0,0 +1,42 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSORSYMMETRY_MODULE
#define EIGEN_CXX11_TENSORSYMMETRY_MODULE
#include "Tensor"
#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h"
#include "src/util/CXX11Meta.h"
/** \defgroup CXX11_TensorSymmetry_Module Tensor Symmetry Module
*
* This module provides classes that allow for the definition of
* symmetries w.r.t. tensor indices.
*
* Including this module will implicitly include the Tensor module.
*
* \code
* #include <Eigen/TensorSymmetry>
* \endcode
*/
#include "src/TensorSymmetry/util/TemplateGroupTheory.h"
#include "src/TensorSymmetry/Symmetry.h"
#include "src/TensorSymmetry/StaticSymmetry.h"
#include "src/TensorSymmetry/DynamicSymmetry.h"
#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h"
#endif // EIGEN_CXX11_TENSORSYMMETRY_MODULE
/*
* kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
*/


@@ -0,0 +1,74 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_THREADPOOL_MODULE
#define EIGEN_CXX11_THREADPOOL_MODULE
#include "../../../Eigen/Core"
#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h"
/** \defgroup CXX11_ThreadPool_Module C++11 ThreadPool Module
*
* This module provides two thread pool implementations:
* - a simple reference implementation
* - a faster non-blocking implementation
*
* This module requires C++11.
*
* \code
* #include <Eigen/CXX11/ThreadPool>
* \endcode
*/
// The code depends on CXX11, so only include the module if the
// compiler supports it.
#if (EIGEN_COMP_CXXVER >= 11)
#include <cstddef>
#include <cstring>
#include <time.h>
#include <vector>
#include <atomic>
#include <condition_variable>
#include <deque>
#include <mutex>
#include <thread>
#include <functional>
#include <memory>
#include <utility>
// There are non-parenthesized calls to "max" in the <unordered_map> header,
// which trigger a check in test/main.h causing compilation to fail.
// We work around the check here by removing the check for max in
// the case where we have to emulate thread_local.
#ifdef max
#undef max
#endif
#include <unordered_map>
#include "src/util/CXX11Meta.h"
#include "src/util/MaxSizeVector.h"
#include "src/ThreadPool/ThreadLocal.h"
#include "src/ThreadPool/ThreadYield.h"
#include "src/ThreadPool/ThreadCancel.h"
#include "src/ThreadPool/EventCount.h"
#include "src/ThreadPool/RunQueue.h"
#include "src/ThreadPool/ThreadPoolInterface.h"
#include "src/ThreadPool/ThreadEnvironment.h"
#include "src/ThreadPool/Barrier.h"
#include "src/ThreadPool/NonBlockingThreadPool.h"
#endif
#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h"
#endif // EIGEN_CXX11_THREADPOOL_MODULE
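
A minimal sketch of the non-blocking pool (illustrative, not part of this commit), assuming a C++11 compiler; Eigen::ThreadPool is the alias for the non-blocking implementation and Eigen::Barrier comes from the Barrier.h header included above.

#include <unsupported/Eigen/CXX11/ThreadPool>
#include <atomic>
#include <iostream>

int main()
{
  Eigen::ThreadPool pool(4);            // 4 worker threads

  std::atomic<int> counter(0);
  Eigen::Barrier barrier(100);

  for (int i = 0; i < 100; ++i) {
    pool.Schedule([&]() {
      counter.fetch_add(1, std::memory_order_relaxed);
      barrier.Notify();
    });
  }
  barrier.Wait();                       // block until all 100 tasks have run
  std::cout << "tasks executed: " << counter.load() << std::endl;
}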

File diff suppressed because it is too large


@@ -0,0 +1,554 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_H
#define EIGEN_CXX11_TENSOR_TENSOR_H
namespace Eigen {
/** \class Tensor
* \ingroup CXX11_Tensor_Module
*
* \brief The tensor class.
*
* The %Tensor class is the work-horse for all \em dense tensors within Eigen.
*
* The %Tensor class encompasses only dynamic-size objects so far.
*
* The first two template parameters are required:
* \tparam Scalar_ Numeric type, e.g. float, double, int or `std::complex<float>`.
* User defined scalar types are supported as well (see \ref user_defined_scalars "here").
* \tparam NumIndices_ Number of indices (i.e. rank of the tensor)
*
* The remaining template parameters are optional -- in most cases you don't have to worry about them.
* \tparam Options_ A combination of either \b #RowMajor or \b #ColMajor, and of either
* \b #AutoAlign or \b #DontAlign.
* The former controls \ref TopicStorageOrders "storage order", and defaults to column-major. The latter controls alignment, which is required
* for vectorization. It defaults to aligning tensors. Note that tensors currently do not support any operations that profit from vectorization.
* Support for such operations (i.e. adding two tensors etc.) is planned.
*
* You can access elements of tensors using normal subscripting:
*
* \code
* Eigen::Tensor<double, 4> t(10, 10, 10, 10);
* t(0, 1, 2, 3) = 42.0;
* \endcode
*
* This class can be extended with the help of the plugin mechanism described on the page
* \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_TENSOR_PLUGIN.
*
* <i><b>Some notes:</b></i>
*
* <dl>
* <dt><b>Relation to other parts of Eigen:</b></dt>
* <dd>The midterm development goal for this class is to have a similar hierarchy as Eigen uses for matrices, so that
* taking blocks or using tensors in expressions is easily possible, including an interface with the vector/matrix code
* by providing .asMatrix() and .asVector() (or similar) methods for rank 2 and 1 tensors. However, currently, the %Tensor
* class does not provide any of these features and is only available as a stand-alone class that just allows for
* coefficient access. Also, when fixed-size tensors are implemented, the number of template arguments is likely to
* change dramatically.</dd>
* </dl>
*
* \ref TopicStorageOrders
*/
template<typename Scalar_, int NumIndices_, int Options_, typename IndexType_>
class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
{
public:
typedef Tensor<Scalar_, NumIndices_, Options_, IndexType_> Self;
typedef TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexType_> > Base;
typedef typename Eigen::internal::nested<Self>::type Nested;
typedef typename internal::traits<Self>::StorageKind StorageKind;
typedef typename internal::traits<Self>::Index Index;
typedef Scalar_ Scalar;
typedef typename NumTraits<Scalar>::Real RealScalar;
typedef typename Base::CoeffReturnType CoeffReturnType;
enum {
IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0) & !(Options_&DontAlign),
Layout = Options_ & RowMajor ? RowMajor : ColMajor,
CoordAccess = true,
RawAccess = true
};
static const int Options = Options_;
static const int NumIndices = NumIndices_;
typedef DSizes<Index, NumIndices_> Dimensions;
protected:
TensorStorage<Scalar, Dimensions, Options> m_storage;
#ifdef EIGEN_HAS_SFINAE
template<typename CustomIndices>
struct isOfNormalIndex{
static const bool is_array = internal::is_base_of<array<Index, NumIndices>, CustomIndices>::value;
static const bool is_int = NumTraits<CustomIndices>::IsInteger;
static const bool value = is_array | is_int;
};
#endif
public:
// Metadata
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); }
// This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
// work, because that uses base().coeffRef() - and we don't yet
// implement a similar class hierarchy
inline Self& base() { return *this; }
inline const Self& base() const { return *this; }
#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
{
// The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
return coeff(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
}
#endif
// normal indices
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(const array<Index, NumIndices>& indices) const
{
eigen_internal_assert(checkIndexRange(indices));
return m_storage.data()[linearizedIndex(indices)];
}
// custom indices
#ifdef EIGEN_HAS_SFINAE
template<typename CustomIndices,
EIGEN_SFINAE_ENABLE_IF( !(isOfNormalIndex<CustomIndices>::value) )
>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(CustomIndices& indices) const
{
return coeff(internal::customIndices2Array<Index,NumIndices>(indices));
}
#endif
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff() const
{
EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
return m_storage.data()[0];
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const
{
eigen_internal_assert(index >= 0 && index < size());
return m_storage.data()[index];
}
#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
inline Scalar& coeffRef(Index firstIndex, Index secondIndex, IndexTypes... otherIndices)
{
// The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
return coeffRef(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
}
#endif
// normal indices
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array<Index, NumIndices>& indices)
{
eigen_internal_assert(checkIndexRange(indices));
return m_storage.data()[linearizedIndex(indices)];
}
// custom indices
#ifdef EIGEN_HAS_SFINAE
template<typename CustomIndices,
EIGEN_SFINAE_ENABLE_IF( !(isOfNormalIndex<CustomIndices>::value) )
>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(CustomIndices& indices)
{
return coeffRef(internal::customIndices2Array<Index,NumIndices>(indices));
}
#endif
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef()
{
EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
return m_storage.data()[0];
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
{
eigen_internal_assert(index >= 0 && index < size());
return m_storage.data()[index];
}
#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
inline const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
{
// The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
return this->operator()(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
}
#else
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const
{
return coeff(array<Index, 2>(i0, i1));
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const
{
return coeff(array<Index, 3>(i0, i1, i2));
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const
{
return coeff(array<Index, 4>(i0, i1, i2, i3));
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const
{
return coeff(array<Index, 5>(i0, i1, i2, i3, i4));
}
#endif
// custom indices
#ifdef EIGEN_HAS_SFINAE
template<typename CustomIndices,
EIGEN_SFINAE_ENABLE_IF( !(isOfNormalIndex<CustomIndices>::value) )
>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(CustomIndices& indices) const
{
return coeff(internal::customIndices2Array<Index,NumIndices>(indices));
}
#endif
// normal indices
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(const array<Index, NumIndices>& indices) const
{
return coeff(indices);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const
{
eigen_internal_assert(index >= 0 && index < size());
return coeff(index);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()() const
{
EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
return coeff();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const
{
// The bracket operator is only for vectors, use the parenthesis operator instead.
EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE);
return coeff(index);
}
#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
inline Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices)
{
// The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
return operator()(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
}
#else
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1)
{
return coeffRef(array<Index, 2>(i0, i1));
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2)
{
return coeffRef(array<Index, 3>(i0, i1, i2));
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3)
{
return coeffRef(array<Index, 4>(i0, i1, i2, i3));
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4)
{
return coeffRef(array<Index, 5>(i0, i1, i2, i3, i4));
}
#endif
// normal indices
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(const array<Index, NumIndices>& indices)
{
return coeffRef(indices);
}
// custom indices
#ifdef EIGEN_HAS_SFINAE
template<typename CustomIndices,
EIGEN_SFINAE_ENABLE_IF( !(isOfNormalIndex<CustomIndices>::value) )
>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(CustomIndices& indices)
{
return coeffRef(internal::customIndices2Array<Index,NumIndices>(indices));
}
#endif
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index index)
{
eigen_assert(index >= 0 && index < size());
return coeffRef(index);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()()
{
EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
return coeffRef();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator[](Index index)
{
// The bracket operator is only for vectors, use the parenthesis operator instead
EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
return coeffRef(index);
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Tensor()
: m_storage()
{
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Tensor(const Self& other)
: m_storage(other.m_storage)
{
}
#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index firstDimension, IndexTypes... otherDimensions)
: m_storage(firstDimension, otherDimensions...)
{
// The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
}
#else
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(Index dim1)
: m_storage(dim1, array<Index, 1>(dim1))
{
EIGEN_STATIC_ASSERT(1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2)
: m_storage(dim1*dim2, array<Index, 2>(dim1, dim2))
{
EIGEN_STATIC_ASSERT(2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3)
: m_storage(dim1*dim2*dim3, array<Index, 3>(dim1, dim2, dim3))
{
EIGEN_STATIC_ASSERT(3 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3, Index dim4)
: m_storage(dim1*dim2*dim3*dim4, array<Index, 4>(dim1, dim2, dim3, dim4))
{
EIGEN_STATIC_ASSERT(4 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3, Index dim4, Index dim5)
: m_storage(dim1*dim2*dim3*dim4*dim5, array<Index, 5>(dim1, dim2, dim3, dim4, dim5))
{
EIGEN_STATIC_ASSERT(5 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
}
#endif
/** Normal Dimension */
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(const array<Index, NumIndices>& dimensions)
: m_storage(internal::array_prod(dimensions), dimensions)
{
EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
}
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Tensor(const TensorBase<OtherDerived, ReadOnlyAccessors>& other)
{
typedef TensorAssignOp<Tensor, const OtherDerived> Assign;
Assign assign(*this, other.derived());
resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
}
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Tensor(const TensorBase<OtherDerived, WriteAccessors>& other)
{
typedef TensorAssignOp<Tensor, const OtherDerived> Assign;
Assign assign(*this, other.derived());
resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
}
#if EIGEN_HAS_RVALUE_REFERENCES
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Tensor(Self&& other)
: m_storage(std::move(other.m_storage))
{
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Tensor& operator=(Self&& other)
{
m_storage = std::move(other.m_storage);
return *this;
}
#endif
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Tensor& operator=(const Tensor& other)
{
typedef TensorAssignOp<Tensor, const Tensor> Assign;
Assign assign(*this, other);
resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
return *this;
}
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Tensor& operator=(const OtherDerived& other)
{
typedef TensorAssignOp<Tensor, const OtherDerived> Assign;
Assign assign(*this, other);
resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
return *this;
}
#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes> EIGEN_DEVICE_FUNC
void resize(Index firstDimension, IndexTypes... otherDimensions)
{
// The number of dimensions used to resize a tensor must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
resize(array<Index, NumIndices>{{firstDimension, otherDimensions...}});
}
#endif
/** Normal Dimension */
EIGEN_DEVICE_FUNC void resize(const array<Index, NumIndices>& dimensions)
{
int i;
Index size = Index(1);
for (i = 0; i < NumIndices; i++) {
internal::check_rows_cols_for_overflow<Dynamic>::run(size, dimensions[i]);
size *= dimensions[i];
}
#ifdef EIGEN_INITIALIZE_COEFFS
bool size_changed = size != this->size();
m_storage.resize(size, dimensions);
if(size_changed) EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
#else
m_storage.resize(size, dimensions);
#endif
}
// Why this overload? DSizes is derived from array.
EIGEN_DEVICE_FUNC void resize(const DSizes<Index, NumIndices>& dimensions) {
array<Index, NumIndices> dims;
for (int i = 0; i < NumIndices; ++i) {
dims[i] = dimensions[i];
}
resize(dims);
}
EIGEN_DEVICE_FUNC
void resize()
{
EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
// Nothing to do: rank 0 tensors have fixed size
}
#ifdef EIGEN_HAS_INDEX_LIST
template <typename FirstType, typename... OtherTypes>
EIGEN_DEVICE_FUNC
void resize(const Eigen::IndexList<FirstType, OtherTypes...>& dimensions) {
array<Index, NumIndices> dims;
for (int i = 0; i < NumIndices; ++i) {
dims[i] = static_cast<Index>(dimensions[i]);
}
resize(dims);
}
#endif
/** Custom Dimension */
#ifdef EIGEN_HAS_SFINAE
template<typename CustomDimension,
EIGEN_SFINAE_ENABLE_IF( !(isOfNormalIndex<CustomDimension>::value) )
>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resize(CustomDimension& dimensions)
{
resize(internal::customIndices2Array<Index,NumIndices>(dimensions));
}
#endif
#ifndef EIGEN_EMULATE_CXX11_META_H
template <typename std::ptrdiff_t... Indices>
EIGEN_DEVICE_FUNC
void resize(const Sizes<Indices...>& dimensions) {
array<Index, NumIndices> dims;
for (int i = 0; i < NumIndices; ++i) {
dims[i] = static_cast<Index>(dimensions[i]);
}
resize(dims);
}
#else
template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5>
EIGEN_DEVICE_FUNC
void resize(const Sizes<V1, V2, V3, V4, V5>& dimensions) {
array<Index, NumIndices> dims;
for (int i = 0; i < NumIndices; ++i) {
dims[i] = static_cast<Index>(dimensions[i]);
}
resize(dims);
}
#endif
protected:
bool checkIndexRange(const array<Index, NumIndices>& indices) const
{
using internal::array_apply_and_reduce;
using internal::array_zip_and_reduce;
using internal::greater_equal_zero_op;
using internal::logical_and_op;
using internal::lesser_op;
return
// check whether the indices are all >= 0
array_apply_and_reduce<logical_and_op, greater_equal_zero_op>(indices) &&
// check whether the indices fit in the dimensions
array_zip_and_reduce<logical_and_op, lesser_op>(indices, m_storage.dimensions());
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index linearizedIndex(const array<Index, NumIndices>& indices) const
{
if (Options&RowMajor) {
return m_storage.dimensions().IndexOfRowMajor(indices);
} else {
return m_storage.dimensions().IndexOfColMajor(indices);
}
}
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_H
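
A short usage sketch of the class above (illustrative, not part of this commit): construction with explicit dimensions, coefficient access, expression assignment, and resize; all names are arbitrary.

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main()
{
  Eigen::Tensor<float, 3> a(2, 3, 4);       // rank-3, 2x3x4, column-major by default
  a.setConstant(1.0f);
  a(0, 1, 2) = 5.0f;                        // coefficient access via operator()

  Eigen::Tensor<float, 3> b(2, 3, 4);
  b.setRandom();

  // Assigning an expression evaluates it; the constructor resizes the destination.
  Eigen::Tensor<float, 3> c = a + b * 2.0f;

  std::cout << "rank: " << c.rank()
            << ", size: " << c.size()
            << ", dim(1): " << c.dimension(1) << "\n";

  a.resize(4, 3, 2);                        // contents are not guaranteed to be preserved
}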


@@ -0,0 +1,329 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015 Eugene Brevdo <ebrevdo@gmail.com>
// Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H
#define EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H
namespace Eigen {
namespace internal {
/** \class TensorIndexTuple
* \ingroup CXX11_Tensor_Module
*
* \brief Tensor + Index Tuple class.
*
*
*/
template<typename XprType>
struct traits<TensorIndexTupleOp<XprType> > : public traits<XprType>
{
typedef traits<XprType> XprTraits;
typedef typename XprTraits::StorageKind StorageKind;
typedef typename XprTraits::Index Index;
typedef Tuple<Index, typename XprTraits::Scalar> Scalar;
typedef typename XprType::Nested Nested;
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int Layout = XprTraits::Layout;
};
template<typename XprType>
struct eval<TensorIndexTupleOp<XprType>, Eigen::Dense>
{
typedef const TensorIndexTupleOp<XprType> EIGEN_DEVICE_REF type;
};
template<typename XprType>
struct nested<TensorIndexTupleOp<XprType>, 1,
typename eval<TensorIndexTupleOp<XprType> >::type>
{
typedef TensorIndexTupleOp<XprType> type;
};
} // end namespace internal
template<typename XprType>
class TensorIndexTupleOp : public TensorBase<TensorIndexTupleOp<XprType>, ReadOnlyAccessors>
{
public:
typedef typename Eigen::internal::traits<TensorIndexTupleOp>::Scalar Scalar;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename Eigen::internal::nested<TensorIndexTupleOp>::type Nested;
typedef typename Eigen::internal::traits<TensorIndexTupleOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorIndexTupleOp>::Index Index;
typedef Tuple<Index, typename XprType::CoeffReturnType> CoeffReturnType;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIndexTupleOp(const XprType& expr)
: m_xpr(expr) {}
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename XprType::Nested>::type&
expression() const { return m_xpr; }
protected:
typename XprType::Nested m_xpr;
};
// Eval as rvalue
template<typename ArgType, typename Device>
struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device>
{
typedef TensorIndexTupleOp<ArgType> XprType;
typedef typename XprType::Index Index;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
static const int NumDims = internal::array_size<Dimensions>::value;
typedef StorageMemory<CoeffReturnType, Device> Storage;
typedef typename Storage::Type EvaluatorPointerType;
enum {
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false,
BlockAccess = false,
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlock;
//===--------------------------------------------------------------------===//
EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device) { }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
return m_impl.dimensions();
}
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
m_impl.evalSubExprsIfNeeded(NULL);
return true;
}
EIGEN_STRONG_INLINE void cleanup() {
m_impl.cleanup();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
{
return CoeffReturnType(index, m_impl.coeff(index));
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
costPerCoeff(bool vectorized) const {
return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, 1);
}
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif
protected:
TensorEvaluator<ArgType, Device> m_impl;
};
namespace internal {
/** \class TensorTupleIndex
* \ingroup CXX11_Tensor_Module
*
* \brief Converts to Tensor<Tuple<Index, Scalar> > and reduces to Tensor<Index>.
*
*/
template<typename ReduceOp, typename Dims, typename XprType>
struct traits<TensorTupleReducerOp<ReduceOp, Dims, XprType> > : public traits<XprType>
{
typedef traits<XprType> XprTraits;
typedef typename XprTraits::StorageKind StorageKind;
typedef typename XprTraits::Index Index;
typedef Index Scalar;
typedef typename XprType::Nested Nested;
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
static const int Layout = XprTraits::Layout;
};
template<typename ReduceOp, typename Dims, typename XprType>
struct eval<TensorTupleReducerOp<ReduceOp, Dims, XprType>, Eigen::Dense>
{
typedef const TensorTupleReducerOp<ReduceOp, Dims, XprType> EIGEN_DEVICE_REF type;
};
template<typename ReduceOp, typename Dims, typename XprType>
struct nested<TensorTupleReducerOp<ReduceOp, Dims, XprType>, 1,
typename eval<TensorTupleReducerOp<ReduceOp, Dims, XprType> >::type>
{
typedef TensorTupleReducerOp<ReduceOp, Dims, XprType> type;
};
} // end namespace internal
template<typename ReduceOp, typename Dims, typename XprType>
class TensorTupleReducerOp : public TensorBase<TensorTupleReducerOp<ReduceOp, Dims, XprType>, ReadOnlyAccessors>
{
public:
typedef typename Eigen::internal::traits<TensorTupleReducerOp>::Scalar Scalar;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename Eigen::internal::nested<TensorTupleReducerOp>::type Nested;
typedef typename Eigen::internal::traits<TensorTupleReducerOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorTupleReducerOp>::Index Index;
typedef Index CoeffReturnType;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTupleReducerOp(const XprType& expr,
const ReduceOp& reduce_op,
const Index return_dim,
const Dims& reduce_dims)
: m_xpr(expr), m_reduce_op(reduce_op), m_return_dim(return_dim), m_reduce_dims(reduce_dims) {}
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename XprType::Nested>::type&
expression() const { return m_xpr; }
EIGEN_DEVICE_FUNC
const ReduceOp& reduce_op() const { return m_reduce_op; }
EIGEN_DEVICE_FUNC
const Dims& reduce_dims() const { return m_reduce_dims; }
EIGEN_DEVICE_FUNC
Index return_dim() const { return m_return_dim; }
protected:
typename XprType::Nested m_xpr;
const ReduceOp m_reduce_op;
const Index m_return_dim;
const Dims m_reduce_dims;
};
// Eval as rvalue
template<typename ReduceOp, typename Dims, typename ArgType, typename Device>
struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Device>
{
typedef TensorTupleReducerOp<ReduceOp, Dims, ArgType> XprType;
typedef typename XprType::Index Index;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename TensorIndexTupleOp<ArgType>::CoeffReturnType TupleType;
typedef typename TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device>::Dimensions Dimensions;
typedef typename TensorEvaluator<const TensorIndexTupleOp<ArgType> , Device>::Dimensions InputDimensions;
static const int NumDims = internal::array_size<InputDimensions>::value;
typedef array<Index, NumDims> StrideDims;
typedef StorageMemory<CoeffReturnType, Device> Storage;
typedef typename Storage::Type EvaluatorPointerType;
typedef StorageMemory<TupleType, Device> TupleStorageMem;
enum {
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false,
BlockAccess = false,
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
Layout = TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlock;
//===--------------------------------------------------------------------===//
EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_orig_impl(op.expression(), device),
m_impl(op.expression().index_tuples().reduce(op.reduce_dims(), op.reduce_op()), device),
m_return_dim(op.return_dim())
{
gen_strides(m_orig_impl.dimensions(), m_strides);
if (Layout == static_cast<int>(ColMajor)) {
const Index total_size = internal::array_prod(m_orig_impl.dimensions());
m_stride_mod = (m_return_dim < NumDims - 1) ? m_strides[m_return_dim + 1] : total_size;
} else {
const Index total_size = internal::array_prod(m_orig_impl.dimensions());
m_stride_mod = (m_return_dim > 0) ? m_strides[m_return_dim - 1] : total_size;
}
// If m_return_dim is not a valid index, fall back to 1; otherwise this can crash on Windows.
m_stride_div = ((m_return_dim >= 0) &&
(m_return_dim < static_cast<Index>(m_strides.size())))
? m_strides[m_return_dim] : 1;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
return m_impl.dimensions();
}
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
m_impl.evalSubExprsIfNeeded(NULL);
return true;
}
EIGEN_STRONG_INLINE void cleanup() {
m_impl.cleanup();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
const TupleType v = m_impl.coeff(index);
return (m_return_dim < 0) ? v.first : (v.first % m_stride_mod) / m_stride_div;
}
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
m_orig_impl.bind(cgh);
}
#endif
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
costPerCoeff(bool vectorized) const {
const double compute_cost = 1.0 +
(m_return_dim < 0 ? 0.0 : (TensorOpCost::ModCost<Index>() + TensorOpCost::DivCost<Index>()));
return m_orig_impl.costPerCoeff(vectorized) +
m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost);
}
private:
EIGEN_DEVICE_FUNC void gen_strides(const InputDimensions& dims, StrideDims& strides) {
if (m_return_dim < 0) {
return; // Won't be using the strides.
}
eigen_assert(m_return_dim < NumDims &&
"Asking to convert index to a dimension outside of the rank");
// Calculate m_stride_div and m_stride_mod, which are used to
// calculate the value of an index w.r.t. the m_return_dim.
if (Layout == static_cast<int>(ColMajor)) {
strides[0] = 1;
for (int i = 1; i < NumDims; ++i) {
strides[i] = strides[i-1] * dims[i-1];
}
} else {
strides[NumDims-1] = 1;
for (int i = NumDims - 2; i >= 0; --i) {
strides[i] = strides[i+1] * dims[i+1];
}
}
}
protected:
TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device> m_orig_impl;
TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device> m_impl;
const Index m_return_dim;
StrideDims m_strides;
Index m_stride_mod;
Index m_stride_div;
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H
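
The TensorTupleReducerOp above backs the tensor argmax/argmin API; a hedged usage sketch follows (not part of this commit, variable names arbitrary).

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main()
{
  Eigen::Tensor<float, 2> t(3, 4);
  t.setRandom();
  t(1, 2) = 100.0f;                                   // make the global maximum obvious

  // Global argmax: a rank-0 tensor holding the flat (column-major) index.
  Eigen::Tensor<Eigen::DenseIndex, 0> flat = t.argmax();
  std::cout << "flat index of max: " << flat() << "\n";

  // Reduce over dimension 0: one index per column.
  Eigen::Tensor<Eigen::DenseIndex, 1> rows = t.argmax(0);
  std::cout << "row of max in column 2: " << rows(2) << "\n";
}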


@@ -0,0 +1,247 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H
#define EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H
namespace Eigen {
/** \class TensorAssign
* \ingroup CXX11_Tensor_Module
*
* \brief The tensor assignment class.
*
* This class represents the assignment of the values resulting from the evaluation of
* the rhs expression to the memory locations denoted by the lhs expression.
*/
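// Minimal usage sketch (illustrative, assuming the Tensor module is included):
// assignment expressions are normally built implicitly by operator= on tensors,
// e.g.
//
//   Eigen::Tensor<float, 2> a(3, 4), b(3, 4);
//   b.setRandom();
//   a = b * 2.0f;  // creates a TensorAssignOp evaluated by the assign evaluator below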
namespace internal {
template<typename LhsXprType, typename RhsXprType>
struct traits<TensorAssignOp<LhsXprType, RhsXprType> >
{
typedef typename LhsXprType::Scalar Scalar;
typedef typename traits<LhsXprType>::StorageKind StorageKind;
typedef typename promote_index_type<typename traits<LhsXprType>::Index,
typename traits<RhsXprType>::Index>::type Index;
typedef typename LhsXprType::Nested LhsNested;
typedef typename RhsXprType::Nested RhsNested;
typedef typename remove_reference<LhsNested>::type _LhsNested;
typedef typename remove_reference<RhsNested>::type _RhsNested;
static const std::size_t NumDimensions = internal::traits<LhsXprType>::NumDimensions;
static const int Layout = internal::traits<LhsXprType>::Layout;
typedef typename traits<LhsXprType>::PointerType PointerType;
enum {
Flags = 0
};
};
template<typename LhsXprType, typename RhsXprType>
struct eval<TensorAssignOp<LhsXprType, RhsXprType>, Eigen::Dense>
{
typedef const TensorAssignOp<LhsXprType, RhsXprType>& type;
};
template<typename LhsXprType, typename RhsXprType>
struct nested<TensorAssignOp<LhsXprType, RhsXprType>, 1, typename eval<TensorAssignOp<LhsXprType, RhsXprType> >::type>
{
typedef TensorAssignOp<LhsXprType, RhsXprType> type;
};
} // end namespace internal
template<typename LhsXprType, typename RhsXprType>
class TensorAssignOp : public TensorBase<TensorAssignOp<LhsXprType, RhsXprType> >
{
public:
typedef typename Eigen::internal::traits<TensorAssignOp>::Scalar Scalar;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename LhsXprType::CoeffReturnType CoeffReturnType;
typedef typename Eigen::internal::nested<TensorAssignOp>::type Nested;
typedef typename Eigen::internal::traits<TensorAssignOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorAssignOp>::Index Index;
static const int NumDims = Eigen::internal::traits<TensorAssignOp>::NumDimensions;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorAssignOp(LhsXprType& lhs, const RhsXprType& rhs)
: m_lhs_xpr(lhs), m_rhs_xpr(rhs) {}
/** \returns the nested expressions */
EIGEN_DEVICE_FUNC
typename internal::remove_all<typename LhsXprType::Nested>::type&
lhsExpression() const { return *((typename internal::remove_all<typename LhsXprType::Nested>::type*)&m_lhs_xpr); }
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename RhsXprType::Nested>::type&
rhsExpression() const { return m_rhs_xpr; }
protected:
typename internal::remove_all<typename LhsXprType::Nested>::type& m_lhs_xpr;
const typename internal::remove_all<typename RhsXprType::Nested>::type& m_rhs_xpr;
};
template<typename LeftArgType, typename RightArgType, typename Device>
struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
{
typedef TensorAssignOp<LeftArgType, RightArgType> XprType;
typedef typename XprType::Index Index;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
typedef typename TensorEvaluator<RightArgType, Device>::Dimensions Dimensions;
typedef StorageMemory<CoeffReturnType, Device> Storage;
typedef typename Storage::Type EvaluatorPointerType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
static const int NumDims = XprType::NumDims;
enum {
IsAligned = int(TensorEvaluator<LeftArgType, Device>::IsAligned) &
int(TensorEvaluator<RightArgType, Device>::IsAligned),
PacketAccess = int(TensorEvaluator<LeftArgType, Device>::PacketAccess) &
int(TensorEvaluator<RightArgType, Device>::PacketAccess),
BlockAccess = int(TensorEvaluator<LeftArgType, Device>::BlockAccess) &
int(TensorEvaluator<RightArgType, Device>::BlockAccess),
PreferBlockAccess = int(TensorEvaluator<LeftArgType, Device>::PreferBlockAccess) |
int(TensorEvaluator<RightArgType, Device>::PreferBlockAccess),
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
RawAccess = TensorEvaluator<LeftArgType, Device>::RawAccess
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
typedef typename TensorEvaluator<const RightArgType, Device>::TensorBlock
RightTensorBlock;
//===--------------------------------------------------------------------===//
TensorEvaluator(const XprType& op, const Device& device) :
m_leftImpl(op.lhsExpression(), device),
m_rightImpl(op.rhsExpression(), device)
{
EIGEN_STATIC_ASSERT(
(static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) ==
static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)),
YOU_MADE_A_PROGRAMMING_MISTAKE);
}
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
{
// The dimensions of the lhs and the rhs tensors should be equal to prevent
// overflows and ensure the result is fully initialized.
// TODO: use left impl instead if right impl dimensions are known at compile time.
return m_rightImpl.dimensions();
}
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions()));
m_leftImpl.evalSubExprsIfNeeded(NULL);
// If the lhs provides raw access to its storage area (i.e. if m_leftImpl.data() returns a non
// null value), attempt to evaluate the rhs expression in place. Returns true iff in place
// evaluation isn't supported and the caller still needs to manually assign the values generated
// by the rhs to the lhs.
return m_rightImpl.evalSubExprsIfNeeded(m_leftImpl.data());
}
#ifdef EIGEN_USE_THREADS
template <typename EvalSubExprsCallback>
EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
EvaluatorPointerType, EvalSubExprsCallback done) {
m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done](bool) {
m_rightImpl.evalSubExprsIfNeededAsync(
m_leftImpl.data(), [done](bool need_assign) { done(need_assign); });
});
}
#endif // EIGEN_USE_THREADS
EIGEN_STRONG_INLINE void cleanup() {
m_leftImpl.cleanup();
m_rightImpl.cleanup();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) {
m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i);
}
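// evalPacket copies PacketSize coefficients at once, using aligned loads/stores
// only when the corresponding evaluator guarantees alignment.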
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) {
const int LhsStoreMode = TensorEvaluator<LeftArgType, Device>::IsAligned ? Aligned : Unaligned;
const int RhsLoadMode = TensorEvaluator<RightArgType, Device>::IsAligned ? Aligned : Unaligned;
m_leftImpl.template writePacket<LhsStoreMode>(i, m_rightImpl.template packet<RhsLoadMode>(i));
}
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
{
return m_leftImpl.coeff(index);
}
template<int LoadMode>
EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const
{
return m_leftImpl.template packet<LoadMode>(index);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
costPerCoeff(bool vectorized) const {
// We assume that evalPacket or evalScalar is called to perform the
// assignment and account for the cost of the write here, but reduce left
// cost by one load because we are using m_leftImpl.coeffRef.
TensorOpCost left = m_leftImpl.costPerCoeff(vectorized);
return m_rightImpl.costPerCoeff(vectorized) +
TensorOpCost(
numext::maxi(0.0, left.bytes_loaded() - sizeof(CoeffReturnType)),
left.bytes_stored(), left.compute_cycles()) +
TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
internal::TensorBlockResourceRequirements getResourceRequirements() const {
return internal::TensorBlockResourceRequirements::merge(
m_leftImpl.getResourceRequirements(),
m_rightImpl.getResourceRequirements());
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(
TensorBlockDesc& desc, TensorBlockScratch& scratch) {
if (TensorEvaluator<LeftArgType, Device>::RawAccess &&
m_leftImpl.data() != NULL) {
// If destination has raw data access, we pass it as a potential
// destination for a block descriptor evaluation.
desc.template AddDestinationBuffer<Layout>(
/*dst_base=*/m_leftImpl.data() + desc.offset(),
/*dst_strides=*/internal::strides<Layout>(m_leftImpl.dimensions()));
}
RightTensorBlock block = m_rightImpl.block(desc, scratch, /*root_of_expr_ast=*/true);
// If block was evaluated into a destination, there is no need to do assignment.
if (block.kind() != internal::TensorBlockKind::kMaterializedInOutput) {
m_leftImpl.writeBlock(desc, block);
}
block.cleanup();
}
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_leftImpl.bind(cgh);
m_rightImpl.bind(cgh);
}
#endif
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_leftImpl.data(); }
private:
TensorEvaluator<LeftArgType, Device> m_leftImpl;
TensorEvaluator<RightArgType, Device> m_rightImpl;
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -0,0 +1,518 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H
#define EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H
namespace Eigen {
/** \class TensorChippingOp
  * \ingroup CXX11_Tensor_Module
  *
  * \brief A chip is a thin slice of a tensor; in a 2-d tensor it corresponds to a column or a row.
  */
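// Minimal usage sketch (illustrative, assuming the Tensor module is included):
// a TensorChippingOp is normally created through TensorBase::chip(), e.g.
//
//   Eigen::Tensor<float, 3> t(4, 5, 6);
//   t.setRandom();
//   Eigen::Tensor<float, 2> slice = t.chip(2, 1);  // 4x6 slice at offset 2 along dimension 1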
namespace internal {
template<DenseIndex DimId, typename XprType>
struct traits<TensorChippingOp<DimId, XprType> > : public traits<XprType>
{
typedef typename XprType::Scalar Scalar;
typedef traits<XprType> XprTraits;
typedef typename XprTraits::StorageKind StorageKind;
typedef typename XprTraits::Index Index;
typedef typename XprType::Nested Nested;
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = XprTraits::NumDimensions - 1;
static const int Layout = XprTraits::Layout;
typedef typename XprTraits::PointerType PointerType;
};
template<DenseIndex DimId, typename XprType>
struct eval<TensorChippingOp<DimId, XprType>, Eigen::Dense>
{
typedef const TensorChippingOp<DimId, XprType> EIGEN_DEVICE_REF type;
};
template<DenseIndex DimId, typename XprType>
struct nested<TensorChippingOp<DimId, XprType>, 1, typename eval<TensorChippingOp<DimId, XprType> >::type>
{
typedef TensorChippingOp<DimId, XprType> type;
};
template <DenseIndex DimId>
struct DimensionId
{
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) {
EIGEN_UNUSED_VARIABLE(dim);
eigen_assert(dim == DimId);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const {
return DimId;
}
};
template <>
struct DimensionId<Dynamic>
{
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) : actual_dim(dim) {
eigen_assert(dim >= 0);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const {
return actual_dim;
}
private:
const DenseIndex actual_dim;
};
} // end namespace internal
template<DenseIndex DimId, typename XprType>
class TensorChippingOp : public TensorBase<TensorChippingOp<DimId, XprType> >
{
public:
typedef TensorBase<TensorChippingOp<DimId, XprType> > Base;
typedef typename Eigen::internal::traits<TensorChippingOp>::Scalar Scalar;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename Eigen::internal::nested<TensorChippingOp>::type Nested;
typedef typename Eigen::internal::traits<TensorChippingOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorChippingOp>::Index Index;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp(const XprType& expr, const Index offset, const Index dim)
: m_xpr(expr), m_offset(offset), m_dim(dim) {
}
EIGEN_DEVICE_FUNC
const Index offset() const { return m_offset; }
EIGEN_DEVICE_FUNC
const Index dim() const { return m_dim.actualDim(); }
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename XprType::Nested>::type&
expression() const { return m_xpr; }
EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorChippingOp)
protected:
typename XprType::Nested m_xpr;
const Index m_offset;
const internal::DimensionId<DimId> m_dim;
};
// Eval as rvalue
template<DenseIndex DimId, typename ArgType, typename Device>
struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
{
typedef TensorChippingOp<DimId, ArgType> XprType;
static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
static const int NumDims = NumInputDims-1;
typedef typename XprType::Index Index;
typedef DSizes<Index, NumDims> Dimensions;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
typedef StorageMemory<CoeffReturnType, Device> Storage;
typedef typename Storage::Type EvaluatorPointerType;
enum {
// Alignment can't be guaranteed at compile time since it depends on the
// slice offsets.
IsAligned = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
// Chipping of outer-most dimension is a trivial operation, because we can
// read and write directly from the underlying tensor using a single offset.
IsOuterChipping = (static_cast<int>(Layout) == ColMajor && DimId == NumInputDims - 1) ||
(static_cast<int>(Layout) == RowMajor && DimId == 0),
// Chipping inner-most dimension.
IsInnerChipping = (static_cast<int>(Layout) == ColMajor && DimId == 0) ||
(static_cast<int>(Layout) == RowMajor && DimId == NumInputDims - 1),
// Prefer block access if the underlying expression prefers it, otherwise
// only if chipping is not trivial.
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess ||
!IsOuterChipping,
CoordAccess = false, // to be implemented
RawAccess = false
};
typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
typedef internal::TensorBlockDescriptor<NumInputDims, Index>
ArgTensorBlockDesc;
typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
ArgTensorBlock;
typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
Layout, Index>
TensorBlock;
//===--------------------------------------------------------------------===//
EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_dim(op.dim()), m_device(device)
{
EIGEN_STATIC_ASSERT((NumInputDims >= 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
eigen_assert(NumInputDims > m_dim.actualDim());
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
eigen_assert(op.offset() < input_dims[m_dim.actualDim()]);
int j = 0;
for (int i = 0; i < NumInputDims; ++i) {
if (i != m_dim.actualDim()) {
m_dimensions[j] = input_dims[i];
++j;
}
}
m_stride = 1;
m_inputStride = 1;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = 0; i < m_dim.actualDim(); ++i) {
m_stride *= input_dims[i];
m_inputStride *= input_dims[i];
}
} else {
for (int i = NumInputDims-1; i > m_dim.actualDim(); --i) {
m_stride *= input_dims[i];
m_inputStride *= input_dims[i];
}
}
m_inputStride *= input_dims[m_dim.actualDim()];
m_inputOffset = m_stride * op.offset();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
m_impl.evalSubExprsIfNeeded(NULL);
return true;
}
EIGEN_STRONG_INLINE void cleanup() {
m_impl.cleanup();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
{
return m_impl.coeff(srcCoeff(index));
}
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
if (isInnerChipping()) {
// m_stride is equal to 1, so let's avoid the integer division.
eigen_assert(m_stride == 1);
Index inputIndex = index * m_inputStride + m_inputOffset;
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
EIGEN_UNROLL_LOOP
for (int i = 0; i < PacketSize; ++i) {
values[i] = m_impl.coeff(inputIndex);
inputIndex += m_inputStride;
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
return rslt;
} else if (isOuterChipping()) {
// m_stride is always greater than index, so let's avoid the integer division.
eigen_assert(m_stride > index);
return m_impl.template packet<LoadMode>(index + m_inputOffset);
} else {
const Index idx = index / m_stride;
const Index rem = index - idx * m_stride;
if (rem + PacketSize <= m_stride) {
Index inputIndex = idx * m_inputStride + m_inputOffset + rem;
return m_impl.template packet<LoadMode>(inputIndex);
} else {
// Cross the stride boundary. Fallback to slow path.
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
EIGEN_UNROLL_LOOP
for (int i = 0; i < PacketSize; ++i) {
values[i] = coeff(index);
++index;
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
return rslt;
}
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
costPerCoeff(bool vectorized) const {
double cost = 0;
if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) &&
m_dim.actualDim() == 0) ||
(static_cast<int>(Layout) == static_cast<int>(RowMajor) &&
m_dim.actualDim() == NumInputDims - 1)) {
cost += TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
} else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) &&
m_dim.actualDim() == NumInputDims - 1) ||
(static_cast<int>(Layout) == static_cast<int>(RowMajor) &&
m_dim.actualDim() == 0)) {
cost += TensorOpCost::AddCost<Index>();
} else {
cost += 3 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>() +
3 * TensorOpCost::AddCost<Index>();
}
return m_impl.costPerCoeff(vectorized) +
TensorOpCost(0, 0, cost, vectorized, PacketSize);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
internal::TensorBlockResourceRequirements getResourceRequirements() const {
const size_t target_size = m_device.lastLevelCacheSize();
return internal::TensorBlockResourceRequirements::merge(
internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size),
m_impl.getResourceRequirements());
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
bool root_of_expr_ast = false) const {
const Index chip_dim = m_dim.actualDim();
DSizes<Index, NumInputDims> input_block_dims;
for (int i = 0; i < NumInputDims; ++i) {
input_block_dims[i]
= i < chip_dim ? desc.dimension(i)
: i > chip_dim ? desc.dimension(i - 1)
: 1;
}
ArgTensorBlockDesc arg_desc(srcCoeff(desc.offset()), input_block_dims);
// Try to reuse destination buffer for materializing argument block.
if (desc.HasDestinationBuffer()) {
DSizes<Index, NumInputDims> arg_destination_strides;
for (int i = 0; i < NumInputDims; ++i) {
arg_destination_strides[i]
= i < chip_dim ? desc.destination().strides()[i]
: i > chip_dim ? desc.destination().strides()[i - 1]
: 0; // for dimensions of size `1` stride should never be used.
}
arg_desc.template AddDestinationBuffer<Layout>(
desc.destination().template data<ScalarNoConst>(),
arg_destination_strides);
}
ArgTensorBlock arg_block = m_impl.block(arg_desc, scratch, root_of_expr_ast);
if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer();
if (arg_block.data() != NULL) {
// Forward argument block buffer if possible.
return TensorBlock(arg_block.kind(), arg_block.data(),
desc.dimensions());
} else {
// Assign argument block expression to a buffer.
// Prepare storage for the materialized chipping result.
const typename TensorBlock::Storage block_storage =
TensorBlock::prepareStorage(desc, scratch);
typedef internal::TensorBlockAssignment<
ScalarNoConst, NumInputDims, typename ArgTensorBlock::XprType, Index>
TensorBlockAssignment;
TensorBlockAssignment::Run(
TensorBlockAssignment::target(
arg_desc.dimensions(),
internal::strides<Layout>(arg_desc.dimensions()),
block_storage.data()),
arg_block.expr());
return block_storage.AsTensorMaterializedBlock();
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const {
typename Storage::Type result = constCast(m_impl.data());
if (isOuterChipping() && result) {
return result + m_inputOffset;
} else {
return NULL;
}
}
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif
protected:
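// srcCoeff maps an index in the chipped (output) tensor to the corresponding
// index in the input tensor. Illustrative example (not from the original
// source): chipping dimension 1 at offset 2 of a ColMajor 4x5x6 tensor gives
// m_stride == 4, m_inputStride == 20 and m_inputOffset == 8, so output index 9
// (coordinates 1 and 2) maps to 2 * 20 + 8 + 1 == 49 in the input.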
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
{
Index inputIndex;
if (isInnerChipping()) {
// m_stride is equal to 1, so let's avoid the integer division.
eigen_assert(m_stride == 1);
inputIndex = index * m_inputStride + m_inputOffset;
} else if (isOuterChipping()) {
// m_stride is always greater than index, so let's avoid the integer
// division.
eigen_assert(m_stride > index);
inputIndex = index + m_inputOffset;
} else {
const Index idx = index / m_stride;
inputIndex = idx * m_inputStride + m_inputOffset;
index -= idx * m_stride;
inputIndex += index;
}
return inputIndex;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isInnerChipping() const {
return IsInnerChipping ||
(static_cast<int>(Layout) == ColMajor && m_dim.actualDim() == 0) ||
(static_cast<int>(Layout) == RowMajor && m_dim.actualDim() == NumInputDims - 1);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isOuterChipping() const {
return IsOuterChipping ||
(static_cast<int>(Layout) == ColMajor && m_dim.actualDim() == NumInputDims-1) ||
(static_cast<int>(Layout) == RowMajor && m_dim.actualDim() == 0);
}
Dimensions m_dimensions;
Index m_stride;
Index m_inputOffset;
Index m_inputStride;
TensorEvaluator<ArgType, Device> m_impl;
const internal::DimensionId<DimId> m_dim;
const Device EIGEN_DEVICE_REF m_device;
};
// Eval as lvalue
template<DenseIndex DimId, typename ArgType, typename Device>
struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
: public TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
{
typedef TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> Base;
typedef TensorChippingOp<DimId, ArgType> XprType;
static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
static const int NumDims = NumInputDims-1;
typedef typename XprType::Index Index;
typedef DSizes<Index, NumDims> Dimensions;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = TensorEvaluator<ArgType, Device>::RawAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
//===--------------------------------------------------------------------===//
EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: Base(op, device)
{ }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
{
return this->m_impl.coeffRef(this->srcCoeff(index));
}
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
{
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
if (this->isInnerChipping()) {
// m_stride is equal to 1, so let's avoid the integer division.
eigen_assert(this->m_stride == 1);
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
Index inputIndex = index * this->m_inputStride + this->m_inputOffset;
EIGEN_UNROLL_LOOP
for (int i = 0; i < PacketSize; ++i) {
this->m_impl.coeffRef(inputIndex) = values[i];
inputIndex += this->m_inputStride;
}
} else if (this->isOuterChipping()) {
// m_stride is always greater than index, so let's avoid the integer division.
eigen_assert(this->m_stride > index);
this->m_impl.template writePacket<StoreMode>(index + this->m_inputOffset, x);
} else {
const Index idx = index / this->m_stride;
const Index rem = index - idx * this->m_stride;
if (rem + PacketSize <= this->m_stride) {
const Index inputIndex = idx * this->m_inputStride + this->m_inputOffset + rem;
this->m_impl.template writePacket<StoreMode>(inputIndex, x);
} else {
// Cross stride boundary. Fallback to slow path.
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
EIGEN_UNROLL_LOOP
for (int i = 0; i < PacketSize; ++i) {
this->coeffRef(index) = values[i];
++index;
}
}
}
}
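// writeBlock stores a tensor block directly into the underlying input storage:
// the block is reshaped to the input rank (with a unit dimension re-inserted at
// the chip dimension) and assigned at the offset given by srcCoeff(desc.offset()).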
template <typename TensorBlock>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
const TensorBlockDesc& desc, const TensorBlock& block) {
assert(this->m_impl.data() != NULL);
const Index chip_dim = this->m_dim.actualDim();
DSizes<Index, NumInputDims> input_block_dims;
for (int i = 0; i < NumInputDims; ++i) {
input_block_dims[i] = i < chip_dim ? desc.dimension(i)
: i > chip_dim ? desc.dimension(i - 1)
: 1;
}
typedef TensorReshapingOp<const DSizes<Index, NumInputDims>,
const typename TensorBlock::XprType>
TensorBlockExpr;
typedef internal::TensorBlockAssignment<Scalar, NumInputDims,
TensorBlockExpr, Index>
TensorBlockAssign;
TensorBlockAssign::Run(
TensorBlockAssign::target(
input_block_dims,
internal::strides<Layout>(this->m_impl.dimensions()),
this->m_impl.data(), this->srcCoeff(desc.offset())),
block.expr().reshape(input_block_dims));
}
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H


@@ -0,0 +1,377 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H
namespace Eigen {
/** \class TensorConcatenationOp
  * \ingroup CXX11_Tensor_Module
  *
  * \brief Tensor concatenation class: joins two tensors along a given axis.
  */
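// Minimal usage sketch (illustrative, assuming the Tensor module is included):
// a TensorConcatenationOp is normally created through TensorBase::concatenate(), e.g.
//
//   Eigen::Tensor<int, 2> a(2, 3), b(2, 4);
//   a.setConstant(1);
//   b.setConstant(2);
//   Eigen::Tensor<int, 2> c = a.concatenate(b, 1);  // 2x7: columns 0-2 from a, 3-6 from b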
namespace internal {
template<typename Axis, typename LhsXprType, typename RhsXprType>
struct traits<TensorConcatenationOp<Axis, LhsXprType, RhsXprType> >
{
// Type promotion to handle the case where the types of the lhs and the rhs are different.
typedef typename promote_storage_type<typename LhsXprType::Scalar,
typename RhsXprType::Scalar>::ret Scalar;
typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind,
typename traits<RhsXprType>::StorageKind>::ret StorageKind;
typedef typename promote_index_type<typename traits<LhsXprType>::Index,
typename traits<RhsXprType>::Index>::type Index;
typedef typename LhsXprType::Nested LhsNested;
typedef typename RhsXprType::Nested RhsNested;
typedef typename remove_reference<LhsNested>::type _LhsNested;
typedef typename remove_reference<RhsNested>::type _RhsNested;
static const int NumDimensions = traits<LhsXprType>::NumDimensions;
static const int Layout = traits<LhsXprType>::Layout;
enum { Flags = 0 };
typedef typename conditional<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val,
typename traits<LhsXprType>::PointerType, typename traits<RhsXprType>::PointerType>::type PointerType;
};
template<typename Axis, typename LhsXprType, typename RhsXprType>
struct eval<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, Eigen::Dense>
{
typedef const TensorConcatenationOp<Axis, LhsXprType, RhsXprType>& type;
};
template<typename Axis, typename LhsXprType, typename RhsXprType>
struct nested<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, 1, typename eval<TensorConcatenationOp<Axis, LhsXprType, RhsXprType> >::type>
{
typedef TensorConcatenationOp<Axis, LhsXprType, RhsXprType> type;
};
} // end namespace internal
template<typename Axis, typename LhsXprType, typename RhsXprType>
class TensorConcatenationOp : public TensorBase<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, WriteAccessors>
{
public:
typedef TensorBase<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, WriteAccessors> Base;
typedef typename internal::traits<TensorConcatenationOp>::Scalar Scalar;
typedef typename internal::traits<TensorConcatenationOp>::StorageKind StorageKind;
typedef typename internal::traits<TensorConcatenationOp>::Index Index;
typedef typename internal::nested<TensorConcatenationOp>::type Nested;
typedef typename internal::promote_storage_type<typename LhsXprType::CoeffReturnType,
typename RhsXprType::CoeffReturnType>::ret CoeffReturnType;
typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConcatenationOp(const LhsXprType& lhs, const RhsXprType& rhs, Axis axis)
: m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_axis(axis) {}
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename LhsXprType::Nested>::type&
lhsExpression() const { return m_lhs_xpr; }
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename RhsXprType::Nested>::type&
rhsExpression() const { return m_rhs_xpr; }
EIGEN_DEVICE_FUNC const Axis& axis() const { return m_axis; }
EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorConcatenationOp)
protected:
typename LhsXprType::Nested m_lhs_xpr;
typename RhsXprType::Nested m_rhs_xpr;
const Axis m_axis;
};
// Eval as rvalue
template<typename Axis, typename LeftArgType, typename RightArgType, typename Device>
struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgType>, Device>
{
typedef TensorConcatenationOp<Axis, LeftArgType, RightArgType> XprType;
typedef typename XprType::Index Index;
static const int NumDims = internal::array_size<typename TensorEvaluator<LeftArgType, Device>::Dimensions>::value;
static const int RightNumDims = internal::array_size<typename TensorEvaluator<RightArgType, Device>::Dimensions>::value;
typedef DSizes<Index, NumDims> Dimensions;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
typedef StorageMemory<CoeffReturnType, Device> Storage;
typedef typename Storage::Type EvaluatorPointerType;
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess &&
TensorEvaluator<RightArgType, Device>::PacketAccess,
BlockAccess = false,
PreferBlockAccess = TensorEvaluator<LeftArgType, Device>::PreferBlockAccess ||
TensorEvaluator<RightArgType, Device>::PreferBlockAccess,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlock;
//===--------------------------------------------------------------------===//
EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_axis(op.axis())
{
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout) || NumDims == 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT((NumDims == RightNumDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
eigen_assert(0 <= m_axis && m_axis < NumDims);
const Dimensions& lhs_dims = m_leftImpl.dimensions();
const Dimensions& rhs_dims = m_rightImpl.dimensions();
{
int i = 0;
for (; i < m_axis; ++i) {
eigen_assert(lhs_dims[i] > 0);
eigen_assert(lhs_dims[i] == rhs_dims[i]);
m_dimensions[i] = lhs_dims[i];
}
eigen_assert(lhs_dims[i] > 0); // Now i == m_axis.
eigen_assert(rhs_dims[i] > 0);
m_dimensions[i] = lhs_dims[i] + rhs_dims[i];
for (++i; i < NumDims; ++i) {
eigen_assert(lhs_dims[i] > 0);
eigen_assert(lhs_dims[i] == rhs_dims[i]);
m_dimensions[i] = lhs_dims[i];
}
}
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
m_leftStrides[0] = 1;
m_rightStrides[0] = 1;
m_outputStrides[0] = 1;
for (int j = 1; j < NumDims; ++j) {
m_leftStrides[j] = m_leftStrides[j-1] * lhs_dims[j-1];
m_rightStrides[j] = m_rightStrides[j-1] * rhs_dims[j-1];
m_outputStrides[j] = m_outputStrides[j-1] * m_dimensions[j-1];
}
} else {
m_leftStrides[NumDims - 1] = 1;
m_rightStrides[NumDims - 1] = 1;
m_outputStrides[NumDims - 1] = 1;
for (int j = NumDims - 2; j >= 0; --j) {
m_leftStrides[j] = m_leftStrides[j+1] * lhs_dims[j+1];
m_rightStrides[j] = m_rightStrides[j+1] * rhs_dims[j+1];
m_outputStrides[j] = m_outputStrides[j+1] * m_dimensions[j+1];
}
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
// TODO(phli): Add short-circuit memcpy evaluation if underlying data are linear?
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType)
{
m_leftImpl.evalSubExprsIfNeeded(NULL);
m_rightImpl.evalSubExprsIfNeeded(NULL);
return true;
}
EIGEN_STRONG_INLINE void cleanup()
{
m_leftImpl.cleanup();
m_rightImpl.cleanup();
}
// TODO(phli): attempt to speed this up. The integer divisions and modulo are slow.
// See CL/76180724 comments for more ideas.
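// The flat output index is decomposed into per-dimension coordinates; the
// coordinate along m_axis decides whether the coefficient comes from the lhs
// or (after subtracting the lhs extent) from the rhs. Illustrative example
// (not from the original source): concatenating ColMajor 2x3 and 2x4 tensors
// along axis 1, output coordinate (r, c) with c >= 3 reads the rhs at (r, c - 3).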
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
{
// Collect dimension-wise indices (subs).
array<Index, NumDims> subs;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = NumDims - 1; i > 0; --i) {
subs[i] = index / m_outputStrides[i];
index -= subs[i] * m_outputStrides[i];
}
subs[0] = index;
} else {
for (int i = 0; i < NumDims - 1; ++i) {
subs[i] = index / m_outputStrides[i];
index -= subs[i] * m_outputStrides[i];
}
subs[NumDims - 1] = index;
}
const Dimensions& left_dims = m_leftImpl.dimensions();
if (subs[m_axis] < left_dims[m_axis]) {
Index left_index;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
left_index = subs[0];
EIGEN_UNROLL_LOOP
for (int i = 1; i < NumDims; ++i) {
left_index += (subs[i] % left_dims[i]) * m_leftStrides[i];
}
} else {
left_index = subs[NumDims - 1];
EIGEN_UNROLL_LOOP
for (int i = NumDims - 2; i >= 0; --i) {
left_index += (subs[i] % left_dims[i]) * m_leftStrides[i];
}
}
return m_leftImpl.coeff(left_index);
} else {
subs[m_axis] -= left_dims[m_axis];
const Dimensions& right_dims = m_rightImpl.dimensions();
Index right_index;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
right_index = subs[0];
EIGEN_UNROLL_LOOP
for (int i = 1; i < NumDims; ++i) {
right_index += (subs[i] % right_dims[i]) * m_rightStrides[i];
}
} else {
right_index = subs[NumDims - 1];
EIGEN_UNROLL_LOOP
for (int i = NumDims - 2; i >= 0; --i) {
right_index += (subs[i] % right_dims[i]) * m_rightStrides[i];
}
}
return m_rightImpl.coeff(right_index);
}
}
// TODO(phli): Add a real vectorization.
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
const int packetSize = PacketType<CoeffReturnType, Device>::size;
EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index + packetSize - 1 < dimensions().TotalSize());
EIGEN_ALIGN_MAX CoeffReturnType values[packetSize];
EIGEN_UNROLL_LOOP
for (int i = 0; i < packetSize; ++i) {
values[i] = coeff(index+i);
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
return rslt;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
costPerCoeff(bool vectorized) const {
const double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
2 * TensorOpCost::MulCost<Index>() +
TensorOpCost::DivCost<Index>() +
TensorOpCost::ModCost<Index>());
const double lhs_size = m_leftImpl.dimensions().TotalSize();
const double rhs_size = m_rightImpl.dimensions().TotalSize();
return (lhs_size / (lhs_size + rhs_size)) *
m_leftImpl.costPerCoeff(vectorized) +
(rhs_size / (lhs_size + rhs_size)) *
m_rightImpl.costPerCoeff(vectorized) +
TensorOpCost(0, 0, compute_cost);
}
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_leftImpl.bind(cgh);
m_rightImpl.bind(cgh);
}
#endif
protected:
Dimensions m_dimensions;
array<Index, NumDims> m_outputStrides;
array<Index, NumDims> m_leftStrides;
array<Index, NumDims> m_rightStrides;
TensorEvaluator<LeftArgType, Device> m_leftImpl;
TensorEvaluator<RightArgType, Device> m_rightImpl;
const Axis m_axis;
};
// Eval as lvalue
template<typename Axis, typename LeftArgType, typename RightArgType, typename Device>
struct TensorEvaluator<TensorConcatenationOp<Axis, LeftArgType, RightArgType>, Device>
: public TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgType>, Device>
{
typedef TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgType>, Device> Base;
typedef TensorConcatenationOp<Axis, LeftArgType, RightArgType> XprType;
typedef typename Base::Dimensions Dimensions;
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess &&
TensorEvaluator<RightArgType, Device>::PacketAccess,
BlockAccess = false,
PreferBlockAccess = TensorEvaluator<LeftArgType, Device>::PreferBlockAccess ||
TensorEvaluator<RightArgType, Device>::PreferBlockAccess,
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlock;
//===--------------------------------------------------------------------===//
EIGEN_STRONG_INLINE TensorEvaluator(XprType& op, const Device& device)
: Base(op, device)
{
EIGEN_STATIC_ASSERT((static_cast<int>(Layout) == static_cast<int>(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE);
}
typedef typename XprType::Index Index;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
{
// Collect dimension-wise indices (subs).
array<Index, Base::NumDims> subs;
for (int i = Base::NumDims - 1; i > 0; --i) {
subs[i] = index / this->m_outputStrides[i];
index -= subs[i] * this->m_outputStrides[i];
}
subs[0] = index;
const Dimensions& left_dims = this->m_leftImpl.dimensions();
if (subs[this->m_axis] < left_dims[this->m_axis]) {
Index left_index = subs[0];
for (int i = 1; i < Base::NumDims; ++i) {
left_index += (subs[i] % left_dims[i]) * this->m_leftStrides[i];
}
return this->m_leftImpl.coeffRef(left_index);
} else {
subs[this->m_axis] -= left_dims[this->m_axis];
const Dimensions& right_dims = this->m_rightImpl.dimensions();
Index right_index = subs[0];
for (int i = 1; i < Base::NumDims; ++i) {
right_index += (subs[i] % right_dims[i]) * this->m_rightStrides[i];
}
return this->m_rightImpl.coeffRef(right_index);
}
}
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
{
const int packetSize = PacketType<CoeffReturnType, Device>::size;
EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index + packetSize - 1 < this->dimensions().TotalSize());
EIGEN_ALIGN_MAX CoeffReturnType values[packetSize];
internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
for (int i = 0; i < packetSize; ++i) {
coeffRef(index+i) = values[i];
}
}
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H

File diff suppressed because it is too large


@@ -0,0 +1,73 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H
namespace Eigen {
namespace internal {
enum {
ShardByRow = 0,
ShardByCol = 1
};
// Default Blocking Strategy
template<typename ResScalar, typename LhsScalar, typename RhsScalar, typename StorageIndex, int ShardingType = ShardByCol>
class TensorContractionBlocking {
public:
/*
adding EIGEN_DEVICE_FUNC unconditionally to 'TensorContractionBlocking' constructor in `TensorContractionBlocking.h`
requires adding EIGEN_DEVICE_FUNC to `computeProductBlockingSizes` in `GeneralBlockPanelKernel.h`
which in turn, requires adding EIGEN_DEVICE_FUNC to `evaluateProductBlockingSizesHeuristic` in `GeneralBlockPanelKernel.h`
which in turn, requires adding EIGEN_DEVICE_FUNC to `manage_caching_sizes` in `GeneralBlockPanelKernel.h`
(else HIPCC will error out)
However adding EIGEN_DEVICE_FUNC to `manage_caching_sizes` in `GeneralBlockPanelKernel.h`
results in NVCC erroring out with the following error
../Eigen/src/Core/products/GeneralBlockPanelKernel.h(57): error #2901:
dynamic initialization is not supported for function-scope static variables within a __device__/__global__ function
*/
#if !defined(EIGEN_HIPCC)
EIGEN_DEVICE_FUNC
#endif
TensorContractionBlocking(StorageIndex k, StorageIndex m, StorageIndex n, StorageIndex num_threads = 1) :
kc_(k), mc_(m), nc_(n)
{
if (ShardingType == ShardByCol) {
computeProductBlockingSizes<LhsScalar, RhsScalar, 1>(kc_, mc_, nc_, num_threads);
}
else {
computeProductBlockingSizes<LhsScalar, RhsScalar, 1>(kc_, nc_, mc_, num_threads);
}
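// When the rhs packet is wide (> 8) and kc_ exceeds it, round kc_ down to a
// multiple of the packet size. Illustrative example (not from the original
// source): kc_ == 30 with a packet size of 16 becomes 16.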
const int rhs_packet_size = internal::packet_traits<RhsScalar>::size;
kc_ = (rhs_packet_size <= 8 || kc_ <= rhs_packet_size) ?
kc_ : (kc_ / rhs_packet_size) * rhs_packet_size;
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex kc() const { return kc_; }
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex mc() const { return mc_; }
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex nc() const { return nc_; }
private:
StorageIndex kc_;
StorageIndex mc_;
StorageIndex nc_;
};
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H


@@ -0,0 +1,6 @@
#if defined(__clang__) || defined(__GNUC__)
#warning "Deprecated header file, please either include the main Eigen/CXX11/Tensor header or the respective TensorContractionGpu.h file"
#endif
#include "TensorContractionGpu.h"

File diff suppressed because it is too large


@@ -0,0 +1,575 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H
namespace Eigen {
namespace internal {
enum {
Rhs = 0,
Lhs = 1
};
/*
* Implementation of the Eigen blas_data_mapper class for tensors.
*/
/// The make pointer class is used by SYCL in order to build the mapper class on the device. For other
/// platforms the default make pointer is used, which is scalar* for CoeffLoader.
template <typename Tensor, bool HasRawAccess, template <class> class MakePointer_ = MakePointer>
struct CoeffLoader;
template <typename Scalar, typename Index, int side, typename Tensor,
typename nocontract_t, typename contract_t, int packet_size,
bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment,
template <class> class MakePointer_ = MakePointer>
class BaseTensorContractionMapper;
template <typename Tensor, bool HasRawAccess, template <class> class MakePointer_>
struct CoeffLoader {
enum {
DirectOffsets = false
};
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffLoader(const Tensor& tensor) : m_tensor(tensor) { }
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index) {
eigen_assert(false && "unsupported");
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const typename MakePointer_<const typename Tensor::Scalar>::Type
data() const {
eigen_assert(false && "unsupported");
return NULL;
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return m_tensor.coeff(index); }
template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
typename Tensor::PacketReturnType packet(typename Tensor::Index index) const
{
return m_tensor.template packet<LoadMode>(index);
}
#ifdef EIGEN_USE_SYCL
// The placeholder accessors require to be bound to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_tensor.bind(cgh);
}
#endif
private:
const Tensor m_tensor;
};
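// Specialization for tensors with raw data access: coefficients and packets are
// read directly from the underlying buffer, which also makes direct offsetting
// of that buffer possible (DirectOffsets == true).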
template <typename Tensor, template <class> class MakePointer_>
struct CoeffLoader<Tensor, true, MakePointer_> {
enum {
DirectOffsets = true
};
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffLoader(const Tensor& tensor) : m_data(tensor.data()) {}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) {
m_data += offset;
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const typename MakePointer_<const typename Tensor::Scalar>::Type
data() const {
return m_data;
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return loadConstant(m_data+index); }
template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
typename Tensor::PacketReturnType packet(typename Tensor::Index index) const
{
return internal::ploadt_ro<typename Tensor::PacketReturnType, LoadMode>(m_data + index);
}
#ifdef EIGEN_USE_SYCL
// The placeholder accessors require to be bound to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_data.bind(cgh);
}
#endif
private:
typedef typename Tensor::Scalar Scalar;
typename MakePointer_<const Scalar>::Type m_data;
};
template<typename Scalar, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
int packet_size, bool inner_dim_contiguous, int Alignment, template <class> class MakePointer_ = MakePointer>
class SimpleTensorContractionMapper {
public:
EIGEN_DEVICE_FUNC
SimpleTensorContractionMapper(const Tensor& tensor,
const nocontract_t& nocontract_strides,
const nocontract_t& ij_strides,
const contract_t& contract_strides,
const contract_t& k_strides) :
m_tensor(tensor),
m_nocontract_strides(nocontract_strides),
m_ij_strides(ij_strides),
m_contract_strides(contract_strides),
m_k_strides(k_strides) { }
enum {
DirectOffsets = CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_>::DirectOffsets
};
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) {
m_tensor.offsetBuffer(offset);
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE void prefetch(Index /*i*/) { }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar operator()(Index row) const {
// column major assumption
return operator()(row, 0);
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar operator()(Index row, Index col) const {
return m_tensor.coeff(computeIndex(row, col));
}
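// computeIndex maps a (row, col) position of the logical contraction matrix back
// to a linear index in the underlying tensor: the non-contracting coordinate
// (row for the lhs side, col for the rhs side) is decomposed with m_ij_strides
// and scattered with m_nocontract_strides, and the contracting coordinate is
// decomposed with m_k_strides and scattered with m_contract_strides.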
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Index computeIndex(Index row, Index col) const {
const bool left = (side == Lhs);
EIGEN_UNUSED_VARIABLE(left); // annoying bug in g++8.1: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85963
Index nocontract_val = left ? row : col;
Index linidx = 0;
EIGEN_UNROLL_LOOP
for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
const Index idx = nocontract_val / m_ij_strides[i];
linidx += idx * m_nocontract_strides[i];
nocontract_val -= idx * m_ij_strides[i];
}
if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) {
if (side == Lhs && inner_dim_contiguous) {
eigen_assert(m_nocontract_strides[0] == 1);
linidx += nocontract_val;
} else {
linidx += nocontract_val * m_nocontract_strides[0];
}
}
Index contract_val = left ? col : row;
if(array_size<contract_t>::value > 0) {
EIGEN_UNROLL_LOOP
for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
const Index idx = contract_val / m_k_strides[i];
linidx += idx * m_contract_strides[i];
contract_val -= idx * m_k_strides[i];
}
if (side == Rhs && inner_dim_contiguous) {
eigen_assert(m_contract_strides[0] == 1);
linidx += contract_val;
} else {
linidx += contract_val * m_contract_strides[0];
}
}
return linidx;
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE IndexPair<Index> computeIndexPair(Index row, Index col, const Index distance) const {
const bool left = (side == Lhs);
EIGEN_UNUSED_VARIABLE(left); // annoying bug in g++8.1: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85963
Index nocontract_val[2] = {left ? row : col, left ? row + distance : col};
Index linidx[2] = {0, 0};
if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) {
EIGEN_UNROLL_LOOP
for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
const Index idx0 = nocontract_val[0] / m_ij_strides[i];
const Index idx1 = nocontract_val[1] / m_ij_strides[i];
linidx[0] += idx0 * m_nocontract_strides[i];
linidx[1] += idx1 * m_nocontract_strides[i];
nocontract_val[0] -= idx0 * m_ij_strides[i];
nocontract_val[1] -= idx1 * m_ij_strides[i];
}
if (side == Lhs && inner_dim_contiguous) {
eigen_assert(m_nocontract_strides[0] == 1);
linidx[0] += nocontract_val[0];
linidx[1] += nocontract_val[1];
} else {
linidx[0] += nocontract_val[0] * m_nocontract_strides[0];
linidx[1] += nocontract_val[1] * m_nocontract_strides[0];
}
}
Index contract_val[2] = {left ? col : row, left ? col : row + distance};
if (array_size<contract_t>::value> 0) {
EIGEN_UNROLL_LOOP
for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
const Index idx0 = contract_val[0] / m_k_strides[i];
const Index idx1 = contract_val[1] / m_k_strides[i];
linidx[0] += idx0 * m_contract_strides[i];
linidx[1] += idx1 * m_contract_strides[i];
contract_val[0] -= idx0 * m_k_strides[i];
contract_val[1] -= idx1 * m_k_strides[i];
}
if (side == Rhs && inner_dim_contiguous) {
eigen_assert(m_contract_strides[0] == 1);
linidx[0] += contract_val[0];
linidx[1] += contract_val[1];
} else {
linidx[0] += contract_val[0] * m_contract_strides[0];
linidx[1] += contract_val[1] * m_contract_strides[0];
}
}
return IndexPair<Index>(linidx[0], linidx[1]);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index firstAligned(Index size) const {
// Only claim alignment when we can compute the actual stride (i.e. when we're
// dealing with the lhs and inner_dim_contiguous). This is because the
// matrix-vector product relies on the stride when dealing with aligned inputs.
return (Alignment == Aligned) && (side == Lhs) && inner_dim_contiguous ? 0 : size;
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index stride() const {
return ((side == Lhs) && inner_dim_contiguous && array_size<contract_t>::value > 0) ? m_contract_strides[0] : 1;
}
#ifdef EIGEN_USE_SYCL
// The placeholder accessors require to be bound to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_tensor.bind(cgh);
}
#endif
const CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_>& tensor() const {
return m_tensor;
}
const nocontract_t& nocontract_strides() const {
return m_nocontract_strides;
}
const nocontract_t& ij_strides() const { return m_ij_strides; }
const contract_t& contract_strides() const { return m_contract_strides; }
const contract_t& k_strides() const { return m_k_strides; }
protected:
CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_> m_tensor;
const nocontract_t m_nocontract_strides;
const nocontract_t m_ij_strides;
const contract_t m_contract_strides;
const contract_t m_k_strides;
};
template<typename Scalar, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
int packet_size, bool inner_dim_contiguous,
bool inner_dim_reordered, int Alignment, template <class> class MakePointer_>
class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment, MakePointer_>
{
public:
typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment, MakePointer_> ParentMapper;
EIGEN_DEVICE_FUNC
BaseTensorContractionMapper(const Tensor& tensor,
const nocontract_t& nocontract_strides,
const nocontract_t& ij_strides,
const contract_t& contract_strides,
const contract_t& k_strides) :
ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
template <typename PacketT,int AlignmentType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
typename internal::enable_if<internal::unpacket_traits<PacketT>::size==packet_size,PacketT>::type
load(Index i, Index j) const
{
// whole method makes column major assumption
// don't need to add offsets for now (because operator handles that)
// current code assumes packet size must be a multiple of 2
EIGEN_STATIC_ASSERT(packet_size % 2 == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
if (Tensor::PacketAccess && inner_dim_contiguous && !inner_dim_reordered) {
const Index index = this->computeIndex(i, j);
eigen_assert(this->computeIndex(i+packet_size-1, j) == index + packet_size-1);
return this->m_tensor.template packet<AlignmentType>(index);
}
const IndexPair<Index> indexPair = this->computeIndexPair(i, j, packet_size - 1);
const Index first = indexPair.first;
const Index lastIdx = indexPair.second;
// We can always do optimized packet reads from left hand side right now, because
// the vertical matrix dimension on the left hand side is never contracting.
// On the right hand side we need to check if the contracting dimensions may have
// been shuffled first.
if (Tensor::PacketAccess &&
(side == Lhs || internal::array_size<contract_t>::value <= 1 || !inner_dim_reordered) &&
(lastIdx - first) == (packet_size - 1)) {
return this->m_tensor.template packet<AlignmentType>(first);
}
EIGEN_ALIGN_MAX Scalar data[packet_size];
data[0] = this->m_tensor.coeff(first);
EIGEN_UNROLL_LOOP
for (Index k = 1; k < packet_size - 1; k += 2) {
const IndexPair<Index> internal_pair = this->computeIndexPair(i + k, j, 1);
data[k] = this->m_tensor.coeff(internal_pair.first);
data[k + 1] = this->m_tensor.coeff(internal_pair.second);
}
data[packet_size - 1] = this->m_tensor.coeff(lastIdx);
return pload<PacketT>(data);
}
template <typename PacketT,int AlignmentType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
typename internal::enable_if<internal::unpacket_traits<PacketT>::size!=packet_size,PacketT>::type
load(Index i, Index j) const
{
const Index requested_packet_size = internal::unpacket_traits<PacketT>::size;
EIGEN_ALIGN_MAX Scalar data[requested_packet_size];
const IndexPair<Index> indexPair = this->computeIndexPair(i, j, requested_packet_size - 1);
const Index first = indexPair.first;
const Index lastIdx = indexPair.second;
data[0] = this->m_tensor.coeff(first);
for (Index k = 1; k < requested_packet_size - 1; k += 2) {
const IndexPair<Index> internal_pair = this->computeIndexPair(i + k, j, 1);
data[k] = this->m_tensor.coeff(internal_pair.first);
data[k + 1] = this->m_tensor.coeff(internal_pair.second);
}
data[requested_packet_size - 1] = this->m_tensor.coeff(lastIdx);
return pload<PacketT>(data);
}
template <typename PacketT,int AlignmentType>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE PacketT loadPacket(Index i, Index j) const {
return this->load<PacketT,AlignmentType>(i,j);
}
};
template<typename Scalar, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
bool inner_dim_contiguous,
bool inner_dim_reordered, int Alignment, template <class> class MakePointer_>
class BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_>
: public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment, MakePointer_>
{
public:
typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment, MakePointer_> ParentMapper;
EIGEN_DEVICE_FUNC
BaseTensorContractionMapper(const Tensor& tensor,
const nocontract_t& nocontract_strides,
const nocontract_t& ij_strides,
const contract_t& contract_strides,
const contract_t& k_strides) :
ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
template <typename PacketT,int> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE PacketT loadPacket(Index i, Index j) const {
EIGEN_ALIGN_MAX Scalar data[1];
data[0] = this->m_tensor.coeff(this->computeIndex(i, j));
return pload<PacketT>(data);
}
template <typename PacketT,int> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE PacketT load(Index i, Index j) const {
EIGEN_ALIGN_MAX Scalar data[1];
data[0] = this->m_tensor.coeff(this->computeIndex(i, j));
return pload<PacketT>(data);
}
};
template<typename Scalar, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
int packet_size,
bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, template <class> class MakePointer_=MakePointer>
class TensorContractionSubMapper {
public:
typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> ParentMapper;
typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> Self;
typedef Self LinearMapper;
enum {
    // We can use direct offsets iff the parent mapper supports them and we can compute the strides.
// TODO: we should also enable direct offsets for the Rhs case.
UseDirectOffsets = ParentMapper::DirectOffsets && (side == Lhs) && inner_dim_contiguous && (array_size<contract_t>::value > 0)
};
EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset)
: m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) {
// Bake the offsets into the buffer used by the base mapper whenever possible. This avoids the need to recompute
// this offset every time we attempt to access a coefficient.
if (UseDirectOffsets) {
Index stride = m_base_mapper.stride();
m_base_mapper.offsetBuffer(vert_offset + horiz_offset * stride);
}
}
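  // Note: when UseDirectOffsets is true, the base mapper's buffer has just been
  // advanced by vert_offset + horiz_offset * stride, so the accessors below can
  // address this sub-block as if it started at (0, 0) and skip the per-access
  // offset arithmetic.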
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const {
if (UseDirectOffsets) {
return m_base_mapper(i, 0);
}
return m_base_mapper(i + m_vert_offset, m_horiz_offset);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const {
if (UseDirectOffsets) {
return m_base_mapper(i, j);
}
return m_base_mapper(i + m_vert_offset, j + m_horiz_offset);
}
template <typename PacketT>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i) const {
if (UseDirectOffsets) {
return m_base_mapper.template loadPacket<PacketT,Alignment>(i, 0);
}
return m_base_mapper.template loadPacket<PacketT,Alignment>(i + m_vert_offset, m_horiz_offset);
}
template <typename PacketT>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i, Index j) const {
if (UseDirectOffsets) {
return m_base_mapper.template loadPacket<PacketT,Alignment>(i, j);
}
return m_base_mapper.template loadPacket<PacketT,Alignment>(i + m_vert_offset, j + m_horiz_offset);
}
template <typename PacketT, int AlignmentType>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i, Index j) const {
if (UseDirectOffsets) {
return m_base_mapper.template load<PacketT,AlignmentType>(i, j);
}
return m_base_mapper.template loadPacket<PacketT,AlignmentType>(i + m_vert_offset, j + m_horiz_offset);
}
template <typename PacketT>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketT& p) const {
if (UseDirectOffsets) {
      m_base_mapper.storePacket(i, 0, p);
      return;
    }
m_base_mapper.storePacket(i + m_vert_offset, m_horiz_offset, p);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
if (UseDirectOffsets) {
return LinearMapper(m_base_mapper, i, j);
}
return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset);
}
template <typename PacketT, int AlignmentType>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) const {
EIGEN_STATIC_ASSERT((internal::is_same<PacketT, PacketT>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
const int ActualAlignment = (AlignmentType == Aligned) && (Alignment == Aligned) ? Aligned : Unaligned;
if (UseDirectOffsets) {
return m_base_mapper.template loadPacket<PacketT,ActualAlignment>(i, 0);
}
return m_base_mapper.template loadPacket<PacketT,ActualAlignment>(i + m_vert_offset, m_horiz_offset);
}
template <typename PacketT>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool aligned(Index) const {
return false;
}
#ifdef EIGEN_USE_SYCL
  // The placeholder accessors must be bound to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_base_mapper.bind(cgh);
}
#endif
const ParentMapper& base_mapper() const { return m_base_mapper; }
Index vert_offset() const { return m_vert_offset; }
Index horiz_offset() const { return m_horiz_offset; }
private:
ParentMapper m_base_mapper;
const Index m_vert_offset;
const Index m_horiz_offset;
};
template<typename Scalar_, typename Index, int side,
typename Tensor,
typename nocontract_t, typename contract_t,
int packet_size,
bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, template <class> class MakePointer_=MakePointer>
class TensorContractionInputMapper
: public BaseTensorContractionMapper<Scalar_, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> {
public:
typedef Scalar_ Scalar;
typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> Base;
typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> SubMapper;
typedef SubMapper VectorMapper;
EIGEN_DEVICE_FUNC TensorContractionInputMapper(const Tensor& tensor,
const nocontract_t& nocontract_strides,
const nocontract_t& ij_strides,
const contract_t& contract_strides,
const contract_t& k_strides)
: Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const {
return SubMapper(*this, i, j);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const {
return VectorMapper(*this, i, j);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_>& get_tensor() const {
return Base::m_tensor;
}
};
template <typename T> struct TensorContractionInputMapperTrait;
template<typename Scalar_, typename Index_, int side_,
typename Tensor_,
typename nocontract_t_, typename contract_t_,
int packet_size_,
bool inner_dim_contiguous_, bool inner_dim_reordered_, int Alignment_, template <class> class MakePointer_>
struct TensorContractionInputMapperTrait<TensorContractionInputMapper<Scalar_, Index_, side_, Tensor_,
nocontract_t_, contract_t_, packet_size_, inner_dim_contiguous_,
inner_dim_reordered_, Alignment_, MakePointer_> > {
typedef Tensor_ XprType;
static const bool inner_dim_contiguous = inner_dim_contiguous_;
static const bool inner_dim_reordered = inner_dim_reordered_;
};
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,456 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H
namespace Eigen {
/** \class TensorConversionOp
* \ingroup CXX11_Tensor_Module
*
  * \brief Tensor conversion class. This class makes it possible to vectorize
  * type casting operations when the number of scalars per packet differs
  * between the source and the destination type.
*/
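/** Illustrative sketch (not part of the upstream sources): a TensorConversionOp is
  * normally created through TensorBase::cast<NewType>(), e.g.
  * \code
  * Eigen::Tensor<float, 2> f(2, 3);
  * f.setRandom();
  * Eigen::Tensor<double, 2> d = f.cast<double>();  // evaluates a TensorConversionOp<double, ...>
  * \endcode
  */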
namespace internal {
template<typename TargetType, typename XprType>
struct traits<TensorConversionOp<TargetType, XprType> >
{
// Type promotion to handle the case where the types of the lhs and the rhs are different.
typedef TargetType Scalar;
typedef typename traits<XprType>::StorageKind StorageKind;
typedef typename traits<XprType>::Index Index;
typedef typename XprType::Nested Nested;
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = traits<XprType>::NumDimensions;
static const int Layout = traits<XprType>::Layout;
enum { Flags = 0 };
typedef typename TypeConversion<Scalar, typename traits<XprType>::PointerType>::type PointerType;
};
template<typename TargetType, typename XprType>
struct eval<TensorConversionOp<TargetType, XprType>, Eigen::Dense>
{
typedef const TensorConversionOp<TargetType, XprType>& type;
};
template<typename TargetType, typename XprType>
struct nested<TensorConversionOp<TargetType, XprType>, 1, typename eval<TensorConversionOp<TargetType, XprType> >::type>
{
typedef TensorConversionOp<TargetType, XprType> type;
};
} // end namespace internal
template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket, int SrcCoeffRatio, int TgtCoeffRatio>
struct PacketConverter;
template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket>
struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 1, 1> {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
PacketConverter(const TensorEvaluator& impl)
: m_impl(impl) {}
template<int LoadMode, typename Index>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const {
return internal::pcast<SrcPacket, TgtPacket>(m_impl.template packet<LoadMode>(index));
}
private:
const TensorEvaluator& m_impl;
};
template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket>
struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 2, 1> {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
PacketConverter(const TensorEvaluator& impl)
: m_impl(impl) {}
template<int LoadMode, typename Index>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const {
const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
SrcPacket src1 = m_impl.template packet<LoadMode>(index);
SrcPacket src2 = m_impl.template packet<LoadMode>(index + SrcPacketSize);
TgtPacket result = internal::pcast<SrcPacket, TgtPacket>(src1, src2);
return result;
}
private:
const TensorEvaluator& m_impl;
};
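// Illustrative note: the SrcCoeffRatio == 2 specialization above covers conversions
// where two source packets feed one target packet; for example, with 128-bit SSE
// packets a double -> float cast combines two Packet2d loads into one Packet4f via pcast.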
template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket>
struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 4, 1> {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
PacketConverter(const TensorEvaluator& impl)
: m_impl(impl) {}
template<int LoadMode, typename Index>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const {
const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
SrcPacket src1 = m_impl.template packet<LoadMode>(index);
SrcPacket src2 = m_impl.template packet<LoadMode>(index + SrcPacketSize);
SrcPacket src3 = m_impl.template packet<LoadMode>(index + 2 * SrcPacketSize);
SrcPacket src4 = m_impl.template packet<LoadMode>(index + 3 * SrcPacketSize);
TgtPacket result = internal::pcast<SrcPacket, TgtPacket>(src1, src2, src3, src4);
return result;
}
private:
const TensorEvaluator& m_impl;
};
template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket>
struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 8, 1> {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
PacketConverter(const TensorEvaluator& impl)
: m_impl(impl) {}
template<int LoadMode, typename Index>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const {
const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
SrcPacket src1 = m_impl.template packet<LoadMode>(index);
SrcPacket src2 = m_impl.template packet<LoadMode>(index + 1 * SrcPacketSize);
SrcPacket src3 = m_impl.template packet<LoadMode>(index + 2 * SrcPacketSize);
SrcPacket src4 = m_impl.template packet<LoadMode>(index + 3 * SrcPacketSize);
SrcPacket src5 = m_impl.template packet<LoadMode>(index + 4 * SrcPacketSize);
SrcPacket src6 = m_impl.template packet<LoadMode>(index + 5 * SrcPacketSize);
SrcPacket src7 = m_impl.template packet<LoadMode>(index + 6 * SrcPacketSize);
SrcPacket src8 = m_impl.template packet<LoadMode>(index + 7 * SrcPacketSize);
TgtPacket result = internal::pcast<SrcPacket, TgtPacket>(src1, src2, src3, src4, src5, src6, src7, src8);
return result;
}
private:
const TensorEvaluator& m_impl;
};
template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket, int TgtCoeffRatio>
struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 1, TgtCoeffRatio> {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
PacketConverter(const TensorEvaluator& impl)
: m_impl(impl), m_maxIndex(impl.dimensions().TotalSize()) {}
template<int LoadMode, typename Index>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const {
const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
// Only call m_impl.packet() when we have direct access to the underlying data. This
// ensures that we don't compute the subexpression twice. We may however load some
// coefficients twice, but in practice this doesn't negatively impact performance.
if (m_impl.data() && (index + SrcPacketSize < m_maxIndex)) {
// Force unaligned memory loads since we can't ensure alignment anymore
return internal::pcast<SrcPacket, TgtPacket>(m_impl.template packet<Unaligned>(index));
} else {
const int TgtPacketSize = internal::unpacket_traits<TgtPacket>::size;
typedef typename internal::unpacket_traits<SrcPacket>::type SrcType;
typedef typename internal::unpacket_traits<TgtPacket>::type TgtType;
internal::scalar_cast_op<SrcType, TgtType> converter;
EIGEN_ALIGN_MAX typename internal::unpacket_traits<TgtPacket>::type values[TgtPacketSize];
EIGEN_UNROLL_LOOP
for (int i = 0; i < TgtPacketSize; ++i) {
values[i] = converter(m_impl.coeff(index+i));
}
TgtPacket rslt = internal::pload<TgtPacket>(values);
return rslt;
}
}
private:
const TensorEvaluator& m_impl;
const typename TensorEvaluator::Index m_maxIndex;
};
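// Illustrative note: the TgtCoeffRatio > 1 specialization above covers conversions such
// as float -> double with 128-bit SSE packets (SrcCoeffRatio == 1, TgtCoeffRatio == 2),
// where each Packet2d result consumes only part of a Packet4f load. Consecutive target
// packets therefore re-read overlapping source data, which is why the fast path is
// bounded by m_maxIndex and the code falls back to a scalar gather near the end of the tensor.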
template<typename TargetType, typename XprType>
class TensorConversionOp : public TensorBase<TensorConversionOp<TargetType, XprType>, ReadOnlyAccessors>
{
public:
typedef typename internal::traits<TensorConversionOp>::Scalar Scalar;
typedef typename internal::traits<TensorConversionOp>::StorageKind StorageKind;
typedef typename internal::traits<TensorConversionOp>::Index Index;
typedef typename internal::nested<TensorConversionOp>::type Nested;
typedef Scalar CoeffReturnType;
typedef typename NumTraits<Scalar>::Real RealScalar;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConversionOp(const XprType& xpr)
: m_xpr(xpr) {}
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename XprType::Nested>::type&
expression() const { return m_xpr; }
protected:
typename XprType::Nested m_xpr;
};
template <bool SameType, typename Eval, typename EvalPointerType> struct ConversionSubExprEval {
static EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType) {
impl.evalSubExprsIfNeeded(NULL);
return true;
}
};
template <typename Eval, typename EvalPointerType> struct ConversionSubExprEval<true, Eval, EvalPointerType> {
static EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType data) {
return impl.evalSubExprsIfNeeded(data);
}
};
#ifdef EIGEN_USE_THREADS
template <bool SameType, typename Eval, typename EvalPointerType,
typename EvalSubExprsCallback>
struct ConversionSubExprEvalAsync {
static EIGEN_STRONG_INLINE void run(Eval& impl, EvalPointerType, EvalSubExprsCallback done) {
impl.evalSubExprsIfNeededAsync(nullptr, std::move(done));
}
};
template <typename Eval, typename EvalPointerType,
typename EvalSubExprsCallback>
struct ConversionSubExprEvalAsync<true, Eval, EvalPointerType,
EvalSubExprsCallback> {
static EIGEN_STRONG_INLINE void run(Eval& impl, EvalPointerType data, EvalSubExprsCallback done) {
impl.evalSubExprsIfNeededAsync(data, std::move(done));
}
};
#endif
namespace internal {
template <typename SrcType, typename TargetType, bool IsSameT>
struct CoeffConv {
template <typename ArgType, typename Device>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetType run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
internal::scalar_cast_op<SrcType, TargetType> converter;
return converter(impl.coeff(index));
}
};
template <typename SrcType, typename TargetType>
struct CoeffConv<SrcType, TargetType, true> {
template <typename ArgType, typename Device>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetType run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
return impl.coeff(index);
}
};
template <typename SrcPacket, typename TargetPacket, int LoadMode, bool ActuallyVectorize, bool IsSameT>
struct PacketConv {
typedef typename internal::unpacket_traits<SrcPacket>::type SrcType;
typedef typename internal::unpacket_traits<TargetPacket>::type TargetType;
static const int PacketSize = internal::unpacket_traits<TargetPacket>::size;
template <typename ArgType, typename Device>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
internal::scalar_cast_op<SrcType, TargetType> converter;
EIGEN_ALIGN_MAX typename internal::remove_const<TargetType>::type values[PacketSize];
EIGEN_UNROLL_LOOP
for (int i = 0; i < PacketSize; ++i) {
values[i] = converter(impl.coeff(index+i));
}
TargetPacket rslt = internal::pload<TargetPacket>(values);
return rslt;
}
};
template <typename SrcPacket, typename TargetPacket, int LoadMode, bool IsSameT>
struct PacketConv<SrcPacket, TargetPacket, LoadMode, true, IsSameT> {
typedef typename internal::unpacket_traits<SrcPacket>::type SrcType;
typedef typename internal::unpacket_traits<TargetPacket>::type TargetType;
template <typename ArgType, typename Device>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
const int SrcCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio;
const int TgtCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio;
PacketConverter<TensorEvaluator<ArgType, Device>, SrcPacket, TargetPacket,
SrcCoeffRatio, TgtCoeffRatio> converter(impl);
return converter.template packet<LoadMode>(index);
}
};
template <typename SrcPacket, typename TargetPacket, int LoadMode>
struct PacketConv<SrcPacket, TargetPacket, LoadMode, /*ActuallyVectorize=*/false, /*IsSameT=*/true> {
typedef typename internal::unpacket_traits<TargetPacket>::type TargetType;
static const int PacketSize = internal::unpacket_traits<TargetPacket>::size;
template <typename ArgType, typename Device>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
EIGEN_ALIGN_MAX typename internal::remove_const<TargetType>::type values[PacketSize];
for (int i = 0; i < PacketSize; ++i) values[i] = impl.coeff(index+i);
return internal::pload<TargetPacket>(values);
}
};
template <typename SrcPacket, typename TargetPacket, int LoadMode>
struct PacketConv<SrcPacket, TargetPacket, LoadMode, /*ActuallyVectorize=*/true, /*IsSameT=*/true> {
template <typename ArgType, typename Device>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
return impl.template packet<LoadMode>(index);
}
};
} // namespace internal
// Eval as rvalue
template<typename TargetType, typename ArgType, typename Device>
struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
{
typedef TensorConversionOp<TargetType, ArgType> XprType;
typedef typename XprType::Index Index;
typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
typedef TargetType Scalar;
typedef TargetType CoeffReturnType;
typedef typename internal::remove_all<typename internal::traits<ArgType>::Scalar>::type SrcType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
typedef typename PacketType<SrcType, Device>::type PacketSourceType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
static const bool IsSameType = internal::is_same<TargetType, SrcType>::value;
typedef StorageMemory<CoeffReturnType, Device> Storage;
typedef typename Storage::Type EvaluatorPointerType;
enum {
IsAligned = false,
PacketAccess =
#ifndef EIGEN_USE_SYCL
true,
#else
TensorEvaluator<ArgType, Device>::PacketAccess &
internal::type_casting_traits<SrcType, TargetType>::VectorizedCast,
#endif
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = false
};
static const int NumDims = internal::array_size<Dimensions>::value;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
ArgTensorBlock;
struct TensorConversionOpBlockFactory {
template <typename ArgXprType>
struct XprType {
typedef TensorConversionOp<TargetType, const ArgXprType> type;
};
template <typename ArgXprType>
typename XprType<ArgXprType>::type expr(const ArgXprType& expr) const {
return typename XprType<ArgXprType>::type(expr);
}
};
typedef internal::TensorUnaryExprBlock<TensorConversionOpBlockFactory,
ArgTensorBlock>
TensorBlock;
//===--------------------------------------------------------------------===//
EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device)
{
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); }
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data)
{
return ConversionSubExprEval<IsSameType, TensorEvaluator<ArgType, Device>, EvaluatorPointerType>::run(m_impl, data);
}
#ifdef EIGEN_USE_THREADS
template <typename EvalSubExprsCallback>
EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
EvaluatorPointerType data, EvalSubExprsCallback done) {
ConversionSubExprEvalAsync<IsSameType, TensorEvaluator<ArgType, Device>,
EvaluatorPointerType,
EvalSubExprsCallback>::run(m_impl, data, std::move(done));
}
#endif
EIGEN_STRONG_INLINE void cleanup()
{
m_impl.cleanup();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
{
return internal::CoeffConv<SrcType, TargetType, IsSameType>::run(m_impl,index);
}
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType
packet(Index index) const {
// If we are not going to do the cast, we just need to check that base
// TensorEvaluator has packet access. Otherwise we also need to make sure,
// that we have an implementation of vectorized cast.
const bool Vectorizable =
IsSameType
? TensorEvaluator<ArgType, Device>::PacketAccess
: int(TensorEvaluator<ArgType, Device>::PacketAccess) &
int(internal::type_casting_traits<SrcType, TargetType>::VectorizedCast);
return internal::PacketConv<PacketSourceType, PacketReturnType, LoadMode,
Vectorizable, IsSameType>::run(m_impl, index);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
costPerCoeff(bool vectorized) const {
const double cast_cost = TensorOpCost::CastCost<SrcType, TargetType>();
if (vectorized) {
const double SrcCoeffRatio =
internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio;
const double TgtCoeffRatio =
internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio;
return m_impl.costPerCoeff(vectorized) * (SrcCoeffRatio / PacketSize) +
TensorOpCost(0, 0, TgtCoeffRatio * (cast_cost / PacketSize));
} else {
return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, cast_cost);
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
internal::TensorBlockResourceRequirements getResourceRequirements() const {
return m_impl.getResourceRequirements();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
bool /*root_of_expr_ast*/ = false) const {
return TensorBlock(m_impl.block(desc, scratch),
TensorConversionOpBlockFactory());
}
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
/// required by sycl in order to extract the sycl accessor
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif
protected:
TensorEvaluator<ArgType, Device> m_impl;
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H

File diff suppressed because it is too large

View File

@@ -0,0 +1,544 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
namespace Eigen {
/** \class TensorConvolution
* \ingroup CXX11_Tensor_Module
*
* \brief Tensor convolution class.
*
*
*/
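/** Illustrative sketch (not part of the upstream sources): the evaluator below is
  * reached when a convolution expression is assigned on a SYCL device, e.g.
  * \code
  * Eigen::array<ptrdiff_t, 1> dims{{0}};                       // convolve along dimension 0
  * result.device(sycl_device) = input.convolve(kernel, dims);  // sycl_device is an Eigen::SyclDevice
  * \endcode
  */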
enum class convolution_type { CONV1D, CONV2D, CONV3D };
template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
typename Kernel_accessor, typename Buffer_accessor, convolution_type Conv_Dim>
struct EigenConvolutionKernel;
template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
typename Kernel_accessor, typename Buffer_accessor>
struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
Buffer_accessor, convolution_type::CONV1D> {
typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
Local_accessor;
Local_accessor local_acc;
Evaluator device_evaluator;
Kernel_accessor kernel_filter;
Buffer_accessor buffer_acc;
internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper;
const size_t kernelSize;
const cl::sycl::range<2> input_range;
EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
Buffer_accessor buffer_acc_,
internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper_,
const size_t kernelSize_, const cl::sycl::range<2> input_range_)
: local_acc(local_acc_),
device_evaluator(device_evaluator_),
kernel_filter(kernel_filter_),
buffer_acc(buffer_acc_),
indexMapper(indexMapper_),
kernelSize(kernelSize_),
input_range(input_range_) {}
template <typename BooleanDim2>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) {
return (boolean_check[0] && boolean_check[1]);
}
void operator()(cl::sycl::nd_item<2> itemID) {
auto buffer_ptr = buffer_acc.get_pointer();
auto kernel_ptr = kernel_filter.get_pointer();
    // the number of input coefficients required per plane in shared memory
const size_t num_input = (itemID.get_local_range()[0] + kernelSize - 1);
const size_t plane_kernel_offset = itemID.get_local_id(1) * num_input;
const size_t input_offset = itemID.get_group(0) * itemID.get_local_range()[0];
const size_t plane_tensor_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(1));
/// fill the shared memory
for (size_t i = itemID.get_local_id(0); i < num_input; i += itemID.get_local_range()[0]) {
const size_t local_index = i + plane_kernel_offset;
const size_t tensor_index =
plane_tensor_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + input_offset);
local_acc[local_index] =
(((i + input_offset) < (input_range[0] + kernelSize - 1)) && itemID.get_global_id(1) < input_range[1])
? device_evaluator.coeff(tensor_index)
: CoeffReturnType(0);
}
itemID.barrier(cl::sycl::access::fence_space::local_space);
    // calculate the convolution; output start index in x for this work-group
const size_t first_output_start = itemID.get_group(0) * (itemID.get_local_range()[0]);
if (boundary_check(itemID.get_global_id() < input_range)) {
CoeffReturnType result = static_cast<CoeffReturnType>(0);
const size_t index = plane_kernel_offset + itemID.get_local_id(0);
for (size_t k = 0; k < kernelSize; ++k) {
result += (local_acc[k + index] * kernel_ptr[k]);
}
const size_t tensor_index =
indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(1)) +
indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + first_output_start);
buffer_ptr[tensor_index] = result;
}
}
};
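// Sizing note: each work-group computes local_range[0] outputs per plane but stages
// local_range[0] + kernelSize - 1 inputs in local memory (a halo of kernelSize - 1
// coefficients). For example, a work-group of 64 threads with a kernel of size 5
// cooperatively loads 68 input coefficients and writes back 64 results.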
template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
typename Kernel_accessor, typename Buffer_accessor>
struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
Buffer_accessor, convolution_type::CONV2D> {
typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
Local_accessor;
Local_accessor local_acc;
Evaluator device_evaluator;
Kernel_accessor kernel_filter;
Buffer_accessor buffer_acc;
internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper;
const cl::sycl::range<2> kernel_size;
const cl::sycl::range<3> input_range;
EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
Buffer_accessor buffer_acc_,
internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper_,
const cl::sycl::range<2> kernel_size_, const cl::sycl::range<3> input_range_)
: local_acc(local_acc_),
device_evaluator(device_evaluator_),
kernel_filter(kernel_filter_),
buffer_acc(buffer_acc_),
indexMapper(indexMapper_),
kernel_size(kernel_size_),
input_range(input_range_) {}
template <typename BooleanDim3>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
}
void operator()(cl::sycl::nd_item<3> itemID) {
auto buffer_ptr = buffer_acc.get_pointer();
auto kernel_ptr = kernel_filter.get_pointer();
    // the number of input coefficients required per plane in shared memory
const auto num_input = cl::sycl::range<2>{
(cl::sycl::range<2>(itemID.get_local_range()[0], itemID.get_local_range()[1]) + kernel_size - 1)};
const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(2));
const size_t plane_kernel_offset = itemID.get_local_id(2) * num_input[1];
const auto input_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
itemID.get_group(1) * itemID.get_local_range()[1]};
// fill the local memory
bool in_range_dim2 = itemID.get_global_id(2) < input_range[2];
for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
const size_t local_input_offset = num_input[0] * (j + plane_kernel_offset);
bool in_range_dim1 = ((j + input_offset[1]) < (input_range[1] + kernel_size[1] - 1));
for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
const size_t local_index = i + local_input_offset;
const size_t tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
i + input_offset[0], j + input_offset[1]);
local_acc[local_index] = (((i + input_offset[0]) < (input_range[0] + kernel_size[0] - 1)) &&
in_range_dim1 && in_range_dim2)
? device_evaluator.coeff(tensor_index)
: CoeffReturnType(0);
}
}
itemID.barrier(cl::sycl::access::fence_space::local_space);
// output offset start for each thread
const auto output_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
itemID.get_group(1) * itemID.get_local_range()[1]};
if (boundary_check(itemID.get_global_id() < input_range)) {
CoeffReturnType result = static_cast<CoeffReturnType>(0);
for (size_t j = 0; j < kernel_size[1]; j++) {
size_t kernel_offset = kernel_size[0] * j;
const size_t index =
(num_input[0] * (plane_kernel_offset + j + itemID.get_local_id(1))) + itemID.get_local_id(0);
for (size_t i = 0; i < kernel_size[0]; i++) {
result += (local_acc[i + index] * kernel_ptr[i + kernel_offset]);
}
}
const size_t tensor_index =
indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(2)) +
indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + output_offset[0],
itemID.get_local_id(1) + output_offset[1]);
buffer_ptr[tensor_index] = result;
}
}
};
template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
typename Kernel_accessor, typename Buffer_accessor>
struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
Buffer_accessor, convolution_type::CONV3D> {
typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
Local_accessor;
Local_accessor local_acc;
Evaluator device_evaluator;
Kernel_accessor kernel_filter;
Buffer_accessor buffer_acc;
internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper;
const cl::sycl::range<3> kernel_size;
const cl::sycl::range<3> input_range;
const size_t numP;
EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
Buffer_accessor buffer_acc_,
internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper_,
const cl::sycl::range<3> kernel_size_, const cl::sycl::range<3> input_range_,
const size_t numP_)
: local_acc(local_acc_),
device_evaluator(device_evaluator_),
kernel_filter(kernel_filter_),
buffer_acc(buffer_acc_),
indexMapper(indexMapper_),
kernel_size(kernel_size_),
input_range(input_range_),
numP(numP_) {}
template <typename BooleanDim3>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
}
void operator()(cl::sycl::nd_item<3> itemID) {
auto buffer_ptr = buffer_acc.get_pointer();
auto kernel_ptr = kernel_filter.get_pointer();
const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1};
const auto input_offset = cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range()};
const auto output_offset =
cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range() + itemID.get_local_id()};
for (size_t p = 0; p < numP; p++) {
/// fill the shared memory
const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
for (size_t k = itemID.get_local_id(2); k < num_input[2]; k += itemID.get_local_range()[2]) {
size_t local_index_dim2 = num_input[0] * num_input[1] * k;
bool cond_k_dim = (k + input_offset[2] < (input_range[2] + kernel_size[2] - 1));
for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
bool cond_j_dim = cond_k_dim && (j + input_offset[1] < (input_range[1] + kernel_size[1] - 1));
size_t local_index_dim1 = (num_input[0] * j) + local_index_dim2;
for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
bool conds = cond_j_dim && (i + input_offset[0] < (input_range[0] + kernel_size[0] - 1));
const size_t local_index = local_index_dim1 + i;
const size_t tensor_index =
plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
i + input_offset[0], j + input_offset[1], k + input_offset[2]);
local_acc[local_index] = conds ? device_evaluator.coeff(tensor_index) : CoeffReturnType(0);
}
}
}
itemID.barrier(cl::sycl::access::fence_space::local_space);
// calculate the convolution
if (boundary_check(itemID.get_global_id() < input_range)) {
CoeffReturnType result = static_cast<CoeffReturnType>(0);
for (size_t k = 0; k < kernel_size[2]; k++) {
for (size_t j = 0; j < kernel_size[1]; j++) {
for (size_t i = 0; i < kernel_size[0]; i++) {
const size_t kernel_index = i + kernel_size[0] * (j + kernel_size[1] * k);
const size_t local_index =
((i + itemID.get_local_id(0)) +
num_input[0] * ((j + itemID.get_local_id(1)) + num_input[1] * (k + itemID.get_local_id(2))));
result += (local_acc[local_index] * kernel_ptr[kernel_index]);
}
}
}
const size_t tensor_index =
indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p) +
indexMapper.mapGpuOutputKernelToTensorOutputOffset(output_offset[0], output_offset[1], output_offset[2]);
buffer_ptr[tensor_index] = result;
}
itemID.barrier(cl::sycl::access::fence_space::local_space);
}
}
};
template <typename Indices, typename InputArgType, typename KernelArgType>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Eigen::SyclDevice> {
typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
static const int NumDims =
internal::array_size<typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions>::value;
static const int NumKernelDims = internal::array_size<Indices>::value;
typedef typename XprType::Index Index;
typedef DSizes<Index, NumDims> Dimensions;
typedef typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions KernelDimensions;
typedef const Eigen::SyclDevice Device;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Eigen::SyclDevice>::type PacketReturnType;
typedef typename InputArgType::Scalar Scalar;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
typedef StorageMemory<CoeffReturnType, Eigen::SyclDevice> Storage;
typedef typename Storage::Type EvaluatorPointerType;
typedef StorageMemory<const CoeffReturnType, Eigen::SyclDevice> KernelStorage;
enum {
IsAligned = TensorEvaluator<InputArgType, Eigen::SyclDevice>::IsAligned &
TensorEvaluator<KernelArgType, Eigen::SyclDevice>::IsAligned,
PacketAccess = false,
BlockAccess = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlock;
//===--------------------------------------------------------------------===//
TensorEvaluator(const XprType &op, const Eigen::SyclDevice &device)
: m_inputImpl(op.inputExpression(), device),
m_kernelArg(op.kernelExpression()),
m_kernelImpl(op.kernelExpression(), device),
m_indices(op.indices()),
m_buf(NULL),
m_kernel(NULL),
m_local_kernel(false),
m_device(device) {
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout) ==
static_cast<int>(TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Layout)),
YOU_MADE_A_PROGRAMMING_MISTAKE);
const typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions &input_dims = m_inputImpl.dimensions();
const typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions &kernel_dims =
m_kernelImpl.dimensions();
m_dimensions = m_inputImpl.dimensions();
for (int i = 0; i < NumKernelDims; ++i) {
const Index index = op.indices()[i];
const Index input_dim = input_dims[index];
const Index kernel_dim = kernel_dims[i];
const Index result_dim = input_dim - kernel_dim + 1;
m_dimensions[index] = result_dim;
}
}
EIGEN_DEVICE_FUNC const Dimensions &dimensions() const { return m_dimensions; }
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
preloadKernel();
m_inputImpl.evalSubExprsIfNeeded(NULL);
if (data) {
executeEval(data);
return false;
} else {
m_buf = (EvaluatorPointerType)m_device.get(
(Scalar *)m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar)));
executeEval(m_buf);
return true;
}
}
EIGEN_STRONG_INLINE void cleanup() {
m_inputImpl.cleanup();
if (m_buf) {
m_device.deallocate_temp(m_buf);
m_buf = NULL;
}
if (m_local_kernel) {
m_device.deallocate_temp(m_kernel);
m_local_kernel = false;
}
m_kernel = NULL;
}
/// used by sycl in order to build the sycl buffer
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device &device() const { return m_device; }
/// used by sycl in order to build the sycl buffer
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_buf; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
// Don't make a local copy of the kernel unless we have to (i.e. it's an
// expression that needs to be evaluated)
typename KernelStorage::Type in_place = m_kernelImpl.data();
if (in_place) {
m_kernel = in_place;
m_local_kernel = false;
} else {
ptrdiff_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
EvaluatorPointerType local = (EvaluatorPointerType)m_device.get((Scalar *)m_device.allocate_temp(kernel_sz));
typedef TensorEvalToOp<const KernelArgType> EvalTo;
EvalTo evalToTmp(m_device.get(local), m_kernelArg);
const bool PacketAccess = internal::IsVectorizable<Eigen::SyclDevice, KernelArgType>::value;
internal::TensorExecutor<const EvalTo, Eigen::SyclDevice, PacketAccess>::run(evalToTmp, m_device);
m_kernel = local;
m_local_kernel = true;
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(EvaluatorPointerType data) const {
typedef TensorEvaluator<InputArgType, Eigen::SyclDevice> InputEvaluator;
typedef typename InputEvaluator::Dimensions InputDims;
switch (NumKernelDims) {
case 1: {
const size_t numX = dimensions()[m_indices[0]];
const size_t numP = dimensions().TotalSize() / numX;
const auto input_dim = std::array<size_t, 2>{numX, numP};
auto global_range = cl::sycl::range<2>{};
auto local_range = cl::sycl::range<2>{};
const size_t kernel_size = m_kernelImpl.dimensions().TotalSize();
m_device.parallel_for_setup(input_dim, global_range, local_range);
const size_t local_memory_size = (local_range[0] + kernel_size - 1) * (local_range[1]);
gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
const array<Index, 1> indices{{m_indices[0]}};
const array<Index, 1> kernel_dims{{m_kernelImpl.dimensions()[0]}};
internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV1D>
ConvKernel;
m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
m_inputImpl, m_kernel, data, cl::sycl::nd_range<2>(global_range, local_range), local_memory_size,
indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1]));
break;
}
case 2: {
auto kernel_index = std::array<size_t, 2>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1,
static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0};
auto kernel_size = cl::sycl::range<2>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
(size_t)m_kernelImpl.dimensions()[kernel_index[1]]};
const size_t numX = dimensions()[m_indices[kernel_index[0]]];
const size_t numY = dimensions()[m_indices[kernel_index[1]]];
const size_t numP = dimensions().TotalSize() / (numX * numY);
auto input_dim = std::array<size_t, 3>{numX, numY, numP};
auto global_range = cl::sycl::range<3>{};
auto local_range = cl::sycl::range<3>{};
m_device.parallel_for_setup(input_dim, global_range, local_range);
const size_t local_memory_size =
(local_range[0] + kernel_size[0] - 1) * (local_range[1] + kernel_size[1] - 1) * local_range[2];
gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
const array<Index, 2> indices{{m_indices[kernel_index[0]], m_indices[kernel_index[1]]}};
const array<Index, 2> kernel_dims{
{m_kernelImpl.dimensions()[kernel_index[0]], m_kernelImpl.dimensions()[kernel_index[1]]}};
internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV2D>
ConvKernel;
m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
indexMapper, kernel_size, cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]});
break;
}
case 3: {
auto kernel_index = std::array<size_t, 3>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2,
static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1,
static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0};
auto kernel_size = cl::sycl::range<3>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
(size_t)m_kernelImpl.dimensions()[kernel_index[1]],
(size_t)m_kernelImpl.dimensions()[kernel_index[2]]};
const size_t numX = dimensions()[m_indices[kernel_index[0]]];
const size_t numY = dimensions()[m_indices[kernel_index[1]]];
const size_t numZ = dimensions()[m_indices[kernel_index[2]]];
auto input_dim = std::array<size_t, 3>{numX, numY, numZ};
const size_t numP = dimensions().TotalSize() / (numX * numY * numZ);
const array<Index, 3> indices{
{m_indices[kernel_index[0]], m_indices[kernel_index[1]], m_indices[kernel_index[2]]}};
const array<Index, 3> kernel_dims{{m_kernelImpl.dimensions()[kernel_index[0]],
m_kernelImpl.dimensions()[kernel_index[1]],
m_kernelImpl.dimensions()[kernel_index[2]]}};
internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
auto global_range = cl::sycl::range<3>{};
auto local_range = cl::sycl::range<3>{};
m_device.parallel_for_setup(input_dim, global_range, local_range);
auto local_memory_range = (local_range + kernel_size - 1);
const size_t local_memory_size = local_memory_range[0] * local_memory_range[1] * local_memory_range[2];
gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV3D>
ConvKernel;
m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP);
break;
}
default: {
EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3),
THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
}
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
eigen_assert(m_buf != NULL);
eigen_assert(index < m_dimensions.TotalSize());
return m_buf[index];
}
template <int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const {
eigen_assert(m_buf != NULL);
eigen_assert(index < m_dimensions.TotalSize());
return internal::ploadt<PacketReturnType, LoadMode>(m_buf + index);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
// TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
// model.
const double kernel_size = m_kernelImpl.dimensions().TotalSize();
// We ignore the use of fused multiply-add.
const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
const double firstIndex_compute_cost =
NumDims *
(2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
}
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_kernelImpl.bind(cgh);
m_inputImpl.bind(cgh);
m_buf.bind(cgh);
m_kernel.bind(cgh);
}
private:
// No assignment (copies are needed by the kernels)
TensorEvaluator &operator=(const TensorEvaluator &);
TensorEvaluator<InputArgType, Eigen::SyclDevice> m_inputImpl;
KernelArgType m_kernelArg;
TensorEvaluator<KernelArgType, Eigen::SyclDevice> m_kernelImpl;
Indices m_indices;
Dimensions m_dimensions;
EvaluatorPointerType m_buf;
typename KernelStorage::Type m_kernel;
bool m_local_kernel;
const Eigen::SyclDevice EIGEN_DEVICE_REF m_device;
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H

View File

@@ -0,0 +1,214 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Rasmus Munk Larsen <rmlarsen@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
#define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
namespace Eigen {
/** \class TensorCostModel
  * \ingroup CXX11_Tensor_Module
  *
  * \brief A cost model used to limit the number of threads used for evaluating
  * tensor expressions.
*
*/
// Class storing the cost of evaluating a tensor expression in terms of the
// estimated number of operand bytes loaded, bytes stored, and compute cycles.
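// Illustrative sketch (hypothetical values, not from the upstream sources): a
// per-coefficient cost that loads one float, stores one float and performs one
// multiply could be written as
//   TensorOpCost per_coeff(/*bytes_loaded=*/sizeof(float),
//                          /*bytes_stored=*/sizeof(float),
//                          /*compute_cycles=*/TensorOpCost::MulCost<float>());
//   TensorOpCost total = per_coeff * num_coeffs;  // costs scale and add component-wise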
class TensorOpCost {
public:
// TODO(rmlarsen): Fix the scalar op costs in Eigen proper. Even a simple
// model based on minimal reciprocal throughput numbers from Intel or
// Agner Fog's tables would be better than what is there now.
template <typename ArgType>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int MulCost() {
return internal::functor_traits<
internal::scalar_product_op<ArgType, ArgType> >::Cost;
}
template <typename ArgType>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int AddCost() {
return internal::functor_traits<internal::scalar_sum_op<ArgType> >::Cost;
}
template <typename ArgType>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int DivCost() {
return internal::functor_traits<
internal::scalar_quotient_op<ArgType, ArgType> >::Cost;
}
template <typename ArgType>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int ModCost() {
return internal::functor_traits<internal::scalar_mod_op<ArgType> >::Cost;
}
template <typename SrcType, typename TargetType>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int CastCost() {
return internal::functor_traits<
internal::scalar_cast_op<SrcType, TargetType> >::Cost;
}
EIGEN_DEVICE_FUNC
TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {}
EIGEN_DEVICE_FUNC
TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles)
: bytes_loaded_(bytes_loaded),
bytes_stored_(bytes_stored),
compute_cycles_(compute_cycles) {}
EIGEN_DEVICE_FUNC
TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles,
bool vectorized, double packet_size)
: bytes_loaded_(bytes_loaded),
bytes_stored_(bytes_stored),
compute_cycles_(vectorized ? compute_cycles / packet_size
: compute_cycles) {
eigen_assert(bytes_loaded >= 0 && (numext::isfinite)(bytes_loaded));
eigen_assert(bytes_stored >= 0 && (numext::isfinite)(bytes_stored));
eigen_assert(compute_cycles >= 0 && (numext::isfinite)(compute_cycles));
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const {
return bytes_loaded_;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_stored() const {
return bytes_stored_;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double compute_cycles() const {
return compute_cycles_;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_cost(
double load_cost, double store_cost, double compute_cost) const {
return load_cost * bytes_loaded_ + store_cost * bytes_stored_ +
compute_cost * compute_cycles_;
}
// Drop memory access component. Intended for cases when memory accesses are
// sequential or are completely masked by computations.
EIGEN_DEVICE_FUNC void dropMemoryCost() {
bytes_loaded_ = 0;
bytes_stored_ = 0;
}
// TODO(rmlarsen): Define min in terms of total cost, not elementwise.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin(
const TensorOpCost& rhs) const {
double bytes_loaded = numext::mini(bytes_loaded_, rhs.bytes_loaded());
double bytes_stored = numext::mini(bytes_stored_, rhs.bytes_stored());
double compute_cycles = numext::mini(compute_cycles_, rhs.compute_cycles());
return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
}
// TODO(rmlarsen): Define max in terms of total cost, not elementwise.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax(
const TensorOpCost& rhs) const {
double bytes_loaded = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
double bytes_stored = numext::maxi(bytes_stored_, rhs.bytes_stored());
double compute_cycles = numext::maxi(compute_cycles_, rhs.compute_cycles());
return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=(
const TensorOpCost& rhs) {
bytes_loaded_ += rhs.bytes_loaded();
bytes_stored_ += rhs.bytes_stored();
compute_cycles_ += rhs.compute_cycles();
return *this;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator*=(double rhs) {
bytes_loaded_ *= rhs;
bytes_stored_ *= rhs;
compute_cycles_ *= rhs;
return *this;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator+(
TensorOpCost lhs, const TensorOpCost& rhs) {
lhs += rhs;
return lhs;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(
TensorOpCost lhs, double rhs) {
lhs *= rhs;
return lhs;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(
double lhs, TensorOpCost rhs) {
rhs *= lhs;
return rhs;
}
friend std::ostream& operator<<(std::ostream& os, const TensorOpCost& tc) {
return os << "[bytes_loaded = " << tc.bytes_loaded()
<< ", bytes_stored = " << tc.bytes_stored()
<< ", compute_cycles = " << tc.compute_cycles() << "]";
}
private:
double bytes_loaded_;
double bytes_stored_;
double compute_cycles_;
};
// TODO(rmlarsen): Implement a policy that chooses an "optimal" number of threads
// in [1:max_threads] instead of just switching multi-threading off for small
// work units.
template <typename Device>
class TensorCostModel {
public:
// Scaling from Eigen compute cost to device cycles.
static const int kDeviceCyclesPerComputeCycle = 1;
// Costs in device cycles.
static const int kStartupCycles = 100000;
static const int kPerThreadCycles = 100000;
static const int kTaskSize = 40000;
// Returns the number of threads in [1:max_threads] to use for
// evaluating an expression with the given output size and cost per
// coefficient.
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads(
double output_size, const TensorOpCost& cost_per_coeff, int max_threads) {
double cost = totalCost(output_size, cost_per_coeff);
double threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
// Make sure we don't invoke undefined behavior when we convert to an int.
threads = numext::mini<double>(threads, GenericNumTraits<int>::highest());
return numext::mini(max_threads,
numext::maxi<int>(1, static_cast<int>(threads)));
}
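  // Worked example: if totalCost() evaluates to 1,050,000 device cycles, then
  // threads = (1,050,000 - 100,000) / 100,000 + 0.9 = 10.4, so at most 10 threads
  // are requested (still clamped to [1, max_threads] above).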
// taskSize assesses parallel task size.
// Value of 1.0 means ideal parallel task size. Values < 1.0 mean that task
// granularity needs to be increased to mitigate parallelization overheads.
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize(
double output_size, const TensorOpCost& cost_per_coeff) {
return totalCost(output_size, cost_per_coeff) / kTaskSize;
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost(
double output_size, const TensorOpCost& cost_per_coeff) {
// Cost of memory fetches from L2 cache. 64 is typical cache line size.
// 11 is L2 cache latency on Haswell.
// We don't know whether data is in L1, L2 or L3. But we are most interested
// in single-threaded computational time around 100us-10ms (smaller time
// is too small for parallelization, larger time is not interesting
// either because we are probably using all available threads already).
// And for the target time range, L2 seems to be what matters. Data set
// fitting into L1 is too small to take noticeable time. Data set fitting
// only into L3 presumably will take more than 10ms to load and process.
const double kLoadCycles = 1.0 / 64 * 11;
const double kStoreCycles = 1.0 / 64 * 11;
// Scaling from Eigen compute cost to device cycles.
return output_size *
cost_per_coeff.total_cost(kLoadCycles, kStoreCycles,
kDeviceCyclesPerComputeCycle);
}
};
} // namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
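As a concrete check of the heuristic above, a small hedged sketch; the include path assumes the usual unsupported/ layout, the sizes are made up, and DefaultDevice is used only to instantiate the template:

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  // Per-coefficient cost: load and store one float, one compute cycle.
  Eigen::TensorOpCost cost(sizeof(float), sizeof(float), 1);
  typedef Eigen::TensorCostModel<Eigen::DefaultDevice> CostModel;
  // Small output: total cost stays below kStartupCycles, so a single thread is used.
  std::cout << CostModel::numThreads(1000, cost, /*max_threads=*/8) << "\n";  // expected: 1
  // Large output: the formula asks for many threads, capped at max_threads.
  std::cout << CostModel::numThreads(1e8, cost, /*max_threads=*/8) << "\n";   // expected: 8
  // taskSize() near or above 1.0 indicates blocks large enough to amortize overheads.
  std::cout << CostModel::taskSize(1e8, cost) << "\n";
  return 0;
}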

View File

@@ -0,0 +1,347 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H
#define EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H
namespace Eigen {
/** \class TensorCustomUnaryOp
* \ingroup CXX11_Tensor_Module
*
  * \brief Expression of a user-defined unary operator applied to a tensor expression.
  *
*/
namespace internal {
template<typename CustomUnaryFunc, typename XprType>
struct traits<TensorCustomUnaryOp<CustomUnaryFunc, XprType> >
{
typedef typename XprType::Scalar Scalar;
typedef typename XprType::StorageKind StorageKind;
typedef typename XprType::Index Index;
typedef typename XprType::Nested Nested;
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = traits<XprType>::NumDimensions;
static const int Layout = traits<XprType>::Layout;
typedef typename traits<XprType>::PointerType PointerType;
};
template<typename CustomUnaryFunc, typename XprType>
struct eval<TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Eigen::Dense>
{
typedef const TensorCustomUnaryOp<CustomUnaryFunc, XprType> EIGEN_DEVICE_REF type;
};
template<typename CustomUnaryFunc, typename XprType>
struct nested<TensorCustomUnaryOp<CustomUnaryFunc, XprType> >
{
typedef TensorCustomUnaryOp<CustomUnaryFunc, XprType> type;
};
} // end namespace internal
template<typename CustomUnaryFunc, typename XprType>
class TensorCustomUnaryOp : public TensorBase<TensorCustomUnaryOp<CustomUnaryFunc, XprType>, ReadOnlyAccessors>
{
public:
typedef typename internal::traits<TensorCustomUnaryOp>::Scalar Scalar;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename internal::nested<TensorCustomUnaryOp>::type Nested;
typedef typename internal::traits<TensorCustomUnaryOp>::StorageKind StorageKind;
typedef typename internal::traits<TensorCustomUnaryOp>::Index Index;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCustomUnaryOp(const XprType& expr, const CustomUnaryFunc& func)
: m_expr(expr), m_func(func) {}
EIGEN_DEVICE_FUNC
const CustomUnaryFunc& func() const { return m_func; }
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename XprType::Nested>::type&
expression() const { return m_expr; }
protected:
typename XprType::Nested m_expr;
const CustomUnaryFunc m_func;
};
// Eval as rvalue
template<typename CustomUnaryFunc, typename XprType, typename Device>
struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Device>
{
typedef TensorCustomUnaryOp<CustomUnaryFunc, XprType> ArgType;
typedef typename internal::traits<ArgType>::Index Index;
static const int NumDims = internal::traits<ArgType>::NumDimensions;
typedef DSizes<Index, NumDims> Dimensions;
typedef typename internal::remove_const<typename ArgType::Scalar>::type Scalar;
typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
typedef StorageMemory<CoeffReturnType, Device> Storage;
typedef typename Storage::Type EvaluatorPointerType;
enum {
IsAligned = false,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<XprType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlock;
//===--------------------------------------------------------------------===//
EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device)
: m_op(op), m_device(device), m_result(NULL)
{
m_dimensions = op.func().dimensions(op.expression());
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
if (data) {
evalTo(data);
return false;
} else {
m_result = static_cast<EvaluatorPointerType>(m_device.get( (CoeffReturnType*)
m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar))));
evalTo(m_result);
return true;
}
}
EIGEN_STRONG_INLINE void cleanup() {
if (m_result) {
m_device.deallocate_temp(m_result);
m_result = NULL;
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
return m_result[index];
}
template<int LoadMode>
EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
return internal::ploadt<PacketReturnType, LoadMode>(m_result + index);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
// TODO(rmlarsen): Extend CustomOp API to return its cost estimate.
return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
}
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_result.bind(cgh);
}
#endif
protected:
void evalTo(EvaluatorPointerType data) {
TensorMap<Tensor<CoeffReturnType, NumDims, Layout, Index> > result(m_device.get(data), m_dimensions);
m_op.func().eval(m_op.expression(), result, m_device);
}
Dimensions m_dimensions;
const ArgType m_op;
const Device EIGEN_DEVICE_REF m_device;
EvaluatorPointerType m_result;
};
/** \class TensorCustomBinaryOp
* \ingroup CXX11_Tensor_Module
*
  * \brief Expression of a user-defined binary operator applied to two tensor expressions.
  *
*/
namespace internal {
template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType>
struct traits<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> >
{
typedef typename internal::promote_storage_type<typename LhsXprType::Scalar,
typename RhsXprType::Scalar>::ret Scalar;
typedef typename internal::promote_storage_type<typename LhsXprType::CoeffReturnType,
typename RhsXprType::CoeffReturnType>::ret CoeffReturnType;
typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind,
typename traits<RhsXprType>::StorageKind>::ret StorageKind;
typedef typename promote_index_type<typename traits<LhsXprType>::Index,
typename traits<RhsXprType>::Index>::type Index;
typedef typename LhsXprType::Nested LhsNested;
typedef typename RhsXprType::Nested RhsNested;
typedef typename remove_reference<LhsNested>::type _LhsNested;
typedef typename remove_reference<RhsNested>::type _RhsNested;
static const int NumDimensions = traits<LhsXprType>::NumDimensions;
static const int Layout = traits<LhsXprType>::Layout;
typedef typename conditional<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val,
typename traits<LhsXprType>::PointerType, typename traits<RhsXprType>::PointerType>::type PointerType;
};
template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType>
struct eval<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Eigen::Dense>
{
typedef const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>& type;
};
template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType>
struct nested<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> >
{
typedef TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> type;
};
} // end namespace internal
template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType>
class TensorCustomBinaryOp : public TensorBase<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, ReadOnlyAccessors>
{
public:
typedef typename internal::traits<TensorCustomBinaryOp>::Scalar Scalar;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename internal::traits<TensorCustomBinaryOp>::CoeffReturnType CoeffReturnType;
typedef typename internal::nested<TensorCustomBinaryOp>::type Nested;
typedef typename internal::traits<TensorCustomBinaryOp>::StorageKind StorageKind;
typedef typename internal::traits<TensorCustomBinaryOp>::Index Index;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCustomBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, const CustomBinaryFunc& func)
: m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_func(func) {}
EIGEN_DEVICE_FUNC
const CustomBinaryFunc& func() const { return m_func; }
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename LhsXprType::Nested>::type&
lhsExpression() const { return m_lhs_xpr; }
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename RhsXprType::Nested>::type&
rhsExpression() const { return m_rhs_xpr; }
protected:
typename LhsXprType::Nested m_lhs_xpr;
typename RhsXprType::Nested m_rhs_xpr;
const CustomBinaryFunc m_func;
};
// Eval as rvalue
template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType, typename Device>
struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Device>
{
typedef TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> XprType;
typedef typename internal::traits<XprType>::Index Index;
static const int NumDims = internal::traits<XprType>::NumDimensions;
typedef DSizes<Index, NumDims> Dimensions;
typedef typename XprType::Scalar Scalar;
typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
typedef StorageMemory<CoeffReturnType, Device> Storage;
typedef typename Storage::Type EvaluatorPointerType;
enum {
IsAligned = false,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<LhsXprType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlock;
//===--------------------------------------------------------------------===//
EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_op(op), m_device(device), m_result(NULL)
{
m_dimensions = op.func().dimensions(op.lhsExpression(), op.rhsExpression());
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
if (data) {
evalTo(data);
return false;
} else {
m_result = static_cast<EvaluatorPointerType>(m_device.get( (CoeffReturnType*)
m_device.allocate_temp(dimensions().TotalSize() * sizeof(CoeffReturnType))));
evalTo(m_result);
return true;
}
}
EIGEN_STRONG_INLINE void cleanup() {
if (m_result != NULL) {
m_device.deallocate_temp(m_result);
m_result = NULL;
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
return m_result[index];
}
template<int LoadMode>
EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
return internal::ploadt<PacketReturnType, LoadMode>(m_result + index);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
// TODO(rmlarsen): Extend CustomOp API to return its cost estimate.
return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
}
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_result.bind(cgh);
}
#endif
protected:
void evalTo(EvaluatorPointerType data) {
TensorMap<Tensor<CoeffReturnType, NumDims, Layout> > result(m_device.get(data), m_dimensions);
m_op.func().eval(m_op.lhsExpression(), m_op.rhsExpression(), result, m_device);
}
Dimensions m_dimensions;
const XprType m_op;
const Device EIGEN_DEVICE_REF m_device;
EvaluatorPointerType m_result;
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H
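For reference, a hedged sketch of a functor satisfying the protocol the evaluators above rely on: dimensions() to size the output, eval() to fill it. The functor name, the shapes, and the use of TensorBase::customOp() as the entry point are illustrative assumptions:

#include <unsupported/Eigen/CXX11/Tensor>

// Hypothetical unary functor: doubles every coefficient, keeping the shape.
struct ScaleByTwo {
  template <typename Input>
  Eigen::DSizes<Eigen::DenseIndex, 2> dimensions(const Input& input) const {
    // The evaluator calls this to size its output buffer.
    return Eigen::DSizes<Eigen::DenseIndex, 2>(input.dimension(0), input.dimension(1));
  }
  template <typename Input, typename Output, typename Device>
  void eval(const Input& input, Output& output, const Device& device) const {
    // The evaluator hands us a TensorMap over its temporary (or user-provided) buffer.
    output.device(device) = input * input.constant(2.0f);
  }
};

int main() {
  Eigen::Tensor<float, 2> t(3, 4);
  t.setRandom();
  // customOp() wraps the functor into a TensorCustomUnaryOp expression.
  Eigen::Tensor<float, 2> r = t.customOp(ScaleByTwo());
  return r.dimension(0) == 3 ? 0 : 1;
}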

View File

@@ -0,0 +1,137 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H
namespace Eigen {
/** \class TensorDevice
* \ingroup CXX11_Tensor_Module
*
* \brief Pseudo expression providing an operator = that will evaluate its argument
* on the specified computing 'device' (GPU, thread pool, ...)
*
* Example:
* C.device(EIGEN_GPU) = A + B;
*
* Todo: operator *= and /=.
*/
template <typename ExpressionType, typename DeviceType> class TensorDevice {
public:
TensorDevice(const DeviceType& device, ExpressionType& expression) : m_device(device), m_expression(expression) {}
EIGEN_DEFAULT_COPY_CONSTRUCTOR(TensorDevice)
template<typename OtherDerived>
EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) {
typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign;
Assign assign(m_expression, other);
internal::TensorExecutor<const Assign, DeviceType>::run(assign, m_device);
return *this;
}
template<typename OtherDerived>
EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) {
typedef typename OtherDerived::Scalar Scalar;
typedef TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const ExpressionType, const OtherDerived> Sum;
Sum sum(m_expression, other);
typedef TensorAssignOp<ExpressionType, const Sum> Assign;
Assign assign(m_expression, sum);
internal::TensorExecutor<const Assign, DeviceType>::run(assign, m_device);
return *this;
}
template<typename OtherDerived>
EIGEN_STRONG_INLINE TensorDevice& operator-=(const OtherDerived& other) {
typedef typename OtherDerived::Scalar Scalar;
typedef TensorCwiseBinaryOp<internal::scalar_difference_op<Scalar>, const ExpressionType, const OtherDerived> Difference;
Difference difference(m_expression, other);
typedef TensorAssignOp<ExpressionType, const Difference> Assign;
Assign assign(m_expression, difference);
internal::TensorExecutor<const Assign, DeviceType>::run(assign, m_device);
return *this;
}
protected:
const DeviceType& m_device;
ExpressionType& m_expression;
};
/** \class TensorAsyncDevice
* \ingroup CXX11_Tensor_Module
*
* \brief Pseudo expression providing an operator = that will evaluate its
* argument asynchronously on the specified device. Currently only
* ThreadPoolDevice implements proper asynchronous execution, while the default
* and GPU devices just run the expression synchronously and call m_done() on
  * completion.
*
* Example:
* auto done = []() { ... expression evaluation done ... };
* C.device(thread_pool_device, std::move(done)) = A + B;
*/
template <typename ExpressionType, typename DeviceType, typename DoneCallback>
class TensorAsyncDevice {
public:
TensorAsyncDevice(const DeviceType& device, ExpressionType& expression,
DoneCallback done)
: m_device(device), m_expression(expression), m_done(std::move(done)) {}
template <typename OtherDerived>
EIGEN_STRONG_INLINE TensorAsyncDevice& operator=(const OtherDerived& other) {
typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign;
typedef internal::TensorExecutor<const Assign, DeviceType> Executor;
Assign assign(m_expression, other);
Executor::run(assign, m_device);
m_done();
return *this;
}
protected:
const DeviceType& m_device;
ExpressionType& m_expression;
DoneCallback m_done;
};
#ifdef EIGEN_USE_THREADS
template <typename ExpressionType, typename DoneCallback>
class TensorAsyncDevice<ExpressionType, ThreadPoolDevice, DoneCallback> {
public:
TensorAsyncDevice(const ThreadPoolDevice& device, ExpressionType& expression,
DoneCallback done)
: m_device(device), m_expression(expression), m_done(std::move(done)) {}
template <typename OtherDerived>
EIGEN_STRONG_INLINE TensorAsyncDevice& operator=(const OtherDerived& other) {
typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign;
typedef internal::TensorAsyncExecutor<const Assign, ThreadPoolDevice, DoneCallback> Executor;
// WARNING: After assignment 'm_done' callback will be in undefined state.
Assign assign(m_expression, other);
Executor::runAsync(assign, m_device, std::move(m_done));
return *this;
}
protected:
const ThreadPoolDevice& m_device;
ExpressionType& m_expression;
DoneCallback m_done;
};
#endif
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H
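A usage sketch for the synchronous and asynchronous assignment paths above. It assumes EIGEN_USE_THREADS, the unsupported Tensor header, and that Eigen::ThreadPool and Eigen::Barrier from the bundled ThreadPool module are available; the sizes and pool width are arbitrary:

#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::ThreadPool pool(4);
  Eigen::ThreadPoolDevice device(&pool, /*num_cores=*/4);
  Eigen::Tensor<float, 1> a(1024), b(1024), c(1024);
  a.setRandom();
  b.setRandom();

  // Synchronous: TensorDevice::operator= blocks until the assignment is done.
  c.device(device) = a + b;

  // Asynchronous: TensorAsyncDevice::operator= returns immediately and the
  // done callback fires once the expression has been evaluated.
  Eigen::Barrier done(1);
  c.device(device, [&done]() { done.Notify(); }) = a * b;
  done.Wait();
  return 0;
}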

View File

@@ -0,0 +1,6 @@
#if defined(__clang__) || defined(__GNUC__)
#warning "Deprecated header file, please either include the main Eigen/CXX11/Tensor header or the respective TensorDeviceGpu.h file"
#endif
#include "TensorDeviceGpu.h"

View File

@@ -0,0 +1,104 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H
namespace Eigen {
// Default device for the machine (typically a single cpu core)
struct DefaultDevice {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
return internal::aligned_malloc(num_bytes);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
internal::aligned_free(buffer);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const {
return allocate(num_bytes);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const {
deallocate(buffer);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
::memcpy(dst, src, n);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
memcpy(dst, src, n);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
memcpy(dst, src, n);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
::memset(buffer, c, n);
}
template<typename Type>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const {
return data;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
#if !defined(EIGEN_GPU_COMPILE_PHASE)
// Running on the host CPU
return 1;
#elif defined(EIGEN_HIP_DEVICE_COMPILE)
// Running on a HIP device
return 64;
#else
// Running on a CUDA device
return 32;
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY)
// Running on the host CPU
return l1CacheSize();
#elif defined(EIGEN_HIP_DEVICE_COMPILE)
// Running on a HIP device
return 48*1024; // FIXME : update this number for HIP
#else
// Running on a CUDA device, return the amount of shared memory available.
return 48*1024;
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY)
// Running single threaded on the host CPU
return l3CacheSize();
#elif defined(EIGEN_HIP_DEVICE_COMPILE)
// Running on a HIP device
return firstLevelCacheSize(); // FIXME : update this number for HIP
#else
// Running on a CUDA device
return firstLevelCacheSize();
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
#if !defined(EIGEN_GPU_COMPILE_PHASE)
// Running single threaded on the host CPU
// Should return an enum that encodes the ISA supported by the CPU
return 1;
#elif defined(EIGEN_HIP_DEVICE_COMPILE)
// Running on a HIP device
// return 1 as major for HIP
return 1;
#else
// Running on a CUDA device
return EIGEN_CUDA_ARCH / 100;
#endif
}
};
} // namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H
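The default device above is what plain Tensor assignments use under the hood; a tiny sketch making that explicit (shapes are arbitrary):

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::DefaultDevice dev;              // single host core, aligned_malloc-backed
  Eigen::Tensor<float, 1> a(8), b(8), c(8);
  a.setConstant(1.0f);
  b.setConstant(2.0f);
  c = a + b;                             // implicitly evaluated on a DefaultDevice
  c.device(dev) = a + b;                 // the same assignment with the device spelled out
  return 0;
}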

View File

@@ -0,0 +1,389 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H)
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H
// This header file defines the gpu* macros, which will resolve to
// their equivalent hip* or cuda* versions depending on the compiler in use.
// A separate header (included at the end of this file) will undefine them all.
#include "TensorGpuHipCudaDefines.h"
namespace Eigen {
static const int kGpuScratchSize = 1024;
// This defines an interface that GPUDevice can take to use
// HIP / CUDA streams underneath.
class StreamInterface {
public:
virtual ~StreamInterface() {}
virtual const gpuStream_t& stream() const = 0;
virtual const gpuDeviceProp_t& deviceProperties() const = 0;
// Allocate memory on the actual device where the computation will run
virtual void* allocate(size_t num_bytes) const = 0;
virtual void deallocate(void* buffer) const = 0;
// Return a scratchpad buffer of size 1k
virtual void* scratchpad() const = 0;
  // Return a semaphore. The semaphore is initialized to 0, and each kernel
  // using it is responsible for resetting it to 0 upon completion, to
  // maintain the invariant that the semaphore always equals 0 when each
  // kernel starts.
virtual unsigned int* semaphore() const = 0;
};
class GpuDeviceProperties {
public:
GpuDeviceProperties() :
initialized_(false), first_(true), device_properties_(nullptr) {}
~GpuDeviceProperties() {
if (device_properties_) {
delete[] device_properties_;
}
}
EIGEN_STRONG_INLINE const gpuDeviceProp_t& get(int device) const {
return device_properties_[device];
}
EIGEN_STRONG_INLINE bool isInitialized() const {
return initialized_;
}
void initialize() {
if (!initialized_) {
      // Attempts to ensure proper behavior in the case of multiple threads
      // calling this function simultaneously. This would be trivial to
      // implement if we could use std::mutex, but unfortunately mutexes don't
      // compile with nvcc, so we resort to atomics and thread fences instead.
      // Note that if the caller uses a compiler that doesn't support C++11 we
      // can't ensure that the initialization is thread safe.
if (first_.exchange(false)) {
// We're the first thread to reach this point.
int num_devices;
gpuError_t status = gpuGetDeviceCount(&num_devices);
if (status != gpuSuccess) {
std::cerr << "Failed to get the number of GPU devices: "
<< gpuGetErrorString(status)
<< std::endl;
gpu_assert(status == gpuSuccess);
}
device_properties_ = new gpuDeviceProp_t[num_devices];
for (int i = 0; i < num_devices; ++i) {
status = gpuGetDeviceProperties(&device_properties_[i], i);
if (status != gpuSuccess) {
std::cerr << "Failed to initialize GPU device #"
<< i
<< ": "
<< gpuGetErrorString(status)
<< std::endl;
gpu_assert(status == gpuSuccess);
}
}
std::atomic_thread_fence(std::memory_order_release);
initialized_ = true;
} else {
        // Wait for the other thread to initialize the properties.
while (!initialized_) {
std::atomic_thread_fence(std::memory_order_acquire);
std::this_thread::sleep_for(std::chrono::milliseconds(1000));
}
}
}
}
private:
volatile bool initialized_;
std::atomic<bool> first_;
gpuDeviceProp_t* device_properties_;
};
EIGEN_ALWAYS_INLINE const GpuDeviceProperties& GetGpuDeviceProperties() {
static GpuDeviceProperties* deviceProperties = new GpuDeviceProperties();
if (!deviceProperties->isInitialized()) {
deviceProperties->initialize();
}
return *deviceProperties;
}
EIGEN_ALWAYS_INLINE const gpuDeviceProp_t& GetGpuDeviceProperties(int device) {
return GetGpuDeviceProperties().get(device);
}
static const gpuStream_t default_stream = gpuStreamDefault;
class GpuStreamDevice : public StreamInterface {
public:
// Use the default stream on the current device
GpuStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
gpuGetDevice(&device_);
}
// Use the default stream on the specified device
GpuStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {}
  // Use the specified stream. Note that it's the caller's responsibility to
  // ensure that the stream can run on the specified device. If no device is
  // specified, the code assumes that the stream is associated with the
  // current GPU device.
GpuStreamDevice(const gpuStream_t* stream, int device = -1)
: stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
if (device < 0) {
gpuGetDevice(&device_);
} else {
int num_devices;
gpuError_t err = gpuGetDeviceCount(&num_devices);
EIGEN_UNUSED_VARIABLE(err)
gpu_assert(err == gpuSuccess);
gpu_assert(device < num_devices);
device_ = device;
}
}
virtual ~GpuStreamDevice() {
if (scratch_) {
deallocate(scratch_);
}
}
const gpuStream_t& stream() const { return *stream_; }
const gpuDeviceProp_t& deviceProperties() const {
return GetGpuDeviceProperties(device_);
}
virtual void* allocate(size_t num_bytes) const {
gpuError_t err = gpuSetDevice(device_);
EIGEN_UNUSED_VARIABLE(err)
gpu_assert(err == gpuSuccess);
void* result;
err = gpuMalloc(&result, num_bytes);
gpu_assert(err == gpuSuccess);
gpu_assert(result != NULL);
return result;
}
virtual void deallocate(void* buffer) const {
gpuError_t err = gpuSetDevice(device_);
EIGEN_UNUSED_VARIABLE(err)
gpu_assert(err == gpuSuccess);
gpu_assert(buffer != NULL);
err = gpuFree(buffer);
gpu_assert(err == gpuSuccess);
}
virtual void* scratchpad() const {
if (scratch_ == NULL) {
scratch_ = allocate(kGpuScratchSize + sizeof(unsigned int));
}
return scratch_;
}
virtual unsigned int* semaphore() const {
if (semaphore_ == NULL) {
char* scratch = static_cast<char*>(scratchpad()) + kGpuScratchSize;
semaphore_ = reinterpret_cast<unsigned int*>(scratch);
gpuError_t err = gpuMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
EIGEN_UNUSED_VARIABLE(err)
gpu_assert(err == gpuSuccess);
}
return semaphore_;
}
private:
const gpuStream_t* stream_;
int device_;
mutable void* scratch_;
mutable unsigned int* semaphore_;
};
struct GpuDevice {
// The StreamInterface is not owned: the caller is
// responsible for its initialization and eventual destruction.
explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) {
eigen_assert(stream);
}
explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
eigen_assert(stream);
}
// TODO(bsteiner): This is an internal API, we should not expose it.
EIGEN_STRONG_INLINE const gpuStream_t& stream() const {
return stream_->stream();
}
EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
return stream_->allocate(num_bytes);
}
EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
stream_->deallocate(buffer);
}
EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const {
return stream_->allocate(num_bytes);
}
EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const {
stream_->deallocate(buffer);
}
template<typename Type>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const {
return data;
}
EIGEN_STRONG_INLINE void* scratchpad() const {
return stream_->scratchpad();
}
EIGEN_STRONG_INLINE unsigned int* semaphore() const {
return stream_->semaphore();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
#ifndef EIGEN_GPU_COMPILE_PHASE
gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToDevice,
stream_->stream());
EIGEN_UNUSED_VARIABLE(err)
gpu_assert(err == gpuSuccess);
#else
EIGEN_UNUSED_VARIABLE(dst);
EIGEN_UNUSED_VARIABLE(src);
EIGEN_UNUSED_VARIABLE(n);
eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
}
EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
gpuError_t err =
gpuMemcpyAsync(dst, src, n, gpuMemcpyHostToDevice, stream_->stream());
EIGEN_UNUSED_VARIABLE(err)
gpu_assert(err == gpuSuccess);
}
EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
gpuError_t err =
gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToHost, stream_->stream());
EIGEN_UNUSED_VARIABLE(err)
gpu_assert(err == gpuSuccess);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
#ifndef EIGEN_GPU_COMPILE_PHASE
gpuError_t err = gpuMemsetAsync(buffer, c, n, stream_->stream());
EIGEN_UNUSED_VARIABLE(err)
gpu_assert(err == gpuSuccess);
#else
eigen_assert(false && "The default device should be used instead to generate kernel code");
#endif
}
EIGEN_STRONG_INLINE size_t numThreads() const {
// FIXME
return 32;
}
EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
// FIXME
return 48*1024;
}
EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
// We won't try to take advantage of the l2 cache for the time being, and
// there is no l3 cache on hip/cuda devices.
return firstLevelCacheSize();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
#ifndef EIGEN_GPU_COMPILE_PHASE
gpuError_t err = gpuStreamSynchronize(stream_->stream());
if (err != gpuSuccess) {
std::cerr << "Error detected in GPU stream: "
<< gpuGetErrorString(err)
<< std::endl;
gpu_assert(err == gpuSuccess);
}
#else
gpu_assert(false && "The default device should be used instead to generate kernel code");
#endif
}
EIGEN_STRONG_INLINE int getNumGpuMultiProcessors() const {
return stream_->deviceProperties().multiProcessorCount;
}
EIGEN_STRONG_INLINE int maxGpuThreadsPerBlock() const {
return stream_->deviceProperties().maxThreadsPerBlock;
}
EIGEN_STRONG_INLINE int maxGpuThreadsPerMultiProcessor() const {
return stream_->deviceProperties().maxThreadsPerMultiProcessor;
}
EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
return stream_->deviceProperties().sharedMemPerBlock;
}
EIGEN_STRONG_INLINE int majorDeviceVersion() const {
return stream_->deviceProperties().major;
}
EIGEN_STRONG_INLINE int minorDeviceVersion() const {
return stream_->deviceProperties().minor;
}
EIGEN_STRONG_INLINE int maxBlocks() const {
return max_blocks_;
}
// This function checks if the GPU runtime recorded an error for the
// underlying stream device.
inline bool ok() const {
#ifdef EIGEN_GPUCC
gpuError_t error = gpuStreamQuery(stream_->stream());
return (error == gpuSuccess) || (error == gpuErrorNotReady);
#else
return false;
#endif
}
private:
const StreamInterface* stream_;
int max_blocks_;
};
#if defined(EIGEN_HIPCC)
#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
hipLaunchKernelGGL(kernel, dim3(gridsize), dim3(blocksize), (sharedmem), (device).stream(), __VA_ARGS__); \
gpu_assert(hipGetLastError() == hipSuccess);
#else
#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \
(kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \
gpu_assert(cudaGetLastError() == cudaSuccess);
#endif
// FIXME: Should be device and kernel specific.
#ifdef EIGEN_GPUCC
static EIGEN_DEVICE_FUNC inline void setGpuSharedMemConfig(gpuSharedMemConfig config) {
#ifndef EIGEN_GPU_COMPILE_PHASE
gpuError_t status = gpuDeviceSetSharedMemConfig(config);
EIGEN_UNUSED_VARIABLE(status)
gpu_assert(status == gpuSuccess);
#else
EIGEN_UNUSED_VARIABLE(config)
#endif
}
#endif
} // end namespace Eigen
// undefine all the gpu* macros we defined at the beginning of the file
#include "TensorGpuHipCudaUndefines.h"
#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H
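A hedged usage sketch for GpuStreamDevice/GpuDevice. It assumes EIGEN_USE_GPU and a CUDA or HIP toolchain compiling the translation unit for the device; the function name and buffer size are invented for illustration:

#define EIGEN_USE_GPU
#include <unsupported/Eigen/CXX11/Tensor>

void add_on_gpu(const float* host_a, const float* host_b, float* host_c, int n) {
  Eigen::GpuStreamDevice stream;            // default stream on the current device
  Eigen::GpuDevice device(&stream);

  const size_t bytes = n * sizeof(float);
  float* d_a = static_cast<float*>(device.allocate(bytes));
  float* d_b = static_cast<float*>(device.allocate(bytes));
  float* d_c = static_cast<float*>(device.allocate(bytes));
  device.memcpyHostToDevice(d_a, host_a, bytes);
  device.memcpyHostToDevice(d_b, host_b, bytes);

  Eigen::TensorMap<Eigen::Tensor<float, 1> > a(d_a, n), b(d_b, n), c(d_c, n);
  c.device(device) = a + b;                 // launches a kernel on the stream

  device.memcpyDeviceToHost(host_c, d_c, bytes);
  device.synchronize();                     // copies and kernels are async: wait before using host_c
  device.deallocate(d_a);
  device.deallocate(d_b);
  device.deallocate(d_c);
}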

File diff suppressed because it is too large

View File

@@ -0,0 +1,409 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#if defined(EIGEN_USE_THREADS) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H)
#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H
namespace Eigen {
// Runs an arbitrary function and then calls Notify() on the passed in
// Notification.
template <typename Function, typename... Args> struct FunctionWrapperWithNotification
{
static void run(Notification* n, Function f, Args... args) {
f(args...);
if (n) {
n->Notify();
}
}
};
template <typename Function, typename... Args> struct FunctionWrapperWithBarrier
{
static void run(Barrier* b, Function f, Args... args) {
f(args...);
if (b) {
b->Notify();
}
}
};
template <typename SyncType>
static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) {
if (n) {
n->Wait();
}
}
// An abstract interface to a device specific memory allocator.
class Allocator {
public:
virtual ~Allocator() {}
virtual void* allocate(size_t num_bytes) const = 0;
virtual void deallocate(void* buffer) const = 0;
};
// Build a thread pool device on top of an existing pool of threads.
struct ThreadPoolDevice {
// The ownership of the thread pool remains with the caller.
ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores, Allocator* allocator = nullptr)
: pool_(pool), num_threads_(num_cores), allocator_(allocator) { }
EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
return allocator_ ? allocator_->allocate(num_bytes)
: internal::aligned_malloc(num_bytes);
}
EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
if (allocator_) {
allocator_->deallocate(buffer);
} else {
internal::aligned_free(buffer);
}
}
EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const {
return allocate(num_bytes);
}
EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const {
deallocate(buffer);
}
template<typename Type>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const {
return data;
}
EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
#ifdef __ANDROID__
::memcpy(dst, src, n);
#else
// TODO(rmlarsen): Align blocks on cache lines.
// We have observed that going beyond 4 threads usually just wastes
// CPU cycles due to the threads competing for memory bandwidth, so we
// statically schedule at most 4 block copies here.
const size_t kMinBlockSize = 32768;
const size_t num_threads = CostModel::numThreads(n, TensorOpCost(1.0, 1.0, 0), 4);
if (n <= kMinBlockSize || num_threads < 2) {
::memcpy(dst, src, n);
} else {
const char* src_ptr = static_cast<const char*>(src);
char* dst_ptr = static_cast<char*>(dst);
const size_t blocksize = (n + (num_threads - 1)) / num_threads;
Barrier barrier(static_cast<int>(num_threads - 1));
      // Launch all but the first block on worker threads.
for (size_t i = 1; i < num_threads; ++i) {
enqueue_with_barrier(&barrier, [n, i, src_ptr, dst_ptr, blocksize] {
::memcpy(dst_ptr + i * blocksize, src_ptr + i * blocksize,
numext::mini(blocksize, n - (i * blocksize)));
});
}
// Launch the first block on the main thread.
::memcpy(dst_ptr, src_ptr, blocksize);
barrier.Wait();
}
#endif
}
EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
memcpy(dst, src, n);
}
EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
memcpy(dst, src, n);
}
EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
::memset(buffer, c, n);
}
EIGEN_STRONG_INLINE int numThreads() const {
return num_threads_;
}
  // Number of threads available in the underlying thread pool. This number can
// be different from the value returned by numThreads().
EIGEN_STRONG_INLINE int numThreadsInPool() const {
return pool_->NumThreads();
}
EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
return l1CacheSize();
}
EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
// The l3 cache size is shared between all the cores.
return l3CacheSize() / num_threads_;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
// Should return an enum that encodes the ISA supported by the CPU
return 1;
}
template <class Function, class... Args>
EIGEN_STRONG_INLINE Notification* enqueue(Function&& f,
Args&&... args) const {
Notification* n = new Notification();
pool_->Schedule(
std::bind(&FunctionWrapperWithNotification<Function, Args...>::run, n,
std::move(f), args...));
return n;
}
template <class Function, class... Args>
EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b, Function&& f,
Args&&... args) const {
pool_->Schedule(
std::bind(&FunctionWrapperWithBarrier<Function, Args...>::run, b,
std::move(f), args...));
}
template <class Function, class... Args>
EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f,
Args&&... args) const {
if (sizeof...(args) > 0) {
pool_->Schedule(std::bind(std::move(f), args...));
} else {
pool_->Schedule(std::move(f));
}
}
// Returns a logical thread index between 0 and pool_->NumThreads() - 1 if
// called from one of the threads in pool_. Returns -1 otherwise.
EIGEN_STRONG_INLINE int currentThreadId() const {
return pool_->CurrentThreadId();
}
// WARNING: This function is synchronous and will block the calling thread.
//
// Synchronous parallelFor executes f with [0, n) arguments in parallel and
// waits for completion. F accepts a half-open interval [first, last). Block
// size is chosen based on the iteration cost and resulting parallel
// efficiency. If block_align is not nullptr, it is called to round up the
// block size.
void parallelFor(Index n, const TensorOpCost& cost,
std::function<Index(Index)> block_align,
std::function<void(Index, Index)> f) const {
if (EIGEN_PREDICT_FALSE(n <= 0)){
return;
// Compute small problems directly in the caller thread.
} else if (n == 1 || numThreads() == 1 ||
CostModel::numThreads(n, cost, static_cast<int>(numThreads())) == 1) {
f(0, n);
return;
}
// Compute block size and total count of blocks.
ParallelForBlock block = CalculateParallelForBlock(n, cost, block_align);
// Recursively divide size into halves until we reach block_size.
// Division code rounds mid to block_size, so we are guaranteed to get
// block_count leaves that do actual computations.
Barrier barrier(static_cast<unsigned int>(block.count));
std::function<void(Index, Index)> handleRange;
handleRange = [=, &handleRange, &barrier, &f](Index firstIdx,
Index lastIdx) {
while (lastIdx - firstIdx > block.size) {
// Split into halves and schedule the second half on a different thread.
const Index midIdx = firstIdx + divup((lastIdx - firstIdx) / 2, block.size) * block.size;
pool_->Schedule([=, &handleRange]() { handleRange(midIdx, lastIdx); });
lastIdx = midIdx;
}
// Single block or less, execute directly.
f(firstIdx, lastIdx);
barrier.Notify();
};
if (block.count <= numThreads()) {
// Avoid a thread hop by running the root of the tree and one block on the
// main thread.
handleRange(0, n);
} else {
// Execute the root in the thread pool to avoid running work on more than
// numThreads() threads.
pool_->Schedule([=, &handleRange]() { handleRange(0, n); });
}
barrier.Wait();
}
// Convenience wrapper for parallelFor that does not align blocks.
void parallelFor(Index n, const TensorOpCost& cost,
std::function<void(Index, Index)> f) const {
parallelFor(n, cost, nullptr, std::move(f));
}
// WARNING: This function is asynchronous and will not block the calling thread.
//
// Asynchronous parallelFor executes f with [0, n) arguments in parallel
// without waiting for completion. When the last block finished, it will call
// 'done' callback. F accepts a half-open interval [first, last). Block size
// is chosen based on the iteration cost and resulting parallel efficiency. If
// block_align is not nullptr, it is called to round up the block size.
void parallelForAsync(Index n, const TensorOpCost& cost,
std::function<Index(Index)> block_align,
std::function<void(Index, Index)> f,
std::function<void()> done) const {
// Compute small problems directly in the caller thread.
if (n <= 1 || numThreads() == 1 ||
CostModel::numThreads(n, cost, static_cast<int>(numThreads())) == 1) {
f(0, n);
done();
return;
}
// Compute block size and total count of blocks.
ParallelForBlock block = CalculateParallelForBlock(n, cost, block_align);
ParallelForAsyncContext* const ctx =
new ParallelForAsyncContext(block.count, std::move(f), std::move(done));
// Recursively divide size into halves until we reach block_size.
// Division code rounds mid to block_size, so we are guaranteed to get
// block_count leaves that do actual computations.
ctx->handle_range = [this, ctx, block](Index firstIdx, Index lastIdx) {
while (lastIdx - firstIdx > block.size) {
// Split into halves and schedule the second half on a different thread.
const Index midIdx = firstIdx + divup((lastIdx - firstIdx) / 2, block.size) * block.size;
pool_->Schedule(
[ctx, midIdx, lastIdx]() { ctx->handle_range(midIdx, lastIdx); });
lastIdx = midIdx;
}
// Single block or less, execute directly.
ctx->f(firstIdx, lastIdx);
// Delete async context if it was the last block.
if (ctx->count.fetch_sub(1) == 1) delete ctx;
};
if (block.count <= numThreads()) {
// Avoid a thread hop by running the root of the tree and one block on the
// main thread.
ctx->handle_range(0, n);
} else {
// Execute the root in the thread pool to avoid running work on more than
// numThreads() threads.
pool_->Schedule([ctx, n]() { ctx->handle_range(0, n); });
}
}
// Convenience wrapper for parallelForAsync that does not align blocks.
void parallelForAsync(Index n, const TensorOpCost& cost,
std::function<void(Index, Index)> f,
std::function<void()> done) const {
parallelForAsync(n, cost, nullptr, std::move(f), std::move(done));
}
// Thread pool accessor.
ThreadPoolInterface* getPool() const { return pool_; }
// Allocator accessor.
Allocator* allocator() const { return allocator_; }
private:
typedef TensorCostModel<ThreadPoolDevice> CostModel;
// For parallelForAsync we must keep passed in closures on the heap, and
// delete them only after `done` callback finished.
struct ParallelForAsyncContext {
ParallelForAsyncContext(Index block_count,
std::function<void(Index, Index)> block_f,
std::function<void()> done_callback)
: count(block_count),
f(std::move(block_f)),
done(std::move(done_callback)) {}
~ParallelForAsyncContext() { done(); }
std::atomic<Index> count;
std::function<void(Index, Index)> f;
std::function<void()> done;
std::function<void(Index, Index)> handle_range;
};
struct ParallelForBlock {
Index size; // block size
Index count; // number of blocks
};
  // Calculates block size based on (1) the iteration cost and (2) parallel
  // efficiency. We want blocks to be not too small, to mitigate
  // parallelization overheads; not too large, to mitigate tail effects and
  // potential load imbalance; and we also want the number of blocks to be
  // evenly divisible across threads.
ParallelForBlock CalculateParallelForBlock(
const Index n, const TensorOpCost& cost,
std::function<Index(Index)> block_align) const {
const double block_size_f = 1.0 / CostModel::taskSize(1, cost);
const Index max_oversharding_factor = 4;
Index block_size = numext::mini(
n, numext::maxi<Index>(
divup<Index>(n, max_oversharding_factor * numThreads()),
block_size_f));
const Index max_block_size = numext::mini(n, 2 * block_size);
if (block_align) {
Index new_block_size = block_align(block_size);
eigen_assert(new_block_size >= block_size);
block_size = numext::mini(n, new_block_size);
}
Index block_count = divup(n, block_size);
// Calculate parallel efficiency as fraction of total CPU time used for
// computations:
double max_efficiency =
static_cast<double>(block_count) /
(divup<int>(block_count, numThreads()) * numThreads());
// Now try to increase block size up to max_block_size as long as it
// doesn't decrease parallel efficiency.
for (Index prev_block_count = block_count;
max_efficiency < 1.0 && prev_block_count > 1;) {
// This is the next block size that divides size into a smaller number
// of blocks than the current block_size.
Index coarser_block_size = divup(n, prev_block_count - 1);
if (block_align) {
Index new_block_size = block_align(coarser_block_size);
eigen_assert(new_block_size >= coarser_block_size);
coarser_block_size = numext::mini(n, new_block_size);
}
if (coarser_block_size > max_block_size) {
break; // Reached max block size. Stop.
}
// Recalculate parallel efficiency.
const Index coarser_block_count = divup(n, coarser_block_size);
eigen_assert(coarser_block_count < prev_block_count);
prev_block_count = coarser_block_count;
const double coarser_efficiency =
static_cast<double>(coarser_block_count) /
(divup<int>(coarser_block_count, numThreads()) * numThreads());
if (coarser_efficiency + 0.01 >= max_efficiency) {
// Taking it.
block_size = coarser_block_size;
block_count = coarser_block_count;
if (max_efficiency < coarser_efficiency) {
max_efficiency = coarser_efficiency;
}
}
}
return {block_size, block_count};
}
ThreadPoolInterface* pool_;
int num_threads_;
Allocator* allocator_;
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H
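A sketch of driving parallelFor directly. It assumes EIGEN_USE_THREADS and that Eigen::ThreadPool (the bundled non-blocking pool) implements ThreadPoolInterface; the per-element cost below is a rough guess and only influences the block size the cost model picks:

#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>
#include <vector>

int main() {
  Eigen::ThreadPool pool(4);
  Eigen::ThreadPoolDevice device(&pool, /*num_cores=*/4);

  std::vector<float> data(1 << 20, 1.0f);
  // Rough per-element cost: one load, one store, two compute cycles.
  Eigen::TensorOpCost cost(sizeof(float), sizeof(float), 2);
  device.parallelFor(static_cast<Eigen::Index>(data.size()), cost,
                     [&data](Eigen::Index first, Eigen::Index last) {
                       // f receives a half-open range [first, last) of indices.
                       for (Eigen::Index i = first; i < last; ++i) data[i] *= 2.0f;
                     });
  return 0;
}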

View File

@@ -0,0 +1,236 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H
#define EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H
namespace Eigen {
/** \internal
*
* \class TensorDimensionList
* \ingroup CXX11_Tensor_Module
*
* \brief Special case of tensor index list used to list all the dimensions of a tensor of rank n.
*
* \sa Tensor
*/
template <typename Index, std::size_t Rank> struct DimensionList {
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
const Index operator[] (const Index i) const { return i; }
};
namespace internal {
template<typename Index, std::size_t Rank> struct array_size<DimensionList<Index, Rank> > {
static const size_t value = Rank;
};
template<typename Index, std::size_t Rank> struct array_size<const DimensionList<Index, Rank> > {
static const size_t value = Rank;
};
template<DenseIndex n, typename Index, std::size_t Rank> const Index array_get(DimensionList<Index, Rank>&) {
return n;
}
template<DenseIndex n, typename Index, std::size_t Rank> const Index array_get(const DimensionList<Index, Rank>&) {
return n;
}
#if EIGEN_HAS_CONSTEXPR
template <typename Index, std::size_t Rank>
struct index_known_statically_impl<DimensionList<Index, Rank> > {
EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) {
return true;
}
};
template <typename Index, std::size_t Rank>
struct index_known_statically_impl<const DimensionList<Index, Rank> > {
EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) {
return true;
}
};
template <typename Index, std::size_t Rank>
struct all_indices_known_statically_impl<DimensionList<Index, Rank> > {
EIGEN_DEVICE_FUNC static constexpr bool run() {
return true;
}
};
template <typename Index, std::size_t Rank>
struct all_indices_known_statically_impl<const DimensionList<Index, Rank> > {
EIGEN_DEVICE_FUNC static constexpr bool run() {
return true;
}
};
template <typename Index, std::size_t Rank>
struct indices_statically_known_to_increase_impl<DimensionList<Index, Rank> > {
EIGEN_DEVICE_FUNC static constexpr bool run() {
return true;
}
};
template <typename Index, std::size_t Rank>
struct indices_statically_known_to_increase_impl<const DimensionList<Index, Rank> > {
EIGEN_DEVICE_FUNC static constexpr bool run() {
return true;
}
};
template <typename Index, std::size_t Rank>
struct index_statically_eq_impl<DimensionList<Index, Rank> > {
static constexpr bool run(const DenseIndex i, const DenseIndex value) {
return i == value;
}
};
template <typename Index, std::size_t Rank>
struct index_statically_eq_impl<const DimensionList<Index, Rank> > {
EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
return i == value;
}
};
template <typename Index, std::size_t Rank>
struct index_statically_ne_impl<DimensionList<Index, Rank> > {
EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
return i != value;
}
};
template <typename Index, std::size_t Rank>
struct index_statically_ne_impl<const DimensionList<Index, Rank> > {
static constexpr bool run(const DenseIndex i, const DenseIndex value) {
return i != value;
}
};
template <typename Index, std::size_t Rank>
struct index_statically_gt_impl<DimensionList<Index, Rank> > {
EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
return i > value;
}
};
template <typename Index, std::size_t Rank>
struct index_statically_gt_impl<const DimensionList<Index, Rank> > {
EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
return i > value;
}
};
template <typename Index, std::size_t Rank>
struct index_statically_lt_impl<DimensionList<Index, Rank> > {
EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
return i < value;
}
};
template <typename Index, std::size_t Rank>
struct index_statically_lt_impl<const DimensionList<Index, Rank> > {
EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
return i < value;
}
};
#else
template <typename Index, std::size_t Rank>
struct index_known_statically_impl<DimensionList<Index, Rank> > {
EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run(const DenseIndex) {
return true;
}
};
template <typename Index, std::size_t Rank>
struct index_known_statically_impl<const DimensionList<Index, Rank> > {
EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run(const DenseIndex) {
return true;
}
};
template <typename Index, std::size_t Rank>
struct all_indices_known_statically_impl<DimensionList<Index, Rank> > {
EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run() {
return true;
}
};
template <typename Index, std::size_t Rank>
struct all_indices_known_statically_impl<const DimensionList<Index, Rank> > {
EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE bool run() {
return true;
}
};
template <typename Index, std::size_t Rank>
struct indices_statically_known_to_increase_impl<DimensionList<Index, Rank> > {
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() {
return true;
}
};
template <typename Index, std::size_t Rank>
struct indices_statically_known_to_increase_impl<const DimensionList<Index, Rank> > {
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() {
return true;
}
};
template <typename Index, std::size_t Rank>
struct index_statically_eq_impl<DimensionList<Index, Rank> > {
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) {
return false;
}
};
template <typename Index, std::size_t Rank>
struct index_statically_eq_impl<const DimensionList<Index, Rank> > {
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) {
return false;
}
};
template <typename Index, std::size_t Rank>
struct index_statically_ne_impl<DimensionList<Index, Rank> > {
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex){
return false;
}
};
template <typename Index, std::size_t Rank>
struct index_statically_ne_impl<const DimensionList<Index, Rank> > {
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) {
return false;
}
};
template <typename Index, std::size_t Rank>
struct index_statically_gt_impl<DimensionList<Index, Rank> > {
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) {
return false;
}
};
template <typename Index, std::size_t Rank>
struct index_statically_gt_impl<const DimensionList<Index, Rank> > {
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) {
return false;
}
};
template <typename Index, std::size_t Rank>
struct index_statically_lt_impl<DimensionList<Index, Rank> > {
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) {
return false;
}
};
template <typename Index, std::size_t Rank>
struct index_statically_lt_impl<const DimensionList<Index, Rank> > {
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex, const DenseIndex) {
return false;
}
};
#endif
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H
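A small sketch of what DimensionList behaves like at run time: per the operator[] above it is simply the identity mapping, i.e. the list {0, 1, ..., Rank-1} of all dimensions of a rank-n tensor (the rank 4 below is arbitrary):

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  Eigen::DimensionList<Eigen::DenseIndex, 4> all_dims;
  for (Eigen::DenseIndex i = 0; i < 4; ++i)
    std::cout << all_dims[i] << ' ';   // prints: 0 1 2 3
  std::cout << '\n';
  return 0;
}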

View File

@@ -0,0 +1,490 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H
#define EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H
namespace Eigen {
/** \internal
*
* \class TensorDimensions
* \ingroup CXX11_Tensor_Module
*
* \brief Set of classes used to encode and store the dimensions of a Tensor.
*
* The Sizes class encodes as part of the type the number of dimensions and the
* sizes corresponding to each dimension. It uses no storage space since it is
* entirely known at compile time.
* The DSizes class is its dynamic sibling: the number of dimensions is known
* at compile time but the sizes are set during execution.
*
* \sa Tensor
*/
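A brief illustration of the two flavours described above, assuming a C++11 compiler so the variadic Sizes<> form is used; the shapes are arbitrary, and DSizes is the dynamic sibling defined later in this same header:

#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // Fully static: rank and extents live in the type, no runtime storage needed.
  Eigen::Sizes<3, 4> fixed;
  static_assert(Eigen::Sizes<3, 4>::total_size == 12, "known at compile time");

  // Dynamic sibling: rank fixed at compile time, extents chosen at run time.
  Eigen::DSizes<Eigen::DenseIndex, 2> dyn(3, 4);
  return (dyn.TotalSize() == fixed.TotalSize()) ? 0 : 1;
}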
// Boilerplate code
namespace internal {
template<std::ptrdiff_t n, typename Dimension> struct dget {
static const std::ptrdiff_t value = get<n, Dimension>::value;
};
template<typename Index, std::ptrdiff_t NumIndices, std::ptrdiff_t n, bool RowMajor>
struct fixed_size_tensor_index_linearization_helper
{
template <typename Dimensions> EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE Index run(array<Index, NumIndices> const& indices,
const Dimensions& dimensions)
{
return array_get<RowMajor ? n - 1 : (NumIndices - n)>(indices) +
dget<RowMajor ? n - 1 : (NumIndices - n), Dimensions>::value *
fixed_size_tensor_index_linearization_helper<Index, NumIndices, n - 1, RowMajor>::run(indices, dimensions);
}
};
template<typename Index, std::ptrdiff_t NumIndices, bool RowMajor>
struct fixed_size_tensor_index_linearization_helper<Index, NumIndices, 0, RowMajor>
{
template <typename Dimensions> EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE Index run(array<Index, NumIndices> const&, const Dimensions&)
{
return 0;
}
};
template<typename Index, std::ptrdiff_t n>
struct fixed_size_tensor_index_extraction_helper
{
template <typename Dimensions> EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE Index run(const Index index,
const Dimensions& dimensions)
{
const Index mult = (index == n-1) ? 1 : 0;
return array_get<n-1>(dimensions) * mult +
fixed_size_tensor_index_extraction_helper<Index, n - 1>::run(index, dimensions);
}
};
template<typename Index>
struct fixed_size_tensor_index_extraction_helper<Index, 0>
{
template <typename Dimensions> EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE Index run(const Index,
const Dimensions&)
{
return 0;
}
};
} // end namespace internal
// Fixed size
#ifndef EIGEN_EMULATE_CXX11_META_H
template <typename std::ptrdiff_t... Indices>
struct Sizes {
typedef internal::numeric_list<std::ptrdiff_t, Indices...> Base;
const Base t = Base();
static const std::ptrdiff_t total_size = internal::arg_prod(Indices...);
static const ptrdiff_t count = Base::count;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t rank() const {
return Base::count;
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t TotalSize() {
return internal::arg_prod(Indices...);
}
EIGEN_DEVICE_FUNC Sizes() { }
template <typename DenseIndex>
explicit EIGEN_DEVICE_FUNC Sizes(const array<DenseIndex, Base::count>& /*indices*/) {
// todo: add assertion
}
#if EIGEN_HAS_VARIADIC_TEMPLATES
template <typename... DenseIndex> EIGEN_DEVICE_FUNC Sizes(DenseIndex...) { }
explicit EIGEN_DEVICE_FUNC Sizes(std::initializer_list<std::ptrdiff_t> /*l*/) {
// todo: add assertion
}
#endif
template <typename T> Sizes& operator = (const T& /*other*/) {
// add assertion failure if the size of other is different
return *this;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t operator[] (const std::ptrdiff_t index) const {
return internal::fixed_size_tensor_index_extraction_helper<std::ptrdiff_t, Base::count>::run(index, t);
}
template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
ptrdiff_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const {
return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, false>::run(indices, t);
}
template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
ptrdiff_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const {
return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, true>::run(indices, t);
}
};
namespace internal {
template <typename std::ptrdiff_t... Indices>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes<Indices...>&) {
return Sizes<Indices...>::total_size;
}
}
#else
template <std::ptrdiff_t n>
struct non_zero_size {
typedef internal::type2val<std::ptrdiff_t, n> type;
};
template <>
struct non_zero_size<0> {
typedef internal::null_type type;
};
template <std::ptrdiff_t V1=0, std::ptrdiff_t V2=0, std::ptrdiff_t V3=0, std::ptrdiff_t V4=0, std::ptrdiff_t V5=0> struct Sizes {
typedef typename internal::make_type_list<typename non_zero_size<V1>::type, typename non_zero_size<V2>::type, typename non_zero_size<V3>::type, typename non_zero_size<V4>::type, typename non_zero_size<V5>::type >::type Base;
static const std::ptrdiff_t count = Base::count;
static const std::ptrdiff_t total_size = internal::arg_prod<Base>::value;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptrdiff_t rank() const {
return count;
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptrdiff_t TotalSize() {
return internal::arg_prod<Base>::value;
}
Sizes() { }
template <typename DenseIndex>
explicit Sizes(const array<DenseIndex, Base::count>& /*indices*/) {
// todo: add assertion
}
template <typename T> Sizes& operator = (const T& /*other*/) {
// add assertion failure if the size of other is different
return *this;
}
#if EIGEN_HAS_VARIADIC_TEMPLATES
template <typename... DenseIndex> Sizes(DenseIndex... /*indices*/) { }
explicit Sizes(std::initializer_list<std::ptrdiff_t>) {
// todo: add assertion
}
#else
EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex) {
}
EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex) {
}
EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex) {
}
EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) {
}
EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) {
}
#endif
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index operator[] (const Index index) const {
switch (index) {
case 0:
return internal::get<0, Base>::value;
case 1:
return internal::get<1, Base>::value;
case 2:
return internal::get<2, Base>::value;
case 3:
return internal::get<3, Base>::value;
case 4:
return internal::get<4, Base>::value;
default:
eigen_assert(false && "index overflow");
return static_cast<Index>(-1);
}
}
template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
ptrdiff_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const {
return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, false>::run(indices, *reinterpret_cast<const Base*>(this));
}
template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
ptrdiff_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const {
return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, true>::run(indices, *reinterpret_cast<const Base*>(this));
}
};
namespace internal {
template <std::ptrdiff_t V1, std::ptrdiff_t V2, std::ptrdiff_t V3, std::ptrdiff_t V4, std::ptrdiff_t V5>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes<V1, V2, V3, V4, V5>&) {
return Sizes<V1, V2, V3, V4, V5>::total_size;
}
}
#endif
// Boilerplate
namespace internal {
template<typename Index, std::ptrdiff_t NumIndices, std::ptrdiff_t n, bool RowMajor>
struct tensor_index_linearization_helper
{
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Index run(array<Index, NumIndices> const& indices, array<Index, NumIndices> const& dimensions)
{
return array_get<RowMajor ? n : (NumIndices - n - 1)>(indices) +
array_get<RowMajor ? n : (NumIndices - n - 1)>(dimensions) *
tensor_index_linearization_helper<Index, NumIndices, n - 1, RowMajor>::run(indices, dimensions);
}
};
template<typename Index, std::ptrdiff_t NumIndices, bool RowMajor>
struct tensor_index_linearization_helper<Index, NumIndices, 0, RowMajor>
{
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Index run(array<Index, NumIndices> const& indices, array<Index, NumIndices> const&)
{
return array_get<RowMajor ? 0 : NumIndices - 1>(indices);
}
};
} // end namespace internal
// Dynamic size
template <typename DenseIndex, int NumDims>
struct DSizes : array<DenseIndex, NumDims> {
typedef array<DenseIndex, NumDims> Base;
static const int count = NumDims;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const {
return NumDims;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex TotalSize() const {
return (NumDims == 0) ? 1 : internal::array_prod(*static_cast<const Base*>(this));
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DSizes() {
for (int i = 0 ; i < NumDims; ++i) {
(*this)[i] = 0;
}
}
EIGEN_DEVICE_FUNC explicit DSizes(const array<DenseIndex, NumDims>& a) : Base(a) { }
EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0) {
eigen_assert(NumDims == 1);
(*this)[0] = i0;
}
EIGEN_DEVICE_FUNC DSizes(const DimensionList<DenseIndex, NumDims>& a) {
for (int i = 0 ; i < NumDims; ++i) {
(*this)[i] = a[i];
}
}
// Enable DSizes index type promotion only if we are promoting to the
// larger type, e.g. allowing dimensions of type int to be promoted to long.
template<typename OtherIndex>
EIGEN_DEVICE_FUNC
explicit DSizes(const array<OtherIndex, NumDims>& other,
// Default template parameters require c++11.
typename internal::enable_if<
internal::is_same<
DenseIndex,
typename internal::promote_index_type<
DenseIndex,
OtherIndex
>::type
>::value, void*>::type = 0) {
for (int i = 0; i < NumDims; ++i) {
(*this)[i] = static_cast<DenseIndex>(other[i]);
}
}
#ifdef EIGEN_HAS_INDEX_LIST
template <typename FirstType, typename... OtherTypes>
EIGEN_DEVICE_FUNC
explicit DSizes(const Eigen::IndexList<FirstType, OtherTypes...>& dimensions) {
for (int i = 0; i < dimensions.count; ++i) {
(*this)[i] = dimensions[i];
}
}
#endif
#ifndef EIGEN_EMULATE_CXX11_META_H
template <typename std::ptrdiff_t... Indices>
EIGEN_DEVICE_FUNC DSizes(const Sizes<Indices...>& a) {
for (int i = 0 ; i < NumDims; ++i) {
(*this)[i] = a[i];
}
}
#else
template <std::ptrdiff_t V1, std::ptrdiff_t V2, std::ptrdiff_t V3, std::ptrdiff_t V4, std::ptrdiff_t V5>
EIGEN_DEVICE_FUNC DSizes(const Sizes<V1, V2, V3, V4, V5>& a) {
for (int i = 0 ; i < NumDims; ++i) {
(*this)[i] = a[i];
}
}
#endif
#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, DenseIndex secondDimension, IndexTypes... otherDimensions) : Base({{firstDimension, secondDimension, otherDimensions...}}) {
EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 2 == NumDims, YOU_MADE_A_PROGRAMMING_MISTAKE)
}
#else
EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1) {
eigen_assert(NumDims == 2);
(*this)[0] = i0;
(*this)[1] = i1;
}
EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) {
eigen_assert(NumDims == 3);
(*this)[0] = i0;
(*this)[1] = i1;
(*this)[2] = i2;
}
EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) {
eigen_assert(NumDims == 4);
(*this)[0] = i0;
(*this)[1] = i1;
(*this)[2] = i2;
(*this)[3] = i3;
}
EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) {
eigen_assert(NumDims == 5);
(*this)[0] = i0;
(*this)[1] = i1;
(*this)[2] = i2;
(*this)[3] = i3;
(*this)[4] = i4;
}
#endif
EIGEN_DEVICE_FUNC DSizes& operator = (const array<DenseIndex, NumDims>& other) {
*static_cast<Base*>(this) = other;
return *this;
}
// A constexpr would be so much better here
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex IndexOfColMajor(const array<DenseIndex, NumDims>& indices) const {
return internal::tensor_index_linearization_helper<DenseIndex, NumDims, NumDims - 1, false>::run(indices, *static_cast<const Base*>(this));
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex IndexOfRowMajor(const array<DenseIndex, NumDims>& indices) const {
return internal::tensor_index_linearization_helper<DenseIndex, NumDims, NumDims - 1, true>::run(indices, *static_cast<const Base*>(this));
}
};
template <typename IndexType, int NumDims>
std::ostream& operator<<(std::ostream& os,
const DSizes<IndexType, NumDims>& dims) {
os << "[";
for (int i = 0; i < NumDims; ++i) {
if (i > 0) os << ", ";
os << dims[i];
}
os << "]";
return os;
}
// Boilerplate
namespace internal {
template<typename Index, std::ptrdiff_t NumIndices, std::ptrdiff_t n, bool RowMajor>
struct tensor_vsize_index_linearization_helper
{
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Index run(array<Index, NumIndices> const& indices, std::vector<DenseIndex> const& dimensions)
{
return array_get<RowMajor ? n : (NumIndices - n - 1)>(indices) +
array_get<RowMajor ? n : (NumIndices - n - 1)>(dimensions) *
tensor_vsize_index_linearization_helper<Index, NumIndices, n - 1, RowMajor>::run(indices, dimensions);
}
};
template<typename Index, std::ptrdiff_t NumIndices, bool RowMajor>
struct tensor_vsize_index_linearization_helper<Index, NumIndices, 0, RowMajor>
{
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Index run(array<Index, NumIndices> const& indices, std::vector<DenseIndex> const&)
{
return array_get<RowMajor ? 0 : NumIndices - 1>(indices);
}
};
} // end namespace internal
namespace internal {
template <typename DenseIndex, int NumDims> struct array_size<const DSizes<DenseIndex, NumDims> > {
static const ptrdiff_t value = NumDims;
};
template <typename DenseIndex, int NumDims> struct array_size<DSizes<DenseIndex, NumDims> > {
static const ptrdiff_t value = NumDims;
};
#ifndef EIGEN_EMULATE_CXX11_META_H
template <typename std::ptrdiff_t... Indices> struct array_size<const Sizes<Indices...> > {
static const std::ptrdiff_t value = Sizes<Indices...>::count;
};
template <typename std::ptrdiff_t... Indices> struct array_size<Sizes<Indices...> > {
static const std::ptrdiff_t value = Sizes<Indices...>::count;
};
template <std::ptrdiff_t n, typename std::ptrdiff_t... Indices> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<Indices...>&) {
return get<n, internal::numeric_list<std::ptrdiff_t, Indices...> >::value;
}
template <std::ptrdiff_t n> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<>&) {
eigen_assert(false && "should never be called");
return -1;
}
#else
template <std::ptrdiff_t V1, std::ptrdiff_t V2, std::ptrdiff_t V3, std::ptrdiff_t V4, std::ptrdiff_t V5> struct array_size<const Sizes<V1,V2,V3,V4,V5> > {
static const ptrdiff_t value = Sizes<V1,V2,V3,V4,V5>::count;
};
template <std::ptrdiff_t V1, std::ptrdiff_t V2, std::ptrdiff_t V3, std::ptrdiff_t V4, std::ptrdiff_t V5> struct array_size<Sizes<V1,V2,V3,V4,V5> > {
static const ptrdiff_t value = Sizes<V1,V2,V3,V4,V5>::count;
};
template <std::ptrdiff_t n, std::ptrdiff_t V1, std::ptrdiff_t V2, std::ptrdiff_t V3, std::ptrdiff_t V4, std::ptrdiff_t V5> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<V1,V2,V3,V4,V5>&) {
return get<n, typename Sizes<V1,V2,V3,V4,V5>::Base>::value;
}
#endif
template <typename Dims1, typename Dims2, ptrdiff_t n, ptrdiff_t m>
struct sizes_match_below_dim {
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Dims1&, Dims2&) {
return false;
}
};
template <typename Dims1, typename Dims2, ptrdiff_t n>
struct sizes_match_below_dim<Dims1, Dims2, n, n> {
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Dims1& dims1, Dims2& dims2) {
return (array_get<n-1>(dims1) == array_get<n-1>(dims2)) &&
sizes_match_below_dim<Dims1, Dims2, n-1, n-1>::run(dims1, dims2);
}
};
template <typename Dims1, typename Dims2>
struct sizes_match_below_dim<Dims1, Dims2, 0, 0> {
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Dims1&, Dims2&) {
return true;
}
};
} // end namespace internal
template <typename Dims1, typename Dims2>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool dimensions_match(Dims1 dims1, Dims2 dims2) {
return internal::sizes_match_below_dim<Dims1, Dims2, internal::array_size<Dims1>::value, internal::array_size<Dims2>::value>::run(dims1, dims2);
}
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H

View File

@@ -0,0 +1,236 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H
#define EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H
namespace Eigen {
/** \class TensorEvalToOp
 * \ingroup CXX11_Tensor_Module
 *
 * \brief Tensor expression that evaluates another expression directly into a
 * user-provided buffer.
 *
 */
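// A minimal sketch of how this node is typically wired up (illustrative only;
// `buffer`, `a` and `b` are hypothetical, and in practice the node is created
// internally, e.g. by forced evaluation, rather than by user code):
//
//   float buffer[12];
//   Eigen::TensorFixedSize<float, Eigen::Sizes<3, 4>> a, b;
//   auto sum = a + b;
//   // Evaluate `a + b` directly into `buffer` instead of into a temporary:
//   Eigen::TensorEvalToOp<const decltype(sum)> eval_to(buffer, sum);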
namespace internal {
template<typename XprType, template <class> class MakePointer_>
struct traits<TensorEvalToOp<XprType, MakePointer_> >
{
// Type promotion to handle the case where the types of the lhs and the rhs are different.
typedef typename XprType::Scalar Scalar;
typedef traits<XprType> XprTraits;
typedef typename XprTraits::StorageKind StorageKind;
typedef typename XprTraits::Index Index;
typedef typename XprType::Nested Nested;
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int Layout = XprTraits::Layout;
typedef typename MakePointer_<Scalar>::Type PointerType;
enum {
Flags = 0
};
template <class T>
struct MakePointer {
// Intermediate typedef to workaround MSVC issue.
typedef MakePointer_<T> MakePointerT;
typedef typename MakePointerT::Type Type;
};
};
template<typename XprType, template <class> class MakePointer_>
struct eval<TensorEvalToOp<XprType, MakePointer_>, Eigen::Dense>
{
typedef const TensorEvalToOp<XprType, MakePointer_>& type;
};
template<typename XprType, template <class> class MakePointer_>
struct nested<TensorEvalToOp<XprType, MakePointer_>, 1, typename eval<TensorEvalToOp<XprType, MakePointer_> >::type>
{
typedef TensorEvalToOp<XprType, MakePointer_> type;
};
} // end namespace internal
template<typename XprType, template <class> class MakePointer_>
class TensorEvalToOp : public TensorBase<TensorEvalToOp<XprType, MakePointer_>, ReadOnlyAccessors>
{
public:
typedef typename Eigen::internal::traits<TensorEvalToOp>::Scalar Scalar;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
typedef typename MakePointer_<CoeffReturnType>::Type PointerType;
typedef typename Eigen::internal::nested<TensorEvalToOp>::type Nested;
typedef typename Eigen::internal::traits<TensorEvalToOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorEvalToOp>::Index Index;
static const int NumDims = Eigen::internal::traits<TensorEvalToOp>::NumDimensions;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvalToOp(PointerType buffer, const XprType& expr)
: m_xpr(expr), m_buffer(buffer) {}
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename XprType::Nested>::type&
expression() const { return m_xpr; }
EIGEN_DEVICE_FUNC PointerType buffer() const { return m_buffer; }
protected:
typename XprType::Nested m_xpr;
PointerType m_buffer;
};
template<typename ArgType, typename Device, template <class> class MakePointer_>
struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
{
typedef TensorEvalToOp<ArgType, MakePointer_> XprType;
typedef typename ArgType::Scalar Scalar;
typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
typedef typename XprType::Index Index;
typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
typedef StorageMemory<CoeffReturnType, Device> Storage;
typedef typename Storage::Type EvaluatorPointerType;
enum {
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = true,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = true
};
static const int NumDims = internal::traits<ArgType>::NumDimensions;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
ArgTensorBlock;
typedef internal::TensorBlockAssignment<
CoeffReturnType, NumDims, typename ArgTensorBlock::XprType, Index>
TensorBlockAssignment;
//===--------------------------------------------------------------------===//
EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_buffer(device.get(op.buffer())), m_expression(op.expression()){}
EIGEN_STRONG_INLINE ~TensorEvaluator() {
}
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType scalar) {
EIGEN_UNUSED_VARIABLE(scalar);
eigen_assert(scalar == NULL);
return m_impl.evalSubExprsIfNeeded(m_buffer);
}
#ifdef EIGEN_USE_THREADS
template <typename EvalSubExprsCallback>
EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
EvaluatorPointerType scalar, EvalSubExprsCallback done) {
EIGEN_UNUSED_VARIABLE(scalar);
eigen_assert(scalar == NULL);
m_impl.evalSubExprsIfNeededAsync(m_buffer, std::move(done));
}
#endif
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) {
m_buffer[i] = m_impl.coeff(i);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) {
internal::pstoret<CoeffReturnType, PacketReturnType, Aligned>(m_buffer + i, m_impl.template packet<TensorEvaluator<ArgType, Device>::IsAligned ? Aligned : Unaligned>(i));
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
internal::TensorBlockResourceRequirements getResourceRequirements() const {
return m_impl.getResourceRequirements();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(
TensorBlockDesc& desc, TensorBlockScratch& scratch) {
// Add `m_buffer` as destination buffer to the block descriptor.
desc.template AddDestinationBuffer<Layout>(
/*dst_base=*/m_buffer + desc.offset(),
/*dst_strides=*/internal::strides<Layout>(m_impl.dimensions()));
ArgTensorBlock block =
m_impl.block(desc, scratch, /*root_of_expr_ast=*/true);
// If block was evaluated into a destination buffer, there is no need to do
// an assignment.
if (block.kind() != internal::TensorBlockKind::kMaterializedInOutput) {
TensorBlockAssignment::Run(
TensorBlockAssignment::target(
desc.dimensions(), internal::strides<Layout>(m_impl.dimensions()),
m_buffer, desc.offset()),
block.expr());
}
block.cleanup();
}
EIGEN_STRONG_INLINE void cleanup() {
m_impl.cleanup();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
{
return m_buffer[index];
}
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
// We assume that evalPacket or evalScalar is called to perform the
// assignment and account for the cost of the write here.
return m_impl.costPerCoeff(vectorized) +
TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize);
}
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_buffer; }
ArgType expression() const { return m_expression; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
m_buffer.bind(cgh);
}
#endif
private:
TensorEvaluator<ArgType, Device> m_impl;
EvaluatorPointerType m_buffer;
const ArgType m_expression;
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H

View File

@@ -0,0 +1,983 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H
#define EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H
namespace Eigen {
/** \class TensorEvaluator
* \ingroup CXX11_Tensor_Module
*
* \brief The tensor evaluator classes.
*
* These classes are responsible for the evaluation of the tensor expression.
*
* TODO: add support for more types of expressions, in particular expressions
* leading to lvalues (slicing, reshaping, etc...)
*/
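// A minimal sketch of the evaluator protocol (illustrative only; `t` is a
// hypothetical tensor and DefaultDevice is assumed):
//
//   Eigen::Tensor<float, 2> t(2, 3);
//   t.setRandom();
//   Eigen::DefaultDevice device;
//   Eigen::TensorEvaluator<const Eigen::Tensor<float, 2>, Eigen::DefaultDevice> eval(t, device);
//   eval.evalSubExprsIfNeeded(NULL);  // no-op here: a plain tensor already owns its data
//   float first = eval.coeff(0);      // coefficient access by linear index
//   eval.cleanup();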
// Generic evaluator
template<typename Derived, typename Device>
struct TensorEvaluator
{
typedef typename Derived::Index Index;
typedef typename Derived::Scalar Scalar;
typedef typename Derived::Scalar CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
typedef typename Derived::Dimensions Dimensions;
typedef Derived XprType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
typedef typename internal::traits<Derived>::template MakePointer<Scalar>::Type TensorPointerType;
typedef StorageMemory<Scalar, Device> Storage;
typedef typename Storage::Type EvaluatorPointerType;
// NumDimensions is -1 for variable dim tensors
static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ?
internal::traits<Derived>::NumDimensions : 0;
enum {
IsAligned = Derived::IsAligned,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value,
PreferBlockAccess = false,
Layout = Derived::Layout,
CoordAccess = NumCoords > 0,
RawAccess = true
};
typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockDescriptor<NumCoords, Index> TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumCoords,
Layout, Index>
TensorBlock;
//===--------------------------------------------------------------------===//
EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
: m_data(device.get((const_cast<TensorPointerType>(m.data())))),
m_dims(m.dimensions()),
m_device(device)
{ }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; }
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType dest) {
if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization && dest) {
m_device.memcpy((void*)(m_device.get(dest)), m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar));
return false;
}
return true;
}
#ifdef EIGEN_USE_THREADS
template <typename EvalSubExprsCallback>
EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
EvaluatorPointerType dest, EvalSubExprsCallback done) {
// TODO(ezhulenev): ThreadPoolDevice memcpy is a blocking operation.
done(evalSubExprsIfNeeded(dest));
}
#endif // EIGEN_USE_THREADS
EIGEN_STRONG_INLINE void cleanup() {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
eigen_assert(m_data != NULL);
return m_data[index];
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) {
eigen_assert(m_data != NULL);
return m_data[index];
}
template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
PacketReturnType packet(Index index) const
{
return internal::ploadt<PacketReturnType, LoadMode>(m_data + index);
}
// Return a packet starting at `index` where `umask` specifies which elements
// have to be loaded. The type/size of the mask depends on PacketReturnType, e.g.
// for Packet16f `umask` is of type uint16_t: if a bit is 1, the corresponding
// float element is loaded, otherwise 0 is loaded.
// The function is templatized to enable SFINAE.
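// For example (illustrative): with Packet16f, `umask == 0x000F` loads only the
// first four floats starting at `index` and fills the remaining lanes with 0.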
template <typename PacketReturnTypeT> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
typename internal::enable_if<internal::unpacket_traits<PacketReturnTypeT>::masked_load_available, PacketReturnTypeT>::type
partialPacket(Index index, typename internal::unpacket_traits<PacketReturnTypeT>::mask_t umask) const
{
return internal::ploadu<PacketReturnTypeT>(m_data + index, umask);
}
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
{
return internal::pstoret<Scalar, PacketReturnType, StoreMode>(m_data + index, x);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<DenseIndex, NumCoords>& coords) const {
eigen_assert(m_data != NULL);
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
return m_data[m_dims.IndexOfColMajor(coords)];
} else {
return m_data[m_dims.IndexOfRowMajor(coords)];
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType&
coeffRef(const array<DenseIndex, NumCoords>& coords) {
eigen_assert(m_data != NULL);
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
return m_data[m_dims.IndexOfColMajor(coords)];
} else {
return m_data[m_dims.IndexOfRowMajor(coords)];
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized,
PacketType<CoeffReturnType, Device>::size);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
internal::TensorBlockResourceRequirements getResourceRequirements() const {
return internal::TensorBlockResourceRequirements::any();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
bool /*root_of_expr_ast*/ = false) const {
assert(m_data != NULL);
return TensorBlock::materialize(m_data, m_dims, desc, scratch);
}
template<typename TensorBlock>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
const TensorBlockDesc& desc, const TensorBlock& block) {
assert(m_data != NULL);
typedef typename TensorBlock::XprType TensorBlockExpr;
typedef internal::TensorBlockAssignment<Scalar, NumCoords, TensorBlockExpr,
Index>
TensorBlockAssign;
TensorBlockAssign::Run(
TensorBlockAssign::target(desc.dimensions(),
internal::strides<Layout>(m_dims), m_data,
desc.offset()),
block.expr());
}
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_data.bind(cgh);
}
#endif
protected:
EvaluatorPointerType m_data;
Dimensions m_dims;
const Device EIGEN_DEVICE_REF m_device;
};
namespace {
template <typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
T loadConstant(const T* address) {
return *address;
}
// Use the texture cache on CUDA devices whenever possible
#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
float loadConstant(const float* address) {
return __ldg(address);
}
template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
double loadConstant(const double* address) {
return __ldg(address);
}
template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
Eigen::half loadConstant(const Eigen::half* address) {
return Eigen::half(half_impl::raw_uint16_to_half(__ldg(&address->x)));
}
#endif
#ifdef EIGEN_USE_SYCL
// Overload of loadConstant for SYCL, based on range accessors.
template <cl::sycl::access::mode AcMd, typename T>
T &loadConstant(const Eigen::TensorSycl::internal::RangeAccess<AcMd, T> &address) {
return *address;
}
#endif
}
// Default evaluator for rvalues
template<typename Derived, typename Device>
struct TensorEvaluator<const Derived, Device>
{
typedef typename Derived::Index Index;
typedef typename Derived::Scalar Scalar;
typedef typename Derived::Scalar CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
typedef typename Derived::Dimensions Dimensions;
typedef const Derived XprType;
typedef typename internal::traits<Derived>::template MakePointer<const Scalar>::Type TensorPointerType;
typedef StorageMemory<const Scalar, Device> Storage;
typedef typename Storage::Type EvaluatorPointerType;
typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
// NumDimensions is -1 for variable dim tensors
static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ?
internal::traits<Derived>::NumDimensions : 0;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
enum {
IsAligned = Derived::IsAligned,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = internal::is_arithmetic<ScalarNoConst>::value,
PreferBlockAccess = false,
Layout = Derived::Layout,
CoordAccess = NumCoords > 0,
RawAccess = true
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockDescriptor<NumCoords, Index> TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumCoords,
Layout, Index>
TensorBlock;
//===--------------------------------------------------------------------===//
EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
: m_data(device.get(m.data())), m_dims(m.dimensions()), m_device(device)
{ }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; }
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization && data) {
m_device.memcpy((void*)(m_device.get(data)),m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar));
return false;
}
return true;
}
#ifdef EIGEN_USE_THREADS
template <typename EvalSubExprsCallback>
EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
EvaluatorPointerType dest, EvalSubExprsCallback done) {
// TODO(ezhulenev): ThreadPoolDevice memcpy is a blocking operation.
done(evalSubExprsIfNeeded(dest));
}
#endif // EIGEN_USE_THREADS
EIGEN_STRONG_INLINE void cleanup() { }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
eigen_assert(m_data != NULL);
return loadConstant(m_data+index);
}
template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
PacketReturnType packet(Index index) const
{
return internal::ploadt_ro<PacketReturnType, LoadMode>(m_data + index);
}
// Return a packet starting at `index` where `umask` specifies which elements
// have to be loaded. The type/size of the mask depends on PacketReturnType, e.g.
// for Packet16f `umask` is of type uint16_t: if a bit is 1, the corresponding
// float element is loaded, otherwise 0 is loaded.
// The function is templatized to enable SFINAE.
template <typename PacketReturnTypeT> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
typename internal::enable_if<internal::unpacket_traits<PacketReturnTypeT>::masked_load_available, PacketReturnTypeT>::type
partialPacket(Index index, typename internal::unpacket_traits<PacketReturnTypeT>::mask_t umask) const
{
return internal::ploadu<PacketReturnTypeT>(m_data + index, umask);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<DenseIndex, NumCoords>& coords) const {
eigen_assert(m_data != NULL);
const Index index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_dims.IndexOfColMajor(coords)
: m_dims.IndexOfRowMajor(coords);
return loadConstant(m_data+index);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized,
PacketType<CoeffReturnType, Device>::size);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
internal::TensorBlockResourceRequirements getResourceRequirements() const {
return internal::TensorBlockResourceRequirements::any();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
bool /*root_of_expr_ast*/ = false) const {
assert(m_data != NULL);
return TensorBlock::materialize(m_data, m_dims, desc, scratch);
}
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_data.bind(cgh);
}
#endif
protected:
EvaluatorPointerType m_data;
Dimensions m_dims;
const Device EIGEN_DEVICE_REF m_device;
};
// -------------------- CwiseNullaryOp --------------------
template<typename NullaryOp, typename ArgType, typename Device>
struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
{
typedef TensorCwiseNullaryOp<NullaryOp, ArgType> XprType;
TensorEvaluator(const XprType& op, const Device& device)
: m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper()
{ }
typedef typename XprType::Index Index;
typedef typename XprType::Scalar Scalar;
typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
typedef StorageMemory<CoeffReturnType, Device> Storage;
typedef typename Storage::Type EvaluatorPointerType;
enum {
IsAligned = true,
PacketAccess = internal::functor_traits<NullaryOp>::PacketAccess
#ifdef EIGEN_USE_SYCL
&& (PacketType<CoeffReturnType, Device>::size >1)
#endif
,
BlockAccess = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlock;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { return true; }
#ifdef EIGEN_USE_THREADS
template <typename EvalSubExprsCallback>
EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
EvaluatorPointerType, EvalSubExprsCallback done) {
done(true);
}
#endif // EIGEN_USE_THREADS
EIGEN_STRONG_INLINE void cleanup() { }
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
{
return m_wrapper(m_functor, index);
}
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
return m_wrapper.template packetOp<PacketReturnType, Index>(m_functor, index);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
costPerCoeff(bool vectorized) const {
return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized,
PacketType<CoeffReturnType, Device>::size);
}
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_argImpl.bind(cgh);
}
#endif
private:
const NullaryOp m_functor;
TensorEvaluator<ArgType, Device> m_argImpl;
const internal::nullary_wrapper<CoeffReturnType,NullaryOp> m_wrapper;
};
// -------------------- CwiseUnaryOp --------------------
template<typename UnaryOp, typename ArgType, typename Device>
struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
{
typedef TensorCwiseUnaryOp<UnaryOp, ArgType> XprType;
enum {
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = int(TensorEvaluator<ArgType, Device>::PacketAccess) &
int(internal::functor_traits<UnaryOp>::PacketAccess),
BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
TensorEvaluator(const XprType& op, const Device& device)
: m_device(device),
m_functor(op.functor()),
m_argImpl(op.nestedExpression(), device)
{ }
typedef typename XprType::Index Index;
typedef typename XprType::Scalar Scalar;
typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
typedef StorageMemory<CoeffReturnType, Device> Storage;
typedef typename Storage::Type EvaluatorPointerType;
static const int NumDims = internal::array_size<Dimensions>::value;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
ArgTensorBlock;
typedef internal::TensorCwiseUnaryBlock<UnaryOp, ArgTensorBlock>
TensorBlock;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
m_argImpl.evalSubExprsIfNeeded(NULL);
return true;
}
#ifdef EIGEN_USE_THREADS
template <typename EvalSubExprsCallback>
EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
EvaluatorPointerType, EvalSubExprsCallback done) {
m_argImpl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
}
#endif // EIGEN_USE_THREADS
EIGEN_STRONG_INLINE void cleanup() {
m_argImpl.cleanup();
}
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
{
return m_functor(m_argImpl.coeff(index));
}
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
return m_functor.packetOp(m_argImpl.template packet<LoadMode>(index));
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
const double functor_cost = internal::functor_traits<UnaryOp>::Cost;
return m_argImpl.costPerCoeff(vectorized) +
TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
internal::TensorBlockResourceRequirements getResourceRequirements() const {
static const double functor_cost = internal::functor_traits<UnaryOp>::Cost;
return m_argImpl.getResourceRequirements().addCostPerCoeff(
{0, 0, functor_cost / PacketSize});
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
bool /*root_of_expr_ast*/ = false) const {
return TensorBlock(m_argImpl.block(desc, scratch), m_functor);
}
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const{
m_argImpl.bind(cgh);
}
#endif
private:
const Device EIGEN_DEVICE_REF m_device;
const UnaryOp m_functor;
TensorEvaluator<ArgType, Device> m_argImpl;
};
// -------------------- CwiseBinaryOp --------------------
template<typename BinaryOp, typename LeftArgType, typename RightArgType, typename Device>
struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType>, Device>
{
typedef TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType> XprType;
enum {
IsAligned = int(TensorEvaluator<LeftArgType, Device>::IsAligned) &
int(TensorEvaluator<RightArgType, Device>::IsAligned),
PacketAccess = int(TensorEvaluator<LeftArgType, Device>::PacketAccess) &
int(TensorEvaluator<RightArgType, Device>::PacketAccess) &
int(internal::functor_traits<BinaryOp>::PacketAccess),
BlockAccess = int(TensorEvaluator<LeftArgType, Device>::BlockAccess) &
int(TensorEvaluator<RightArgType, Device>::BlockAccess),
PreferBlockAccess = int(TensorEvaluator<LeftArgType, Device>::PreferBlockAccess) |
int(TensorEvaluator<RightArgType, Device>::PreferBlockAccess),
Layout = TensorEvaluator<LeftArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
TensorEvaluator(const XprType& op, const Device& device)
: m_device(device),
m_functor(op.functor()),
m_leftImpl(op.lhsExpression(), device),
m_rightImpl(op.rhsExpression(), device)
{
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout) || internal::traits<XprType>::NumDimensions <= 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions()));
}
typedef typename XprType::Index Index;
typedef typename XprType::Scalar Scalar;
typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
typedef typename TensorEvaluator<LeftArgType, Device>::Dimensions Dimensions;
typedef StorageMemory<CoeffReturnType, Device> Storage;
typedef typename Storage::Type EvaluatorPointerType;
static const int NumDims = internal::array_size<
typename TensorEvaluator<LeftArgType, Device>::Dimensions>::value;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
typedef typename TensorEvaluator<const LeftArgType, Device>::TensorBlock
LeftTensorBlock;
typedef typename TensorEvaluator<const RightArgType, Device>::TensorBlock
RightTensorBlock;
typedef internal::TensorCwiseBinaryBlock<BinaryOp, LeftTensorBlock,
RightTensorBlock>
TensorBlock;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
{
// TODO: use the right-hand impl instead if its dimensions are known at compile time.
return m_leftImpl.dimensions();
}
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
m_leftImpl.evalSubExprsIfNeeded(NULL);
m_rightImpl.evalSubExprsIfNeeded(NULL);
return true;
}
#ifdef EIGEN_USE_THREADS
template <typename EvalSubExprsCallback>
EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
EvaluatorPointerType, EvalSubExprsCallback done) {
// TODO(ezhulenev): Evaluate the two expressions in parallel?
m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done](bool) {
m_rightImpl.evalSubExprsIfNeededAsync(nullptr,
[done](bool) { done(true); });
});
}
#endif // EIGEN_USE_THREADS
EIGEN_STRONG_INLINE void cleanup() {
m_leftImpl.cleanup();
m_rightImpl.cleanup();
}
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
{
return m_functor(m_leftImpl.coeff(index), m_rightImpl.coeff(index));
}
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
return m_functor.packetOp(m_leftImpl.template packet<LoadMode>(index), m_rightImpl.template packet<LoadMode>(index));
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
costPerCoeff(bool vectorized) const {
const double functor_cost = internal::functor_traits<BinaryOp>::Cost;
return m_leftImpl.costPerCoeff(vectorized) +
m_rightImpl.costPerCoeff(vectorized) +
TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
internal::TensorBlockResourceRequirements getResourceRequirements() const {
static const double functor_cost = internal::functor_traits<BinaryOp>::Cost;
return internal::TensorBlockResourceRequirements::merge(
m_leftImpl.getResourceRequirements(),
m_rightImpl.getResourceRequirements())
.addCostPerCoeff({0, 0, functor_cost / PacketSize});
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
bool /*root_of_expr_ast*/ = false) const {
desc.DropDestinationBuffer();
return TensorBlock(m_leftImpl.block(desc, scratch),
m_rightImpl.block(desc, scratch), m_functor);
}
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_leftImpl.bind(cgh);
m_rightImpl.bind(cgh);
}
#endif
private:
const Device EIGEN_DEVICE_REF m_device;
const BinaryOp m_functor;
TensorEvaluator<LeftArgType, Device> m_leftImpl;
TensorEvaluator<RightArgType, Device> m_rightImpl;
};
// -------------------- CwiseTernaryOp --------------------
template<typename TernaryOp, typename Arg1Type, typename Arg2Type, typename Arg3Type, typename Device>
struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type, Arg3Type>, Device>
{
typedef TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type, Arg3Type> XprType;
enum {
IsAligned = TensorEvaluator<Arg1Type, Device>::IsAligned & TensorEvaluator<Arg2Type, Device>::IsAligned & TensorEvaluator<Arg3Type, Device>::IsAligned,
PacketAccess = TensorEvaluator<Arg1Type, Device>::PacketAccess &&
TensorEvaluator<Arg2Type, Device>::PacketAccess &&
TensorEvaluator<Arg3Type, Device>::PacketAccess &&
internal::functor_traits<TernaryOp>::PacketAccess,
BlockAccess = false,
PreferBlockAccess = TensorEvaluator<Arg1Type, Device>::PreferBlockAccess ||
TensorEvaluator<Arg2Type, Device>::PreferBlockAccess ||
TensorEvaluator<Arg3Type, Device>::PreferBlockAccess,
Layout = TensorEvaluator<Arg1Type, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
TensorEvaluator(const XprType& op, const Device& device)
: m_functor(op.functor()),
m_arg1Impl(op.arg1Expression(), device),
m_arg2Impl(op.arg2Expression(), device),
m_arg3Impl(op.arg3Expression(), device)
{
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<Arg1Type, Device>::Layout) == static_cast<int>(TensorEvaluator<Arg3Type, Device>::Layout) || internal::traits<XprType>::NumDimensions <= 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::StorageKind,
typename internal::traits<Arg2Type>::StorageKind>::value),
STORAGE_KIND_MUST_MATCH)
EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::StorageKind,
typename internal::traits<Arg3Type>::StorageKind>::value),
STORAGE_KIND_MUST_MATCH)
EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::Index,
typename internal::traits<Arg2Type>::Index>::value),
STORAGE_INDEX_MUST_MATCH)
EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::Index,
typename internal::traits<Arg3Type>::Index>::value),
STORAGE_INDEX_MUST_MATCH)
eigen_assert(dimensions_match(m_arg1Impl.dimensions(), m_arg2Impl.dimensions()) && dimensions_match(m_arg1Impl.dimensions(), m_arg3Impl.dimensions()));
}
typedef typename XprType::Index Index;
typedef typename XprType::Scalar Scalar;
typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
typedef typename TensorEvaluator<Arg1Type, Device>::Dimensions Dimensions;
typedef StorageMemory<CoeffReturnType, Device> Storage;
typedef typename Storage::Type EvaluatorPointerType;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlock;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
{
// TODO: use arg2 or arg3 dimensions if they are known at compile time.
return m_arg1Impl.dimensions();
}
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
m_arg1Impl.evalSubExprsIfNeeded(NULL);
m_arg2Impl.evalSubExprsIfNeeded(NULL);
m_arg3Impl.evalSubExprsIfNeeded(NULL);
return true;
}
EIGEN_STRONG_INLINE void cleanup() {
m_arg1Impl.cleanup();
m_arg2Impl.cleanup();
m_arg3Impl.cleanup();
}
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
{
return m_functor(m_arg1Impl.coeff(index), m_arg2Impl.coeff(index), m_arg3Impl.coeff(index));
}
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
return m_functor.packetOp(m_arg1Impl.template packet<LoadMode>(index),
m_arg2Impl.template packet<LoadMode>(index),
m_arg3Impl.template packet<LoadMode>(index));
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
costPerCoeff(bool vectorized) const {
const double functor_cost = internal::functor_traits<TernaryOp>::Cost;
return m_arg1Impl.costPerCoeff(vectorized) +
m_arg2Impl.costPerCoeff(vectorized) +
m_arg3Impl.costPerCoeff(vectorized) +
TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
}
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_arg1Impl.bind(cgh);
m_arg2Impl.bind(cgh);
m_arg3Impl.bind(cgh);
}
#endif
private:
const TernaryOp m_functor;
TensorEvaluator<Arg1Type, Device> m_arg1Impl;
TensorEvaluator<Arg2Type, Device> m_arg2Impl;
TensorEvaluator<Arg3Type, Device> m_arg3Impl;
};
// -------------------- SelectOp --------------------
template<typename IfArgType, typename ThenArgType, typename ElseArgType, typename Device>
struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>, Device>
{
typedef TensorSelectOp<IfArgType, ThenArgType, ElseArgType> XprType;
typedef typename XprType::Scalar Scalar;
enum {
IsAligned = TensorEvaluator<ThenArgType, Device>::IsAligned &
TensorEvaluator<ElseArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess &
TensorEvaluator<ElseArgType, Device>::PacketAccess &
PacketType<Scalar, Device>::HasBlend,
BlockAccess = TensorEvaluator<IfArgType, Device>::BlockAccess &&
TensorEvaluator<ThenArgType, Device>::BlockAccess &&
TensorEvaluator<ElseArgType, Device>::BlockAccess,
PreferBlockAccess = TensorEvaluator<IfArgType, Device>::PreferBlockAccess ||
TensorEvaluator<ThenArgType, Device>::PreferBlockAccess ||
TensorEvaluator<ElseArgType, Device>::PreferBlockAccess,
Layout = TensorEvaluator<IfArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
TensorEvaluator(const XprType& op, const Device& device)
: m_condImpl(op.ifExpression(), device),
m_thenImpl(op.thenExpression(), device),
m_elseImpl(op.elseExpression(), device)
{
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<IfArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<ThenArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<IfArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<ElseArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
eigen_assert(dimensions_match(m_condImpl.dimensions(), m_thenImpl.dimensions()));
eigen_assert(dimensions_match(m_thenImpl.dimensions(), m_elseImpl.dimensions()));
}
typedef typename XprType::Index Index;
typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
typedef typename TensorEvaluator<IfArgType, Device>::Dimensions Dimensions;
typedef StorageMemory<CoeffReturnType, Device> Storage;
typedef typename Storage::Type EvaluatorPointerType;
static const int NumDims = internal::array_size<Dimensions>::value;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
typedef typename TensorEvaluator<const IfArgType, Device>::TensorBlock
IfArgTensorBlock;
typedef typename TensorEvaluator<const ThenArgType, Device>::TensorBlock
ThenArgTensorBlock;
typedef typename TensorEvaluator<const ElseArgType, Device>::TensorBlock
ElseArgTensorBlock;
struct TensorSelectOpBlockFactory {
template <typename IfArgXprType, typename ThenArgXprType, typename ElseArgXprType>
struct XprType {
typedef TensorSelectOp<const IfArgXprType, const ThenArgXprType, const ElseArgXprType> type;
};
template <typename IfArgXprType, typename ThenArgXprType, typename ElseArgXprType>
typename XprType<IfArgXprType, ThenArgXprType, ElseArgXprType>::type expr(
const IfArgXprType& if_expr, const ThenArgXprType& then_expr, const ElseArgXprType& else_expr) const {
return typename XprType<IfArgXprType, ThenArgXprType, ElseArgXprType>::type(if_expr, then_expr, else_expr);
}
};
typedef internal::TensorTernaryExprBlock<TensorSelectOpBlockFactory,
IfArgTensorBlock, ThenArgTensorBlock,
ElseArgTensorBlock>
TensorBlock;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
{
// TODO: use then or else impl instead if they happen to be known at compile time.
return m_condImpl.dimensions();
}
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
m_condImpl.evalSubExprsIfNeeded(NULL);
m_thenImpl.evalSubExprsIfNeeded(NULL);
m_elseImpl.evalSubExprsIfNeeded(NULL);
return true;
}
#ifdef EIGEN_USE_THREADS
template <typename EvalSubExprsCallback>
EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
EvaluatorPointerType, EvalSubExprsCallback done) {
m_condImpl.evalSubExprsIfNeededAsync(nullptr, [this, done](bool) {
m_thenImpl.evalSubExprsIfNeededAsync(nullptr, [this, done](bool) {
m_elseImpl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
});
});
}
#endif // EIGEN_USE_THREADS
EIGEN_STRONG_INLINE void cleanup() {
m_condImpl.cleanup();
m_thenImpl.cleanup();
m_elseImpl.cleanup();
}
EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
{
return m_condImpl.coeff(index) ? m_thenImpl.coeff(index) : m_elseImpl.coeff(index);
}
template<int LoadMode>
EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const
{
internal::Selector<PacketSize> select;
EIGEN_UNROLL_LOOP
for (Index i = 0; i < PacketSize; ++i) {
select.select[i] = m_condImpl.coeff(index+i);
}
return internal::pblend(select,
m_thenImpl.template packet<LoadMode>(index),
m_elseImpl.template packet<LoadMode>(index));
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
costPerCoeff(bool vectorized) const {
return m_condImpl.costPerCoeff(vectorized) +
m_thenImpl.costPerCoeff(vectorized)
.cwiseMax(m_elseImpl.costPerCoeff(vectorized));
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
internal::TensorBlockResourceRequirements getResourceRequirements() const {
auto then_req = m_thenImpl.getResourceRequirements();
auto else_req = m_elseImpl.getResourceRequirements();
auto merged_req =
internal::TensorBlockResourceRequirements::merge(then_req, else_req);
merged_req.cost_per_coeff =
then_req.cost_per_coeff.cwiseMax(else_req.cost_per_coeff);
return internal::TensorBlockResourceRequirements::merge(
m_condImpl.getResourceRequirements(), merged_req);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
bool /*root_of_expr_ast*/ = false) const {
// It's unsafe to pass destination buffer to underlying expressions, because
// output might be aliased with one of the inputs.
desc.DropDestinationBuffer();
return TensorBlock(
m_condImpl.block(desc, scratch), m_thenImpl.block(desc, scratch),
m_elseImpl.block(desc, scratch), TensorSelectOpBlockFactory());
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_condImpl.bind(cgh);
m_thenImpl.bind(cgh);
m_elseImpl.bind(cgh);
}
#endif
private:
TensorEvaluator<IfArgType, Device> m_condImpl;
TensorEvaluator<ThenArgType, Device> m_thenImpl;
TensorEvaluator<ElseArgType, Device> m_elseImpl;
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H

View File

@@ -0,0 +1,703 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
namespace Eigen {
/**
* \class TensorExecutor
* \ingroup CXX11_Tensor_Module
*
* \brief The tensor executor class.
*
 * This class is responsible for launching the evaluation of the expression on
* the specified computing device.
*
* @tparam Vectorizable can use packet math (SSE/AVX/etc... registers and
* instructions)
* @tparam Tiling can use block based tensor evaluation
* (see TensorBlock.h)
*/
namespace internal {
/**
 * Evaluating TensorBroadcastingOp via the coefficient or packet path is
 * extremely expensive. If an expression contains at least one broadcast op and
 * supports block based evaluation, we always prefer it, even for small
 * tensors. For all other tileable ops the block evaluation overhead for small
 * tensors (those that fit into L1) is too large, and we fall back to
 * vectorized evaluation.
*/
// TODO(ezhulenev): Add specializations for all other types of Tensor ops.
template<typename Expression>
struct ExpressionHasTensorBroadcastingOp {
enum { value = false };
};
template<typename LhsXprType, typename RhsXprType>
struct ExpressionHasTensorBroadcastingOp<
const TensorAssignOp<LhsXprType, RhsXprType> > {
enum { value = ExpressionHasTensorBroadcastingOp<RhsXprType>::value };
};
template<typename UnaryOp, typename XprType>
struct ExpressionHasTensorBroadcastingOp<
const TensorCwiseUnaryOp<UnaryOp, XprType> > {
enum { value = ExpressionHasTensorBroadcastingOp<XprType>::value };
};
template<typename BinaryOp, typename LhsXprType, typename RhsXprType>
struct ExpressionHasTensorBroadcastingOp<
const TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> > {
enum {
value = ExpressionHasTensorBroadcastingOp<LhsXprType>::value ||
ExpressionHasTensorBroadcastingOp<RhsXprType>::value
};
};
template<typename Broadcast, typename XprType>
struct ExpressionHasTensorBroadcastingOp<
const TensorBroadcastingOp<Broadcast, XprType> > {
enum { value = true };
};
// -------------------------------------------------------------------------- //
/**
* Default strategy: the expression is evaluated sequentially with a single cpu
* thread, without vectorization and block evaluation.
*/
template <typename Expression, typename Device, bool Vectorizable,
TiledEvaluation Tiling>
class TensorExecutor {
public:
typedef typename Expression::Index StorageIndex;
// Including `unsupported/Eigen/CXX11/Tensor` in different translation units
// with/without `EIGEN_USE_THREADS` or `EIGEN_USE_GPU` is a potential ODR
// violation. If this template is instantiated with a non-default device, it
// means that this header file was included without defining
// `EIGEN_USE_THREADS`, `EIGEN_USE_GPU` or `EIGEN_USE_SYCL`.
static_assert(std::is_same<Device, DefaultDevice>::value,
"Default executor instantiated with non-default device. "
"You must #define EIGEN_USE_THREADS, EIGEN_USE_GPU or "
"EIGEN_USE_SYCL before including Eigen headers.");
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE void run(const Expression& expr,
const Device& device = Device()) {
TensorEvaluator<Expression, Device> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) {
const StorageIndex size = array_prod(evaluator.dimensions());
for (StorageIndex i = 0; i < size; ++i) {
evaluator.evalScalar(i);
}
}
evaluator.cleanup();
}
};
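// Editor's sketch (not part of the upstream header; assumes the full
// unsupported/Eigen/CXX11/Tensor module is included, and the function name and
// tensor sizes are hypothetical): a plain tensor assignment is what ultimately
// dispatches to one of the TensorExecutor<..., DefaultDevice, ...>
// specializations in this file.
inline void example_default_executor_sketch() {
  Eigen::Tensor<float, 2> a(64, 64), b(64, 64), c(64, 64);
  a.setRandom();
  b.setRandom();
  // operator= wraps `a + b` in a TensorAssignOp and evaluates it through
  // TensorExecutor<..., DefaultDevice, ...>::run().
  c = a + b;
}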
/**
* Default async execution strategy is not implemented. Currently it's only
* available for ThreadPoolDevice (see definition below).
*/
template <typename Expression, typename Device, typename DoneCallback,
bool Vectorizable, TiledEvaluation Tiling>
class TensorAsyncExecutor {};
/**
* Process all the data with a single cpu thread, using vectorized instructions.
*/
template <typename Expression>
class TensorExecutor<Expression, DefaultDevice, /*Vectorizable=*/true,
/*Tiling=*/TiledEvaluation::Off> {
public:
typedef typename Expression::Index StorageIndex;
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE void run(
const Expression& expr, const DefaultDevice& device = DefaultDevice()) {
TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) {
const StorageIndex size = array_prod(evaluator.dimensions());
const int PacketSize = unpacket_traits<typename TensorEvaluator<
Expression, DefaultDevice>::PacketReturnType>::size;
      // Give the compiler a strong hint to unroll the loop, but don't insist:
      // if the evaluated function is expensive, the compiler should not unroll
      // the loop at the expense of inlining.
const StorageIndex UnrolledSize =
(size / (4 * PacketSize)) * 4 * PacketSize;
for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) {
for (StorageIndex j = 0; j < 4; j++) {
evaluator.evalPacket(i + j * PacketSize);
}
}
const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize;
for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
evaluator.evalPacket(i);
}
for (StorageIndex i = VectorizedSize; i < size; ++i) {
evaluator.evalScalar(i);
}
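      // Editor's note (worked example): for size = 103 and PacketSize = 4 the
      // partition above gives UnrolledSize = (103 / 16) * 16 = 96 and
      // VectorizedSize = (103 / 4) * 4 = 100; indices [0, 96) are evaluated
      // four packets at a time, [96, 100) one packet at a time, and the
      // remainder [100, 103) falls back to scalar evaluation.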
}
evaluator.cleanup();
}
};
/**
* Process all the data with a single cpu thread, using blocks of data. By
* sizing a block to fit L1 cache we get better cache performance.
*/
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, DefaultDevice, Vectorizable,
/*Tiling=*/TiledEvaluation::On> {
public:
typedef typename traits<Expression>::Scalar Scalar;
typedef typename remove_const<Scalar>::type ScalarNoConst;
typedef TensorEvaluator<Expression, DefaultDevice> Evaluator;
typedef typename traits<Expression>::Index StorageIndex;
static const int NumDims = traits<Expression>::NumDimensions;
EIGEN_DEVICE_FUNC
static EIGEN_STRONG_INLINE void run(const Expression& expr,
const DefaultDevice& device = DefaultDevice()) {
typedef TensorBlockMapper<NumDims, Evaluator::Layout, StorageIndex>
TensorBlockMapper;
typedef internal::TensorBlockDescriptor<NumDims, StorageIndex>
TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<DefaultDevice>
TensorBlockScratch;
Evaluator evaluator(expr, device);
// TODO(ezhulenev): Do not use tiling for small tensors?
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) {
// Query expression tree for desired block size/shape.
const TensorBlockResourceRequirements requirements =
evaluator.getResourceRequirements();
const TensorBlockMapper block_mapper(
typename TensorBlockDesc::Dimensions(evaluator.dimensions()),
requirements);
// Share scratch memory allocator between all blocks.
TensorBlockScratch scratch(device);
const StorageIndex total_block_count = block_mapper.blockCount();
for (StorageIndex i = 0; i < total_block_count; ++i) {
TensorBlockDesc desc = block_mapper.blockDescriptor(i);
evaluator.evalBlock(desc, scratch);
scratch.reset();
}
}
evaluator.cleanup();
}
};
/**
* Multicore strategy: the index space is partitioned and each partition is
* executed on a single core.
*
* (1) TensorExecutor will submit work to the ThreadPoolDevice managed thread
* pool, and will block the caller thread until all tasks are finished.
*
* (2) TensorAsyncExecutor is a non-blocking version, that will submit work to
* the ThreadPoolDevice managed thread pool, and will return immediately.
* It will call 'done' callback after all tasks are finished.
*/
#ifdef EIGEN_USE_THREADS
template <typename TensorBlockMapper>
struct TensorExecutorTilingContext {
TensorExecutorTilingContext() = default;
TensorExecutorTilingContext(const TensorBlockMapper& b_mapper,
const TensorOpCost& b_cost, size_t b_aligned_size)
: block_mapper(b_mapper),
cost(b_cost),
aligned_blocksize(b_aligned_size) {}
TensorBlockMapper block_mapper; // navigate through blocks
TensorOpCost cost; // cost of computing a single block
size_t aligned_blocksize; // block size after memory alignment
};
// Computes block evaluation parameters and allocates a temporary memory buffer
// for blocks. See TensorExecutor/TensorAsyncExecutor (Tiling=On) below.
template <typename Evaluator, typename TensorBlockMapper, bool Vectorizable>
TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext(
const Evaluator& evaluator) {
// Query expression tree for desired block size/shape.
TensorBlockResourceRequirements requirements =
evaluator.getResourceRequirements();
// Update target block size based on cost model.
double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(
1, requirements.cost_per_coeff);
requirements.size = static_cast<size_t>(1.0 / taskSize);
TensorBlockMapper block_mapper(
typename TensorBlockMapper::Dimensions(evaluator.dimensions()),
requirements);
size_t block_size = block_mapper.blockTotalSize();
const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
const size_t aligned_blocksize =
align *
divup<size_t>(block_size * sizeof(typename Evaluator::Scalar), align);
return {block_mapper, requirements.cost_per_coeff * block_size,
aligned_blocksize};
}
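// Editor's note (worked example, assuming a 16-byte EIGEN_MAX_ALIGN_BYTES): for
// a block of 1001 floats the scratch size is rounded up to a multiple of the
// alignment, i.e. aligned_blocksize = 16 * divup(1001 * 4, 16) = 16 * 251 =
// 4016 bytes; the returned cost is cost_per_coeff scaled by the block size.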
template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EvalRange {
static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
const StorageIndex lastIdx) {
Evaluator evaluator = *evaluator_in;
eigen_assert(lastIdx >= firstIdx);
for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
evaluator.evalScalar(i);
}
}
static StorageIndex alignBlockSize(StorageIndex size) { return size; }
};
template <typename Evaluator, typename StorageIndex>
struct EvalRange<Evaluator, StorageIndex, /*Vectorizable*/ true> {
static const int PacketSize =
unpacket_traits<typename Evaluator::PacketReturnType>::size;
static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
const StorageIndex lastIdx) {
Evaluator evaluator = *evaluator_in;
eigen_assert(lastIdx >= firstIdx);
StorageIndex i = firstIdx;
if (lastIdx - firstIdx >= PacketSize) {
eigen_assert(firstIdx % PacketSize == 0);
StorageIndex last_chunk_offset = lastIdx - 4 * PacketSize;
      // Give the compiler a strong hint to unroll the loop, but don't insist:
      // if the evaluated function is expensive, the compiler should not unroll
      // the loop at the expense of inlining.
for (; i <= last_chunk_offset; i += 4 * PacketSize) {
for (StorageIndex j = 0; j < 4; j++) {
evaluator.evalPacket(i + j * PacketSize);
}
}
last_chunk_offset = lastIdx - PacketSize;
for (; i <= last_chunk_offset; i += PacketSize) {
evaluator.evalPacket(i);
}
}
for (; i < lastIdx; ++i) {
evaluator.evalScalar(i);
}
}
static StorageIndex alignBlockSize(StorageIndex size) {
// Align block size to packet size and account for unrolling in run above.
if (size >= 16 * PacketSize) {
return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
}
// Aligning to 4 * PacketSize would increase block size by more than 25%.
return (size + PacketSize - 1) & ~(PacketSize - 1);
}
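  // Editor's note (worked example): with PacketSize = 8, a block of
  // size = 1000 (>= 16 * 8) is rounded up to a multiple of 4 * 8 = 32, i.e.
  // (1000 + 31) & ~31 = 1024, while size = 100 (< 128) is only rounded up to a
  // multiple of 8, i.e. 104.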
};
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tiling> {
public:
typedef typename Expression::Index StorageIndex;
static EIGEN_STRONG_INLINE void run(const Expression& expr,
const ThreadPoolDevice& device) {
typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
Evaluator evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
if (needs_assign) {
const StorageIndex size = array_prod(evaluator.dimensions());
device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
EvalRange::alignBlockSize,
[&evaluator](StorageIndex firstIdx, StorageIndex lastIdx) {
EvalRange::run(&evaluator, firstIdx, lastIdx);
});
}
evaluator.cleanup();
}
};
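// Editor's sketch (not part of the upstream header; requires EIGEN_USE_THREADS
// and the full Tensor module, and the function name, thread count and sizes
// are hypothetical): driving the multicore executor above through the public
// device() API.
inline void example_threadpool_executor_sketch() {
  Eigen::ThreadPool pool(4);                 // 4 worker threads
  Eigen::ThreadPoolDevice device(&pool, 4);
  Eigen::Tensor<float, 2> a(256, 256), b(256, 256), c(256, 256);
  a.setRandom();
  b.setRandom();
  // device() returns a TensorDevice whose operator= dispatches to
  // TensorExecutor<..., ThreadPoolDevice, ...>::run(), which partitions the
  // index space with device.parallelFor().
  c.device(device) = a + b;
}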
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
/*Tiling=*/TiledEvaluation::On> {
public:
typedef typename traits<Expression>::Index IndexType;
typedef typename traits<Expression>::Scalar Scalar;
typedef typename remove_const<Scalar>::type ScalarNoConst;
static const int NumDims = traits<Expression>::NumDimensions;
typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
typedef TensorExecutorTilingContext<BlockMapper> TilingContext;
typedef internal::TensorBlockDescriptor<NumDims, IndexType>
TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
TensorBlockScratch;
static EIGEN_STRONG_INLINE void run(const Expression& expr,
const ThreadPoolDevice& device) {
Evaluator evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
if (needs_assign) {
const TilingContext tiling =
internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper,
Vectorizable>(evaluator);
auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx,
IndexType lastBlockIdx) {
TensorBlockScratch scratch(device);
for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
++block_idx) {
TensorBlockDesc desc = tiling.block_mapper.blockDescriptor(block_idx);
evaluator.evalBlock(desc, scratch);
scratch.reset();
}
};
// Evaluate small expressions directly as a single block.
if (tiling.block_mapper.blockCount() == 1) {
TensorBlockScratch scratch(device);
TensorBlockDesc desc(0, tiling.block_mapper.blockDimensions());
evaluator.evalBlock(desc, scratch);
} else {
device.parallelFor(tiling.block_mapper.blockCount(), tiling.cost,
eval_block);
}
}
evaluator.cleanup();
}
};
template <typename Expression, typename DoneCallback, bool Vectorizable,
TiledEvaluation Tiling>
class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
Vectorizable, Tiling> {
public:
typedef typename Expression::Index StorageIndex;
typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
const ThreadPoolDevice& device,
DoneCallback done) {
TensorAsyncExecutorContext* const ctx =
new TensorAsyncExecutorContext(expr, device, std::move(done));
const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void {
if (!need_assign) {
delete ctx;
return;
}
typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
const StorageIndex size = array_prod(ctx->evaluator.dimensions());
device.parallelForAsync(
size, ctx->evaluator.costPerCoeff(Vectorizable),
EvalRange::alignBlockSize,
[ctx](StorageIndex firstIdx, StorageIndex lastIdx) {
EvalRange::run(&ctx->evaluator, firstIdx, lastIdx);
},
[ctx]() { delete ctx; });
};
ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
}
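  // Note on lifetime: the context is heap-allocated so that it outlives
  // runAsync(). It is deleted either immediately (when there is nothing to
  // assign) or from the parallelForAsync completion callback; its destructor
  // cleans up the evaluator first and only then invokes the user's
  // DoneCallback.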
private:
struct TensorAsyncExecutorContext {
TensorAsyncExecutorContext(const Expression& expr,
const ThreadPoolDevice& thread_pool,
DoneCallback done)
: evaluator(expr, thread_pool), on_done(std::move(done)) {}
~TensorAsyncExecutorContext() {
evaluator.cleanup();
on_done();
}
Evaluator evaluator;
private:
DoneCallback on_done;
};
};
template <typename Expression, typename DoneCallback, bool Vectorizable>
class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
Vectorizable, /*Tileable*/ TiledEvaluation::On> {
public:
typedef typename traits<Expression>::Index IndexType;
typedef typename traits<Expression>::Scalar Scalar;
typedef typename remove_const<Scalar>::type ScalarNoConst;
static const int NumDims = traits<Expression>::NumDimensions;
typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
typedef TensorExecutorTilingContext<BlockMapper> TilingContext;
typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
TensorBlockScratch;
static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
const ThreadPoolDevice& device,
DoneCallback done) {
TensorAsyncExecutorContext* const ctx =
new TensorAsyncExecutorContext(expr, device, std::move(done));
const auto on_eval_subexprs = [ctx](bool need_assign) -> void {
if (!need_assign) {
delete ctx;
return;
}
ctx->tiling = internal::GetTensorExecutorTilingContext<
Evaluator, BlockMapper, Vectorizable>(ctx->evaluator);
auto eval_block = [ctx](IndexType firstBlockIdx, IndexType lastBlockIdx) {
TensorBlockScratch scratch(ctx->device);
for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
++block_idx) {
TensorBlockDesc desc =
ctx->tiling.block_mapper.blockDescriptor(block_idx);
ctx->evaluator.evalBlock(desc, scratch);
scratch.reset();
}
};
// Evaluate small expressions directly as a single block.
if (ctx->tiling.block_mapper.blockCount() == 1) {
TensorBlockScratch scratch(ctx->device);
TensorBlockDesc desc(0, ctx->tiling.block_mapper.blockDimensions());
ctx->evaluator.evalBlock(desc, scratch);
delete ctx;
} else {
ctx->device.parallelForAsync(ctx->tiling.block_mapper.blockCount(),
ctx->tiling.cost, eval_block,
[ctx]() { delete ctx; });
}
};
ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
}
private:
struct TensorAsyncExecutorContext {
TensorAsyncExecutorContext(const Expression& expr,
const ThreadPoolDevice& thread_pool,
DoneCallback done)
: device(thread_pool),
evaluator(expr, thread_pool),
on_done(std::move(done)) {}
~TensorAsyncExecutorContext() {
evaluator.cleanup();
on_done();
}
const ThreadPoolDevice& device;
Evaluator evaluator;
TilingContext tiling;
private:
DoneCallback on_done;
};
};
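// Editor's sketch (not part of the upstream header; hypothetical names and
// sizes, and it assumes this revision exposes the TensorBase::device(device,
// done) overload introduced alongside TensorAsyncExecutor): the asynchronous
// executors above make the assignment return immediately, and `done` fires
// once evaluation and cleanup have finished.
inline void example_async_executor_sketch() {
  Eigen::ThreadPool pool(4);
  Eigen::ThreadPoolDevice device(&pool, 4);
  Eigen::Tensor<float, 2> a(256, 256), b(256, 256), c(256, 256);
  a.setRandom();
  b.setRandom();
  Eigen::Barrier done_barrier(1);
  c.device(device, [&done_barrier]() { done_barrier.Notify(); }) = a + b;
  done_barrier.Wait();  // block here only for the purpose of this sketch
}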
#endif // EIGEN_USE_THREADS
// GPU: the evaluation of the expression is offloaded to a GPU.
#if defined(EIGEN_USE_GPU)
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling> {
public:
typedef typename Expression::Index StorageIndex;
static void run(const Expression& expr, const GpuDevice& device);
};
#if defined(EIGEN_GPUCC)
template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EigenMetaKernelEval {
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) {
for (StorageIndex i = firstIdx; i < lastIdx; i += step_size) {
eval.evalScalar(i);
}
}
};
template <typename Evaluator, typename StorageIndex>
struct EigenMetaKernelEval<Evaluator, StorageIndex, true> {
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) {
const StorageIndex PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
const StorageIndex vectorized_size = (lastIdx / PacketSize) * PacketSize;
const StorageIndex vectorized_step_size = step_size * PacketSize;
// Use the vector path
for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size;
i += vectorized_step_size) {
eval.evalPacket(i);
}
for (StorageIndex i = vectorized_size + firstIdx; i < lastIdx; i += step_size) {
eval.evalScalar(i);
}
}
};
template <typename Evaluator, typename StorageIndex>
__global__ void
__launch_bounds__(1024)
EigenMetaKernel(Evaluator eval, StorageIndex size) {
const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x;
const StorageIndex step_size = blockDim.x * gridDim.x;
const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
EigenMetaKernelEval<Evaluator, StorageIndex, vectorizable>::run(eval, first_index, size, step_size);
}
/*static*/
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling>::run(
const Expression& expr, const GpuDevice& device) {
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
if (needs_assign) {
const int block_size = device.maxGpuThreadsPerBlock();
const int max_blocks = device.getNumGpuMultiProcessors() *
device.maxGpuThreadsPerMultiProcessor() / block_size;
const StorageIndex size = array_prod(evaluator.dimensions());
    // Create at least one block to ensure we won't crash when TensorFlow calls with tensors of size 0.
const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
LAUNCH_GPU_KERNEL(
(EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>),
num_blocks, block_size, 0, device, evaluator, size);
}
evaluator.cleanup();
}
#endif // EIGEN_GPUCC
#endif // EIGEN_USE_GPU
// SYCL Executor policy
#ifdef EIGEN_USE_SYCL
template <typename Evaluator>
struct ExecExprFunctorKernel {
typedef typename Evaluator::Index Index;
Evaluator evaluator;
const Index range;
template <typename Scratch>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel(
const Scratch, Evaluator evaluator_, const Index range_)
: evaluator(evaluator_), range(range_) {}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()(
cl::sycl::nd_item<1> itemID) {
compute(itemID);
}
template <bool is_vec = Evaluator::PacketAccess>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<!is_vec>::type
compute(const cl::sycl::nd_item<1>& itemID) {
Index gId = static_cast<Index>(itemID.get_global_linear_id());
Index total_threads = itemID.get_global_range(0);
for (Index i = gId; i < range; i += total_threads) {
evaluator.evalScalar(i);
}
}
template <bool is_vec = Evaluator::PacketAccess>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<is_vec>::type
compute(const cl::sycl::nd_item<1>& itemID) {
const Index vectorizedRange =
(range / Evaluator::PacketSize) * Evaluator::PacketSize;
Index gId = static_cast<Index>(itemID.get_global_linear_id());
const Index step = Evaluator::PacketSize * itemID.get_global_range(0);
const Index start = Evaluator::PacketSize * gId;
for (Index i = start; i < vectorizedRange; i += step) {
evaluator.evalPacket(i);
}
gId += vectorizedRange;
for (Index i = gId; i < range; i += itemID.get_global_range(0)) {
evaluator.evalScalar(i);
}
}
};
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tiling> {
public:
typedef typename Expression::Index Index;
static EIGEN_STRONG_INLINE void run(const Expression& expr,
const Eigen::SyclDevice& dev) {
typedef Eigen::TensorEvaluator<Expression, Eigen::SyclDevice> Evaluator;
Evaluator evaluator(expr, dev);
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
if (needs_assign) {
Index range, GRange, tileSize;
Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions());
total_size = (total_size == 0) ? 1 : total_size;
const int PacketSize =
Eigen::PacketType<typename Evaluator::CoeffReturnType,
Eigen::SyclDevice>::size;
Index vectorizable_threads = static_cast<Index>(total_size / PacketSize);
dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange);
range = total_size;
dev.template nullary_kernel_launcher<
typename Evaluator::CoeffReturnType,
ExecExprFunctorKernel<Evaluator> >(
evaluator,
cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange),
cl::sycl::range<1>(tileSize)),
Index(1), range);
}
evaluator.cleanup();
}
};
#endif
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H

View File

@@ -0,0 +1,388 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXPR_H
#define EIGEN_CXX11_TENSOR_TENSOR_EXPR_H
namespace Eigen {
/** \class TensorExpr
* \ingroup CXX11_Tensor_Module
*
* \brief Tensor expression classes.
*
 * The TensorCwiseNullaryOp class applies a nullary operator to an expression.
* This is typically used to generate constants.
*
* The TensorCwiseUnaryOp class represents an expression where a unary operator
* (e.g. cwiseSqrt) is applied to an expression.
*
* The TensorCwiseBinaryOp class represents an expression where a binary
* operator (e.g. addition) is applied to a lhs and a rhs expression.
*
*/
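// Editor's sketch (not part of the upstream header; hypothetical name and
// sizes, and it assumes the full Tensor module is included): the expression
// classes below are normally created indirectly through TensorBase operators
// rather than constructed by hand.
inline void example_expression_tree_sketch() {
  Tensor<float, 1> x(8), y(8);
  x.setConstant(2.0f);
  y.setConstant(3.0f);
  // x.sqrt() yields a TensorCwiseUnaryOp and adding y wraps the result in a
  // TensorCwiseBinaryOp; nothing is evaluated until the assignment below
  // triggers the executor.
  Tensor<float, 1> z = x.sqrt() + y;
}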
namespace internal {
template<typename NullaryOp, typename XprType>
struct traits<TensorCwiseNullaryOp<NullaryOp, XprType> >
: traits<XprType>
{
typedef traits<XprType> XprTraits;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::Nested XprTypeNested;
typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int Layout = XprTraits::Layout;
typedef typename XprTraits::PointerType PointerType;
enum {
Flags = 0
};
};
} // end namespace internal
template<typename NullaryOp, typename XprType>
class TensorCwiseNullaryOp : public TensorBase<TensorCwiseNullaryOp<NullaryOp, XprType>, ReadOnlyAccessors>
{
public:
typedef typename Eigen::internal::traits<TensorCwiseNullaryOp>::Scalar Scalar;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef TensorCwiseNullaryOp<NullaryOp, XprType> Nested;
typedef typename Eigen::internal::traits<TensorCwiseNullaryOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorCwiseNullaryOp>::Index Index;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseNullaryOp(const XprType& xpr, const NullaryOp& func = NullaryOp())
: m_xpr(xpr), m_functor(func) {}
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename XprType::Nested>::type&
nestedExpression() const { return m_xpr; }
EIGEN_DEVICE_FUNC
const NullaryOp& functor() const { return m_functor; }
protected:
typename XprType::Nested m_xpr;
const NullaryOp m_functor;
};
namespace internal {
template<typename UnaryOp, typename XprType>
struct traits<TensorCwiseUnaryOp<UnaryOp, XprType> >
: traits<XprType>
{
// TODO(phli): Add InputScalar, InputPacket. Check references to
// current Scalar/Packet to see if the intent is Input or Output.
typedef typename result_of<UnaryOp(typename XprType::Scalar)>::type Scalar;
typedef traits<XprType> XprTraits;
typedef typename XprType::Nested XprTypeNested;
typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int Layout = XprTraits::Layout;
typedef typename TypeConversion<Scalar,
typename XprTraits::PointerType
>::type
PointerType;
};
template<typename UnaryOp, typename XprType>
struct eval<TensorCwiseUnaryOp<UnaryOp, XprType>, Eigen::Dense>
{
typedef const TensorCwiseUnaryOp<UnaryOp, XprType>& type;
};
template<typename UnaryOp, typename XprType>
struct nested<TensorCwiseUnaryOp<UnaryOp, XprType>, 1, typename eval<TensorCwiseUnaryOp<UnaryOp, XprType> >::type>
{
typedef TensorCwiseUnaryOp<UnaryOp, XprType> type;
};
} // end namespace internal
template<typename UnaryOp, typename XprType>
class TensorCwiseUnaryOp : public TensorBase<TensorCwiseUnaryOp<UnaryOp, XprType>, ReadOnlyAccessors>
{
public:
// TODO(phli): Add InputScalar, InputPacket. Check references to
// current Scalar/Packet to see if the intent is Input or Output.
typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::Scalar Scalar;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef Scalar CoeffReturnType;
typedef typename Eigen::internal::nested<TensorCwiseUnaryOp>::type Nested;
typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::Index Index;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp())
: m_xpr(xpr), m_functor(func) {}
EIGEN_DEVICE_FUNC
const UnaryOp& functor() const { return m_functor; }
/** \returns the nested expression */
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename XprType::Nested>::type&
nestedExpression() const { return m_xpr; }
protected:
typename XprType::Nested m_xpr;
const UnaryOp m_functor;
};
namespace internal {
template<typename BinaryOp, typename LhsXprType, typename RhsXprType>
struct traits<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> >
{
// Type promotion to handle the case where the types of the lhs and the rhs
// are different.
// TODO(phli): Add Lhs/RhsScalar, Lhs/RhsPacket. Check references to
// current Scalar/Packet to see if the intent is Inputs or Output.
typedef typename result_of<
BinaryOp(typename LhsXprType::Scalar,
typename RhsXprType::Scalar)>::type Scalar;
typedef traits<LhsXprType> XprTraits;
typedef typename promote_storage_type<
typename traits<LhsXprType>::StorageKind,
typename traits<RhsXprType>::StorageKind>::ret StorageKind;
typedef typename promote_index_type<
typename traits<LhsXprType>::Index,
typename traits<RhsXprType>::Index>::type Index;
typedef typename LhsXprType::Nested LhsNested;
typedef typename RhsXprType::Nested RhsNested;
typedef typename remove_reference<LhsNested>::type _LhsNested;
typedef typename remove_reference<RhsNested>::type _RhsNested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int Layout = XprTraits::Layout;
typedef typename TypeConversion<Scalar,
typename conditional<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val,
typename traits<LhsXprType>::PointerType,
typename traits<RhsXprType>::PointerType>::type
>::type
PointerType;
enum {
Flags = 0
};
};
template<typename BinaryOp, typename LhsXprType, typename RhsXprType>
struct eval<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>, Eigen::Dense>
{
typedef const TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>& type;
};
template<typename BinaryOp, typename LhsXprType, typename RhsXprType>
struct nested<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>, 1, typename eval<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> >::type>
{
typedef TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> type;
};
} // end namespace internal
template<typename BinaryOp, typename LhsXprType, typename RhsXprType>
class TensorCwiseBinaryOp : public TensorBase<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>, ReadOnlyAccessors>
{
public:
// TODO(phli): Add Lhs/RhsScalar, Lhs/RhsPacket. Check references to
// current Scalar/Packet to see if the intent is Inputs or Output.
typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::Scalar Scalar;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef Scalar CoeffReturnType;
typedef typename Eigen::internal::nested<TensorCwiseBinaryOp>::type Nested;
typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::Index Index;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs, const BinaryOp& func = BinaryOp())
: m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_functor(func) {}
EIGEN_DEVICE_FUNC
const BinaryOp& functor() const { return m_functor; }
/** \returns the nested expressions */
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename LhsXprType::Nested>::type&
lhsExpression() const { return m_lhs_xpr; }
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename RhsXprType::Nested>::type&
rhsExpression() const { return m_rhs_xpr; }
protected:
typename LhsXprType::Nested m_lhs_xpr;
typename RhsXprType::Nested m_rhs_xpr;
const BinaryOp m_functor;
};
namespace internal {
template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
struct traits<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType> >
{
// Type promotion to handle the case where the types of the args are different.
typedef typename result_of<
TernaryOp(typename Arg1XprType::Scalar,
typename Arg2XprType::Scalar,
typename Arg3XprType::Scalar)>::type Scalar;
typedef traits<Arg1XprType> XprTraits;
typedef typename traits<Arg1XprType>::StorageKind StorageKind;
typedef typename traits<Arg1XprType>::Index Index;
typedef typename Arg1XprType::Nested Arg1Nested;
typedef typename Arg2XprType::Nested Arg2Nested;
typedef typename Arg3XprType::Nested Arg3Nested;
typedef typename remove_reference<Arg1Nested>::type _Arg1Nested;
typedef typename remove_reference<Arg2Nested>::type _Arg2Nested;
typedef typename remove_reference<Arg3Nested>::type _Arg3Nested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int Layout = XprTraits::Layout;
typedef typename TypeConversion<Scalar,
typename conditional<Pointer_type_promotion<typename Arg2XprType::Scalar, Scalar>::val,
typename traits<Arg2XprType>::PointerType,
typename traits<Arg3XprType>::PointerType>::type
>::type
PointerType;
enum {
Flags = 0
};
};
template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
struct eval<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>, Eigen::Dense>
{
typedef const TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>& type;
};
template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
struct nested<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>, 1, typename eval<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType> >::type>
{
typedef TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType> type;
};
} // end namespace internal
template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
class TensorCwiseTernaryOp : public TensorBase<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>, ReadOnlyAccessors>
{
public:
typedef typename Eigen::internal::traits<TensorCwiseTernaryOp>::Scalar Scalar;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef Scalar CoeffReturnType;
typedef typename Eigen::internal::nested<TensorCwiseTernaryOp>::type Nested;
typedef typename Eigen::internal::traits<TensorCwiseTernaryOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorCwiseTernaryOp>::Index Index;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseTernaryOp(const Arg1XprType& arg1, const Arg2XprType& arg2, const Arg3XprType& arg3, const TernaryOp& func = TernaryOp())
: m_arg1_xpr(arg1), m_arg2_xpr(arg2), m_arg3_xpr(arg3), m_functor(func) {}
EIGEN_DEVICE_FUNC
const TernaryOp& functor() const { return m_functor; }
/** \returns the nested expressions */
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename Arg1XprType::Nested>::type&
arg1Expression() const { return m_arg1_xpr; }
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename Arg2XprType::Nested>::type&
arg2Expression() const { return m_arg2_xpr; }
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename Arg3XprType::Nested>::type&
arg3Expression() const { return m_arg3_xpr; }
protected:
typename Arg1XprType::Nested m_arg1_xpr;
typename Arg2XprType::Nested m_arg2_xpr;
typename Arg3XprType::Nested m_arg3_xpr;
const TernaryOp m_functor;
};
namespace internal {
template<typename IfXprType, typename ThenXprType, typename ElseXprType>
struct traits<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> >
: traits<ThenXprType>
{
typedef typename traits<ThenXprType>::Scalar Scalar;
typedef traits<ThenXprType> XprTraits;
typedef typename promote_storage_type<typename traits<ThenXprType>::StorageKind,
typename traits<ElseXprType>::StorageKind>::ret StorageKind;
typedef typename promote_index_type<typename traits<ElseXprType>::Index,
typename traits<ThenXprType>::Index>::type Index;
typedef typename IfXprType::Nested IfNested;
typedef typename ThenXprType::Nested ThenNested;
typedef typename ElseXprType::Nested ElseNested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int Layout = XprTraits::Layout;
typedef typename conditional<Pointer_type_promotion<typename ThenXprType::Scalar, Scalar>::val,
typename traits<ThenXprType>::PointerType,
typename traits<ElseXprType>::PointerType>::type PointerType;
};
template<typename IfXprType, typename ThenXprType, typename ElseXprType>
struct eval<TensorSelectOp<IfXprType, ThenXprType, ElseXprType>, Eigen::Dense>
{
typedef const TensorSelectOp<IfXprType, ThenXprType, ElseXprType>& type;
};
template<typename IfXprType, typename ThenXprType, typename ElseXprType>
struct nested<TensorSelectOp<IfXprType, ThenXprType, ElseXprType>, 1, typename eval<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> >::type>
{
typedef TensorSelectOp<IfXprType, ThenXprType, ElseXprType> type;
};
} // end namespace internal
template<typename IfXprType, typename ThenXprType, typename ElseXprType>
class TensorSelectOp : public TensorBase<TensorSelectOp<IfXprType, ThenXprType, ElseXprType>, ReadOnlyAccessors>
{
public:
typedef typename Eigen::internal::traits<TensorSelectOp>::Scalar Scalar;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename internal::promote_storage_type<typename ThenXprType::CoeffReturnType,
typename ElseXprType::CoeffReturnType>::ret CoeffReturnType;
typedef typename Eigen::internal::nested<TensorSelectOp>::type Nested;
typedef typename Eigen::internal::traits<TensorSelectOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorSelectOp>::Index Index;
EIGEN_DEVICE_FUNC
TensorSelectOp(const IfXprType& a_condition,
const ThenXprType& a_then,
const ElseXprType& a_else)
: m_condition(a_condition), m_then(a_then), m_else(a_else)
{ }
EIGEN_DEVICE_FUNC
const IfXprType& ifExpression() const { return m_condition; }
EIGEN_DEVICE_FUNC
const ThenXprType& thenExpression() const { return m_then; }
EIGEN_DEVICE_FUNC
const ElseXprType& elseExpression() const { return m_else; }
protected:
typename IfXprType::Nested m_condition;
typename ThenXprType::Nested m_then;
typename ElseXprType::Nested m_else;
};
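// Editor's sketch (not part of the upstream header; hypothetical name and
// sizes): TensorSelectOp is normally produced by TensorBase::select() on a
// boolean condition expression.
inline void example_select_sketch() {
  Tensor<float, 2> a(4, 4), b(4, 4);
  a.setRandom();
  b.setRandom();
  // (a > b) builds a boolean cwise expression; select() combines it with the
  // two candidate expressions into a TensorSelectOp, giving the elementwise
  // maximum of a and b here.
  Tensor<float, 2> m = (a > b).select(a, b);
}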
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_EXPR_H

View File

@@ -0,0 +1,669 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015 Jianwei Cui <thucjw@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_FFT_H
#define EIGEN_CXX11_TENSOR_TENSOR_FFT_H
namespace Eigen {
/** \class TensorFFT
* \ingroup CXX11_Tensor_Module
*
* \brief Tensor FFT class.
*
* TODO:
 * Vectorize the Cooley-Tukey and Bluestein algorithms
* Add support for multithreaded evaluation
* Improve the performance on GPU
*/
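// Editor's sketch (not part of the upstream header; hypothetical name and
// sizes, and it assumes the full Tensor module is included): TensorFFTOp is
// normally created through TensorBase::fft().
inline void example_fft_sketch() {
  Tensor<float, 1> input(16);
  input.setRandom();
  array<int, 1> fft_dims = {{0}};  // transform along dimension 0
  // BothParts requests the complex spectrum; RealPart/ImagPart would yield a
  // real-valued tensor instead, and FFT_REVERSE the (scaled) inverse transform.
  Tensor<std::complex<float>, 1> spectrum =
      input.fft<BothParts, FFT_FORWARD>(fft_dims);
}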
template <bool NeedUpgrade> struct MakeComplex {
template <typename T>
EIGEN_DEVICE_FUNC
T operator() (const T& val) const { return val; }
};
template <> struct MakeComplex<true> {
template <typename T>
EIGEN_DEVICE_FUNC
std::complex<T> operator() (const T& val) const { return std::complex<T>(val, 0); }
};
template <> struct MakeComplex<false> {
template <typename T>
EIGEN_DEVICE_FUNC
std::complex<T> operator() (const std::complex<T>& val) const { return val; }
};
template <int ResultType> struct PartOf {
template <typename T> T operator() (const T& val) const { return val; }
};
template <> struct PartOf<RealPart> {
template <typename T> T operator() (const std::complex<T>& val) const { return val.real(); }
};
template <> struct PartOf<ImagPart> {
template <typename T> T operator() (const std::complex<T>& val) const { return val.imag(); }
};
namespace internal {
template <typename FFT, typename XprType, int FFTResultType, int FFTDir>
struct traits<TensorFFTOp<FFT, XprType, FFTResultType, FFTDir> > : public traits<XprType> {
typedef traits<XprType> XprTraits;
typedef typename NumTraits<typename XprTraits::Scalar>::Real RealScalar;
typedef typename std::complex<RealScalar> ComplexScalar;
typedef typename XprTraits::Scalar InputScalar;
typedef typename conditional<FFTResultType == RealPart || FFTResultType == ImagPart, RealScalar, ComplexScalar>::type OutputScalar;
typedef typename XprTraits::StorageKind StorageKind;
typedef typename XprTraits::Index Index;
typedef typename XprType::Nested Nested;
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int Layout = XprTraits::Layout;
typedef typename traits<XprType>::PointerType PointerType;
};
template <typename FFT, typename XprType, int FFTResultType, int FFTDirection>
struct eval<TensorFFTOp<FFT, XprType, FFTResultType, FFTDirection>, Eigen::Dense> {
typedef const TensorFFTOp<FFT, XprType, FFTResultType, FFTDirection>& type;
};
template <typename FFT, typename XprType, int FFTResultType, int FFTDirection>
struct nested<TensorFFTOp<FFT, XprType, FFTResultType, FFTDirection>, 1, typename eval<TensorFFTOp<FFT, XprType, FFTResultType, FFTDirection> >::type> {
typedef TensorFFTOp<FFT, XprType, FFTResultType, FFTDirection> type;
};
} // end namespace internal
template <typename FFT, typename XprType, int FFTResultType, int FFTDir>
class TensorFFTOp : public TensorBase<TensorFFTOp<FFT, XprType, FFTResultType, FFTDir>, ReadOnlyAccessors> {
public:
typedef typename Eigen::internal::traits<TensorFFTOp>::Scalar Scalar;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename std::complex<RealScalar> ComplexScalar;
typedef typename internal::conditional<FFTResultType == RealPart || FFTResultType == ImagPart, RealScalar, ComplexScalar>::type OutputScalar;
typedef OutputScalar CoeffReturnType;
typedef typename Eigen::internal::nested<TensorFFTOp>::type Nested;
typedef typename Eigen::internal::traits<TensorFFTOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorFFTOp>::Index Index;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFFTOp(const XprType& expr, const FFT& fft)
: m_xpr(expr), m_fft(fft) {}
EIGEN_DEVICE_FUNC
const FFT& fft() const { return m_fft; }
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename XprType::Nested>::type& expression() const {
return m_xpr;
}
protected:
typename XprType::Nested m_xpr;
const FFT m_fft;
};
// Eval as rvalue
template <typename FFT, typename ArgType, typename Device, int FFTResultType, int FFTDir>
struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, Device> {
typedef TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir> XprType;
typedef typename XprType::Index Index;
static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
typedef DSizes<Index, NumDims> Dimensions;
typedef typename XprType::Scalar Scalar;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename std::complex<RealScalar> ComplexScalar;
typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
typedef internal::traits<XprType> XprTraits;
typedef typename XprTraits::Scalar InputScalar;
typedef typename internal::conditional<FFTResultType == RealPart || FFTResultType == ImagPart, RealScalar, ComplexScalar>::type OutputScalar;
typedef OutputScalar CoeffReturnType;
typedef typename PacketType<OutputScalar, Device>::type PacketReturnType;
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
typedef StorageMemory<CoeffReturnType, Device> Storage;
typedef typename Storage::Type EvaluatorPointerType;
enum {
IsAligned = false,
PacketAccess = true,
BlockAccess = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlock;
//===--------------------------------------------------------------------===//
EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_fft(op.fft()), m_impl(op.expression(), device), m_data(NULL), m_device(device) {
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
for (int i = 0; i < NumDims; ++i) {
eigen_assert(input_dims[i] > 0);
m_dimensions[i] = input_dims[i];
}
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
m_strides[0] = 1;
for (int i = 1; i < NumDims; ++i) {
m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1];
}
} else {
m_strides[NumDims - 1] = 1;
for (int i = NumDims - 2; i >= 0; --i) {
m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1];
}
}
m_size = m_dimensions.TotalSize();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
return m_dimensions;
}
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
m_impl.evalSubExprsIfNeeded(NULL);
if (data) {
evalToBuf(data);
return false;
} else {
m_data = (EvaluatorPointerType)m_device.get((CoeffReturnType*)(m_device.allocate_temp(sizeof(CoeffReturnType) * m_size)));
evalToBuf(m_data);
return true;
}
}
EIGEN_STRONG_INLINE void cleanup() {
if (m_data) {
m_device.deallocate(m_data);
m_data = NULL;
}
m_impl.cleanup();
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const {
return m_data[index];
}
template <int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType
packet(Index index) const {
return internal::ploadt<PacketReturnType, LoadMode>(m_data + index);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
costPerCoeff(bool vectorized) const {
return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
}
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_data.bind(cgh);
}
#endif
private:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalToBuf(EvaluatorPointerType data) {
const bool write_to_out = internal::is_same<OutputScalar, ComplexScalar>::value;
ComplexScalar* buf = write_to_out ? (ComplexScalar*)data : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * m_size);
for (Index i = 0; i < m_size; ++i) {
buf[i] = MakeComplex<internal::is_same<InputScalar, RealScalar>::value>()(m_impl.coeff(i));
}
for (size_t i = 0; i < m_fft.size(); ++i) {
Index dim = m_fft[i];
eigen_assert(dim >= 0 && dim < NumDims);
Index line_len = m_dimensions[dim];
eigen_assert(line_len >= 1);
ComplexScalar* line_buf = (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * line_len);
const bool is_power_of_two = isPowerOfTwo(line_len);
const Index good_composite = is_power_of_two ? 0 : findGoodComposite(line_len);
const Index log_len = is_power_of_two ? getLog2(line_len) : getLog2(good_composite);
ComplexScalar* a = is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * good_composite);
ComplexScalar* b = is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * good_composite);
ComplexScalar* pos_j_base_powered = is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * (line_len + 1));
if (!is_power_of_two) {
// Compute twiddle factors
// t_n = exp(sqrt(-1) * pi * n^2 / line_len)
// for n = 0, 1,..., line_len-1.
// For n > 2 we use the recurrence t_n = t_{n-1}^2 / t_{n-2} * t_1^2
// The recurrence is correct in exact arithmetic, but causes
// numerical issues for large transforms, especially in
// single-precision floating point.
//
// pos_j_base_powered[0] = ComplexScalar(1, 0);
// if (line_len > 1) {
// const ComplexScalar pos_j_base = ComplexScalar(
// numext::cos(M_PI / line_len), numext::sin(M_PI / line_len));
// pos_j_base_powered[1] = pos_j_base;
// if (line_len > 2) {
// const ComplexScalar pos_j_base_sq = pos_j_base * pos_j_base;
// for (int i = 2; i < line_len + 1; ++i) {
// pos_j_base_powered[i] = pos_j_base_powered[i - 1] *
// pos_j_base_powered[i - 1] /
// pos_j_base_powered[i - 2] *
// pos_j_base_sq;
// }
// }
// }
// TODO(rmlarsen): Find a way to use Eigen's vectorized sin
// and cosine functions here.
for (int j = 0; j < line_len + 1; ++j) {
double arg = ((EIGEN_PI * j) * j) / line_len;
std::complex<double> tmp(numext::cos(arg), numext::sin(arg));
pos_j_base_powered[j] = static_cast<ComplexScalar>(tmp);
}
}
for (Index partial_index = 0; partial_index < m_size / line_len; ++partial_index) {
const Index base_offset = getBaseOffsetFromIndex(partial_index, dim);
// get data into line_buf
const Index stride = m_strides[dim];
if (stride == 1) {
m_device.memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar));
} else {
Index offset = base_offset;
for (int j = 0; j < line_len; ++j, offset += stride) {
line_buf[j] = buf[offset];
}
}
// process the line
if (is_power_of_two) {
processDataLineCooleyTukey(line_buf, line_len, log_len);
}
else {
processDataLineBluestein(line_buf, line_len, good_composite, log_len, a, b, pos_j_base_powered);
}
// write back
if (FFTDir == FFT_FORWARD && stride == 1) {
m_device.memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar));
} else {
Index offset = base_offset;
const ComplexScalar div_factor = ComplexScalar(1.0 / line_len, 0);
for (int j = 0; j < line_len; ++j, offset += stride) {
buf[offset] = (FFTDir == FFT_FORWARD) ? line_buf[j] : line_buf[j] * div_factor;
}
}
}
m_device.deallocate(line_buf);
if (!is_power_of_two) {
m_device.deallocate(a);
m_device.deallocate(b);
m_device.deallocate(pos_j_base_powered);
}
}
if(!write_to_out) {
for (Index i = 0; i < m_size; ++i) {
data[i] = PartOf<FFTResultType>()(buf[i]);
}
m_device.deallocate(buf);
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static bool isPowerOfTwo(Index x) {
eigen_assert(x > 0);
return !(x & (x - 1));
}
// The composite number for padding, used in Bluestein's FFT algorithm
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Index findGoodComposite(Index n) {
Index i = 2;
while (i < 2 * n - 1) i *= 2;
return i;
}
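  // Editor's note (worked example): for a line length n = 5 the Bluestein
  // padding must be at least 2 * 5 - 1 = 9, so the loop above doubles 2, 4, 8,
  // 16 and returns 16 -- a power of two, so the padded transforms can reuse the
  // Cooley-Tukey butterflies.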
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Index getLog2(Index m) {
Index log2m = 0;
while (m >>= 1) log2m++;
return log2m;
}
  // Call the Cooley-Tukey algorithm directly; the data length must be a power of 2.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void processDataLineCooleyTukey(ComplexScalar* line_buf, Index line_len, Index log_len) {
eigen_assert(isPowerOfTwo(line_len));
scramble_FFT(line_buf, line_len);
compute_1D_Butterfly<FFTDir>(line_buf, line_len, log_len);
}
  // Call Bluestein's FFT algorithm; m is a good composite number greater than (2 * n - 1), used as the padding length.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void processDataLineBluestein(ComplexScalar* line_buf, Index line_len, Index good_composite, Index log_len, ComplexScalar* a, ComplexScalar* b, const ComplexScalar* pos_j_base_powered) {
Index n = line_len;
Index m = good_composite;
ComplexScalar* data = line_buf;
for (Index i = 0; i < n; ++i) {
if(FFTDir == FFT_FORWARD) {
a[i] = data[i] * numext::conj(pos_j_base_powered[i]);
}
else {
a[i] = data[i] * pos_j_base_powered[i];
}
}
for (Index i = n; i < m; ++i) {
a[i] = ComplexScalar(0, 0);
}
for (Index i = 0; i < n; ++i) {
if(FFTDir == FFT_FORWARD) {
b[i] = pos_j_base_powered[i];
}
else {
b[i] = numext::conj(pos_j_base_powered[i]);
}
}
for (Index i = n; i < m - n; ++i) {
b[i] = ComplexScalar(0, 0);
}
for (Index i = m - n; i < m; ++i) {
if(FFTDir == FFT_FORWARD) {
b[i] = pos_j_base_powered[m-i];
}
else {
b[i] = numext::conj(pos_j_base_powered[m-i]);
}
}
scramble_FFT(a, m);
compute_1D_Butterfly<FFT_FORWARD>(a, m, log_len);
scramble_FFT(b, m);
compute_1D_Butterfly<FFT_FORWARD>(b, m, log_len);
for (Index i = 0; i < m; ++i) {
a[i] *= b[i];
}
scramble_FFT(a, m);
compute_1D_Butterfly<FFT_REVERSE>(a, m, log_len);
    // Do the scaling after the inverse FFT.
for (Index i = 0; i < m; ++i) {
a[i] /= m;
}
for (Index i = 0; i < n; ++i) {
if(FFTDir == FFT_FORWARD) {
data[i] = a[i] * numext::conj(pos_j_base_powered[i]);
}
else {
data[i] = a[i] * pos_j_base_powered[i];
}
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void scramble_FFT(ComplexScalar* data, Index n) {
eigen_assert(isPowerOfTwo(n));
Index j = 1;
for (Index i = 1; i < n; ++i){
if (j > i) {
std::swap(data[j-1], data[i-1]);
}
Index m = n >> 1;
while (m >= 2 && j > m) {
j -= m;
m >>= 1;
}
j += m;
}
}
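  // Editor's note: scramble_FFT above applies the bit-reversal permutation
  // required before the iterative butterflies; e.g. for n = 8 the elements end
  // up ordered by original index as 0, 4, 2, 6, 1, 5, 3, 7.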
template <int Dir>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_2(ComplexScalar* data) {
ComplexScalar tmp = data[1];
data[1] = data[0] - data[1];
data[0] += tmp;
}
template <int Dir>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_4(ComplexScalar* data) {
ComplexScalar tmp[4];
tmp[0] = data[0] + data[1];
tmp[1] = data[0] - data[1];
tmp[2] = data[2] + data[3];
if (Dir == FFT_FORWARD) {
tmp[3] = ComplexScalar(0.0, -1.0) * (data[2] - data[3]);
} else {
tmp[3] = ComplexScalar(0.0, 1.0) * (data[2] - data[3]);
}
data[0] = tmp[0] + tmp[2];
data[1] = tmp[1] + tmp[3];
data[2] = tmp[0] - tmp[2];
data[3] = tmp[1] - tmp[3];
}
template <int Dir>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_8(ComplexScalar* data) {
ComplexScalar tmp_1[8];
ComplexScalar tmp_2[8];
tmp_1[0] = data[0] + data[1];
tmp_1[1] = data[0] - data[1];
tmp_1[2] = data[2] + data[3];
if (Dir == FFT_FORWARD) {
tmp_1[3] = (data[2] - data[3]) * ComplexScalar(0, -1);
} else {
tmp_1[3] = (data[2] - data[3]) * ComplexScalar(0, 1);
}
tmp_1[4] = data[4] + data[5];
tmp_1[5] = data[4] - data[5];
tmp_1[6] = data[6] + data[7];
if (Dir == FFT_FORWARD) {
tmp_1[7] = (data[6] - data[7]) * ComplexScalar(0, -1);
} else {
tmp_1[7] = (data[6] - data[7]) * ComplexScalar(0, 1);
}
tmp_2[0] = tmp_1[0] + tmp_1[2];
tmp_2[1] = tmp_1[1] + tmp_1[3];
tmp_2[2] = tmp_1[0] - tmp_1[2];
tmp_2[3] = tmp_1[1] - tmp_1[3];
tmp_2[4] = tmp_1[4] + tmp_1[6];
// SQRT2DIV2 = sqrt(2)/2
#define SQRT2DIV2 0.7071067811865476
if (Dir == FFT_FORWARD) {
tmp_2[5] = (tmp_1[5] + tmp_1[7]) * ComplexScalar(SQRT2DIV2, -SQRT2DIV2);
tmp_2[6] = (tmp_1[4] - tmp_1[6]) * ComplexScalar(0, -1);
tmp_2[7] = (tmp_1[5] - tmp_1[7]) * ComplexScalar(-SQRT2DIV2, -SQRT2DIV2);
} else {
tmp_2[5] = (tmp_1[5] + tmp_1[7]) * ComplexScalar(SQRT2DIV2, SQRT2DIV2);
tmp_2[6] = (tmp_1[4] - tmp_1[6]) * ComplexScalar(0, 1);
tmp_2[7] = (tmp_1[5] - tmp_1[7]) * ComplexScalar(-SQRT2DIV2, SQRT2DIV2);
}
data[0] = tmp_2[0] + tmp_2[4];
data[1] = tmp_2[1] + tmp_2[5];
data[2] = tmp_2[2] + tmp_2[6];
data[3] = tmp_2[3] + tmp_2[7];
data[4] = tmp_2[0] - tmp_2[4];
data[5] = tmp_2[1] - tmp_2[5];
data[6] = tmp_2[2] - tmp_2[6];
data[7] = tmp_2[3] - tmp_2[7];
}
template <int Dir>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_1D_merge(
ComplexScalar* data, Index n, Index n_power_of_2) {
// Original code:
// RealScalar wtemp = std::sin(M_PI/n);
// RealScalar wpi = -std::sin(2 * M_PI/n);
const RealScalar wtemp = m_sin_PI_div_n_LUT[n_power_of_2];
const RealScalar wpi = (Dir == FFT_FORWARD)
? m_minus_sin_2_PI_div_n_LUT[n_power_of_2]
: -m_minus_sin_2_PI_div_n_LUT[n_power_of_2];
const ComplexScalar wp(wtemp, wpi);
const ComplexScalar wp_one = wp + ComplexScalar(1, 0);
const ComplexScalar wp_one_2 = wp_one * wp_one;
const ComplexScalar wp_one_3 = wp_one_2 * wp_one;
const ComplexScalar wp_one_4 = wp_one_3 * wp_one;
const Index n2 = n / 2;
ComplexScalar w(1.0, 0.0);
for (Index i = 0; i < n2; i += 4) {
ComplexScalar temp0(data[i + n2] * w);
ComplexScalar temp1(data[i + 1 + n2] * w * wp_one);
ComplexScalar temp2(data[i + 2 + n2] * w * wp_one_2);
ComplexScalar temp3(data[i + 3 + n2] * w * wp_one_3);
w = w * wp_one_4;
data[i + n2] = data[i] - temp0;
data[i] += temp0;
data[i + 1 + n2] = data[i + 1] - temp1;
data[i + 1] += temp1;
data[i + 2 + n2] = data[i + 2] - temp2;
data[i + 2] += temp2;
data[i + 3 + n2] = data[i + 3] - temp3;
data[i + 3] += temp3;
}
}
template <int Dir>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_1D_Butterfly(
ComplexScalar* data, Index n, Index n_power_of_2) {
eigen_assert(isPowerOfTwo(n));
if (n > 8) {
compute_1D_Butterfly<Dir>(data, n / 2, n_power_of_2 - 1);
compute_1D_Butterfly<Dir>(data + n / 2, n / 2, n_power_of_2 - 1);
butterfly_1D_merge<Dir>(data, n, n_power_of_2);
} else if (n == 8) {
butterfly_8<Dir>(data);
} else if (n == 4) {
butterfly_4<Dir>(data);
} else if (n == 2) {
butterfly_2<Dir>(data);
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index getBaseOffsetFromIndex(Index index, Index omitted_dim) const {
Index result = 0;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = NumDims - 1; i > omitted_dim; --i) {
const Index partial_m_stride = m_strides[i] / m_dimensions[omitted_dim];
const Index idx = index / partial_m_stride;
index -= idx * partial_m_stride;
result += idx * m_strides[i];
}
result += index;
}
else {
for (Index i = 0; i < omitted_dim; ++i) {
const Index partial_m_stride = m_strides[i] / m_dimensions[omitted_dim];
const Index idx = index / partial_m_stride;
index -= idx * partial_m_stride;
result += idx * m_strides[i];
}
result += index;
}
// The coordinate along omitted_dim is not determined at this step; the returned offset corresponds to index_coords[omitted_dim] == 0.
return result;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index getIndexFromOffset(Index base, Index omitted_dim, Index offset) const {
Index result = base + offset * m_strides[omitted_dim] ;
return result;
}
protected:
Index m_size;
const FFT EIGEN_DEVICE_REF m_fft;
Dimensions m_dimensions;
array<Index, NumDims> m_strides;
TensorEvaluator<ArgType, Device> m_impl;
EvaluatorPointerType m_data;
const Device EIGEN_DEVICE_REF m_device;
// This will support a maximum FFT size of 2^32 for each dimension
// m_sin_PI_div_n_LUT[i] = -2 * std::pow(std::sin(M_PI / std::pow(2,i)), 2);
const RealScalar m_sin_PI_div_n_LUT[32] = {
RealScalar(0.0),
RealScalar(-2),
RealScalar(-0.999999999999999),
RealScalar(-0.292893218813453),
RealScalar(-0.0761204674887130),
RealScalar(-0.0192147195967696),
RealScalar(-0.00481527332780311),
RealScalar(-0.00120454379482761),
RealScalar(-3.01181303795779e-04),
RealScalar(-7.52981608554592e-05),
RealScalar(-1.88247173988574e-05),
RealScalar(-4.70619042382852e-06),
RealScalar(-1.17654829809007e-06),
RealScalar(-2.94137117780840e-07),
RealScalar(-7.35342821488550e-08),
RealScalar(-1.83835707061916e-08),
RealScalar(-4.59589268710903e-09),
RealScalar(-1.14897317243732e-09),
RealScalar(-2.87243293150586e-10),
RealScalar(-7.18108232902250e-11),
RealScalar(-1.79527058227174e-11),
RealScalar(-4.48817645568941e-12),
RealScalar(-1.12204411392298e-12),
RealScalar(-2.80511028480785e-13),
RealScalar(-7.01277571201985e-14),
RealScalar(-1.75319392800498e-14),
RealScalar(-4.38298482001247e-15),
RealScalar(-1.09574620500312e-15),
RealScalar(-2.73936551250781e-16),
RealScalar(-6.84841378126949e-17),
RealScalar(-1.71210344531737e-17),
RealScalar(-4.28025861329343e-18)
};
// m_minus_sin_2_PI_div_n_LUT[i] = -std::sin(2 * M_PI / std::pow(2,i));
const RealScalar m_minus_sin_2_PI_div_n_LUT[32] = {
RealScalar(0.0),
RealScalar(0.0),
RealScalar(-1.00000000000000e+00),
RealScalar(-7.07106781186547e-01),
RealScalar(-3.82683432365090e-01),
RealScalar(-1.95090322016128e-01),
RealScalar(-9.80171403295606e-02),
RealScalar(-4.90676743274180e-02),
RealScalar(-2.45412285229123e-02),
RealScalar(-1.22715382857199e-02),
RealScalar(-6.13588464915448e-03),
RealScalar(-3.06795676296598e-03),
RealScalar(-1.53398018628477e-03),
RealScalar(-7.66990318742704e-04),
RealScalar(-3.83495187571396e-04),
RealScalar(-1.91747597310703e-04),
RealScalar(-9.58737990959773e-05),
RealScalar(-4.79368996030669e-05),
RealScalar(-2.39684498084182e-05),
RealScalar(-1.19842249050697e-05),
RealScalar(-5.99211245264243e-06),
RealScalar(-2.99605622633466e-06),
RealScalar(-1.49802811316901e-06),
RealScalar(-7.49014056584716e-07),
RealScalar(-3.74507028292384e-07),
RealScalar(-1.87253514146195e-07),
RealScalar(-9.36267570730981e-08),
RealScalar(-4.68133785365491e-08),
RealScalar(-2.34066892682746e-08),
RealScalar(-1.17033446341373e-08),
RealScalar(-5.85167231706864e-09),
RealScalar(-2.92583615853432e-09)
};
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_FFT_H
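// Usage sketch for the evaluator above: a minimal, self-contained example, assuming the
// Tensor umbrella header is available and that TensorBase exposes the
// fft<ResultType, Direction>(dims) entry point that builds the TensorFFTOp declared in
// the forward declarations. This sketch is illustrative and not part of the original header.
#include <unsupported/Eigen/CXX11/Tensor>
#include <complex>

inline void tensor_fft_sketch() {
  Eigen::Tensor<std::complex<float>, 2> input(8, 8);
  input.setRandom();
  // Transform along dimension 0 only; power-of-two lengths hit the butterfly path above.
  Eigen::array<ptrdiff_t, 1> dims;
  dims[0] = 0;
  Eigen::Tensor<std::complex<float>, 2> freq =
      input.fft<Eigen::BothParts, Eigen::FFT_FORWARD>(dims);
  Eigen::Tensor<std::complex<float>, 2> back =
      freq.fft<Eigen::BothParts, Eigen::FFT_REVERSE>(dims);  // inverse along the same axis
  (void)back;
}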


@@ -0,0 +1,379 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H
#define EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H
namespace Eigen {
/** \class TensorFixedSize
* \ingroup CXX11_Tensor_Module
*
* \brief The fixed sized version of the tensor class.
*
* The fixed sized equivalent of
* Eigen::Tensor<float, 3> t(3, 5, 7);
* is
* Eigen::TensorFixedSize<float, Sizes<3,5,7>> t;
*/
template<typename Scalar_, typename Dimensions_, int Options_, typename IndexType>
class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_, Options_, IndexType> >
{
public:
typedef TensorFixedSize<Scalar_, Dimensions_, Options_, IndexType> Self;
typedef TensorBase<TensorFixedSize<Scalar_, Dimensions_, Options_, IndexType> > Base;
typedef typename Eigen::internal::nested<Self>::type Nested;
typedef typename internal::traits<Self>::StorageKind StorageKind;
typedef typename internal::traits<Self>::Index Index;
typedef Scalar_ Scalar;
typedef typename NumTraits<Scalar>::Real RealScalar;
typedef typename Base::CoeffReturnType CoeffReturnType;
static const int Options = Options_;
enum {
IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0),
PacketAccess = (internal::packet_traits<Scalar>::size > 1),
BlockAccess = false,
PreferBlockAccess = false,
Layout = Options_ & RowMajor ? RowMajor : ColMajor,
CoordAccess = true,
RawAccess = true
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlock;
//===--------------------------------------------------------------------===//
typedef Dimensions_ Dimensions;
static const std::size_t NumIndices = Dimensions::count;
protected:
TensorStorage<Scalar, Dimensions, Options> m_storage;
public:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar *data() { return m_storage.data(); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar *data() const { return m_storage.data(); }
// This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
// work, because that uses base().coeffRef() - and we don't yet
// implement a similar class hierarchy
inline Self& base() { return *this; }
inline const Self& base() const { return *this; }
#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index firstIndex, IndexTypes... otherIndices) const
{
// The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
return coeff(array<Index, NumIndices>{{firstIndex, otherIndices...}});
}
#endif
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar& coeff(const array<Index, NumIndices>& indices) const
{
eigen_internal_assert(checkIndexRange(indices));
return m_storage.data()[linearizedIndex(indices)];
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const
{
eigen_internal_assert(index >= 0 && index < size());
return m_storage.data()[index];
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar& coeff() const
{
EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
return m_storage.data()[0];
}
#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices)
{
// The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
return coeffRef(array<Index, NumIndices>{{firstIndex, otherIndices...}});
}
#endif
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& coeffRef(const array<Index, NumIndices>& indices)
{
eigen_internal_assert(checkIndexRange(indices));
return m_storage.data()[linearizedIndex(indices)];
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
{
eigen_internal_assert(index >= 0 && index < size());
return m_storage.data()[index];
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& coeffRef()
{
EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
return m_storage.data()[0];
}
#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) const
{
// The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
return this->operator()(array<Index, NumIndices>{{firstIndex, otherIndices...}});
}
#else
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const
{
if (Options&RowMajor) {
const Index index = i1 + i0 * m_storage.dimensions()[1];
return m_storage.data()[index];
} else {
const Index index = i0 + i1 * m_storage.dimensions()[0];
return m_storage.data()[index];
}
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const
{
if (Options&RowMajor) {
const Index index = i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0);
return m_storage.data()[index];
} else {
const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * i2);
return m_storage.data()[index];
}
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const
{
if (Options&RowMajor) {
const Index index = i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0));
return m_storage.data()[index];
} else {
const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * i3));
return m_storage.data()[index];
}
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const
{
if (Options&RowMajor) {
const Index index = i4 + m_storage.dimensions()[4] * (i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0)));
return m_storage.data()[index];
} else {
const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * (i3 + m_storage.dimensions()[3] * i4)));
return m_storage.data()[index];
}
}
#endif
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar& operator()(const array<Index, NumIndices>& indices) const
{
eigen_assert(checkIndexRange(indices));
return coeff(indices);
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const
{
eigen_internal_assert(index >= 0 && index < size());
return coeff(index);
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar& operator()() const
{
EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
return coeff();
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const
{
// The bracket operator is only for vectors, use the parenthesis operator instead.
EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE);
return coeff(index);
}
#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, IndexTypes... otherIndices)
{
// The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
return operator()(array<Index, NumIndices>{{firstIndex, otherIndices...}});
}
#else
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1)
{
if (Options&RowMajor) {
const Index index = i1 + i0 * m_storage.dimensions()[1];
return m_storage.data()[index];
} else {
const Index index = i0 + i1 * m_storage.dimensions()[0];
return m_storage.data()[index];
}
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2)
{
if (Options&RowMajor) {
const Index index = i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0);
return m_storage.data()[index];
} else {
const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * i2);
return m_storage.data()[index];
}
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3)
{
if (Options&RowMajor) {
const Index index = i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0));
return m_storage.data()[index];
} else {
const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * i3));
return m_storage.data()[index];
}
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4)
{
if (Options&RowMajor) {
const Index index = i4 + m_storage.dimensions()[4] * (i3 + m_storage.dimensions()[3] * (i2 + m_storage.dimensions()[2] * (i1 + m_storage.dimensions()[1] * i0)));
return m_storage.data()[index];
} else {
const Index index = i0 + m_storage.dimensions()[0] * (i1 + m_storage.dimensions()[1] * (i2 + m_storage.dimensions()[2] * (i3 + m_storage.dimensions()[3] * i4)));
return m_storage.data()[index];
}
}
#endif
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& operator()(const array<Index, NumIndices>& indices)
{
eigen_assert(checkIndexRange(indices));
return coeffRef(indices);
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& operator()(Index index)
{
eigen_assert(index >= 0 && index < size());
return coeffRef(index);
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& operator()()
{
EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
return coeffRef();
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& operator[](Index index)
{
// The bracket operator is only for vectors, use the parenthesis operator instead
EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
return coeffRef(index);
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TensorFixedSize()
: m_storage()
{
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TensorFixedSize(const Self& other)
: m_storage(other.m_storage)
{
}
#if EIGEN_HAS_RVALUE_REFERENCES
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFixedSize(Self&& other)
: m_storage(other.m_storage)
{
}
#endif
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TensorFixedSize(const TensorBase<OtherDerived, ReadOnlyAccessors>& other)
{
typedef TensorAssignOp<TensorFixedSize, const OtherDerived> Assign;
Assign assign(*this, other.derived());
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
}
template<typename OtherDerived>
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TensorFixedSize(const TensorBase<OtherDerived, WriteAccessors>& other)
{
typedef TensorAssignOp<TensorFixedSize, const OtherDerived> Assign;
Assign assign(*this, other.derived());
internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
}
// FIXME: check that the dimensions of other match the dimensions of *this.
// Unfortunately this isn't possible yet when the rhs is an expression.
EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(TensorFixedSize)
protected:
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE bool checkIndexRange(const array<Index, NumIndices>& /*indices*/) const
{
using internal::array_apply_and_reduce;
using internal::array_zip_and_reduce;
using internal::greater_equal_zero_op;
using internal::logical_and_op;
using internal::lesser_op;
return true;
// check whether the indices are all >= 0
/* array_apply_and_reduce<logical_and_op, greater_equal_zero_op>(indices) &&
// check whether the indices fit in the dimensions
array_zip_and_reduce<logical_and_op, lesser_op>(indices, m_storage.dimensions());*/
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Index linearizedIndex(const array<Index, NumIndices>& indices) const
{
if (Options&RowMajor) {
return m_storage.dimensions().IndexOfRowMajor(indices);
} else {
return m_storage.dimensions().IndexOfColMajor(indices);
}
}
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H
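// Usage sketch: a minimal illustration of the class above, assuming the Tensor umbrella
// header is available. The dimensions live in the type, so no runtime sizes are passed
// to the constructor. Not part of the original header.
#include <unsupported/Eigen/CXX11/Tensor>

inline float tensor_fixed_size_sketch() {
  Eigen::TensorFixedSize<float, Eigen::Sizes<3, 5, 7> > t;
  t.setConstant(1.0f);
  t(1, 2, 3) = 4.0f;                                  // coefficient access via operator()
  Eigen::TensorFixedSize<float, Eigen::Sizes<3, 5, 7> > u = t + t.constant(0.5f);
  return u(1, 2, 3);                                  // 4.5f
}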


@@ -0,0 +1,237 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H
#define EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H
namespace Eigen {
/** \class TensorForcedEval
* \ingroup CXX11_Tensor_Module
*
* \brief Tensor forced evaluation class.
*
*
*/
namespace internal {
template<typename XprType>
struct traits<TensorForcedEvalOp<XprType> >
{
// Type promotion to handle the case where the types of the lhs and the rhs are different.
typedef typename XprType::Scalar Scalar;
typedef traits<XprType> XprTraits;
typedef typename traits<XprType>::StorageKind StorageKind;
typedef typename traits<XprType>::Index Index;
typedef typename XprType::Nested Nested;
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int Layout = XprTraits::Layout;
typedef typename XprTraits::PointerType PointerType;
enum {
Flags = 0
};
};
template<typename XprType>
struct eval<TensorForcedEvalOp<XprType>, Eigen::Dense>
{
typedef const TensorForcedEvalOp<XprType>& type;
};
template<typename XprType>
struct nested<TensorForcedEvalOp<XprType>, 1, typename eval<TensorForcedEvalOp<XprType> >::type>
{
typedef TensorForcedEvalOp<XprType> type;
};
} // end namespace internal
template<typename XprType>
class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType>, ReadOnlyAccessors>
{
public:
typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Scalar Scalar;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
typedef typename Eigen::internal::nested<TensorForcedEvalOp>::type Nested;
typedef typename Eigen::internal::traits<TensorForcedEvalOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Index Index;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorForcedEvalOp(const XprType& expr)
: m_xpr(expr) {}
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename XprType::Nested>::type&
expression() const { return m_xpr; }
protected:
typename XprType::Nested m_xpr;
};
namespace internal {
template <typename Device, typename CoeffReturnType>
struct non_integral_type_placement_new{
template <typename StorageType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index numValues, StorageType m_buffer) {
// Initialize non-trivially constructible types.
if (!internal::is_arithmetic<CoeffReturnType>::value) {
for (Index i = 0; i < numValues; ++i) new (m_buffer + i) CoeffReturnType();
}
}
};
// SYCL does not support placement new for non-integral types:
// having new (m_buffer + i) CoeffReturnType() causes the following compiler error for SYCL devices:
// no matching function for call to 'operator new'
template <typename CoeffReturnType>
struct non_integral_type_placement_new<Eigen::SyclDevice, CoeffReturnType> {
template <typename StorageType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index, StorageType) {
}
};
} // end namespace internal
template<typename ArgType_, typename Device>
struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
{
typedef const typename internal::remove_all<ArgType_>::type ArgType;
typedef TensorForcedEvalOp<ArgType> XprType;
typedef typename ArgType::Scalar Scalar;
typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
typedef typename XprType::Index Index;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
typedef StorageMemory<CoeffReturnType, Device> Storage;
typedef typename Storage::Type EvaluatorPointerType;
enum {
IsAligned = true,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = internal::is_arithmetic<CoeffReturnType>::value,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = true
};
static const int NumDims = internal::traits<ArgType>::NumDimensions;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
typedef typename internal::TensorMaterializedBlock<CoeffReturnType, NumDims,
Layout, Index>
TensorBlock;
//===--------------------------------------------------------------------===//
TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_op(op.expression()),
m_device(device), m_buffer(NULL)
{ }
EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
const Index numValues = internal::array_prod(m_impl.dimensions());
m_buffer = m_device.get((CoeffReturnType*)m_device.allocate_temp(numValues * sizeof(CoeffReturnType)));
internal::non_integral_type_placement_new<Device, CoeffReturnType>()(numValues, m_buffer);
typedef TensorEvalToOp< const typename internal::remove_const<ArgType>::type > EvalTo;
EvalTo evalToTmp(m_device.get(m_buffer), m_op);
internal::TensorExecutor<
const EvalTo, typename internal::remove_const<Device>::type,
/*Vectorizable=*/internal::IsVectorizable<Device, const ArgType>::value,
/*Tiling=*/internal::IsTileable<Device, const ArgType>::value>::
run(evalToTmp, m_device);
return true;
}
#ifdef EIGEN_USE_THREADS
template <typename EvalSubExprsCallback>
EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
EvaluatorPointerType, EvalSubExprsCallback done) {
const Index numValues = internal::array_prod(m_impl.dimensions());
m_buffer = m_device.get((CoeffReturnType*)m_device.allocate_temp(
numValues * sizeof(CoeffReturnType)));
typedef TensorEvalToOp<const typename internal::remove_const<ArgType>::type>
EvalTo;
EvalTo evalToTmp(m_device.get(m_buffer), m_op);
auto on_done = std::bind([](EvalSubExprsCallback done_) { done_(true); },
std::move(done));
internal::TensorAsyncExecutor<
const EvalTo, typename internal::remove_const<Device>::type,
decltype(on_done),
/*Vectorizable=*/internal::IsVectorizable<Device, const ArgType>::value,
/*Tiling=*/internal::IsTileable<Device, const ArgType>::value>::
runAsync(evalToTmp, m_device, std::move(on_done));
}
#endif
EIGEN_STRONG_INLINE void cleanup() {
m_device.deallocate_temp(m_buffer);
m_buffer = NULL;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
{
return m_buffer[index];
}
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
internal::TensorBlockResourceRequirements getResourceRequirements() const {
return internal::TensorBlockResourceRequirements::any();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
bool /*root_of_expr_ast*/ = false) const {
assert(m_buffer != NULL);
return TensorBlock::materialize(m_buffer, m_impl.dimensions(), desc, scratch);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
EvaluatorPointerType data() const { return m_buffer; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_buffer.bind(cgh);
m_impl.bind(cgh);
}
#endif
private:
TensorEvaluator<ArgType, Device> m_impl;
const ArgType m_op;
const Device EIGEN_DEVICE_REF m_device;
EvaluatorPointerType m_buffer;
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H
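// Usage sketch: forcing a sub-expression into a temporary, assuming the Tensor umbrella
// header is available and that TensorBase exposes the eval() entry point that builds the
// TensorForcedEvalOp above. Not part of the original header.
#include <unsupported/Eigen/CXX11/Tensor>

inline void tensor_forced_eval_sketch() {
  Eigen::Tensor<float, 2> a(64, 64), b(64, 64);
  a.setRandom();
  b.setRandom();
  // Without eval(), (a + b) may be recomputed for every coefficient read by the broadcast;
  // eval() materializes it once into the buffer owned by the evaluator above.
  Eigen::array<int, 2> bcast{{1, 4}};
  Eigen::Tensor<float, 2> c = (a + b).eval().broadcast(bcast);
  (void)c;
}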


@@ -0,0 +1,191 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H
#define EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H
namespace Eigen {
// MakePointer class is used as a container of the address space of the pointer
// on the host and on the device. From the host side it generates the T* pointer
// and when EIGEN_USE_SYCL is used it constructs a buffer with a map_allocator to
// T* m_data on the host. It is always called on the device.
// Specialisation of MakePointer class for creating the sycl buffer with
// map_allocator.
template<typename T> struct MakePointer {
typedef T* Type;
typedef const T* ConstType;
};
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T* constCast(const T* data) {
return const_cast<T*>(data);
}
// The StorageMemory class is a container of the device specific pointer
// used for referring to a pointer on the TensorEvaluator class. While the TensorExpression
// is a device-agnostic type and needs the MakePointer class for type conversion,
// the TensorEvaluator class can be specialized for a device, hence it is possible
// to construct different types of temporary storage memory in TensorEvaluator
// for different devices by specializing the following StorageMemory class.
template<typename T, typename device> struct StorageMemory: MakePointer <T> {};
namespace internal{
template<typename A, typename B> struct Pointer_type_promotion {
static const bool val=false;
};
template<typename A> struct Pointer_type_promotion<A, A> {
static const bool val = true;
};
template<typename A, typename B> struct TypeConversion {
typedef A* type;
};
}
template<typename PlainObjectType, int Options_ = Unaligned, template <class> class MakePointer_ = MakePointer> class TensorMap;
template<typename Scalar_, int NumIndices_, int Options_ = 0, typename IndexType = DenseIndex> class Tensor;
template<typename Scalar_, typename Dimensions, int Options_ = 0, typename IndexType = DenseIndex> class TensorFixedSize;
template<typename PlainObjectType> class TensorRef;
template<typename Derived, int AccessLevel> class TensorBase;
template<typename NullaryOp, typename PlainObjectType> class TensorCwiseNullaryOp;
template<typename UnaryOp, typename XprType> class TensorCwiseUnaryOp;
template<typename BinaryOp, typename LeftXprType, typename RightXprType> class TensorCwiseBinaryOp;
template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType> class TensorCwiseTernaryOp;
template<typename IfXprType, typename ThenXprType, typename ElseXprType> class TensorSelectOp;
template<typename Op, typename Dims, typename XprType, template <class> class MakePointer_ = MakePointer > class TensorReductionOp;
template<typename XprType> class TensorIndexTupleOp;
template<typename ReduceOp, typename Dims, typename XprType> class TensorTupleReducerOp;
template<typename Axis, typename LeftXprType, typename RightXprType> class TensorConcatenationOp;
template<typename Dimensions, typename LeftXprType, typename RightXprType, typename OutputKernelType> class TensorContractionOp;
template<typename TargetType, typename XprType> class TensorConversionOp;
template<typename Dimensions, typename InputXprType, typename KernelXprType> class TensorConvolutionOp;
template<typename FFT, typename XprType, int FFTDataType, int FFTDirection> class TensorFFTOp;
template<typename PatchDim, typename XprType> class TensorPatchOp;
template<DenseIndex Rows, DenseIndex Cols, typename XprType> class TensorImagePatchOp;
template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType> class TensorVolumePatchOp;
template<typename Broadcast, typename XprType> class TensorBroadcastingOp;
template<DenseIndex DimId, typename XprType> class TensorChippingOp;
template<typename NewDimensions, typename XprType> class TensorReshapingOp;
template<typename XprType> class TensorLayoutSwapOp;
template<typename StartIndices, typename Sizes, typename XprType> class TensorSlicingOp;
template<typename ReverseDimensions, typename XprType> class TensorReverseOp;
template<typename PaddingDimensions, typename XprType> class TensorPaddingOp;
template<typename Shuffle, typename XprType> class TensorShufflingOp;
template<typename Strides, typename XprType> class TensorStridingOp;
template<typename StartIndices, typename StopIndices, typename Strides, typename XprType> class TensorStridingSlicingOp;
template<typename Strides, typename XprType> class TensorInflationOp;
template<typename Generator, typename XprType> class TensorGeneratorOp;
template<typename LeftXprType, typename RightXprType> class TensorAssignOp;
template<typename Op, typename XprType> class TensorScanOp;
template<typename Dims, typename XprType> class TensorTraceOp;
template<typename CustomUnaryFunc, typename XprType> class TensorCustomUnaryOp;
template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType> class TensorCustomBinaryOp;
template<typename XprType, template <class> class MakePointer_ = MakePointer> class TensorEvalToOp;
template<typename XprType> class TensorForcedEvalOp;
template<typename ExpressionType, typename DeviceType> class TensorDevice;
template<typename ExpressionType, typename DeviceType, typename DoneCallback> class TensorAsyncDevice;
template<typename Derived, typename Device> struct TensorEvaluator;
struct NoOpOutputKernel;
struct DefaultDevice;
struct ThreadPoolDevice;
struct GpuDevice;
struct SyclDevice;
#ifdef EIGEN_USE_SYCL
template <typename T> struct MakeSYCLPointer {
typedef Eigen::TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T> Type;
};
template <typename T>
EIGEN_STRONG_INLINE const Eigen::TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T>&
constCast(const Eigen::TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T>& data) {
return data;
}
template <typename T>
struct StorageMemory<T, SyclDevice> : MakeSYCLPointer<T> {};
template <typename T>
struct StorageMemory<T, const SyclDevice> : StorageMemory<T, SyclDevice> {};
namespace TensorSycl {
namespace internal{
template <typename Evaluator, typename Op> class GenericNondeterministicReducer;
}
}
#endif
enum FFTResultType {
RealPart = 0,
ImagPart = 1,
BothParts = 2
};
enum FFTDirection {
FFT_FORWARD = 0,
FFT_REVERSE = 1
};
namespace internal {
template <typename Device, typename Expression>
struct IsVectorizable {
static const bool value = TensorEvaluator<Expression, Device>::PacketAccess;
};
template <typename Expression>
struct IsVectorizable<GpuDevice, Expression> {
static const bool value = TensorEvaluator<Expression, GpuDevice>::PacketAccess &&
TensorEvaluator<Expression, GpuDevice>::IsAligned;
};
// Tiled evaluation strategy.
enum TiledEvaluation {
Off = 0, // tiled evaluation is not supported
On = 1, // still work in progress (see TensorBlock.h)
};
template <typename Device, typename Expression>
struct IsTileable {
// Check that block evaluation is supported and it's a preferred option (at
// least one sub-expression has much faster block evaluation, e.g.
// broadcasting).
static const bool BlockAccess =
TensorEvaluator<Expression, Device>::BlockAccess &&
TensorEvaluator<Expression, Device>::PreferBlockAccess;
static const TiledEvaluation value =
BlockAccess ? TiledEvaluation::On : TiledEvaluation::Off;
};
template <typename Expression, typename Device,
bool Vectorizable = IsVectorizable<Device, Expression>::value,
TiledEvaluation Tiling = IsTileable<Device, Expression>::value>
class TensorExecutor;
template <typename Expression, typename Device, typename DoneCallback,
bool Vectorizable = IsVectorizable<Device, Expression>::value,
TiledEvaluation Tiling = IsTileable<Device, Expression>::value>
class TensorAsyncExecutor;
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H
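// Illustration sketch: how the executor-selection traits above can be queried for a
// concrete expression. Assumes the full Tensor module is included so that the
// TensorEvaluator specializations exist. Not part of the original header.
#include <unsupported/Eigen/CXX11/Tensor>
#include <utility>

inline void tensor_executor_traits_sketch() {
  typedef Eigen::Tensor<float, 2> T2;
  typedef decltype(std::declval<T2>() + std::declval<T2>()) Expr;
  // Can the default (single-threaded CPU) executor use packet math for this expression,
  // and may it use tiled/block evaluation?
  const bool vectorizable =
      Eigen::internal::IsVectorizable<Eigen::DefaultDevice, Expr>::value;
  const Eigen::internal::TiledEvaluation tiling =
      Eigen::internal::IsTileable<Eigen::DefaultDevice, Expr>::value;
  (void)vectorizable;
  (void)tiling;
}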


@@ -0,0 +1,488 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H
#define EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H
namespace Eigen {
namespace internal {
/** \internal
* \brief Template functor to compute the modulo between an array and a scalar.
*/
template <typename Scalar>
struct scalar_mod_op {
EIGEN_DEVICE_FUNC scalar_mod_op(const Scalar& divisor) : m_divisor(divisor) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a % m_divisor; }
const Scalar m_divisor;
};
template <typename Scalar>
struct functor_traits<scalar_mod_op<Scalar> >
{ enum { Cost = scalar_div_cost<Scalar,false>::value, PacketAccess = false }; };
/** \internal
* \brief Template functor to compute the modulo between 2 arrays.
*/
template <typename Scalar>
struct scalar_mod2_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; }
};
template <typename Scalar>
struct functor_traits<scalar_mod2_op<Scalar> >
{ enum { Cost = scalar_div_cost<Scalar,false>::value, PacketAccess = false }; };
template <typename Scalar>
struct scalar_fmod_op {
EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod_op)
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar
operator()(const Scalar& a, const Scalar& b) const {
return numext::fmod(a, b);
}
};
template <typename Scalar>
struct functor_traits<scalar_fmod_op<Scalar> > {
enum { Cost = 13, // Reciprocal throughput of FPREM on Haswell.
PacketAccess = false };
};
template<typename Reducer, typename Device>
struct reducer_traits {
enum {
Cost = 1,
PacketAccess = false,
IsStateful = false,
IsExactlyAssociative = true
};
};
// Standard reduction functors
template <typename T> struct SumReducer
{
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
internal::scalar_sum_op<T> sum_op;
*accum = sum_op(*accum, t);
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
(*accum) = padd<Packet>(*accum, p);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
internal::scalar_cast_op<int, T> conv;
return conv(0);
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
return pset1<Packet>(initialize());
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
return accum;
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
return vaccum;
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
internal::scalar_sum_op<T> sum_op;
return sum_op(saccum, predux(vaccum));
}
};
template <typename T, typename Device>
struct reducer_traits<SumReducer<T>, Device> {
enum {
Cost = NumTraits<T>::AddCost,
PacketAccess = PacketType<T, Device>::HasAdd,
IsStateful = false,
IsExactlyAssociative = NumTraits<T>::IsInteger
};
};
template <typename T> struct MeanReducer
{
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
MeanReducer() : scalarCount_(0), packetCount_(0) { }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) {
internal::scalar_sum_op<T> sum_op;
*accum = sum_op(*accum, t);
scalarCount_++;
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) {
(*accum) = padd<Packet>(*accum, p);
packetCount_++;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
internal::scalar_cast_op<int, T> conv;
return conv(0);
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
return pset1<Packet>(initialize());
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
internal::scalar_quotient_op<T> quotient_op;
return quotient_op(accum, T(scalarCount_));
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
return pdiv(vaccum, pset1<Packet>(T(packetCount_)));
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
internal::scalar_sum_op<T> sum_op;
internal::scalar_quotient_op<T> quotient_op;
return quotient_op(
sum_op(saccum, predux(vaccum)),
T(scalarCount_ + packetCount_ * unpacket_traits<Packet>::size));
}
protected:
DenseIndex scalarCount_;
DenseIndex packetCount_;
};
template <typename T, typename Device>
struct reducer_traits<MeanReducer<T>, Device> {
enum {
Cost = NumTraits<T>::AddCost,
PacketAccess = PacketType<T, Device>::HasAdd &&
PacketType<T, Device>::HasDiv && !NumTraits<T>::IsInteger,
IsStateful = true,
IsExactlyAssociative = NumTraits<T>::IsInteger
};
};
template <typename T, bool IsMax = true, bool IsInteger = true>
struct MinMaxBottomValue {
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
return Eigen::NumTraits<T>::lowest();
}
};
template <typename T>
struct MinMaxBottomValue<T, true, false> {
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
return -Eigen::NumTraits<T>::infinity();
}
};
template <typename T>
struct MinMaxBottomValue<T, false, true> {
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
return Eigen::NumTraits<T>::highest();
}
};
template <typename T>
struct MinMaxBottomValue<T, false, false> {
EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() {
return Eigen::NumTraits<T>::infinity();
}
};
template <typename T, int NaNPropagation=PropagateFast> struct MaxReducer
{
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
scalar_max_op<T, T, NaNPropagation> op;
*accum = op(t, *accum);
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
scalar_max_op<T, T, NaNPropagation> op;
(*accum) = op.packetOp(*accum, p);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
return MinMaxBottomValue<T, /*IsMax=*/true, Eigen::NumTraits<T>::IsInteger>::bottom_value();
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
return pset1<Packet>(initialize());
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
return accum;
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
return vaccum;
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
scalar_max_op<T, T, NaNPropagation> op;
return op(saccum, op.predux(vaccum));
}
};
template <typename T, typename Device, int NaNPropagation>
struct reducer_traits<MaxReducer<T, NaNPropagation>, Device> {
enum {
Cost = NumTraits<T>::AddCost,
PacketAccess = PacketType<T, Device>::HasMax,
IsStateful = false,
IsExactlyAssociative = (NaNPropagation!=PropagateFast)
};
};
template <typename T, int NaNPropagation=PropagateFast> struct MinReducer
{
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
scalar_min_op<T, T, NaNPropagation> op;
*accum = op(t, *accum);
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
scalar_min_op<T, T, NaNPropagation> op;
(*accum) = op.packetOp(*accum, p);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
return MinMaxBottomValue<T, /*IsMax=*/false, Eigen::NumTraits<T>::IsInteger>::bottom_value();
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
return pset1<Packet>(initialize());
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
return accum;
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
return vaccum;
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
scalar_min_op<T, T, NaNPropagation> op;
return op(saccum, op.predux(vaccum));
}
};
template <typename T, typename Device, int NaNPropagation>
struct reducer_traits<MinReducer<T, NaNPropagation>, Device> {
enum {
Cost = NumTraits<T>::AddCost,
PacketAccess = PacketType<T, Device>::HasMin,
IsStateful = false,
IsExactlyAssociative = (NaNPropagation!=PropagateFast)
};
};
template <typename T> struct ProdReducer
{
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
internal::scalar_product_op<T> prod_op;
(*accum) = prod_op(*accum, t);
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
(*accum) = pmul<Packet>(*accum, p);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
internal::scalar_cast_op<int, T> conv;
return conv(1);
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
return pset1<Packet>(initialize());
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
return accum;
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
return vaccum;
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
internal::scalar_product_op<T> prod_op;
return prod_op(saccum, predux_mul(vaccum));
}
};
template <typename T, typename Device>
struct reducer_traits<ProdReducer<T>, Device> {
enum {
Cost = NumTraits<T>::MulCost,
PacketAccess = PacketType<T, Device>::HasMul,
IsStateful = false,
IsExactlyAssociative = true
};
};
struct AndReducer
{
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const {
*accum = *accum && t;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool initialize() const {
return true;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool finalize(bool accum) const {
return accum;
}
};
template <typename Device>
struct reducer_traits<AndReducer, Device> {
enum {
Cost = 1,
PacketAccess = false,
IsStateful = false,
IsExactlyAssociative = true
};
};
struct OrReducer {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const {
*accum = *accum || t;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool initialize() const {
return false;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool finalize(bool accum) const {
return accum;
}
};
template <typename Device>
struct reducer_traits<OrReducer, Device> {
enum {
Cost = 1,
PacketAccess = false,
IsStateful = false,
IsExactlyAssociative = true
};
};
// Argmin/Argmax reducers. Returns the first occurrence if multiple locations
// contain the same min/max value.
template <typename T> struct ArgMaxTupleReducer
{
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
if (t.second < accum->second) {
return;
} else if (t.second > accum->second || accum->first > t.first ) {
*accum = t;
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
return T(0, NumTraits<typename T::second_type>::lowest());
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T& accum) const {
return accum;
}
};
template <typename T, typename Device>
struct reducer_traits<ArgMaxTupleReducer<T>, Device> {
enum {
Cost = NumTraits<T>::AddCost,
PacketAccess = false,
IsStateful = false,
IsExactlyAssociative = true
};
};
template <typename T> struct ArgMinTupleReducer
{
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T& t, T* accum) const {
if (t.second > accum->second) {
return;
} else if (t.second < accum->second || accum->first > t.first) {
*accum = t;
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
return T(0, NumTraits<typename T::second_type>::highest());
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T& accum) const {
return accum;
}
};
template <typename T, typename Device>
struct reducer_traits<ArgMinTupleReducer<T>, Device> {
enum {
Cost = NumTraits<T>::AddCost,
PacketAccess = false,
IsStateful = false,
IsExactlyAssociative = true
};
};
template <typename T, typename Index, size_t NumDims>
class GaussianGenerator {
public:
static const bool PacketAccess = false;
EIGEN_DEVICE_FUNC GaussianGenerator(const array<T, NumDims>& means,
const array<T, NumDims>& std_devs)
: m_means(means)
{
EIGEN_UNROLL_LOOP
for (size_t i = 0; i < NumDims; ++i) {
m_two_sigmas[i] = std_devs[i] * std_devs[i] * 2;
}
}
EIGEN_DEVICE_FUNC T operator()(const array<Index, NumDims>& coordinates) const {
T tmp = T(0);
EIGEN_UNROLL_LOOP
for (size_t i = 0; i < NumDims; ++i) {
T offset = coordinates[i] - m_means[i];
tmp += offset * offset / m_two_sigmas[i];
}
return numext::exp(-tmp);
}
private:
array<T, NumDims> m_means;
array<T, NumDims> m_two_sigmas;
};
template <typename T, typename Index, size_t NumDims>
struct functor_traits<GaussianGenerator<T, Index, NumDims> > {
enum {
Cost = NumDims * (2 * NumTraits<T>::AddCost + NumTraits<T>::MulCost +
functor_traits<scalar_quotient_op<T, T> >::Cost) +
functor_traits<scalar_exp_op<T> >::Cost,
PacketAccess = GaussianGenerator<T, Index, NumDims>::PacketAccess
};
};
template <typename Scalar>
struct scalar_clamp_op {
EIGEN_DEVICE_FUNC inline scalar_clamp_op(const Scalar& _min, const Scalar& _max) : m_min(_min), m_max(_max) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar
operator()(const Scalar& x) const {
return numext::mini(numext::maxi(x, m_min), m_max);
}
template <typename Packet>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet
packetOp(const Packet& x) const {
return internal::pmin(internal::pmax(x, pset1<Packet>(m_min)), pset1<Packet>(m_max));
}
const Scalar m_min;
const Scalar m_max;
};
template<typename Scalar>
struct functor_traits<scalar_clamp_op<Scalar> >
{ enum { Cost = 2 * NumTraits<Scalar>::AddCost, PacketAccess = (packet_traits<Scalar>::HasMin && packet_traits<Scalar>::HasMax)}; };
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H
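// Usage sketch: the reducer protocol (initialize / reduce / finalize) used by
// TensorReductionOp, exercised directly on the functors defined above. Assumes the
// Tensor umbrella header is available; not part of the original header.
#include <unsupported/Eigen/CXX11/Tensor>

inline void tensor_reducer_sketch() {
  const float vals[] = {1.0f, 2.0f, 3.0f};

  Eigen::internal::SumReducer<float> sum;
  float sacc = sum.initialize();                     // 0
  for (int i = 0; i < 3; ++i) sum.reduce(vals[i], &sacc);
  const float total = sum.finalize(sacc);            // 6

  Eigen::internal::MeanReducer<float> mean;          // stateful: counts reduced values
  float macc = mean.initialize();
  for (int i = 0; i < 3; ++i) mean.reduce(vals[i], &macc);
  const float avg = mean.finalize(macc);             // 2
  (void)total;
  (void)avg;
}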


@@ -0,0 +1,302 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H
#define EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H
namespace Eigen {
/** \class TensorGeneratorOp
* \ingroup CXX11_Tensor_Module
*
* \brief Tensor generator class.
*
*
*/
namespace internal {
template<typename Generator, typename XprType>
struct traits<TensorGeneratorOp<Generator, XprType> > : public traits<XprType>
{
typedef typename XprType::Scalar Scalar;
typedef traits<XprType> XprTraits;
typedef typename XprTraits::StorageKind StorageKind;
typedef typename XprTraits::Index Index;
typedef typename XprType::Nested Nested;
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int Layout = XprTraits::Layout;
typedef typename XprTraits::PointerType PointerType;
};
template<typename Generator, typename XprType>
struct eval<TensorGeneratorOp<Generator, XprType>, Eigen::Dense>
{
typedef const TensorGeneratorOp<Generator, XprType>& type;
};
template<typename Generator, typename XprType>
struct nested<TensorGeneratorOp<Generator, XprType>, 1, typename eval<TensorGeneratorOp<Generator, XprType> >::type>
{
typedef TensorGeneratorOp<Generator, XprType> type;
};
} // end namespace internal
template<typename Generator, typename XprType>
class TensorGeneratorOp : public TensorBase<TensorGeneratorOp<Generator, XprType>, ReadOnlyAccessors>
{
public:
typedef typename Eigen::internal::traits<TensorGeneratorOp>::Scalar Scalar;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename Eigen::internal::nested<TensorGeneratorOp>::type Nested;
typedef typename Eigen::internal::traits<TensorGeneratorOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorGeneratorOp>::Index Index;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorGeneratorOp(const XprType& expr, const Generator& generator)
: m_xpr(expr), m_generator(generator) {}
EIGEN_DEVICE_FUNC
const Generator& generator() const { return m_generator; }
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename XprType::Nested>::type&
expression() const { return m_xpr; }
protected:
typename XprType::Nested m_xpr;
const Generator m_generator;
};
// Eval as rvalue
template<typename Generator, typename ArgType, typename Device>
struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
{
typedef TensorGeneratorOp<Generator, ArgType> XprType;
typedef typename XprType::Index Index;
typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
static const int NumDims = internal::array_size<Dimensions>::value;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
typedef StorageMemory<CoeffReturnType, Device> Storage;
typedef typename Storage::Type EvaluatorPointerType;
enum {
IsAligned = false,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = true,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
typedef internal::TensorIntDivisor<Index> IndexDivisor;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
typedef typename internal::TensorMaterializedBlock<CoeffReturnType, NumDims,
Layout, Index>
TensorBlock;
//===--------------------------------------------------------------------===//
EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_device(device), m_generator(op.generator())
{
TensorEvaluator<ArgType, Device> argImpl(op.expression(), device);
m_dimensions = argImpl.dimensions();
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
m_strides[0] = 1;
EIGEN_UNROLL_LOOP
for (int i = 1; i < NumDims; ++i) {
m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1];
if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]);
}
} else {
m_strides[NumDims - 1] = 1;
EIGEN_UNROLL_LOOP
for (int i = NumDims - 2; i >= 0; --i) {
m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1];
if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]);
}
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
return true;
}
EIGEN_STRONG_INLINE void cleanup() {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
{
array<Index, NumDims> coords;
extract_coordinates(index, coords);
return m_generator(coords);
}
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
const int packetSize = PacketType<CoeffReturnType, Device>::size;
EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+packetSize-1 < dimensions().TotalSize());
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
for (int i = 0; i < packetSize; ++i) {
values[i] = coeff(index+i);
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
return rslt;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
internal::TensorBlockResourceRequirements getResourceRequirements() const {
const size_t target_size = m_device.firstLevelCacheSize();
// TODO(ezhulenev): Generator should have a cost.
return internal::TensorBlockResourceRequirements::skewed<Scalar>(
target_size);
}
struct BlockIteratorState {
Index stride;
Index span;
Index size;
Index count;
};
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
bool /*root_of_expr_ast*/ = false) const {
static const bool is_col_major =
static_cast<int>(Layout) == static_cast<int>(ColMajor);
// Compute spatial coordinates for the first block element.
array<Index, NumDims> coords;
extract_coordinates(desc.offset(), coords);
array<Index, NumDims> initial_coords = coords;
// Offset in the output block buffer.
Index offset = 0;
// Initialize output block iterator state. Dimensions in this array are
// always in inner-most -> outer-most order (col-major layout).
array<BlockIteratorState, NumDims> it;
for (int i = 0; i < NumDims; ++i) {
const int dim = is_col_major ? i : NumDims - 1 - i;
it[i].size = desc.dimension(dim);
it[i].stride = i == 0 ? 1 : (it[i - 1].size * it[i - 1].stride);
it[i].span = it[i].stride * (it[i].size - 1);
it[i].count = 0;
}
eigen_assert(it[0].stride == 1);
// Prepare storage for the materialized generator result.
const typename TensorBlock::Storage block_storage =
TensorBlock::prepareStorage(desc, scratch);
CoeffReturnType* block_buffer = block_storage.data();
static const int packet_size = PacketType<CoeffReturnType, Device>::size;
static const int inner_dim = is_col_major ? 0 : NumDims - 1;
const Index inner_dim_size = it[0].size;
const Index inner_dim_vectorized = inner_dim_size - packet_size;
while (it[NumDims - 1].count < it[NumDims - 1].size) {
Index i = 0;
// Generate data for the vectorized part of the inner-most dimension.
for (; i <= inner_dim_vectorized; i += packet_size) {
for (Index j = 0; j < packet_size; ++j) {
array<Index, NumDims> j_coords = coords; // Break loop dependence.
j_coords[inner_dim] += j;
*(block_buffer + offset + i + j) = m_generator(j_coords);
}
coords[inner_dim] += packet_size;
}
// Finalize non-vectorized part of the inner-most dimension.
for (; i < inner_dim_size; ++i) {
*(block_buffer + offset + i) = m_generator(coords);
coords[inner_dim]++;
}
coords[inner_dim] = initial_coords[inner_dim];
// For the 1d tensor we need to generate only one inner-most dimension.
if (NumDims == 1) break;
// Update offset.
for (i = 1; i < NumDims; ++i) {
if (++it[i].count < it[i].size) {
offset += it[i].stride;
coords[is_col_major ? i : NumDims - 1 - i]++;
break;
}
if (i != NumDims - 1) it[i].count = 0;
coords[is_col_major ? i : NumDims - 1 - i] =
initial_coords[is_col_major ? i : NumDims - 1 - i];
offset -= it[i].span;
}
}
return block_storage.AsTensorMaterializedBlock();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
costPerCoeff(bool) const {
// TODO(rmlarsen): This is just a placeholder. Define interface to make
// generators return their cost.
return TensorOpCost(0, 0, TensorOpCost::AddCost<Scalar>() +
TensorOpCost::MulCost<Scalar>());
}
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler&) const {}
#endif
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void extract_coordinates(Index index, array<Index, NumDims>& coords) const {
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = NumDims - 1; i > 0; --i) {
const Index idx = index / m_fast_strides[i];
index -= idx * m_strides[i];
coords[i] = idx;
}
coords[0] = index;
} else {
for (int i = 0; i < NumDims - 1; ++i) {
const Index idx = index / m_fast_strides[i];
index -= idx * m_strides[i];
coords[i] = idx;
}
coords[NumDims-1] = index;
}
}
const Device EIGEN_DEVICE_REF m_device;
Dimensions m_dimensions;
array<Index, NumDims> m_strides;
array<IndexDivisor, NumDims> m_fast_strides;
Generator m_generator;
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H
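// Usage sketch: a coordinate-based functor fed to the evaluator above, assuming the
// Tensor umbrella header is available and that TensorBase exposes the generate()
// entry point that builds TensorGeneratorOp. Not part of the original header.
#include <unsupported/Eigen/CXX11/Tensor>

struct IotaGenerator {
  // Called by the evaluator with the multi-dimensional coordinates of each element.
  float operator()(const Eigen::array<Eigen::Index, 2>& coords) const {
    return static_cast<float>(100 * coords[0] + coords[1]);
  }
};

inline void tensor_generator_sketch() {
  Eigen::Tensor<float, 2> shape_holder(4, 5);        // only its dimensions are used
  Eigen::Tensor<float, 2> generated = shape_holder.generate(IotaGenerator());
  // generated(i, j) == 100 * i + j
  (void)generated;
}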


@@ -0,0 +1,33 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Eugene Brevdo <ebrevdo@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H
#define EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H
namespace Eigen {
/** \cpp11 \returns an expression of the coefficient-wise betainc(\a x, \a a, \a b) of the given tensors.
*
* This function computes the regularized incomplete beta function (integral).
*
*/
template <typename ADerived, typename BDerived, typename XDerived>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const
TensorCwiseTernaryOp<internal::scalar_betainc_op<typename XDerived::Scalar>,
const ADerived, const BDerived, const XDerived>
betainc(const ADerived& a, const BDerived& b, const XDerived& x) {
return TensorCwiseTernaryOp<
internal::scalar_betainc_op<typename XDerived::Scalar>, const ADerived,
const BDerived, const XDerived>(
a, b, x, internal::scalar_betainc_op<typename XDerived::Scalar>());
}
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H
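// Usage sketch for the betainc() expression above (values are illustrative):
// each output coefficient is the regularized incomplete beta function
// evaluated at the corresponding coefficients of a, b and x.
#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  Eigen::Tensor<float, 1> a(3), b(3), x(3);
  a.setValues({0.5f, 2.0f, 3.0f});
  b.setValues({2.0f, 2.0f, 1.0f});
  x.setValues({0.1f, 0.5f, 0.9f});
  Eigen::Tensor<float, 1> result = Eigen::betainc(a, b, x);
  std::cout << result << std::endl;
  return 0;
}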

View File

@@ -0,0 +1,99 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
// Copyright (C) 2018 Deven Desai <deven.desai.amd@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H)
#define EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
// Note that we are using EIGEN_USE_HIP here instead of EIGEN_HIPCC. This is by design:
// there is code in the TensorFlow codebase that defines EIGEN_USE_GPU but, for some reason,
// gets sent to the gcc/host compiler instead of the gpu/nvcc/hipcc compiler.
// When compiling such files, gcc will end up trying to pick up the CUDA headers by
// default (see the code within "unsupported/Eigen/CXX11/Tensor" that is guarded by EIGEN_USE_GPU).
// This will obviously not work when trying to compile TensorFlow on a system with no CUDA.
// To work around this issue for HIP systems (and leave the default behaviour intact), the
// HIP TensorFlow build defines EIGEN_USE_HIP when compiling all source files, and
// "unsupported/Eigen/CXX11/Tensor" has been updated to use the HIP header when EIGEN_USE_HIP is
// defined. In continuation of that requirement, the guard here needs to be EIGEN_USE_HIP as well.
#if defined(EIGEN_USE_HIP)
#define gpuStream_t hipStream_t
#define gpuDeviceProp_t hipDeviceProp_t
#define gpuError_t hipError_t
#define gpuSuccess hipSuccess
#define gpuErrorNotReady hipErrorNotReady
#define gpuGetDeviceCount hipGetDeviceCount
#define gpuGetLastError hipGetLastError
#define gpuPeekAtLastError hipPeekAtLastError
#define gpuGetErrorName hipGetErrorName
#define gpuGetErrorString hipGetErrorString
#define gpuGetDeviceProperties hipGetDeviceProperties
#define gpuStreamDefault hipStreamDefault
#define gpuGetDevice hipGetDevice
#define gpuSetDevice hipSetDevice
#define gpuMalloc hipMalloc
#define gpuFree hipFree
#define gpuMemsetAsync hipMemsetAsync
#define gpuMemcpyAsync hipMemcpyAsync
#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#define gpuStreamQuery hipStreamQuery
#define gpuSharedMemConfig hipSharedMemConfig
#define gpuDeviceSetSharedMemConfig hipDeviceSetSharedMemConfig
#define gpuStreamSynchronize hipStreamSynchronize
#define gpuDeviceSynchronize hipDeviceSynchronize
#define gpuMemcpy hipMemcpy
#else
#define gpuStream_t cudaStream_t
#define gpuDeviceProp_t cudaDeviceProp
#define gpuError_t cudaError_t
#define gpuSuccess cudaSuccess
#define gpuErrorNotReady cudaErrorNotReady
#define gpuGetDeviceCount cudaGetDeviceCount
#define gpuGetLastError cudaGetLastError
#define gpuPeekAtLastError cudaPeekAtLastError
#define gpuGetErrorName cudaGetErrorName
#define gpuGetErrorString cudaGetErrorString
#define gpuGetDeviceProperties cudaGetDeviceProperties
#define gpuStreamDefault cudaStreamDefault
#define gpuGetDevice cudaGetDevice
#define gpuSetDevice cudaSetDevice
#define gpuMalloc cudaMalloc
#define gpuFree cudaFree
#define gpuMemsetAsync cudaMemsetAsync
#define gpuMemcpyAsync cudaMemcpyAsync
#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuStreamQuery cudaStreamQuery
#define gpuSharedMemConfig cudaSharedMemConfig
#define gpuDeviceSetSharedMemConfig cudaDeviceSetSharedMemConfig
#define gpuStreamSynchronize cudaStreamSynchronize
#define gpuDeviceSynchronize cudaDeviceSynchronize
#define gpuMemcpy cudaMemcpy
#endif
// gpu_assert can be overridden
#ifndef gpu_assert
#if defined(EIGEN_HIP_DEVICE_COMPILE)
// HIPCC does not support the use of assert on the GPU side.
#define gpu_assert(COND)
#else
#define gpu_assert(COND) assert(COND)
#endif
#endif // gpu_assert
#endif // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
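// Hedged sketch of how the gpu* aliases above are meant to be used (the helper
// function, its name, and the relative include paths are assumptions; the real
// call sites live in the Tensor device code): the same source compiles against
// CUDA or HIP depending on EIGEN_USE_HIP, and the companion undefines header
// restores the macro namespace afterwards.
#include <cstddef>
#include "TensorGpuHipCudaDefines.h"

// Allocate a device buffer and copy n floats into it, using only the gpu*
// aliases so the code stays CUDA/HIP agnostic.
static void copy_to_device(const float* host_src, float** device_dst, std::size_t n) {
  gpuError_t err = gpuMalloc(reinterpret_cast<void**>(device_dst), n * sizeof(float));
  gpu_assert(err == gpuSuccess);
  err = gpuMemcpy(*device_dst, host_src, n * sizeof(float), gpuMemcpyHostToDevice);
  gpu_assert(err == gpuSuccess);
}

#include "TensorGpuHipCudaUndefines.h"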

View File

@@ -0,0 +1,44 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
// Copyright (C) 2018 Deven Desai <deven.desai.amd@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#if defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H)
#ifndef EIGEN_PERMANENTLY_ENABLE_GPU_HIP_CUDA_DEFINES
#undef gpuStream_t
#undef gpuDeviceProp_t
#undef gpuError_t
#undef gpuSuccess
#undef gpuErrorNotReady
#undef gpuGetDeviceCount
#undef gpuGetLastError
#undef gpuPeekAtLastError
#undef gpuGetErrorName
#undef gpuGetErrorString
#undef gpuGetDeviceProperties
#undef gpuStreamDefault
#undef gpuGetDevice
#undef gpuSetDevice
#undef gpuMalloc
#undef gpuFree
#undef gpuMemsetAsync
#undef gpuMemcpyAsync
#undef gpuMemcpyDeviceToDevice
#undef gpuMemcpyDeviceToHost
#undef gpuMemcpyHostToDevice
#undef gpuStreamQuery
#undef gpuSharedMemConfig
#undef gpuDeviceSetSharedMemConfig
#undef gpuStreamSynchronize
#undef gpuDeviceSynchronize
#undef gpuMemcpy
#endif // EIGEN_PERMANENTLY_ENABLE_GPU_HIP_CUDA_DEFINES
#undef EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
#endif // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H

View File

@@ -0,0 +1,79 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_IO_H
#define EIGEN_CXX11_TENSOR_TENSOR_IO_H
namespace Eigen {
namespace internal {
// Print the tensor as a 2d matrix
template <typename Tensor, int Rank>
struct TensorPrinter {
static void run (std::ostream& os, const Tensor& tensor) {
typedef typename internal::remove_const<typename Tensor::Scalar>::type Scalar;
typedef typename Tensor::Index Index;
const Index total_size = internal::array_prod(tensor.dimensions());
if (total_size > 0) {
const Index first_dim = Eigen::internal::array_get<0>(tensor.dimensions());
static const int layout = Tensor::Layout;
Map<const Array<Scalar, Dynamic, Dynamic, layout> > matrix(const_cast<Scalar*>(tensor.data()), first_dim, total_size/first_dim);
os << matrix;
}
}
};
// Print the tensor as a vector
template <typename Tensor>
struct TensorPrinter<Tensor, 1> {
static void run (std::ostream& os, const Tensor& tensor) {
typedef typename internal::remove_const<typename Tensor::Scalar>::type Scalar;
typedef typename Tensor::Index Index;
const Index total_size = internal::array_prod(tensor.dimensions());
if (total_size > 0) {
Map<const Array<Scalar, Dynamic, 1> > array(const_cast<Scalar*>(tensor.data()), total_size);
os << array;
}
}
};
// Print the tensor as a scalar
template <typename Tensor>
struct TensorPrinter<Tensor, 0> {
static void run (std::ostream& os, const Tensor& tensor) {
os << tensor.coeff(0);
}
};
}
template <typename T>
std::ostream& operator << (std::ostream& os, const TensorBase<T, ReadOnlyAccessors>& expr) {
typedef TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice> Evaluator;
typedef typename Evaluator::Dimensions Dimensions;
// Evaluate the expression if needed
TensorForcedEvalOp<const T> eval = expr.eval();
Evaluator tensor(eval, DefaultDevice());
tensor.evalSubExprsIfNeeded(NULL);
// Print the result
static const int rank = internal::array_size<Dimensions>::value;
internal::TensorPrinter<Evaluator, rank>::run(os, tensor);
// Cleanup.
tensor.cleanup();
return os;
}
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_IO_H
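// Usage sketch for the printer above (tensor contents are illustrative):
// rank-2 and higher tensors are rendered as a 2d matrix, rank-1 tensors as a
// column, rank-0 tensors as a single scalar.
#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  Eigen::Tensor<int, 2> t(2, 3);
  t.setValues({{1, 2, 3}, {4, 5, 6}});
  std::cout << t << std::endl;  // printed as a 2x3 matrix
  return 0;
}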

View File

@@ -0,0 +1,603 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H
#define EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H
namespace Eigen {
/** \class TensorImagePatch
* \ingroup CXX11_Tensor_Module
*
* \brief Patch extraction specialized for image processing.
* This assumes that the input has at least 3 dimensions ordered as follows:
* 1st dimension: channels (of size d)
* 2nd dimension: rows (of size r)
* 3rd dimension: columns (of size c)
* There can be additional dimensions such as time (for video) or batch (for
* bulk processing) after the first 3.
* Calling the image patch code with patch_rows and patch_cols is equivalent
* to calling the regular patch extraction code with parameters d, patch_rows,
* patch_cols, and 1 for all the additional dimensions.
*/
namespace internal {
template<DenseIndex Rows, DenseIndex Cols, typename XprType>
struct traits<TensorImagePatchOp<Rows, Cols, XprType> > : public traits<XprType>
{
typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
typedef traits<XprType> XprTraits;
typedef typename XprTraits::StorageKind StorageKind;
typedef typename XprTraits::Index Index;
typedef typename XprType::Nested Nested;
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = XprTraits::NumDimensions + 1;
static const int Layout = XprTraits::Layout;
typedef typename XprTraits::PointerType PointerType;
};
template<DenseIndex Rows, DenseIndex Cols, typename XprType>
struct eval<TensorImagePatchOp<Rows, Cols, XprType>, Eigen::Dense>
{
typedef const TensorImagePatchOp<Rows, Cols, XprType>& type;
};
template<DenseIndex Rows, DenseIndex Cols, typename XprType>
struct nested<TensorImagePatchOp<Rows, Cols, XprType>, 1, typename eval<TensorImagePatchOp<Rows, Cols, XprType> >::type>
{
typedef TensorImagePatchOp<Rows, Cols, XprType> type;
};
template <typename Self, bool Vectorizable>
struct ImagePatchCopyOp {
typedef typename Self::Index Index;
typedef typename Self::Scalar Scalar;
typedef typename Self::Impl Impl;
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
const Self& self, const Index num_coeff_to_copy, const Index dst_index,
Scalar* dst_data, const Index src_index) {
const Impl& impl = self.impl();
for (Index i = 0; i < num_coeff_to_copy; ++i) {
dst_data[dst_index + i] = impl.coeff(src_index + i);
}
}
};
template <typename Self>
struct ImagePatchCopyOp<Self, true> {
typedef typename Self::Index Index;
typedef typename Self::Scalar Scalar;
typedef typename Self::Impl Impl;
typedef typename packet_traits<Scalar>::type Packet;
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
const Self& self, const Index num_coeff_to_copy, const Index dst_index,
Scalar* dst_data, const Index src_index) {
const Impl& impl = self.impl();
const Index packet_size = internal::unpacket_traits<Packet>::size;
const Index vectorized_size =
(num_coeff_to_copy / packet_size) * packet_size;
for (Index i = 0; i < vectorized_size; i += packet_size) {
Packet p = impl.template packet<Unaligned>(src_index + i);
internal::pstoret<Scalar, Packet, Unaligned>(dst_data + dst_index + i, p);
}
for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
dst_data[dst_index + i] = impl.coeff(src_index + i);
}
}
};
template <typename Self>
struct ImagePatchPaddingOp {
typedef typename Self::Index Index;
typedef typename Self::Scalar Scalar;
typedef typename packet_traits<Scalar>::type Packet;
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
const Index num_coeff_to_pad, const Scalar padding_value,
const Index dst_index, Scalar* dst_data) {
const Index packet_size = internal::unpacket_traits<Packet>::size;
const Packet padded_packet = internal::pset1<Packet>(padding_value);
const Index vectorized_size =
(num_coeff_to_pad / packet_size) * packet_size;
for (Index i = 0; i < vectorized_size; i += packet_size) {
internal::pstoret<Scalar, Packet, Unaligned>(dst_data + dst_index + i,
padded_packet);
}
for (Index i = vectorized_size; i < num_coeff_to_pad; ++i) {
dst_data[dst_index + i] = padding_value;
}
}
};
} // end namespace internal
template<DenseIndex Rows, DenseIndex Cols, typename XprType>
class TensorImagePatchOp : public TensorBase<TensorImagePatchOp<Rows, Cols, XprType>, ReadOnlyAccessors>
{
public:
typedef typename Eigen::internal::traits<TensorImagePatchOp>::Scalar Scalar;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename Eigen::internal::nested<TensorImagePatchOp>::type Nested;
typedef typename Eigen::internal::traits<TensorImagePatchOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorImagePatchOp>::Index Index;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, DenseIndex patch_cols,
DenseIndex row_strides, DenseIndex col_strides,
DenseIndex in_row_strides, DenseIndex in_col_strides,
DenseIndex row_inflate_strides, DenseIndex col_inflate_strides,
PaddingType padding_type, Scalar padding_value)
: m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
m_row_strides(row_strides), m_col_strides(col_strides),
m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
m_padding_explicit(false), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0),
m_padding_type(padding_type), m_padding_value(padding_value) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, DenseIndex patch_cols,
DenseIndex row_strides, DenseIndex col_strides,
DenseIndex in_row_strides, DenseIndex in_col_strides,
DenseIndex row_inflate_strides, DenseIndex col_inflate_strides,
DenseIndex padding_top, DenseIndex padding_bottom,
DenseIndex padding_left, DenseIndex padding_right,
Scalar padding_value)
: m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
m_row_strides(row_strides), m_col_strides(col_strides),
m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
m_padding_explicit(true), m_padding_top(padding_top), m_padding_bottom(padding_bottom),
m_padding_left(padding_left), m_padding_right(padding_right),
m_padding_type(PADDING_VALID), m_padding_value(padding_value) {}
EIGEN_DEVICE_FUNC
DenseIndex patch_rows() const { return m_patch_rows; }
EIGEN_DEVICE_FUNC
DenseIndex patch_cols() const { return m_patch_cols; }
EIGEN_DEVICE_FUNC
DenseIndex row_strides() const { return m_row_strides; }
EIGEN_DEVICE_FUNC
DenseIndex col_strides() const { return m_col_strides; }
EIGEN_DEVICE_FUNC
DenseIndex in_row_strides() const { return m_in_row_strides; }
EIGEN_DEVICE_FUNC
DenseIndex in_col_strides() const { return m_in_col_strides; }
EIGEN_DEVICE_FUNC
DenseIndex row_inflate_strides() const { return m_row_inflate_strides; }
EIGEN_DEVICE_FUNC
DenseIndex col_inflate_strides() const { return m_col_inflate_strides; }
EIGEN_DEVICE_FUNC
bool padding_explicit() const { return m_padding_explicit; }
EIGEN_DEVICE_FUNC
DenseIndex padding_top() const { return m_padding_top; }
EIGEN_DEVICE_FUNC
DenseIndex padding_bottom() const { return m_padding_bottom; }
EIGEN_DEVICE_FUNC
DenseIndex padding_left() const { return m_padding_left; }
EIGEN_DEVICE_FUNC
DenseIndex padding_right() const { return m_padding_right; }
EIGEN_DEVICE_FUNC
PaddingType padding_type() const { return m_padding_type; }
EIGEN_DEVICE_FUNC
Scalar padding_value() const { return m_padding_value; }
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename XprType::Nested>::type&
expression() const { return m_xpr; }
protected:
typename XprType::Nested m_xpr;
const DenseIndex m_patch_rows;
const DenseIndex m_patch_cols;
const DenseIndex m_row_strides;
const DenseIndex m_col_strides;
const DenseIndex m_in_row_strides;
const DenseIndex m_in_col_strides;
const DenseIndex m_row_inflate_strides;
const DenseIndex m_col_inflate_strides;
const bool m_padding_explicit;
const DenseIndex m_padding_top;
const DenseIndex m_padding_bottom;
const DenseIndex m_padding_left;
const DenseIndex m_padding_right;
const PaddingType m_padding_type;
const Scalar m_padding_value;
};
// Eval as rvalue
template<DenseIndex Rows, DenseIndex Cols, typename ArgType, typename Device>
struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
{
typedef TensorImagePatchOp<Rows, Cols, ArgType> XprType;
typedef typename XprType::Index Index;
static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
static const int NumDims = NumInputDims + 1;
typedef DSizes<Index, NumDims> Dimensions;
typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
typedef TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>,
Device> Self;
typedef TensorEvaluator<ArgType, Device> Impl;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
typedef StorageMemory<CoeffReturnType, Device> Storage;
typedef typename Storage::Type EvaluatorPointerType;
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlock;
//===--------------------------------------------------------------------===//
EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device)
: m_device(device), m_impl(op.expression(), device)
{
EIGEN_STATIC_ASSERT((NumDims >= 4), YOU_MADE_A_PROGRAMMING_MISTAKE);
m_paddingValue = op.padding_value();
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
// Caches a few variables.
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
m_inputDepth = input_dims[0];
m_inputRows = input_dims[1];
m_inputCols = input_dims[2];
} else {
m_inputDepth = input_dims[NumInputDims-1];
m_inputRows = input_dims[NumInputDims-2];
m_inputCols = input_dims[NumInputDims-3];
}
m_row_strides = op.row_strides();
m_col_strides = op.col_strides();
// Input strides and effective input/patch size
m_in_row_strides = op.in_row_strides();
m_in_col_strides = op.in_col_strides();
m_row_inflate_strides = op.row_inflate_strides();
m_col_inflate_strides = op.col_inflate_strides();
// The "effective" input rows and input cols are the input rows and cols
// after inflating them with zeros.
// For example, a 2x3 matrix with row_inflate_strides and
// col_inflate_strides of 2:
// A B C
// D E F
//
// is inflated into the 3 x 5 matrix:
//
// A . B . C
// . . . . .
// D . E . F
m_input_rows_eff = (m_inputRows - 1) * m_row_inflate_strides + 1;
m_input_cols_eff = (m_inputCols - 1) * m_col_inflate_strides + 1;
m_patch_rows_eff = op.patch_rows() + (op.patch_rows() - 1) * (m_in_row_strides - 1);
m_patch_cols_eff = op.patch_cols() + (op.patch_cols() - 1) * (m_in_col_strides - 1);
if (op.padding_explicit()) {
m_outputRows = numext::ceil((m_input_rows_eff + op.padding_top() + op.padding_bottom() - m_patch_rows_eff + 1.f) / static_cast<float>(m_row_strides));
m_outputCols = numext::ceil((m_input_cols_eff + op.padding_left() + op.padding_right() - m_patch_cols_eff + 1.f) / static_cast<float>(m_col_strides));
m_rowPaddingTop = op.padding_top();
m_colPaddingLeft = op.padding_left();
} else {
// Computing padding from the type
switch (op.padding_type()) {
case PADDING_VALID:
m_outputRows = numext::ceil((m_input_rows_eff - m_patch_rows_eff + 1.f) / static_cast<float>(m_row_strides));
m_outputCols = numext::ceil((m_input_cols_eff - m_patch_cols_eff + 1.f) / static_cast<float>(m_col_strides));
// Calculate the padding
m_rowPaddingTop = numext::maxi<Index>(0, ((m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff) / 2);
m_colPaddingLeft = numext::maxi<Index>(0, ((m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff) / 2);
break;
case PADDING_SAME:
m_outputRows = numext::ceil(m_input_rows_eff / static_cast<float>(m_row_strides));
m_outputCols = numext::ceil(m_input_cols_eff / static_cast<float>(m_col_strides));
// Calculate the padding
m_rowPaddingTop = ((m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff) / 2;
m_colPaddingLeft = ((m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff) / 2;
// The padding size calculation for PADDING_SAME has been updated to
// be consistent with how TensorFlow extracts its paddings.
m_rowPaddingTop = numext::maxi<Index>(0, m_rowPaddingTop);
m_colPaddingLeft = numext::maxi<Index>(0, m_colPaddingLeft);
break;
default:
eigen_assert(false && "unexpected padding");
m_outputCols = 0; // silence the uninitialised warning
m_outputRows = 0; // silence the uninitialised warning
}
}
eigen_assert(m_outputRows > 0);
eigen_assert(m_outputCols > 0);
// Dimensions for result of extraction.
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
// ColMajor
// 0: depth
// 1: patch_rows
// 2: patch_cols
// 3: number of patches
// 4 and beyond: anything else (such as batch).
m_dimensions[0] = input_dims[0];
m_dimensions[1] = op.patch_rows();
m_dimensions[2] = op.patch_cols();
m_dimensions[3] = m_outputRows * m_outputCols;
for (int i = 4; i < NumDims; ++i) {
m_dimensions[i] = input_dims[i-1];
}
} else {
// RowMajor
// NumDims-1: depth
// NumDims-2: patch_rows
// NumDims-3: patch_cols
// NumDims-4: number of patches
// NumDims-5 and beyond: anything else (such as batch).
m_dimensions[NumDims-1] = input_dims[NumInputDims-1];
m_dimensions[NumDims-2] = op.patch_rows();
m_dimensions[NumDims-3] = op.patch_cols();
m_dimensions[NumDims-4] = m_outputRows * m_outputCols;
for (int i = NumDims-5; i >= 0; --i) {
m_dimensions[i] = input_dims[i];
}
}
// Strides for moving the patch in various dimensions.
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
m_colStride = m_dimensions[1];
m_patchStride = m_colStride * m_dimensions[2] * m_dimensions[0];
m_otherStride = m_patchStride * m_dimensions[3];
} else {
m_colStride = m_dimensions[NumDims-2];
m_patchStride = m_colStride * m_dimensions[NumDims-3] * m_dimensions[NumDims-1];
m_otherStride = m_patchStride * m_dimensions[NumDims-4];
}
// Strides for navigating through the input tensor.
m_rowInputStride = m_inputDepth;
m_colInputStride = m_inputDepth * m_inputRows;
m_patchInputStride = m_inputDepth * m_inputRows * m_inputCols;
// Fast representations of different variables.
m_fastOtherStride = internal::TensorIntDivisor<Index>(m_otherStride);
m_fastPatchStride = internal::TensorIntDivisor<Index>(m_patchStride);
m_fastColStride = internal::TensorIntDivisor<Index>(m_colStride);
m_fastInflateRowStride = internal::TensorIntDivisor<Index>(m_row_inflate_strides);
m_fastInflateColStride = internal::TensorIntDivisor<Index>(m_col_inflate_strides);
m_fastInputColsEff = internal::TensorIntDivisor<Index>(m_input_cols_eff);
// Number of patches in the width dimension.
m_fastOutputRows = internal::TensorIntDivisor<Index>(m_outputRows);
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[0]);
} else {
m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[NumDims-1]);
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
m_impl.evalSubExprsIfNeeded(NULL);
return true;
}
#ifdef EIGEN_USE_THREADS
template <typename EvalSubExprsCallback>
EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
EvaluatorPointerType, EvalSubExprsCallback done) {
m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
}
#endif // EIGEN_USE_THREADS
EIGEN_STRONG_INLINE void cleanup() {
m_impl.cleanup();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
{
// Patch index corresponding to the passed in index.
const Index patchIndex = index / m_fastPatchStride;
// Find the offset of the element wrt the location of the first element.
const Index patchOffset = (index - patchIndex * m_patchStride) / m_fastOutputDepth;
// Other ways to index this element.
const Index otherIndex = (NumDims == 4) ? 0 : index / m_fastOtherStride;
const Index patch2DIndex = (NumDims == 4) ? patchIndex : (index - otherIndex * m_otherStride) / m_fastPatchStride;
// Calculate col index in the input original tensor.
const Index colIndex = patch2DIndex / m_fastOutputRows;
const Index colOffset = patchOffset / m_fastColStride;
const Index inputCol = colIndex * m_col_strides + colOffset * m_in_col_strides - m_colPaddingLeft;
const Index origInputCol = (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInflateColStride) : 0);
if (inputCol < 0 || inputCol >= m_input_cols_eff ||
((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides))) {
return Scalar(m_paddingValue);
}
// Calculate row index in the original input tensor.
const Index rowIndex = patch2DIndex - colIndex * m_outputRows;
const Index rowOffset = patchOffset - colOffset * m_colStride;
const Index inputRow = rowIndex * m_row_strides + rowOffset * m_in_row_strides - m_rowPaddingTop;
const Index origInputRow = (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInflateRowStride) : 0);
if (inputRow < 0 || inputRow >= m_input_rows_eff ||
((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) {
return Scalar(m_paddingValue);
}
const int depth_index = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - 1;
const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index];
const Index inputIndex = depth + origInputRow * m_rowInputStride + origInputCol * m_colInputStride + otherIndex * m_patchInputStride;
return m_impl.coeff(inputIndex);
}
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1) {
return packetWithPossibleZero(index);
}
const Index indices[2] = {index, index + PacketSize - 1};
const Index patchIndex = indices[0] / m_fastPatchStride;
if (patchIndex != indices[1] / m_fastPatchStride) {
return packetWithPossibleZero(index);
}
const Index otherIndex = (NumDims == 4) ? 0 : indices[0] / m_fastOtherStride;
eigen_assert(otherIndex == indices[1] / m_fastOtherStride);
// Find the offset of the element wrt the location of the first element.
const Index patchOffsets[2] = {(indices[0] - patchIndex * m_patchStride) / m_fastOutputDepth,
(indices[1] - patchIndex * m_patchStride) / m_fastOutputDepth};
const Index patch2DIndex = (NumDims == 4) ? patchIndex : (indices[0] - otherIndex * m_otherStride) / m_fastPatchStride;
eigen_assert(patch2DIndex == (indices[1] - otherIndex * m_otherStride) / m_fastPatchStride);
const Index colIndex = patch2DIndex / m_fastOutputRows;
const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride, patchOffsets[1] / m_fastColStride};
// Calculate col indices in the original input tensor.
const Index inputCols[2] = {colIndex * m_col_strides + colOffsets[0] -
m_colPaddingLeft, colIndex * m_col_strides + colOffsets[1] - m_colPaddingLeft};
if (inputCols[1] < 0 || inputCols[0] >= m_inputCols) {
return internal::pset1<PacketReturnType>(Scalar(m_paddingValue));
}
if (inputCols[0] == inputCols[1]) {
const Index rowIndex = patch2DIndex - colIndex * m_outputRows;
const Index rowOffsets[2] = {patchOffsets[0] - colOffsets[0]*m_colStride, patchOffsets[1] - colOffsets[1]*m_colStride};
eigen_assert(rowOffsets[0] <= rowOffsets[1]);
// Calculate col indices in the original input tensor.
const Index inputRows[2] = {rowIndex * m_row_strides + rowOffsets[0] -
m_rowPaddingTop, rowIndex * m_row_strides + rowOffsets[1] - m_rowPaddingTop};
if (inputRows[1] < 0 || inputRows[0] >= m_inputRows) {
return internal::pset1<PacketReturnType>(Scalar(m_paddingValue));
}
if (inputRows[0] >= 0 && inputRows[1] < m_inputRows) {
// no padding
const int depth_index = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - 1;
const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index];
const Index inputIndex = depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride + otherIndex * m_patchInputStride;
return m_impl.template packet<Unaligned>(inputIndex);
}
}
return packetWithPossibleZero(index);
}
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowPaddingTop() const { return m_rowPaddingTop; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colPaddingLeft() const { return m_colPaddingLeft; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputRows() const { return m_outputRows; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputCols() const { return m_outputCols; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userRowStride() const { return m_row_strides; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userColStride() const { return m_col_strides; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInRowStride() const { return m_in_row_strides; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInColStride() const { return m_in_col_strides; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowInflateStride() const { return m_row_inflate_strides; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colInflateStride() const { return m_col_inflate_strides; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
costPerCoeff(bool vectorized) const {
// We conservatively estimate the cost for the code path where the computed
// index is inside the original image and
// TensorEvaluator<ArgType, Device>::CoordAccess is false.
const double compute_cost = 3 * TensorOpCost::DivCost<Index>() +
6 * TensorOpCost::MulCost<Index>() +
8 * TensorOpCost::MulCost<Index>();
return m_impl.costPerCoeff(vectorized) +
TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
}
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
{
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
EIGEN_UNROLL_LOOP
for (int i = 0; i < PacketSize; ++i) {
values[i] = coeff(index+i);
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
return rslt;
}
Dimensions m_dimensions;
Index m_otherStride;
Index m_patchStride;
Index m_colStride;
Index m_row_strides;
Index m_col_strides;
Index m_in_row_strides;
Index m_in_col_strides;
Index m_row_inflate_strides;
Index m_col_inflate_strides;
Index m_input_rows_eff;
Index m_input_cols_eff;
Index m_patch_rows_eff;
Index m_patch_cols_eff;
internal::TensorIntDivisor<Index> m_fastOtherStride;
internal::TensorIntDivisor<Index> m_fastPatchStride;
internal::TensorIntDivisor<Index> m_fastColStride;
internal::TensorIntDivisor<Index> m_fastInflateRowStride;
internal::TensorIntDivisor<Index> m_fastInflateColStride;
internal::TensorIntDivisor<Index> m_fastInputColsEff;
Index m_rowInputStride;
Index m_colInputStride;
Index m_patchInputStride;
Index m_inputDepth;
Index m_inputRows;
Index m_inputCols;
Index m_outputRows;
Index m_outputCols;
Index m_rowPaddingTop;
Index m_colPaddingLeft;
internal::TensorIntDivisor<Index> m_fastOutputRows;
internal::TensorIntDivisor<Index> m_fastOutputDepth;
Scalar m_paddingValue;
const Device EIGEN_DEVICE_REF m_device;
TensorEvaluator<ArgType, Device> m_impl;
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H
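// Usage sketch for the image-patch evaluator above, assuming a ColMajor
// rank-3 input laid out as (channels, rows, cols) per the class comment;
// sizes are illustrative. With the default PADDING_SAME and unit strides
// there is one patch per input position.
#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  Eigen::Tensor<float, 3> image(2, 5, 7);  // (channels, rows, cols)
  image.setRandom();
  // The result gains a patch-index dimension:
  // (channels, patch_rows, patch_cols, num_patches) = (2, 3, 3, 5 * 7).
  Eigen::Tensor<float, 4> patches = image.extract_image_patches(3, 3);
  std::cout << patches.dimension(3) << " patches" << std::endl;  // 35
  return 0;
}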

View File

@@ -0,0 +1,738 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H
#define EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H
#if EIGEN_HAS_CONSTEXPR && EIGEN_HAS_VARIADIC_TEMPLATES
#define EIGEN_HAS_INDEX_LIST
namespace Eigen {
/** \internal
*
* \class TensorIndexList
* \ingroup CXX11_Tensor_Module
*
* \brief Set of classes used to encode a set of Tensor dimensions/indices.
*
* The indices in the list can be known at compile time or at runtime. A mix
* of static and dynamic indices can also be provided if needed. The tensor
* code will attempt to take advantage of the indices that are known at
* compile time to optimize the code it generates.
*
* This functionality requires a c++11 compliant compiler. If your compiler
* is older you need to use arrays of indices instead.
*
* Several examples are provided in the cxx11_tensor_index_list.cpp file.
*
* \sa Tensor
*/
template <Index n>
struct type2index {
static const Index value = n;
EIGEN_DEVICE_FUNC constexpr operator Index() const { return n; }
EIGEN_DEVICE_FUNC void set(Index val) {
eigen_assert(val == n);
}
};
// This can be used with IndexPairList to get compile-time constant pairs,
// such as IndexPairList<type2indexpair<1,2>, type2indexpair<3,4>>().
template <Index f, Index s>
struct type2indexpair {
static const Index first = f;
static const Index second = s;
constexpr EIGEN_DEVICE_FUNC operator IndexPair<Index>() const {
return IndexPair<Index>(f, s);
}
EIGEN_DEVICE_FUNC void set(const IndexPair<Index>& val) {
eigen_assert(val.first == f);
eigen_assert(val.second == s);
}
};
template<Index n> struct NumTraits<type2index<n> >
{
typedef Index Real;
enum {
IsComplex = 0,
RequireInitialization = false,
ReadCost = 1,
AddCost = 1,
MulCost = 1
};
EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real epsilon() { return 0; }
EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real dummy_precision() { return 0; }
EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real highest() { return n; }
EIGEN_DEVICE_FUNC static EIGEN_CONSTEXPR EIGEN_STRONG_INLINE Real lowest() { return n; }
};
namespace internal {
template <typename T>
EIGEN_DEVICE_FUNC void update_value(T& val, Index new_val) {
val = internal::convert_index<T>(new_val);
}
template <Index n>
EIGEN_DEVICE_FUNC void update_value(type2index<n>& val, Index new_val) {
val.set(new_val);
}
template <typename T>
EIGEN_DEVICE_FUNC void update_value(T& val, IndexPair<Index> new_val) {
val = new_val;
}
template <Index f, Index s>
EIGEN_DEVICE_FUNC void update_value(type2indexpair<f, s>& val, IndexPair<Index> new_val) {
val.set(new_val);
}
template <typename T>
struct is_compile_time_constant {
static constexpr bool value = false;
};
template <Index idx>
struct is_compile_time_constant<type2index<idx> > {
static constexpr bool value = true;
};
template <Index idx>
struct is_compile_time_constant<const type2index<idx> > {
static constexpr bool value = true;
};
template <Index idx>
struct is_compile_time_constant<type2index<idx>& > {
static constexpr bool value = true;
};
template <Index idx>
struct is_compile_time_constant<const type2index<idx>& > {
static constexpr bool value = true;
};
template <Index f, Index s>
struct is_compile_time_constant<type2indexpair<f, s> > {
static constexpr bool value = true;
};
template <Index f, Index s>
struct is_compile_time_constant<const type2indexpair<f, s> > {
static constexpr bool value = true;
};
template <Index f, Index s>
struct is_compile_time_constant<type2indexpair<f, s>& > {
static constexpr bool value = true;
};
template <Index f, Index s>
struct is_compile_time_constant<const type2indexpair<f, s>& > {
static constexpr bool value = true;
};
template<typename... T>
struct IndexTuple;
template<typename T, typename... O>
struct IndexTuple<T, O...> {
EIGEN_DEVICE_FUNC constexpr IndexTuple() : head(), others() { }
EIGEN_DEVICE_FUNC constexpr IndexTuple(const T& v, const O... o) : head(v), others(o...) { }
constexpr static int count = 1 + sizeof...(O);
T head;
IndexTuple<O...> others;
typedef T Head;
typedef IndexTuple<O...> Other;
};
template<typename T>
struct IndexTuple<T> {
EIGEN_DEVICE_FUNC constexpr IndexTuple() : head() { }
EIGEN_DEVICE_FUNC constexpr IndexTuple(const T& v) : head(v) { }
constexpr static int count = 1;
T head;
typedef T Head;
};
template<int N, typename... T>
struct IndexTupleExtractor;
template<int N, typename T, typename... O>
struct IndexTupleExtractor<N, T, O...> {
typedef typename IndexTupleExtractor<N-1, O...>::ValType ValType;
EIGEN_DEVICE_FUNC static constexpr ValType& get_val(IndexTuple<T, O...>& val) {
return IndexTupleExtractor<N-1, O...>::get_val(val.others);
}
EIGEN_DEVICE_FUNC static constexpr const ValType& get_val(const IndexTuple<T, O...>& val) {
return IndexTupleExtractor<N-1, O...>::get_val(val.others);
}
template <typename V>
EIGEN_DEVICE_FUNC static void set_val(IndexTuple<T, O...>& val, V& new_val) {
IndexTupleExtractor<N-1, O...>::set_val(val.others, new_val);
}
};
template<typename T, typename... O>
struct IndexTupleExtractor<0, T, O...> {
typedef T ValType;
EIGEN_DEVICE_FUNC static constexpr ValType& get_val(IndexTuple<T, O...>& val) {
return val.head;
}
EIGEN_DEVICE_FUNC static constexpr const ValType& get_val(const IndexTuple<T, O...>& val) {
return val.head;
}
template <typename V>
EIGEN_DEVICE_FUNC static void set_val(IndexTuple<T, O...>& val, V& new_val) {
val.head = new_val;
}
};
template <int N, typename T, typename... O>
EIGEN_DEVICE_FUNC constexpr typename IndexTupleExtractor<N, T, O...>::ValType& array_get(IndexTuple<T, O...>& tuple) {
return IndexTupleExtractor<N, T, O...>::get_val(tuple);
}
template <int N, typename T, typename... O>
EIGEN_DEVICE_FUNC constexpr const typename IndexTupleExtractor<N, T, O...>::ValType& array_get(const IndexTuple<T, O...>& tuple) {
return IndexTupleExtractor<N, T, O...>::get_val(tuple);
}
template <typename T, typename... O>
struct array_size<IndexTuple<T, O...> > {
static const size_t value = IndexTuple<T, O...>::count;
};
template <typename T, typename... O>
struct array_size<const IndexTuple<T, O...> > {
static const size_t value = IndexTuple<T, O...>::count;
};
template <Index Idx, typename ValueT>
struct tuple_coeff {
template <typename... T>
EIGEN_DEVICE_FUNC static constexpr ValueT get(const Index i, const IndexTuple<T...>& t) {
// return array_get<Idx>(t) * (i == Idx) + tuple_coeff<Idx-1>::get(i, t) * (i != Idx);
return (i == Idx ? array_get<Idx>(t) : tuple_coeff<Idx-1, ValueT>::get(i, t));
}
template <typename... T>
EIGEN_DEVICE_FUNC static void set(const Index i, IndexTuple<T...>& t, const ValueT& value) {
if (i == Idx) {
update_value(array_get<Idx>(t), value);
} else {
tuple_coeff<Idx-1, ValueT>::set(i, t, value);
}
}
template <typename... T>
EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const Index i, const IndexTuple<T...>& t) {
return ((i == Idx) & is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value) ||
tuple_coeff<Idx-1, ValueT>::value_known_statically(i, t);
}
template <typename... T>
EIGEN_DEVICE_FUNC static constexpr bool values_up_to_known_statically(const IndexTuple<T...>& t) {
return is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value &&
tuple_coeff<Idx-1, ValueT>::values_up_to_known_statically(t);
}
template <typename... T>
EIGEN_DEVICE_FUNC static constexpr bool values_up_to_statically_known_to_increase(const IndexTuple<T...>& t) {
return is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value &&
array_get<Idx>(t) > array_get<Idx-1>(t) &&
tuple_coeff<Idx-1, ValueT>::values_up_to_statically_known_to_increase(t);
}
};
template <typename ValueT>
struct tuple_coeff<0, ValueT> {
template <typename... T>
EIGEN_DEVICE_FUNC static constexpr ValueT get(const Index /*i*/, const IndexTuple<T...>& t) {
// eigen_assert (i == 0); // gcc fails to compile assertions in constexpr
return array_get<0>(t)/* * (i == 0)*/;
}
template <typename... T>
EIGEN_DEVICE_FUNC static void set(const Index i, IndexTuple<T...>& t, const ValueT value) {
eigen_assert (i == 0);
update_value(array_get<0>(t), value);
}
template <typename... T>
EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const Index i, const IndexTuple<T...>&) {
return is_compile_time_constant<typename IndexTupleExtractor<0, T...>::ValType>::value && (i == 0);
}
template <typename... T>
EIGEN_DEVICE_FUNC static constexpr bool values_up_to_known_statically(const IndexTuple<T...>&) {
return is_compile_time_constant<typename IndexTupleExtractor<0, T...>::ValType>::value;
}
template <typename... T>
EIGEN_DEVICE_FUNC static constexpr bool values_up_to_statically_known_to_increase(const IndexTuple<T...>&) {
return true;
}
};
} // namespace internal
template<typename FirstType, typename... OtherTypes>
struct IndexList : internal::IndexTuple<FirstType, OtherTypes...> {
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr Index operator[] (const Index i) const {
return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::get(i, *this);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr Index get(const Index i) const {
return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::get(i, *this);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const Index i, const Index value) {
return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::set(i, *this, value);
}
EIGEN_DEVICE_FUNC constexpr IndexList(const internal::IndexTuple<FirstType, OtherTypes...>& other) : internal::IndexTuple<FirstType, OtherTypes...>(other) { }
EIGEN_DEVICE_FUNC constexpr IndexList(FirstType& first, OtherTypes... other) : internal::IndexTuple<FirstType, OtherTypes...>(first, other...) { }
EIGEN_DEVICE_FUNC constexpr IndexList() : internal::IndexTuple<FirstType, OtherTypes...>() { }
EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const Index i) const {
return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::value_known_statically(i, *this);
}
EIGEN_DEVICE_FUNC constexpr bool all_values_known_statically() const {
return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::values_up_to_known_statically(*this);
}
EIGEN_DEVICE_FUNC constexpr bool values_statically_known_to_increase() const {
return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::values_up_to_statically_known_to_increase(*this);
}
};
template <typename FirstType, typename... OtherTypes>
std::ostream& operator<<(std::ostream& os,
const IndexList<FirstType, OtherTypes...>& dims) {
os << "[";
for (size_t i = 0; i < 1 + sizeof...(OtherTypes); ++i) {
if (i > 0) os << ", ";
os << dims[i];
}
os << "]";
return os;
}
template<typename FirstType, typename... OtherTypes>
constexpr IndexList<FirstType, OtherTypes...> make_index_list(FirstType val1, OtherTypes... other_vals) {
return IndexList<FirstType, OtherTypes...>(val1, other_vals...);
}
template<typename FirstType, typename... OtherTypes>
struct IndexPairList : internal::IndexTuple<FirstType, OtherTypes...> {
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr IndexPair<Index> operator[] (const Index i) const {
return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, IndexPair<Index>>::get(i, *this);
}
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const Index i, const IndexPair<Index> value) {
return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...>>::value-1, IndexPair<Index> >::set(i, *this, value);
}
EIGEN_DEVICE_FUNC constexpr IndexPairList(const internal::IndexTuple<FirstType, OtherTypes...>& other) : internal::IndexTuple<FirstType, OtherTypes...>(other) { }
EIGEN_DEVICE_FUNC constexpr IndexPairList() : internal::IndexTuple<FirstType, OtherTypes...>() { }
EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const Index i) const {
return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::value_known_statically(i, *this);
}
};
namespace internal {
template<typename FirstType, typename... OtherTypes>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index array_prod(const IndexList<FirstType, OtherTypes...>& sizes) {
Index result = 1;
EIGEN_UNROLL_LOOP
for (size_t i = 0; i < array_size<IndexList<FirstType, OtherTypes...> >::value; ++i) {
result *= sizes[i];
}
return result;
}
template<typename FirstType, typename... OtherTypes> struct array_size<IndexList<FirstType, OtherTypes...> > {
static const size_t value = array_size<IndexTuple<FirstType, OtherTypes...> >::value;
};
template<typename FirstType, typename... OtherTypes> struct array_size<const IndexList<FirstType, OtherTypes...> > {
static const size_t value = array_size<IndexTuple<FirstType, OtherTypes...> >::value;
};
template<typename FirstType, typename... OtherTypes> struct array_size<IndexPairList<FirstType, OtherTypes...> > {
static const size_t value = std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value;
};
template<typename FirstType, typename... OtherTypes> struct array_size<const IndexPairList<FirstType, OtherTypes...> > {
static const size_t value = std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value;
};
template<Index N, typename FirstType, typename... OtherTypes> EIGEN_DEVICE_FUNC constexpr Index array_get(IndexList<FirstType, OtherTypes...>& a) {
return IndexTupleExtractor<N, FirstType, OtherTypes...>::get_val(a);
}
template<Index N, typename FirstType, typename... OtherTypes> EIGEN_DEVICE_FUNC constexpr Index array_get(const IndexList<FirstType, OtherTypes...>& a) {
return IndexTupleExtractor<N, FirstType, OtherTypes...>::get_val(a);
}
template <typename T>
struct index_known_statically_impl {
EIGEN_DEVICE_FUNC static constexpr bool run(const Index) {
return false;
}
};
template <typename FirstType, typename... OtherTypes>
struct index_known_statically_impl<IndexList<FirstType, OtherTypes...> > {
EIGEN_DEVICE_FUNC static constexpr bool run(const Index i) {
return IndexList<FirstType, OtherTypes...>().value_known_statically(i);
}
};
template <typename FirstType, typename... OtherTypes>
struct index_known_statically_impl<const IndexList<FirstType, OtherTypes...> > {
EIGEN_DEVICE_FUNC static constexpr bool run(const Index i) {
return IndexList<FirstType, OtherTypes...>().value_known_statically(i);
}
};
template <typename T>
struct all_indices_known_statically_impl {
static constexpr bool run() {
return false;
}
};
template <typename FirstType, typename... OtherTypes>
struct all_indices_known_statically_impl<IndexList<FirstType, OtherTypes...> > {
EIGEN_DEVICE_FUNC static constexpr bool run() {
return IndexList<FirstType, OtherTypes...>().all_values_known_statically();
}
};
template <typename FirstType, typename... OtherTypes>
struct all_indices_known_statically_impl<const IndexList<FirstType, OtherTypes...> > {
EIGEN_DEVICE_FUNC static constexpr bool run() {
return IndexList<FirstType, OtherTypes...>().all_values_known_statically();
}
};
template <typename T>
struct indices_statically_known_to_increase_impl {
EIGEN_DEVICE_FUNC static constexpr bool run() {
return false;
}
};
template <typename FirstType, typename... OtherTypes>
struct indices_statically_known_to_increase_impl<IndexList<FirstType, OtherTypes...> > {
EIGEN_DEVICE_FUNC static constexpr bool run() {
return Eigen::IndexList<FirstType, OtherTypes...>().values_statically_known_to_increase();
}
};
template <typename FirstType, typename... OtherTypes>
struct indices_statically_known_to_increase_impl<const IndexList<FirstType, OtherTypes...> > {
EIGEN_DEVICE_FUNC static constexpr bool run() {
return Eigen::IndexList<FirstType, OtherTypes...>().values_statically_known_to_increase();
}
};
template <typename Tx>
struct index_statically_eq_impl {
EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) {
return false;
}
};
template <typename FirstType, typename... OtherTypes>
struct index_statically_eq_impl<IndexList<FirstType, OtherTypes...> > {
EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
(IndexList<FirstType, OtherTypes...>().get(i) == value);
}
};
template <typename FirstType, typename... OtherTypes>
struct index_statically_eq_impl<const IndexList<FirstType, OtherTypes...> > {
EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
(IndexList<FirstType, OtherTypes...>().get(i) == value);
}
};
template <typename T>
struct index_statically_ne_impl {
EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) {
return false;
}
};
template <typename FirstType, typename... OtherTypes>
struct index_statically_ne_impl<IndexList<FirstType, OtherTypes...> > {
EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
(IndexList<FirstType, OtherTypes...>().get(i) != value);
}
};
template <typename FirstType, typename... OtherTypes>
struct index_statically_ne_impl<const IndexList<FirstType, OtherTypes...> > {
EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
(IndexList<FirstType, OtherTypes...>().get(i) != value);
}
};
template <typename T>
struct index_statically_gt_impl {
EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) {
return false;
}
};
template <typename FirstType, typename... OtherTypes>
struct index_statically_gt_impl<IndexList<FirstType, OtherTypes...> > {
EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
(IndexList<FirstType, OtherTypes...>().get(i) > value);
}
};
template <typename FirstType, typename... OtherTypes>
struct index_statically_gt_impl<const IndexList<FirstType, OtherTypes...> > {
EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
(IndexList<FirstType, OtherTypes...>().get(i) > value);
}
};
template <typename T>
struct index_statically_lt_impl {
EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) {
return false;
}
};
template <typename FirstType, typename... OtherTypes>
struct index_statically_lt_impl<IndexList<FirstType, OtherTypes...> > {
EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
(IndexList<FirstType, OtherTypes...>().get(i) < value);
}
};
template <typename FirstType, typename... OtherTypes>
struct index_statically_lt_impl<const IndexList<FirstType, OtherTypes...> > {
EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
(IndexList<FirstType, OtherTypes...>().get(i) < value);
}
};
template <typename Tx>
struct index_pair_first_statically_eq_impl {
EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) {
return false;
}
};
template <typename FirstType, typename... OtherTypes>
struct index_pair_first_statically_eq_impl<IndexPairList<FirstType, OtherTypes...> > {
EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &
(IndexPairList<FirstType, OtherTypes...>().operator[](i).first == value);
}
};
template <typename FirstType, typename... OtherTypes>
struct index_pair_first_statically_eq_impl<const IndexPairList<FirstType, OtherTypes...> > {
EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &
(IndexPairList<FirstType, OtherTypes...>().operator[](i).first == value);
}
};
template <typename Tx>
struct index_pair_second_statically_eq_impl {
EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) {
return false;
}
};
template <typename FirstType, typename... OtherTypes>
struct index_pair_second_statically_eq_impl<IndexPairList<FirstType, OtherTypes...> > {
EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &
(IndexPairList<FirstType, OtherTypes...>().operator[](i).second == value);
}
};
template <typename FirstType, typename... OtherTypes>
struct index_pair_second_statically_eq_impl<const IndexPairList<FirstType, OtherTypes...> > {
EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &
(IndexPairList<FirstType, OtherTypes...>().operator[](i).second == value);
}
};
} // end namespace internal
} // end namespace Eigen
#else
namespace Eigen {
namespace internal {
template <typename T>
struct index_known_statically_impl {
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const Index) {
return false;
}
};
template <typename T>
struct all_indices_known_statically_impl {
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() {
return false;
}
};
template <typename T>
struct indices_statically_known_to_increase_impl {
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() {
return false;
}
};
template <typename T>
struct index_statically_eq_impl {
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) {
return false;
}
};
template <typename T>
struct index_statically_ne_impl {
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) {
return false;
}
};
template <typename T>
struct index_statically_gt_impl {
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) {
return false;
}
};
template <typename T>
struct index_statically_lt_impl {
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) {
return false;
}
};
template <typename Tx>
struct index_pair_first_statically_eq_impl {
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) {
return false;
}
};
template <typename Tx>
struct index_pair_second_statically_eq_impl {
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) {
return false;
}
};
} // end namespace internal
} // end namespace Eigen
#endif
namespace Eigen {
namespace internal {
template <typename T>
static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_known_statically(Index i) {
return index_known_statically_impl<T>::run(i);
}
template <typename T>
static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool all_indices_known_statically() {
return all_indices_known_statically_impl<T>::run();
}
template <typename T>
static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool indices_statically_known_to_increase() {
return indices_statically_known_to_increase_impl<T>::run();
}
template <typename T>
static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_eq(Index i, Index value) {
return index_statically_eq_impl<T>::run(i, value);
}
template <typename T>
static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_ne(Index i, Index value) {
return index_statically_ne_impl<T>::run(i, value);
}
template <typename T>
static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_gt(Index i, Index value) {
return index_statically_gt_impl<T>::run(i, value);
}
template <typename T>
static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_lt(Index i, Index value) {
return index_statically_lt_impl<T>::run(i, value);
}
template <typename T>
static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_first_statically_eq(Index i, Index value) {
return index_pair_first_statically_eq_impl<T>::run(i, value);
}
template <typename T>
static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_second_statically_eq(Index i, Index value) {
return index_pair_second_statically_eq_impl<T>::run(i, value);
}
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H

View File

@@ -0,0 +1,247 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015 Ke Yang <yangke@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H
#define EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H
namespace Eigen {
/** \class TensorInflation
* \ingroup CXX11_Tensor_Module
*
* \brief Tensor inflation class.
*
* Inflation inserts strides[i] - 1 zero coefficients between each pair of adjacent
* coefficients along dimension i, so a dimension of size d becomes (d - 1) * strides[i] + 1.
* Output positions that fall into one of these holes read as Scalar(0).
*/
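// Illustrative usage (a sketch, assuming the usual TensorBase::inflate API):
//
//   Eigen::Tensor<float, 1> input(3);
//   input.setValues({1.f, 2.f, 3.f});
//   Eigen::array<Eigen::Index, 1> strides{{2}};
//   Eigen::Tensor<float, 1> inflated = input.inflate(strides);
//   // inflated has (3 - 1) * 2 + 1 = 5 coefficients: 1, 0, 2, 0, 3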
namespace internal {
template<typename Strides, typename XprType>
struct traits<TensorInflationOp<Strides, XprType> > : public traits<XprType>
{
typedef typename XprType::Scalar Scalar;
typedef traits<XprType> XprTraits;
typedef typename XprTraits::StorageKind StorageKind;
typedef typename XprTraits::Index Index;
typedef typename XprType::Nested Nested;
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int Layout = XprTraits::Layout;
typedef typename XprTraits::PointerType PointerType;
};
template<typename Strides, typename XprType>
struct eval<TensorInflationOp<Strides, XprType>, Eigen::Dense>
{
typedef const TensorInflationOp<Strides, XprType>& type;
};
template<typename Strides, typename XprType>
struct nested<TensorInflationOp<Strides, XprType>, 1, typename eval<TensorInflationOp<Strides, XprType> >::type>
{
typedef TensorInflationOp<Strides, XprType> type;
};
} // end namespace internal
template<typename Strides, typename XprType>
class TensorInflationOp : public TensorBase<TensorInflationOp<Strides, XprType>, ReadOnlyAccessors>
{
public:
typedef typename Eigen::internal::traits<TensorInflationOp>::Scalar Scalar;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename Eigen::internal::nested<TensorInflationOp>::type Nested;
typedef typename Eigen::internal::traits<TensorInflationOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorInflationOp>::Index Index;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorInflationOp(const XprType& expr, const Strides& strides)
: m_xpr(expr), m_strides(strides) {}
EIGEN_DEVICE_FUNC
const Strides& strides() const { return m_strides; }
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename XprType::Nested>::type&
expression() const { return m_xpr; }
protected:
typename XprType::Nested m_xpr;
const Strides m_strides;
};
// Eval as rvalue
template<typename Strides, typename ArgType, typename Device>
struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
{
typedef TensorInflationOp<Strides, ArgType> XprType;
typedef typename XprType::Index Index;
static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
typedef DSizes<Index, NumDims> Dimensions;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
typedef StorageMemory<CoeffReturnType, Device> Storage;
typedef typename Storage::Type EvaluatorPointerType;
enum {
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlock;
//===--------------------------------------------------------------------===//
EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_strides(op.strides())
{
m_dimensions = m_impl.dimensions();
// Expand each dimension to the inflated dimension.
for (int i = 0; i < NumDims; ++i) {
m_dimensions[i] = (m_dimensions[i] - 1) * op.strides()[i] + 1;
}
// Remember the strides for fast division.
for (int i = 0; i < NumDims; ++i) {
m_fastStrides[i] = internal::TensorIntDivisor<Index>(m_strides[i]);
}
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
m_outputStrides[0] = 1;
m_inputStrides[0] = 1;
for (int i = 1; i < NumDims; ++i) {
m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
}
} else { // RowMajor
m_outputStrides[NumDims-1] = 1;
m_inputStrides[NumDims-1] = 1;
for (int i = NumDims - 2; i >= 0; --i) {
m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1];
m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
}
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
m_impl.evalSubExprsIfNeeded(NULL);
return true;
}
EIGEN_STRONG_INLINE void cleanup() {
m_impl.cleanup();
}
// Computes the input index given the output index. Returns true if the output
// index doesn't fall into a hole.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool getInputIndex(Index index, Index* inputIndex) const
{
eigen_assert(index < dimensions().TotalSize());
*inputIndex = 0;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
EIGEN_UNROLL_LOOP
for (int i = NumDims - 1; i > 0; --i) {
const Index idx = index / m_outputStrides[i];
if (idx != idx / m_fastStrides[i] * m_strides[i]) {
return false;
}
*inputIndex += idx / m_strides[i] * m_inputStrides[i];
index -= idx * m_outputStrides[i];
}
if (index != index / m_fastStrides[0] * m_strides[0]) {
return false;
}
*inputIndex += index / m_strides[0];
return true;
} else {
EIGEN_UNROLL_LOOP
for (int i = 0; i < NumDims - 1; ++i) {
const Index idx = index / m_outputStrides[i];
if (idx != idx / m_fastStrides[i] * m_strides[i]) {
return false;
}
*inputIndex += idx / m_strides[i] * m_inputStrides[i];
index -= idx * m_outputStrides[i];
}
if (index != index / m_fastStrides[NumDims-1] * m_strides[NumDims-1]) {
return false;
}
*inputIndex += index / m_strides[NumDims - 1];
}
return true;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
{
Index inputIndex = 0;
if (getInputIndex(index, &inputIndex)) {
return m_impl.coeff(inputIndex);
} else {
return Scalar(0);
}
}
// TODO(yangke): optimize this function so that we can detect and produce
// all-zero packets
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
EIGEN_UNROLL_LOOP
for (int i = 0; i < PacketSize; ++i) {
values[i] = coeff(index+i);
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
return rslt;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
const double compute_cost = NumDims * (3 * TensorOpCost::DivCost<Index>() +
3 * TensorOpCost::MulCost<Index>() +
2 * TensorOpCost::AddCost<Index>());
const double input_size = m_impl.dimensions().TotalSize();
const double output_size = m_dimensions.TotalSize();
if (output_size == 0)
return TensorOpCost();
return m_impl.costPerCoeff(vectorized) +
TensorOpCost(sizeof(CoeffReturnType) * input_size / output_size, 0,
compute_cost, vectorized, PacketSize);
}
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif
protected:
Dimensions m_dimensions;
array<Index, NumDims> m_outputStrides;
array<Index, NumDims> m_inputStrides;
TensorEvaluator<ArgType, Device> m_impl;
const Strides m_strides;
array<internal::TensorIntDivisor<Index>, NumDims> m_fastStrides;
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H

View File

@@ -0,0 +1,82 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H
#define EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H
#if EIGEN_HAS_VARIADIC_TEMPLATES
#include <initializer_list>
namespace Eigen {
/** \class TensorInitializer
* \ingroup CXX11_Tensor_Module
*
* \brief Helper template to initialize Tensors from std::initializer_lists.
*/
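// Illustrative usage (a sketch): the nested initializer lists below are consumed recursively
// by Initializer<Derived, N>, one nesting level per tensor dimension (assuming the public
// Eigen::Tensor::setValues API, which forwards to initialize_tensor):
//
//   Eigen::Tensor<int, 2> t(2, 3);
//   t.setValues({{1, 2, 3},
//                {4, 5, 6}});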
namespace internal {
template <typename Derived, int N>
struct Initializer {
typedef std::initializer_list<
typename Initializer<Derived, N - 1>::InitList> InitList;
static void run(TensorEvaluator<Derived, DefaultDevice>& tensor,
Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions>* indices,
const InitList& vals) {
int i = 0;
for (const auto& v : vals) {
(*indices)[traits<Derived>::NumDimensions - N] = i++;
Initializer<Derived, N - 1>::run(tensor, indices, v);
}
}
};
template <typename Derived>
struct Initializer<Derived, 1> {
typedef std::initializer_list<typename traits<Derived>::Scalar> InitList;
static void run(TensorEvaluator<Derived, DefaultDevice>& tensor,
Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions>* indices,
const InitList& vals) {
int i = 0;
// There is likely a faster way to do that than iterating.
for (const auto& v : vals) {
(*indices)[traits<Derived>::NumDimensions - 1] = i++;
tensor.coeffRef(*indices) = v;
}
}
};
template <typename Derived>
struct Initializer<Derived, 0> {
typedef typename traits<Derived>::Scalar InitList;
static void run(TensorEvaluator<Derived, DefaultDevice>& tensor,
Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions>*,
const InitList& v) {
tensor.coeffRef(0) = v;
}
};
template <typename Derived, int N>
void initialize_tensor(TensorEvaluator<Derived, DefaultDevice>& tensor,
const typename Initializer<Derived, traits<Derived>::NumDimensions>::InitList& vals) {
Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions> indices;
Initializer<Derived, traits<Derived>::NumDimensions>::run(tensor, &indices, vals);
}
} // namespace internal
} // namespace Eigen
#endif // EIGEN_HAS_VARIADIC_TEMPLATES
#endif // EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H

View File

@@ -0,0 +1,263 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H
#define EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H
namespace Eigen {
/** \internal
*
* \class TensorIntDiv
* \ingroup CXX11_Tensor_Module
*
* \brief Fast integer division by a constant.
*
* See the paper from Granlund and Montgomery for explanation.
* (at https://doi.org/10.1145/773473.178249)
*
* \sa Tensor
*/
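// Illustrative usage (a sketch): build the divisor once, then reuse it wherever the same
// division is performed repeatedly (operator/ is overloaded at the end of this file).
//
//   Eigen::internal::TensorIntDivisor<Eigen::Index> fast_div(7);
//   Eigen::Index quotient = Eigen::Index(100) / fast_div;  // 14, same as 100 / 7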
namespace internal {
namespace {
// Note: result is undefined if val == 0
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
typename internal::enable_if<sizeof(T)==4,int>::type count_leading_zeros(const T val)
{
#ifdef EIGEN_GPU_COMPILE_PHASE
return __clz(val);
#elif defined(SYCL_DEVICE_ONLY)
return cl::sycl::clz(val);
#elif EIGEN_COMP_MSVC
unsigned long index;
_BitScanReverse(&index, val);
return 31 - index;
#else
EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE);
return __builtin_clz(static_cast<uint32_t>(val));
#endif
}
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
typename internal::enable_if<sizeof(T)==8,int>::type count_leading_zeros(const T val)
{
#ifdef EIGEN_GPU_COMPILE_PHASE
return __clzll(val);
#elif defined(SYCL_DEVICE_ONLY)
return static_cast<int>(cl::sycl::clz(val));
#elif EIGEN_COMP_MSVC && EIGEN_ARCH_x86_64
unsigned long index;
_BitScanReverse64(&index, val);
return 63 - index;
#elif EIGEN_COMP_MSVC
// MSVC's _BitScanReverse64 is not available for 32-bit builds.
unsigned int lo = (unsigned int)(val&0xffffffff);
unsigned int hi = (unsigned int)((val>>32)&0xffffffff);
int n;
if(hi==0)
n = 32 + count_leading_zeros<unsigned int>(lo);
else
n = count_leading_zeros<unsigned int>(hi);
return n;
#else
EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE);
return __builtin_clzll(static_cast<uint64_t>(val));
#endif
}
template <typename T>
struct UnsignedTraits {
typedef typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type type;
};
template <typename T>
struct DividerTraits {
typedef typename UnsignedTraits<T>::type type;
static const int N = sizeof(T) * 8;
};
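// muluh returns the upper half (the high 32 or 64 bits) of the full-width product a * b.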
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) {
#if defined(EIGEN_GPU_COMPILE_PHASE)
return __umulhi(a, b);
#elif defined(SYCL_DEVICE_ONLY)
return cl::sycl::mul_hi(a, static_cast<uint32_t>(b));
#else
return (static_cast<uint64_t>(a) * b) >> 32;
#endif
}
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) {
#if defined(EIGEN_GPU_COMPILE_PHASE)
return __umul64hi(a, b);
#elif defined(SYCL_DEVICE_ONLY)
return cl::sycl::mul_hi(a, static_cast<uint64_t>(b));
#elif EIGEN_HAS_BUILTIN_INT128
__uint128_t v = static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b);
return static_cast<uint64_t>(v >> 64);
#else
return (TensorUInt128<static_val<0>, uint64_t>(a) * TensorUInt128<static_val<0>, uint64_t>(b)).upper();
#endif
}
template <int N, typename T>
struct DividerHelper {
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t computeMultiplier(const int log_div, const T divider) {
EIGEN_STATIC_ASSERT(N == 32, YOU_MADE_A_PROGRAMMING_MISTAKE);
return static_cast<uint32_t>((static_cast<uint64_t>(1) << (N+log_div)) / divider - (static_cast<uint64_t>(1) << N) + 1);
}
};
template <typename T>
struct DividerHelper<64, T> {
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) {
#if EIGEN_HAS_BUILTIN_INT128 && !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY)
return static_cast<uint64_t>((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1);
#else
const uint64_t shift = 1ULL << log_div;
TensorUInt128<uint64_t, uint64_t> result = TensorUInt128<uint64_t, static_val<0> >(shift, 0) / TensorUInt128<static_val<0>, uint64_t>(divider)
- TensorUInt128<static_val<1>, static_val<0> >(1, 0)
+ TensorUInt128<static_val<0>, static_val<1> >(1);
return static_cast<uint64_t>(result);
#endif
}
};
}
template <typename T, bool div_gt_one = false>
struct TensorIntDivisor {
public:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() {
multiplier = 0;
shift1 = 0;
shift2 = 0;
}
// Must have 0 < divider < 2^31. This is relaxed to
// 0 < divider < 2^63 when using 64-bit indices on platforms that support
// the __uint128_t type.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor(const T divider) {
const int N = DividerTraits<T>::N;
eigen_assert(static_cast<typename UnsignedTraits<T>::type>(divider) < NumTraits<UnsignedType>::highest()/2);
eigen_assert(divider > 0);
// fast ln2
const int leading_zeros = count_leading_zeros(static_cast<UnsignedType>(divider));
int log_div = N - leading_zeros;
// if divider is a power of two then log_div is 1 more than it should be.
if ((static_cast<typename UnsignedTraits<T>::type>(1) << (log_div-1)) == static_cast<typename UnsignedTraits<T>::type>(divider))
log_div--;
multiplier = DividerHelper<N, T>::computeMultiplier(log_div, divider);
shift1 = log_div > 1 ? 1 : log_div;
shift2 = log_div > 1 ? log_div-1 : 0;
}
// Must have 0 <= numerator. On platforms that don't support the __uint128_t
// type numerator should also be less than 2^32-1.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T divide(const T numerator) const {
eigen_assert(static_cast<typename UnsignedTraits<T>::type>(numerator) < NumTraits<UnsignedType>::highest()/2);
//eigen_assert(numerator >= 0); // this is implicitly asserted by the line above
UnsignedType t1 = muluh(multiplier, numerator);
UnsignedType t = (static_cast<UnsignedType>(numerator) - t1) >> shift1;
return (t1 + t) >> shift2;
}
private:
typedef typename DividerTraits<T>::type UnsignedType;
UnsignedType multiplier;
int32_t shift1;
int32_t shift2;
};
// Optimized version for signed 32 bit integers.
// Derived from Hacker's Delight.
// Only works for divisors strictly greater than one
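// The constructor precomputes a (magic, shift) pair such that, for any non-negative 32-bit n,
// n / d equals the upper 32 bits of magic * n, shifted right by `shift` (see divide() below).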
template <>
class TensorIntDivisor<int32_t, true> {
public:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() {
magic = 0;
shift = 0;
}
// Must have 2 <= divider
EIGEN_DEVICE_FUNC TensorIntDivisor(int32_t divider) {
eigen_assert(divider >= 2);
calcMagic(divider);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const {
#ifdef EIGEN_GPU_COMPILE_PHASE
return (__umulhi(magic, n) >> shift);
#elif defined(SYCL_DEVICE_ONLY)
return (cl::sycl::mul_hi(magic, static_cast<uint32_t>(n)) >> shift);
#else
uint64_t v = static_cast<uint64_t>(magic) * static_cast<uint64_t>(n);
return (static_cast<uint32_t>(v >> 32) >> shift);
#endif
}
private:
// Compute the magic numbers. See Hacker's Delight, section 10, for an
// in-depth explanation.
EIGEN_DEVICE_FUNC void calcMagic(int32_t d) {
const unsigned two31 = 0x80000000; // 2**31.
unsigned ad = d;
unsigned t = two31 + (ad >> 31);
unsigned anc = t - 1 - t%ad; // Absolute value of nc.
int p = 31; // Init. p.
unsigned q1 = two31/anc; // Init. q1 = 2**p/|nc|.
unsigned r1 = two31 - q1*anc; // Init. r1 = rem(2**p, |nc|).
unsigned q2 = two31/ad; // Init. q2 = 2**p/|d|.
unsigned r2 = two31 - q2*ad; // Init. r2 = rem(2**p, |d|).
unsigned delta = 0;
do {
p = p + 1;
q1 = 2*q1; // Update q1 = 2**p/|nc|.
r1 = 2*r1; // Update r1 = rem(2**p, |nc|).
if (r1 >= anc) { // (Must be an unsigned
q1 = q1 + 1; // comparison here).
r1 = r1 - anc;}
q2 = 2*q2; // Update q2 = 2**p/|d|.
r2 = 2*r2; // Update r2 = rem(2**p, |d|).
if (r2 >= ad) { // (Must be an unsigned
q2 = q2 + 1; // comparison here).
r2 = r2 - ad;}
delta = ad - r2;
} while (q1 < delta || (q1 == delta && r1 == 0));
magic = (unsigned)(q2 + 1);
shift = p - 32;
}
uint32_t magic;
int32_t shift;
};
template <typename T, bool div_gt_one>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator / (const T& numerator, const TensorIntDivisor<T, div_gt_one>& divisor) {
return divisor.divide(numerator);
}
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H

View File

@@ -0,0 +1,216 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H
#define EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H
namespace Eigen {
/** \class TensorLayoutSwap
* \ingroup CXX11_Tensor_Module
*
* \brief Swap the layout from col-major to row-major, or row-major
* to col-major, and invert the order of the dimensions.
*
* Beware: the dimensions are reversed by this operation. If you want to
* preserve the ordering of the dimensions, you need to combine this
* operation with a shuffle.
*
* Example:
* \code
* Tensor<float, 2, ColMajor> input(2, 4);
* Tensor<float, 2, RowMajor> output = input.swap_layout();
* eigen_assert(output.dimension(0) == 4);
* eigen_assert(output.dimension(1) == 2);
*
* array<int, 2> shuffle(1, 0);
* output = input.swap_layout().shuffle(shuffle);
* eigen_assert(output.dimension(0) == 2);
* eigen_assert(output.dimension(1) == 4);
* \endcode
*
*/
namespace internal {
template<typename XprType>
struct traits<TensorLayoutSwapOp<XprType> > : public traits<XprType>
{
typedef typename XprType::Scalar Scalar;
typedef traits<XprType> XprTraits;
typedef typename XprTraits::StorageKind StorageKind;
typedef typename XprTraits::Index Index;
typedef typename XprType::Nested Nested;
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = traits<XprType>::NumDimensions;
static const int Layout = (traits<XprType>::Layout == ColMajor) ? RowMajor : ColMajor;
typedef typename XprTraits::PointerType PointerType;
};
template<typename XprType>
struct eval<TensorLayoutSwapOp<XprType>, Eigen::Dense>
{
typedef const TensorLayoutSwapOp<XprType>& type;
};
template<typename XprType>
struct nested<TensorLayoutSwapOp<XprType>, 1, typename eval<TensorLayoutSwapOp<XprType> >::type>
{
typedef TensorLayoutSwapOp<XprType> type;
};
} // end namespace internal
template<typename XprType>
class TensorLayoutSwapOp : public TensorBase<TensorLayoutSwapOp<XprType>, WriteAccessors>
{
public:
typedef TensorBase<TensorLayoutSwapOp<XprType>, WriteAccessors> Base;
typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Scalar Scalar;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
typedef typename Eigen::internal::nested<TensorLayoutSwapOp>::type Nested;
typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Index Index;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorLayoutSwapOp(const XprType& expr)
: m_xpr(expr) {}
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename XprType::Nested>::type&
expression() const { return m_xpr; }
EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorLayoutSwapOp)
protected:
typename XprType::Nested m_xpr;
};
// Eval as rvalue
template<typename ArgType, typename Device>
struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
{
typedef TensorLayoutSwapOp<ArgType> XprType;
typedef typename XprType::Index Index;
static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
typedef DSizes<Index, NumDims> Dimensions;
enum {
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
CoordAccess = false, // to be implemented
RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlock;
//===--------------------------------------------------------------------===//
EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device)
{
for(int i = 0; i < NumDims; ++i) {
m_dimensions[i] = m_impl.dimensions()[NumDims-1-i];
}
}
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
typedef StorageMemory<CoeffReturnType, Device> Storage;
typedef typename Storage::Type EvaluatorPointerType;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
return m_impl.evalSubExprsIfNeeded(data);
}
EIGEN_STRONG_INLINE void cleanup() {
m_impl.cleanup();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
{
return m_impl.coeff(index);
}
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
return m_impl.template packet<LoadMode>(index);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
return m_impl.costPerCoeff(vectorized);
}
EIGEN_DEVICE_FUNC typename Storage::Type data() const {
return constCast(m_impl.data());
}
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
protected:
TensorEvaluator<ArgType, Device> m_impl;
Dimensions m_dimensions;
};
// Eval as lvalue
template<typename ArgType, typename Device>
struct TensorEvaluator<TensorLayoutSwapOp<ArgType>, Device>
: public TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
{
typedef TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device> Base;
typedef TensorLayoutSwapOp<ArgType> XprType;
enum {
IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
CoordAccess = false // to be implemented
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlock;
//===--------------------------------------------------------------------===//
EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: Base(op, device)
{ }
typedef typename XprType::Index Index;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
{
return this->m_impl.coeffRef(index);
}
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
{
this->m_impl.template writePacket<StoreMode>(index, x);
}
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H

View File

@@ -0,0 +1,98 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_META_MACROS_H
#define EIGEN_CXX11_TENSOR_TENSOR_META_MACROS_H
/** Use this macro for SFINAE selection in templated functions:
*
* template<typename T,
*          typename std::enable_if< isBanana<T>::value , int >::type = 0
* >
* void foo(){}
*
* becomes
*
* template<typename T,
*          EIGEN_SFINAE_ENABLE_IF( isBanana<T>::value )
* >
* void foo(){}
*/
// SFINAE requires variadic templates
#if !defined(EIGEN_GPUCC)
#if EIGEN_HAS_VARIADIC_TEMPLATES
// SFINAE doesn't work for gcc <= 4.7
#ifdef EIGEN_COMP_GNUC
#if EIGEN_GNUC_AT_LEAST(4,8)
#define EIGEN_HAS_SFINAE
#endif
#else
#define EIGEN_HAS_SFINAE
#endif
#endif
#endif
#define EIGEN_SFINAE_ENABLE_IF( __condition__ ) \
typename internal::enable_if< ( __condition__ ) , int >::type = 0
// Define a macro to use a reference on the host but a value on the device
#if defined(SYCL_DEVICE_ONLY)
#define EIGEN_DEVICE_REF
#else
#define EIGEN_DEVICE_REF &
#endif
// Define a macro for catching SYCL exceptions if exceptions are enabled
#define EIGEN_SYCL_TRY_CATCH(X) \
do { \
EIGEN_TRY {X;} \
EIGEN_CATCH(const cl::sycl::exception& e) { \
EIGEN_THROW_X(std::runtime_error("SYCL exception at " + \
std::string(__FILE__) + ":" + \
std::to_string(__LINE__) + "\n" + \
e.what())); \
} \
} while (false)
// Define a macro if local memory flags are unset or one of them is set
// Setting both flags is the same as unsetting them
#if (!defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)) || \
(defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM))
#define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON 1
#define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF 1
#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)
#define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON 1
#elif !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
#define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF 1
#endif
#if EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653)
#define EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
using Base::operator =; \
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) { Base::operator=(other); return *this; } \
template <typename OtherDerived> \
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const OtherDerived& other) { Base::operator=(other); return *this; }
#else
#define EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived)
#endif
/** \internal
* \brief Macro to manually inherit assignment operators.
* This is necessary, because the implicitly defined assignment operator gets deleted when a custom operator= is defined.
* This also inherits template<OtherDerived> operator=(const OtherDerived&) assignments.
* With C++11 or later this also default-implements the copy-constructor
*/
#define EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(Derived) \
EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
EIGEN_DEFAULT_COPY_CONSTRUCTOR(Derived)
#endif

View File

@@ -0,0 +1,327 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_MAP_H
#define EIGEN_CXX11_TENSOR_TENSOR_MAP_H
namespace Eigen {
// FIXME use proper doxygen documentation (e.g. \tparam MakePointer_)
/** \class TensorMap
* \ingroup CXX11_Tensor_Module
*
* \brief A tensor expression mapping an existing array of data.
*
*/
/// The `template <class> class MakePointer_` parameter converts the host pointer into the
/// corresponding device pointer. It is needed because our device compiler does not accept a
/// raw `T*`; to reuse the same evaluator functions, the pointer type is obtained through the
/// `MakePointer_` class instead. By default `MakePointer_<T>::Type` is `T*`, so giving the
/// parameter this default value converts the type without breaking any existing code.
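///
/// Illustrative usage (a sketch, assuming the usual Eigen::Tensor API): map an existing
/// buffer without copying it.
///
///   float data[6] = {0, 1, 2, 3, 4, 5};
///   Eigen::TensorMap<Eigen::Tensor<float, 2> > map(data, 2, 3);  // view data as a 2x3 tensor
///   float v = map(1, 2);  // column-major by default, so this reads data[1 + 2 * 2]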
template<typename PlainObjectType, int Options_, template <class> class MakePointer_> class TensorMap : public TensorBase<TensorMap<PlainObjectType, Options_, MakePointer_> >
{
public:
typedef TensorMap<PlainObjectType, Options_, MakePointer_> Self;
typedef TensorBase<TensorMap<PlainObjectType, Options_, MakePointer_> > Base;
#ifdef EIGEN_USE_SYCL
typedef typename Eigen::internal::remove_reference<typename Eigen::internal::nested<Self>::type>::type Nested;
#else
typedef typename Eigen::internal::nested<Self>::type Nested;
#endif
typedef typename internal::traits<PlainObjectType>::StorageKind StorageKind;
typedef typename internal::traits<PlainObjectType>::Index Index;
typedef typename internal::traits<PlainObjectType>::Scalar Scalar;
typedef typename NumTraits<Scalar>::Real RealScalar;
typedef typename PlainObjectType::Base::CoeffReturnType CoeffReturnType;
typedef typename MakePointer_<Scalar>::Type PointerType;
typedef typename MakePointer_<Scalar>::ConstType PointerConstType;
// WARN: PointerType can still be a pointer to const (const Scalar*), for
// example in TensorMap<Tensor<const Scalar, ...>> expression. This type of
// expression should be illegal, but adding this restriction is not possible
// in practice (see https://bitbucket.org/eigen/eigen/pull-requests/488).
typedef typename internal::conditional<
bool(internal::is_lvalue<PlainObjectType>::value),
PointerType, // use simple pointer in lvalue expressions
PointerConstType // use const pointer in rvalue expressions
>::type StoragePointerType;
// If TensorMap was constructed over an rvalue expression (e.g. a const Tensor),
// we should return a reference to const from operator() (and others), even
// if TensorMap itself is not const.
typedef typename internal::conditional<
bool(internal::is_lvalue<PlainObjectType>::value),
Scalar&,
const Scalar&
>::type StorageRefType;
static const int Options = Options_;
static const Index NumIndices = PlainObjectType::NumIndices;
typedef typename PlainObjectType::Dimensions Dimensions;
enum {
IsAligned = ((int(Options_)&Aligned)==Aligned),
Layout = PlainObjectType::Layout,
CoordAccess = true,
RawAccess = true
};
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr) : m_data(dataPtr), m_dimensions() {
// The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT((0 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE)
}
#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) {
// The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT((sizeof...(otherDimensions) + 1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE)
}
#else
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(firstDimension) {
// The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
EIGEN_STATIC_ASSERT((1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE)
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2) : m_data(dataPtr), m_dimensions(dim1, dim2) {
EIGEN_STATIC_ASSERT(2 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE)
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2, Index dim3) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3) {
EIGEN_STATIC_ASSERT(3 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE)
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4) {
EIGEN_STATIC_ASSERT(4 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE)
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4, dim5) {
EIGEN_STATIC_ASSERT(5 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE)
}
#endif
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, const array<Index, NumIndices>& dimensions)
: m_data(dataPtr), m_dimensions(dimensions)
{ }
template <typename Dimensions>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, const Dimensions& dimensions)
: m_data(dataPtr), m_dimensions(dimensions)
{ }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PlainObjectType& tensor)
: m_data(tensor.data()), m_dimensions(tensor.dimensions())
{ }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Index rank() const { return m_dimensions.rank(); }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_dimensions[n]; }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE StoragePointerType data() { return m_data; }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE StoragePointerType data() const { return m_data; }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE StorageRefType operator()(const array<Index, NumIndices>& indices) const
{
// eigen_assert(checkIndexRange(indices));
if (PlainObjectType::Options&RowMajor) {
const Index index = m_dimensions.IndexOfRowMajor(indices);
return m_data[index];
} else {
const Index index = m_dimensions.IndexOfColMajor(indices);
return m_data[index];
}
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE StorageRefType operator()() const
{
EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE)
return m_data[0];
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE StorageRefType operator()(Index index) const
{
eigen_internal_assert(index >= 0 && index < size());
return m_data[index];
}
#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE StorageRefType operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
{
EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(internal::all((Eigen::NumTraits<Index>::highest() >= otherIndices)...));
if (PlainObjectType::Options&RowMajor) {
const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
return m_data[index];
} else {
const Index index = m_dimensions.IndexOfColMajor(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
return m_data[index];
}
}
#else
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1) const
{
if (PlainObjectType::Options&RowMajor) {
const Index index = i1 + i0 * m_dimensions[1];
return m_data[index];
} else {
const Index index = i0 + i1 * m_dimensions[0];
return m_data[index];
}
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2) const
{
if (PlainObjectType::Options&RowMajor) {
const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0);
return m_data[index];
} else {
const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * i2);
return m_data[index];
}
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3) const
{
if (PlainObjectType::Options&RowMajor) {
const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0));
return m_data[index];
} else {
const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * i3));
return m_data[index];
}
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const
{
if (PlainObjectType::Options&RowMajor) {
const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)));
return m_data[index];
} else {
const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * (i3 + m_dimensions[3] * i4)));
return m_data[index];
}
}
#endif
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE StorageRefType operator()(const array<Index, NumIndices>& indices)
{
// eigen_assert(checkIndexRange(indices));
if (PlainObjectType::Options&RowMajor) {
const Index index = m_dimensions.IndexOfRowMajor(indices);
return m_data[index];
} else {
const Index index = m_dimensions.IndexOfColMajor(indices);
return m_data[index];
}
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE StorageRefType operator()()
{
EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE)
return m_data[0];
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE StorageRefType operator()(Index index)
{
eigen_internal_assert(index >= 0 && index < size());
return m_data[index];
}
#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE StorageRefType operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices)
{
static_assert(sizeof...(otherIndices) + 2 == NumIndices || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
eigen_assert(internal::all((Eigen::NumTraits<Index>::highest() >= otherIndices)...));
const std::size_t NumDims = sizeof...(otherIndices) + 2;
if (PlainObjectType::Options&RowMajor) {
const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumDims>{{firstIndex, secondIndex, otherIndices...}});
return m_data[index];
} else {
const Index index = m_dimensions.IndexOfColMajor(array<Index, NumDims>{{firstIndex, secondIndex, otherIndices...}});
return m_data[index];
}
}
#else
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1)
{
if (PlainObjectType::Options&RowMajor) {
const Index index = i1 + i0 * m_dimensions[1];
return m_data[index];
} else {
const Index index = i0 + i1 * m_dimensions[0];
return m_data[index];
}
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2)
{
if (PlainObjectType::Options&RowMajor) {
const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0);
return m_data[index];
} else {
const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * i2);
return m_data[index];
}
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3)
{
if (PlainObjectType::Options&RowMajor) {
const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0));
return m_data[index];
} else {
const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * i3));
return m_data[index];
}
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3, Index i4)
{
if (PlainObjectType::Options&RowMajor) {
const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)));
return m_data[index];
} else {
const Index index = i0 + m_dimensions[0] * (i1 + m_dimensions[1] * (i2 + m_dimensions[2] * (i3 + m_dimensions[3] * i4)));
return m_data[index];
}
}
#endif
EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorMap)
private:
StoragePointerType m_data;
Dimensions m_dimensions;
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_MAP_H

View File

@@ -0,0 +1,311 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_META_H
#define EIGEN_CXX11_TENSOR_TENSOR_META_H
namespace Eigen {
template<bool cond> struct Cond {};
template<typename T1, typename T2> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
const T1& choose(Cond<true>, const T1& first, const T2&) {
return first;
}
template<typename T1, typename T2> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
const T2& choose(Cond<false>, const T1&, const T2& second) {
return second;
}
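// divup(x, y) is the ceiling of the integer division x / y, computed as (x + y - 1) / y.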
template <typename T, typename X, typename Y>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
T divup(const X x, const Y y) {
return static_cast<T>((x + y - 1) / y);
}
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
T divup(const T x, const T y) {
return static_cast<T>((x + y - 1) / y);
}
template <size_t n> struct max_n_1 {
static const size_t size = n;
};
template <> struct max_n_1<0> {
static const size_t size = 1;
};
// Default packet types
template <typename Scalar, typename Device>
struct PacketType : internal::packet_traits<Scalar> {
typedef typename internal::packet_traits<Scalar>::type type;
};
// For CUDA packet types when using a GpuDevice
#if defined(EIGEN_USE_GPU) && defined(EIGEN_HAS_GPU_FP16)
typedef ulonglong2 Packet4h2;
template<>
struct PacketType<half, GpuDevice> {
typedef Packet4h2 type;
static const int size = 8;
enum {
HasAdd = 1,
HasSub = 1,
HasMul = 1,
HasNegate = 1,
HasAbs = 1,
HasArg = 0,
HasAbs2 = 0,
HasMin = 1,
HasMax = 1,
HasConj = 0,
HasSetLinear = 0,
HasBlend = 0,
HasDiv = 1,
HasSqrt = 1,
HasRsqrt = 1,
HasExp = 1,
HasExpm1 = 0,
HasLog = 1,
HasLog1p = 0,
HasLog10 = 0,
HasPow = 1,
};
};
#endif
#if defined(EIGEN_USE_SYCL)
namespace TensorSycl {
namespace internal {
template <typename Index, Index A, Index B> struct PlusOp {
static constexpr Index Value = A + B;
};
template <typename Index, Index A, Index B> struct DivOp {
static constexpr Index Value = A / B;
};
template <typename Index, Index start, Index end, Index step,
template <class Indx, Indx...> class StepOp>
struct static_for {
template <typename UnaryOperator>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void loop(UnaryOperator op) {
op(start);
static_for<Index, StepOp<Index, start, step>::Value, end, step,
StepOp>::loop(op);
}
};
template <typename Index, Index end, Index step,
template <class Indx, Indx...> class StepOp>
struct static_for<Index, end, end, step, StepOp> {
template <typename UnaryOperator>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void loop(UnaryOperator) {}
};
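// Illustrative use (a sketch): static_for unrolls a compile-time loop starting at `start`
// until the index reaches `end`, advancing with StepOp. For example, with a callable `op`,
//   static_for<int, 0, 4, 1, PlusOp>::loop(op);
// calls op(0), op(1), op(2), op(3).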
template <typename OutScalar, typename Device, bool Vectorizable>
struct Vectorise {
static const int PacketSize = 1;
typedef OutScalar PacketReturnType;
};
template <typename OutScalar, typename Device>
struct Vectorise<OutScalar, Device, true> {
static const int PacketSize = Eigen::PacketType<OutScalar, Device>::size;
typedef typename Eigen::PacketType<OutScalar, Device>::type PacketReturnType;
};
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index roundUp(Index x, Index y) {
return ((((x) + (y)-1) / (y)) * (y));
}
} // namespace internal
} // namespace TensorSycl
template <>
struct PacketType<half, SyclDevice> {
typedef half type;
static const int size = 1;
enum {
HasAdd = 0,
HasSub = 0,
HasMul = 0,
HasNegate = 0,
HasAbs = 0,
HasArg = 0,
HasAbs2 = 0,
HasMin = 0,
HasMax = 0,
HasConj = 0,
HasSetLinear = 0,
HasBlend = 0
};
};
template <typename Scalar>
struct PacketType<Scalar, SyclDevice> : internal::default_packet_traits {
typedef Scalar type;
typedef Scalar half;
enum {
Vectorizable = 0,
size = 1,
AlignedOnScalar = 0,
HasHalfPacket = 0
};
enum {
HasAdd = 0,
HasSub = 0,
HasMul = 0,
HasNegate = 0,
HasAbs = 0,
HasAbs2 = 0,
HasMin = 0,
HasMax = 0,
HasConj = 0,
HasSetLinear = 0
};
};
template <typename Scalar>
struct PacketType<Scalar, const SyclDevice> : PacketType<Scalar, SyclDevice>{};
#ifndef EIGEN_DONT_VECTORIZE_SYCL
#define PACKET_TYPE(CVQual, Type, val, lengths, DEV)\
template<> struct PacketType<CVQual Type, DEV> : internal::sycl_packet_traits<val, lengths> \
{\
typedef typename internal::packet_traits<Type>::type type;\
typedef typename internal::packet_traits<Type>::half half;\
};
PACKET_TYPE(const, float, 1, 4, SyclDevice)
PACKET_TYPE(, float, 1, 4, SyclDevice)
PACKET_TYPE(const, float, 1, 4, const SyclDevice)
PACKET_TYPE(, float, 1, 4, const SyclDevice)
PACKET_TYPE(const, double, 0, 2, SyclDevice)
PACKET_TYPE(, double, 0, 2, SyclDevice)
PACKET_TYPE(const, double, 0, 2, const SyclDevice)
PACKET_TYPE(, double, 0, 2, const SyclDevice)
#undef PACKET_TYPE
template<> struct PacketType<half, const SyclDevice>: PacketType<half, SyclDevice>{};
template<> struct PacketType<const half, const SyclDevice>: PacketType<half, SyclDevice>{};
#endif
#endif
// Tuple mimics std::pair but works on e.g. nvcc.
template <typename U, typename V> struct Tuple {
public:
U first;
V second;
typedef U first_type;
typedef V second_type;
EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Tuple() : first(), second() {}
EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Tuple(const U& f, const V& s) : first(f), second(s) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void swap(Tuple& rhs) {
using numext::swap;
swap(first, rhs.first);
swap(second, rhs.second);
}
};
template <typename U, typename V>
EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
bool operator==(const Tuple<U, V>& x, const Tuple<U, V>& y) {
return (x.first == y.first && x.second == y.second);
}
template <typename U, typename V>
EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
bool operator!=(const Tuple<U, V>& x, const Tuple<U, V>& y) {
return !(x == y);
}
// Can't use std::pair on CUDA devices.
template <typename Idx> struct IndexPair {
EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) {}
EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Idx f, Idx s) : first(f), second(s) {}
EIGEN_DEVICE_FUNC void set(IndexPair<Idx> val) {
first = val.first;
second = val.second;
}
Idx first;
Idx second;
};
#ifdef EIGEN_HAS_SFINAE
namespace internal {
template<typename IndexType, typename Index, Index... Is>
EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
array<Index, sizeof...(Is)> customIndices2Array(IndexType& idx, numeric_list<Index, Is...>) {
return { idx[Is]... };
}
template<typename IndexType, typename Index>
EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
array<Index, 0> customIndices2Array(IndexType&, numeric_list<Index>) {
return array<Index, 0>();
}
/** Make an array (for index/dimensions) out of a custom index */
template<typename Index, std::size_t NumIndices, typename IndexType>
EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
array<Index, NumIndices> customIndices2Array(IndexType& idx) {
return customIndices2Array(idx, typename gen_numeric_list<Index, NumIndices>::type{});
}
template <typename B, typename D>
struct is_base_of
{
typedef char (&yes)[1];
typedef char (&no)[2];
template <typename BB, typename DD>
struct Host
{
operator BB*() const;
operator DD*();
};
template<typename T>
static yes check(D*, T);
static no check(B*, int);
static const bool value = sizeof(check(Host<B,D>(), int())) == sizeof(yes);
};
}
#endif
} // namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_META_H

File diff suppressed because it is too large

View File

@@ -0,0 +1,708 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_PADDING_H
#define EIGEN_CXX11_TENSOR_TENSOR_PADDING_H
namespace Eigen {
/** \class TensorPadding
* \ingroup CXX11_Tensor_Module
*
* \brief Tensor padding class.
* At the moment only padding with a constant value is supported.
*
*/
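// Illustrative usage (a sketch, assuming the usual TensorBase::pad API):
//
//   Eigen::Tensor<float, 2> input(2, 3);
//   input.setZero();
//   Eigen::array<std::pair<int, int>, 2> paddings{{{1, 1}, {1, 1}}};
//   Eigen::Tensor<float, 2> padded = input.pad(paddings, -1.f);  // 4x5 result, border = -1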
namespace internal {
template<typename PaddingDimensions, typename XprType>
struct traits<TensorPaddingOp<PaddingDimensions, XprType> > : public traits<XprType>
{
typedef typename XprType::Scalar Scalar;
typedef traits<XprType> XprTraits;
typedef typename XprTraits::StorageKind StorageKind;
typedef typename XprTraits::Index Index;
typedef typename XprType::Nested Nested;
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int Layout = XprTraits::Layout;
typedef typename XprTraits::PointerType PointerType;
};
template<typename PaddingDimensions, typename XprType>
struct eval<TensorPaddingOp<PaddingDimensions, XprType>, Eigen::Dense>
{
typedef const TensorPaddingOp<PaddingDimensions, XprType>& type;
};
template<typename PaddingDimensions, typename XprType>
struct nested<TensorPaddingOp<PaddingDimensions, XprType>, 1, typename eval<TensorPaddingOp<PaddingDimensions, XprType> >::type>
{
typedef TensorPaddingOp<PaddingDimensions, XprType> type;
};
} // end namespace internal
template<typename PaddingDimensions, typename XprType>
class TensorPaddingOp : public TensorBase<TensorPaddingOp<PaddingDimensions, XprType>, ReadOnlyAccessors>
{
public:
typedef typename Eigen::internal::traits<TensorPaddingOp>::Scalar Scalar;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename Eigen::internal::nested<TensorPaddingOp>::type Nested;
typedef typename Eigen::internal::traits<TensorPaddingOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorPaddingOp>::Index Index;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPaddingOp(const XprType& expr, const PaddingDimensions& padding_dims, const Scalar padding_value)
: m_xpr(expr), m_padding_dims(padding_dims), m_padding_value(padding_value) {}
EIGEN_DEVICE_FUNC
const PaddingDimensions& padding() const { return m_padding_dims; }
EIGEN_DEVICE_FUNC
Scalar padding_value() const { return m_padding_value; }
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename XprType::Nested>::type&
expression() const { return m_xpr; }
protected:
typename XprType::Nested m_xpr;
const PaddingDimensions m_padding_dims;
const Scalar m_padding_value;
};
// Eval as rvalue
template<typename PaddingDimensions, typename ArgType, typename Device>
struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device>
{
typedef TensorPaddingOp<PaddingDimensions, ArgType> XprType;
typedef typename XprType::Index Index;
static const int NumDims = internal::array_size<PaddingDimensions>::value;
typedef DSizes<Index, NumDims> Dimensions;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
typedef StorageMemory<CoeffReturnType, Device> Storage;
typedef typename Storage::Type EvaluatorPointerType;
enum {
IsAligned = true,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = TensorEvaluator<ArgType, Device>::RawAccess,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = true,
RawAccess = false
};
typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
Layout, Index>
TensorBlock;
//===--------------------------------------------------------------------===//
EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value()), m_device(device)
{
// The padding op doesn't change the rank of the tensor. Directly padding a scalar would lead
// to a vector, which doesn't make sense. Instead one should reshape the scalar into a vector
// of 1 element first and then pad.
EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
// Compute dimensions
m_dimensions = m_impl.dimensions();
for (int i = 0; i < NumDims; ++i) {
m_dimensions[i] += m_padding[i].first + m_padding[i].second;
}
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
m_inputStrides[0] = 1;
m_outputStrides[0] = 1;
for (int i = 1; i < NumDims; ++i) {
m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
}
m_outputStrides[NumDims] = m_outputStrides[NumDims-1] * m_dimensions[NumDims-1];
} else {
m_inputStrides[NumDims - 1] = 1;
m_outputStrides[NumDims] = 1;
for (int i = NumDims - 2; i >= 0; --i) {
m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
m_outputStrides[i+1] = m_outputStrides[i+2] * m_dimensions[i+1];
}
m_outputStrides[0] = m_outputStrides[1] * m_dimensions[0];
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
m_impl.evalSubExprsIfNeeded(NULL);
return true;
}
#ifdef EIGEN_USE_THREADS
template <typename EvalSubExprsCallback>
EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
EvaluatorPointerType, EvalSubExprsCallback done) {
m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
}
#endif // EIGEN_USE_THREADS
EIGEN_STRONG_INLINE void cleanup() {
m_impl.cleanup();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
{
eigen_assert(index < dimensions().TotalSize());
Index inputIndex = 0;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
EIGEN_UNROLL_LOOP
for (int i = NumDims - 1; i > 0; --i) {
const Index idx = index / m_outputStrides[i];
if (isPaddingAtIndexForDim(idx, i)) {
return m_paddingValue;
}
inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
index -= idx * m_outputStrides[i];
}
if (isPaddingAtIndexForDim(index, 0)) {
return m_paddingValue;
}
inputIndex += (index - m_padding[0].first);
} else {
EIGEN_UNROLL_LOOP
for (int i = 0; i < NumDims - 1; ++i) {
const Index idx = index / m_outputStrides[i+1];
if (isPaddingAtIndexForDim(idx, i)) {
return m_paddingValue;
}
inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
index -= idx * m_outputStrides[i+1];
}
if (isPaddingAtIndexForDim(index, NumDims-1)) {
return m_paddingValue;
}
inputIndex += (index - m_padding[NumDims-1].first);
}
return m_impl.coeff(inputIndex);
}
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
return packetColMajor(index);
}
return packetRowMajor(index);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
TensorOpCost cost = m_impl.costPerCoeff(vectorized);
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
EIGEN_UNROLL_LOOP
for (int i = 0; i < NumDims; ++i)
updateCostPerDimension(cost, i, i == 0);
} else {
EIGEN_UNROLL_LOOP
for (int i = NumDims - 1; i >= 0; --i)
updateCostPerDimension(cost, i, i == NumDims - 1);
}
return cost;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
internal::TensorBlockResourceRequirements getResourceRequirements() const {
const size_t target_size = m_device.lastLevelCacheSize();
return internal::TensorBlockResourceRequirements::merge(
internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size),
m_impl.getResourceRequirements());
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
bool /*root_of_expr_ast*/ = false) const {
// If one of the dimensions is zero, return empty block view.
if (desc.size() == 0) {
return TensorBlock(internal::TensorBlockKind::kView, NULL,
desc.dimensions());
}
static const bool IsColMajor = Layout == static_cast<int>(ColMajor);
const int inner_dim_idx = IsColMajor ? 0 : NumDims - 1;
Index offset = desc.offset();
// Compute offsets in the output tensor corresponding to the desc.offset().
DSizes<Index, NumDims> output_offsets;
for (int i = NumDims - 1; i > 0; --i) {
const int dim = IsColMajor ? i : NumDims - i - 1;
const int stride_dim = IsColMajor ? dim : dim + 1;
output_offsets[dim] = offset / m_outputStrides[stride_dim];
offset -= output_offsets[dim] * m_outputStrides[stride_dim];
}
output_offsets[inner_dim_idx] = offset;
// Offsets in the input corresponding to output offsets.
DSizes<Index, NumDims> input_offsets = output_offsets;
for (int i = 0; i < NumDims; ++i) {
const int dim = IsColMajor ? i : NumDims - i - 1;
input_offsets[dim] = input_offsets[dim] - m_padding[dim].first;
}
// Compute the offset in the input buffer. At this point it might be illegal
// and point outside of the input buffer, because we don't check for negative
// offsets; it will be corrected in the block iteration loop below.
Index input_offset = 0;
for (int i = 0; i < NumDims; ++i) {
const int dim = IsColMajor ? i : NumDims - i - 1;
input_offset += input_offsets[dim] * m_inputStrides[dim];
}
// Destination buffer and scratch buffer are both indexed from 0 and have the
// same dimensions as the requested block (for destination buffer this
// property is guaranteed by `desc.destination()`).
Index output_offset = 0;
const DSizes<Index, NumDims> output_strides =
internal::strides<Layout>(desc.dimensions());
// NOTE(ezhulenev): We initialize block iteration state for `NumDims - 1`
// dimensions, skipping innermost dimension. In theory it should be possible
// to squeeze matching innermost dimensions, however in practice that did
// not show any improvements in benchmarks. Also in practice first outer
// dimension usually has padding, and will prevent squeezing.
// Initialize output block iterator state. Dimensions in this array are
// always in inner_most -> outer_most order (col major layout).
array<BlockIteratorState, NumDims - 1> it;
for (int i = 0; i < NumDims - 1; ++i) {
const int dim = IsColMajor ? i + 1 : NumDims - i - 2;
it[i].count = 0;
it[i].size = desc.dimension(dim);
it[i].input_stride = m_inputStrides[dim];
it[i].input_span = it[i].input_stride * (it[i].size - 1);
it[i].output_stride = output_strides[dim];
it[i].output_span = it[i].output_stride * (it[i].size - 1);
}
const Index input_inner_dim_size =
static_cast<Index>(m_impl.dimensions()[inner_dim_idx]);
// Total output size.
const Index output_size = desc.size();
// We will fill inner dimension of this size in the output. It might be
// larger than the inner dimension in the input, so we might have to pad
// before/after we copy values from the input inner dimension.
const Index output_inner_dim_size = desc.dimension(inner_dim_idx);
// How many values to fill with padding BEFORE reading from the input inner
// dimension.
const Index output_inner_pad_before_size =
input_offsets[inner_dim_idx] < 0
? numext::mini(numext::abs(input_offsets[inner_dim_idx]),
output_inner_dim_size)
: 0;
// How many values we can actually copy from the input inner dimension.
const Index output_inner_copy_size = numext::mini(
// Want to copy from input.
(output_inner_dim_size - output_inner_pad_before_size),
// Can copy from input.
numext::maxi(input_inner_dim_size - (input_offsets[inner_dim_idx] +
output_inner_pad_before_size),
Index(0)));
eigen_assert(output_inner_copy_size >= 0);
// How many values to fill with padding AFTER reading from the input inner
// dimension.
const Index output_inner_pad_after_size =
(output_inner_dim_size - output_inner_copy_size -
output_inner_pad_before_size);
// Sanity check, sum of all sizes must be equal to the output size.
eigen_assert(output_inner_dim_size ==
(output_inner_pad_before_size + output_inner_copy_size +
output_inner_pad_after_size));
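// Worked example (assumed values, for illustration): with an input inner
// dimension of size 5 padded by (2, 3), the output inner dimension has size
// 10; a block covering the full inner dimension has input_offsets[inner] = -2,
// so pad_before = 2, copy = 5 and pad_after = 3, which sums to 10.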
// Keep track of current coordinates and padding in the output.
DSizes<Index, NumDims> output_coord = output_offsets;
DSizes<Index, NumDims> output_padded;
for (int i = 0; i < NumDims; ++i) {
const int dim = IsColMajor ? i : NumDims - i - 1;
output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
}
typedef internal::StridedLinearBufferCopy<ScalarNoConst, Index> LinCopy;
// Prepare storage for the materialized padding result.
const typename TensorBlock::Storage block_storage =
TensorBlock::prepareStorage(desc, scratch);
// TODO(ezhulenev): Squeeze multiple non-padded inner dimensions into a
// single logical inner dimension.
// When possible we squeeze writes for the innermost dimension (only if it is
// non-padded) together with the first padded dimension. This reduces the
// number of calls to LinCopy and makes better use of vector instructions.
const bool squeeze_writes =
NumDims > 1 &&
// inner dimension is not padded
(input_inner_dim_size == m_dimensions[inner_dim_idx]) &&
// and equal to the block inner dimension
(input_inner_dim_size == output_inner_dim_size);
const int squeeze_dim = IsColMajor ? inner_dim_idx + 1 : inner_dim_idx - 1;
// Maximum coordinate on a squeeze dimension that we can write to.
const Index squeeze_max_coord =
squeeze_writes ? numext::mini(
// max non-padded element in the input
static_cast<Index>(m_dimensions[squeeze_dim] -
m_padding[squeeze_dim].second),
// max element in the output buffer
static_cast<Index>(output_offsets[squeeze_dim] +
desc.dimension(squeeze_dim)))
: static_cast<Index>(0);
// Iterate copying data from `m_impl.data()` to the output buffer.
for (Index size = 0; size < output_size;) {
// Detect if we are in the padded region (exclude innermost dimension).
bool is_padded = false;
for (int j = 1; j < NumDims; ++j) {
const int dim = IsColMajor ? j : NumDims - j - 1;
is_padded = output_padded[dim];
if (is_padded) break;
}
if (is_padded) {
// Fill single innermost dimension with padding value.
size += output_inner_dim_size;
LinCopy::template Run<LinCopy::Kind::FillLinear>(
typename LinCopy::Dst(output_offset, 1, block_storage.data()),
typename LinCopy::Src(0, 0, &m_paddingValue),
output_inner_dim_size);
} else if (squeeze_writes) {
// Squeeze multiple reads from innermost dimensions.
const Index squeeze_num = squeeze_max_coord - output_coord[squeeze_dim];
size += output_inner_dim_size * squeeze_num;
// Copy `squeeze_num` inner dimensions from input to output.
LinCopy::template Run<LinCopy::Kind::Linear>(
typename LinCopy::Dst(output_offset, 1, block_storage.data()),
typename LinCopy::Src(input_offset, 1, m_impl.data()),
output_inner_dim_size * squeeze_num);
// Update iteration state for only `squeeze_num - 1` processed inner
// dimensions, because we have another iteration state update at the end
// of the loop that will update iteration state for the last inner
// processed dimension.
it[0].count += (squeeze_num - 1);
input_offset += it[0].input_stride * (squeeze_num - 1);
output_offset += it[0].output_stride * (squeeze_num - 1);
output_coord[squeeze_dim] += (squeeze_num - 1);
} else {
// Single read from innermost dimension.
size += output_inner_dim_size;
{ // Fill with padding before copying from input inner dimension.
const Index out = output_offset;
LinCopy::template Run<LinCopy::Kind::FillLinear>(
typename LinCopy::Dst(out, 1, block_storage.data()),
typename LinCopy::Src(0, 0, &m_paddingValue),
output_inner_pad_before_size);
}
{ // Copy data from input inner dimension.
const Index out = output_offset + output_inner_pad_before_size;
const Index in = input_offset + output_inner_pad_before_size;
eigen_assert(output_inner_copy_size == 0 || m_impl.data() != NULL);
LinCopy::template Run<LinCopy::Kind::Linear>(
typename LinCopy::Dst(out, 1, block_storage.data()),
typename LinCopy::Src(in, 1, m_impl.data()),
output_inner_copy_size);
}
{ // Fill with padding after copying from input inner dimension.
const Index out = output_offset + output_inner_pad_before_size +
output_inner_copy_size;
LinCopy::template Run<LinCopy::Kind::FillLinear>(
typename LinCopy::Dst(out, 1, block_storage.data()),
typename LinCopy::Src(0, 0, &m_paddingValue),
output_inner_pad_after_size);
}
}
for (int j = 0; j < NumDims - 1; ++j) {
const int dim = IsColMajor ? j + 1 : NumDims - j - 2;
if (++it[j].count < it[j].size) {
input_offset += it[j].input_stride;
output_offset += it[j].output_stride;
output_coord[dim] += 1;
output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
break;
}
it[j].count = 0;
input_offset -= it[j].input_span;
output_offset -= it[j].output_span;
output_coord[dim] -= it[j].size - 1;
output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
}
}
return block_storage.AsTensorMaterializedBlock();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif
private:
struct BlockIteratorState {
BlockIteratorState()
: count(0),
size(0),
input_stride(0),
input_span(0),
output_stride(0),
output_span(0) {}
Index count;
Index size;
Index input_stride;
Index input_span;
Index output_stride;
Index output_span;
};
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim(
Index index, int dim_index) const {
#if defined(EIGEN_HAS_INDEX_LIST)
return (!internal::index_pair_first_statically_eq<PaddingDimensions>(dim_index, 0) &&
index < m_padding[dim_index].first) ||
(!internal::index_pair_second_statically_eq<PaddingDimensions>(dim_index, 0) &&
index >= m_dimensions[dim_index] - m_padding[dim_index].second);
#else
return (index < m_padding[dim_index].first) ||
(index >= m_dimensions[dim_index] - m_padding[dim_index].second);
#endif
}
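// Example for isPaddingAtIndexForDim (assumed values): for a dimension padded
// to size 10 with padding (2, 1), indices 0 and 1 fall in the left padding and
// index 9 in the right padding; every other index maps to the input.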
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isLeftPaddingCompileTimeZero(
int dim_index) const {
#if defined(EIGEN_HAS_INDEX_LIST)
return internal::index_pair_first_statically_eq<PaddingDimensions>(dim_index, 0);
#else
EIGEN_UNUSED_VARIABLE(dim_index);
return false;
#endif
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isRightPaddingCompileTimeZero(
int dim_index) const {
#if defined(EIGEN_HAS_INDEX_LIST)
return internal::index_pair_second_statically_eq<PaddingDimensions>(dim_index, 0);
#else
EIGEN_UNUSED_VARIABLE(dim_index);
return false;
#endif
}
void updateCostPerDimension(TensorOpCost& cost, int i, bool first) const {
const double in = static_cast<double>(m_impl.dimensions()[i]);
const double out = in + m_padding[i].first + m_padding[i].second;
if (out == 0)
return;
const double reduction = in / out;
cost *= reduction;
if (first) {
cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost<Index>() +
reduction * (1 * TensorOpCost::AddCost<Index>()));
} else {
cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost<Index>() +
2 * TensorOpCost::MulCost<Index>() +
reduction * (2 * TensorOpCost::MulCost<Index>() +
1 * TensorOpCost::DivCost<Index>()));
}
}
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const
{
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
const Index initialIndex = index;
Index inputIndex = 0;
EIGEN_UNROLL_LOOP
for (int i = NumDims - 1; i > 0; --i) {
const Index firstIdx = index;
const Index lastIdx = index + PacketSize - 1;
const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i];
const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i];
const Index lastPaddedRight = m_outputStrides[i+1];
if (!isLeftPaddingCompileTimeZero(i) && lastIdx < lastPaddedLeft) {
// all the coefficients are in the padding zone.
return internal::pset1<PacketReturnType>(m_paddingValue);
}
else if (!isRightPaddingCompileTimeZero(i) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
// all the coefficients are in the padding zone.
return internal::pset1<PacketReturnType>(m_paddingValue);
}
else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
// all the coefficients are between the two padding zones.
const Index idx = index / m_outputStrides[i];
inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
index -= idx * m_outputStrides[i];
}
else {
// Every other case
return packetWithPossibleZero(initialIndex);
}
}
const Index lastIdx = index + PacketSize - 1;
const Index firstIdx = index;
const Index lastPaddedLeft = m_padding[0].first;
const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second);
const Index lastPaddedRight = m_outputStrides[1];
if (!isLeftPaddingCompileTimeZero(0) && lastIdx < lastPaddedLeft) {
// all the coefficients are in the padding zone.
return internal::pset1<PacketReturnType>(m_paddingValue);
}
else if (!isRightPaddingCompileTimeZero(0) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
// all the coefficients are in the padding zone.
return internal::pset1<PacketReturnType>(m_paddingValue);
}
else if ((isLeftPaddingCompileTimeZero(0) && isRightPaddingCompileTimeZero(0)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
// all the coefficients are between the two padding zones.
inputIndex += (index - m_padding[0].first);
return m_impl.template packet<Unaligned>(inputIndex);
}
// Every other case
return packetWithPossibleZero(initialIndex);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const
{
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
const Index initialIndex = index;
Index inputIndex = 0;
EIGEN_UNROLL_LOOP
for (int i = 0; i < NumDims - 1; ++i) {
const Index firstIdx = index;
const Index lastIdx = index + PacketSize - 1;
const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i+1];
const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i+1];
const Index lastPaddedRight = m_outputStrides[i];
if (!isLeftPaddingCompileTimeZero(i) && lastIdx < lastPaddedLeft) {
// all the coefficients are in the padding zone.
return internal::pset1<PacketReturnType>(m_paddingValue);
}
else if (!isRightPaddingCompileTimeZero(i) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
// all the coefficients are in the padding zone.
return internal::pset1<PacketReturnType>(m_paddingValue);
}
else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
// all the coefficients are between the two padding zones.
const Index idx = index / m_outputStrides[i+1];
inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
index -= idx * m_outputStrides[i+1];
}
else {
// Every other case
return packetWithPossibleZero(initialIndex);
}
}
const Index lastIdx = index + PacketSize - 1;
const Index firstIdx = index;
const Index lastPaddedLeft = m_padding[NumDims-1].first;
const Index firstPaddedRight = (m_dimensions[NumDims-1] - m_padding[NumDims-1].second);
const Index lastPaddedRight = m_outputStrides[NumDims-1];
if (!isLeftPaddingCompileTimeZero(NumDims-1) && lastIdx < lastPaddedLeft) {
// all the coefficients are in the padding zone.
return internal::pset1<PacketReturnType>(m_paddingValue);
}
else if (!isRightPaddingCompileTimeZero(NumDims-1) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
// all the coefficients are in the padding zone.
return internal::pset1<PacketReturnType>(m_paddingValue);
}
else if ((isLeftPaddingCompileTimeZero(NumDims-1) && isRightPaddingCompileTimeZero(NumDims-1)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
// all the coefficients are between the two padding zones.
inputIndex += (index - m_padding[NumDims-1].first);
return m_impl.template packet<Unaligned>(inputIndex);
}
// Every other case
return packetWithPossibleZero(initialIndex);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
{
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
EIGEN_UNROLL_LOOP
for (int i = 0; i < PacketSize; ++i) {
values[i] = coeff(index+i);
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
return rslt;
}
Dimensions m_dimensions;
array<Index, NumDims+1> m_outputStrides;
array<Index, NumDims> m_inputStrides;
TensorEvaluator<ArgType, Device> m_impl;
PaddingDimensions m_padding;
Scalar m_paddingValue;
const Device EIGEN_DEVICE_REF m_device;
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_PADDING_H

View File

@@ -0,0 +1,291 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_PATCH_H
#define EIGEN_CXX11_TENSOR_TENSOR_PATCH_H
namespace Eigen {
/** \class TensorPatch
 * \ingroup CXX11_Tensor_Module
 *
 * \brief Tensor patch class.
 *
 * Extracts patches of size PatchDim from the input expression. The result has
 * one more dimension than the input: the extra dimension enumerates the
 * patches, and each patch is a dense window of the input with the requested
 * patch dimensions.
 */
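// A hedged usage sketch (assumed API surface, shown only for illustration):
// patches are normally extracted through TensorBase::extract_patches(). For a
// ColMajor 3 x 4 input and 2 x 2 patches the result is 2 x 2 x 6, the last
// dimension enumerating the (3 - 2 + 1) * (4 - 2 + 1) = 6 patch positions.
//
//   Eigen::Tensor<float, 2> input(3, 4);
//   input.setRandom();
//   Eigen::array<ptrdiff_t, 2> patch_dims{{2, 2}};
//   Eigen::Tensor<float, 3> patches = input.extract_patches(patch_dims);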
namespace internal {
template<typename PatchDim, typename XprType>
struct traits<TensorPatchOp<PatchDim, XprType> > : public traits<XprType>
{
typedef typename XprType::Scalar Scalar;
typedef traits<XprType> XprTraits;
typedef typename XprTraits::StorageKind StorageKind;
typedef typename XprTraits::Index Index;
typedef typename XprType::Nested Nested;
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = XprTraits::NumDimensions + 1;
static const int Layout = XprTraits::Layout;
typedef typename XprTraits::PointerType PointerType;
};
template<typename PatchDim, typename XprType>
struct eval<TensorPatchOp<PatchDim, XprType>, Eigen::Dense>
{
typedef const TensorPatchOp<PatchDim, XprType>& type;
};
template<typename PatchDim, typename XprType>
struct nested<TensorPatchOp<PatchDim, XprType>, 1, typename eval<TensorPatchOp<PatchDim, XprType> >::type>
{
typedef TensorPatchOp<PatchDim, XprType> type;
};
} // end namespace internal
template<typename PatchDim, typename XprType>
class TensorPatchOp : public TensorBase<TensorPatchOp<PatchDim, XprType>, ReadOnlyAccessors>
{
public:
typedef typename Eigen::internal::traits<TensorPatchOp>::Scalar Scalar;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename Eigen::internal::nested<TensorPatchOp>::type Nested;
typedef typename Eigen::internal::traits<TensorPatchOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorPatchOp>::Index Index;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPatchOp(const XprType& expr, const PatchDim& patch_dims)
: m_xpr(expr), m_patch_dims(patch_dims) {}
EIGEN_DEVICE_FUNC
const PatchDim& patch_dims() const { return m_patch_dims; }
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename XprType::Nested>::type&
expression() const { return m_xpr; }
protected:
typename XprType::Nested m_xpr;
const PatchDim m_patch_dims;
};
// Eval as rvalue
template<typename PatchDim, typename ArgType, typename Device>
struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
{
typedef TensorPatchOp<PatchDim, ArgType> XprType;
typedef typename XprType::Index Index;
static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value + 1;
typedef DSizes<Index, NumDims> Dimensions;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
typedef StorageMemory<CoeffReturnType, Device> Storage;
typedef typename Storage::Type EvaluatorPointerType;
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlock;
//===--------------------------------------------------------------------===//
EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device)
{
Index num_patches = 1;
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
const PatchDim& patch_dims = op.patch_dims();
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = 0; i < NumDims-1; ++i) {
m_dimensions[i] = patch_dims[i];
num_patches *= (input_dims[i] - patch_dims[i] + 1);
}
m_dimensions[NumDims-1] = num_patches;
m_inputStrides[0] = 1;
m_patchStrides[0] = 1;
for (int i = 1; i < NumDims-1; ++i) {
m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
m_patchStrides[i] = m_patchStrides[i-1] * (input_dims[i-1] - patch_dims[i-1] + 1);
}
m_outputStrides[0] = 1;
for (int i = 1; i < NumDims; ++i) {
m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
}
} else {
for (int i = 0; i < NumDims-1; ++i) {
m_dimensions[i+1] = patch_dims[i];
num_patches *= (input_dims[i] - patch_dims[i] + 1);
}
m_dimensions[0] = num_patches;
m_inputStrides[NumDims-2] = 1;
m_patchStrides[NumDims-2] = 1;
for (int i = NumDims-3; i >= 0; --i) {
m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
m_patchStrides[i] = m_patchStrides[i+1] * (input_dims[i+1] - patch_dims[i+1] + 1);
}
m_outputStrides[NumDims-1] = 1;
for (int i = NumDims-2; i >= 0; --i) {
m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1];
}
}
}
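// Worked example of the bookkeeping above (values assumed for illustration):
// for a ColMajor 3 x 4 input with 2 x 2 patches, m_dimensions = (2, 2, 6),
// m_inputStrides = (1, 3), m_patchStrides = (1, 2) and
// m_outputStrides = (1, 2, 4).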
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
m_impl.evalSubExprsIfNeeded(NULL);
return true;
}
EIGEN_STRONG_INLINE void cleanup() {
m_impl.cleanup();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
{
Index output_stride_index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? NumDims - 1 : 0;
// Find the location of the first element of the patch.
Index patchIndex = index / m_outputStrides[output_stride_index];
// Find the offset of the element wrt the location of the first element.
Index patchOffset = index - patchIndex * m_outputStrides[output_stride_index];
Index inputIndex = 0;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
EIGEN_UNROLL_LOOP
for (int i = NumDims - 2; i > 0; --i) {
const Index patchIdx = patchIndex / m_patchStrides[i];
patchIndex -= patchIdx * m_patchStrides[i];
const Index offsetIdx = patchOffset / m_outputStrides[i];
patchOffset -= offsetIdx * m_outputStrides[i];
inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i];
}
} else {
EIGEN_UNROLL_LOOP
for (int i = 0; i < NumDims - 2; ++i) {
const Index patchIdx = patchIndex / m_patchStrides[i];
patchIndex -= patchIdx * m_patchStrides[i];
const Index offsetIdx = patchOffset / m_outputStrides[i+1];
patchOffset -= offsetIdx * m_outputStrides[i+1];
inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i];
}
}
inputIndex += (patchIndex + patchOffset);
return m_impl.coeff(inputIndex);
}
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
Index output_stride_index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? NumDims - 1 : 0;
Index indices[2] = {index, index + PacketSize - 1};
Index patchIndices[2] = {indices[0] / m_outputStrides[output_stride_index],
indices[1] / m_outputStrides[output_stride_index]};
Index patchOffsets[2] = {indices[0] - patchIndices[0] * m_outputStrides[output_stride_index],
indices[1] - patchIndices[1] * m_outputStrides[output_stride_index]};
Index inputIndices[2] = {0, 0};
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
EIGEN_UNROLL_LOOP
for (int i = NumDims - 2; i > 0; --i) {
const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i],
patchIndices[1] / m_patchStrides[i]};
patchIndices[0] -= patchIdx[0] * m_patchStrides[i];
patchIndices[1] -= patchIdx[1] * m_patchStrides[i];
const Index offsetIdx[2] = {patchOffsets[0] / m_outputStrides[i],
patchOffsets[1] / m_outputStrides[i]};
patchOffsets[0] -= offsetIdx[0] * m_outputStrides[i];
patchOffsets[1] -= offsetIdx[1] * m_outputStrides[i];
inputIndices[0] += (patchIdx[0] + offsetIdx[0]) * m_inputStrides[i];
inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i];
}
} else {
EIGEN_UNROLL_LOOP
for (int i = 0; i < NumDims - 2; ++i) {
const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i],
patchIndices[1] / m_patchStrides[i]};
patchIndices[0] -= patchIdx[0] * m_patchStrides[i];
patchIndices[1] -= patchIdx[1] * m_patchStrides[i];
const Index offsetIdx[2] = {patchOffsets[0] / m_outputStrides[i+1],
patchOffsets[1] / m_outputStrides[i+1]};
patchOffsets[0] -= offsetIdx[0] * m_outputStrides[i+1];
patchOffsets[1] -= offsetIdx[1] * m_outputStrides[i+1];
inputIndices[0] += (patchIdx[0] + offsetIdx[0]) * m_inputStrides[i];
inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i];
}
}
inputIndices[0] += (patchIndices[0] + patchOffsets[0]);
inputIndices[1] += (patchIndices[1] + patchOffsets[1]);
if (inputIndices[1] - inputIndices[0] == PacketSize - 1) {
PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]);
return rslt;
}
else {
EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize];
values[0] = m_impl.coeff(inputIndices[0]);
values[PacketSize-1] = m_impl.coeff(inputIndices[1]);
EIGEN_UNROLL_LOOP
for (int i = 1; i < PacketSize-1; ++i) {
values[i] = coeff(index+i);
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
return rslt;
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
const double compute_cost = NumDims * (TensorOpCost::DivCost<Index>() +
TensorOpCost::MulCost<Index>() +
2 * TensorOpCost::AddCost<Index>());
return m_impl.costPerCoeff(vectorized) +
TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
}
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif
protected:
Dimensions m_dimensions;
array<Index, NumDims> m_outputStrides;
array<Index, NumDims-1> m_inputStrides;
array<Index, NumDims-1> m_patchStrides;
TensorEvaluator<ArgType, Device> m_impl;
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_PATCH_H

View File

@@ -0,0 +1,322 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
// Copyright (C) 2018 Mehdi Goli <eigen@codeplay.com> Codeplay Software Ltd.
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
#define EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
namespace Eigen {
namespace internal {
namespace {
EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
#if defined(EIGEN_GPU_COMPILE_PHASE)
// We don't support 3d kernels since we currently only use 1 and
// 2d kernels.
gpu_assert(threadIdx.z == 0);
return blockIdx.x * blockDim.x + threadIdx.x
+ gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y);
#else
// Rely on Eigen's random implementation.
return random<uint64_t>();
#endif
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state, uint64_t stream) {
// TODO: Unify with the implementation in the non blocking thread pool.
uint64_t current = *state;
// Update the internal state
*state = current * 6364136223846793005ULL + (stream << 1 | 1);
// Generate the random output (using the PCG-XSH-RS scheme)
return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) {
seed = seed ? seed : get_random_seed();
return seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
}
} // namespace
template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
T RandomToTypeUniform(uint64_t* state, uint64_t stream) {
unsigned rnd = PCG_XSH_RS_generator(state, stream);
return static_cast<T>(rnd);
}
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state, uint64_t stream) {
// Generate 10 random bits for the mantissa, merge with exponent.
unsigned rnd = PCG_XSH_RS_generator(state, stream);
const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x3ffu) | (static_cast<uint16_t>(15) << 10);
Eigen::half result = Eigen::numext::bit_cast<Eigen::half>(half_bits);
// Return the final result
return result - Eigen::half(1.0f);
}
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Eigen::bfloat16 RandomToTypeUniform<Eigen::bfloat16>(uint64_t* state, uint64_t stream) {
// Generate 7 random bits for the mantissa, merge with exponent.
unsigned rnd = PCG_XSH_RS_generator(state, stream);
const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x7fu) | (static_cast<uint16_t>(127) << 7);
Eigen::bfloat16 result = Eigen::numext::bit_cast<Eigen::bfloat16>(half_bits);
// Return the final result
return result - Eigen::bfloat16(1.0f);
}
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float RandomToTypeUniform<float>(uint64_t* state, uint64_t stream) {
typedef union {
uint32_t raw;
float fp;
} internal;
internal result;
// Generate 23 random bits for the mantissa
const unsigned rnd = PCG_XSH_RS_generator(state, stream);
result.raw = rnd & 0x7fffffu;
// Set the exponent
result.raw |= (static_cast<uint32_t>(127) << 23);
// Return the final result
return result.fp - 1.0f;
}
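// Note on the bit trick above: with the exponent field forced to 127 and a
// random 23-bit mantissa, the bit pattern is a float uniformly distributed in
// [1, 2), so subtracting 1.0f yields a sample in [0, 1). The half, bfloat16
// and double specializations follow the same pattern with their own exponent
// bias and mantissa width.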
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double RandomToTypeUniform<double>(uint64_t* state, uint64_t stream) {
typedef union {
uint64_t raw;
double dp;
} internal;
internal result;
result.raw = 0;
// Generate 52 random bits for the mantissa
// First generate the upper 20 bits
unsigned rnd1 = PCG_XSH_RS_generator(state, stream) & 0xfffffu;
// Then generate the lower 32 bits
unsigned rnd2 = PCG_XSH_RS_generator(state, stream);
result.raw = (static_cast<uint64_t>(rnd1) << 32) | rnd2;
// Set the exponent
result.raw |= (static_cast<uint64_t>(1023) << 52);
// Return the final result
return result.dp - 1.0;
}
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state, uint64_t stream) {
return std::complex<float>(RandomToTypeUniform<float>(state, stream),
RandomToTypeUniform<float>(state, stream));
}
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state, uint64_t stream) {
return std::complex<double>(RandomToTypeUniform<double>(state, stream),
RandomToTypeUniform<double>(state, stream));
}
template <typename T> class UniformRandomGenerator {
public:
static const bool PacketAccess = true;
// Uses the given "seed" if non-zero, otherwise uses a random seed.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
uint64_t seed = 0) {
m_state = PCG_XSH_RS_state(seed);
#ifdef EIGEN_USE_SYCL
// In SYCL it is not possible to build the PCG_XSH_RS_state in one step, so
// m_state is initialized in two stages. The constructor of the functor is
// called on the CPU, where the clock-based seed is obtained; that seed is
// therefore the same for every thread. Unlike CUDA, the thread id, block id,
// etc. are not available here as global functions: they only become available
// inside operator() (which runs on the device).
// For CUDA, (((CLOCK + global_thread_id) * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL)
// is passed to each thread, whereas for SYCL ((CLOCK * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL)
// is passed to each thread, and each thread adds (global_thread_id * 6364136223846793005ULL)
// for itself only once, inside operator(), to complete the construction of a
// unique m_state per thread, mirroring the CUDA behaviour.
m_exec_once = false;
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
const UniformRandomGenerator& other) {
m_state = other.m_state;
#ifdef EIGEN_USE_SYCL
m_exec_once =other.m_exec_once;
#endif
}
template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
T operator()(Index i) const {
#ifdef EIGEN_USE_SYCL
if(!m_exec_once) {
// This is the second stage: add the thread id to the CPU clock seed to build a unique seed per thread
// The (i * 6364136223846793005ULL) is the remaining part of the PCG_XSH_RS_state on the GPU side
m_state += (i * 6364136223846793005ULL);
m_exec_once =true;
}
#endif
T result = RandomToTypeUniform<T>(&m_state, i);
return result;
}
template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Packet packetOp(Index i) const {
const int packetSize = internal::unpacket_traits<Packet>::size;
EIGEN_ALIGN_MAX T values[packetSize];
#ifdef EIGEN_USE_SYCL
if(!m_exec_once) {
// This is the second stage: add the thread id to the CPU clock seed to build a unique seed per thread
m_state += (i * 6364136223846793005ULL);
m_exec_once =true;
}
#endif
EIGEN_UNROLL_LOOP
for (int j = 0; j < packetSize; ++j) {
values[j] = RandomToTypeUniform<T>(&m_state, i);
}
return internal::pload<Packet>(values);
}
private:
mutable uint64_t m_state;
#ifdef EIGEN_USE_SYCL
mutable bool m_exec_once;
#endif
};
template <typename Scalar>
struct functor_traits<UniformRandomGenerator<Scalar> > {
enum {
// Rough estimate for floating point, multiplied by ceil(sizeof(T) / sizeof(float)).
Cost = 12 * NumTraits<Scalar>::AddCost *
((sizeof(Scalar) + sizeof(float) - 1) / sizeof(float)),
PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess
};
};
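// A hedged usage sketch (assumed API surface, for illustration only): the
// generator can be plugged into the templated setRandom() of a tensor, e.g.
//
//   Eigen::Tensor<float, 2> t(64, 64);
//   t.setRandom<Eigen::internal::UniformRandomGenerator<float>>();
//
// which fills t with samples drawn uniformly from [0, 1).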
template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
T RandomToTypeNormal(uint64_t* state, uint64_t stream) {
// Use the ratio-of-uniforms method to generate numbers following a normal
// distribution. See for example Numerical Recipes chapter 7.3.9 for the
// details.
T u, v, q;
do {
u = RandomToTypeUniform<T>(state, stream);
v = T(1.7156) * (RandomToTypeUniform<T>(state, stream) - T(0.5));
const T x = u - T(0.449871);
const T y = numext::abs(v) + T(0.386595);
q = x*x + y * (T(0.196)*y - T(0.25472)*x);
} while (q > T(0.27597) &&
(q > T(0.27846) || v*v > T(-4) * numext::log(u) * u*u));
return v/u;
}
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state, uint64_t stream) {
return std::complex<float>(RandomToTypeNormal<float>(state, stream),
RandomToTypeNormal<float>(state, stream));
}
template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state, uint64_t stream) {
return std::complex<double>(RandomToTypeNormal<double>(state, stream),
RandomToTypeNormal<double>(state, stream));
}
template <typename T> class NormalRandomGenerator {
public:
static const bool PacketAccess = true;
// Uses the given "seed" if non-zero, otherwise uses a random seed.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) {
m_state = PCG_XSH_RS_state(seed);
#ifdef EIGEN_USE_SYCL
// In SYCL it is not possible to build the PCG_XSH_RS_state in one step, so
// m_state is initialized in two stages. The constructor of the functor is
// called on the CPU, where the clock-based seed is obtained; that seed is
// therefore the same for every thread. Unlike CUDA, the thread id, block id,
// etc. are not available here as global functions: they only become available
// inside operator() (which runs on the device). Therefore the thread id cannot
// be injected at this stage; inside operator() we add the thread id (which is
// what i stands for) to the seed and construct a unique m_state per thread,
// similar to CUDA.
m_exec_once = false;
#endif
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(
const NormalRandomGenerator& other) {
m_state = other.m_state;
#ifdef EIGEN_USE_SYCL
m_exec_once=other.m_exec_once;
#endif
}
template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
T operator()(Index i) const {
#ifdef EIGEN_USE_SYCL
if(!m_exec_once) {
// This is the second stage: add the thread id to the CPU clock seed to build a unique seed per thread
m_state += (i * 6364136223846793005ULL);
m_exec_once =true;
}
#endif
T result = RandomToTypeNormal<T>(&m_state, i);
return result;
}
template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
Packet packetOp(Index i) const {
const int packetSize = internal::unpacket_traits<Packet>::size;
EIGEN_ALIGN_MAX T values[packetSize];
#ifdef EIGEN_USE_SYCL
if(!m_exec_once) {
// This is the second stage: add the thread id to the CPU clock seed to build a unique seed per thread
m_state += (i * 6364136223846793005ULL);
m_exec_once =true;
}
#endif
EIGEN_UNROLL_LOOP
for (int j = 0; j < packetSize; ++j) {
values[j] = RandomToTypeNormal<T>(&m_state, i);
}
return internal::pload<Packet>(values);
}
private:
mutable uint64_t m_state;
#ifdef EIGEN_USE_SYCL
mutable bool m_exec_once;
#endif
};
template <typename Scalar>
struct functor_traits<NormalRandomGenerator<Scalar> > {
enum {
// On average, we need to generate about 3 random numbers
// 15 mul, 8 add, 1.5 logs
Cost = 3 * functor_traits<UniformRandomGenerator<Scalar> >::Cost +
15 * NumTraits<Scalar>::AddCost + 8 * NumTraits<Scalar>::AddCost +
3 * functor_traits<scalar_log_op<Scalar> >::Cost / 2,
PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess
};
};
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H

View File

@@ -0,0 +1,998 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
// Copyright (C) 2016 Mehdi Goli, Codeplay Software Ltd <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
// clang is incompatible with the CUDA syntax wrt making a kernel a class friend,
// so we'll use a macro to make clang happy.
#ifndef KERNEL_FRIEND
#if defined(__clang__) && (defined(__CUDA__) || defined(__HIP__))
#define KERNEL_FRIEND friend __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024
#else
#define KERNEL_FRIEND friend
#endif
#endif
namespace Eigen {
/** \class TensorReduction
 * \ingroup CXX11_Tensor_Module
 *
 * \brief Tensor reduction class.
 *
 * Applies the reduction functor Op along the dimensions listed in Dims; the
 * rank of the result is the rank of the input minus the number of reduced
 * dimensions.
 */
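// A hedged usage sketch (assumed API surface, for illustration only):
// reductions are normally created through the reduction helpers of
// TensorBase, e.g.
//
//   Eigen::Tensor<float, 3> t(2, 3, 4);
//   t.setRandom();
//   Eigen::array<int, 1> dims{{1}};
//   Eigen::Tensor<float, 2> s = t.sum(dims);       // 2 x 4 result
//   Eigen::Tensor<float, 2> m = t.maximum(dims);   // 2 x 4 result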
namespace internal {
template<typename Op, typename Dims, typename XprType,template <class> class MakePointer_ >
struct traits<TensorReductionOp<Op, Dims, XprType, MakePointer_> >
: traits<XprType>
{
typedef traits<XprType> XprTraits;
typedef typename XprTraits::Scalar Scalar;
typedef typename XprTraits::StorageKind StorageKind;
typedef typename XprTraits::Index Index;
typedef typename XprType::Nested Nested;
static const int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
static const int Layout = XprTraits::Layout;
typedef typename XprTraits::PointerType PointerType;
template <class T> struct MakePointer {
// Intermediate typedef to work around an MSVC issue.
typedef MakePointer_<T> MakePointerT;
typedef typename MakePointerT::Type Type;
};
};
template<typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
struct eval<TensorReductionOp<Op, Dims, XprType, MakePointer_>, Eigen::Dense>
{
typedef const TensorReductionOp<Op, Dims, XprType, MakePointer_>& type;
};
template<typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
struct nested<TensorReductionOp<Op, Dims, XprType, MakePointer_>, 1, typename eval<TensorReductionOp<Op, Dims, XprType, MakePointer_> >::type>
{
typedef TensorReductionOp<Op, Dims, XprType, MakePointer_> type;
};
template <typename OutputDims> struct DimInitializer {
template <typename InputDims, typename ReducedDims> EIGEN_DEVICE_FUNC
static void run(const InputDims& input_dims,
const array<bool, internal::array_size<InputDims>::value>& reduced,
OutputDims* output_dims, ReducedDims* reduced_dims) {
const int NumInputDims = internal::array_size<InputDims>::value;
int outputIndex = 0;
int reduceIndex = 0;
for (int i = 0; i < NumInputDims; ++i) {
if (reduced[i]) {
(*reduced_dims)[reduceIndex] = input_dims[i];
++reduceIndex;
} else {
(*output_dims)[outputIndex] = input_dims[i];
++outputIndex;
}
}
}
};
template <> struct DimInitializer<Sizes<> > {
template <typename InputDims, typename Index, size_t Rank> EIGEN_DEVICE_FUNC
static void run(const InputDims& input_dims, const array<bool, Rank>&,
Sizes<>*, array<Index, Rank>* reduced_dims) {
const int NumInputDims = internal::array_size<InputDims>::value;
for (int i = 0; i < NumInputDims; ++i) {
(*reduced_dims)[i] = input_dims[i];
}
}
};
template <typename ReducedDims, int NumTensorDims, int Layout>
struct are_inner_most_dims {
static const bool value = false;
};
template <typename ReducedDims, int NumTensorDims, int Layout>
struct preserve_inner_most_dims {
static const bool value = false;
};
#if EIGEN_HAS_CONSTEXPR && EIGEN_HAS_VARIADIC_TEMPLATES
template <typename ReducedDims, int NumTensorDims>
struct are_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{
static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
static const bool tmp2 = index_statically_eq<ReducedDims>(0, 0);
static const bool tmp3 = index_statically_eq<ReducedDims>(array_size<ReducedDims>::value-1, array_size<ReducedDims>::value-1);
static const bool value = tmp1 & tmp2 & tmp3;
};
template <typename ReducedDims, int NumTensorDims>
struct are_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{
static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
static const bool tmp2 = index_statically_eq<ReducedDims>(0, NumTensorDims - array_size<ReducedDims>::value);
static const bool tmp3 = index_statically_eq<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
static const bool value = tmp1 & tmp2 & tmp3;
};
template <typename ReducedDims, int NumTensorDims>
struct preserve_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{
static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
static const bool tmp2 = index_statically_gt<ReducedDims>(0, 0);
static const bool value = tmp1 & tmp2;
};
template <typename ReducedDims, int NumTensorDims>
struct preserve_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{
static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
static const bool tmp2 = index_statically_lt<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
static const bool value = tmp1 & tmp2;
};
#endif
template <int DimIndex, typename Self, typename Op>
struct GenericDimReducer {
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) {
EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
GenericDimReducer<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
}
}
};
template <typename Self, typename Op>
struct GenericDimReducer<0, Self, Op> {
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) {
for (int j = 0; j < self.m_reducedDims[0]; ++j) {
const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
reducer.reduce(self.m_impl.coeff(input), accum);
}
}
};
template <typename Self, typename Op>
struct GenericDimReducer<-1, Self, Op> {
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index index, Op& reducer, typename Self::CoeffReturnType* accum) {
reducer.reduce(self.m_impl.coeff(index), accum);
}
};
template <typename Self, typename Op, bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess),
bool UseTreeReduction = (!Self::ReducerTraits::IsStateful &&
!Self::ReducerTraits::IsExactlyAssociative)>
struct InnerMostDimReducer {
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
typename Self::CoeffReturnType accum = reducer.initialize();
for (typename Self::Index j = 0; j < numValuesToReduce; ++j) {
reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
}
return reducer.finalize(accum);
}
};
template <typename Self, typename Op>
struct InnerMostDimReducer<Self, Op, true, false> {
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
const typename Self::Index packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
const typename Self::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize;
typename Self::PacketReturnType paccum = reducer.template initializePacket<typename Self::PacketReturnType>();
for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) {
reducer.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum);
}
typename Self::CoeffReturnType accum = reducer.initialize();
for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; ++j) {
reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
}
return reducer.finalizeBoth(accum, paccum);
}
};
#if !defined(EIGEN_HIPCC)
static const int kLeafSize = 1024;
template <typename Self, typename Op>
struct InnerMostDimReducer<Self, Op, false, true> {
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType
reduce(const Self& self, typename Self::Index firstIndex,
typename Self::Index numValuesToReduce, Op& reducer) {
typename Self::CoeffReturnType accum = reducer.initialize();
if (numValuesToReduce > kLeafSize) {
const typename Self::Index half = numValuesToReduce / 2;
reducer.reduce(reduce(self, firstIndex, half, reducer), &accum);
reducer.reduce(
reduce(self, firstIndex + half, numValuesToReduce - half, reducer),
&accum);
} else {
for (typename Self::Index j = 0; j < numValuesToReduce; ++j) {
reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
}
}
return reducer.finalize(accum);
}
};
template <typename Self, typename Op>
struct InnerMostDimReducer<Self, Op, true, true> {
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType
reduce(const Self& self, typename Self::Index firstIndex,
typename Self::Index numValuesToReduce, Op& reducer) {
const typename Self::Index packetSize =
internal::unpacket_traits<typename Self::PacketReturnType>::size;
typename Self::CoeffReturnType accum = reducer.initialize();
if (numValuesToReduce > packetSize * kLeafSize) {
// Make sure the split point is aligned on a packet boundary.
const typename Self::Index split =
packetSize *
divup(firstIndex + divup(numValuesToReduce, typename Self::Index(2)),
packetSize);
const typename Self::Index num_left =
numext::mini(split - firstIndex, numValuesToReduce);
reducer.reduce(reduce(self, firstIndex, num_left, reducer), &accum);
if (num_left < numValuesToReduce) {
reducer.reduce(
reduce(self, split, numValuesToReduce - num_left, reducer), &accum);
}
return reducer.finalize(accum);
} else {
const typename Self::Index UnrollSize =
(numValuesToReduce / (2*packetSize)) * 2*packetSize;
const typename Self::Index VectorizedSize =
(numValuesToReduce / packetSize) * packetSize;
typename Self::PacketReturnType paccum =
reducer.template initializePacket<typename Self::PacketReturnType>();
typename Self::PacketReturnType paccum2 =
reducer.template initializePacket<typename Self::PacketReturnType>();
for (typename Self::Index j = 0; j < UnrollSize; j += packetSize * 2) {
reducer.reducePacket(
self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum);
reducer.reducePacket(
self.m_impl.template packet<Unaligned>(firstIndex + j + packetSize),
&paccum2);
}
for (typename Self::Index j = UnrollSize; j < VectorizedSize; j+= packetSize) {
reducer.reducePacket(self.m_impl.template packet<Unaligned>(
firstIndex + j), &paccum);
}
reducer.reducePacket(paccum2, &paccum);
for (typename Self::Index j = VectorizedSize; j < numValuesToReduce;
++j) {
reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
}
return reducer.finalizeBoth(accum, paccum);
}
}
};
#endif
template <int DimIndex, typename Self, typename Op, bool vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
struct InnerMostDimPreserver {
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
eigen_assert(false && "should never be called");
}
};
template <int DimIndex, typename Self, typename Op>
struct InnerMostDimPreserver<DimIndex, Self, Op, true> {
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
for (typename Self::Index j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
InnerMostDimPreserver<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum);
}
}
};
template <typename Self, typename Op>
struct InnerMostDimPreserver<0, Self, Op, true> {
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) {
for (typename Self::Index j = 0; j < self.m_reducedDims[0]; ++j) {
const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
reducer.reducePacket(self.m_impl.template packet<Unaligned>(input), accum);
}
}
};
template <typename Self, typename Op>
struct InnerMostDimPreserver<-1, Self, Op, true> {
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
eigen_assert(false && "should never be called");
}
};
// Default full reducer
template <typename Self, typename Op, typename Device, bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
struct FullReducer {
static const bool HasOptimizedImplementation = false;
static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::EvaluatorPointerType output) {
const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions());
*output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
}
};
#ifdef EIGEN_USE_THREADS
// Multithreaded full reducers
template <typename Self, typename Op,
bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
struct FullReducerShard {
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex,
typename Self::Index numValuesToReduce, Op& reducer,
typename Self::CoeffReturnType* output) {
*output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(
self, firstIndex, numValuesToReduce, reducer);
}
};
// Multithreaded full reducer
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful;
static const Index PacketSize =
unpacket_traits<typename Self::PacketReturnType>::size;
// launch one reducer per thread and accumulate the result.
static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device,
typename Self::CoeffReturnType* output) {
typedef typename Self::Index Index;
const Index num_coeffs = array_prod(self.m_impl.dimensions());
if (num_coeffs == 0) {
*output = reducer.finalize(reducer.initialize());
return;
}
const TensorOpCost cost =
self.m_impl.costPerCoeff(Vectorizable) +
TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable,
PacketSize);
const int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
num_coeffs, cost, device.numThreads());
if (num_threads == 1) {
*output =
InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
return;
}
const Index blocksize =
std::floor<Index>(static_cast<float>(num_coeffs) / num_threads);
const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
eigen_assert(num_coeffs >= numblocks * blocksize);
Barrier barrier(internal::convert_index<unsigned int>(numblocks));
MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
for (Index i = 0; i < numblocks; ++i) {
device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, Vectorizable>::run,
self, i * blocksize, blocksize, reducer,
&shards[i]);
}
typename Self::CoeffReturnType finalShard;
if (numblocks * blocksize < num_coeffs) {
finalShard = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(
self, numblocks * blocksize, num_coeffs - numblocks * blocksize,
reducer);
} else {
finalShard = reducer.initialize();
}
barrier.Wait();
for (Index i = 0; i < numblocks; ++i) {
reducer.reduce(shards[i], &finalShard);
}
*output = reducer.finalize(finalShard);
}
};
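// Usage sketch (illustrative, not part of this header): a full reduction such as Tensor::sum()
// with no preserved dimensions is evaluated through the multithreaded FullReducer above when the
// expression is assigned on a ThreadPoolDevice. The pool size and tensor shape below are
// arbitrary example values.
// \code
// #define EIGEN_USE_THREADS
// #include <unsupported/Eigen/CXX11/Tensor>
//
// Eigen::ThreadPool pool(4);                 // worker threads
// Eigen::ThreadPoolDevice device(&pool, 4);
// Eigen::Tensor<float, 2> in(1000, 1000);
// in.setRandom();
// Eigen::Tensor<float, 0> out;               // rank-0 result of a full reduction
// out.device(device) = in.sum();             // dispatches to FullReducer<..., ThreadPoolDevice>
// float total = out();
// \endcode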
#endif
// Default inner reducer
template <typename Self, typename Op, typename Device>
struct InnerReducer {
static const bool HasOptimizedImplementation = false;
EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
eigen_assert(false && "Not implemented");
return true;
}
};
// Default outer reducer
template <typename Self, typename Op, typename Device>
struct OuterReducer {
static const bool HasOptimizedImplementation = false;
EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
eigen_assert(false && "Not implemented");
return true;
}
};
#ifdef EIGEN_USE_SYCL
// Default Generic reducer
template <typename Self, typename Op, typename Device>
struct GenericReducer {
static const bool HasOptimizedImplementation = false;
EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
eigen_assert(false && "Not implemented");
return true;
}
};
#endif
#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
template <int B, int N, typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
#if defined(EIGEN_HAS_GPU_FP16)
template <typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits<half>::type*);
template <int B, int N, typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits<half>::type*);
template <int NPT, typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
#endif
template <int NPT, typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
template <int NPT, typename S, typename R, typename I_>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
#endif
/**
* For SYCL, the return type of the reduction is deduced from the initialize method of the given Op.
* This allows the reduction to have a different type for the accumulator than the input data type.
* If this is the case, the functor needs to provide two reduce methods: one for reducing an element of the input
* with the accumulator, and one for reducing two accumulators.
* Such a reducer can be useful for instance when the accumulator is a boolean or a bitset that checks for
* some properties of the input.
*/
template <typename Op, typename CoeffReturnType>
struct ReductionReturnType {
#if defined(EIGEN_USE_SYCL)
typedef typename remove_const<decltype(std::declval<Op>().initialize())>::type type;
#else
typedef typename remove_const<CoeffReturnType>::type type;
#endif
};
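// Illustrative sketch (not part of Eigen): a reducer whose accumulator type (bool) differs from
// its input type T, as described in the comment above. Because the accumulator and the input have
// different types, the functor provides the two reduce() overloads: one folds an input element
// into the accumulator, the other folds two accumulators together. The name
// ExampleAllPositiveReducer is hypothetical and only demonstrates the expected interface.
template <typename T>
struct ExampleAllPositiveReducer {
  EIGEN_DEVICE_FUNC bool initialize() const { return true; }
  // Fold one input coefficient into the boolean accumulator.
  EIGEN_DEVICE_FUNC void reduce(const T& t, bool* accum) const { *accum = *accum && (t > T(0)); }
  // Fold two boolean accumulators together.
  EIGEN_DEVICE_FUNC void reduce(const bool& b, bool* accum) const { *accum = *accum && b; }
  EIGEN_DEVICE_FUNC bool finalize(const bool accum) const { return accum; }
};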
} // end namespace internal
template <typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType, MakePointer_>, ReadOnlyAccessors> {
public:
typedef typename Eigen::internal::traits<TensorReductionOp>::Scalar Scalar;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
typedef typename Eigen::internal::nested<TensorReductionOp>::type Nested;
typedef typename Eigen::internal::traits<TensorReductionOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorReductionOp>::Index Index;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
TensorReductionOp(const XprType& expr, const Dims& dims) : m_expr(expr), m_dims(dims)
{ }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
TensorReductionOp(const XprType& expr, const Dims& dims, const Op& reducer) : m_expr(expr), m_dims(dims), m_reducer(reducer)
{ }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const XprType& expression() const { return m_expr; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const Dims& dims() const { return m_dims; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const Op& reducer() const { return m_reducer; }
protected:
typename XprType::Nested m_expr;
const Dims m_dims;
const Op m_reducer;
};
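// Usage sketch (illustrative): TensorReductionOp is normally not instantiated directly; it is
// created by the reduction helpers on TensorBase such as sum(), maximum() or reduce(). The shapes
// and the reduced dimension below are arbitrary example values.
// \code
// #include <unsupported/Eigen/CXX11/Tensor>
//
// Eigen::Tensor<float, 3> t(2, 3, 4);
// t.setRandom();
// Eigen::array<Eigen::Index, 1> dims;
// dims[0] = 1;                               // reduce the middle dimension
// Eigen::Tensor<float, 2> r = t.sum(dims);   // builds a TensorReductionOp with a SumReducer
// \endcode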
template<typename ArgType, typename Device>
struct TensorReductionEvaluatorBase;
// Eval as rvalue
template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
{
typedef internal::reducer_traits<Op, Device> ReducerTraits;
typedef Dims ReducedDims;
typedef TensorReductionOp<Op, Dims, ArgType, MakePointer_> XprType;
typedef typename XprType::Index Index;
typedef ArgType ChildType;
typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
static const int NumInputDims = internal::array_size<InputDimensions>::value;
static const int NumReducedDims = internal::array_size<Dims>::value;
static const int NumOutputDims = NumInputDims - NumReducedDims;
typedef typename internal::conditional<NumOutputDims==0, Sizes<>, DSizes<Index, NumOutputDims> >::type Dimensions;
typedef typename XprType::Scalar Scalar;
typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Self;
static const bool InputPacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess;
typedef typename internal::ReductionReturnType<Op, typename XprType::CoeffReturnType>::type CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const Index PacketSize = PacketType<CoeffReturnType, Device>::size;
typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
typedef StorageMemory<CoeffReturnType, Device> Storage;
typedef typename Storage::Type EvaluatorPointerType;
// Subset of strides of the input tensor for the non-reduced dimensions.
// Indexed by output dimensions.
static const int NumPreservedStrides = max_n_1<NumOutputDims>::size;
enum {
IsAligned = false,
PacketAccess = Self::InputPacketAccess && ReducerTraits::PacketAccess,
BlockAccess = false,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlock;
//===--------------------------------------------------------------------===//
static const bool ReducingInnerMostDims = internal::are_inner_most_dims<Dims, NumInputDims, Layout>::value;
static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims<Dims, NumInputDims, Layout>::value;
static const bool RunningFullReduction = (NumOutputDims==0);
EIGEN_STRONG_INLINE TensorReductionEvaluatorBase(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device)
{
EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)),
YOU_MADE_A_PROGRAMMING_MISTAKE);
// Build the bitmap indicating if an input dimension is reduced or not.
for (int i = 0; i < NumInputDims; ++i) {
m_reduced[i] = false;
}
for (int i = 0; i < NumReducedDims; ++i) {
eigen_assert(op.dims()[i] >= 0);
eigen_assert(op.dims()[i] < NumInputDims);
m_reduced[op.dims()[i]] = true;
}
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
internal::DimInitializer<Dimensions>::run(input_dims, m_reduced, &m_dimensions, &m_reducedDims);
// Precompute output strides.
if (NumOutputDims > 0) {
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
m_outputStrides[0] = 1;
for (int i = 1; i < NumOutputDims; ++i) {
m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
}
} else {
m_outputStrides[NumOutputDims - 1] = 1;
for (int i = NumOutputDims - 2; i >= 0; --i) {
m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
}
}
}
// Precompute input strides.
if (NumInputDims > 0) {
array<Index, NumInputDims> input_strides;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
input_strides[0] = 1;
for (int i = 1; i < NumInputDims; ++i) {
input_strides[i] = input_strides[i-1] * input_dims[i-1];
}
} else {
input_strides.back() = 1;
for (int i = NumInputDims - 2; i >= 0; --i) {
input_strides[i] = input_strides[i + 1] * input_dims[i + 1];
}
}
int outputIndex = 0;
int reduceIndex = 0;
for (int i = 0; i < NumInputDims; ++i) {
if (m_reduced[i]) {
m_reducedStrides[reduceIndex] = input_strides[i];
++reduceIndex;
} else {
m_preservedStrides[outputIndex] = input_strides[i];
m_output_to_input_dim_map[outputIndex] = i;
++outputIndex;
}
}
}
// Special case for full reductions
if (NumOutputDims == 0) {
m_preservedStrides[0] = internal::array_prod(input_dims);
}
m_numValuesToReduce =
NumOutputDims == 0
? internal::array_prod(input_dims)
: (static_cast<int>(Layout) == static_cast<int>(ColMajor))
? m_preservedStrides[0]
: m_preservedStrides[NumOutputDims - 1];
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_STRONG_INLINE
bool evalSubExprsIfNeededCommon(EvaluatorPointerType data) {
// Use the FullReducer if possible.
if ((RunningFullReduction && RunningOnSycl) ||
    (RunningFullReduction &&
     internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
     ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || !RunningOnGPU))) {
bool need_assign = false;
if (!data) {
m_result = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType))));
data = m_result;
need_assign = true;
}
Op reducer(m_reducer);
internal::FullReducer<Self, Op, Device>::run(*this, reducer, m_device, data);
return need_assign;
}
// Attempt to use an optimized reduction.
else if ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || (RunningOnSycl)) {
bool reducing_inner_dims = true;
for (int i = 0; i < NumReducedDims; ++i) {
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
reducing_inner_dims &= m_reduced[i];
} else {
reducing_inner_dims &= m_reduced[NumInputDims - 1 - i];
}
}
if (internal::InnerReducer<Self, Op, Device>::HasOptimizedImplementation &&
(reducing_inner_dims || ReducingInnerMostDims)) {
const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
if (!data) {
if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) || (RunningOnSycl)) {
data = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
m_result = data;
}
else {
return true;
}
}
Op reducer(m_reducer);
// For SYCL this call always returns false
if (internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
if (m_result) {
m_device.deallocate_temp(m_result);
m_result = NULL;
}
return true;
} else {
return (m_result != NULL);
}
}
bool preserving_inner_dims = true;
for (int i = 0; i < NumReducedDims; ++i) {
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
preserving_inner_dims &= m_reduced[NumInputDims - 1 - i];
} else {
preserving_inner_dims &= m_reduced[i];
}
}
if (internal::OuterReducer<Self, Op, Device>::HasOptimizedImplementation &&
preserving_inner_dims) {
const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
if (!data) {
if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) || (RunningOnSycl)) {
data = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
m_result = data;
}
else {
return true;
}
}
Op reducer(m_reducer);
// For SYCL this call always returns false
if (internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
if (m_result) {
m_device.deallocate_temp(m_result);
m_result = NULL;
}
return true;
} else {
return (m_result != NULL);
}
}
#if defined(EIGEN_USE_SYCL)
// If there is no optimised version for SYCL, the reduction expression
// must be broken into two sub-expressions and evaluated with the generic SYCL reducer on the device.
if(RunningOnSycl) {
const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
if (!data) {
data = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
m_result = data;
}
Op reducer(m_reducer);
internal::GenericReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve);
return (m_result != NULL);
}
#endif
}
return true;
}
#ifdef EIGEN_USE_THREADS
template <typename EvalSubExprsCallback>
EIGEN_STRONG_INLINE
void
evalSubExprsIfNeededAsync(EvaluatorPointerType data,
EvalSubExprsCallback done) {
m_impl.evalSubExprsIfNeededAsync(NULL, [this, data, done](bool) {
done(evalSubExprsIfNeededCommon(data));
});
}
#endif
EIGEN_STRONG_INLINE
bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
m_impl.evalSubExprsIfNeeded(NULL);
return evalSubExprsIfNeededCommon(data);
}
EIGEN_STRONG_INLINE void cleanup() {
m_impl.cleanup();
if (m_result) {
m_device.deallocate_temp(m_result);
m_result = NULL;
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
{
if ((RunningFullReduction || RunningOnGPU) && m_result) {
return *(m_result + index);
}
Op reducer(m_reducer);
if (ReducingInnerMostDims || RunningFullReduction) {
const Index num_values_to_reduce =
(static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1];
return internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstInput(index),
num_values_to_reduce, reducer);
} else {
typename Self::CoeffReturnType accum = reducer.initialize();
internal::GenericDimReducer<NumReducedDims-1, Self, Op>::reduce(*this, firstInput(index), reducer, &accum);
return reducer.finalize(accum);
}
}
// TODO(bsteiner): provide a more efficient implementation.
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index + PacketSize - 1 < Index(internal::array_prod(dimensions())));
if (RunningOnGPU && m_result) {
return internal::pload<PacketReturnType>(m_result + index);
}
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
if (ReducingInnerMostDims) {
const Index num_values_to_reduce =
(static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_preservedStrides[0] : m_preservedStrides[NumPreservedStrides - 1];
const Index firstIndex = firstInput(index);
for (Index i = 0; i < PacketSize; ++i) {
Op reducer(m_reducer);
values[i] = internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstIndex + i * num_values_to_reduce,
num_values_to_reduce, reducer);
}
} else if (PreservingInnerMostDims) {
const Index firstIndex = firstInput(index);
const int innermost_dim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : NumOutputDims - 1;
// TBD: extend this to the n innermost dimensions that we preserve.
if (((firstIndex % m_dimensions[innermost_dim]) + PacketSize - 1) < m_dimensions[innermost_dim]) {
Op reducer(m_reducer);
typename Self::PacketReturnType accum = reducer.template initializePacket<typename Self::PacketReturnType>();
internal::InnerMostDimPreserver<NumReducedDims-1, Self, Op>::reduce(*this, firstIndex, reducer, &accum);
return reducer.finalizePacket(accum);
} else {
for (int i = 0; i < PacketSize; ++i) {
values[i] = coeff(index + i);
}
}
} else {
for (int i = 0; i < PacketSize; ++i) {
values[i] = coeff(index + i);
}
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
return rslt;
}
// Must be called after evalSubExprsIfNeeded().
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
if (RunningFullReduction && m_result) {
return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
} else {
const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
const double compute_cost = num_values_to_reduce * internal::functor_traits<Op>::Cost;
return m_impl.costPerCoeff(vectorized) * num_values_to_reduce +
TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
}
}
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
EIGEN_DEVICE_FUNC const Device& device() const { return m_device; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
m_result.bind(cgh);
}
#endif
private:
template <int, typename, typename> friend struct internal::GenericDimReducer;
template <typename, typename, bool, bool> friend struct internal::InnerMostDimReducer;
template <int, typename, typename, bool> friend struct internal::InnerMostDimPreserver;
template <typename S, typename O, typename D, bool V> friend struct internal::FullReducer;
#ifdef EIGEN_USE_THREADS
template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
#endif
#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
template <int B, int N, typename S, typename R, typename I_> KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
#if defined(EIGEN_HAS_GPU_FP16)
template <typename S, typename R, typename I_> KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits<Eigen::half>::type*);
template <int B, int N, typename S, typename R, typename I_> KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits<Eigen::half>::type*);
template <int NPT, typename S, typename R, typename I_> KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
#endif
template <int NPT, typename S, typename R, typename I_> KERNEL_FRIEND void internal::InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
template <int NPT, typename S, typename R, typename I_> KERNEL_FRIEND void internal::OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
#endif
#if defined(EIGEN_USE_SYCL)
template < typename Evaluator_, typename Op__> friend class TensorSycl::internal::GenericNondeterministicReducer;
// SYCL needs the generic reducer for the case where the reduction algorithm is neither an inner, outer, nor full reduction
template <typename, typename, typename> friend struct internal::GenericReducer;
#endif
template <typename S, typename O, typename D> friend struct internal::InnerReducer;
struct BlockIteratorState {
Index input_dim;
Index output_size;
Index output_count;
};
// Returns the Index in the input tensor of the first value that needs to be
// used to compute the reduction at output index "index".
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
if (ReducingInnerMostDims) {
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
return index * m_preservedStrides[0];
} else {
return index * m_preservedStrides[NumPreservedStrides - 1];
}
}
// TBD: optimize the case where we preserve the innermost dimensions.
Index startInput = 0;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = NumOutputDims - 1; i > 0; --i) {
// This is index_i in the output tensor.
const Index idx = index / m_outputStrides[i];
startInput += idx * m_preservedStrides[i];
index -= idx * m_outputStrides[i];
}
if (PreservingInnerMostDims) {
eigen_assert(m_preservedStrides[0] == 1);
startInput += index;
} else {
startInput += index * m_preservedStrides[0];
}
} else {
for (int i = 0; i < NumOutputDims - 1; ++i) {
// This is index_i in the output tensor.
const Index idx = index / m_outputStrides[i];
startInput += idx * m_preservedStrides[i];
index -= idx * m_outputStrides[i];
}
if (PreservingInnerMostDims) {
eigen_assert(m_preservedStrides[NumPreservedStrides - 1] == 1);
startInput += index;
} else {
startInput += index * m_preservedStrides[NumPreservedStrides - 1];
}
}
return startInput;
}
// Bitmap indicating if an input dimension is reduced or not.
array<bool, NumInputDims> m_reduced;
// Dimensions of the output of the operation.
Dimensions m_dimensions;
// Precomputed strides for the output tensor.
array<Index, NumOutputDims> m_outputStrides;
array<internal::TensorIntDivisor<Index>, NumOutputDims> m_fastOutputStrides;
array<Index, NumPreservedStrides> m_preservedStrides;
// Map from output to input dimension index.
array<Index, NumOutputDims> m_output_to_input_dim_map;
// How many values go into each reduction
Index m_numValuesToReduce;
// Subset of strides of the input tensor for the reduced dimensions.
// Indexed by reduced dimensions.
array<Index, NumReducedDims> m_reducedStrides;
// Size of the input dimensions that are reduced.
// Indexed by reduced dimensions.
array<Index, NumReducedDims> m_reducedDims;
// Evaluator for the input expression.
TensorEvaluator<ArgType, Device> m_impl;
// Operation to apply for computing the reduction.
Op m_reducer;
// For full reductions
#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
static const bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value;
static const bool RunningOnSycl = false;
#elif defined(EIGEN_USE_SYCL)
static const bool RunningOnSycl = internal::is_same<typename internal::remove_all<Device>::type, Eigen::SyclDevice>::value;
static const bool RunningOnGPU = false;
#else
static const bool RunningOnGPU = false;
static const bool RunningOnSycl = false;
#endif
EvaluatorPointerType m_result;
const Device EIGEN_DEVICE_REF m_device;
};
template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
: public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> {
typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Base;
EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Device& device) : Base(op, device){}
};
template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_>
struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice>
: public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice> {
typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice> Base;
EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Eigen::SyclDevice& device) : Base(op, device){}
// The coeff function in the base class uses a recursive method, which is not standard layout and cannot be used in a SYCL kernel.
// Therefore the coeff function is overridden here for the SYCL kernel.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::CoeffReturnType coeff(typename Base::Index index) const {
return *(this->data() + index);
}
// The packet function in the base class uses a recursive method, which is not standard layout and cannot be used in a SYCL kernel.
// Therefore the packet function is overridden here for the SYCL kernel.
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::PacketReturnType packet(typename Base::Index index) const {
return internal::pload<typename Base::PacketReturnType>(this->data() + index);
}
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H

View File

@@ -0,0 +1,6 @@
#if defined(__clang__) || defined(__GNUC__)
#warning "Deprecated header file, please either include the main Eigen/CXX11/Tensor header or the respective TensorReductionGpu.h file"
#endif
#include "TensorReductionGpu.h"

View File

@@ -0,0 +1,966 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
namespace Eigen {
namespace internal {
#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
// Full reducers for GPU, don't vectorize for now
// Reducer function that enables multiple GPU threads to safely accumulate at the same
// output address. It reads the current value of the output variable and attempts to
// update it with the new value. If another GPU thread has updated the content of the
// output address in the meantime, it tries again.
template <typename T, typename R>
__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
if (sizeof(T) == 4)
{
unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
unsigned int newval = oldval;
reducer.reduce(accum, reinterpret_cast<T*>(&newval));
if (newval == oldval) {
return;
}
unsigned int readback;
while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
oldval = readback;
newval = oldval;
reducer.reduce(accum, reinterpret_cast<T*>(&newval));
if (newval == oldval) {
return;
}
}
}
else if (sizeof(T) == 8) {
unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
unsigned long long newval = oldval;
reducer.reduce(accum, reinterpret_cast<T*>(&newval));
if (newval == oldval) {
return;
}
unsigned long long readback;
while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
oldval = readback;
newval = oldval;
reducer.reduce(accum, reinterpret_cast<T*>(&newval));
if (newval == oldval) {
return;
}
}
}
else {
gpu_assert(0 && "Wordsize not supported");
}
#else // EIGEN_CUDA_ARCH >= 300
gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif // EIGEN_CUDA_ARCH >= 300
}
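// Worked example (illustrative): with a MaxReducer<float>, suppose *output currently holds 1.0f
// and this thread's accum is 2.0f, so newval becomes 2.0f. If another thread has meanwhile
// written 5.0f to *output, the atomicCAS fails and returns 5.0f; the retry then computes
// newval = max(5.0f, 2.0f) = 5.0f, which equals oldval, so the loop exits without writing.
// The bit patterns are routed through unsigned int (or unsigned long long for 8-byte types)
// because atomicCAS operates on integer words.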
// We extend atomicExch to support extra data types
template <typename Type>
__device__ inline Type atomicExchCustom(Type* address, Type val) {
return atomicExch(address, val);
}
template <>
__device__ inline double atomicExchCustom(double* address, double val) {
unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
}
#ifdef EIGEN_HAS_GPU_FP16
template <typename R>
__device__ inline void atomicReduce(half2* output, half2 accum, R& reducer) {
unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
unsigned int newval = oldval;
reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
if (newval == oldval) {
return;
}
unsigned int readback;
while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
oldval = readback;
newval = oldval;
reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
if (newval == oldval) {
return;
}
}
}
// The reduction should be associative, since the reduction is not atomic over the wide vector but only atomic per half2 operation
template <typename R>
__device__ inline void atomicReduce(Packet4h2* output, Packet4h2 accum, R& reducer) {
half2* houtput = reinterpret_cast<half2*>(output);
half2* haccum = reinterpret_cast<half2*>(&accum);
for (int i = 0; i < 4; ++i) {
  atomicReduce(houtput + i, *(haccum + i), reducer);
}
}
#endif // EIGEN_HAS_GPU_FP16
template <>
__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
atomicAdd(output, accum);
#else // EIGEN_CUDA_ARCH >= 300
gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif // EIGEN_CUDA_ARCH >= 300
}
template <typename CoeffType, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
const Index num_threads = blockDim.x * gridDim.x;
for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
output[i] = val;
}
}
template <int BlockSize, int NumPerThread, typename Self,
typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
typename Self::CoeffReturnType* output, unsigned int* semaphore) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
// Initialize the output value
const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
if (gridDim.x == 1) {
if (first_index == 0) {
*output = reducer.initialize();
}
}
else {
if (threadIdx.x == 0) {
unsigned int block = atomicCAS(semaphore, 0u, 1u);
if (block == 0) {
// We're the first block to run, initialize the output value
atomicExchCustom(output, reducer.initialize());
__threadfence();
atomicExch(semaphore, 2u);
}
else {
// Wait for the first block to initialize the output value.
// Use atomicCAS here to ensure that the reads aren't cached
unsigned int val;
do {
val = atomicCAS(semaphore, 2u, 2u);
}
while (val < 2u);
}
}
}
__syncthreads();
eigen_assert(gridDim.x == 1 || *semaphore >= 2u);
typename Self::CoeffReturnType accum = reducer.initialize();
Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
for (Index i = 0; i < max_iter; i+=BlockSize) {
const Index index = first_index + i;
eigen_assert(index < num_coeffs);
typename Self::CoeffReturnType val = input.m_impl.coeff(index);
reducer.reduce(val, &accum);
}
#pragma unroll
for (int offset = warpSize/2; offset > 0; offset /= 2) {
#if defined(EIGEN_HIPCC)
// Use std::is_floating_point to determine the type of the accumulator.
// This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambiguous" error
// and list the float and int versions of __shfl_down as the candidate functions.
if (std::is_floating_point<typename Self::CoeffReturnType>::value) {
reducer.reduce(__shfl_down(static_cast<float>(accum), offset, warpSize), &accum);
} else {
reducer.reduce(__shfl_down(static_cast<int>(accum), offset, warpSize), &accum);
}
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
#else
reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
#endif
}
if ((threadIdx.x & (warpSize - 1)) == 0) {
atomicReduce(output, accum, reducer);
}
if (gridDim.x > 1 && threadIdx.x == 0) {
// Let the last block reset the semaphore
atomicInc(semaphore, gridDim.x + 1);
#if defined(EIGEN_HIPCC)
__threadfence_system();
#endif
}
#else // EIGEN_CUDA_ARCH >= 300
gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif // EIGEN_CUDA_ARCH >= 300
}
#ifdef EIGEN_HAS_GPU_FP16
template <typename Self,
typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
packet_traits<Eigen::half>::type* scratch) {
eigen_assert(blockDim.x == 1);
eigen_assert(gridDim.x == 1);
typedef packet_traits<Eigen::half>::type packet_type;
Index packet_remainder =
num_coeffs % Index(unpacket_traits<packet_type>::size);
if (packet_remainder != 0) {
half2* h2scratch = reinterpret_cast<half2*>(scratch);
for (Index i = num_coeffs - packet_remainder; i + 2 <= num_coeffs; i += 2) {
*h2scratch =
__halves2half2(input.m_impl.coeff(i), input.m_impl.coeff(i + 1));
h2scratch++;
}
if ((num_coeffs & 1) != 0) {
half lastCoeff = input.m_impl.coeff(num_coeffs - 1);
*h2scratch = __halves2half2(lastCoeff, reducer.initialize());
}
} else {
*scratch = reducer.template initializePacket<packet_type>();
}
}
template <typename Self,
typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
const Index num_threads = blockDim.x * gridDim.x;
typedef typename packet_traits<Eigen::half>::type PacketType;
const Index num_packets =
num_coeffs / Index(unpacket_traits<PacketType>::size);
PacketType* p_output = reinterpret_cast<PacketType*>(output);
for (Index i = thread_id; i < num_packets; i += num_threads) {
p_output[i] = reducer.template initializePacket<PacketType>();
}
Index packet_remainder =
num_coeffs % Index(unpacket_traits<PacketType>::size);
if (thread_id < packet_remainder) {
output[num_coeffs - packet_remainder + thread_id] = reducer.initialize();
}
}
template <int BlockSize, int NumPerThread, typename Self,
typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
half* output, packet_traits<Eigen::half>::type* scratch) {
typedef typename packet_traits<Eigen::half>::type PacketType;
const int packet_width = unpacket_traits<PacketType>::size;
eigen_assert(NumPerThread % packet_width == 0);
const Index first_index =
blockIdx.x * BlockSize * NumPerThread + packet_width * threadIdx.x;
// Initialize the output value if it wasn't initialized by the ReductionInitKernel
if (gridDim.x == 1) {
if (first_index == 0) {
int rem = num_coeffs % packet_width;
if (rem != 0) {
half2* p_scratch = reinterpret_cast<half2*>(scratch);
*scratch = reducer.template initializePacket<PacketType>();
for (int i = 0; i < rem / 2; i++) {
*p_scratch = __halves2half2(
input.m_impl.coeff(num_coeffs - packet_width + 2 * i),
input.m_impl.coeff(num_coeffs - packet_width + 2 * i + 1));
p_scratch++;
}
if ((num_coeffs & 1) != 0) {
half last = input.m_impl.coeff(num_coeffs - 1);
*p_scratch = __halves2half2(last, reducer.initialize());
}
} else {
*scratch = reducer.template initializePacket<PacketType>();
}
}
__syncthreads();
}
PacketType accum = reducer.template initializePacket<PacketType>();
const Index max_iter =
numext::mini<Index>((num_coeffs - first_index) / packet_width,
NumPerThread * BlockSize / packet_width);
for (Index i = 0; i < max_iter; i += BlockSize) {
const Index index = first_index + packet_width * i;
eigen_assert(index + packet_width < num_coeffs);
PacketType val = input.m_impl.template packet<Unaligned>(index);
reducer.reducePacket(val, &accum);
}
#pragma unroll
for (int offset = warpSize/2; offset > 0; offset /= 2) {
#if defined(EIGEN_HIPCC)
PacketType r1;
half2* hr = reinterpret_cast<half2*>(&r1);
half2* hacc = reinterpret_cast<half2*>(&accum);
for (int i = 0; i < packet_width / 2; i++) {
// FIXME : remove this workaround once we have native half/half2 support for __shfl_down
union { int i; half2 h; } wka_in, wka_out;
wka_in.h = hacc[i];
wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
hr[i] = wka_out.h;
}
reducer.reducePacket(r1, &accum);
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
PacketType r1;
half2* hr = reinterpret_cast<half2*>(&r1);
half2* hacc = reinterpret_cast<half2*>(&accum);
for (int i = 0; i < packet_width / 2; i++) {
hr[i] = __shfl_down(hacc[i], offset, warpSize);
}
reducer.reducePacket(r1, &accum);
#else
PacketType r1;
half2* hr = reinterpret_cast<half2*>(&r1);
half2* hacc = reinterpret_cast<half2*>(&accum);
for (int i = 0; i < packet_width / 2; i++) {
hr[i] = __shfl_down_sync(0xFFFFFFFF, hacc[i], (unsigned)offset, warpSize);
}
reducer.reducePacket(r1, &accum);
#endif
}
if ((threadIdx.x & (warpSize - 1)) == 0) {
atomicReduce(scratch, accum, reducer);
}
__syncthreads();
half2* rv1 = reinterpret_cast<half2*>(scratch);
if (packet_width > 2) {
reducer.reducePacket(rv1[2], rv1);
reducer.reducePacket(rv1[3], rv1 + 1);
reducer.reducePacket(rv1[1], rv1);
}
if (gridDim.x == 1) {
if (first_index == 0) {
half tmp = __low2half(*rv1);
reducer.reduce(__high2half(*rv1), &tmp);
*output = tmp;
}
}
}
template <typename Op>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op reducer, half* output, packet_traits<Eigen::half>::type* scratch) {
eigen_assert(threadIdx.x == 1);
half2* pscratch = reinterpret_cast<half2*>(scratch);
half tmp = __float2half(0.f);
typedef packet_traits<Eigen::half>::type packet_type;
for (int i = 0; i < unpacket_traits<packet_type>::size; i += 2) {
reducer.reduce(__low2half(*pscratch), &tmp);
reducer.reduce(__high2half(*pscratch), &tmp);
pscratch++;
}
*output = tmp;
}
#endif // EIGEN_HAS_GPU_FP16
template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct FullReductionLauncher {
static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
gpu_assert(false && "Should only be called on doubles, floats and half floats");
}
};
// Specialization for float and double
template <typename Self, typename Op, typename OutputType, bool PacketAccess>
struct FullReductionLauncher<
Self, Op, OutputType, PacketAccess,
typename internal::enable_if<
internal::is_same<float, OutputType>::value ||
internal::is_same<double, OutputType>::value,
void>::type> {
static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
typedef typename Self::Index Index;
const int block_size = 256;
const int num_per_thread = 128;
const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
unsigned int* semaphore = NULL;
if (num_blocks > 1) {
semaphore = device.semaphore();
}
LAUNCH_GPU_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore);
}
};
#ifdef EIGEN_HAS_GPU_FP16
template <typename Self, typename Op>
struct FullReductionLauncher<Self, Op, Eigen::half, false> {
static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
gpu_assert(false && "Should not be called since there is no packet accessor");
}
};
template <typename Self, typename Op>
struct FullReductionLauncher<Self, Op, Eigen::half, true> {
static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
typedef typename Self::Index Index;
typedef typename packet_traits<Eigen::half>::type PacketType;
const int block_size = 256;
const int num_per_thread = 128;
const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
PacketType* scratch = static_cast<PacketType*>(device.scratchpad());
// half2* scratch = static_cast<half2*>(device.scratchpad());
if (num_blocks > 1) {
// We initialize the output and the scratchpad outside the reduction kernel when we can't be sure that there
// won't be race conditions between multiple thread blocks.
LAUNCH_GPU_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
1, 1, 0, device, reducer, self, num_coeffs, scratch);
}
LAUNCH_GPU_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch);
if (num_blocks > 1) {
LAUNCH_GPU_KERNEL((ReductionCleanupKernelHalfFloat<Op>),
1, 1, 0, device, reducer, output, scratch);
}
}
};
#endif // EIGEN_HAS_GPU_FP16
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
// Unfortunately nvidia doesn't support exotic types such as complex well,
// so we reduce the scope of the optimized version of the code to the simple cases
// of doubles, floats and half floats.
#ifdef EIGEN_HAS_GPU_FP16
static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
(internal::is_same<typename Self::CoeffReturnType, float>::value ||
internal::is_same<typename Self::CoeffReturnType, double>::value ||
(internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
#else // EIGEN_HAS_GPU_FP16
static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
(internal::is_same<typename Self::CoeffReturnType, float>::value ||
internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif // EIGEN_HAS_GPU_FP16
template <typename OutputType>
static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
const Index num_coeffs = array_prod(self.m_impl.dimensions());
// Don't crash when we're called with an input tensor of size 0.
if (num_coeffs == 0) {
return;
}
FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
}
};
template <int NumPerThread, typename Self,
typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
typename Self::CoeffReturnType* output) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
typedef typename Self::CoeffReturnType Type;
eigen_assert(blockDim.y == 1);
eigen_assert(blockDim.z == 1);
eigen_assert(gridDim.y == 1);
eigen_assert(gridDim.z == 1);
const int unroll_times = 16;
eigen_assert(NumPerThread % unroll_times == 0);
const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;
const Index num_threads = blockDim.x * gridDim.x;
const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
// Initialize the output values if they weren't initialized by the ReductionInitKernel
if (gridDim.x == 1) {
for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
output[i] = reducer.initialize();
}
__syncthreads();
}
for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
const Index row = i / input_col_blocks;
if (row < num_preserved_coeffs) {
const Index col_block = i % input_col_blocks;
const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;
Type reduced_val = reducer.initialize();
for (Index j = 0; j < NumPerThread; j += unroll_times) {
const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
if (last_col >= num_coeffs_to_reduce) {
for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
reducer.reduce(val, &reduced_val);
}
break;
} else {
// Faster version of the loop with no branches after unrolling.
#pragma unroll
for (int k = 0; k < unroll_times; ++k) {
const Index col = col_begin + blockDim.x * (j + k);
reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
}
}
}
#pragma unroll
for (int offset = warpSize/2; offset > 0; offset /= 2) {
#if defined(EIGEN_HIPCC)
// Use std::is_floating_point to determine the type of reduced_val.
// This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambiguous" error
// and list the float and int versions of __shfl_down as the candidate functions.
if (std::is_floating_point<Type>::value) {
reducer.reduce(__shfl_down(static_cast<float>(reduced_val), offset), &reduced_val);
} else {
reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val);
}
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
#else
reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val);
#endif
}
if ((threadIdx.x & (warpSize - 1)) == 0) {
atomicReduce(&(output[row]), reduced_val, reducer);
}
}
}
#else // EIGEN_CUDA_ARCH >= 300
gpu_assert(0 && "Shouldn't be called on unsupported device");
#endif // EIGEN_CUDA_ARCH >= 300
}
#ifdef EIGEN_HAS_GPU_FP16
template <int NumPerThread, typename Self,
typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
half* output) {
eigen_assert(blockDim.y == 1);
eigen_assert(blockDim.z == 1);
eigen_assert(gridDim.y == 1);
eigen_assert(gridDim.z == 1);
typedef typename packet_traits<Eigen::half>::type PacketType;
const int packet_width = unpacket_traits<PacketType>::size;
const int unroll_times = 16 / packet_width;
eigen_assert(NumPerThread % unroll_times == 0);
eigen_assert(unroll_times % 2 == 0);
const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);
const Index num_threads = blockDim.x * gridDim.x;
const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
// Initialize the output values if they weren't initialized by the ReductionInitKernel
if (gridDim.x == 1) {
Index i = packet_width * thread_id;
for (; i + packet_width <= num_preserved_coeffs;
i += packet_width * num_threads) {
PacketType* poutput = reinterpret_cast<PacketType*>(output + i);
*poutput = reducer.template initializePacket<PacketType>();
}
if (i < num_preserved_coeffs) {
output[i] = reducer.initialize();
}
__syncthreads();
}
for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
const Index row = 2 * (i / input_col_blocks); // everybody takes 2 rows
if (row + 1 < num_preserved_coeffs) {
const Index col_block = i % input_col_blocks;
const Index col_begin =
packet_width * (col_block * blockDim.x * NumPerThread + threadIdx.x);
PacketType reduced_val1 = reducer.template initializePacket<PacketType>();
PacketType reduced_val2 = reducer.template initializePacket<PacketType>();
for (Index j = 0; j < NumPerThread; j += unroll_times) {
const Index last_col =
col_begin + blockDim.x * (j + unroll_times - 1) * packet_width;
if (last_col >= num_coeffs_to_reduce) {
Index col = col_begin + blockDim.x * j;
for (; col + packet_width <= num_coeffs_to_reduce;
col += blockDim.x) {
const PacketType val1 = input.m_impl.template packet<Unaligned>(
row * num_coeffs_to_reduce + col);
reducer.reducePacket(val1, &reduced_val1);
const PacketType val2 = input.m_impl.template packet<Unaligned>(
(row + 1) * num_coeffs_to_reduce + col);
reducer.reducePacket(val2, &reduced_val2);
}
if (col < num_coeffs_to_reduce) {
PacketType r1 = reducer.template initializePacket<PacketType>();
PacketType r2 = reducer.template initializePacket<PacketType>();
half2* hr1 = reinterpret_cast<half2*>(&r1);
half2* hr2 = reinterpret_cast<half2*>(&r2);
while (col + 1 < num_coeffs_to_reduce) {
*hr1 = __halves2half2(
input.m_impl.coeff(row * num_coeffs_to_reduce + col),
input.m_impl.coeff(row * num_coeffs_to_reduce + col + 1));
*hr2 = __halves2half2(
input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col),
input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col +
1));
hr1++;
hr2++;
col += 2;
}
if (col < num_coeffs_to_reduce) {
// Peel;
const half last1 =
input.m_impl.coeff(row * num_coeffs_to_reduce + col);
*hr1 = __halves2half2(last1, reducer.initialize());
const half last2 =
input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col);
*hr2 = __halves2half2(last2, reducer.initialize());
}
reducer.reducePacket(r1, &reduced_val1);
reducer.reducePacket(r2, &reduced_val2);
}
break;
} else {
// Faster version of the loop with no branches after unrolling.
#pragma unroll
for (int k = 0; k < unroll_times; ++k) {
const Index col = col_begin + blockDim.x * (j + k) * packet_width;
reducer.reducePacket(input.m_impl.template packet<Unaligned>(
row * num_coeffs_to_reduce + col),
&reduced_val1);
reducer.reducePacket(input.m_impl.template packet<Unaligned>(
(row + 1) * num_coeffs_to_reduce + col),
&reduced_val2);
}
}
}
#pragma unroll
for (int offset = warpSize/2; offset > 0; offset /= 2) {
#if defined(EIGEN_HIPCC)
PacketType r1;
PacketType r2;
half2* hr1 = reinterpret_cast<half2*>(&r1);
half2* hr2 = reinterpret_cast<half2*>(&r2);
half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
for (int i = 0; i < packet_width / 2; i++) {
// FIXME : remove this workaround once we have native half/half2 support for __shfl_down
union { int i; half2 h; } wka_in1, wka_out1;
wka_in1.h = rv1[i];
wka_out1.i = __shfl_down(wka_in1.i, offset, warpSize);
hr1[i] = wka_out1.h;
union { int i; half2 h; } wka_in2, wka_out2;
wka_in2.h = rv2[i];
wka_out2.i = __shfl_down(wka_in2.i, offset, warpSize);
hr2[i] = wka_out2.h;
}
reducer.reducePacket(r1, &reduced_val1);
reducer.reducePacket(r2, &reduced_val2);
#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
PacketType r1;
PacketType r2;
half2* hr1 = reinterpret_cast<half2*>(&r1);
half2* hr2 = reinterpret_cast<half2*>(&r2);
half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
for (int i = 0; i < packet_width / 2; i++) {
hr1[i] = __shfl_down(rv1[i], offset, warpSize);
hr2[i] = __shfl_down(rv2[i], offset, warpSize);
}
reducer.reducePacket(r1, &reduced_val1);
reducer.reducePacket(r2, &reduced_val2);
#else
PacketType r1;
PacketType r2;
half2* hr1 = reinterpret_cast<half2*>(&r1);
half2* hr2 = reinterpret_cast<half2*>(&r2);
half2* rr1 = reinterpret_cast<half2*>(&reduced_val1);
half2* rr2 = reinterpret_cast<half2*>(&reduced_val2);
for (int i = 0; i < packet_width / 2; i++) {
hr1[i] =
__shfl_down_sync(0xFFFFFFFF, rr1[i], (unsigned)offset, warpSize);
hr2[i] =
__shfl_down_sync(0xFFFFFFFF, rr2[i], (unsigned)offset, warpSize);
}
reducer.reducePacket(r1, &reduced_val1);
reducer.reducePacket(r2, &reduced_val2);
#endif
}
half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
half2 val;
if (packet_width > 2) {
reducer.reducePacket(rv1[2], rv1);
reducer.reducePacket(rv1[3], rv1 + 1);
reducer.reducePacket(rv1[1], rv1);
reducer.reducePacket(rv2[2], rv2);
reducer.reducePacket(rv2[3], rv2 + 1);
reducer.reducePacket(rv2[1], rv2);
}
half val1 = __low2half(*rv1);
reducer.reduce(__high2half(*rv1), &val1);
half val2 = __low2half(*rv2);
reducer.reduce(__high2half(*rv2), &val2);
val = __halves2half2(val1, val2);
if ((threadIdx.x & (warpSize - 1)) == 0) {
half* loc = output + row;
atomicReduce((half2*)loc, val, reducer);
}
}
}
}
#endif // EIGEN_HAS_GPU_FP16
template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
struct InnerReductionLauncher {
static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
gpu_assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
return true;
}
};
// Specialization for float and double
template <typename Self, typename Op, typename OutputType, bool PacketAccess>
struct InnerReductionLauncher<
Self, Op, OutputType, PacketAccess,
typename internal::enable_if<
internal::is_same<float, OutputType>::value ||
internal::is_same<double, OutputType>::value,
void>::type> {
static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
typedef typename Self::Index Index;
const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
const int block_size = 256;
const int num_per_thread = 128;
const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
const int max_blocks = device.getNumGpuMultiProcessors() *
device.maxGpuThreadsPerMultiProcessor() / block_size;
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
if (num_blocks > 1) {
// We initialize the outputs outside the reduction kernel when we can't be sure that there
// won't be race conditions between multiple thread blocks.
const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
const int max_blocks = device.getNumGpuMultiProcessors() *
device.maxGpuThreadsPerMultiProcessor() / 1024;
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
LAUNCH_GPU_KERNEL((ReductionInitKernel<OutputType, Index>),
num_blocks, 1024, 0, device, reducer.initialize(),
num_preserved_vals, output);
}
LAUNCH_GPU_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>),
num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
return false;
}
};
#ifdef EIGEN_HAS_GPU_FP16
template <typename Self, typename Op>
struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
gpu_assert(false && "Should not be called since there is no packet accessor");
return true;
}
};
template <typename Self, typename Op>
struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
typedef typename Self::Index Index;
if (num_preserved_vals % 2 != 0) {
// Not supported yet, revert to the slower code path
return true;
}
const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
const int block_size = /*256*/128;
const int num_per_thread = /*128*/64;
const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
const int max_blocks = device.getNumGpuMultiProcessors() *
device.maxGpuThreadsPerMultiProcessor() / block_size;
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
if (num_blocks > 1) {
// We initialize the outputs outside the reduction kernel when we can't be sure that there
// won't be race conditions between multiple thread blocks.
LAUNCH_GPU_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
1, 1, 0, device, reducer, self, num_preserved_vals, output);
}
LAUNCH_GPU_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
return false;
}
};
#endif // EIGEN_HAS_GPU_FP16
template <typename Self, typename Op>
struct InnerReducer<Self, Op, GpuDevice> {
// Unfortunately nvidia doesn't support exotic types such as complex well,
// so we reduce the scope of the optimized version of the code to the simple cases
// of floats, doubles and half floats.
#ifdef EIGEN_HAS_GPU_FP16
static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
(internal::is_same<typename Self::CoeffReturnType, float>::value ||
internal::is_same<typename Self::CoeffReturnType, double>::value ||
(internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
#else // EIGEN_HAS_GPU_FP16
static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
(internal::is_same<typename Self::CoeffReturnType, float>::value ||
internal::is_same<typename Self::CoeffReturnType, double>::value);
#endif // EIGEN_HAS_GPU_FP16
template <typename OutputType>
static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
const Index num_coeffs = array_prod(self.m_impl.dimensions());
// Don't crash when we're called with an input tensor of size 0.
if (num_coeffs == 0) {
return true;
}
// It's faster to use the usual code.
if (num_coeffs_to_reduce <= 128) {
return true;
}
return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
}
};
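// Usage sketch (illustrative, not part of this header): with EIGEN_USE_GPU, reducing the
// contiguous (inner, ColMajor) dimension of a device tensor can dispatch to the InnerReducer
// specialization above, provided the reduced dimension is large enough (more than 128
// coefficients) and the scalar type is float, double or half. The buffer sizes below are
// arbitrary example values.
// \code
// #define EIGEN_USE_GPU
// #include <unsupported/Eigen/CXX11/Tensor>
//
// Eigen::GpuStreamDevice stream;
// Eigen::GpuDevice gpu_device(&stream);
// const int rows = 512, cols = 256;
// float* d_in  = static_cast<float*>(gpu_device.allocate(rows * cols * sizeof(float)));
// float* d_out = static_cast<float*>(gpu_device.allocate(cols * sizeof(float)));
// Eigen::TensorMap<Eigen::Tensor<float, 2> > in(d_in, rows, cols);
// Eigen::TensorMap<Eigen::Tensor<float, 1> > out(d_out, cols);
// Eigen::array<Eigen::Index, 1> reduce_dim;
// reduce_dim[0] = 0;                         // reduce the first (inner) dimension
// out.device(gpu_device) = in.sum(reduce_dim);
// \endcode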
template <int NumPerThread, typename Self,
typename Reducer, typename Index>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
typename Self::CoeffReturnType* output) {
const Index num_threads = blockDim.x * gridDim.x;
const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
// Initialize the output values if they weren't initialized by the ReductionInitKernel
if (gridDim.x == 1) {
for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
output[i] = reducer.initialize();
}
__syncthreads();
}
// Do the reduction.
const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);
for (Index i = thread_id; i < max_iter; i += num_threads) {
const Index input_col = i % num_preserved_coeffs;
const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
typename Self::CoeffReturnType reduced_val = reducer.initialize();
const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);
for (Index j = input_row; j < max_row; j++) {
typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);
reducer.reduce(val, &reduced_val);
}
atomicReduce(&(output[input_col]), reduced_val, reducer);
}
}
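// Worked example of the index mapping above (NumPerThread, the sizes and the iteration
// index are assumptions chosen purely to illustrate the arithmetic): with NumPerThread = 4,
// num_preserved_coeffs = 4 and num_coeffs_to_reduce = 10, max_iter = 4 * divup(10, 4) = 12.
// Iteration i = 6 gives input_col = 6 % 4 = 2 and input_row = (6 / 4) * 4 = 4, so that
// iteration reduces rows 4..7 of column 2 and atomically accumulates the partial result
// into output[2].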
template <typename Self, typename Op>
struct OuterReducer<Self, Op, GpuDevice> {
// Unfortunately nvidia doesn't handle exotic types such as complex well,
// so the optimized version of the code is restricted to the simple cases
// of floats and doubles.
static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
(internal::is_same<typename Self::CoeffReturnType, float>::value ||
internal::is_same<typename Self::CoeffReturnType, double>::value);
template <typename Device, typename OutputType>
static
#if !defined(EIGEN_HIPCC)
// FIXME : leaving this EIGEN_DEVICE_FUNC in, results in the following runtime error
// (in the cxx11_tensor_reduction_gpu test)
//
// terminate called after throwing an instance of 'std::runtime_error'
// what(): No device code available for function: _ZN5Eigen8internal20OuterReductionKernelIL...
//
// don't know why this happens (and why is it a runtime error instead of a compile time error)
//
// this will be fixed by HIP PR#457
EIGEN_DEVICE_FUNC
#endif
bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
gpu_assert(false && "Should only be called to reduce doubles or floats on a gpu device");
return true;
}
static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
typedef typename Self::Index Index;
// It's faster to use the usual code.
if (num_coeffs_to_reduce <= 32) {
return true;
}
const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
const int block_size = 256;
const int num_per_thread = 16;
const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
const int max_blocks = device.getNumGpuMultiProcessors() *
device.maxGpuThreadsPerMultiProcessor() / block_size;
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
if (num_blocks > 1) {
// We initialize the outputs outside the reduction kernel when we can't be sure that there
// won't be a race condition between multiple thread blocks.
const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
const int max_blocks = device.getNumGpuMultiProcessors() *
device.maxGpuThreadsPerMultiProcessor() / 1024;
const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
LAUNCH_GPU_KERNEL((ReductionInitKernel<float, Index>),
num_blocks, 1024, 0, device, reducer.initialize(),
num_preserved_vals, output);
}
LAUNCH_GPU_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>),
num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
return false;
}
};
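/* Worked example of the launch configuration computed in OuterReducer::run above. The
 * device characteristics (80 multiprocessors, 2048 threads per multiprocessor) and the
 * tensor sizes are assumptions used purely to illustrate the arithmetic.
 * \code
 * constexpr int divup_i(int x, int y) { return (x + y - 1) / y; }   // same rounding as divup
 *
 * constexpr int block_size     = 256;
 * constexpr int num_per_thread = 16;
 * constexpr int num_coeffs     = 4096 * 512;  // 4096 coeffs reduced into each of 512 outputs
 * constexpr int dyn_blocks     = divup_i(num_coeffs, block_size * num_per_thread);     // 512
 * constexpr int max_blocks     = 80 * 2048 / block_size;                               // 640
 * constexpr int num_blocks     = dyn_blocks < max_blocks ? dyn_blocks : max_blocks;    // 512
 * \endcode
 */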
#endif // defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H

View File

@@ -0,0 +1,582 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
/*****************************************************************
* TensorReductionSycl.h
*
* \brief:
 * This is the specialization of the reduction operation. A two-phase reduction approach
 * is used because the GPU provides no global synchronization of global memory across
 * different work-groups/thread blocks. To work around this, two kernels are needed:
 * in the first phase each work-group/thread block reduces its portion of the data locally
 * and writes its partial result to global memory. In the second phase (global reduction)
 * a single work-group/thread block reduces the intermediate partial results into one element.
* Here is an NVIDIA presentation explaining the optimized two phase reduction algorithm on GPU:
* https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf
*
*****************************************************************/
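/* A minimal sketch of the two-phase scheme described above, written as plain serial C++
 * (the group size and the input are assumptions, used only to illustrate the data flow
 * between the two phases):
 * \code
 * #include <algorithm>
 * #include <cstddef>
 * #include <numeric>
 * #include <vector>
 *
 * float two_phase_sum(const std::vector<float>& in, std::size_t group_size) {
 *   // Phase 1: each "work-group" reduces its own chunk and writes one partial result.
 *   std::vector<float> partials;
 *   for (std::size_t g = 0; g < in.size(); g += group_size) {
 *     const std::size_t end = std::min(in.size(), g + group_size);
 *     partials.push_back(std::accumulate(in.begin() + g, in.begin() + end, 0.0f));
 *   }
 *   // Phase 2: a single "work-group" reduces the partial results into the final value.
 *   return std::accumulate(partials.begin(), partials.end(), 0.0f);
 * }
 * \endcode
 */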
#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
namespace Eigen {
namespace TensorSycl {
namespace internal {
template <typename Op, typename CoeffReturnType, typename Index, bool Vectorizable>
struct OpDefiner {
typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, Vectorizable>::PacketReturnType PacketReturnType;
typedef Op type;
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Op &op) { return op; }
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
const Index &) {
return accumulator;
}
};
template <typename CoeffReturnType, typename Index>
struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, false> {
typedef Eigen::internal::SumReducer<CoeffReturnType> type;
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
return type();
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType finalise_op(const CoeffReturnType &accumulator,
const Index &scale) {
::Eigen::internal::scalar_quotient_op<CoeffReturnType> quotient_op;
return quotient_op(accumulator, CoeffReturnType(scale));
}
};
template <typename CoeffReturnType, typename Index>
struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, true> {
typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, true>::PacketReturnType PacketReturnType;
typedef Eigen::internal::SumReducer<CoeffReturnType> type;
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
return type();
}
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
const Index &scale) {
return ::Eigen::internal::pdiv(accumulator, ::Eigen::internal::pset1<PacketReturnType>(CoeffReturnType(scale)));
}
};
template <typename CoeffReturnType, typename OpType, typename InputAccessor, typename OutputAccessor, typename Index,
Index local_range>
struct SecondStepFullReducer {
typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
LocalAccessor;
typedef OpDefiner<OpType, CoeffReturnType, Index, true> OpDef;
typedef typename OpDef::type Op;
LocalAccessor scratch;
InputAccessor aI;
OutputAccessor outAcc;
Op op;
SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_)
: scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::get_op(op_)) {}
void operator()(cl::sycl::nd_item<1> itemID) {
// Our empirical research shows that the best performance is achieved when there
// is only one element per thread to reduce in the second step; the second-step
// reduction time is then almost negligible. Hence, in the second step of the
// reduction the input size is fixed to the local size, so there is exactly one
// element read per thread. The algorithm must be changed if the number of
// elements reduced per thread in the second step ever becomes greater than 1;
// otherwise the result will be wrong.
const Index localid = itemID.get_local_id(0);
auto aInPtr = aI.get_pointer() + localid;
auto aOutPtr = outAcc.get_pointer();
CoeffReturnType *scratchptr = scratch.get_pointer();
CoeffReturnType accumulator = *aInPtr;
scratchptr[localid] = op.finalize(accumulator);
for (Index offset = itemID.get_local_range(0) / 2; offset > 0; offset /= 2) {
itemID.barrier(cl::sycl::access::fence_space::local_space);
if (localid < offset) {
op.reduce(scratchptr[localid + offset], &accumulator);
scratchptr[localid] = op.finalize(accumulator);
}
}
if (localid == 0) *aOutPtr = op.finalize(accumulator);
}
};
// Full reduction, first phase. In this version vectorization is enabled and the
// reduction accepts any generic reducer op, e.g. max, min, sum, mean, iamax, iamin, etc.
template <typename Evaluator, typename OpType, typename Evaluator::Index local_range>
class FullReductionKernelFunctor {
public:
typedef typename Evaluator::CoeffReturnType CoeffReturnType;
typedef typename Evaluator::Index Index;
typedef OpDefiner<OpType, typename Evaluator::CoeffReturnType, Index,
(Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
OpDef;
typedef typename OpDef::type Op;
typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
typedef typename Evaluator::PacketReturnType PacketReturnType;
typedef
typename ::Eigen::internal::conditional<(Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess),
PacketReturnType, CoeffReturnType>::type OutType;
typedef cl::sycl::accessor<OutType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
LocalAccessor;
LocalAccessor scratch;
Evaluator evaluator;
EvaluatorPointerType final_output;
Index rng;
Op op;
FullReductionKernelFunctor(LocalAccessor scratch_, Evaluator evaluator_, EvaluatorPointerType final_output_,
Index rng_, OpType op_)
: scratch(scratch_), evaluator(evaluator_), final_output(final_output_), rng(rng_), op(OpDef::get_op(op_)) {}
void operator()(cl::sycl::nd_item<1> itemID) { compute_reduction(itemID); }
template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<Vect>::type compute_reduction(
const cl::sycl::nd_item<1> &itemID) {
auto output_ptr = final_output.get_pointer();
Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize;
Index globalid = itemID.get_global_id(0);
Index localid = itemID.get_local_id(0);
Index step = Evaluator::PacketSize * itemID.get_global_range(0);
Index start = Evaluator::PacketSize * globalid;
// vectorizable parts
PacketReturnType packetAccumulator = op.template initializePacket<PacketReturnType>();
for (Index i = start; i < VectorizedRange; i += step) {
op.template reducePacket<PacketReturnType>(evaluator.impl().template packet<Unaligned>(i), &packetAccumulator);
}
globalid += VectorizedRange;
// non vectorizable parts
for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
op.template reducePacket<PacketReturnType>(
::Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, Evaluator::PacketSize>::convert_to_packet_type(
evaluator.impl().coeff(i), op.initialize()),
&packetAccumulator);
}
scratch[localid] = packetAccumulator =
OpDef::finalise_op(op.template finalizePacket<PacketReturnType>(packetAccumulator), rng);
// reduction part: the local size is always a power of 2
EIGEN_UNROLL_LOOP
for (Index offset = local_range / 2; offset > 0; offset /= 2) {
itemID.barrier(cl::sycl::access::fence_space::local_space);
if (localid < offset) {
op.template reducePacket<PacketReturnType>(scratch[localid + offset], &packetAccumulator);
scratch[localid] = op.template finalizePacket<PacketReturnType>(packetAccumulator);
}
}
if (localid == 0) {
output_ptr[itemID.get_group(0)] =
op.finalizeBoth(op.initialize(), op.template finalizePacket<PacketReturnType>(packetAccumulator));
}
}
template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<!Vect>::type compute_reduction(
const cl::sycl::nd_item<1> &itemID) {
auto output_ptr = final_output.get_pointer();
Index globalid = itemID.get_global_id(0);
Index localid = itemID.get_local_id(0);
// scalar (non-vectorized) path
CoeffReturnType accumulator = op.initialize();
for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
op.reduce(evaluator.impl().coeff(i), &accumulator);
}
scratch[localid] = accumulator = OpDef::finalise_op(op.finalize(accumulator), rng);
// reduction part: the local size is always a power of 2
EIGEN_UNROLL_LOOP
for (Index offset = local_range / 2; offset > 0; offset /= 2) {
itemID.barrier(cl::sycl::access::fence_space::local_space);
if (localid < offset) {
op.reduce(scratch[localid + offset], &accumulator);
scratch[localid] = op.finalize(accumulator);
}
}
if (localid == 0) {
output_ptr[itemID.get_group(0)] = op.finalize(accumulator);
}
}
};
template <typename Evaluator, typename OpType>
class GenericNondeterministicReducer {
public:
typedef typename Evaluator::CoeffReturnType CoeffReturnType;
typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
typedef typename Evaluator::Index Index;
typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
typedef typename OpDef::type Op;
template <typename Scratch>
GenericNondeterministicReducer(Scratch, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType functor_,
Index range_, Index num_values_to_reduce_)
: evaluator(evaluator_),
output_accessor(output_accessor_),
functor(OpDef::get_op(functor_)),
range(range_),
num_values_to_reduce(num_values_to_reduce_) {}
void operator()(cl::sycl::nd_item<1> itemID) {
auto output_accessor_ptr = output_accessor.get_pointer();
/// const cast added as a naive solution to solve the qualifier drop error
Index globalid = static_cast<Index>(itemID.get_global_linear_id());
if (globalid < range) {
CoeffReturnType accum = functor.initialize();
Eigen::internal::GenericDimReducer<Evaluator::NumReducedDims - 1, Evaluator, Op>::reduce(
evaluator, evaluator.firstInput(globalid), functor, &accum);
output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce);
}
}
private:
Evaluator evaluator;
EvaluatorPointerType output_accessor;
Op functor;
Index range;
Index num_values_to_reduce;
};
enum class reduction_dim { inner_most, outer_most };
// default is the dimension-preserving (partial) reduction
template <typename Evaluator, typename OpType, typename PannelParameters, reduction_dim rt>
struct PartialReductionKernel {
typedef typename Evaluator::CoeffReturnType CoeffReturnType;
typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
typedef typename Evaluator::Index Index;
typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
typedef typename OpDef::type Op;
typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
ScratchAcc;
ScratchAcc scratch;
Evaluator evaluator;
EvaluatorPointerType output_accessor;
Op op;
const Index preserve_elements_num_groups;
const Index reduce_elements_num_groups;
const Index num_coeffs_to_preserve;
const Index num_coeffs_to_reduce;
PartialReductionKernel(ScratchAcc scratch_, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType op_,
const Index preserve_elements_num_groups_, const Index reduce_elements_num_groups_,
const Index num_coeffs_to_preserve_, const Index num_coeffs_to_reduce_)
: scratch(scratch_),
evaluator(evaluator_),
output_accessor(output_accessor_),
op(OpDef::get_op(op_)),
preserve_elements_num_groups(preserve_elements_num_groups_),
reduce_elements_num_groups(reduce_elements_num_groups_),
num_coeffs_to_preserve(num_coeffs_to_preserve_),
num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId,
CoeffReturnType &accumulator) {
if (globalPId >= num_coeffs_to_preserve) {
return;
}
Index global_offset = rt == reduction_dim::outer_most ? globalPId + (globalRId * num_coeffs_to_preserve)
: globalRId + (globalPId * num_coeffs_to_reduce);
Index localOffset = globalRId;
const Index per_thread_local_stride = PannelParameters::LocalThreadSizeR * reduce_elements_num_groups;
const Index per_thread_global_stride =
rt == reduction_dim::outer_most ? num_coeffs_to_preserve * per_thread_local_stride : per_thread_local_stride;
for (Index i = globalRId; i < num_coeffs_to_reduce; i += per_thread_local_stride) {
op.reduce(evaluator.impl().coeff(global_offset), &accumulator);
localOffset += per_thread_local_stride;
global_offset += per_thread_global_stride;
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
const Index linearLocalThreadId = itemID.get_local_id(0);
Index pLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId % PannelParameters::LocalThreadSizeP
: linearLocalThreadId / PannelParameters::LocalThreadSizeR;
Index rLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId / PannelParameters::LocalThreadSizeP
: linearLocalThreadId % PannelParameters::LocalThreadSizeR;
const Index pGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) % preserve_elements_num_groups
: itemID.get_group(0) / reduce_elements_num_groups;
const Index rGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) / preserve_elements_num_groups
: itemID.get_group(0) % reduce_elements_num_groups;
Index globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
const Index globalRId = rGroupId * PannelParameters::LocalThreadSizeR + rLocalThreadId;
auto scratchPtr = scratch.get_pointer().get();
auto outPtr =
output_accessor.get_pointer() + (reduce_elements_num_groups > 1 ? rGroupId * num_coeffs_to_preserve : 0);
CoeffReturnType accumulator = op.initialize();
element_wise_reduce(globalRId, globalPId, accumulator);
accumulator = OpDef::finalise_op(op.finalize(accumulator), num_coeffs_to_reduce);
scratchPtr[pLocalThreadId + rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)] =
accumulator;
if (rt == reduction_dim::inner_most) {
pLocalThreadId = linearLocalThreadId % PannelParameters::LocalThreadSizeP;
rLocalThreadId = linearLocalThreadId / PannelParameters::LocalThreadSizeP;
globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
}
/* Apply the reduction operation between the current local
* id and the one on the other half of the vector. */
auto out_scratch_ptr =
scratchPtr + (pLocalThreadId + (rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)));
itemID.barrier(cl::sycl::access::fence_space::local_space);
if (rt == reduction_dim::inner_most) {
accumulator = *out_scratch_ptr;
}
// LocalThreadSizeR is always a power of 2
EIGEN_UNROLL_LOOP
for (Index offset = PannelParameters::LocalThreadSizeR >> 1; offset > 0; offset >>= 1) {
if (rLocalThreadId < offset) {
op.reduce(out_scratch_ptr[(PannelParameters::LocalThreadSizeP + PannelParameters::BC) * offset], &accumulator);
// The result has already been divided for the mean reducer in the
// previous reduction, so there is no need to divide again.
*out_scratch_ptr = op.finalize(accumulator);
}
/* All threads collectively read from global memory into local.
* The barrier ensures all threads' IO is resolved before
* execution continues (strictly speaking, all threads within
* a single work-group - there is no co-ordination between
* work-groups, only work-items). */
itemID.barrier(cl::sycl::access::fence_space::local_space);
}
if (rLocalThreadId == 0 && (globalPId < num_coeffs_to_preserve)) {
outPtr[globalPId] = op.finalize(accumulator);
}
}
};
template <typename OutScalar, typename Index, typename InputAccessor, typename OutputAccessor, typename OpType>
struct SecondStepPartialReduction {
typedef OpDefiner<OpType, OutScalar, Index, false> OpDef;
typedef typename OpDef::type Op;
typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
ScratchAccessor;
InputAccessor input_accessor;
OutputAccessor output_accessor;
Op op;
const Index num_coeffs_to_preserve;
const Index num_coeffs_to_reduce;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SecondStepPartialReduction(ScratchAccessor, InputAccessor input_accessor_,
OutputAccessor output_accessor_, OpType op_,
const Index num_coeffs_to_preserve_,
const Index num_coeffs_to_reduce_)
: input_accessor(input_accessor_),
output_accessor(output_accessor_),
op(OpDef::get_op(op_)),
num_coeffs_to_preserve(num_coeffs_to_preserve_),
num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
const Index globalId = itemID.get_global_id(0);
if (globalId >= num_coeffs_to_preserve) return;
auto in_ptr = input_accessor.get_pointer() + globalId;
OutScalar accumulator = op.initialize();
// num_coeffs_to_reduce is not bigger than 256
for (Index i = 0; i < num_coeffs_to_reduce; i++) {
op.reduce(*in_ptr, &accumulator);
in_ptr += num_coeffs_to_preserve;
}
output_accessor.get_pointer()[globalId] = op.finalize(accumulator);
}
}; // end SecondStepPartialReduction
template <typename Index, Index LTP, Index LTR, bool BC_>
struct ReductionPannel {
static EIGEN_CONSTEXPR Index LocalThreadSizeP = LTP;
static EIGEN_CONSTEXPR Index LocalThreadSizeR = LTR;
static EIGEN_CONSTEXPR bool BC = BC_;
};
template <typename Self, typename Op, TensorSycl::internal::reduction_dim rt>
struct PartialReducerLauncher {
typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
typedef typename Self::CoeffReturnType CoeffReturnType;
typedef typename Self::Storage Storage;
typedef typename Self::Index Index;
typedef ReductionPannel<typename Self::Index, EIGEN_SYCL_LOCAL_THREAD_DIM0, EIGEN_SYCL_LOCAL_THREAD_DIM1, true>
PannelParameters;
typedef PartialReductionKernel<Self, Op, PannelParameters, rt> SyclReducerKerneType;
static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType output,
Index num_coeffs_to_reduce, Index num_coeffs_to_preserve) {
Index roundUpP = roundUp(num_coeffs_to_preserve, PannelParameters::LocalThreadSizeP);
// getPowerOfTwo makes sure the local range is a power of 2 and
// <= maxSyclThreadPerBlock; this helps us avoid an extra check in
// the kernel.
static_assert(!((PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR) &
(PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR - 1)),
"The Local thread size must be a power of 2 for the reduction "
"operation");
EIGEN_CONSTEXPR Index localRange = PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR;
// In this step we force the code to perform at most a 2-step reduction.
// Our empirical research shows that if each thread reduces at least 64
// elements individually, we get better performance; however, this can change
// on different platforms. It also shows that for the inner_most dim reducer
// it is better to have 8 groups in the reduce dimension for sizes > 1024
// to achieve the best performance.
const Index reductionPerThread = 64;
Index cu = dev.getPowerOfTwo(dev.getNumSyclMultiProcessors(), true);
const Index pNumGroups = roundUpP / PannelParameters::LocalThreadSizeP;
Index rGroups = (cu + pNumGroups - 1) / pNumGroups;
const Index rNumGroups = num_coeffs_to_reduce > reductionPerThread * localRange ? std::min(rGroups, localRange) : 1;
const Index globalRange = pNumGroups * rNumGroups * localRange;
EIGEN_CONSTEXPR Index scratchSize =
PannelParameters::LocalThreadSizeR * (PannelParameters::LocalThreadSizeP + PannelParameters::BC);
auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
if (rNumGroups > 1) {
CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>(
dev.allocate_temp(num_coeffs_to_preserve * rNumGroups * sizeof(CoeffReturnType)));
EvaluatorPointerType temp_accessor = dev.get(temp_pointer);
dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
self, temp_accessor, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
num_coeffs_to_reduce);
typedef SecondStepPartialReduction<CoeffReturnType, Index, EvaluatorPointerType, EvaluatorPointerType, Op>
SecondStepPartialReductionKernel;
dev.template unary_kernel_launcher<CoeffReturnType, SecondStepPartialReductionKernel>(
temp_accessor, output,
cl::sycl::nd_range<1>(cl::sycl::range<1>(pNumGroups * localRange), cl::sycl::range<1>(localRange)), Index(1),
reducer, num_coeffs_to_preserve, rNumGroups);
self.device().deallocate_temp(temp_pointer);
} else {
dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
self, output, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
num_coeffs_to_reduce);
}
return false;
}
};
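/* Worked example of the group-count arithmetic above, assuming the default 16x16 work-group
 * shape (EIGEN_SYCL_LOCAL_THREAD_DIM0 = EIGEN_SYCL_LOCAL_THREAD_DIM1 = 16) and a device whose
 * compute-unit count rounds up to 64; the tensor sizes are likewise illustrative assumptions:
 *   localRange = 16 * 16 = 256
 *   num_coeffs_to_preserve = 1000  ->  roundUpP = 1008,  pNumGroups = 1008 / 16 = 63
 *   cu = 64                        ->  rGroups = (64 + 63 - 1) / 63 = 2
 *   num_coeffs_to_reduce = 100000 > 64 * 256 = 16384  ->  rNumGroups = min(2, 256) = 2
 *   globalRange = 63 * 2 * 256 = 32256 work-items, launched as a single 1D nd_range.
 */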
} // namespace internal
} // namespace TensorSycl
namespace internal {
template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, Eigen::SyclDevice, Vectorizable> {
typedef typename Self::CoeffReturnType CoeffReturnType;
typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
static EIGEN_CONSTEXPR int PacketSize = Self::PacketAccess ? Self::PacketSize : 1;
static void run(const Self &self, Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType data) {
typedef typename conditional<Self::PacketAccess, typename Self::PacketReturnType, CoeffReturnType>::type OutType;
static_assert(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
"The Local thread size must be a power of 2 for the reduction "
"operation");
EIGEN_CONSTEXPR Index local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;
typename Self::Index inputSize = self.impl().dimensions().TotalSize();
// In this step we force the code to perform at most a 2-step reduction.
// Our empirical research shows that if each thread reduces at least 512
// elements individually, we get better performance.
const Index reductionPerThread = 2048;
Index reductionGroup = dev.getPowerOfTwo(
(inputSize + (reductionPerThread * local_range - 1)) / (reductionPerThread * local_range), true);
const Index num_work_group = std::min(reductionGroup, local_range);
const Index global_range = num_work_group * local_range;
auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
typedef TensorSycl::internal::FullReductionKernelFunctor<Self, Op, local_range> reduction_kernel_t;
if (num_work_group > 1) {
CoeffReturnType *temp_pointer =
static_cast<CoeffReturnType *>(dev.allocate_temp(num_work_group * sizeof(CoeffReturnType)));
typename Self::EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, tmp_global_accessor, thread_range,
local_range, inputSize, reducer);
typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
EvaluatorPointerType, Index, local_range>
GenericRKernel;
dev.template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
tmp_global_accessor, data,
cl::sycl::nd_range<1>(cl::sycl::range<1>(num_work_group), cl::sycl::range<1>(num_work_group)), num_work_group,
reducer);
dev.deallocate_temp(temp_pointer);
} else {
dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, data, thread_range, local_range, inputSize,
reducer);
}
}
};
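/* A minimal usage sketch of the full reduction path above: summing every coefficient of a
 * tensor on a SyclDevice. The queue/device construction, the sizes and the buffer handling
 * are assumptions in the style of Eigen's SYCL tests, not requirements of this header.
 * \code
 * #define EIGEN_USE_SYCL
 * #include <unsupported/Eigen/CXX11/Tensor>
 *
 * void full_sum_on_sycl(const cl::sycl::device& d) {
 *   Eigen::QueueInterface queue_interface(d);
 *   Eigen::SyclDevice sycl_device(&queue_interface);
 *
 *   Eigen::Tensor<float, 2> in(1024, 1024);
 *   Eigen::Tensor<float, 0> out;
 *   in.setRandom();
 *
 *   float* d_in  = static_cast<float*>(sycl_device.allocate(in.size() * sizeof(float)));
 *   float* d_out = static_cast<float*>(sycl_device.allocate(sizeof(float)));
 *   sycl_device.memcpyHostToDevice(d_in, in.data(), in.size() * sizeof(float));
 *
 *   Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_in(d_in, 1024, 1024);
 *   Eigen::TensorMap<Eigen::Tensor<float, 0> > gpu_out(d_out);
 *   gpu_out.device(sycl_device) = gpu_in.sum();   // handled by FullReducer<..., SyclDevice, ...>
 *
 *   sycl_device.memcpyDeviceToHost(out.data(), d_out, sizeof(float));
 *   sycl_device.deallocate(d_in);
 *   sycl_device.deallocate(d_out);
 * }
 * \endcode
 */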
// vectorizable inner_most dim preserver
// col reduction
template <typename Self, typename Op>
struct OuterReducer<Self, Op, Eigen::SyclDevice> {
static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
typename Self::Index num_coeffs_to_preserve) {
return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::outer_most>::run(self, reducer, dev, output,
num_coeffs_to_reduce,
num_coeffs_to_preserve);
}
};
// row reduction
template <typename Self, typename Op>
struct InnerReducer<Self, Op, Eigen::SyclDevice> {
static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
typename Self::Index num_coeffs_to_preserve) {
return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::inner_most>::run(self, reducer, dev, output,
num_coeffs_to_reduce,
num_coeffs_to_preserve);
}
};
// ArgMax uses this kernel for partial reduction.
// TODO(@mehdi.goli) come up with a better kernel
// generic partial reduction
template <typename Self, typename Op>
struct GenericReducer<Self, Op, Eigen::SyclDevice> {
static EIGEN_CONSTEXPR bool HasOptimizedImplementation = false;
static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
typename Self::EvaluatorPointerType output, typename Self::Index num_values_to_reduce,
typename Self::Index num_coeffs_to_preserve) {
typename Self::Index range, GRange, tileSize;
dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);
dev.template unary_kernel_launcher<typename Self::CoeffReturnType,
TensorSycl::internal::GenericNondeterministicReducer<Self, Op>>(
self, output, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1),
reducer, range, (num_values_to_reduce != 0) ? num_values_to_reduce : static_cast<Index>(1));
return false;
}
};
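/* Illustrative sketch of an expression that reaches this generic kernel: a partial argmax
 * over one dimension. The sizes and the pre-existing device/buffer variables are assumptions
 * (set up the same way as in the earlier sketches), and the kernel itself is the
 * non-optimized fallback mentioned in the TODO above.
 * \code
 * // Assuming sycl_device, d_in (float, 64 x 32) and d_idx (Eigen::DenseIndex, 32) exist:
 * Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_in(d_in, 64, 32);
 * Eigen::TensorMap<Eigen::Tensor<Eigen::DenseIndex, 1> > gpu_idx(d_idx, 32);
 * gpu_idx.device(sycl_device) = gpu_in.argmax(0);   // index of the max along dimension 0
 * \endcode
 */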
} // namespace internal
} // namespace Eigen
#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP

View File

@@ -0,0 +1,454 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_REF_H
#define EIGEN_CXX11_TENSOR_TENSOR_REF_H
namespace Eigen {
namespace internal {
template <typename Dimensions, typename Scalar>
class TensorLazyBaseEvaluator {
public:
TensorLazyBaseEvaluator() : m_refcount(0) { }
virtual ~TensorLazyBaseEvaluator() { }
EIGEN_DEVICE_FUNC virtual const Dimensions& dimensions() const = 0;
EIGEN_DEVICE_FUNC virtual const Scalar* data() const = 0;
EIGEN_DEVICE_FUNC virtual const Scalar coeff(DenseIndex index) const = 0;
EIGEN_DEVICE_FUNC virtual Scalar& coeffRef(DenseIndex index) = 0;
void incrRefCount() { ++m_refcount; }
void decrRefCount() { --m_refcount; }
int refCount() const { return m_refcount; }
private:
// No copy, no assignment;
TensorLazyBaseEvaluator(const TensorLazyBaseEvaluator& other);
TensorLazyBaseEvaluator& operator = (const TensorLazyBaseEvaluator& other);
int m_refcount;
};
template <typename Dimensions, typename Expr, typename Device>
class TensorLazyEvaluatorReadOnly : public TensorLazyBaseEvaluator<Dimensions, typename TensorEvaluator<Expr, Device>::Scalar> {
public:
// typedef typename TensorEvaluator<Expr, Device>::Dimensions Dimensions;
typedef typename TensorEvaluator<Expr, Device>::Scalar Scalar;
typedef StorageMemory<Scalar, Device> Storage;
typedef typename Storage::Type EvaluatorPointerType;
typedef TensorEvaluator<Expr, Device> EvalType;
TensorLazyEvaluatorReadOnly(const Expr& expr, const Device& device) : m_impl(expr, device), m_dummy(Scalar(0)) {
m_dims = m_impl.dimensions();
m_impl.evalSubExprsIfNeeded(NULL);
}
virtual ~TensorLazyEvaluatorReadOnly() {
m_impl.cleanup();
}
EIGEN_DEVICE_FUNC virtual const Dimensions& dimensions() const {
return m_dims;
}
EIGEN_DEVICE_FUNC virtual const Scalar* data() const {
return m_impl.data();
}
EIGEN_DEVICE_FUNC virtual const Scalar coeff(DenseIndex index) const {
return m_impl.coeff(index);
}
EIGEN_DEVICE_FUNC virtual Scalar& coeffRef(DenseIndex /*index*/) {
eigen_assert(false && "can't reference the coefficient of an rvalue");
return m_dummy;
};
protected:
TensorEvaluator<Expr, Device> m_impl;
Dimensions m_dims;
Scalar m_dummy;
};
template <typename Dimensions, typename Expr, typename Device>
class TensorLazyEvaluatorWritable : public TensorLazyEvaluatorReadOnly<Dimensions, Expr, Device> {
public:
typedef TensorLazyEvaluatorReadOnly<Dimensions, Expr, Device> Base;
typedef typename Base::Scalar Scalar;
typedef StorageMemory<Scalar, Device> Storage;
typedef typename Storage::Type EvaluatorPointerType;
TensorLazyEvaluatorWritable(const Expr& expr, const Device& device) : Base(expr, device) {
}
virtual ~TensorLazyEvaluatorWritable() {
}
EIGEN_DEVICE_FUNC virtual Scalar& coeffRef(DenseIndex index) {
return this->m_impl.coeffRef(index);
}
};
template <typename Dimensions, typename Expr, typename Device>
class TensorLazyEvaluator : public internal::conditional<bool(internal::is_lvalue<Expr>::value),
TensorLazyEvaluatorWritable<Dimensions, Expr, Device>,
TensorLazyEvaluatorReadOnly<Dimensions, const Expr, Device> >::type {
public:
typedef typename internal::conditional<bool(internal::is_lvalue<Expr>::value),
TensorLazyEvaluatorWritable<Dimensions, Expr, Device>,
TensorLazyEvaluatorReadOnly<Dimensions, const Expr, Device> >::type Base;
typedef typename Base::Scalar Scalar;
TensorLazyEvaluator(const Expr& expr, const Device& device) : Base(expr, device) {
}
virtual ~TensorLazyEvaluator() {
}
};
} // namespace internal
/** \class TensorRef
* \ingroup CXX11_Tensor_Module
*
* \brief A reference to a tensor expression
* The expression will be evaluated lazily (as much as possible).
*
*/
template<typename PlainObjectType> class TensorRef : public TensorBase<TensorRef<PlainObjectType> >
{
public:
typedef TensorRef<PlainObjectType> Self;
typedef typename PlainObjectType::Base Base;
typedef typename Eigen::internal::nested<Self>::type Nested;
typedef typename internal::traits<PlainObjectType>::StorageKind StorageKind;
typedef typename internal::traits<PlainObjectType>::Index Index;
typedef typename internal::traits<PlainObjectType>::Scalar Scalar;
typedef typename NumTraits<Scalar>::Real RealScalar;
typedef typename Base::CoeffReturnType CoeffReturnType;
typedef Scalar* PointerType;
typedef PointerType PointerArgType;
static const Index NumIndices = PlainObjectType::NumIndices;
typedef typename PlainObjectType::Dimensions Dimensions;
enum {
IsAligned = false,
PacketAccess = false,
BlockAccess = false,
PreferBlockAccess = false,
Layout = PlainObjectType::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -----------===//
typedef internal::TensorBlockNotImplemented TensorBlock;
//===------------------------------------------------------------------===//
EIGEN_STRONG_INLINE TensorRef() : m_evaluator(NULL) {
}
template <typename Expression>
EIGEN_STRONG_INLINE TensorRef(const Expression& expr) : m_evaluator(new internal::TensorLazyEvaluator<Dimensions, Expression, DefaultDevice>(expr, DefaultDevice())) {
m_evaluator->incrRefCount();
}
template <typename Expression>
EIGEN_STRONG_INLINE TensorRef& operator = (const Expression& expr) {
unrefEvaluator();
m_evaluator = new internal::TensorLazyEvaluator<Dimensions, Expression, DefaultDevice>(expr, DefaultDevice());
m_evaluator->incrRefCount();
return *this;
}
~TensorRef() {
unrefEvaluator();
}
TensorRef(const TensorRef& other) : m_evaluator(other.m_evaluator) {
eigen_assert(m_evaluator->refCount() > 0);
m_evaluator->incrRefCount();
}
TensorRef& operator = (const TensorRef& other) {
if (this != &other) {
unrefEvaluator();
m_evaluator = other.m_evaluator;
eigen_assert(m_evaluator->refCount() > 0);
m_evaluator->incrRefCount();
}
return *this;
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Index rank() const { return m_evaluator->dimensions().size(); }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_evaluator->dimensions()[n]; }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_evaluator->dimensions(); }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Index size() const { return m_evaluator->dimensions().TotalSize(); }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar* data() const { return m_evaluator->data(); }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar operator()(Index index) const
{
return m_evaluator->coeff(index);
}
#if EIGEN_HAS_VARIADIC_TEMPLATES
template<typename... IndexTypes> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar operator()(Index firstIndex, IndexTypes... otherIndices) const
{
const std::size_t num_indices = (sizeof...(otherIndices) + 1);
const array<Index, num_indices> indices{{firstIndex, otherIndices...}};
return coeff(indices);
}
template<typename... IndexTypes> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices)
{
const std::size_t num_indices = (sizeof...(otherIndices) + 1);
const array<Index, num_indices> indices{{firstIndex, otherIndices...}};
return coeffRef(indices);
}
#else
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1) const
{
array<Index, 2> indices;
indices[0] = i0;
indices[1] = i1;
return coeff(indices);
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1, Index i2) const
{
array<Index, 3> indices;
indices[0] = i0;
indices[1] = i1;
indices[2] = i2;
return coeff(indices);
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1, Index i2, Index i3) const
{
array<Index, 4> indices;
indices[0] = i0;
indices[1] = i1;
indices[2] = i2;
indices[3] = i3;
return coeff(indices);
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const
{
array<Index, 5> indices;
indices[0] = i0;
indices[1] = i1;
indices[2] = i2;
indices[3] = i3;
indices[4] = i4;
return coeff(indices);
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1)
{
array<Index, 2> indices;
indices[0] = i0;
indices[1] = i1;
return coeffRef(indices);
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1, Index i2)
{
array<Index, 3> indices;
indices[0] = i0;
indices[1] = i1;
indices[2] = i2;
return coeffRef(indices);
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3)
{
array<Index, 4> indices;
indices[0] = i0;
indices[1] = i1;
indices[2] = i2;
indices[3] = i3;
return coeffRef(indices);
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& coeffRef(Index i0, Index i1, Index i2, Index i3, Index i4)
{
array<Index, 5> indices;
indices[0] = i0;
indices[1] = i1;
indices[2] = i2;
indices[3] = i3;
indices[4] = i4;
return coeffRef(indices);
}
#endif
template <std::size_t NumIndices> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar coeff(const array<Index, NumIndices>& indices) const
{
const Dimensions& dims = this->dimensions();
Index index = 0;
if (PlainObjectType::Options & RowMajor) {
index += indices[0];
for (size_t i = 1; i < NumIndices; ++i) {
index = index * dims[i] + indices[i];
}
} else {
index += indices[NumIndices-1];
for (int i = NumIndices-2; i >= 0; --i) {
index = index * dims[i] + indices[i];
}
}
return m_evaluator->coeff(index);
}
template <std::size_t NumIndices> EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& coeffRef(const array<Index, NumIndices>& indices)
{
const Dimensions& dims = this->dimensions();
Index index = 0;
if (PlainObjectType::Options & RowMajor) {
index += indices[0];
for (size_t i = 1; i < NumIndices; ++i) {
index = index * dims[i] + indices[i];
}
} else {
index += indices[NumIndices-1];
for (int i = NumIndices-2; i >= 0; --i) {
index = index * dims[i] + indices[i];
}
}
return m_evaluator->coeffRef(index);
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
{
return m_evaluator->coeff(index);
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
{
return m_evaluator->coeffRef(index);
}
private:
EIGEN_STRONG_INLINE void unrefEvaluator() {
if (m_evaluator) {
m_evaluator->decrRefCount();
if (m_evaluator->refCount() == 0) {
delete m_evaluator;
}
}
}
internal::TensorLazyBaseEvaluator<Dimensions, Scalar>* m_evaluator;
};
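/* A minimal usage sketch (the expression and sizes are illustrative assumptions): a TensorRef
 * lets individual coefficients of an expression be read without materializing the full result.
 * \code
 * #include <unsupported/Eigen/CXX11/Tensor>
 *
 * void tensor_ref_example() {
 *   Eigen::Tensor<float, 3> a(10, 20, 30), b(10, 20, 30);
 *   a.setRandom();
 *   b.setRandom();
 *
 *   // The element-wise expression is wrapped in a lazy evaluator, not materialized here.
 *   Eigen::TensorRef<Eigen::Tensor<float, 3> > ref = a * b + a.constant(1.0f);
 *
 *   // Each access evaluates just the requested coefficient of the expression.
 *   float v0 = ref(0, 1, 2);
 *   float v1 = ref.coeff(ref.size() - 1);
 * }
 * \endcode
 */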
// evaluator for rvalues
template<typename Derived, typename Device>
struct TensorEvaluator<const TensorRef<Derived>, Device>
{
typedef typename Derived::Index Index;
typedef typename Derived::Scalar Scalar;
typedef typename Derived::Scalar CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
typedef typename Derived::Dimensions Dimensions;
typedef StorageMemory<CoeffReturnType, Device> Storage;
typedef typename Storage::Type EvaluatorPointerType;
enum {
IsAligned = false,
PacketAccess = false,
BlockAccess = false,
PreferBlockAccess = false,
Layout = TensorRef<Derived>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlock;
//===--------------------------------------------------------------------===//
EIGEN_STRONG_INLINE TensorEvaluator(const TensorRef<Derived>& m, const Device&)
: m_ref(m)
{ }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_ref.dimensions(); }
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
return true;
}
EIGEN_STRONG_INLINE void cleanup() { }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
return m_ref.coeff(index);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
return m_ref.coeffRef(index);
}
EIGEN_DEVICE_FUNC const Scalar* data() const { return m_ref.data(); }
protected:
TensorRef<Derived> m_ref;
};
// evaluator for lvalues
template<typename Derived, typename Device>
struct TensorEvaluator<TensorRef<Derived>, Device> : public TensorEvaluator<const TensorRef<Derived>, Device>
{
typedef typename Derived::Index Index;
typedef typename Derived::Scalar Scalar;
typedef typename Derived::Scalar CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
typedef typename Derived::Dimensions Dimensions;
typedef TensorEvaluator<const TensorRef<Derived>, Device> Base;
enum {
IsAligned = false,
PacketAccess = false,
BlockAccess = false,
PreferBlockAccess = false,
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlock;
//===--------------------------------------------------------------------===//
EIGEN_STRONG_INLINE TensorEvaluator(TensorRef<Derived>& m, const Device& d) : Base(m, d)
{ }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
return this->m_ref.coeffRef(index);
}
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_REF_H

View File

@@ -0,0 +1,465 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Navdeep Jaitly <ndjaitly@google.com>
// Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H
#define EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H
namespace Eigen {
/** \class TensorReverse
* \ingroup CXX11_Tensor_Module
*
* \brief Tensor reverse elements class.
*
*/
namespace internal {
template<typename ReverseDimensions, typename XprType>
struct traits<TensorReverseOp<ReverseDimensions,
XprType> > : public traits<XprType>
{
typedef typename XprType::Scalar Scalar;
typedef traits<XprType> XprTraits;
typedef typename XprTraits::StorageKind StorageKind;
typedef typename XprTraits::Index Index;
typedef typename XprType::Nested Nested;
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int Layout = XprTraits::Layout;
typedef typename XprTraits::PointerType PointerType;
};
template<typename ReverseDimensions, typename XprType>
struct eval<TensorReverseOp<ReverseDimensions, XprType>, Eigen::Dense>
{
typedef const TensorReverseOp<ReverseDimensions, XprType>& type;
};
template<typename ReverseDimensions, typename XprType>
struct nested<TensorReverseOp<ReverseDimensions, XprType>, 1,
typename eval<TensorReverseOp<ReverseDimensions, XprType> >::type>
{
typedef TensorReverseOp<ReverseDimensions, XprType> type;
};
} // end namespace internal
template<typename ReverseDimensions, typename XprType>
class TensorReverseOp : public TensorBase<TensorReverseOp<ReverseDimensions,
XprType>, WriteAccessors>
{
public:
typedef TensorBase<TensorReverseOp<ReverseDimensions, XprType>, WriteAccessors>Base;
typedef typename Eigen::internal::traits<TensorReverseOp>::Scalar Scalar;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename Eigen::internal::nested<TensorReverseOp>::type Nested;
typedef typename Eigen::internal::traits<TensorReverseOp>::StorageKind
StorageKind;
typedef typename Eigen::internal::traits<TensorReverseOp>::Index Index;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReverseOp(
const XprType& expr, const ReverseDimensions& reverse_dims)
: m_xpr(expr), m_reverse_dims(reverse_dims) { }
EIGEN_DEVICE_FUNC
const ReverseDimensions& reverse() const { return m_reverse_dims; }
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename XprType::Nested>::type&
expression() const { return m_xpr; }
EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorReverseOp)
protected:
typename XprType::Nested m_xpr;
const ReverseDimensions m_reverse_dims;
};
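/* A minimal usage sketch (the sizes and the reversed dimension are illustrative assumptions):
 * \code
 * #include <unsupported/Eigen/CXX11/Tensor>
 *
 * void reverse_example() {
 *   Eigen::Tensor<float, 2> t(3, 4);
 *   t.setRandom();
 *
 *   Eigen::array<bool, 2> rev = {{true, false}};   // reverse dimension 0 only
 *   Eigen::Tensor<float, 2> r = t.reverse(rev);
 *
 *   // For every valid (i, j): r(i, j) == t(2 - i, j).
 * }
 * \endcode
 */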
// Eval as rvalue
template<typename ReverseDimensions, typename ArgType, typename Device>
struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device>
{
typedef TensorReverseOp<ReverseDimensions, ArgType> XprType;
typedef typename XprType::Index Index;
static const int NumDims = internal::array_size<ReverseDimensions>::value;
typedef DSizes<Index, NumDims> Dimensions;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
typedef StorageMemory<CoeffReturnType, Device> Storage;
typedef typename Storage::Type EvaluatorPointerType;
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = NumDims > 0,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
typedef internal::TensorIntDivisor<Index> IndexDivisor;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
ArgTensorBlock;
typedef typename internal::TensorMaterializedBlock<CoeffReturnType, NumDims,
Layout, Index>
TensorBlock;
//===--------------------------------------------------------------------===//
EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device),
m_reverse(op.reverse()),
m_device(device)
{
// Reversing a scalar isn't supported yet. It would be a no-op anyway.
EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
// Compute strides
m_dimensions = m_impl.dimensions();
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
m_strides[0] = 1;
for (int i = 1; i < NumDims; ++i) {
m_strides[i] = m_strides[i-1] * m_dimensions[i-1];
if (m_strides[i] > 0) m_fastStrides[i] = IndexDivisor(m_strides[i]);
}
} else {
m_strides[NumDims-1] = 1;
for (int i = NumDims - 2; i >= 0; --i) {
m_strides[i] = m_strides[i+1] * m_dimensions[i+1];
if (m_strides[i] > 0) m_fastStrides[i] = IndexDivisor(m_strides[i]);
}
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
m_impl.evalSubExprsIfNeeded(NULL);
return true;
}
#ifdef EIGEN_USE_THREADS
template <typename EvalSubExprsCallback>
EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
EvaluatorPointerType, EvalSubExprsCallback done) {
m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
}
#endif // EIGEN_USE_THREADS
EIGEN_STRONG_INLINE void cleanup() {
m_impl.cleanup();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index reverseIndex(
Index index) const {
eigen_assert(index < dimensions().TotalSize());
Index inputIndex = 0;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
EIGEN_UNROLL_LOOP
for (int i = NumDims - 1; i > 0; --i) {
Index idx = index / m_fastStrides[i];
index -= idx * m_strides[i];
if (m_reverse[i]) {
idx = m_dimensions[i] - idx - 1;
}
inputIndex += idx * m_strides[i] ;
}
if (m_reverse[0]) {
inputIndex += (m_dimensions[0] - index - 1);
} else {
inputIndex += index;
}
} else {
EIGEN_UNROLL_LOOP
for (int i = 0; i < NumDims - 1; ++i) {
Index idx = index / m_fastStrides[i];
index -= idx * m_strides[i];
if (m_reverse[i]) {
idx = m_dimensions[i] - idx - 1;
}
inputIndex += idx * m_strides[i] ;
}
if (m_reverse[NumDims-1]) {
inputIndex += (m_dimensions[NumDims-1] - index - 1);
} else {
inputIndex += index;
}
}
return inputIndex;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(
Index index) const {
return m_impl.coeff(reverseIndex(index));
}
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
PacketReturnType packet(Index index) const
{
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
// TODO(ndjaitly): write a better packing routine that uses
// local structure.
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type
values[PacketSize];
EIGEN_UNROLL_LOOP
for (int i = 0; i < PacketSize; ++i) {
values[i] = coeff(index+i);
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
return rslt;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
internal::TensorBlockResourceRequirements getResourceRequirements() const {
const size_t target_size = m_device.lastLevelCacheSize();
// Block evaluation reads underlying memory in reverse order, and default
// cost model does not properly catch this in bytes stored/loaded.
return internal::TensorBlockResourceRequirements::skewed<Scalar>(
target_size)
.addCostPerCoeff({0, 0, 24});
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
bool /*root_of_expr_ast*/ = false) const {
// TODO(ezhulenev): If underlying tensor expression supports and prefers
// block evaluation we must use it. Currently we use coeff and packet
// access into the underlying tensor expression.
// static const bool useBlockAccessForArgType =
// TensorEvaluator<ArgType, Device>::BlockAccess &&
// TensorEvaluator<ArgType, Device>::PreferBlockAccess;
static const bool isColMajor =
static_cast<int>(Layout) == static_cast<int>(ColMajor);
static const Index inner_dim_idx = isColMajor ? 0 : NumDims - 1;
const bool inner_dim_reversed = m_reverse[inner_dim_idx];
// Offset in the output block.
Index block_offset = 0;
// Offset in the input Tensor.
Index input_offset = reverseIndex(desc.offset());
// Initialize output block iterator state. Dimensions in this array are
// always in inner_most -> outer_most order (col major layout).
array<BlockIteratorState, NumDims> it;
for (int i = 0; i < NumDims; ++i) {
const int dim = isColMajor ? i : NumDims - 1 - i;
it[i].size = desc.dimension(dim);
it[i].count = 0;
it[i].reverse = m_reverse[dim];
it[i].block_stride =
i == 0 ? 1 : (it[i - 1].size * it[i - 1].block_stride);
it[i].block_span = it[i].block_stride * (it[i].size - 1);
it[i].input_stride = m_strides[dim];
it[i].input_span = it[i].input_stride * (it[i].size - 1);
if (it[i].reverse) {
it[i].input_stride = -1 * it[i].input_stride;
it[i].input_span = -1 * it[i].input_span;
}
}
// If multiple inner dimensions have the same reverse flag, check if we can
// merge them into a single virtual inner dimension.
int effective_inner_dim = 0;
for (int i = 1; i < NumDims; ++i) {
if (it[i].reverse != it[effective_inner_dim].reverse) break;
if (it[i].block_stride != it[effective_inner_dim].size) break;
if (it[i].block_stride != numext::abs(it[i].input_stride)) break;
it[i].size = it[effective_inner_dim].size * it[i].size;
it[i].block_stride = 1;
it[i].input_stride = (inner_dim_reversed ? -1 : 1);
it[i].block_span = it[i].block_stride * (it[i].size - 1);
it[i].input_span = it[i].input_stride * (it[i].size - 1);
effective_inner_dim = i;
}
eigen_assert(it[effective_inner_dim].block_stride == 1);
eigen_assert(it[effective_inner_dim].input_stride ==
(inner_dim_reversed ? -1 : 1));
const Index inner_dim_size = it[effective_inner_dim].size;
// Prepare storage for the materialized reverse result.
const typename TensorBlock::Storage block_storage =
TensorBlock::prepareStorage(desc, scratch);
CoeffReturnType* block_buffer = block_storage.data();
while (it[NumDims - 1].count < it[NumDims - 1].size) {
// Copy inner-most dimension data from reversed location in input.
Index dst = block_offset;
Index src = input_offset;
// NOTE(ezhulenev): Adding vectorized path with internal::preverse showed
// worse results in benchmarks than a simple coefficient loop.
if (inner_dim_reversed) {
for (Index i = 0; i < inner_dim_size; ++i) {
block_buffer[dst] = m_impl.coeff(src);
++dst;
--src;
}
} else {
for (Index i = 0; i < inner_dim_size; ++i) {
block_buffer[dst] = m_impl.coeff(src);
++dst;
++src;
}
}
// For the 1d tensor we need to generate only one inner-most dimension.
if ((NumDims - effective_inner_dim) == 1) break;
// Update offset.
for (Index i = effective_inner_dim + 1; i < NumDims; ++i) {
if (++it[i].count < it[i].size) {
block_offset += it[i].block_stride;
input_offset += it[i].input_stride;
break;
}
if (i != NumDims - 1) it[i].count = 0;
block_offset -= it[i].block_span;
input_offset -= it[i].input_span;
}
}
return block_storage.AsTensorMaterializedBlock();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
2 * TensorOpCost::MulCost<Index>() +
TensorOpCost::DivCost<Index>());
for (int i = 0; i < NumDims; ++i) {
if (m_reverse[i]) {
compute_cost += 2 * TensorOpCost::AddCost<Index>();
}
}
return m_impl.costPerCoeff(vectorized) +
TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize);
}
EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif
protected:
Dimensions m_dimensions;
array<Index, NumDims> m_strides;
array<IndexDivisor, NumDims> m_fastStrides;
TensorEvaluator<ArgType, Device> m_impl;
ReverseDimensions m_reverse;
const Device EIGEN_DEVICE_REF m_device;
private:
struct BlockIteratorState {
BlockIteratorState()
: size(0),
count(0),
reverse(false),
block_stride(0),
block_span(0),
input_stride(0),
input_span(0) {}
Index size;
Index count;
bool reverse;
Index block_stride;
Index block_span;
Index input_stride;
Index input_span;
};
};
// Eval as lvalue
template <typename ReverseDimensions, typename ArgType, typename Device>
struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
: public TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>,
Device> {
typedef TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>,
Device> Base;
typedef TensorReverseOp<ReverseDimensions, ArgType> XprType;
typedef typename XprType::Index Index;
static const int NumDims = internal::array_size<ReverseDimensions>::value;
typedef DSizes<Index, NumDims> Dimensions;
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: Base(op, device) {}
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlock;
//===--------------------------------------------------------------------===//
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const Dimensions& dimensions() const { return this->m_dimensions; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
return this->m_impl.coeffRef(this->reverseIndex(index));
}
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x) {
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
// This code is pilfered from TensorMorphing.h
EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize];
internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
EIGEN_UNROLL_LOOP
for (int i = 0; i < PacketSize; ++i) {
this->coeffRef(index+i) = values[i];
}
}
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H

View File

@@ -0,0 +1,528 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Igor Babuschkin <igor@babuschk.in>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
#define EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
namespace Eigen {
namespace internal {
template <typename Op, typename XprType>
struct traits<TensorScanOp<Op, XprType> >
: public traits<XprType> {
typedef typename XprType::Scalar Scalar;
typedef traits<XprType> XprTraits;
typedef typename XprTraits::StorageKind StorageKind;
typedef typename XprType::Nested Nested;
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int Layout = XprTraits::Layout;
typedef typename XprTraits::PointerType PointerType;
};
template<typename Op, typename XprType>
struct eval<TensorScanOp<Op, XprType>, Eigen::Dense>
{
typedef const TensorScanOp<Op, XprType>& type;
};
template<typename Op, typename XprType>
struct nested<TensorScanOp<Op, XprType>, 1,
typename eval<TensorScanOp<Op, XprType> >::type>
{
typedef TensorScanOp<Op, XprType> type;
};
} // end namespace internal
/** \class TensorScan
* \ingroup CXX11_Tensor_Module
*
* \brief Tensor scan class.
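*
* A minimal usage sketch (added for illustration, not part of the original
* header): the cumsum() / cumprod() methods of TensorBase return a
* TensorScanOp that is evaluated lazily.
*
* \code
* Eigen::Tensor<float, 2> t(2, 3);
* t.setValues({{1, 2, 3}, {4, 5, 6}});
* // Inclusive prefix sum along dimension 1: each row becomes its running sum.
* Eigen::Tensor<float, 2> s = t.cumsum(1);
* // s is {{1, 3, 6}, {4, 9, 15}}.
* \endcode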
*/
template <typename Op, typename XprType>
class TensorScanOp
: public TensorBase<TensorScanOp<Op, XprType>, ReadOnlyAccessors> {
public:
typedef typename Eigen::internal::traits<TensorScanOp>::Scalar Scalar;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename Eigen::internal::nested<TensorScanOp>::type Nested;
typedef typename Eigen::internal::traits<TensorScanOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorScanOp>::Index Index;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorScanOp(
const XprType& expr, const Index& axis, bool exclusive = false, const Op& op = Op())
: m_expr(expr), m_axis(axis), m_accumulator(op), m_exclusive(exclusive) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const Index axis() const { return m_axis; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const XprType& expression() const { return m_expr; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const Op accumulator() const { return m_accumulator; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
bool exclusive() const { return m_exclusive; }
protected:
typename XprType::Nested m_expr;
const Index m_axis;
const Op m_accumulator;
const bool m_exclusive;
};
namespace internal {
template <typename Self>
EIGEN_STRONG_INLINE void ReduceScalar(Self& self, Index offset,
typename Self::CoeffReturnType* data) {
// Compute the scan along the axis, starting at the given offset
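// Illustration (not from the original source): with a sum reducer over the
// values [1, 2, 3], the inclusive branch below writes [1, 3, 6], while the
// exclusive branch writes [0, 1, 3], i.e. the running total before each
// element, starting from the reducer's identity (0 for a sum).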
typename Self::CoeffReturnType accum = self.accumulator().initialize();
if (self.stride() == 1) {
if (self.exclusive()) {
for (Index curr = offset; curr < offset + self.size(); ++curr) {
data[curr] = self.accumulator().finalize(accum);
self.accumulator().reduce(self.inner().coeff(curr), &accum);
}
} else {
for (Index curr = offset; curr < offset + self.size(); ++curr) {
self.accumulator().reduce(self.inner().coeff(curr), &accum);
data[curr] = self.accumulator().finalize(accum);
}
}
} else {
if (self.exclusive()) {
for (Index idx3 = 0; idx3 < self.size(); idx3++) {
Index curr = offset + idx3 * self.stride();
data[curr] = self.accumulator().finalize(accum);
self.accumulator().reduce(self.inner().coeff(curr), &accum);
}
} else {
for (Index idx3 = 0; idx3 < self.size(); idx3++) {
Index curr = offset + idx3 * self.stride();
self.accumulator().reduce(self.inner().coeff(curr), &accum);
data[curr] = self.accumulator().finalize(accum);
}
}
}
}
template <typename Self>
EIGEN_STRONG_INLINE void ReducePacket(Self& self, Index offset,
typename Self::CoeffReturnType* data) {
using Scalar = typename Self::CoeffReturnType;
using Packet = typename Self::PacketReturnType;
// Compute the scan along the axis, starting at the calculated offset
Packet accum = self.accumulator().template initializePacket<Packet>();
if (self.stride() == 1) {
if (self.exclusive()) {
for (Index curr = offset; curr < offset + self.size(); ++curr) {
internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
}
} else {
for (Index curr = offset; curr < offset + self.size(); ++curr) {
self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
}
}
} else {
if (self.exclusive()) {
for (Index idx3 = 0; idx3 < self.size(); idx3++) {
const Index curr = offset + idx3 * self.stride();
internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
}
} else {
for (Index idx3 = 0; idx3 < self.size(); idx3++) {
const Index curr = offset + idx3 * self.stride();
self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
}
}
}
}
template <typename Self, bool Vectorize, bool Parallel>
struct ReduceBlock {
EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
typename Self::CoeffReturnType* data) {
for (Index idx2 = 0; idx2 < self.stride(); idx2++) {
// Calculate the starting offset for the scan
Index offset = idx1 + idx2;
ReduceScalar(self, offset, data);
}
}
};
// Specialization for vectorized reduction.
template <typename Self>
struct ReduceBlock<Self, /*Vectorize=*/true, /*Parallel=*/false> {
EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
typename Self::CoeffReturnType* data) {
using Packet = typename Self::PacketReturnType;
const int PacketSize = internal::unpacket_traits<Packet>::size;
Index idx2 = 0;
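// Illustrative example (not in the original): with self.stride() == 10 and
// PacketSize == 4, the packet loop below runs for idx2 == 0 and idx2 == 4,
// scanning columns [0, 8) a packet at a time, and the scalar loop then
// handles the two remaining columns 8 and 9.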
for (; idx2 + PacketSize <= self.stride(); idx2 += PacketSize) {
// Calculate the starting offset for the packet scan
Index offset = idx1 + idx2;
ReducePacket(self, offset, data);
}
for (; idx2 < self.stride(); idx2++) {
// Calculate the starting offset for the scan
Index offset = idx1 + idx2;
ReduceScalar(self, offset, data);
}
}
};
// Single-threaded CPU implementation of scan
template <typename Self, typename Reducer, typename Device,
bool Vectorize =
(TensorEvaluator<typename Self::ChildTypeNoConst, Device>::PacketAccess &&
internal::reducer_traits<Reducer, Device>::PacketAccess)>
struct ScanLauncher {
void operator()(Self& self, typename Self::CoeffReturnType* data) {
Index total_size = internal::array_prod(self.dimensions());
// We fix the index along the scan axis to 0 and perform a
// scan per remaining entry. The iteration is split into two nested
// loops to avoid an integer division by keeping track of each idx1 and
// idx2.
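// Illustrative example (not in the original): for a ColMajor tensor of shape
// (4, 5, 6) scanned along axis 1, stride() == 4 and size() == 5, so idx1
// advances in steps of 20 over the 6 outer slices while idx2 in [0, 4)
// selects a scan line inside the slice; coefficient k of that line lives at
// linear index idx1 + idx2 + k * 4.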
for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) {
ReduceBlock<Self, Vectorize, /*Parallel=*/false> block_reducer;
block_reducer(self, idx1, data);
}
}
};
#ifdef EIGEN_USE_THREADS
// Adjust block_size to avoid false sharing of cachelines among
// threads. Currently set to twice the cache line size on Intel and ARM
// processors.
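// Worked example (illustrative): for 4-byte float items,
// items_per_cacheline = max(1, 128 / 4) = 32, so a requested block_size of 50
// is rounded up to divup(50, 32) * 32 = 64, the next multiple of 32.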
EIGEN_STRONG_INLINE Index AdjustBlockSize(Index item_size, Index block_size) {
EIGEN_CONSTEXPR Index kBlockAlignment = 128;
const Index items_per_cacheline =
numext::maxi<Index>(1, kBlockAlignment / item_size);
return items_per_cacheline * divup(block_size, items_per_cacheline);
}
template <typename Self>
struct ReduceBlock<Self, /*Vectorize=*/true, /*Parallel=*/true> {
EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
typename Self::CoeffReturnType* data) {
using Scalar = typename Self::CoeffReturnType;
using Packet = typename Self::PacketReturnType;
const int PacketSize = internal::unpacket_traits<Packet>::size;
Index num_scalars = self.stride();
Index num_packets = 0;
if (self.stride() >= PacketSize) {
num_packets = self.stride() / PacketSize;
self.device().parallelFor(
num_packets,
TensorOpCost(PacketSize * self.size(), PacketSize * self.size(),
16 * PacketSize * self.size(), true, PacketSize),
// Make the shard size large enough that two neighboring threads
// won't write to the same cacheline of `data`.
[=](Index blk_size) {
return AdjustBlockSize(PacketSize * sizeof(Scalar), blk_size);
},
[&](Index first, Index last) {
for (Index packet = first; packet < last; ++packet) {
const Index idx2 = packet * PacketSize;
ReducePacket(self, idx1 + idx2, data);
}
});
num_scalars -= num_packets * PacketSize;
}
self.device().parallelFor(
num_scalars, TensorOpCost(self.size(), self.size(), 16 * self.size()),
// Make the shard size large enough that two neighboring threads
// won't write to the same cacheline of `data`.
[=](Index blk_size) {
return AdjustBlockSize(sizeof(Scalar), blk_size);
},
[&](Index first, Index last) {
for (Index scalar = first; scalar < last; ++scalar) {
const Index idx2 = num_packets * PacketSize + scalar;
ReduceScalar(self, idx1 + idx2, data);
}
});
}
};
template <typename Self>
struct ReduceBlock<Self, /*Vectorize=*/false, /*Parallel=*/true> {
EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
typename Self::CoeffReturnType* data) {
using Scalar = typename Self::CoeffReturnType;
self.device().parallelFor(
self.stride(), TensorOpCost(self.size(), self.size(), 16 * self.size()),
// Make the shard size large enough that two neighboring threads
// won't write to the same cacheline of `data`.
[=](Index blk_size) {
return AdjustBlockSize(sizeof(Scalar), blk_size);
},
[&](Index first, Index last) {
for (Index idx2 = first; idx2 < last; ++idx2) {
ReduceScalar(self, idx1 + idx2, data);
}
});
}
};
// Specialization for multi-threaded execution.
template <typename Self, typename Reducer, bool Vectorize>
struct ScanLauncher<Self, Reducer, ThreadPoolDevice, Vectorize> {
void operator()(Self& self, typename Self::CoeffReturnType* data) {
using Scalar = typename Self::CoeffReturnType;
using Packet = typename Self::PacketReturnType;
const int PacketSize = internal::unpacket_traits<Packet>::size;
const Index total_size = internal::array_prod(self.dimensions());
const Index inner_block_size = self.stride() * self.size();
bool parallelize_by_outer_blocks = (total_size >= (self.stride() * inner_block_size));
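// (Added note: the condition above is equivalent to requiring at least
// stride() outer blocks, i.e. total_size / inner_block_size >= stride(), so
// parallelizing across outer blocks already exposes enough independent work.)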
if ((parallelize_by_outer_blocks && total_size <= 4096) ||
(!parallelize_by_outer_blocks && self.stride() < PacketSize)) {
ScanLauncher<Self, Reducer, DefaultDevice, Vectorize> launcher;
launcher(self, data);
return;
}
if (parallelize_by_outer_blocks) {
// Parallelize over outer blocks.
const Index num_outer_blocks = total_size / inner_block_size;
self.device().parallelFor(
num_outer_blocks,
TensorOpCost(inner_block_size, inner_block_size,
16 * PacketSize * inner_block_size, Vectorize,
PacketSize),
[=](Index blk_size) {
return AdjustBlockSize(inner_block_size * sizeof(Scalar), blk_size);
},
[&](Index first, Index last) {
for (Index idx1 = first; idx1 < last; ++idx1) {
ReduceBlock<Self, Vectorize, /*Parallelize=*/false> block_reducer;
block_reducer(self, idx1 * inner_block_size, data);
}
});
} else {
// Parallelize over inner packets/scalars dimensions when the reduction
// axis is not an inner dimension.
ReduceBlock<Self, Vectorize, /*Parallelize=*/true> block_reducer;
for (Index idx1 = 0; idx1 < total_size;
idx1 += self.stride() * self.size()) {
block_reducer(self, idx1, data);
}
}
}
};
#endif // EIGEN_USE_THREADS
#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
// GPU implementation of scan
// TODO(ibab) This placeholder implementation performs multiple scans in
// parallel, but it would be better to use a parallel scan algorithm and
// optimize memory access.
template <typename Self, typename Reducer>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) {
// Compute offset as in the CPU version
Index val = threadIdx.x + blockIdx.x * blockDim.x;
Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride();
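// (Illustrative: with stride == 4 and size == 3, thread val == 5 falls in
//  outer block 5 / 4 == 1 and scan line 5 % 4 == 1, so offset == 13 and the
//  loop below visits indices 13, 17 and 21.)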
if (offset + (self.size() - 1) * self.stride() < total_size) {
// Compute the scan along the axis, starting at the calculated offset
typename Self::CoeffReturnType accum = self.accumulator().initialize();
for (Index idx = 0; idx < self.size(); idx++) {
Index curr = offset + idx * self.stride();
if (self.exclusive()) {
data[curr] = self.accumulator().finalize(accum);
self.accumulator().reduce(self.inner().coeff(curr), &accum);
} else {
self.accumulator().reduce(self.inner().coeff(curr), &accum);
data[curr] = self.accumulator().finalize(accum);
}
}
}
__syncthreads();
}
template <typename Self, typename Reducer, bool Vectorize>
struct ScanLauncher<Self, Reducer, GpuDevice, Vectorize> {
void operator()(const Self& self, typename Self::CoeffReturnType* data) {
Index total_size = internal::array_prod(self.dimensions());
Index num_blocks = (total_size / self.size() + 63) / 64;
Index block_size = 64;
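// (Illustrative: total_size / size() is the number of independent scan lines;
//  e.g. 1000 coefficients with size() == 10 give 100 lines, covered by
//  (100 + 63) / 64 == 2 blocks of 64 threads each.)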
LAUNCH_GPU_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
}
};
#endif // EIGEN_USE_GPU && (EIGEN_GPUCC)
} // namespace internal
// Eval as rvalue
template <typename Op, typename ArgType, typename Device>
struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
typedef TensorScanOp<Op, ArgType> XprType;
typedef typename XprType::Index Index;
typedef const ArgType ChildTypeNoConst;
typedef const ArgType ChildType;
static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
typedef DSizes<Index, NumDims> Dimensions;
typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
typedef TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> Self;
typedef StorageMemory<Scalar, Device> Storage;
typedef typename Storage::Type EvaluatorPointerType;
enum {
IsAligned = false,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = false,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
RawAccess = true
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlock;
//===--------------------------------------------------------------------===//
EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device),
m_device(device),
m_exclusive(op.exclusive()),
m_accumulator(op.accumulator()),
m_size(m_impl.dimensions()[op.axis()]),
m_stride(1), m_consume_dim(op.axis()),
m_output(NULL) {
// Accumulating a scalar isn't supported.
EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
eigen_assert(op.axis() >= 0 && op.axis() < NumDims);
// Compute stride of scan axis
const Dimensions& dims = m_impl.dimensions();
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = 0; i < op.axis(); ++i) {
m_stride = m_stride * dims[i];
}
} else {
// dims can only be indexed with unsigned integers, so use an unsigned type
// to let the compiler know. This silences spurious warnings such as:
// "'*((void*)(& evaluator)+64)[18446744073709551615]' may be used uninitialized in this function"
unsigned int axis = internal::convert_index<unsigned int>(op.axis());
for (unsigned int i = NumDims - 1; i > axis; --i) {
m_stride = m_stride * dims[i];
}
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
return m_impl.dimensions();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride() const {
return m_stride;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& consume_dim() const {
return m_consume_dim;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& size() const {
return m_size;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& accumulator() const {
return m_accumulator;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const {
return m_exclusive;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& inner() const {
return m_impl;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const {
return m_device;
}
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
m_impl.evalSubExprsIfNeeded(NULL);
internal::ScanLauncher<Self, Op, Device> launcher;
if (data) {
launcher(*this, data);
return false;
}
const Index total_size = internal::array_prod(dimensions());
m_output = static_cast<EvaluatorPointerType>(m_device.get((Scalar*) m_device.allocate_temp(total_size * sizeof(Scalar))));
launcher(*this, m_output);
return true;
}
template<int LoadMode>
EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
return internal::ploadt<PacketReturnType, LoadMode>(m_output + index);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const
{
return m_output;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
{
return m_output[index];
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const {
return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
}
EIGEN_STRONG_INLINE void cleanup() {
if (m_output) {
m_device.deallocate_temp(m_output);
m_output = NULL;
}
m_impl.cleanup();
}
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
m_output.bind(cgh);
}
#endif
protected:
TensorEvaluator<ArgType, Device> m_impl;
const Device EIGEN_DEVICE_REF m_device;
const bool m_exclusive;
Op m_accumulator;
const Index m_size;
Index m_stride;
Index m_consume_dim;
EvaluatorPointerType m_output;
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_SCAN_H

View File

@@ -0,0 +1,513 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Mehdi Goli Codeplay Software Ltd.
// Ralph Potter Codeplay Software Ltd.
// Luke Iwanski Codeplay Software Ltd.
// Contact: <eigen@codeplay.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
/*****************************************************************
* TensorScanSycl.h
*
* \brief:
* The SYCL tensor scan implements an extended version of
* "Efficient parallel scan algorithms for GPUs" for tensor operations.
* The algorithm requires up to three stages (and consequently up to three
* kernels), depending on the size of the tensor. In the first kernel
* (ScanKernelFunctor), each thread within the work-group individually reduces
* the elements allocated to it, in order to reduce the total number of
* blocks. In the next step, all threads within the work-group reduce the
* associated blocks into the temporary buffers. In the next kernel
* (ScanBlockKernelFunctor), the temporary buffer is given as input, and all
* threads within a work-group scan and reduce the boundaries between the
* blocks (generated by the previous kernel) and write the data to the
* temporary buffer. If the second kernel is required, the third and final
* kernel (ScanAdjustmentKernelFunctor) adjusts the final result into the
* output buffer.
* The original algorithm for the parallel prefix sum can be found here:
*
* Sengupta, Shubhabrata, Mark Harris, and Michael Garland. "Efficient parallel
* scan algorithms for GPUs." NVIDIA, Santa Clara, CA, Tech. Rep. NVR-2008-003
* 1, no. 1 (2008): 1-17.
*****************************************************************/
#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP
namespace Eigen {
namespace TensorSycl {
namespace internal {
#ifndef EIGEN_SYCL_MAX_GLOBAL_RANGE
#define EIGEN_SYCL_MAX_GLOBAL_RANGE (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 * 4)
#endif
template <typename index_t>
struct ScanParameters {
// must be power of 2
static EIGEN_CONSTEXPR index_t ScanPerThread = 8;
const index_t total_size;
const index_t non_scan_size;
const index_t scan_size;
const index_t non_scan_stride;
const index_t scan_stride;
const index_t panel_threads;
const index_t group_threads;
const index_t block_threads;
const index_t elements_per_group;
const index_t elements_per_block;
const index_t loop_range;
ScanParameters(index_t total_size_, index_t non_scan_size_, index_t scan_size_, index_t non_scan_stride_,
index_t scan_stride_, index_t panel_threads_, index_t group_threads_, index_t block_threads_,
index_t elements_per_group_, index_t elements_per_block_, index_t loop_range_)
: total_size(total_size_),
non_scan_size(non_scan_size_),
scan_size(scan_size_),
non_scan_stride(non_scan_stride_),
scan_stride(scan_stride_),
panel_threads(panel_threads_),
group_threads(group_threads_),
block_threads(block_threads_),
elements_per_group(elements_per_group_),
elements_per_block(elements_per_block_),
loop_range(loop_range_) {}
};
enum class scan_step { first, second };
template <typename Evaluator, typename CoeffReturnType, typename OutAccessor, typename Op, typename Index,
scan_step stp>
struct ScanKernelFunctor {
typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
LocalAccessor;
static EIGEN_CONSTEXPR int PacketSize = ScanParameters<Index>::ScanPerThread / 2;
LocalAccessor scratch;
Evaluator dev_eval;
OutAccessor out_accessor;
OutAccessor temp_accessor;
const ScanParameters<Index> scanParameters;
Op accumulator;
const bool inclusive;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanKernelFunctor(LocalAccessor scratch_, const Evaluator dev_eval_,
OutAccessor out_accessor_, OutAccessor temp_accessor_,
const ScanParameters<Index> scanParameters_, Op accumulator_,
const bool inclusive_)
: scratch(scratch_),
dev_eval(dev_eval_),
out_accessor(out_accessor_),
temp_accessor(temp_accessor_),
scanParameters(scanParameters_),
accumulator(accumulator_),
inclusive(inclusive_) {}
template <scan_step sst = stp, typename Input>
typename ::Eigen::internal::enable_if<sst == scan_step::first, CoeffReturnType>::type EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE
read(const Input &inpt, Index global_id) {
return inpt.coeff(global_id);
}
template <scan_step sst = stp, typename Input>
typename ::Eigen::internal::enable_if<sst != scan_step::first, CoeffReturnType>::type EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE
read(const Input &inpt, Index global_id) {
return inpt[global_id];
}
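// The two read() overloads above distinguish the kernel stages: in the first
// pass the input is the tensor evaluator and is read through coeff(), while
// in the second pass the input is the raw temporary buffer produced by the
// first pass and is indexed directly.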
template <scan_step sst = stp, typename InclusiveOp>
typename ::Eigen::internal::enable_if<sst == scan_step::first>::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
first_step_inclusive_Operation(InclusiveOp inclusive_op) {
inclusive_op();
}
template <scan_step sst = stp, typename InclusiveOp>
typename ::Eigen::internal::enable_if<sst != scan_step::first>::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
first_step_inclusive_Operation(InclusiveOp) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
auto out_ptr = out_accessor.get_pointer();
auto tmp_ptr = temp_accessor.get_pointer();
auto scratch_ptr = scratch.get_pointer().get();
for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
Index tmp = data_offset % scanParameters.panel_threads;
const Index panel_id = data_offset / scanParameters.panel_threads;
const Index group_id = tmp / scanParameters.group_threads;
tmp = tmp % scanParameters.group_threads;
const Index block_id = tmp / scanParameters.block_threads;
const Index local_id = tmp % scanParameters.block_threads;
// we put one element per packet in scratch_mem
const Index scratch_stride = scanParameters.elements_per_block / PacketSize;
const Index scratch_offset = (itemID.get_local_id(0) / scanParameters.block_threads) * scratch_stride;
CoeffReturnType private_scan[ScanParameters<Index>::ScanPerThread];
CoeffReturnType inclusive_scan;
// the actual panel size is scan_size * non_scan_size.
// elements_per_panel is rounded up to a power of 2 for the binary tree
const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size;
const Index group_offset = group_id * scanParameters.non_scan_stride;
// This will be effective when the size is bigger than elements_per_block
const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride;
const Index thread_offset = (ScanParameters<Index>::ScanPerThread * local_id * scanParameters.scan_stride);
const Index global_offset = panel_offset + group_offset + block_offset + thread_offset;
Index next_elements = 0;
EIGEN_UNROLL_LOOP
for (int i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
Index global_id = global_offset + next_elements;
private_scan[i] = ((((block_id * scanParameters.elements_per_block) +
(ScanParameters<Index>::ScanPerThread * local_id) + i) < scanParameters.scan_size) &&
(global_id < scanParameters.total_size))
? read(dev_eval, global_id)
: accumulator.initialize();
next_elements += scanParameters.scan_stride;
}
first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC {
if (inclusive) {
inclusive_scan = private_scan[ScanParameters<Index>::ScanPerThread - 1];
}
});
// This loop runs exactly twice (ScanPerThread / PacketSize == 2)
EIGEN_UNROLL_LOOP
for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) {
Index private_offset = 1;
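// (Illustrative, assuming PacketSize == 4 and a sum reducer: the up-sweep
//  below turns a packet [a, b, c, d] into [a, a+b, c, a+b+c+d]; the packet
//  total is then spilled to scratch and replaced by the identity, and the
//  down-sweep rewrites the packet as the exclusive scan [0, a, a+b, a+b+c].)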
// build sum in place up the tree
EIGEN_UNROLL_LOOP
for (Index d = PacketSize >> 1; d > 0; d >>= 1) {
EIGEN_UNROLL_LOOP
for (Index l = 0; l < d; l++) {
Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
CoeffReturnType accum = accumulator.initialize();
accumulator.reduce(private_scan[ai], &accum);
accumulator.reduce(private_scan[bi], &accum);
private_scan[bi] = accumulator.finalize(accum);
}
private_offset *= 2;
}
scratch_ptr[2 * local_id + (packetIndex / PacketSize) + scratch_offset] =
private_scan[PacketSize - 1 + packetIndex];
private_scan[PacketSize - 1 + packetIndex] = accumulator.initialize();
// traverse down tree & build scan
EIGEN_UNROLL_LOOP
for (Index d = 1; d < PacketSize; d *= 2) {
private_offset >>= 1;
EIGEN_UNROLL_LOOP
for (Index l = 0; l < d; l++) {
Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
CoeffReturnType accum = accumulator.initialize();
accumulator.reduce(private_scan[ai], &accum);
accumulator.reduce(private_scan[bi], &accum);
private_scan[ai] = private_scan[bi];
private_scan[bi] = accumulator.finalize(accum);
}
}
}
Index offset = 1;
// build sum in place up the tree
for (Index d = scratch_stride >> 1; d > 0; d >>= 1) {
// Synchronise
itemID.barrier(cl::sycl::access::fence_space::local_space);
if (local_id < d) {
Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
CoeffReturnType accum = accumulator.initialize();
accumulator.reduce(scratch_ptr[ai], &accum);
accumulator.reduce(scratch_ptr[bi], &accum);
scratch_ptr[bi] = accumulator.finalize(accum);
}
offset *= 2;
}
// Synchronise
itemID.barrier(cl::sycl::access::fence_space::local_space);
// Prepare for the next scan stage: thread 0 spills the block total to the
// temporary buffer when more than one block per group is needed.
if (local_id == 0) {
if (((scanParameters.elements_per_group / scanParameters.elements_per_block) > 1)) {
const Index temp_id = panel_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) *
scanParameters.non_scan_size +
group_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) +
block_id;
tmp_ptr[temp_id] = scratch_ptr[scratch_stride - 1 + scratch_offset];
}
// clear the last element
scratch_ptr[scratch_stride - 1 + scratch_offset] = accumulator.initialize();
}
// traverse down tree & build scan
for (Index d = 1; d < scratch_stride; d *= 2) {
offset >>= 1;
// Synchronise
itemID.barrier(cl::sycl::access::fence_space::local_space);
if (local_id < d) {
Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
CoeffReturnType accum = accumulator.initialize();
accumulator.reduce(scratch_ptr[ai], &accum);
accumulator.reduce(scratch_ptr[bi], &accum);
scratch_ptr[ai] = scratch_ptr[bi];
scratch_ptr[bi] = accumulator.finalize(accum);
}
}
// Synchronise
itemID.barrier(cl::sycl::access::fence_space::local_space);
// This loop runs exactly twice (ScanPerThread / PacketSize == 2)
EIGEN_UNROLL_LOOP
for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) {
EIGEN_UNROLL_LOOP
for (Index i = 0; i < PacketSize; i++) {
CoeffReturnType accum = private_scan[packetIndex + i];
accumulator.reduce(scratch_ptr[2 * local_id + (packetIndex / PacketSize) + scratch_offset], &accum);
private_scan[packetIndex + i] = accumulator.finalize(accum);
}
}
first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC {
if (inclusive) {
accumulator.reduce(private_scan[ScanParameters<Index>::ScanPerThread - 1], &inclusive_scan);
private_scan[0] = accumulator.finalize(inclusive_scan);
}
});
next_elements = 0;
// write the per-thread scan results back to the output
EIGEN_UNROLL_LOOP
for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
Index global_id = global_offset + next_elements;
if ((((block_id * scanParameters.elements_per_block) + (ScanParameters<Index>::ScanPerThread * local_id) + i) <
scanParameters.scan_size) &&
(global_id < scanParameters.total_size)) {
Index private_id = (i * !inclusive) + (((i + 1) % ScanParameters<Index>::ScanPerThread) * (inclusive));
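// (Added note: for an exclusive scan private_id == i; for an inclusive scan
//  the exclusive value stored at i + 1 is the inclusive result for element i,
//  and the wrapped index 0 holds the inclusive total written back above.)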
out_ptr[global_id] = private_scan[private_id];
}
next_elements += scanParameters.scan_stride;
}
} // end for loop
}
};
template <typename CoeffReturnType, typename InAccessor, typename OutAccessor, typename Op, typename Index>
struct ScanAdjustmentKernelFunctor {
typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
LocalAccessor;
static EIGEN_CONSTEXPR int PacketSize = ScanParameters<Index>::ScanPerThread / 2;
InAccessor in_accessor;
OutAccessor out_accessor;
const ScanParameters<Index> scanParameters;
Op accumulator;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanAdjustmentKernelFunctor(LocalAccessor, InAccessor in_accessor_,
OutAccessor out_accessor_,
const ScanParameters<Index> scanParameters_,
Op accumulator_)
: in_accessor(in_accessor_),
out_accessor(out_accessor_),
scanParameters(scanParameters_),
accumulator(accumulator_) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
auto in_ptr = in_accessor.get_pointer();
auto out_ptr = out_accessor.get_pointer();
for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
Index tmp = data_offset % scanParameters.panel_threads;
const Index panel_id = data_offset / scanParameters.panel_threads;
const Index group_id = tmp / scanParameters.group_threads;
tmp = tmp % scanParameters.group_threads;
const Index block_id = tmp / scanParameters.block_threads;
const Index local_id = tmp % scanParameters.block_threads;
// the actual panel size is scan_size * non_scan_size.
// elements_per_panel is rounded up to a power of 2 for the binary tree
const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size;
const Index group_offset = group_id * scanParameters.non_scan_stride;
// This will be effective when the size is bigger than elements_per_block
const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride;
const Index thread_offset = ScanParameters<Index>::ScanPerThread * local_id * scanParameters.scan_stride;
const Index global_offset = panel_offset + group_offset + block_offset + thread_offset;
const Index block_size = scanParameters.elements_per_group / scanParameters.elements_per_block;
const Index in_id = (panel_id * block_size * scanParameters.non_scan_size) + (group_id * block_size) + block_id;
CoeffReturnType adjust_val = in_ptr[in_id];
Index next_elements = 0;
EIGEN_UNROLL_LOOP
for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
Index global_id = global_offset + next_elements;
if ((((block_id * scanParameters.elements_per_block) + (ScanParameters<Index>::ScanPerThread * local_id) + i) <
scanParameters.scan_size) &&
(global_id < scanParameters.total_size)) {
CoeffReturnType accum = adjust_val;
accumulator.reduce(out_ptr[global_id], &accum);
out_ptr[global_id] = accumulator.finalize(accum);
}
next_elements += scanParameters.scan_stride;
}
}
}
};
template <typename Index>
struct ScanInfo {
const Index &total_size;
const Index &scan_size;
const Index &panel_size;
const Index &non_scan_size;
const Index &scan_stride;
const Index &non_scan_stride;
Index max_elements_per_block;
Index block_size;
Index panel_threads;
Index group_threads;
Index block_threads;
Index elements_per_group;
Index elements_per_block;
Index loop_range;
Index global_range;
Index local_range;
const Eigen::SyclDevice &dev;
EIGEN_STRONG_INLINE ScanInfo(const Index &total_size_, const Index &scan_size_, const Index &panel_size_,
const Index &non_scan_size_, const Index &scan_stride_, const Index &non_scan_stride_,
const Eigen::SyclDevice &dev_)
: total_size(total_size_),
scan_size(scan_size_),
panel_size(panel_size_),
non_scan_size(non_scan_size_),
scan_stride(scan_stride_),
non_scan_stride(non_scan_stride_),
dev(dev_) {
// must be power of 2
local_range = std::min(Index(dev.getNearestPowerOfTwoWorkGroupSize()),
Index(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1));
max_elements_per_block = local_range * ScanParameters<Index>::ScanPerThread;
elements_per_group =
dev.getPowerOfTwo(Index(roundUp(Index(scan_size), ScanParameters<Index>::ScanPerThread)), true);
const Index elements_per_panel = elements_per_group * non_scan_size;
elements_per_block = std::min(Index(elements_per_group), Index(max_elements_per_block));
panel_threads = elements_per_panel / ScanParameters<Index>::ScanPerThread;
group_threads = elements_per_group / ScanParameters<Index>::ScanPerThread;
block_threads = elements_per_block / ScanParameters<Index>::ScanPerThread;
block_size = elements_per_group / elements_per_block;
#ifdef EIGEN_SYCL_MAX_GLOBAL_RANGE
const Index max_threads = std::min(Index(panel_threads * panel_size), Index(EIGEN_SYCL_MAX_GLOBAL_RANGE));
#else
const Index max_threads = panel_threads * panel_size;
#endif
global_range = roundUp(max_threads, local_range);
loop_range = Index(
std::ceil(double(elements_per_panel * panel_size) / (global_range * ScanParameters<Index>::ScanPerThread)));
}
inline ScanParameters<Index> get_scan_parameter() {
return ScanParameters<Index>(total_size, non_scan_size, scan_size, non_scan_stride, scan_stride, panel_threads,
group_threads, block_threads, elements_per_group, elements_per_block, loop_range);
}
inline cl::sycl::nd_range<1> get_thread_range() {
return cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
}
};
template <typename EvaluatorPointerType, typename CoeffReturnType, typename Reducer, typename Index>
struct SYCLAdjustBlockOffset {
EIGEN_STRONG_INLINE static void adjust_scan_block_offset(EvaluatorPointerType in_ptr, EvaluatorPointerType out_ptr,
Reducer &accumulator, const Index total_size,
const Index scan_size, const Index panel_size,
const Index non_scan_size, const Index scan_stride,
const Index non_scan_stride, const Eigen::SyclDevice &dev) {
auto scan_info =
ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev);
typedef ScanAdjustmentKernelFunctor<CoeffReturnType, EvaluatorPointerType, EvaluatorPointerType, Reducer, Index>
AdjustFunctor;
dev.template unary_kernel_launcher<CoeffReturnType, AdjustFunctor>(in_ptr, out_ptr, scan_info.get_thread_range(),
scan_info.max_elements_per_block,
scan_info.get_scan_parameter(), accumulator);
}
};
template <typename CoeffReturnType, scan_step stp>
struct ScanLauncher_impl {
template <typename Input, typename EvaluatorPointerType, typename Reducer, typename Index>
EIGEN_STRONG_INLINE static void scan_block(Input in_ptr, EvaluatorPointerType out_ptr, Reducer &accumulator,
const Index total_size, const Index scan_size, const Index panel_size,
const Index non_scan_size, const Index scan_stride,
const Index non_scan_stride, const bool inclusive,
const Eigen::SyclDevice &dev) {
auto scan_info =
ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev);
const Index temp_pointer_size = scan_info.block_size * non_scan_size * panel_size;
const Index scratch_size = scan_info.max_elements_per_block / (ScanParameters<Index>::ScanPerThread / 2);
CoeffReturnType *temp_pointer =
static_cast<CoeffReturnType *>(dev.allocate_temp(temp_pointer_size * sizeof(CoeffReturnType)));
EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
typedef ScanKernelFunctor<Input, CoeffReturnType, EvaluatorPointerType, Reducer, Index, stp> ScanFunctor;
dev.template binary_kernel_launcher<CoeffReturnType, ScanFunctor>(
in_ptr, out_ptr, tmp_global_accessor, scan_info.get_thread_range(), scratch_size,
scan_info.get_scan_parameter(), accumulator, inclusive);
if (scan_info.block_size > 1) {
ScanLauncher_impl<CoeffReturnType, scan_step::second>::scan_block(
tmp_global_accessor, tmp_global_accessor, accumulator, temp_pointer_size, scan_info.block_size, panel_size,
non_scan_size, Index(1), scan_info.block_size, false, dev);
SYCLAdjustBlockOffset<EvaluatorPointerType, CoeffReturnType, Reducer, Index>::adjust_scan_block_offset(
tmp_global_accessor, out_ptr, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride,
non_scan_stride, dev);
}
dev.deallocate_temp(temp_pointer);
}
};
} // namespace internal
} // namespace TensorSycl
namespace internal {
template <typename Self, typename Reducer, bool vectorize>
struct ScanLauncher<Self, Reducer, Eigen::SyclDevice, vectorize> {
typedef typename Self::Index Index;
typedef typename Self::CoeffReturnType CoeffReturnType;
typedef typename Self::Storage Storage;
typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
void operator()(Self &self, EvaluatorPointerType data) {
const Index total_size = internal::array_prod(self.dimensions());
const Index scan_size = self.size();
const Index scan_stride = self.stride();
// the accumulator op used for the scan (e.g. a sum or product reducer)
auto accumulator = self.accumulator();
auto inclusive = !self.exclusive();
auto consume_dim = self.consume_dim();
auto dev = self.device();
auto dims = self.inner().dimensions();
Index non_scan_size = 1;
Index panel_size = 1;
if (static_cast<int>(Self::Layout) == static_cast<int>(ColMajor)) {
for (int i = 0; i < consume_dim; i++) {
non_scan_size *= dims[i];
}
for (int i = consume_dim + 1; i < Self::NumDims; i++) {
panel_size *= dims[i];
}
} else {
for (int i = Self::NumDims - 1; i > consume_dim; i--) {
non_scan_size *= dims[i];
}
for (int i = consume_dim - 1; i >= 0; i--) {
panel_size *= dims[i];
}
}
const Index non_scan_stride = (scan_stride > 1) ? 1 : scan_size;
auto eval_impl = self.inner();
TensorSycl::internal::ScanLauncher_impl<CoeffReturnType, TensorSycl::internal::scan_step::first>::scan_block(
eval_impl, data, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride,
inclusive, dev);
}
};
} // namespace internal
} // namespace Eigen
#endif // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP

View File

@@ -0,0 +1,471 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H
#define EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H
namespace Eigen {
/** \class TensorShuffling
* \ingroup CXX11_Tensor_Module
*
* \brief Tensor shuffling class.
*
*
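* A minimal usage sketch (added for illustration, not part of the original
* header): shuffle() permutes the dimensions of a tensor according to the
* given index permutation; output dimension i has the size of input
* dimension shuffle[i].
*
* \code
* Eigen::Tensor<float, 3> in(2, 3, 4);
* in.setRandom();
* Eigen::array<int, 3> perm{{2, 0, 1}};
* Eigen::Tensor<float, 3> out = in.shuffle(perm);
* // out has dimensions (4, 2, 3) and out(i, j, k) == in(j, k, i).
* \endcode
*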
*/
namespace internal {
template<typename Shuffle, typename XprType>
struct traits<TensorShufflingOp<Shuffle, XprType> > : public traits<XprType>
{
typedef typename XprType::Scalar Scalar;
typedef traits<XprType> XprTraits;
typedef typename XprTraits::StorageKind StorageKind;
typedef typename XprTraits::Index Index;
typedef typename XprType::Nested Nested;
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int Layout = XprTraits::Layout;
typedef typename XprTraits::PointerType PointerType;
};
template<typename Shuffle, typename XprType>
struct eval<TensorShufflingOp<Shuffle, XprType>, Eigen::Dense>
{
typedef const TensorShufflingOp<Shuffle, XprType>& type;
};
template<typename Shuffle, typename XprType>
struct nested<TensorShufflingOp<Shuffle, XprType>, 1, typename eval<TensorShufflingOp<Shuffle, XprType> >::type>
{
typedef TensorShufflingOp<Shuffle, XprType> type;
};
} // end namespace internal
template<typename Shuffle, typename XprType>
class TensorShufflingOp : public TensorBase<TensorShufflingOp<Shuffle, XprType> >
{
public:
typedef TensorBase<TensorShufflingOp<Shuffle, XprType> > Base;
typedef typename Eigen::internal::traits<TensorShufflingOp>::Scalar Scalar;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename Eigen::internal::nested<TensorShufflingOp>::type Nested;
typedef typename Eigen::internal::traits<TensorShufflingOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorShufflingOp>::Index Index;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp(const XprType& expr, const Shuffle& shfl)
: m_xpr(expr), m_shuffle(shfl) {}
EIGEN_DEVICE_FUNC
const Shuffle& shufflePermutation() const { return m_shuffle; }
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename XprType::Nested>::type&
expression() const { return m_xpr; }
EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorShufflingOp)
protected:
typename XprType::Nested m_xpr;
const Shuffle m_shuffle;
};
// Eval as rvalue
template<typename Shuffle, typename ArgType, typename Device>
struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
{
typedef TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> Self;
typedef TensorShufflingOp<Shuffle, ArgType> XprType;
typedef typename XprType::Index Index;
static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
typedef DSizes<Index, NumDims> Dimensions;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
typedef StorageMemory<CoeffReturnType, Device> Storage;
typedef typename Storage::Type EvaluatorPointerType;
enum {
IsAligned = false,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = TensorEvaluator<ArgType, Device>::RawAccess,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
Layout, Index>
TensorBlock;
//===--------------------------------------------------------------------===//
EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_device(device),
m_impl(op.expression(), device)
{
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
const Shuffle& shuffle = op.shufflePermutation();
m_is_identity = true;
for (int i = 0; i < NumDims; ++i) {
m_shuffle[i] = static_cast<int>(shuffle[i]);
m_dimensions[i] = input_dims[shuffle[i]];
m_inverseShuffle[shuffle[i]] = i;
if (m_is_identity && shuffle[i] != i) {
m_is_identity = false;
}
}
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
m_unshuffledInputStrides[0] = 1;
m_outputStrides[0] = 1;
for (int i = 1; i < NumDims; ++i) {
m_unshuffledInputStrides[i] =
m_unshuffledInputStrides[i - 1] * input_dims[i - 1];
m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(
m_outputStrides[i] > 0 ? m_outputStrides[i] : Index(1));
}
} else {
m_unshuffledInputStrides[NumDims - 1] = 1;
m_outputStrides[NumDims - 1] = 1;
for (int i = NumDims - 2; i >= 0; --i) {
m_unshuffledInputStrides[i] =
m_unshuffledInputStrides[i + 1] * input_dims[i + 1];
m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(
m_outputStrides[i] > 0 ? m_outputStrides[i] : Index(1));
}
}
for (int i = 0; i < NumDims; ++i) {
m_inputStrides[i] = m_unshuffledInputStrides[shuffle[i]];
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
m_impl.evalSubExprsIfNeeded(NULL);
return true;
}
#ifdef EIGEN_USE_THREADS
template <typename EvalSubExprsCallback>
EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
EvaluatorPointerType, EvalSubExprsCallback done) {
m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
}
#endif // EIGEN_USE_THREADS
EIGEN_STRONG_INLINE void cleanup() {
m_impl.cleanup();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
{
if (m_is_identity) {
return m_impl.coeff(index);
} else {
return m_impl.coeff(srcCoeff(index));
}
}
template <int LoadMode, typename Self, bool ImplPacketAccess>
struct PacketLoader {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
static PacketReturnType Run(const Self& self, Index index) {
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
EIGEN_UNROLL_LOOP
for (int i = 0; i < PacketSize; ++i) {
values[i] = self.coeff(index + i);
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
return rslt;
}
};
template<int LoadMode, typename Self>
struct PacketLoader<LoadMode, Self, true> {
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
static PacketReturnType Run(const Self& self, Index index) {
if (self.m_is_identity) {
return self.m_impl.template packet<LoadMode>(index);
} else {
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
EIGEN_UNROLL_LOOP
for (int i = 0; i < PacketSize; ++i) {
values[i] = self.coeff(index + i);
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
return rslt;
}
}
};
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
return PacketLoader<LoadMode, Self, TensorEvaluator<ArgType, Device>::PacketAccess>::Run(*this, index);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
internal::TensorBlockResourceRequirements getResourceRequirements() const {
static const int inner_dim =
Layout == static_cast<int>(ColMajor) ? 0 : NumDims - 1;
const size_t target_size = m_device.firstLevelCacheSize();
const bool inner_dim_shuffled = m_shuffle[inner_dim] != inner_dim;
// A shuffled inner dimension leads to random memory accesses, which are not
// captured by the default cost model's bytes loaded/stored, so we add this
// cost explicitly. The number of cycles was picked based on benchmarks.
// TODO(ezhulenev): This number was picked based on very questionable
// benchmarks; add benchmarks that are representative of real workloads.
using BlockRequirements = internal::TensorBlockResourceRequirements;
if (inner_dim_shuffled) {
return BlockRequirements::uniform<Scalar>(target_size)
.addCostPerCoeff({0, 0, NumDims * 28});
} else {
return BlockRequirements::skewed<Scalar>(target_size);
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
bool root_of_expr_ast = false) const {
assert(m_impl.data() != NULL);
typedef internal::TensorBlockIO<ScalarNoConst, Index, NumDims, Layout>
TensorBlockIO;
typedef typename TensorBlockIO::Dst TensorBlockIODst;
typedef typename TensorBlockIO::Src TensorBlockIOSrc;
const typename TensorBlock::Storage block_storage =
TensorBlock::prepareStorage(
desc, scratch, /*allow_strided_storage=*/root_of_expr_ast);
typename TensorBlockIO::Dimensions input_strides(m_unshuffledInputStrides);
TensorBlockIOSrc src(input_strides, m_impl.data(), srcCoeff(desc.offset()));
TensorBlockIODst dst(block_storage.dimensions(), block_storage.strides(),
block_storage.data());
typename TensorBlockIO::DimensionsMap dst_to_src_dim_map(m_shuffle);
TensorBlockIO::Copy(dst, src, dst_to_src_dim_map);
return block_storage.AsTensorMaterializedBlock();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
const double compute_cost = m_is_identity ? TensorOpCost::AddCost<Index>() :
NumDims * (2 * TensorOpCost::AddCost<Index>() +
2 * TensorOpCost::MulCost<Index>() +
TensorOpCost::DivCost<Index>());
return m_impl.costPerCoeff(vectorized) +
TensorOpCost(0, 0, compute_cost, m_is_identity /* vectorized */, PacketSize);
}
EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index GetBlockOutputIndex(
Index input_index,
const DSizes<Index, NumDims>& input_block_strides,
const DSizes<Index, NumDims>& output_block_strides,
const DSizes<internal::TensorIntDivisor<Index>, NumDims>& fast_input_block_strides) const {
Index output_index = 0;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = NumDims - 1; i > 0; --i) {
const Index idx = input_index / fast_input_block_strides[i];
output_index += idx * output_block_strides[m_inverseShuffle[i]];
input_index -= idx * input_block_strides[i];
}
return output_index + input_index *
output_block_strides[m_inverseShuffle[0]];
} else {
for (int i = 0; i < NumDims - 1; ++i) {
const Index idx = input_index / fast_input_block_strides[i];
output_index += idx * output_block_strides[m_inverseShuffle[i]];
input_index -= idx * input_block_strides[i];
}
return output_index + input_index *
output_block_strides[m_inverseShuffle[NumDims - 1]];
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const {
Index inputIndex = 0;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = NumDims - 1; i > 0; --i) {
const Index idx = index / m_fastOutputStrides[i];
inputIndex += idx * m_inputStrides[i];
index -= idx * m_outputStrides[i];
}
return inputIndex + index * m_inputStrides[0];
} else {
for (int i = 0; i < NumDims - 1; ++i) {
const Index idx = index / m_fastOutputStrides[i];
inputIndex += idx * m_inputStrides[i];
index -= idx * m_outputStrides[i];
}
return inputIndex + index * m_inputStrides[NumDims - 1];
}
}
Dimensions m_dimensions;
bool m_is_identity;
array<int, NumDims> m_shuffle;
array<Index, NumDims> m_inverseShuffle; // TODO(ezhulenev): Make it int type.
array<Index, NumDims> m_outputStrides;
array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
array<Index, NumDims> m_inputStrides;
array<Index, NumDims> m_unshuffledInputStrides;
const Device EIGEN_DEVICE_REF m_device;
TensorEvaluator<ArgType, Device> m_impl;
};
// Eval as lvalue
template<typename Shuffle, typename ArgType, typename Device>
struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
: public TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
{
typedef TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> Base;
typedef TensorShufflingOp<Shuffle, ArgType> XprType;
typedef typename XprType::Index Index;
static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
typedef DSizes<Index, NumDims> Dimensions;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
enum {
IsAligned = false,
PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
BlockAccess = TensorEvaluator<ArgType, Device>::RawAccess,
PreferBlockAccess = true,
Layout = TensorEvaluator<ArgType, Device>::Layout,
RawAccess = false
};
typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
//===--------------------------------------------------------------------===//
EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: Base(op, device)
{ }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
{
return this->m_impl.coeffRef(this->srcCoeff(index));
}
template <int StoreMode> EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
{
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
EIGEN_UNROLL_LOOP
for (int i = 0; i < PacketSize; ++i) {
this->coeffRef(index+i) = values[i];
}
}
template <typename TensorBlock>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
const TensorBlockDesc& desc, const TensorBlock& block) {
eigen_assert(this->m_impl.data() != NULL);
typedef internal::TensorBlockIO<ScalarNoConst, Index, NumDims, Layout>
TensorBlockIO;
typedef typename TensorBlockIO::Dst TensorBlockIODst;
typedef typename TensorBlockIO::Src TensorBlockIOSrc;
const Scalar* block_buffer = block.data();
// TODO(ezhulenev): TensorBlockIO should be able to read from any Eigen
// expression with coefficient and packet access as `src`.
void* mem = NULL;
if (block_buffer == NULL) {
mem = this->m_device.allocate(desc.size() * sizeof(Scalar));
ScalarNoConst* buf = static_cast<ScalarNoConst*>(mem);
typedef internal::TensorBlockAssignment<
ScalarNoConst, NumDims, typename TensorBlock::XprType, Index>
TensorBlockAssignment;
TensorBlockAssignment::Run(
TensorBlockAssignment::target(
desc.dimensions(), internal::strides<Layout>(desc.dimensions()),
buf),
block.expr());
block_buffer = buf;
}
// Read from block.
TensorBlockIOSrc src(internal::strides<Layout>(desc.dimensions()),
block_buffer);
// Write to the output buffer.
typename TensorBlockIO::Dimensions output_strides(
this->m_unshuffledInputStrides);
typename TensorBlockIO::Dimensions output_dimensions;
for (int i = 0; i < NumDims; ++i) {
output_dimensions[this->m_shuffle[i]] = desc.dimension(i);
}
TensorBlockIODst dst(output_dimensions, output_strides, this->m_impl.data(),
this->srcCoeff(desc.offset()));
// Reorder dimensions according to the shuffle.
typename TensorBlockIO::DimensionsMap dst_to_src_dim_map;
for (int i = 0; i < NumDims; ++i) {
dst_to_src_dim_map[i] = static_cast<int>(this->m_inverseShuffle[i]);
}
TensorBlockIO::Copy(dst, src, dst_to_src_dim_map);
// Deallocate temporary buffer used for the block materialization.
if (mem != NULL) this->m_device.deallocate(mem);
}
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H
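// A minimal usage sketch of the lvalue shuffling evaluator above: writing
// through a shuffled expression permutes the dimensions on assignment.
// Illustrative only; it assumes the unsupported CXX11 Tensor module API.
#include <unsupported/Eigen/CXX11/Tensor>

inline void shuffle_lvalue_sketch() {
  Eigen::Tensor<float, 3> dst(4, 3, 2);
  Eigen::Tensor<float, 3> src(3, 2, 4);  // matches dst.shuffle({1, 2, 0}) dims
  src.setRandom();
  Eigen::array<int, 3> shuffle{{1, 2, 0}};
  // The shuffled view of dst has dimensions (dst.dim(1), dst.dim(2), dst.dim(0));
  // assigning to it routes each coefficient through srcCoeff()/writePacket().
  dst.shuffle(shuffle) = src;
}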

View File

@@ -0,0 +1,161 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
// Copyright (C) 2014-2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSORSTORAGE_H
#define EIGEN_CXX11_TENSOR_TENSORSTORAGE_H
#ifdef EIGEN_TENSOR_STORAGE_CTOR_PLUGIN
#define EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN EIGEN_TENSOR_STORAGE_CTOR_PLUGIN;
#else
#define EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN
#endif
namespace Eigen {
/** \internal
*
* \class TensorStorage
* \ingroup CXX11_Tensor_Module
*
* \brief Stores the data of a tensor
*
 * This class stores the data of fixed-size, dynamic-size or mixed tensors
 * as compactly as possible.
*
* \sa Tensor
*/
template<typename T, typename Dimensions, int Options> class TensorStorage;
// Pure fixed-size storage
template<typename T, typename FixedDimensions, int Options_>
class TensorStorage
{
private:
static const std::size_t Size = FixedDimensions::total_size;
// Allocate an array of size at least one to prevent compiler warnings.
static const std::size_t MinSize = max_n_1<Size>::size;
EIGEN_ALIGN_MAX T m_data[MinSize];
public:
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE TensorStorage() {
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE T *data() { return m_data; }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const T *data() const { return m_data; }
static EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const FixedDimensions& dimensions()
{
static const FixedDimensions* singleton_dimensions = new FixedDimensions();
return *singleton_dimensions;
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE DenseIndex size() const { return Size; }
};
// pure dynamic
template<typename T, typename IndexType, int NumIndices_, int Options_>
class TensorStorage<T, DSizes<IndexType, NumIndices_>, Options_>
{
public:
typedef IndexType Index;
typedef DSizes<IndexType, NumIndices_> Dimensions;
typedef TensorStorage<T, DSizes<IndexType, NumIndices_>, Options_> Self;
EIGEN_DEVICE_FUNC TensorStorage() : m_data(0), m_dimensions() {
if (NumIndices_ == 0) {
m_data = internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(1);
}
}
EIGEN_DEVICE_FUNC TensorStorage(internal::constructor_without_unaligned_array_assert)
: m_data(0), m_dimensions(internal::template repeat<NumIndices_, Index>(0)) {}
EIGEN_DEVICE_FUNC TensorStorage(Index size, const array<Index, NumIndices_>& dimensions)
: m_data(internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(size)), m_dimensions(dimensions)
{ EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN }
#if EIGEN_HAS_VARIADIC_TEMPLATES
template <typename... DenseIndex>
EIGEN_DEVICE_FUNC TensorStorage(DenseIndex... indices) : m_dimensions(indices...) {
m_data = internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(internal::array_prod(m_dimensions));
}
#endif
EIGEN_DEVICE_FUNC TensorStorage(const Self& other)
: m_data(internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(internal::array_prod(other.m_dimensions)))
, m_dimensions(other.m_dimensions)
{
internal::smart_copy(other.m_data, other.m_data+internal::array_prod(other.m_dimensions), m_data);
}
EIGEN_DEVICE_FUNC Self& operator=(const Self& other)
{
if (this != &other) {
Self tmp(other);
this->swap(tmp);
}
return *this;
}
#if EIGEN_HAS_RVALUE_REFERENCES
EIGEN_DEVICE_FUNC TensorStorage(Self&& other) : TensorStorage()
{
*this = std::move(other);
}
EIGEN_DEVICE_FUNC Self& operator=(Self&& other)
{
numext::swap(m_data, other.m_data);
numext::swap(m_dimensions, other.m_dimensions);
return *this;
}
#endif
EIGEN_DEVICE_FUNC ~TensorStorage() { internal::conditional_aligned_delete_auto<T,(Options_&DontAlign)==0>(m_data, internal::array_prod(m_dimensions)); }
EIGEN_DEVICE_FUNC void swap(Self& other)
{ numext::swap(m_data,other.m_data); numext::swap(m_dimensions,other.m_dimensions); }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {return m_dimensions;}
EIGEN_DEVICE_FUNC void resize(Index size, const array<Index, NumIndices_>& nbDimensions)
{
const Index currentSz = internal::array_prod(m_dimensions);
if(size != currentSz)
{
internal::conditional_aligned_delete_auto<T,(Options_&DontAlign)==0>(m_data, currentSz);
if (size)
m_data = internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(size);
else if (NumIndices_ == 0) {
m_data = internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(1);
}
else
m_data = 0;
EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
}
m_dimensions = nbDimensions;
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T *data() { return m_data; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T *data() const { return m_data; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); }
private:
T *m_data;
Dimensions m_dimensions;
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSORSTORAGE_H
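// A small sketch of how the dynamic TensorStorage above is exercised through
// the public Tensor type; resize() follows the delete/new path shown in
// TensorStorage::resize(). Illustrative only, assuming the CXX11 Tensor module.
#include <unsupported/Eigen/CXX11/Tensor>

inline void storage_resize_sketch() {
  Eigen::Tensor<double, 2> t(2, 3);  // dynamic storage holding 6 doubles
  t.setZero();
  // Changing the total size reallocates the buffer; the old contents are
  // discarded, mirroring the conditional_aligned_delete_auto/new_auto pair above.
  t.resize(4, 5);
  // t.size() == 20 and the coefficients are uninitialized after the resize.
}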

View File

@@ -0,0 +1,346 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H
#define EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H
namespace Eigen {
/** \class TensorStriding
* \ingroup CXX11_Tensor_Module
*
* \brief Tensor striding class.
*
*
*/
namespace internal {
template<typename Strides, typename XprType>
struct traits<TensorStridingOp<Strides, XprType> > : public traits<XprType>
{
typedef typename XprType::Scalar Scalar;
typedef traits<XprType> XprTraits;
typedef typename XprTraits::StorageKind StorageKind;
typedef typename XprTraits::Index Index;
typedef typename XprType::Nested Nested;
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = XprTraits::NumDimensions;
static const int Layout = XprTraits::Layout;
typedef typename XprTraits::PointerType PointerType;
};
template<typename Strides, typename XprType>
struct eval<TensorStridingOp<Strides, XprType>, Eigen::Dense>
{
typedef const TensorStridingOp<Strides, XprType>EIGEN_DEVICE_REF type;
};
template<typename Strides, typename XprType>
struct nested<TensorStridingOp<Strides, XprType>, 1, typename eval<TensorStridingOp<Strides, XprType> >::type>
{
typedef TensorStridingOp<Strides, XprType> type;
};
} // end namespace internal
template<typename Strides, typename XprType>
class TensorStridingOp : public TensorBase<TensorStridingOp<Strides, XprType> >
{
public:
typedef TensorBase<TensorStridingOp<Strides, XprType> > Base;
typedef typename Eigen::internal::traits<TensorStridingOp>::Scalar Scalar;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename Eigen::internal::nested<TensorStridingOp>::type Nested;
typedef typename Eigen::internal::traits<TensorStridingOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorStridingOp>::Index Index;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingOp(const XprType& expr, const Strides& dims)
: m_xpr(expr), m_dims(dims) {}
EIGEN_DEVICE_FUNC
const Strides& strides() const { return m_dims; }
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename XprType::Nested>::type&
expression() const { return m_xpr; }
EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorStridingOp)
protected:
typename XprType::Nested m_xpr;
const Strides m_dims;
};
// Eval as rvalue
template<typename Strides, typename ArgType, typename Device>
struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
{
typedef TensorStridingOp<Strides, ArgType> XprType;
typedef typename XprType::Index Index;
static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
typedef DSizes<Index, NumDims> Dimensions;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
typedef StorageMemory<CoeffReturnType, Device> Storage;
typedef typename Storage::Type EvaluatorPointerType;
enum {
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlock;
//===--------------------------------------------------------------------===//
EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device)
{
m_dimensions = m_impl.dimensions();
for (int i = 0; i < NumDims; ++i) {
m_dimensions[i] = Eigen::numext::ceil(static_cast<float>(m_dimensions[i]) / op.strides()[i]);
}
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
m_outputStrides[0] = 1;
m_inputStrides[0] = 1;
for (int i = 1; i < NumDims; ++i) {
m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
m_inputStrides[i-1] *= op.strides()[i-1];
}
m_inputStrides[NumDims-1] *= op.strides()[NumDims-1];
} else { // RowMajor
m_outputStrides[NumDims-1] = 1;
m_inputStrides[NumDims-1] = 1;
for (int i = NumDims - 2; i >= 0; --i) {
m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1];
m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
m_inputStrides[i+1] *= op.strides()[i+1];
}
m_inputStrides[0] *= op.strides()[0];
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType/*data*/) {
m_impl.evalSubExprsIfNeeded(NULL);
return true;
}
EIGEN_STRONG_INLINE void cleanup() {
m_impl.cleanup();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
{
return m_impl.coeff(srcCoeff(index));
}
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
Index inputIndices[] = {0, 0};
Index indices[] = {index, index + PacketSize - 1};
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
EIGEN_UNROLL_LOOP
for (int i = NumDims - 1; i > 0; --i) {
const Index idx0 = indices[0] / m_outputStrides[i];
const Index idx1 = indices[1] / m_outputStrides[i];
inputIndices[0] += idx0 * m_inputStrides[i];
inputIndices[1] += idx1 * m_inputStrides[i];
indices[0] -= idx0 * m_outputStrides[i];
indices[1] -= idx1 * m_outputStrides[i];
}
inputIndices[0] += indices[0] * m_inputStrides[0];
inputIndices[1] += indices[1] * m_inputStrides[0];
} else { // RowMajor
EIGEN_UNROLL_LOOP
for (int i = 0; i < NumDims - 1; ++i) {
const Index idx0 = indices[0] / m_outputStrides[i];
const Index idx1 = indices[1] / m_outputStrides[i];
inputIndices[0] += idx0 * m_inputStrides[i];
inputIndices[1] += idx1 * m_inputStrides[i];
indices[0] -= idx0 * m_outputStrides[i];
indices[1] -= idx1 * m_outputStrides[i];
}
inputIndices[0] += indices[0] * m_inputStrides[NumDims-1];
inputIndices[1] += indices[1] * m_inputStrides[NumDims-1];
}
if (inputIndices[1] - inputIndices[0] == PacketSize - 1) {
PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]);
return rslt;
}
else {
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
values[0] = m_impl.coeff(inputIndices[0]);
values[PacketSize-1] = m_impl.coeff(inputIndices[1]);
EIGEN_UNROLL_LOOP
for (int i = 1; i < PacketSize-1; ++i) {
values[i] = coeff(index+i);
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
return rslt;
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
double compute_cost = (NumDims - 1) * (TensorOpCost::AddCost<Index>() +
TensorOpCost::MulCost<Index>() +
TensorOpCost::DivCost<Index>()) +
TensorOpCost::MulCost<Index>();
if (vectorized) {
compute_cost *= 2; // packet() computes two indices
}
const int innerDim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : (NumDims - 1);
return m_impl.costPerCoeff(vectorized && m_inputStrides[innerDim] == 1) +
// Computation is not vectorized per se, but it is done once per packet.
TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
}
EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
{
Index inputIndex = 0;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
EIGEN_UNROLL_LOOP
for (int i = NumDims - 1; i > 0; --i) {
const Index idx = index / m_outputStrides[i];
inputIndex += idx * m_inputStrides[i];
index -= idx * m_outputStrides[i];
}
inputIndex += index * m_inputStrides[0];
} else { // RowMajor
EIGEN_UNROLL_LOOP
for (int i = 0; i < NumDims - 1; ++i) {
const Index idx = index / m_outputStrides[i];
inputIndex += idx * m_inputStrides[i];
index -= idx * m_outputStrides[i];
}
inputIndex += index * m_inputStrides[NumDims-1];
}
return inputIndex;
}
Dimensions m_dimensions;
array<Index, NumDims> m_outputStrides;
array<Index, NumDims> m_inputStrides;
TensorEvaluator<ArgType, Device> m_impl;
};
// Eval as lvalue
template<typename Strides, typename ArgType, typename Device>
struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
: public TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
{
typedef TensorStridingOp<Strides, ArgType> XprType;
typedef TensorEvaluator<const XprType, Device> Base;
// typedef typename XprType::Index Index;
static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
// typedef DSizes<Index, NumDims> Dimensions;
enum {
IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
PreferBlockAccess = false,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false, // to be implemented
RawAccess = false
};
EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: Base(op, device) { }
typedef typename XprType::Index Index;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
{
return this->m_impl.coeffRef(this->srcCoeff(index));
}
template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void writePacket(Index index, const PacketReturnType& x)
{
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+PacketSize-1 < this->dimensions().TotalSize());
Index inputIndices[] = {0, 0};
Index indices[] = {index, index + PacketSize - 1};
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
EIGEN_UNROLL_LOOP
for (int i = NumDims - 1; i > 0; --i) {
const Index idx0 = indices[0] / this->m_outputStrides[i];
const Index idx1 = indices[1] / this->m_outputStrides[i];
inputIndices[0] += idx0 * this->m_inputStrides[i];
inputIndices[1] += idx1 * this->m_inputStrides[i];
indices[0] -= idx0 * this->m_outputStrides[i];
indices[1] -= idx1 * this->m_outputStrides[i];
}
inputIndices[0] += indices[0] * this->m_inputStrides[0];
inputIndices[1] += indices[1] * this->m_inputStrides[0];
} else { // RowMajor
EIGEN_UNROLL_LOOP
for (int i = 0; i < NumDims - 1; ++i) {
const Index idx0 = indices[0] / this->m_outputStrides[i];
const Index idx1 = indices[1] / this->m_outputStrides[i];
inputIndices[0] += idx0 * this->m_inputStrides[i];
inputIndices[1] += idx1 * this->m_inputStrides[i];
indices[0] -= idx0 * this->m_outputStrides[i];
indices[1] -= idx1 * this->m_outputStrides[i];
}
inputIndices[0] += indices[0] * this->m_inputStrides[NumDims-1];
inputIndices[1] += indices[1] * this->m_inputStrides[NumDims-1];
}
if (inputIndices[1] - inputIndices[0] == PacketSize - 1) {
this->m_impl.template writePacket<Unaligned>(inputIndices[0], x);
}
else {
EIGEN_ALIGN_MAX Scalar values[PacketSize];
internal::pstore<Scalar, PacketReturnType>(values, x);
this->m_impl.coeffRef(inputIndices[0]) = values[0];
this->m_impl.coeffRef(inputIndices[1]) = values[PacketSize-1];
EIGEN_UNROLL_LOOP
for (int i = 1; i < PacketSize-1; ++i) {
this->coeffRef(index+i) = values[i];
}
}
}
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H
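// A minimal sketch of the striding op above: output dimension i is
// ceil(input dimension i / stride i), as computed in the evaluator
// constructor. Illustrative only, assuming the CXX11 Tensor module API.
#include <unsupported/Eigen/CXX11/Tensor>

inline void stride_sketch() {
  Eigen::Tensor<float, 2> input(6, 7);
  input.setRandom();
  Eigen::array<Eigen::Index, 2> strides{{2, 3}};
  // Keeps every 2nd coefficient along dim 0 and every 3rd along dim 1.
  Eigen::Tensor<float, 2> output = input.stride(strides);
  // output.dimension(0) == 3 == ceil(6/2), output.dimension(1) == 3 == ceil(7/3).
  (void)output;
}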

View File

@@ -0,0 +1,303 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2017 Gagan Goel <gagan.nith@gmail.com>
// Copyright (C) 2017 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_TRACE_H
#define EIGEN_CXX11_TENSOR_TENSOR_TRACE_H
namespace Eigen {
/** \class TensorTrace
* \ingroup CXX11_Tensor_Module
*
* \brief Tensor Trace class.
*
*
*/
namespace internal {
template<typename Dims, typename XprType>
struct traits<TensorTraceOp<Dims, XprType> > : public traits<XprType>
{
typedef typename XprType::Scalar Scalar;
typedef traits<XprType> XprTraits;
typedef typename XprTraits::StorageKind StorageKind;
typedef typename XprTraits::Index Index;
typedef typename XprType::Nested Nested;
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
static const int Layout = XprTraits::Layout;
};
template<typename Dims, typename XprType>
struct eval<TensorTraceOp<Dims, XprType>, Eigen::Dense>
{
typedef const TensorTraceOp<Dims, XprType>& type;
};
template<typename Dims, typename XprType>
struct nested<TensorTraceOp<Dims, XprType>, 1, typename eval<TensorTraceOp<Dims, XprType> >::type>
{
typedef TensorTraceOp<Dims, XprType> type;
};
} // end namespace internal
template<typename Dims, typename XprType>
class TensorTraceOp : public TensorBase<TensorTraceOp<Dims, XprType> >
{
public:
typedef typename Eigen::internal::traits<TensorTraceOp>::Scalar Scalar;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename Eigen::internal::nested<TensorTraceOp>::type Nested;
typedef typename Eigen::internal::traits<TensorTraceOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorTraceOp>::Index Index;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTraceOp(const XprType& expr, const Dims& dims)
: m_xpr(expr), m_dims(dims) {
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const Dims& dims() const { return m_dims; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const typename internal::remove_all<typename XprType::Nested>::type& expression() const { return m_xpr; }
protected:
typename XprType::Nested m_xpr;
const Dims m_dims;
};
// Eval as rvalue
template<typename Dims, typename ArgType, typename Device>
struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
{
typedef TensorTraceOp<Dims, ArgType> XprType;
static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
static const int NumReducedDims = internal::array_size<Dims>::value;
static const int NumOutputDims = NumInputDims - NumReducedDims;
typedef typename XprType::Index Index;
typedef DSizes<Index, NumOutputDims> Dimensions;
typedef typename XprType::Scalar Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
typedef StorageMemory<CoeffReturnType, Device> Storage;
typedef typename Storage::Type EvaluatorPointerType;
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlock;
//===--------------------------------------------------------------------===//
EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
: m_impl(op.expression(), device), m_traceDim(1), m_device(device)
{
EIGEN_STATIC_ASSERT((NumOutputDims >= 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT((NumReducedDims >= 2) || ((NumReducedDims == 0) && (NumInputDims == 0)), YOU_MADE_A_PROGRAMMING_MISTAKE);
for (int i = 0; i < NumInputDims; ++i) {
m_reduced[i] = false;
}
const Dims& op_dims = op.dims();
for (int i = 0; i < NumReducedDims; ++i) {
eigen_assert(op_dims[i] >= 0);
eigen_assert(op_dims[i] < NumInputDims);
m_reduced[op_dims[i]] = true;
}
// All the dimensions should be distinct to compute the trace
int num_distinct_reduce_dims = 0;
for (int i = 0; i < NumInputDims; ++i) {
if (m_reduced[i]) {
++num_distinct_reduce_dims;
}
}
eigen_assert(num_distinct_reduce_dims == NumReducedDims);
// Compute the dimensions of the result.
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
int output_index = 0;
int reduced_index = 0;
for (int i = 0; i < NumInputDims; ++i) {
if (m_reduced[i]) {
m_reducedDims[reduced_index] = input_dims[i];
if (reduced_index > 0) {
// All the trace dimensions must have the same size
eigen_assert(m_reducedDims[0] == m_reducedDims[reduced_index]);
}
++reduced_index;
}
else {
m_dimensions[output_index] = input_dims[i];
++output_index;
}
}
if (NumReducedDims != 0) {
m_traceDim = m_reducedDims[0];
}
// Compute the output strides
if (NumOutputDims > 0) {
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
m_outputStrides[0] = 1;
for (int i = 1; i < NumOutputDims; ++i) {
m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
}
}
else {
m_outputStrides.back() = 1;
for (int i = NumOutputDims - 2; i >= 0; --i) {
m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
}
}
}
// Compute the input strides
if (NumInputDims > 0) {
array<Index, NumInputDims> input_strides;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
input_strides[0] = 1;
for (int i = 1; i < NumInputDims; ++i) {
input_strides[i] = input_strides[i - 1] * input_dims[i - 1];
}
}
else {
input_strides.back() = 1;
for (int i = NumInputDims - 2; i >= 0; --i) {
input_strides[i] = input_strides[i + 1] * input_dims[i + 1];
}
}
output_index = 0;
reduced_index = 0;
for (int i = 0; i < NumInputDims; ++i) {
if(m_reduced[i]) {
m_reducedStrides[reduced_index] = input_strides[i];
++reduced_index;
}
else {
m_preservedStrides[output_index] = input_strides[i];
++output_index;
}
}
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
return m_dimensions;
}
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
m_impl.evalSubExprsIfNeeded(NULL);
return true;
}
EIGEN_STRONG_INLINE void cleanup() {
m_impl.cleanup();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
{
// Initialize the result
CoeffReturnType result = internal::cast<int, CoeffReturnType>(0);
Index index_stride = 0;
for (int i = 0; i < NumReducedDims; ++i) {
index_stride += m_reducedStrides[i];
}
// If the trace is requested along all dimensions, the starting index is 0
Index cur_index = 0;
if (NumOutputDims != 0)
cur_index = firstInput(index);
for (Index i = 0; i < m_traceDim; ++i) {
result += m_impl.coeff(cur_index);
cur_index += index_stride;
}
return result;
}
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
for (int i = 0; i < PacketSize; ++i) {
values[i] = coeff(index + i);
}
PacketReturnType result = internal::ploadt<PacketReturnType, LoadMode>(values);
return result;
}
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif
protected:
// Given the output index, finds the first index in the input tensor used to compute the trace
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
Index startInput = 0;
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
for (int i = NumOutputDims - 1; i > 0; --i) {
const Index idx = index / m_outputStrides[i];
startInput += idx * m_preservedStrides[i];
index -= idx * m_outputStrides[i];
}
startInput += index * m_preservedStrides[0];
}
else {
for (int i = 0; i < NumOutputDims - 1; ++i) {
const Index idx = index / m_outputStrides[i];
startInput += idx * m_preservedStrides[i];
index -= idx * m_outputStrides[i];
}
startInput += index * m_preservedStrides[NumOutputDims - 1];
}
return startInput;
}
Dimensions m_dimensions;
TensorEvaluator<ArgType, Device> m_impl;
// Initialize the size of the trace dimension
Index m_traceDim;
const Device EIGEN_DEVICE_REF m_device;
array<bool, NumInputDims> m_reduced;
array<Index, NumReducedDims> m_reducedDims;
array<Index, NumOutputDims> m_outputStrides;
array<Index, NumReducedDims> m_reducedStrides;
array<Index, NumOutputDims> m_preservedStrides;
};
} // End namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_TRACE_H
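// A minimal sketch of the trace op above: the reduced dimensions must all
// have the same size, and each output coefficient sums the diagonal entries
// of those dimensions. Illustrative only, assuming the CXX11 Tensor module API.
#include <unsupported/Eigen/CXX11/Tensor>

inline void trace_sketch() {
  Eigen::Tensor<int, 3> t(3, 3, 4);
  t.setConstant(1);
  Eigen::array<Eigen::Index, 2> dims{{0, 1}};
  // Reduces dimensions 0 and 1: the result has shape (4), and each entry is
  // t(0,0,k) + t(1,1,k) + t(2,2,k) == 3 here.
  Eigen::Tensor<int, 1> tr = t.trace(dims);
  (void)tr;
}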

View File

@@ -0,0 +1,264 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H
#define EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H
namespace Eigen {
namespace internal {
template<typename Scalar, int Options>
class compute_tensor_flags
{
enum {
is_dynamic_size_storage = 1,
is_aligned =
(
((Options&DontAlign)==0) && (
#if EIGEN_MAX_STATIC_ALIGN_BYTES>0
(!is_dynamic_size_storage)
#else
0
#endif
|
#if EIGEN_MAX_ALIGN_BYTES>0
is_dynamic_size_storage
#else
0
#endif
)
),
packet_access_bit = packet_traits<Scalar>::Vectorizable && is_aligned ? PacketAccessBit : 0
};
public:
enum { ret = packet_access_bit };
};
template<typename Scalar_, int NumIndices_, int Options_, typename IndexType_>
struct traits<Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
{
typedef Scalar_ Scalar;
typedef Dense StorageKind;
typedef IndexType_ Index;
static const int NumDimensions = NumIndices_;
static const int Layout = Options_ & RowMajor ? RowMajor : ColMajor;
enum {
Options = Options_,
Flags = compute_tensor_flags<Scalar_, Options_>::ret | (is_const<Scalar_>::value ? 0 : LvalueBit)
};
template <typename T> struct MakePointer {
typedef T* Type;
};
typedef typename MakePointer<Scalar>::Type PointerType;
};
template<typename Scalar_, typename Dimensions, int Options_, typename IndexType_>
struct traits<TensorFixedSize<Scalar_, Dimensions, Options_, IndexType_> >
{
typedef Scalar_ Scalar;
typedef Dense StorageKind;
typedef IndexType_ Index;
static const int NumDimensions = array_size<Dimensions>::value;
static const int Layout = Options_ & RowMajor ? RowMajor : ColMajor;
enum {
Options = Options_,
Flags = compute_tensor_flags<Scalar_, Options_>::ret | (is_const<Scalar_>::value ? 0: LvalueBit)
};
template <typename T> struct MakePointer {
typedef T* Type;
};
typedef typename MakePointer<Scalar>::Type PointerType;
};
template<typename PlainObjectType, int Options_, template <class> class MakePointer_>
struct traits<TensorMap<PlainObjectType, Options_, MakePointer_> >
: public traits<PlainObjectType>
{
typedef traits<PlainObjectType> BaseTraits;
typedef typename BaseTraits::Scalar Scalar;
typedef typename BaseTraits::StorageKind StorageKind;
typedef typename BaseTraits::Index Index;
static const int NumDimensions = BaseTraits::NumDimensions;
static const int Layout = BaseTraits::Layout;
enum {
Options = Options_,
Flags = BaseTraits::Flags
};
template <class T> struct MakePointer {
// Intermediate typedef to workaround MSVC issue.
typedef MakePointer_<T> MakePointerT;
typedef typename MakePointerT::Type Type;
};
typedef typename MakePointer<Scalar>::Type PointerType;
};
template<typename PlainObjectType>
struct traits<TensorRef<PlainObjectType> >
: public traits<PlainObjectType>
{
typedef traits<PlainObjectType> BaseTraits;
typedef typename BaseTraits::Scalar Scalar;
typedef typename BaseTraits::StorageKind StorageKind;
typedef typename BaseTraits::Index Index;
static const int NumDimensions = BaseTraits::NumDimensions;
static const int Layout = BaseTraits::Layout;
enum {
Options = BaseTraits::Options,
Flags = BaseTraits::Flags
};
typedef typename BaseTraits::PointerType PointerType;
};
template<typename _Scalar, int NumIndices_, int Options, typename IndexType_>
struct eval<Tensor<_Scalar, NumIndices_, Options, IndexType_>, Eigen::Dense>
{
typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>EIGEN_DEVICE_REF type;
};
template<typename _Scalar, int NumIndices_, int Options, typename IndexType_>
struct eval<const Tensor<_Scalar, NumIndices_, Options, IndexType_>, Eigen::Dense>
{
typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>EIGEN_DEVICE_REF type;
};
template<typename Scalar_, typename Dimensions, int Options, typename IndexType_>
struct eval<TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>, Eigen::Dense>
{
typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>EIGEN_DEVICE_REF type;
};
template<typename Scalar_, typename Dimensions, int Options, typename IndexType_>
struct eval<const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>, Eigen::Dense>
{
typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>EIGEN_DEVICE_REF type;
};
template<typename PlainObjectType, int Options, template <class> class MakePointer>
struct eval<TensorMap<PlainObjectType, Options, MakePointer>, Eigen::Dense>
{
typedef const TensorMap<PlainObjectType, Options, MakePointer>EIGEN_DEVICE_REF type;
};
template<typename PlainObjectType, int Options, template <class> class MakePointer>
struct eval<const TensorMap<PlainObjectType, Options, MakePointer>, Eigen::Dense>
{
typedef const TensorMap<PlainObjectType, Options, MakePointer>EIGEN_DEVICE_REF type;
};
template<typename PlainObjectType>
struct eval<TensorRef<PlainObjectType>, Eigen::Dense>
{
typedef const TensorRef<PlainObjectType>EIGEN_DEVICE_REF type;
};
template<typename PlainObjectType>
struct eval<const TensorRef<PlainObjectType>, Eigen::Dense>
{
typedef const TensorRef<PlainObjectType>EIGEN_DEVICE_REF type;
};
// TODO nested<> does not exist anymore in Eigen/Core, and it thus has to be removed in favor of ref_selector.
template<typename T, int n=1, typename PlainObject = void> struct nested
{
typedef typename ref_selector<T>::type type;
};
template <typename Scalar_, int NumIndices_, int Options_, typename IndexType_>
struct nested<Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
{
typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_>EIGEN_DEVICE_REF type;
};
template <typename Scalar_, int NumIndices_, int Options_, typename IndexType_>
struct nested<const Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
{
typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_>EIGEN_DEVICE_REF type;
};
template <typename Scalar_, typename Dimensions, int Options, typename IndexType_>
struct nested<TensorFixedSize<Scalar_, Dimensions, Options, IndexType_> >
{
typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>EIGEN_DEVICE_REF type;
};
template <typename Scalar_, typename Dimensions, int Options, typename IndexType_>
struct nested<const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_> >
{
typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>EIGEN_DEVICE_REF type;
};
template <typename PlainObjectType>
struct nested<TensorRef<PlainObjectType> >
{
typedef const TensorRef<PlainObjectType>EIGEN_DEVICE_REF type;
};
template <typename PlainObjectType>
struct nested<const TensorRef<PlainObjectType> >
{
typedef const TensorRef<PlainObjectType>EIGEN_DEVICE_REF type;
};
} // end namespace internal
// Convolutional layers take in an input tensor of shape (D, R, C, B), or (D, C,
// R, B), and convolve it with a set of filters, which can also be presented as
// a tensor (D, K, K, M), where M is the number of filters, K is the filter
// size, and each 3-dimensional tensor of size (D, K, K) is a filter. For
// simplicity we assume that we always use square filters (which is usually the
// case in images), hence the two Ks in the tensor dimension. It also takes in
// a few additional parameters:
// Stride (S): The convolution stride is the offset between locations where we
// apply the filters. A larger stride means that the output will be
// spatially smaller.
// Padding (P): The padding we apply to the input tensor along the R and C
// dimensions. This is usually used to make sure that the spatial
// dimensions of the output matches our intention.
//
// Two types of padding are often used:
// SAME: The pad value is computed so that the output will have size
// R/S and C/S.
// VALID: no padding is carried out.
// When we do padding, the padded values at the padded locations are usually
// zero.
//
// The output dimensions for convolution, when given all the parameters above,
// are as follows:
// When Padding = SAME: the output size is (B, R', C', M), where
// R' = ceil(float(R) / float(S))
// C' = ceil(float(C) / float(S))
// where ceil is the ceiling function. The input tensor is padded with 0 as
// needed. The numbers of padded rows and columns are computed as:
// Pr = ((R' - 1) * S + K - R) / 2
// Pc = ((C' - 1) * S + K - C) / 2
// When the stride is 1, we have the simplified case R'=R, C'=C, Pr=Pc=(K-1)/2.
// This is where the name SAME comes from: the output has the same size as the input.
// When Padding = VALID: the output size is computed as
// R' = ceil(float(R - K + 1) / float(S))
// C' = ceil(float(C - K + 1) / float(S))
// and the number of padded rows and columns are computed in the same way as in
// the SAME case.
// When the stride is 1, we have the simplified case R'=R-K+1, C'=C-K+1, Pr=0,
// Pc=0.
typedef enum {
PADDING_VALID = 1,
PADDING_SAME = 2
} PaddingType;
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H
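// A worked sketch of the SAME/VALID output-size and padding formulas from the
// convolution comment above, for one spatial dimension of size R, a square
// K x K filter and stride S. The helper names are illustrative only.
#include <cmath>

struct ConvDim { int output; int pad; };

// SAME: output is ceil(R / S); per-side padding is ((R' - 1) * S + K - R) / 2.
inline ConvDim conv_same(int R, int K, int S) {
  const int Rp = static_cast<int>(std::ceil(static_cast<float>(R) / S));
  return { Rp, ((Rp - 1) * S + K - R) / 2 };
}

// VALID: no padding; output is ceil((R - K + 1) / S).
inline ConvDim conv_valid(int R, int K, int S) {
  const int Rp = static_cast<int>(std::ceil(static_cast<float>(R - K + 1) / S));
  return { Rp, 0 };
}

// Example: R = 5, K = 3, S = 1 gives SAME -> {5, 1} and VALID -> {3, 0}.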

View File

@@ -0,0 +1,249 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_UINT128_H
#define EIGEN_CXX11_TENSOR_TENSOR_UINT128_H
namespace Eigen {
namespace internal {
template <uint64_t n>
struct static_val {
static const uint64_t value = n;
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE operator uint64_t() const { return n; }
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val() { }
template <typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val(const T& v) {
EIGEN_UNUSED_VARIABLE(v);
eigen_assert(v == n);
}
};
template <typename HIGH = uint64_t, typename LOW = uint64_t>
struct TensorUInt128
{
HIGH high;
LOW low;
template<typename OTHER_HIGH, typename OTHER_LOW>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
TensorUInt128(const TensorUInt128<OTHER_HIGH, OTHER_LOW>& other) : high(other.high), low(other.low) {
EIGEN_STATIC_ASSERT(sizeof(OTHER_HIGH) <= sizeof(HIGH), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT(sizeof(OTHER_LOW) <= sizeof(LOW), YOU_MADE_A_PROGRAMMING_MISTAKE);
}
template<typename OTHER_HIGH, typename OTHER_LOW>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
TensorUInt128& operator = (const TensorUInt128<OTHER_HIGH, OTHER_LOW>& other) {
EIGEN_STATIC_ASSERT(sizeof(OTHER_HIGH) <= sizeof(HIGH), YOU_MADE_A_PROGRAMMING_MISTAKE);
EIGEN_STATIC_ASSERT(sizeof(OTHER_LOW) <= sizeof(LOW), YOU_MADE_A_PROGRAMMING_MISTAKE);
high = other.high;
low = other.low;
return *this;
}
template<typename T>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
explicit TensorUInt128(const T& x) : high(0), low(x) {
eigen_assert((static_cast<typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type>(x) <= NumTraits<uint64_t>::highest()));
eigen_assert(x >= 0);
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
TensorUInt128(HIGH y, LOW x) : high(y), low(x) { }
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE operator LOW() const {
return low;
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LOW lower() const {
return low;
}
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HIGH upper() const {
return high;
}
};
template <typename HL, typename LL, typename HR, typename LR>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
bool operator == (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
{
return (lhs.high == rhs.high) & (lhs.low == rhs.low);
}
template <typename HL, typename LL, typename HR, typename LR>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
bool operator != (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
{
return (lhs.high != rhs.high) | (lhs.low != rhs.low);
}
template <typename HL, typename LL, typename HR, typename LR>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
bool operator >= (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
{
if (lhs.high != rhs.high) {
return lhs.high > rhs.high;
}
return lhs.low >= rhs.low;
}
template <typename HL, typename LL, typename HR, typename LR>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
bool operator < (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
{
if (lhs.high != rhs.high) {
return lhs.high < rhs.high;
}
return lhs.low < rhs.low;
}
template <typename HL, typename LL, typename HR, typename LR>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
TensorUInt128<uint64_t, uint64_t> operator + (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
{
TensorUInt128<uint64_t, uint64_t> result(lhs.high + rhs.high, lhs.low + rhs.low);
if (result.low < rhs.low) {
result.high += 1;
}
return result;
}
template <typename HL, typename LL, typename HR, typename LR>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
TensorUInt128<uint64_t, uint64_t> operator - (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
{
TensorUInt128<uint64_t, uint64_t> result(lhs.high - rhs.high, lhs.low - rhs.low);
if (result.low > lhs.low) {
result.high -= 1;
}
return result;
}
template <typename HL, typename LL, typename HR, typename LR>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
TensorUInt128<uint64_t, uint64_t> operator * (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
{
// Split each 128-bit integer into four 32-bit integers, and then do the
// multiplications by hand as follows:
// lhs a b c d
// rhs e f g h
// -----------
// ah bh ch dh
// bg cg dg
// cf df
// de
// The result is stored in two 64-bit integers, high and low.
const uint64_t LOW = 0x00000000FFFFFFFFLL;
const uint64_t HIGH = 0xFFFFFFFF00000000LL;
uint64_t d = lhs.low & LOW;
uint64_t c = (lhs.low & HIGH) >> 32LL;
uint64_t b = lhs.high & LOW;
uint64_t a = (lhs.high & HIGH) >> 32LL;
uint64_t h = rhs.low & LOW;
uint64_t g = (rhs.low & HIGH) >> 32LL;
uint64_t f = rhs.high & LOW;
uint64_t e = (rhs.high & HIGH) >> 32LL;
// Compute the low 32 bits of low
uint64_t acc = d * h;
uint64_t low = acc & LOW;
// Compute the high 32 bits of low. Add a carry every time we wrap around
acc >>= 32LL;
uint64_t carry = 0;
uint64_t acc2 = acc + c * h;
if (acc2 < acc) {
carry++;
}
acc = acc2 + d * g;
if (acc < acc2) {
carry++;
}
low |= (acc << 32LL);
// Carry forward the high bits of acc to initiate the computation of the
// low 32 bits of high
acc2 = (acc >> 32LL) | (carry << 32LL);
carry = 0;
acc = acc2 + b * h;
if (acc < acc2) {
carry++;
}
acc2 = acc + c * g;
if (acc2 < acc) {
carry++;
}
acc = acc2 + d * f;
if (acc < acc2) {
carry++;
}
uint64_t high = acc & LOW;
// Start to compute the high 32 bits of high.
acc2 = (acc >> 32LL) | (carry << 32LL);
acc = acc2 + a * h;
acc2 = acc + b * g;
acc = acc2 + c * f;
acc2 = acc + d * e;
high |= (acc2 << 32LL);
return TensorUInt128<uint64_t, uint64_t>(high, low);
}
template <typename HL, typename LL, typename HR, typename LR>
static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
TensorUInt128<uint64_t, uint64_t> operator / (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
{
if (rhs == TensorUInt128<static_val<0>, static_val<1> >(1)) {
return TensorUInt128<uint64_t, uint64_t>(lhs.high, lhs.low);
} else if (lhs < rhs) {
return TensorUInt128<uint64_t, uint64_t>(0);
} else {
// calculate the biggest power of 2 times rhs that's less than or equal to lhs
TensorUInt128<uint64_t, uint64_t> power2(1);
TensorUInt128<uint64_t, uint64_t> d(rhs);
TensorUInt128<uint64_t, uint64_t> tmp(lhs - d);
while (lhs >= d) {
tmp = tmp - d;
d = d + d;
power2 = power2 + power2;
}
tmp = TensorUInt128<uint64_t, uint64_t>(lhs.high, lhs.low);
TensorUInt128<uint64_t, uint64_t> result(0);
while (power2 != TensorUInt128<static_val<0>, static_val<0> >(0)) {
if (tmp >= d) {
tmp = tmp - d;
result = result + power2;
}
// Shift right
power2 = TensorUInt128<uint64_t, uint64_t>(power2.high >> 1, (power2.low >> 1) | (power2.high << 63));
d = TensorUInt128<uint64_t, uint64_t>(d.high >> 1, (d.low >> 1) | (d.high << 63));
}
return result;
}
}
} // namespace internal
} // namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_UINT128_H
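// A standalone sketch of the 32-bit limb decomposition used by operator*
// above, here reduced to a 64x64 -> 128 bit multiply returning (high, low).
// The helper name is illustrative only.
#include <cstdint>
#include <utility>

inline std::pair<uint64_t, uint64_t> mul_64_to_128_sketch(uint64_t x, uint64_t y) {
  const uint64_t mask = 0xFFFFFFFFu;
  const uint64_t xl = x & mask, xh = x >> 32;
  const uint64_t yl = y & mask, yh = y >> 32;
  // Partial products, as in the "a b c d / e f g h" diagram above.
  const uint64_t ll = xl * yl, lh = xl * yh, hl = xh * yl, hh = xh * yh;
  // Accumulate the middle terms; mid cannot overflow 64 bits.
  const uint64_t mid = (ll >> 32) + (lh & mask) + (hl & mask);
  const uint64_t low = (mid << 32) | (ll & mask);
  const uint64_t high = hh + (lh >> 32) + (hl >> 32) + (mid >> 32);
  return {high, low};
}
// e.g. mul_64_to_128_sketch(0xFFFFFFFFFFFFFFFFull, 2) == {1, 0xFFFFFFFFFFFFFFFEull}.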

View File

@@ -0,0 +1,629 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
#ifndef EIGEN_CXX11_TENSOR_TENSOR_VOLUME_PATCH_H
#define EIGEN_CXX11_TENSOR_TENSOR_VOLUME_PATCH_H
namespace Eigen {
/** \class TensorVolumePatch
* \ingroup CXX11_Tensor_Module
*
* \brief Patch extraction specialized for processing of volumetric data.
 * This assumes that the input has at least 4 dimensions ordered as follows:
* - channels
* - planes
* - rows
* - columns
* - (optional) additional dimensions such as time or batch size.
* Calling the volume patch code with patch_planes, patch_rows, and patch_cols
* is equivalent to calling the regular patch extraction code with parameters
* d, patch_planes, patch_rows, patch_cols, and 1 for all the additional
* dimensions.
*/
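// Usage sketch, assuming the extract_volume_patches() helper that TensorBase
// exposes for this op:
//
//   Eigen::Tensor<float, 5> input(depth, planes, rows, cols, batch);
//   // 2x3x3 patches, unit strides, SAME padding; the result is rank 6 with
//   // dimensions (depth, 2, 3, 3, #patches, batch) in ColMajor layout.
//   auto patches = input.extract_volume_patches(2, 3, 3);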
namespace internal {
template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>
struct traits<TensorVolumePatchOp<Planes, Rows, Cols, XprType> > : public traits<XprType>
{
typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
typedef traits<XprType> XprTraits;
typedef typename XprTraits::StorageKind StorageKind;
typedef typename XprTraits::Index Index;
typedef typename XprType::Nested Nested;
typedef typename remove_reference<Nested>::type _Nested;
static const int NumDimensions = XprTraits::NumDimensions + 1;
static const int Layout = XprTraits::Layout;
typedef typename XprTraits::PointerType PointerType;
};
template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>
struct eval<TensorVolumePatchOp<Planes, Rows, Cols, XprType>, Eigen::Dense>
{
typedef const TensorVolumePatchOp<Planes, Rows, Cols, XprType>& type;
};
template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>
struct nested<TensorVolumePatchOp<Planes, Rows, Cols, XprType>, 1, typename eval<TensorVolumePatchOp<Planes, Rows, Cols, XprType> >::type>
{
typedef TensorVolumePatchOp<Planes, Rows, Cols, XprType> type;
};
} // end namespace internal
template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>
class TensorVolumePatchOp : public TensorBase<TensorVolumePatchOp<Planes, Rows, Cols, XprType>, ReadOnlyAccessors>
{
public:
typedef typename Eigen::internal::traits<TensorVolumePatchOp>::Scalar Scalar;
typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename Eigen::internal::nested<TensorVolumePatchOp>::type Nested;
typedef typename Eigen::internal::traits<TensorVolumePatchOp>::StorageKind StorageKind;
typedef typename Eigen::internal::traits<TensorVolumePatchOp>::Index Index;
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorVolumePatchOp(const XprType& expr, DenseIndex patch_planes, DenseIndex patch_rows, DenseIndex patch_cols,
DenseIndex plane_strides, DenseIndex row_strides, DenseIndex col_strides,
DenseIndex in_plane_strides, DenseIndex in_row_strides, DenseIndex in_col_strides,
DenseIndex plane_inflate_strides, DenseIndex row_inflate_strides, DenseIndex col_inflate_strides,
PaddingType padding_type, Scalar padding_value)
: m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides),
m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
m_padding_explicit(false), m_padding_top_z(0), m_padding_bottom_z(0), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0),
m_padding_type(padding_type), m_padding_value(padding_value) {}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorVolumePatchOp(const XprType& expr, DenseIndex patch_planes, DenseIndex patch_rows, DenseIndex patch_cols,
DenseIndex plane_strides, DenseIndex row_strides, DenseIndex col_strides,
DenseIndex in_plane_strides, DenseIndex in_row_strides, DenseIndex in_col_strides,
DenseIndex plane_inflate_strides, DenseIndex row_inflate_strides, DenseIndex col_inflate_strides,
DenseIndex padding_top_z, DenseIndex padding_bottom_z,
DenseIndex padding_top, DenseIndex padding_bottom,
DenseIndex padding_left, DenseIndex padding_right,
Scalar padding_value)
: m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides),
m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
m_padding_explicit(true), m_padding_top_z(padding_top_z), m_padding_bottom_z(padding_bottom_z), m_padding_top(padding_top), m_padding_bottom(padding_bottom),
m_padding_left(padding_left), m_padding_right(padding_right),
m_padding_type(PADDING_VALID), m_padding_value(padding_value) {}
EIGEN_DEVICE_FUNC
DenseIndex patch_planes() const { return m_patch_planes; }
EIGEN_DEVICE_FUNC
DenseIndex patch_rows() const { return m_patch_rows; }
EIGEN_DEVICE_FUNC
DenseIndex patch_cols() const { return m_patch_cols; }
EIGEN_DEVICE_FUNC
DenseIndex plane_strides() const { return m_plane_strides; }
EIGEN_DEVICE_FUNC
DenseIndex row_strides() const { return m_row_strides; }
EIGEN_DEVICE_FUNC
DenseIndex col_strides() const { return m_col_strides; }
EIGEN_DEVICE_FUNC
DenseIndex in_plane_strides() const { return m_in_plane_strides; }
EIGEN_DEVICE_FUNC
DenseIndex in_row_strides() const { return m_in_row_strides; }
EIGEN_DEVICE_FUNC
DenseIndex in_col_strides() const { return m_in_col_strides; }
EIGEN_DEVICE_FUNC
DenseIndex plane_inflate_strides() const { return m_plane_inflate_strides; }
EIGEN_DEVICE_FUNC
DenseIndex row_inflate_strides() const { return m_row_inflate_strides; }
EIGEN_DEVICE_FUNC
DenseIndex col_inflate_strides() const { return m_col_inflate_strides; }
EIGEN_DEVICE_FUNC
bool padding_explicit() const { return m_padding_explicit; }
EIGEN_DEVICE_FUNC
DenseIndex padding_top_z() const { return m_padding_top_z; }
EIGEN_DEVICE_FUNC
DenseIndex padding_bottom_z() const { return m_padding_bottom_z; }
EIGEN_DEVICE_FUNC
DenseIndex padding_top() const { return m_padding_top; }
EIGEN_DEVICE_FUNC
DenseIndex padding_bottom() const { return m_padding_bottom; }
EIGEN_DEVICE_FUNC
DenseIndex padding_left() const { return m_padding_left; }
EIGEN_DEVICE_FUNC
DenseIndex padding_right() const { return m_padding_right; }
EIGEN_DEVICE_FUNC
PaddingType padding_type() const { return m_padding_type; }
EIGEN_DEVICE_FUNC
Scalar padding_value() const { return m_padding_value; }
EIGEN_DEVICE_FUNC
const typename internal::remove_all<typename XprType::Nested>::type&
expression() const { return m_xpr; }
protected:
typename XprType::Nested m_xpr;
const DenseIndex m_patch_planes;
const DenseIndex m_patch_rows;
const DenseIndex m_patch_cols;
const DenseIndex m_plane_strides;
const DenseIndex m_row_strides;
const DenseIndex m_col_strides;
const DenseIndex m_in_plane_strides;
const DenseIndex m_in_row_strides;
const DenseIndex m_in_col_strides;
const DenseIndex m_plane_inflate_strides;
const DenseIndex m_row_inflate_strides;
const DenseIndex m_col_inflate_strides;
const bool m_padding_explicit;
const DenseIndex m_padding_top_z;
const DenseIndex m_padding_bottom_z;
const DenseIndex m_padding_top;
const DenseIndex m_padding_bottom;
const DenseIndex m_padding_left;
const DenseIndex m_padding_right;
const PaddingType m_padding_type;
const Scalar m_padding_value;
};
// Eval as rvalue
template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename ArgType, typename Device>
struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, Device>
{
typedef TensorVolumePatchOp<Planes, Rows, Cols, ArgType> XprType;
typedef typename XprType::Index Index;
static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
static const int NumDims = NumInputDims + 1;
typedef DSizes<Index, NumDims> Dimensions;
typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
typedef typename XprType::CoeffReturnType CoeffReturnType;
typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
typedef StorageMemory<CoeffReturnType, Device> Storage;
typedef typename Storage::Type EvaluatorPointerType;
enum {
IsAligned = false,
PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
BlockAccess = false,
PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
Layout = TensorEvaluator<ArgType, Device>::Layout,
CoordAccess = false,
RawAccess = false
};
//===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
typedef internal::TensorBlockNotImplemented TensorBlock;
//===--------------------------------------------------------------------===//
EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) :
m_impl(op.expression(), device)
{
EIGEN_STATIC_ASSERT((NumDims >= 5), YOU_MADE_A_PROGRAMMING_MISTAKE);
m_paddingValue = op.padding_value();
const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
// Cache a few variables.
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
m_inputDepth = input_dims[0];
m_inputPlanes = input_dims[1];
m_inputRows = input_dims[2];
m_inputCols = input_dims[3];
} else {
m_inputDepth = input_dims[NumInputDims-1];
m_inputPlanes = input_dims[NumInputDims-2];
m_inputRows = input_dims[NumInputDims-3];
m_inputCols = input_dims[NumInputDims-4];
}
m_plane_strides = op.plane_strides();
m_row_strides = op.row_strides();
m_col_strides = op.col_strides();
// Input strides and effective input/patch size
m_in_plane_strides = op.in_plane_strides();
m_in_row_strides = op.in_row_strides();
m_in_col_strides = op.in_col_strides();
m_plane_inflate_strides = op.plane_inflate_strides();
m_row_inflate_strides = op.row_inflate_strides();
m_col_inflate_strides = op.col_inflate_strides();
// The "effective" spatial size after inflating data with zeros.
m_input_planes_eff = (m_inputPlanes - 1) * m_plane_inflate_strides + 1;
m_input_rows_eff = (m_inputRows - 1) * m_row_inflate_strides + 1;
m_input_cols_eff = (m_inputCols - 1) * m_col_inflate_strides + 1;
m_patch_planes_eff = op.patch_planes() + (op.patch_planes() - 1) * (m_in_plane_strides - 1);
m_patch_rows_eff = op.patch_rows() + (op.patch_rows() - 1) * (m_in_row_strides - 1);
m_patch_cols_eff = op.patch_cols() + (op.patch_cols() - 1) * (m_in_col_strides - 1);
if (op.padding_explicit()) {
m_outputPlanes = numext::ceil((m_input_planes_eff + op.padding_top_z() + op.padding_bottom_z() - m_patch_planes_eff + 1.f) / static_cast<float>(m_plane_strides));
m_outputRows = numext::ceil((m_input_rows_eff + op.padding_top() + op.padding_bottom() - m_patch_rows_eff + 1.f) / static_cast<float>(m_row_strides));
m_outputCols = numext::ceil((m_input_cols_eff + op.padding_left() + op.padding_right() - m_patch_cols_eff + 1.f) / static_cast<float>(m_col_strides));
m_planePaddingTop = op.padding_top_z();
m_rowPaddingTop = op.padding_top();
m_colPaddingLeft = op.padding_left();
} else {
// Computing padding from the type
switch (op.padding_type()) {
case PADDING_VALID:
m_outputPlanes = numext::ceil((m_input_planes_eff - m_patch_planes_eff + 1.f) / static_cast<float>(m_plane_strides));
m_outputRows = numext::ceil((m_input_rows_eff - m_patch_rows_eff + 1.f) / static_cast<float>(m_row_strides));
m_outputCols = numext::ceil((m_input_cols_eff - m_patch_cols_eff + 1.f) / static_cast<float>(m_col_strides));
m_planePaddingTop = 0;
m_rowPaddingTop = 0;
m_colPaddingLeft = 0;
break;
case PADDING_SAME: {
m_outputPlanes = numext::ceil(m_input_planes_eff / static_cast<float>(m_plane_strides));
m_outputRows = numext::ceil(m_input_rows_eff / static_cast<float>(m_row_strides));
m_outputCols = numext::ceil(m_input_cols_eff / static_cast<float>(m_col_strides));
const Index dz = (m_outputPlanes - 1) * m_plane_strides + m_patch_planes_eff - m_input_planes_eff;
const Index dy = (m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff;
const Index dx = (m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff;
m_planePaddingTop = dz / 2;
m_rowPaddingTop = dy / 2;
m_colPaddingLeft = dx / 2;
break;
}
default:
eigen_assert(false && "unexpected padding");
}
}
eigen_assert(m_outputRows > 0);
eigen_assert(m_outputCols > 0);
eigen_assert(m_outputPlanes > 0);
// Dimensions for result of extraction.
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
// ColMajor
// 0: depth
// 1: patch_planes
// 2: patch_rows
// 3: patch_cols
// 4: number of patches
// 5 and beyond: anything else (such as batch).
m_dimensions[0] = input_dims[0];
m_dimensions[1] = op.patch_planes();
m_dimensions[2] = op.patch_rows();
m_dimensions[3] = op.patch_cols();
m_dimensions[4] = m_outputPlanes * m_outputRows * m_outputCols;
for (int i = 5; i < NumDims; ++i) {
m_dimensions[i] = input_dims[i-1];
}
} else {
// RowMajor
// NumDims-1: depth
// NumDims-2: patch_planes
// NumDims-3: patch_rows
// NumDims-4: patch_cols
// NumDims-5: number of patches
// NumDims-6 and beyond: anything else (such as batch).
m_dimensions[NumDims-1] = input_dims[NumInputDims-1];
m_dimensions[NumDims-2] = op.patch_planes();
m_dimensions[NumDims-3] = op.patch_rows();
m_dimensions[NumDims-4] = op.patch_cols();
m_dimensions[NumDims-5] = m_outputPlanes * m_outputRows * m_outputCols;
for (int i = NumDims-6; i >= 0; --i) {
m_dimensions[i] = input_dims[i];
}
}
// Strides for the output tensor.
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
m_rowStride = m_dimensions[1];
m_colStride = m_dimensions[2] * m_rowStride;
m_patchStride = m_colStride * m_dimensions[3] * m_dimensions[0];
m_otherStride = m_patchStride * m_dimensions[4];
} else {
m_rowStride = m_dimensions[NumDims-2];
m_colStride = m_dimensions[NumDims-3] * m_rowStride;
m_patchStride = m_colStride * m_dimensions[NumDims-4] * m_dimensions[NumDims-1];
m_otherStride = m_patchStride * m_dimensions[NumDims-5];
}
// Strides for navigating through the input tensor.
m_planeInputStride = m_inputDepth;
m_rowInputStride = m_inputDepth * m_inputPlanes;
m_colInputStride = m_inputDepth * m_inputRows * m_inputPlanes;
m_otherInputStride = m_inputDepth * m_inputRows * m_inputCols * m_inputPlanes;
m_outputPlanesRows = m_outputPlanes * m_outputRows;
// Fast representations of different variables.
m_fastOtherStride = internal::TensorIntDivisor<Index>(m_otherStride);
m_fastPatchStride = internal::TensorIntDivisor<Index>(m_patchStride);
m_fastColStride = internal::TensorIntDivisor<Index>(m_colStride);
m_fastRowStride = internal::TensorIntDivisor<Index>(m_rowStride);
m_fastInputRowStride = internal::TensorIntDivisor<Index>(m_row_inflate_strides);
m_fastInputColStride = internal::TensorIntDivisor<Index>(m_col_inflate_strides);
m_fastInputPlaneStride = internal::TensorIntDivisor<Index>(m_plane_inflate_strides);
m_fastInputColsEff = internal::TensorIntDivisor<Index>(m_input_cols_eff);
m_fastOutputPlanes = internal::TensorIntDivisor<Index>(m_outputPlanes);
m_fastOutputPlanesRows = internal::TensorIntDivisor<Index>(m_outputPlanesRows);
if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[0]);
} else {
m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[NumDims-1]);
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
m_impl.evalSubExprsIfNeeded(NULL);
return true;
}
EIGEN_STRONG_INLINE void cleanup() {
m_impl.cleanup();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
{
// Patch index corresponding to the passed in index.
const Index patchIndex = index / m_fastPatchStride;
// Spatial offset within the patch. This has to be translated into 3D
// coordinates within the patch.
const Index patchOffset = (index - patchIndex * m_patchStride) / m_fastOutputDepth;
// Batch, etc.
const Index otherIndex = (NumDims == 5) ? 0 : index / m_fastOtherStride;
const Index patch3DIndex = (NumDims == 5) ? patchIndex : (index - otherIndex * m_otherStride) / m_fastPatchStride;
// Calculate the column index in the original input tensor.
const Index colIndex = patch3DIndex / m_fastOutputPlanesRows;
const Index colOffset = patchOffset / m_fastColStride;
const Index inputCol = colIndex * m_col_strides + colOffset * m_in_col_strides - m_colPaddingLeft;
const Index origInputCol = (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0);
if (inputCol < 0 || inputCol >= m_input_cols_eff ||
((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides))) {
return Scalar(m_paddingValue);
}
// Calculate row index in the original input tensor.
const Index rowIndex = (patch3DIndex - colIndex * m_outputPlanesRows) / m_fastOutputPlanes;
const Index rowOffset = (patchOffset - colOffset * m_colStride) / m_fastRowStride;
const Index inputRow = rowIndex * m_row_strides + rowOffset * m_in_row_strides - m_rowPaddingTop;
const Index origInputRow = (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0);
if (inputRow < 0 || inputRow >= m_input_rows_eff ||
((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) {
return Scalar(m_paddingValue);
}
// Calculate plane index in the original input tensor.
const Index planeIndex = (patch3DIndex - m_outputPlanes * (colIndex * m_outputRows + rowIndex));
const Index planeOffset = patchOffset - colOffset * m_colStride - rowOffset * m_rowStride;
const Index inputPlane = planeIndex * m_plane_strides + planeOffset * m_in_plane_strides - m_planePaddingTop;
const Index origInputPlane = (m_plane_inflate_strides == 1) ? inputPlane : ((inputPlane >= 0) ? (inputPlane / m_fastInputPlaneStride) : 0);
if (inputPlane < 0 || inputPlane >= m_input_planes_eff ||
((m_plane_inflate_strides != 1) && (inputPlane != origInputPlane * m_plane_inflate_strides))) {
return Scalar(m_paddingValue);
}
const int depth_index = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - 1;
const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index];
const Index inputIndex = depth +
origInputRow * m_rowInputStride +
origInputCol * m_colInputStride +
origInputPlane * m_planeInputStride +
otherIndex * m_otherInputStride;
return m_impl.coeff(inputIndex);
}
template<int LoadMode>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
{
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1 ||
m_in_plane_strides != 1 || m_plane_inflate_strides != 1) {
return packetWithPossibleZero(index);
}
const Index indices[2] = {index, index + PacketSize - 1};
const Index patchIndex = indices[0] / m_fastPatchStride;
if (patchIndex != indices[1] / m_fastPatchStride) {
return packetWithPossibleZero(index);
}
const Index otherIndex = (NumDims == 5) ? 0 : indices[0] / m_fastOtherStride;
eigen_assert(otherIndex == indices[1] / m_fastOtherStride);
// Find the offset of the element wrt the location of the first element.
const Index patchOffsets[2] = {(indices[0] - patchIndex * m_patchStride) / m_fastOutputDepth,
(indices[1] - patchIndex * m_patchStride) / m_fastOutputDepth};
const Index patch3DIndex = (NumDims == 5) ? patchIndex : (indices[0] - otherIndex * m_otherStride) / m_fastPatchStride;
eigen_assert(patch3DIndex == (indices[1] - otherIndex * m_otherStride) / m_fastPatchStride);
const Index colIndex = patch3DIndex / m_fastOutputPlanesRows;
const Index colOffsets[2] = {
patchOffsets[0] / m_fastColStride,
patchOffsets[1] / m_fastColStride};
// Calculate col indices in the original input tensor.
const Index inputCols[2] = {
colIndex * m_col_strides + colOffsets[0] - m_colPaddingLeft,
colIndex * m_col_strides + colOffsets[1] - m_colPaddingLeft};
if (inputCols[1] < 0 || inputCols[0] >= m_inputCols) {
return internal::pset1<PacketReturnType>(Scalar(m_paddingValue));
}
if (inputCols[0] != inputCols[1]) {
return packetWithPossibleZero(index);
}
const Index rowIndex = (patch3DIndex - colIndex * m_outputPlanesRows) / m_fastOutputPlanes;
const Index rowOffsets[2] = {
(patchOffsets[0] - colOffsets[0] * m_colStride) / m_fastRowStride,
(patchOffsets[1] - colOffsets[1] * m_colStride) / m_fastRowStride};
eigen_assert(rowOffsets[0] <= rowOffsets[1]);
// Calculate row indices in the original input tensor.
const Index inputRows[2] = {
rowIndex * m_row_strides + rowOffsets[0] - m_rowPaddingTop,
rowIndex * m_row_strides + rowOffsets[1] - m_rowPaddingTop};
if (inputRows[1] < 0 || inputRows[0] >= m_inputRows) {
return internal::pset1<PacketReturnType>(Scalar(m_paddingValue));
}
if (inputRows[0] != inputRows[1]) {
return packetWithPossibleZero(index);
}
const Index planeIndex = (patch3DIndex - m_outputPlanes * (colIndex * m_outputRows + rowIndex));
const Index planeOffsets[2] = {
patchOffsets[0] - colOffsets[0] * m_colStride - rowOffsets[0] * m_rowStride,
patchOffsets[1] - colOffsets[1] * m_colStride - rowOffsets[1] * m_rowStride};
eigen_assert(planeOffsets[0] <= planeOffsets[1]);
const Index inputPlanes[2] = {
planeIndex * m_plane_strides + planeOffsets[0] - m_planePaddingTop,
planeIndex * m_plane_strides + planeOffsets[1] - m_planePaddingTop};
if (inputPlanes[1] < 0 || inputPlanes[0] >= m_inputPlanes) {
return internal::pset1<PacketReturnType>(Scalar(m_paddingValue));
}
if (inputPlanes[0] >= 0 && inputPlanes[1] < m_inputPlanes) {
// no padding
const int depth_index = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - 1;
const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index];
const Index inputIndex = depth +
inputRows[0] * m_rowInputStride +
inputCols[0] * m_colInputStride +
m_planeInputStride * inputPlanes[0] +
otherIndex * m_otherInputStride;
return m_impl.template packet<Unaligned>(inputIndex);
}
return packetWithPossibleZero(index);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
costPerCoeff(bool vectorized) const {
const double compute_cost =
10 * TensorOpCost::DivCost<Index>() + 21 * TensorOpCost::MulCost<Index>() +
8 * TensorOpCost::AddCost<Index>();
return TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
}
EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index planePaddingTop() const { return m_planePaddingTop; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowPaddingTop() const { return m_rowPaddingTop; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colPaddingLeft() const { return m_colPaddingLeft; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputPlanes() const { return m_outputPlanes; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputRows() const { return m_outputRows; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputCols() const { return m_outputCols; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userPlaneStride() const { return m_plane_strides; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userRowStride() const { return m_row_strides; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userColStride() const { return m_col_strides; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInPlaneStride() const { return m_in_plane_strides; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInRowStride() const { return m_in_row_strides; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInColStride() const { return m_in_col_strides; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index planeInflateStride() const { return m_plane_inflate_strides; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowInflateStride() const { return m_row_inflate_strides; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colInflateStride() const { return m_col_inflate_strides; }
#ifdef EIGEN_USE_SYCL
// binding placeholder accessors to a command group handler for SYCL
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
m_impl.bind(cgh);
}
#endif
protected:
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
{
EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
EIGEN_UNROLL_LOOP
for (int i = 0; i < PacketSize; ++i) {
values[i] = coeff(index+i);
}
PacketReturnType rslt = internal::pload<PacketReturnType>(values);
return rslt;
}
Dimensions m_dimensions;
// Parameters passed to the constructor.
Index m_plane_strides;
Index m_row_strides;
Index m_col_strides;
Index m_outputPlanes;
Index m_outputRows;
Index m_outputCols;
Index m_planePaddingTop;
Index m_rowPaddingTop;
Index m_colPaddingLeft;
Index m_in_plane_strides;
Index m_in_row_strides;
Index m_in_col_strides;
Index m_plane_inflate_strides;
Index m_row_inflate_strides;
Index m_col_inflate_strides;
// Cached input size.
Index m_inputDepth;
Index m_inputPlanes;
Index m_inputRows;
Index m_inputCols;
// Other cached variables.
Index m_outputPlanesRows;
// Effective input/patch post-inflation size.
Index m_input_planes_eff;
Index m_input_rows_eff;
Index m_input_cols_eff;
Index m_patch_planes_eff;
Index m_patch_rows_eff;
Index m_patch_cols_eff;
// Strides for the output tensor.
Index m_otherStride;
Index m_patchStride;
Index m_rowStride;
Index m_colStride;
// Strides for the input tensor.
Index m_planeInputStride;
Index m_rowInputStride;
Index m_colInputStride;
Index m_otherInputStride;
internal::TensorIntDivisor<Index> m_fastOtherStride;
internal::TensorIntDivisor<Index> m_fastPatchStride;
internal::TensorIntDivisor<Index> m_fastColStride;
internal::TensorIntDivisor<Index> m_fastRowStride;
internal::TensorIntDivisor<Index> m_fastInputPlaneStride;
internal::TensorIntDivisor<Index> m_fastInputRowStride;
internal::TensorIntDivisor<Index> m_fastInputColStride;
internal::TensorIntDivisor<Index> m_fastInputColsEff;
internal::TensorIntDivisor<Index> m_fastOutputPlanesRows;
internal::TensorIntDivisor<Index> m_fastOutputPlanes;
internal::TensorIntDivisor<Index> m_fastOutputDepth;
Scalar m_paddingValue;
TensorEvaluator<ArgType, Device> m_impl;
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSOR_TENSOR_VOLUME_PATCH_H

View File

@@ -0,0 +1,293 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSORSYMMETRY_DYNAMICSYMMETRY_H
#define EIGEN_CXX11_TENSORSYMMETRY_DYNAMICSYMMETRY_H
namespace Eigen {
class DynamicSGroup
{
public:
inline explicit DynamicSGroup() : m_numIndices(1), m_elements(), m_generators(), m_globalFlags(0) { m_elements.push_back(ge(Generator(0, 0, 0))); }
inline DynamicSGroup(const DynamicSGroup& o) : m_numIndices(o.m_numIndices), m_elements(o.m_elements), m_generators(o.m_generators), m_globalFlags(o.m_globalFlags) { }
inline DynamicSGroup(DynamicSGroup&& o) : m_numIndices(o.m_numIndices), m_elements(), m_generators(o.m_generators), m_globalFlags(o.m_globalFlags) { std::swap(m_elements, o.m_elements); }
inline DynamicSGroup& operator=(const DynamicSGroup& o) { m_numIndices = o.m_numIndices; m_elements = o.m_elements; m_generators = o.m_generators; m_globalFlags = o.m_globalFlags; return *this; }
inline DynamicSGroup& operator=(DynamicSGroup&& o) { m_numIndices = o.m_numIndices; std::swap(m_elements, o.m_elements); m_generators = o.m_generators; m_globalFlags = o.m_globalFlags; return *this; }
void add(int one, int two, int flags = 0);
template<typename Gen_>
inline void add(Gen_) { add(Gen_::One, Gen_::Two, Gen_::Flags); }
inline void addSymmetry(int one, int two) { add(one, two, 0); }
inline void addAntiSymmetry(int one, int two) { add(one, two, NegationFlag); }
inline void addHermiticity(int one, int two) { add(one, two, ConjugationFlag); }
inline void addAntiHermiticity(int one, int two) { add(one, two, NegationFlag | ConjugationFlag); }
template<typename Op, typename RV, typename Index, std::size_t N, typename... Args>
inline RV apply(const std::array<Index, N>& idx, RV initial, Args&&... args) const
{
eigen_assert(N >= m_numIndices && "Can only apply symmetry group to objects that have at least the required amount of indices.");
for (std::size_t i = 0; i < size(); i++)
initial = Op::run(h_permute(i, idx, typename internal::gen_numeric_list<int, N>::type()), m_elements[i].flags, initial, std::forward<Args>(args)...);
return initial;
}
template<typename Op, typename RV, typename Index, typename... Args>
inline RV apply(const std::vector<Index>& idx, RV initial, Args&&... args) const
{
eigen_assert(idx.size() >= m_numIndices && "Can only apply symmetry group to objects that have at least the required amount of indices.");
for (std::size_t i = 0; i < size(); i++)
initial = Op::run(h_permute(i, idx), m_elements[i].flags, initial, std::forward<Args>(args)...);
return initial;
}
inline int globalFlags() const { return m_globalFlags; }
inline std::size_t size() const { return m_elements.size(); }
template<typename Tensor_, typename... IndexTypes>
inline internal::tensor_symmetry_value_setter<Tensor_, DynamicSGroup> operator()(Tensor_& tensor, typename Tensor_::Index firstIndex, IndexTypes... otherIndices) const
{
static_assert(sizeof...(otherIndices) + 1 == Tensor_::NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
return operator()(tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices>{{firstIndex, otherIndices...}});
}
template<typename Tensor_>
inline internal::tensor_symmetry_value_setter<Tensor_, DynamicSGroup> operator()(Tensor_& tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices> const& indices) const
{
return internal::tensor_symmetry_value_setter<Tensor_, DynamicSGroup>(tensor, *this, indices);
}
private:
struct GroupElement {
std::vector<int> representation;
int flags;
bool isId() const
{
for (std::size_t i = 0; i < representation.size(); i++)
if (i != (size_t)representation[i])
return false;
return true;
}
};
struct Generator {
int one;
int two;
int flags;
constexpr inline Generator(int one_, int two_, int flags_) : one(one_), two(two_), flags(flags_) {}
};
std::size_t m_numIndices;
std::vector<GroupElement> m_elements;
std::vector<Generator> m_generators;
int m_globalFlags;
template<typename Index, std::size_t N, int... n>
inline std::array<Index, N> h_permute(std::size_t which, const std::array<Index, N>& idx, internal::numeric_list<int, n...>) const
{
return std::array<Index, N>{{ idx[n >= m_numIndices ? n : m_elements[which].representation[n]]... }};
}
template<typename Index>
inline std::vector<Index> h_permute(std::size_t which, std::vector<Index> idx) const
{
std::vector<Index> result;
result.reserve(idx.size());
for (auto k : m_elements[which].representation)
result.push_back(idx[k]);
for (std::size_t i = m_numIndices; i < idx.size(); i++)
result.push_back(idx[i]);
return result;
}
inline GroupElement ge(Generator const& g) const
{
GroupElement result;
result.representation.reserve(m_numIndices);
result.flags = g.flags;
for (std::size_t k = 0; k < m_numIndices; k++) {
if (k == (std::size_t)g.one)
result.representation.push_back(g.two);
else if (k == (std::size_t)g.two)
result.representation.push_back(g.one);
else
result.representation.push_back(int(k));
}
return result;
}
GroupElement mul(GroupElement, GroupElement) const;
inline GroupElement mul(Generator g1, GroupElement g2) const
{
return mul(ge(g1), g2);
}
inline GroupElement mul(GroupElement g1, Generator g2) const
{
return mul(g1, ge(g2));
}
inline GroupElement mul(Generator g1, Generator g2) const
{
return mul(ge(g1), ge(g2));
}
inline int findElement(GroupElement e) const
{
for (auto ee : m_elements) {
if (ee.representation == e.representation)
return ee.flags ^ e.flags;
}
return -1;
}
void updateGlobalFlags(int flagDiffOfSameGenerator);
};
// dynamic symmetry group that auto-adds the template parameters in the constructor
template<typename... Gen>
class DynamicSGroupFromTemplateArgs : public DynamicSGroup
{
public:
inline DynamicSGroupFromTemplateArgs() : DynamicSGroup()
{
add_all(internal::type_list<Gen...>());
}
inline DynamicSGroupFromTemplateArgs(DynamicSGroupFromTemplateArgs const& other) : DynamicSGroup(other) { }
inline DynamicSGroupFromTemplateArgs(DynamicSGroupFromTemplateArgs&& other) : DynamicSGroup(other) { }
inline DynamicSGroupFromTemplateArgs<Gen...>& operator=(const DynamicSGroupFromTemplateArgs<Gen...>& o) { DynamicSGroup::operator=(o); return *this; }
inline DynamicSGroupFromTemplateArgs<Gen...>& operator=(DynamicSGroupFromTemplateArgs<Gen...>&& o) { DynamicSGroup::operator=(o); return *this; }
private:
template<typename Gen1, typename... GenNext>
inline void add_all(internal::type_list<Gen1, GenNext...>)
{
add(Gen1());
add_all(internal::type_list<GenNext...>());
}
inline void add_all(internal::type_list<>)
{
}
};
inline DynamicSGroup::GroupElement DynamicSGroup::mul(GroupElement g1, GroupElement g2) const
{
eigen_internal_assert(g1.representation.size() == m_numIndices);
eigen_internal_assert(g2.representation.size() == m_numIndices);
GroupElement result;
result.representation.reserve(m_numIndices);
for (std::size_t i = 0; i < m_numIndices; i++) {
int v = g2.representation[g1.representation[i]];
eigen_assert(v >= 0);
result.representation.push_back(v);
}
result.flags = g1.flags ^ g2.flags;
return result;
}
inline void DynamicSGroup::add(int one, int two, int flags)
{
eigen_assert(one >= 0);
eigen_assert(two >= 0);
eigen_assert(one != two);
if ((std::size_t)one >= m_numIndices || (std::size_t)two >= m_numIndices) {
std::size_t newNumIndices = ((one > two) ? one : two) + 1; // max(one, two) + 1, so both swapped indices fit into the representation
for (auto& gelem : m_elements) {
gelem.representation.reserve(newNumIndices);
for (std::size_t i = m_numIndices; i < newNumIndices; i++)
gelem.representation.push_back(i);
}
m_numIndices = newNumIndices;
}
Generator g{one, two, flags};
GroupElement e = ge(g);
/* special case for first generator */
if (m_elements.size() == 1) {
while (!e.isId()) {
m_elements.push_back(e);
e = mul(e, g);
}
if (e.flags > 0)
updateGlobalFlags(e.flags);
// only add in case we didn't have identity
if (m_elements.size() > 1)
m_generators.push_back(g);
return;
}
int p = findElement(e);
if (p >= 0) {
updateGlobalFlags(p);
return;
}
std::size_t coset_order = m_elements.size();
m_elements.push_back(e);
for (std::size_t i = 1; i < coset_order; i++)
m_elements.push_back(mul(m_elements[i], e));
m_generators.push_back(g);
std::size_t coset_rep = coset_order;
do {
for (auto g : m_generators) {
e = mul(m_elements[coset_rep], g);
p = findElement(e);
if (p < 0) {
// element not yet in group
m_elements.push_back(e);
for (std::size_t i = 1; i < coset_order; i++)
m_elements.push_back(mul(m_elements[i], e));
} else if (p > 0) {
updateGlobalFlags(p);
}
}
coset_rep += coset_order;
} while (coset_rep < m_elements.size());
}
inline void DynamicSGroup::updateGlobalFlags(int flagDiffOfSameGenerator)
{
switch (flagDiffOfSameGenerator) {
case 0:
default:
// nothing happened
break;
case NegationFlag:
// every element is its own negative => whole tensor is zero
m_globalFlags |= GlobalZeroFlag;
break;
case ConjugationFlag:
// every element is its own conjugate => whole tensor is real
m_globalFlags |= GlobalRealFlag;
break;
case (NegationFlag | ConjugationFlag):
// every element is its own negative conjugate => whole tensor is imaginary
m_globalFlags |= GlobalImagFlag;
break;
/* NOTE:
* since GlobalZeroFlag == GlobalRealFlag | GlobalImagFlag, if one generator
* causes the tensor to be real and the next one to be imaginary, this will
* trivially give the correct result
*/
}
}
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSORSYMMETRY_DYNAMICSYMMETRY_H
/*
* kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
*/

View File

@@ -0,0 +1,236 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSORSYMMETRY_STATICSYMMETRY_H
#define EIGEN_CXX11_TENSORSYMMETRY_STATICSYMMETRY_H
namespace Eigen {
namespace internal {
template<typename list> struct tensor_static_symgroup_permutate;
template<int... nn>
struct tensor_static_symgroup_permutate<numeric_list<int, nn...>>
{
constexpr static std::size_t N = sizeof...(nn);
template<typename T>
constexpr static inline std::array<T, N> run(const std::array<T, N>& indices)
{
return {{indices[nn]...}};
}
};
template<typename indices_, int flags_>
struct tensor_static_symgroup_element
{
typedef indices_ indices;
constexpr static int flags = flags_;
};
template<typename Gen, int N>
struct tensor_static_symgroup_element_ctor
{
typedef tensor_static_symgroup_element<
typename gen_numeric_list_swapped_pair<int, N, Gen::One, Gen::Two>::type,
Gen::Flags
> type;
};
template<int N>
struct tensor_static_symgroup_identity_ctor
{
typedef tensor_static_symgroup_element<
typename gen_numeric_list<int, N>::type,
0
> type;
};
template<typename iib>
struct tensor_static_symgroup_multiply_helper
{
template<int... iia>
constexpr static inline numeric_list<int, get<iia, iib>::value...> helper(numeric_list<int, iia...>) {
return numeric_list<int, get<iia, iib>::value...>();
}
};
template<typename A, typename B>
struct tensor_static_symgroup_multiply
{
private:
typedef typename A::indices iia;
typedef typename B::indices iib;
constexpr static int ffa = A::flags;
constexpr static int ffb = B::flags;
public:
static_assert(iia::count == iib::count, "Cannot multiply symmetry elements with different number of indices.");
typedef tensor_static_symgroup_element<
decltype(tensor_static_symgroup_multiply_helper<iib>::helper(iia())),
ffa ^ ffb
> type;
};
template<typename A, typename B>
struct tensor_static_symgroup_equality
{
typedef typename A::indices iia;
typedef typename B::indices iib;
constexpr static int ffa = A::flags;
constexpr static int ffb = B::flags;
static_assert(iia::count == iib::count, "Cannot compare symmetry elements with different number of indices.");
constexpr static bool value = is_same<iia, iib>::value;
private:
/* this should be zero if they are identical, or else the tensor
* will be forced to be pure real, pure imaginary or even pure zero
*/
constexpr static int flags_cmp_ = ffa ^ ffb;
/* either they are not equal, then we don't care whether the flags
* match, or they are equal, and then we have to check
*/
constexpr static bool is_zero = value && flags_cmp_ == NegationFlag;
constexpr static bool is_real = value && flags_cmp_ == ConjugationFlag;
constexpr static bool is_imag = value && flags_cmp_ == (NegationFlag | ConjugationFlag);
public:
constexpr static int global_flags =
(is_real ? GlobalRealFlag : 0) |
(is_imag ? GlobalImagFlag : 0) |
(is_zero ? GlobalZeroFlag : 0);
};
template<std::size_t NumIndices, typename... Gen>
struct tensor_static_symgroup
{
typedef StaticSGroup<Gen...> type;
constexpr static std::size_t size = type::static_size;
};
template<typename Index, std::size_t N, int... ii, int... jj>
constexpr static inline std::array<Index, N> tensor_static_symgroup_index_permute(std::array<Index, N> idx, internal::numeric_list<int, ii...>, internal::numeric_list<int, jj...>)
{
return {{ idx[ii]..., idx[jj]... }};
}
template<typename Index, int... ii>
static inline std::vector<Index> tensor_static_symgroup_index_permute(std::vector<Index> idx, internal::numeric_list<int, ii...>)
{
std::vector<Index> result{{ idx[ii]... }};
std::size_t target_size = idx.size();
for (std::size_t i = result.size(); i < target_size; i++)
result.push_back(idx[i]);
return result;
}
template<typename T> struct tensor_static_symgroup_do_apply;
template<typename first, typename... next>
struct tensor_static_symgroup_do_apply<internal::type_list<first, next...>>
{
template<typename Op, typename RV, std::size_t SGNumIndices, typename Index, std::size_t NumIndices, typename... Args>
static inline RV run(const std::array<Index, NumIndices>& idx, RV initial, Args&&... args)
{
static_assert(NumIndices >= SGNumIndices, "Can only apply symmetry group to objects that have at least the required amount of indices.");
typedef typename internal::gen_numeric_list<int, NumIndices - SGNumIndices, SGNumIndices>::type remaining_indices;
initial = Op::run(tensor_static_symgroup_index_permute(idx, typename first::indices(), remaining_indices()), first::flags, initial, std::forward<Args>(args)...);
return tensor_static_symgroup_do_apply<internal::type_list<next...>>::template run<Op, RV, SGNumIndices>(idx, initial, args...);
}
template<typename Op, typename RV, std::size_t SGNumIndices, typename Index, typename... Args>
static inline RV run(const std::vector<Index>& idx, RV initial, Args&&... args)
{
eigen_assert(idx.size() >= SGNumIndices && "Can only apply symmetry group to objects that have at least the required amount of indices.");
initial = Op::run(tensor_static_symgroup_index_permute(idx, typename first::indices()), first::flags, initial, std::forward<Args>(args)...);
return tensor_static_symgroup_do_apply<internal::type_list<next...>>::template run<Op, RV, SGNumIndices>(idx, initial, args...);
}
};
template<EIGEN_TPL_PP_SPEC_HACK_DEF(typename, empty)>
struct tensor_static_symgroup_do_apply<internal::type_list<EIGEN_TPL_PP_SPEC_HACK_USE(empty)>>
{
template<typename Op, typename RV, std::size_t SGNumIndices, typename Index, std::size_t NumIndices, typename... Args>
static inline RV run(const std::array<Index, NumIndices>&, RV initial, Args&&...)
{
// do nothing
return initial;
}
template<typename Op, typename RV, std::size_t SGNumIndices, typename Index, typename... Args>
static inline RV run(const std::vector<Index>&, RV initial, Args&&...)
{
// do nothing
return initial;
}
};
} // end namespace internal
template<typename... Gen>
class StaticSGroup
{
constexpr static std::size_t NumIndices = internal::tensor_symmetry_num_indices<Gen...>::value;
typedef internal::group_theory::enumerate_group_elements<
internal::tensor_static_symgroup_multiply,
internal::tensor_static_symgroup_equality,
typename internal::tensor_static_symgroup_identity_ctor<NumIndices>::type,
internal::type_list<typename internal::tensor_static_symgroup_element_ctor<Gen, NumIndices>::type...>
> group_elements;
typedef typename group_elements::type ge;
public:
constexpr inline StaticSGroup() {}
constexpr inline StaticSGroup(const StaticSGroup<Gen...>&) {}
constexpr inline StaticSGroup(StaticSGroup<Gen...>&&) {}
template<typename Op, typename RV, typename Index, std::size_t N, typename... Args>
static inline RV apply(const std::array<Index, N>& idx, RV initial, Args&&... args)
{
return internal::tensor_static_symgroup_do_apply<ge>::template run<Op, RV, NumIndices>(idx, initial, args...);
}
template<typename Op, typename RV, typename Index, typename... Args>
static inline RV apply(const std::vector<Index>& idx, RV initial, Args&&... args)
{
eigen_assert(idx.size() == NumIndices);
return internal::tensor_static_symgroup_do_apply<ge>::template run<Op, RV, NumIndices>(idx, initial, args...);
}
constexpr static std::size_t static_size = ge::count;
constexpr static inline std::size_t size() {
return ge::count;
}
constexpr static inline int globalFlags() { return group_elements::global_flags; }
template<typename Tensor_, typename... IndexTypes>
inline internal::tensor_symmetry_value_setter<Tensor_, StaticSGroup<Gen...>> operator()(Tensor_& tensor, typename Tensor_::Index firstIndex, IndexTypes... otherIndices) const
{
static_assert(sizeof...(otherIndices) + 1 == Tensor_::NumIndices, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
return operator()(tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices>{{firstIndex, otherIndices...}});
}
template<typename Tensor_>
inline internal::tensor_symmetry_value_setter<Tensor_, StaticSGroup<Gen...>> operator()(Tensor_& tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices> const& indices) const
{
return internal::tensor_symmetry_value_setter<Tensor_, StaticSGroup<Gen...>>(tensor, *this, indices);
}
};
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSORSYMMETRY_STATICSYMMETRY_H
/*
* kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
*/

View File

@@ -0,0 +1,338 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSORSYMMETRY_SYMMETRY_H
#define EIGEN_CXX11_TENSORSYMMETRY_SYMMETRY_H
namespace Eigen {
enum {
NegationFlag = 0x01,
ConjugationFlag = 0x02
};
enum {
GlobalRealFlag = 0x01,
GlobalImagFlag = 0x02,
GlobalZeroFlag = 0x03
};
namespace internal {
template<std::size_t NumIndices, typename... Sym> struct tensor_symmetry_pre_analysis;
template<std::size_t NumIndices, typename... Sym> struct tensor_static_symgroup;
template<bool instantiate, std::size_t NumIndices, typename... Sym> struct tensor_static_symgroup_if;
template<typename Tensor_> struct tensor_symmetry_calculate_flags;
template<typename Tensor_> struct tensor_symmetry_assign_value;
template<typename... Sym> struct tensor_symmetry_num_indices;
} // end namespace internal
template<int One_, int Two_>
struct Symmetry
{
static_assert(One_ != Two_, "Symmetries must cover distinct indices.");
constexpr static int One = One_;
constexpr static int Two = Two_;
constexpr static int Flags = 0;
};
template<int One_, int Two_>
struct AntiSymmetry
{
static_assert(One_ != Two_, "Symmetries must cover distinct indices.");
constexpr static int One = One_;
constexpr static int Two = Two_;
constexpr static int Flags = NegationFlag;
};
template<int One_, int Two_>
struct Hermiticity
{
static_assert(One_ != Two_, "Symmetries must cover distinct indices.");
constexpr static int One = One_;
constexpr static int Two = Two_;
constexpr static int Flags = ConjugationFlag;
};
template<int One_, int Two_>
struct AntiHermiticity
{
static_assert(One_ != Two_, "Symmetries must cover distinct indices.");
constexpr static int One = One_;
constexpr static int Two = Two_;
constexpr static int Flags = ConjugationFlag | NegationFlag;
};
/** \class DynamicSGroup
* \ingroup TensorSymmetry_Module
*
* \brief Dynamic symmetry group
*
* The %DynamicSGroup class represents a symmetry group that need not be known at
* compile time. It is useful if one wants to support arbitrary run-time definable
* symmetries for tensors, but it is also instantiated when a symmetry group defined
* at compile time would either be too large for the compiler to generate in a
* reasonable amount of time (computing the group via templates is very inefficient)
* or could be generated, but is too large for unrolling the coefficient-setting
* loop to still make sense.
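*
* A minimal usage sketch (added for illustration; the tensor size and the
* generator indices are assumptions, not part of the original documentation):
* \code
* // assumes #include <unsupported/Eigen/CXX11/Tensor>
* //     and #include <unsupported/Eigen/CXX11/TensorSymmetry>
* Eigen::DynamicSGroup sym;
* sym.addAntiSymmetry(0, 1);            // indices 0 and 1 are antisymmetric
* Eigen::Tensor<double, 3> t(3, 3, 3);
* t.setZero();
* sym(t, 0, 1, 2) = 42.0;               // also writes -42.0 to t(1, 0, 2)
* \endcode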
*/
class DynamicSGroup;
/** \internal
*
* \class DynamicSGroupFromTemplateArgs
* \ingroup TensorSymmetry_Module
*
* \brief Dynamic symmetry group, initialized from template arguments
*
* This class is a child class of DynamicSGroup. It uses the template arguments
* specified to initialize itself.
*/
template<typename... Gen>
class DynamicSGroupFromTemplateArgs;
/** \class StaticSGroup
* \ingroup TensorSymmetry_Module
*
* \brief Static symmetry group
*
* This class represents a symmetry group that is known and resolved completely
* at compile time. Ideally, no run-time penalty is incurred compared to the
* manual unrolling of the symmetry.
*
* <b><i>CAUTION:</i></b>
*
* Do not use this class directly for large symmetry groups. The compiler
* may run into a limit, or segfault, or at the very least take a very,
* very, very long time to compile the code. Use the SGroup class instead
* if you want a static group. That class contains logic that will
* automatically select the DynamicSGroup class instead if the symmetry
* group becomes too large. (In that case, unrolling may not even be
* beneficial.)
*/
template<typename... Gen>
class StaticSGroup;
/** \class SGroup
* \ingroup TensorSymmetry_Module
*
* \brief Symmetry group, initialized from template arguments
*
* This class represents a symmetry group whose generators are already
* known at compile time. It may or may not be resolved at compile time,
* depending on the estimated size of the group.
*
* \sa StaticSGroup
* \sa DynamicSGroup
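*
* A minimal usage sketch (added for illustration; the generator choice and the
* tensor rank below are assumptions, not part of the original documentation):
* \code
* // symmetric in indices (0,1), antisymmetric in indices (2,3):
* Eigen::SGroup<Eigen::Symmetry<0, 1>, Eigen::AntiSymmetry<2, 3>> sym;
* Eigen::Tensor<double, 4> t(2, 2, 2, 2);
* t.setZero();
* sym(t, 0, 1, 0, 1) = 1.0;   // also sets t(1,0,0,1) = 1.0,
*                             // and t(0,1,1,0) = t(1,0,1,0) = -1.0
* \endcode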
*/
template<typename... Gen>
class SGroup : public internal::tensor_symmetry_pre_analysis<internal::tensor_symmetry_num_indices<Gen...>::value, Gen...>::root_type
{
public:
constexpr static std::size_t NumIndices = internal::tensor_symmetry_num_indices<Gen...>::value;
typedef typename internal::tensor_symmetry_pre_analysis<NumIndices, Gen...>::root_type Base;
// make standard constructors + assignment operators public
inline SGroup() : Base() { }
inline SGroup(const SGroup<Gen...>& other) : Base(other) { }
inline SGroup(SGroup<Gen...>&& other) : Base(other) { }
inline SGroup<Gen...>& operator=(const SGroup<Gen...>& other) { Base::operator=(other); return *this; }
inline SGroup<Gen...>& operator=(SGroup<Gen...>&& other) { Base::operator=(other); return *this; }
// all else is defined in the base class
};
namespace internal {
template<typename... Sym> struct tensor_symmetry_num_indices
{
constexpr static std::size_t value = 1;
};
template<int One_, int Two_, typename... Sym> struct tensor_symmetry_num_indices<Symmetry<One_, Two_>, Sym...>
{
private:
constexpr static std::size_t One = static_cast<std::size_t>(One_);
constexpr static std::size_t Two = static_cast<std::size_t>(Two_);
constexpr static std::size_t Three = tensor_symmetry_num_indices<Sym...>::value;
// don't use std::max, since it's not constexpr until C++14...
constexpr static std::size_t maxOneTwoPlusOne = ((One > Two) ? One : Two) + 1;
public:
constexpr static std::size_t value = (maxOneTwoPlusOne > Three) ? maxOneTwoPlusOne : Three;
};
template<int One_, int Two_, typename... Sym> struct tensor_symmetry_num_indices<AntiSymmetry<One_, Two_>, Sym...>
: public tensor_symmetry_num_indices<Symmetry<One_, Two_>, Sym...> {};
template<int One_, int Two_, typename... Sym> struct tensor_symmetry_num_indices<Hermiticity<One_, Two_>, Sym...>
: public tensor_symmetry_num_indices<Symmetry<One_, Two_>, Sym...> {};
template<int One_, int Two_, typename... Sym> struct tensor_symmetry_num_indices<AntiHermiticity<One_, Two_>, Sym...>
: public tensor_symmetry_num_indices<Symmetry<One_, Two_>, Sym...> {};
/** \internal
*
* \class tensor_symmetry_pre_analysis
* \ingroup TensorSymmetry_Module
*
* \brief Pre-select whether to use a static or dynamic symmetry group
*
* When a symmetry group could in principle be determined at compile time,
* this template implements the logic whether to actually do that or whether
* to rather defer that to runtime.
*
* The logic is as follows:
* <dl>
* <dt><b>No generators (trivial symmetry):</b></dt>
* <dd>Use a trivial static group. Ideally, this has no performance impact
* compared to not using symmetry at all. In practice, this might not
* be the case.</dd>
* <dt><b>More than 4 generators:</b></dt>
* <dd>Calculate the group at run time; it is likely far too large for the
* compiler to generate properly in a realistic amount of time.</dd>
* <dt><b>Up to and including 4 generators:</b></dt>
* <dd>Actually enumerate all group elements, but then check how many there
* are. If there are more than 16, it is unlikely that unrolling the
* loop (as is done in the static compile-time case) is sensible, so
* use a dynamic group instead. If there are at most 16 elements, actually
* use that static group. Note that the largest group with 4 generators
* still compiles with reasonable resources.</dd>
* </dl>
*
* Note: Example compile-time performance with g++-4.6 on an Intel Core i5-3470
* with 16 GiB RAM (all generators non-redundant and the subgroups don't
* factorize):
*
*   # Generators        -O0 -ggdb              -O2
*   --------------------------------------------------------------
*        1          0.5  s /   250 MiB    0.45 s /   230 MiB
*        2          0.5  s /   260 MiB    0.5  s /   250 MiB
*        3          0.65 s /   310 MiB    0.62 s /   310 MiB
*        4          2.2  s /   860 MiB    1.7  s /   770 MiB
*        5          130  s / 13000 MiB    120  s / 11000 MiB
*
* It is clear that everything is still very efficient up to 4 generators, then
* the memory and CPU requirements become unreasonable. Thus we only instantiate
* the template group theory logic if the number of generators supplied is 4 or
* lower; otherwise the group enumeration is forced to happen at runtime, where the
* algorithm is reasonably fast.
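*
* Illustrative example of the selection (added here; the concrete generators
* are an assumption, not part of the original text):
* \code
* // two generators, only four group elements -> resolved statically
* // (needs <type_traits> for std::is_base_of):
* static_assert(std::is_base_of<
*     StaticSGroup<Symmetry<0,1>, AntiSymmetry<2,3>>,
*     SGroup<Symmetry<0,1>, AntiSymmetry<2,3>> >::value, "small groups stay static");
* // a symmetry built from five or more generators instead derives from
* // DynamicSGroupFromTemplateArgs<...> and is enumerated at runtime.
* \endcode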
*/
template<std::size_t NumIndices>
struct tensor_symmetry_pre_analysis<NumIndices>
{
typedef StaticSGroup<> root_type;
};
template<std::size_t NumIndices, typename Gen_, typename... Gens_>
struct tensor_symmetry_pre_analysis<NumIndices, Gen_, Gens_...>
{
constexpr static std::size_t max_static_generators = 4;
constexpr static std::size_t max_static_elements = 16;
typedef tensor_static_symgroup_if<(sizeof...(Gens_) + 1 <= max_static_generators), NumIndices, Gen_, Gens_...> helper;
constexpr static std::size_t possible_size = helper::size;
typedef typename conditional<
possible_size == 0 || possible_size >= max_static_elements,
DynamicSGroupFromTemplateArgs<Gen_, Gens_...>,
typename helper::type
>::type root_type;
};
template<bool instantiate, std::size_t NumIndices, typename... Gens>
struct tensor_static_symgroup_if
{
constexpr static std::size_t size = 0;
typedef void type;
};
template<std::size_t NumIndices, typename... Gens>
struct tensor_static_symgroup_if<true, NumIndices, Gens...> : tensor_static_symgroup<NumIndices, Gens...> {};
template<typename Tensor_>
struct tensor_symmetry_assign_value
{
typedef typename Tensor_::Index Index;
typedef typename Tensor_::Scalar Scalar;
constexpr static std::size_t NumIndices = Tensor_::NumIndices;
static inline int run(const std::array<Index, NumIndices>& transformed_indices, int transformation_flags, int dummy, Tensor_& tensor, const Scalar& value_)
{
Scalar value(value_);
if (transformation_flags & ConjugationFlag)
value = numext::conj(value);
if (transformation_flags & NegationFlag)
value = -value;
tensor.coeffRef(transformed_indices) = value;
return dummy;
}
};
template<typename Tensor_>
struct tensor_symmetry_calculate_flags
{
typedef typename Tensor_::Index Index;
constexpr static std::size_t NumIndices = Tensor_::NumIndices;
static inline int run(const std::array<Index, NumIndices>& transformed_indices, int transform_flags, int current_flags, const std::array<Index, NumIndices>& orig_indices)
{
if (transformed_indices == orig_indices) {
if ((transform_flags & (ConjugationFlag | NegationFlag)) == (ConjugationFlag | NegationFlag))
return current_flags | GlobalImagFlag; // anti-hermitian diagonal (negation and conjugation both set)
else if (transform_flags & ConjugationFlag)
return current_flags | GlobalRealFlag; // hermitian diagonal
else if (transform_flags & NegationFlag)
return current_flags | GlobalZeroFlag; // anti-symmetric diagonal
}
return current_flags;
}
};
template<typename Tensor_, typename Symmetry_, int Flags = 0>
class tensor_symmetry_value_setter
{
public:
typedef typename Tensor_::Index Index;
typedef typename Tensor_::Scalar Scalar;
constexpr static std::size_t NumIndices = Tensor_::NumIndices;
inline tensor_symmetry_value_setter(Tensor_& tensor, Symmetry_ const& symmetry, std::array<Index, NumIndices> const& indices)
: m_tensor(tensor), m_symmetry(symmetry), m_indices(indices) { }
inline tensor_symmetry_value_setter<Tensor_, Symmetry_, Flags>& operator=(Scalar const& value)
{
doAssign(value);
return *this;
}
private:
Tensor_& m_tensor;
Symmetry_ m_symmetry;
std::array<Index, NumIndices> m_indices;
inline void doAssign(Scalar const& value)
{
#ifdef EIGEN_TENSOR_SYMMETRY_CHECK_VALUES
int value_flags = m_symmetry.template apply<internal::tensor_symmetry_calculate_flags<Tensor_>, int>(m_indices, m_symmetry.globalFlags(), m_indices);
if (value_flags & GlobalRealFlag)
eigen_assert(numext::imag(value) == 0);
if (value_flags & GlobalImagFlag)
eigen_assert(numext::real(value) == 0);
#endif
m_symmetry.template apply<internal::tensor_symmetry_assign_value<Tensor_>, int>(m_indices, 0, m_tensor, value);
}
};
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSORSYMMETRY_SYMMETRY_H
/*
* kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
*/

View File

@@ -0,0 +1,669 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_TENSORSYMMETRY_TEMPLATEGROUPTHEORY_H
#define EIGEN_CXX11_TENSORSYMMETRY_TEMPLATEGROUPTHEORY_H
namespace Eigen {
namespace internal {
namespace group_theory {
/** \internal
* \file CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h
* This file contains C++ templates that implement group theory algorithms.
*
* The algorithms allow for a compile-time analysis of finite groups.
*
* Currently only Dimino's algorithm is implemented, which returns a list
* of all elements in a group given a set of (possibly redundant) generators.
* (One could also do that with the so-called orbital algorithm, but that
* is much more expensive and usually has no advantages.)
*/
/**********************************************************************
* "Ok kid, here is where it gets complicated."
* - Amelia Pond in the "Doctor Who" episode
* "The Big Bang"
*
* Dimino's algorithm
* ==================
*
* The following is Dimino's algorithm in sequential form:
*
* Input: identity element, list of generators, equality check,
* multiplication operation
* Output: list of group elements
*
* 1. add identity element
* 2. remove identities from list of generators
* 3. add all powers of first generator that aren't the
* identity element
* 4. go through all remaining generators:
* a. if generator is already in the list of elements
* -> do nothing
* b. otherwise
* i. remember current # of elements
* (i.e. the size of the current subgroup)
* ii. add all current elements (which includes
* the identity) each multiplied from right
* with the current generator to the group
* iii. add all remaining cosets that are generated
* by products of the new generator with itself
* and all other generators seen so far
*
* In functional form, this is implemented as a long set of recursive
* templates that have a complicated relationship.
*
* The main interface for Dimino's algorithm is the template
* enumerate_group_elements. All lists are implemented as variadic
* type_list<typename...> and numeric_list<typename = int, int...>
* templates.
*
* 'Calling' templates is usually done via typedefs.
*
* This algorithm is an extended version of the basic version. The
* extension consists in the fact that each group element has a set
* of flags associated with it. Multiplication of two group elements
* with each other results in a group element whose flags are the
* XOR of the flags of the previous elements. Each time the algorithm
* notices that a group element it just calculated is already in the
* list of current elements, the flags of both will be compared and
* added to the so-called 'global flags' of the group.
*
* The rationale behind this extension is that this allows not only
* for the description of symmetries between tensor indices, but
* also allows for the description of hermiticity, antisymmetry and
* antihermiticity. Negation and conjugation are each a specific bit
* in the flags value and if two different ways to reach a group
* element lead to two different flags, this poses a constraint on
* the allowed values of the resulting tensor. For example, if a
* group element is reached both with and without the conjugation
* flag, it is clear that the resulting tensor has to be real.
*
* Note that this flag mechanism is quite generic and may have other
* uses beyond tensor properties.
*
* IMPORTANT:
* This algorithm assumes the group to be finite. If you try to
* run it with a group that's infinite, the algorithm will only
* terminate once you hit a compiler limit (max template depth).
* Also note that trying to use this implementation to create a
* very large group will probably either make you hit the same
* limit, cause the compiler to segfault or at the very least
* take a *really* long time (hours, days, weeks - sic!) to
* compile. It is not recommended to plug in more than 4
* generators, unless they are independent of each other.
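*
* Worked example (added for illustration, not part of the original text):
* enumerating S3 from the generators a = (0 1) and b = (1 2).
* - steps 1-3: start with {id}; the powers of a give {id, a}, since a*a = id.
* - step 4: b is not in the list, so append the coset {id, a}*b = {b, a*b},
*   giving 4 elements (the subgroup order is 2). The first coset representative
*   is b; multiplying it by the generators yields b*a, which is new, so the
*   coset {b*a, a*b*a} is appended as well, giving 6 elements. Every further
*   product is already present, so the algorithm terminates with |S3| = 6.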
*/
/** \internal
*
* \class strip_identities
* \ingroup CXX11_TensorSymmetry_Module
*
* \brief Cleanse a list of group elements of the identity element
*
* This template is used to make a first pass through all initial
* generators of Dimino's algorithm and remove the identity
* elements.
*
* \sa enumerate_group_elements
*/
template<template<typename, typename> class Equality, typename id, typename L> struct strip_identities;
template<
template<typename, typename> class Equality,
typename id,
typename t,
typename... ts
>
struct strip_identities<Equality, id, type_list<t, ts...>>
{
typedef typename conditional<
Equality<id, t>::value,
typename strip_identities<Equality, id, type_list<ts...>>::type,
typename concat<type_list<t>, typename strip_identities<Equality, id, type_list<ts...>>::type>::type
>::type type;
constexpr static int global_flags = Equality<id, t>::global_flags | strip_identities<Equality, id, type_list<ts...>>::global_flags;
};
template<
template<typename, typename> class Equality,
typename id
EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, ts)
>
struct strip_identities<Equality, id, type_list<EIGEN_TPL_PP_SPEC_HACK_USE(ts)>>
{
typedef type_list<> type;
constexpr static int global_flags = 0;
};
/** \internal
*
* \class dimino_first_step_elements_helper
* \ingroup CXX11_TensorSymmetry_Module
*
* \brief Recursive template that adds powers of the first generator to the list of group elements
*
* This template calls itself recursively to add powers of the first
* generator to the list of group elements. It stops if it reaches
* the identity element again.
*
* \sa enumerate_group_elements, dimino_first_step_elements
*/
template<
template<typename, typename> class Multiply,
template<typename, typename> class Equality,
typename id,
typename g,
typename current_element,
typename elements,
bool dont_add_current_element // = false
>
struct dimino_first_step_elements_helper
#ifndef EIGEN_PARSED_BY_DOXYGEN
: // recursive inheritance is too difficult for Doxygen
public dimino_first_step_elements_helper<
Multiply,
Equality,
id,
g,
typename Multiply<current_element, g>::type,
typename concat<elements, type_list<current_element>>::type,
Equality<typename Multiply<current_element, g>::type, id>::value
> {};
template<
template<typename, typename> class Multiply,
template<typename, typename> class Equality,
typename id,
typename g,
typename current_element,
typename elements
>
struct dimino_first_step_elements_helper<Multiply, Equality, id, g, current_element, elements, true>
#endif // EIGEN_PARSED_BY_DOXYGEN
{
typedef elements type;
constexpr static int global_flags = Equality<current_element, id>::global_flags;
};
/** \internal
*
* \class dimino_first_step_elements
* \ingroup CXX11_TensorSymmetry_Module
*
* \brief Add all powers of the first generator to the list of group elements
*
* This template takes the first non-identity generator and generates the initial
* list of elements which consists of all powers of that generator. For a group
* with just one generator, the whole group is already enumerated after this step.
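* For instance (an added illustration): a single order-2 generator g (a plain
* index swap) yields type_list<id, g>; a generator of order n yields the n
* powers id, g, ..., g^(n-1).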
*
* \sa enumerate_group_elements
*/
template<
template<typename, typename> class Multiply,
template<typename, typename> class Equality,
typename id,
typename generators
>
struct dimino_first_step_elements
{
typedef typename get<0, generators>::type first_generator;
typedef typename skip<1, generators>::type next_generators;
typedef type_list<first_generator> generators_done;
typedef dimino_first_step_elements_helper<
Multiply,
Equality,
id,
first_generator,
first_generator,
type_list<id>,
false
> helper;
typedef typename helper::type type;
constexpr static int global_flags = helper::global_flags;
};
/** \internal
*
* \class dimino_get_coset_elements
* \ingroup CXX11_TensorSymmetry_Module
*
* \brief Generate all elements of a specific coset
*
* This template generates all the elements of a specific coset by
* multiplying all elements in the given subgroup with the new
* coset representative. Note that the first element of the
* subgroup is always the identity element, so the first element of
* the result of this template is going to be the coset
* representative itself.
*
* Note that this template accepts an additional boolean parameter
* that specifies whether to actually generate the coset (true) or
* just return an empty list (false).
*
* \sa enumerate_group_elements, dimino_add_cosets_for_rep
*/
template<
template<typename, typename> class Multiply,
typename sub_group_elements,
typename new_coset_rep,
bool generate_coset // = true
>
struct dimino_get_coset_elements
{
typedef typename apply_op_from_right<Multiply, new_coset_rep, sub_group_elements>::type type;
};
template<
template<typename, typename> class Multiply,
typename sub_group_elements,
typename new_coset_rep
>
struct dimino_get_coset_elements<Multiply, sub_group_elements, new_coset_rep, false>
{
typedef type_list<> type;
};
/** \internal
*
* \class dimino_add_cosets_for_rep
* \ingroup CXX11_TensorSymmetry_Module
*
* \brief Recursive template for adding coset spaces
*
* This template multiplies the coset representative with a generator
* from the list of previous generators. If the new element is not in
* the group already, it adds the corresponding coset. Finally it
* proceeds to call itself with the next generator from the list.
*
* \sa enumerate_group_elements, dimino_add_all_coset_spaces
*/
template<
template<typename, typename> class Multiply,
template<typename, typename> class Equality,
typename id,
typename sub_group_elements,
typename elements,
typename generators,
typename rep_element,
int sub_group_size
>
struct dimino_add_cosets_for_rep;
template<
template<typename, typename> class Multiply,
template<typename, typename> class Equality,
typename id,
typename sub_group_elements,
typename elements,
typename g,
typename... gs,
typename rep_element,
int sub_group_size
>
struct dimino_add_cosets_for_rep<Multiply, Equality, id, sub_group_elements, elements, type_list<g, gs...>, rep_element, sub_group_size>
{
typedef typename Multiply<rep_element, g>::type new_coset_rep;
typedef contained_in_list_gf<Equality, new_coset_rep, elements> _cil;
constexpr static bool add_coset = !_cil::value;
typedef typename dimino_get_coset_elements<
Multiply,
sub_group_elements,
new_coset_rep,
add_coset
>::type coset_elements;
typedef dimino_add_cosets_for_rep<
Multiply,
Equality,
id,
sub_group_elements,
typename concat<elements, coset_elements>::type,
type_list<gs...>,
rep_element,
sub_group_size
> _helper;
typedef typename _helper::type type;
constexpr static int global_flags = _cil::global_flags | _helper::global_flags;
/* Note that we don't have to update global flags here, since
* we will only add these elements if they are not part of
* the group already. But that only happens if the coset rep
* is not already in the group, so the check for the coset rep
* will catch this.
*/
};
template<
template<typename, typename> class Multiply,
template<typename, typename> class Equality,
typename id,
typename sub_group_elements,
typename elements
EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, empty),
typename rep_element,
int sub_group_size
>
struct dimino_add_cosets_for_rep<Multiply, Equality, id, sub_group_elements, elements, type_list<EIGEN_TPL_PP_SPEC_HACK_USE(empty)>, rep_element, sub_group_size>
{
typedef elements type;
constexpr static int global_flags = 0;
};
/** \internal
*
* \class dimino_add_all_coset_spaces
* \ingroup CXX11_TensorSymmetry_Module
*
* \brief Recursive template for adding all coset spaces for a new generator
*
* This template tries to go through the list of generators (with
* the help of the dimino_add_cosets_for_rep template) as long as
* it still finds elements that are not part of the group and add
* the corresponding cosets.
*
* \sa enumerate_group_elements, dimino_add_cosets_for_rep
*/
template<
template<typename, typename> class Multiply,
template<typename, typename> class Equality,
typename id,
typename sub_group_elements,
typename elements,
typename generators,
int sub_group_size,
int rep_pos,
bool stop_condition // = false
>
struct dimino_add_all_coset_spaces
{
typedef typename get<rep_pos, elements>::type rep_element;
typedef dimino_add_cosets_for_rep<
Multiply,
Equality,
id,
sub_group_elements,
elements,
generators,
rep_element,
sub_group_elements::count
> _ac4r;
typedef typename _ac4r::type new_elements;
constexpr static int new_rep_pos = rep_pos + sub_group_elements::count;
constexpr static bool new_stop_condition = new_rep_pos >= new_elements::count;
typedef dimino_add_all_coset_spaces<
Multiply,
Equality,
id,
sub_group_elements,
new_elements,
generators,
sub_group_size,
new_rep_pos,
new_stop_condition
> _helper;
typedef typename _helper::type type;
constexpr static int global_flags = _helper::global_flags | _ac4r::global_flags;
};
template<
template<typename, typename> class Multiply,
template<typename, typename> class Equality,
typename id,
typename sub_group_elements,
typename elements,
typename generators,
int sub_group_size,
int rep_pos
>
struct dimino_add_all_coset_spaces<Multiply, Equality, id, sub_group_elements, elements, generators, sub_group_size, rep_pos, true>
{
typedef elements type;
constexpr static int global_flags = 0;
};
/** \internal
*
* \class dimino_add_generator
* \ingroup CXX11_TensorSymmetry_Module
*
* \brief Enlarge the group by adding a new generator.
*
* It accepts a boolean parameter that determines if the generator is redundant,
* i.e. was already seen in the group. In that case, it reduces to a no-op.
*
* \sa enumerate_group_elements, dimino_add_all_coset_spaces
*/
template<
template<typename, typename> class Multiply,
template<typename, typename> class Equality,
typename id,
typename elements,
typename generators_done,
typename current_generator,
bool redundant // = false
>
struct dimino_add_generator
{
/* this template is only called if the generator is not redundant
* => all elements of the group multiplied with the new generator
* are going to be new elements of the most trivial coset space
*/
typedef typename apply_op_from_right<Multiply, current_generator, elements>::type multiplied_elements;
typedef typename concat<elements, multiplied_elements>::type new_elements;
constexpr static int rep_pos = elements::count;
typedef dimino_add_all_coset_spaces<
Multiply,
Equality,
id,
elements, // elements of previous subgroup
new_elements,
typename concat<generators_done, type_list<current_generator>>::type,
elements::count, // size of previous subgroup
rep_pos,
false // don't stop (because rep_pos >= new_elements::count is always false at this point)
> _helper;
typedef typename _helper::type type;
constexpr static int global_flags = _helper::global_flags;
};
template<
template<typename, typename> class Multiply,
template<typename, typename> class Equality,
typename id,
typename elements,
typename generators_done,
typename current_generator
>
struct dimino_add_generator<Multiply, Equality, id, elements, generators_done, current_generator, true>
{
// redundant case
typedef elements type;
constexpr static int global_flags = 0;
};
/** \internal
*
* \class dimino_add_remaining_generators
* \ingroup CXX11_TensorSymmetry_Module
*
* \brief Recursive template that adds all remaining generators to a group
*
* Loop through the list of generators that remain and successively
* add them to the group.
*
* \sa enumerate_group_elements, dimino_add_generator
*/
template<
template<typename, typename> class Multiply,
template<typename, typename> class Equality,
typename id,
typename generators_done,
typename remaining_generators,
typename elements
>
struct dimino_add_remaining_generators
{
typedef typename get<0, remaining_generators>::type first_generator;
typedef typename skip<1, remaining_generators>::type next_generators;
typedef contained_in_list_gf<Equality, first_generator, elements> _cil;
typedef dimino_add_generator<
Multiply,
Equality,
id,
elements,
generators_done,
first_generator,
_cil::value
> _helper;
typedef typename _helper::type new_elements;
typedef dimino_add_remaining_generators<
Multiply,
Equality,
id,
typename concat<generators_done, type_list<first_generator>>::type,
next_generators,
new_elements
> _next_iter;
typedef typename _next_iter::type type;
constexpr static int global_flags =
_cil::global_flags |
_helper::global_flags |
_next_iter::global_flags;
};
template<
template<typename, typename> class Multiply,
template<typename, typename> class Equality,
typename id,
typename generators_done,
typename elements
>
struct dimino_add_remaining_generators<Multiply, Equality, id, generators_done, type_list<>, elements>
{
typedef elements type;
constexpr static int global_flags = 0;
};
/** \internal
*
* \class enumerate_group_elements_noid
* \ingroup CXX11_TensorSymmetry_Module
*
* \brief Helper template that implements group element enumeration
*
* This is a helper template that implements the actual enumeration
* of group elements. This has been split so that the list of
* generators can be cleansed of the identity element before
* performing the actual operation.
*
* \sa enumerate_group_elements
*/
template<
template<typename, typename> class Multiply,
template<typename, typename> class Equality,
typename id,
typename generators,
int initial_global_flags = 0
>
struct enumerate_group_elements_noid
{
typedef dimino_first_step_elements<Multiply, Equality, id, generators> first_step;
typedef typename first_step::type first_step_elements;
typedef dimino_add_remaining_generators<
Multiply,
Equality,
id,
typename first_step::generators_done,
typename first_step::next_generators, // remaining_generators
typename first_step::type // first_step elements
> _helper;
typedef typename _helper::type type;
constexpr static int global_flags =
initial_global_flags |
first_step::global_flags |
_helper::global_flags;
};
// specialization for the case where no generators are specified
template<
template<typename, typename> class Multiply,
template<typename, typename> class Equality,
typename id,
int initial_global_flags
>
struct enumerate_group_elements_noid<Multiply, Equality, id, type_list<>, initial_global_flags>
{
typedef type_list<id> type;
constexpr static int global_flags = initial_global_flags;
};
/** \internal
*
* \class enumerate_group_elements
* \ingroup CXX11_TensorSymmetry_Module
*
* \brief Enumerate all elements in a finite group
*
* This template enumerates all elements in a finite group. It accepts
* the following template parameters:
*
* \tparam Multiply The multiplication operation that multiplies two group elements
* with each other.
* \tparam Equality The equality check operation that checks if two group elements
* are equal to one another.
* \tparam id The identity element
* \tparam _generators A list of (possibly redundant) generators of the group
*/
template<
template<typename, typename> class Multiply,
template<typename, typename> class Equality,
typename id,
typename _generators
>
struct enumerate_group_elements
: public enumerate_group_elements_noid<
Multiply,
Equality,
id,
typename strip_identities<Equality, id, _generators>::type,
strip_identities<Equality, id, _generators>::global_flags
>
{
};
} // end namespace group_theory
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_CXX11_TENSORSYMMETRY_TEMPLATEGROUPTHEORY_H
/*
* kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
*/
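// Illustrative runtime sketch (not part of the Eigen sources): the templates
// above run Dimino's algorithm at compile time; the helper below performs the
// same coset-based enumeration at run time on permutations, purely to clarify
// the control flow of dimino_add_generator / dimino_add_all_coset_spaces.
// All names here (Perm, dimino, ...) are hypothetical.
#include <algorithm>
#include <cstddef>
#include <vector>

using Perm = std::vector<int>;  // a permutation, e.g. {1, 0, 2}

// Composition (a*b)(i) = a[b[i]], the analogue of the Multiply template.
inline Perm multiply(const Perm& a, const Perm& b) {
  Perm r(a.size());
  for (std::size_t i = 0; i < a.size(); ++i) r[i] = a[b[i]];
  return r;
}

// Membership test, the analogue of contained_in_list_gf with Equality.
inline bool contains(const std::vector<Perm>& set, const Perm& x) {
  return std::find(set.begin(), set.end(), x) != set.end();
}

// Enumerate the group generated by `gens`, mirroring enumerate_group_elements.
inline std::vector<Perm> dimino(const Perm& id, const std::vector<Perm>& gens) {
  std::vector<Perm> elements{id};
  std::vector<Perm> done;  // generators processed so far
  for (const Perm& g : gens) {
    done.push_back(g);
    if (contains(elements, g)) continue;  // redundant generator => no-op
    const std::size_t sub_group_size = elements.size();
    // The most trivial coset: the previous subgroup multiplied by g.
    for (std::size_t i = 0; i < sub_group_size; ++i)
      elements.push_back(multiply(elements[i], g));
    // Walk over the coset representatives and add any missing cosets.
    for (std::size_t rep_pos = sub_group_size; rep_pos < elements.size();
         rep_pos += sub_group_size) {
      for (const Perm& gen : done) {
        Perm rep = multiply(elements[rep_pos], gen);
        if (contains(elements, rep)) continue;
        for (std::size_t i = 0; i < sub_group_size; ++i)
          elements.push_back(multiply(elements[i], rep));
      }
    }
  }
  return elements;
}
// Example: dimino({0,1,2}, {{1,0,2}, {1,2,0}}) enumerates all 6 elements of S3.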

View File

@@ -0,0 +1,67 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2018 Rasmus Munk Larsen <rmlarsen@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Barrier is an object that allows one or more threads to wait until
// Notify has been called a specified number of times.
#ifndef EIGEN_CXX11_THREADPOOL_BARRIER_H
#define EIGEN_CXX11_THREADPOOL_BARRIER_H
namespace Eigen {
class Barrier {
public:
Barrier(unsigned int count) : state_(count << 1), notified_(false) {
eigen_plain_assert(((count << 1) >> 1) == count);
}
~Barrier() { eigen_plain_assert((state_ >> 1) == 0); }
void Notify() {
unsigned int v = state_.fetch_sub(2, std::memory_order_acq_rel) - 2;
if (v != 1) {
// Clear the lowest bit (waiter flag) and check that the original state
// value was not zero. If it was zero, it means that notify was called
// more times than the original count.
eigen_plain_assert(((v + 2) & ~1) != 0);
return; // either count has not dropped to 0, or waiter is not waiting
}
std::unique_lock<std::mutex> l(mu_);
eigen_plain_assert(!notified_);
notified_ = true;
cv_.notify_all();
}
void Wait() {
unsigned int v = state_.fetch_or(1, std::memory_order_acq_rel);
if ((v >> 1) == 0) return;
std::unique_lock<std::mutex> l(mu_);
while (!notified_) {
cv_.wait(l);
}
}
private:
std::mutex mu_;
std::condition_variable cv_;
std::atomic<unsigned int> state_; // low bit is waiter flag
bool notified_;
};
// Notification is an object that allows a user to wait for another
// thread to signal a notification that an event has occurred.
//
// Multiple threads can wait on the same Notification object,
// but only one caller must call Notify() on the object.
struct Notification : Barrier {
Notification() : Barrier(1){};
};
} // namespace Eigen
#endif // EIGEN_CXX11_THREADPOOL_BARRIER_H
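// Illustrative usage sketch (not part of this header): a main thread waits
// until kTasks worker threads have each called Notify() exactly once. The
// function and constant names below are made up for the example.
#include <thread>
#include <vector>

inline void BarrierExample() {
  constexpr unsigned kTasks = 4;
  Eigen::Barrier barrier(kTasks);
  std::vector<std::thread> workers;
  for (unsigned i = 0; i < kTasks; ++i) {
    workers.emplace_back([&barrier] {
      // ... do some work ...
      barrier.Notify();  // each worker reports completion exactly once
    });
  }
  barrier.Wait();  // returns once all kTasks notifications have arrived
  for (std::thread& t : workers) t.join();
}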

View File

@@ -0,0 +1,249 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H_
#define EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H_
namespace Eigen {
// EventCount allows waiting for arbitrary predicates in non-blocking
// algorithms. Think of condition variable, but wait predicate does not need to
// be protected by a mutex. Usage:
// Waiting thread does:
//
// if (predicate)
// return act();
// EventCount::Waiter& w = waiters[my_index];
// ec.Prewait();
// if (predicate) {
// ec.CancelWait();
// return act();
// }
// ec.CommitWait(&w);
//
// Notifying thread does:
//
// predicate = true;
// ec.Notify(true);
//
// Notify is cheap if there are no waiting threads. Prewait/CommitWait are not
// cheap, but they are executed only if the preceding predicate check has
// failed.
//
// Algorithm outline:
// There are two main variables: predicate (managed by user) and state_.
// Operation closely resembles Dekker's mutual exclusion algorithm:
// https://en.wikipedia.org/wiki/Dekker%27s_algorithm
// Waiting thread sets state_ then checks predicate, Notifying thread sets
// predicate then checks state_. Due to seq_cst fences in between these
// operations it is guaranteed that either the waiter will see the predicate
// change and won't block, or the notifying thread will see the state_ change
// and will unblock the waiter, or both. But it can't happen that both threads
// miss each other's changes, which would lead to deadlock.
class EventCount {
public:
class Waiter;
EventCount(MaxSizeVector<Waiter>& waiters)
: state_(kStackMask), waiters_(waiters) {
eigen_plain_assert(waiters.size() < (1 << kWaiterBits) - 1);
}
~EventCount() {
// Ensure there are no waiters.
eigen_plain_assert(state_.load() == kStackMask);
}
// Prewait prepares for waiting.
// After calling Prewait, the thread must re-check the wait predicate
// and then call either CancelWait or CommitWait.
void Prewait() {
uint64_t state = state_.load(std::memory_order_relaxed);
for (;;) {
CheckState(state);
uint64_t newstate = state + kWaiterInc;
CheckState(newstate);
if (state_.compare_exchange_weak(state, newstate,
std::memory_order_seq_cst))
return;
}
}
// CommitWait commits waiting after Prewait.
void CommitWait(Waiter* w) {
eigen_plain_assert((w->epoch & ~kEpochMask) == 0);
w->state = Waiter::kNotSignaled;
const uint64_t me = (w - &waiters_[0]) | w->epoch;
uint64_t state = state_.load(std::memory_order_seq_cst);
for (;;) {
CheckState(state, true);
uint64_t newstate;
if ((state & kSignalMask) != 0) {
// Consume the signal and return immediately.
newstate = state - kWaiterInc - kSignalInc;
} else {
// Remove this thread from pre-wait counter and add to the waiter stack.
newstate = ((state & kWaiterMask) - kWaiterInc) | me;
w->next.store(state & (kStackMask | kEpochMask),
std::memory_order_relaxed);
}
CheckState(newstate);
if (state_.compare_exchange_weak(state, newstate,
std::memory_order_acq_rel)) {
if ((state & kSignalMask) == 0) {
w->epoch += kEpochInc;
Park(w);
}
return;
}
}
}
// CancelWait cancels effects of the previous Prewait call.
void CancelWait() {
uint64_t state = state_.load(std::memory_order_relaxed);
for (;;) {
CheckState(state, true);
uint64_t newstate = state - kWaiterInc;
// We don't know if the thread was also notified or not,
// so we should not consume a signal unconditionally.
// Only if the number of waiters is equal to the number of signals
// do we know that the thread was notified and we must take away the signal.
if (((state & kWaiterMask) >> kWaiterShift) ==
((state & kSignalMask) >> kSignalShift))
newstate -= kSignalInc;
CheckState(newstate);
if (state_.compare_exchange_weak(state, newstate,
std::memory_order_acq_rel))
return;
}
}
// Notify wakes one or all waiting threads.
// Must be called after changing the associated wait predicate.
void Notify(bool notifyAll) {
std::atomic_thread_fence(std::memory_order_seq_cst);
uint64_t state = state_.load(std::memory_order_acquire);
for (;;) {
CheckState(state);
const uint64_t waiters = (state & kWaiterMask) >> kWaiterShift;
const uint64_t signals = (state & kSignalMask) >> kSignalShift;
// Easy case: no waiters.
if ((state & kStackMask) == kStackMask && waiters == signals) return;
uint64_t newstate;
if (notifyAll) {
// Empty wait stack and set signal to number of pre-wait threads.
newstate =
(state & kWaiterMask) | (waiters << kSignalShift) | kStackMask;
} else if (signals < waiters) {
// There is a thread in pre-wait state, unblock it.
newstate = state + kSignalInc;
} else {
// Pop a waiter from list and unpark it.
Waiter* w = &waiters_[state & kStackMask];
uint64_t next = w->next.load(std::memory_order_relaxed);
newstate = (state & (kWaiterMask | kSignalMask)) | next;
}
CheckState(newstate);
if (state_.compare_exchange_weak(state, newstate,
std::memory_order_acq_rel)) {
if (!notifyAll && (signals < waiters))
return; // unblocked pre-wait thread
if ((state & kStackMask) == kStackMask) return;
Waiter* w = &waiters_[state & kStackMask];
if (!notifyAll) w->next.store(kStackMask, std::memory_order_relaxed);
Unpark(w);
return;
}
}
}
class Waiter {
friend class EventCount;
// Align to 128 byte boundary to prevent false sharing with other Waiter
// objects in the same vector.
EIGEN_ALIGN_TO_BOUNDARY(128) std::atomic<uint64_t> next;
std::mutex mu;
std::condition_variable cv;
uint64_t epoch = 0;
unsigned state = kNotSignaled;
enum {
kNotSignaled,
kWaiting,
kSignaled,
};
};
private:
// State_ layout:
// - low kWaiterBits is a stack of waiters committed wait
// (indexes in waiters_ array are used as stack elements,
// kStackMask means empty stack).
// - next kWaiterBits is count of waiters in prewait state.
// - next kWaiterBits is count of pending signals.
// - remaining bits are ABA counter for the stack.
// (stored in Waiter node and incremented on push).
static const uint64_t kWaiterBits = 14;
static const uint64_t kStackMask = (1ull << kWaiterBits) - 1;
static const uint64_t kWaiterShift = kWaiterBits;
static const uint64_t kWaiterMask = ((1ull << kWaiterBits) - 1)
<< kWaiterShift;
static const uint64_t kWaiterInc = 1ull << kWaiterShift;
static const uint64_t kSignalShift = 2 * kWaiterBits;
static const uint64_t kSignalMask = ((1ull << kWaiterBits) - 1)
<< kSignalShift;
static const uint64_t kSignalInc = 1ull << kSignalShift;
static const uint64_t kEpochShift = 3 * kWaiterBits;
static const uint64_t kEpochBits = 64 - kEpochShift;
static const uint64_t kEpochMask = ((1ull << kEpochBits) - 1) << kEpochShift;
static const uint64_t kEpochInc = 1ull << kEpochShift;
std::atomic<uint64_t> state_;
MaxSizeVector<Waiter>& waiters_;
static void CheckState(uint64_t state, bool waiter = false) {
static_assert(kEpochBits >= 20, "not enough bits to prevent ABA problem");
const uint64_t waiters = (state & kWaiterMask) >> kWaiterShift;
const uint64_t signals = (state & kSignalMask) >> kSignalShift;
eigen_plain_assert(waiters >= signals);
eigen_plain_assert(waiters < (1 << kWaiterBits) - 1);
eigen_plain_assert(!waiter || waiters > 0);
(void)waiters;
(void)signals;
}
void Park(Waiter* w) {
std::unique_lock<std::mutex> lock(w->mu);
while (w->state != Waiter::kSignaled) {
w->state = Waiter::kWaiting;
w->cv.wait(lock);
}
}
void Unpark(Waiter* w) {
for (Waiter* next; w; w = next) {
uint64_t wnext = w->next.load(std::memory_order_relaxed) & kStackMask;
next = wnext == kStackMask ? nullptr : &waiters_[wnext];
unsigned state;
{
std::unique_lock<std::mutex> lock(w->mu);
state = w->state;
w->state = Waiter::kSignaled;
}
// Avoid notifying if it wasn't waiting.
if (state == Waiter::kWaiting) w->cv.notify_one();
}
}
EventCount(const EventCount&) = delete;
void operator=(const EventCount&) = delete;
};
} // namespace Eigen
#endif // EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H_
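// Illustrative sketch (not part of this header): the waiter/notifier protocol
// documented above, with a single Waiter slot. The predicate is an assumption
// made only for this example; real code would check its own wait condition.
#include <atomic>
#include <thread>

inline void EventCountExample() {
  std::atomic<bool> predicate(false);
  Eigen::MaxSizeVector<Eigen::EventCount::Waiter> waiters(1);
  waiters.resize(1);
  Eigen::EventCount ec(waiters);

  std::thread waiter([&] {
    if (predicate.load()) return;  // fast path: nothing to wait for
    ec.Prewait();
    if (predicate.load()) {
      ec.CancelWait();             // predicate flipped while preparing to wait
    } else {
      ec.CommitWait(&waiters[0]);  // blocks until Notify()
    }
  });

  predicate.store(true);           // publish the state change first,
  ec.Notify(/*notifyAll=*/true);   // then wake any committed waiter
  waiter.join();
}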

View File

@@ -0,0 +1,486 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H
#define EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H
namespace Eigen {
template <typename Environment>
class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
public:
typedef typename Environment::Task Task;
typedef RunQueue<Task, 1024> Queue;
ThreadPoolTempl(int num_threads, Environment env = Environment())
: ThreadPoolTempl(num_threads, true, env) {}
ThreadPoolTempl(int num_threads, bool allow_spinning,
Environment env = Environment())
: env_(env),
num_threads_(num_threads),
allow_spinning_(allow_spinning),
thread_data_(num_threads),
all_coprimes_(num_threads),
waiters_(num_threads),
global_steal_partition_(EncodePartition(0, num_threads_)),
blocked_(0),
spinning_(0),
done_(false),
cancelled_(false),
ec_(waiters_) {
waiters_.resize(num_threads_);
// Calculate coprimes of all numbers [1, num_threads].
// Coprimes are used for random walks over all threads in Steal
// and NonEmptyQueueIndex. Iteration is based on the fact that if we take
// a random starting thread index t and calculate num_threads - 1 subsequent
// indices as (t + coprime) % num_threads, we will cover all threads without
// repetitions (effectively getting a pseudo-random permutation of thread
// indices).
eigen_plain_assert(num_threads_ < kMaxThreads);
for (int i = 1; i <= num_threads_; ++i) {
all_coprimes_.emplace_back(i);
ComputeCoprimes(i, &all_coprimes_.back());
}
#ifndef EIGEN_THREAD_LOCAL
init_barrier_.reset(new Barrier(num_threads_));
#endif
thread_data_.resize(num_threads_);
for (int i = 0; i < num_threads_; i++) {
SetStealPartition(i, EncodePartition(0, num_threads_));
thread_data_[i].thread.reset(
env_.CreateThread([this, i]() { WorkerLoop(i); }));
}
#ifndef EIGEN_THREAD_LOCAL
// Wait for workers to initialize per_thread_map_. Otherwise we might race
// with them in Schedule or CurrentThreadId.
init_barrier_->Wait();
#endif
}
~ThreadPoolTempl() {
done_ = true;
// Now if all threads block without work, they will start exiting.
// But note that threads can continue to work arbitrary long,
// block, submit new work, unblock and otherwise live full life.
if (!cancelled_) {
ec_.Notify(true);
} else {
// Since we were cancelled, there might be entries in the queues.
// Empty them to prevent their destructor from asserting.
for (size_t i = 0; i < thread_data_.size(); i++) {
thread_data_[i].queue.Flush();
}
}
// Join threads explicitly (by destroying) to avoid destruction order within
// this class.
for (size_t i = 0; i < thread_data_.size(); ++i)
thread_data_[i].thread.reset();
}
void SetStealPartitions(const std::vector<std::pair<unsigned, unsigned>>& partitions) {
eigen_plain_assert(partitions.size() == static_cast<std::size_t>(num_threads_));
// Pass this information to each thread queue.
for (int i = 0; i < num_threads_; i++) {
const auto& pair = partitions[i];
unsigned start = pair.first, end = pair.second;
AssertBounds(start, end);
unsigned val = EncodePartition(start, end);
SetStealPartition(i, val);
}
}
void Schedule(std::function<void()> fn) EIGEN_OVERRIDE {
ScheduleWithHint(std::move(fn), 0, num_threads_);
}
void ScheduleWithHint(std::function<void()> fn, int start,
int limit) override {
Task t = env_.CreateTask(std::move(fn));
PerThread* pt = GetPerThread();
if (pt->pool == this) {
// Worker thread of this pool, push onto the thread's queue.
Queue& q = thread_data_[pt->thread_id].queue;
t = q.PushFront(std::move(t));
} else {
// A free-standing thread (or worker of another pool), push onto a random
// queue.
eigen_plain_assert(start < limit);
eigen_plain_assert(limit <= num_threads_);
int num_queues = limit - start;
int rnd = Rand(&pt->rand) % num_queues;
eigen_plain_assert(start + rnd < limit);
Queue& q = thread_data_[start + rnd].queue;
t = q.PushBack(std::move(t));
}
// Note: below we touch `this` after making `t` available to worker threads.
// Strictly speaking, this can lead to a racy use-after-free. Consider that
// Schedule is called from a thread that is neither the main thread nor a
// worker thread of this pool. Then, execution of `t` directly or indirectly
// completes the overall computation, which in turn leads to destruction of
// `this`. We expect that such a scenario is prevented by the program, that
// is, `this` is kept alive while any thread can potentially be in Schedule.
if (!t.f) {
ec_.Notify(false);
} else {
env_.ExecuteTask(t); // Push failed, execute directly.
}
}
void Cancel() EIGEN_OVERRIDE {
cancelled_ = true;
done_ = true;
// Let each thread know it's been cancelled.
#ifdef EIGEN_THREAD_ENV_SUPPORTS_CANCELLATION
for (size_t i = 0; i < thread_data_.size(); i++) {
thread_data_[i].thread->OnCancel();
}
#endif
// Wake up the threads without work to let them exit on their own.
ec_.Notify(true);
}
int NumThreads() const EIGEN_FINAL { return num_threads_; }
int CurrentThreadId() const EIGEN_FINAL {
const PerThread* pt = const_cast<ThreadPoolTempl*>(this)->GetPerThread();
if (pt->pool == this) {
return pt->thread_id;
} else {
return -1;
}
}
private:
// Create a single atomic<int> that encodes start and limit information for
// each thread.
// We expect num_threads_ < 65536, so we can store them in a single
// std::atomic<unsigned>.
// Exposed publicly as static functions so that external callers can reuse
// this encode/decode logic for maintaining their own thread-safe copies of
// scheduling and steal domain(s).
static const int kMaxPartitionBits = 16;
static const int kMaxThreads = 1 << kMaxPartitionBits;
inline unsigned EncodePartition(unsigned start, unsigned limit) {
return (start << kMaxPartitionBits) | limit;
}
inline void DecodePartition(unsigned val, unsigned* start, unsigned* limit) {
*limit = val & (kMaxThreads - 1);
val >>= kMaxPartitionBits;
*start = val;
}
void AssertBounds(int start, int end) {
eigen_plain_assert(start >= 0);
eigen_plain_assert(start < end); // non-zero sized partition
eigen_plain_assert(end <= num_threads_);
}
inline void SetStealPartition(size_t i, unsigned val) {
thread_data_[i].steal_partition.store(val, std::memory_order_relaxed);
}
inline unsigned GetStealPartition(int i) {
return thread_data_[i].steal_partition.load(std::memory_order_relaxed);
}
void ComputeCoprimes(int N, MaxSizeVector<unsigned>* coprimes) {
for (int i = 1; i <= N; i++) {
unsigned a = i;
unsigned b = N;
// If GCD(a, b) == 1, then a and b are coprimes.
while (b != 0) {
unsigned tmp = a;
a = b;
b = tmp % b;
}
if (a == 1) {
coprimes->push_back(i);
}
}
}
typedef typename Environment::EnvThread Thread;
struct PerThread {
constexpr PerThread() : pool(NULL), rand(0), thread_id(-1) {}
ThreadPoolTempl* pool; // Parent pool, or null for normal threads.
uint64_t rand; // Random generator state.
int thread_id; // Worker thread index in pool.
#ifndef EIGEN_THREAD_LOCAL
// Prevent false sharing.
char pad_[128];
#endif
};
struct ThreadData {
constexpr ThreadData() : thread(), steal_partition(0), queue() {}
std::unique_ptr<Thread> thread;
std::atomic<unsigned> steal_partition;
Queue queue;
};
Environment env_;
const int num_threads_;
const bool allow_spinning_;
MaxSizeVector<ThreadData> thread_data_;
MaxSizeVector<MaxSizeVector<unsigned>> all_coprimes_;
MaxSizeVector<EventCount::Waiter> waiters_;
unsigned global_steal_partition_;
std::atomic<unsigned> blocked_;
std::atomic<bool> spinning_;
std::atomic<bool> done_;
std::atomic<bool> cancelled_;
EventCount ec_;
#ifndef EIGEN_THREAD_LOCAL
std::unique_ptr<Barrier> init_barrier_;
std::mutex per_thread_map_mutex_; // Protects per_thread_map_.
std::unordered_map<uint64_t, std::unique_ptr<PerThread>> per_thread_map_;
#endif
// Main worker thread loop.
void WorkerLoop(int thread_id) {
#ifndef EIGEN_THREAD_LOCAL
std::unique_ptr<PerThread> new_pt(new PerThread());
per_thread_map_mutex_.lock();
bool insertOK = per_thread_map_.emplace(GlobalThreadIdHash(), std::move(new_pt)).second;
eigen_plain_assert(insertOK);
EIGEN_UNUSED_VARIABLE(insertOK);
per_thread_map_mutex_.unlock();
init_barrier_->Notify();
init_barrier_->Wait();
#endif
PerThread* pt = GetPerThread();
pt->pool = this;
pt->rand = GlobalThreadIdHash();
pt->thread_id = thread_id;
Queue& q = thread_data_[thread_id].queue;
EventCount::Waiter* waiter = &waiters_[thread_id];
// TODO(dvyukov,rmlarsen): The time spent in NonEmptyQueueIndex() is
// proportional to num_threads_ and we assume that new work is scheduled at
// a constant rate, so we set spin_count to 5000 / num_threads_. The
// constant was picked based on a fair dice roll, tune it.
const int spin_count =
allow_spinning_ && num_threads_ > 0 ? 5000 / num_threads_ : 0;
if (num_threads_ == 1) {
// For num_threads_ == 1 there is no point in going through the expensive
// steal loop. Moreover, since NonEmptyQueueIndex() calls PopBack() on the
// victim queues it might reverse the order in which ops are executed
// compared to the order in which they are scheduled, which tends to be
// counter-productive for the types of I/O workloads the single thread
// pools tend to be used for.
while (!cancelled_) {
Task t = q.PopFront();
for (int i = 0; i < spin_count && !t.f; i++) {
if (!cancelled_.load(std::memory_order_relaxed)) {
t = q.PopFront();
}
}
if (!t.f) {
if (!WaitForWork(waiter, &t)) {
return;
}
}
if (t.f) {
env_.ExecuteTask(t);
}
}
} else {
while (!cancelled_) {
Task t = q.PopFront();
if (!t.f) {
t = LocalSteal();
if (!t.f) {
t = GlobalSteal();
if (!t.f) {
// Leave one thread spinning. This reduces latency.
if (allow_spinning_ && !spinning_ && !spinning_.exchange(true)) {
for (int i = 0; i < spin_count && !t.f; i++) {
if (!cancelled_.load(std::memory_order_relaxed)) {
t = GlobalSteal();
} else {
return;
}
}
spinning_ = false;
}
if (!t.f) {
if (!WaitForWork(waiter, &t)) {
return;
}
}
}
}
}
if (t.f) {
env_.ExecuteTask(t);
}
}
}
}
// Steal tries to steal work from other worker threads in the range [start,
// limit) in best-effort manner.
Task Steal(unsigned start, unsigned limit) {
PerThread* pt = GetPerThread();
const size_t size = limit - start;
unsigned r = Rand(&pt->rand);
// Reduce r into [0, size) range, this utilizes trick from
// https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
eigen_plain_assert(all_coprimes_[size - 1].size() < (1<<30));
unsigned victim = ((uint64_t)r * (uint64_t)size) >> 32;
unsigned index = ((uint64_t) all_coprimes_[size - 1].size() * (uint64_t)r) >> 32;
unsigned inc = all_coprimes_[size - 1][index];
for (unsigned i = 0; i < size; i++) {
eigen_plain_assert(start + victim < limit);
Task t = thread_data_[start + victim].queue.PopBack();
if (t.f) {
return t;
}
victim += inc;
if (victim >= size) {
victim -= size;
}
}
return Task();
}
// Steals work within threads belonging to the partition.
Task LocalSteal() {
PerThread* pt = GetPerThread();
unsigned partition = GetStealPartition(pt->thread_id);
// If thread steal partition is the same as global partition, there is no
// need to go through the steal loop twice.
if (global_steal_partition_ == partition) return Task();
unsigned start, limit;
DecodePartition(partition, &start, &limit);
AssertBounds(start, limit);
return Steal(start, limit);
}
// Steals work from any other thread in the pool.
Task GlobalSteal() {
return Steal(0, num_threads_);
}
// WaitForWork blocks until new work is available (returns true), or if it is
// time to exit (returns false). Can optionally return a task to execute in t
// (in such case t.f != nullptr on return).
bool WaitForWork(EventCount::Waiter* waiter, Task* t) {
eigen_plain_assert(!t->f);
// We already did best-effort emptiness check in Steal, so prepare for
// blocking.
ec_.Prewait();
// Now do a reliable emptiness check.
int victim = NonEmptyQueueIndex();
if (victim != -1) {
ec_.CancelWait();
if (cancelled_) {
return false;
} else {
*t = thread_data_[victim].queue.PopBack();
return true;
}
}
// Number of blocked threads is used as termination condition.
// If we are shutting down and all worker threads are blocked without work,
// then we are done.
blocked_++;
// TODO is blocked_ required to be unsigned?
if (done_ && blocked_ == static_cast<unsigned>(num_threads_)) {
ec_.CancelWait();
// Almost done, but need to re-check queues.
// Consider that all queues are empty and all worker threads are preempted
// right after incrementing blocked_ above. Now a free-standing thread
// submits work and calls destructor (which sets done_). If we don't
// re-check queues, we will exit leaving the work unexecuted.
if (NonEmptyQueueIndex() != -1) {
// Note: we must not pop from queues before we decrement blocked_,
// otherwise the following scenario is possible. Consider that instead
// of checking for emptiness we popped the only element from queues.
// Now other worker threads can start exiting, which is bad if the
// work item submits other work. So we just check emptiness here,
// which ensures that all worker threads exit at the same time.
blocked_--;
return true;
}
// Reached stable termination state.
ec_.Notify(true);
return false;
}
ec_.CommitWait(waiter);
blocked_--;
return true;
}
int NonEmptyQueueIndex() {
PerThread* pt = GetPerThread();
// We intentionally design NonEmptyQueueIndex to steal work from
// anywhere in the queue so threads don't block in WaitForWork() forever
// when all threads in their partition go to sleep. Steal is still local.
const size_t size = thread_data_.size();
unsigned r = Rand(&pt->rand);
unsigned inc = all_coprimes_[size - 1][r % all_coprimes_[size - 1].size()];
unsigned victim = r % size;
for (unsigned i = 0; i < size; i++) {
if (!thread_data_[victim].queue.Empty()) {
return victim;
}
victim += inc;
if (victim >= size) {
victim -= size;
}
}
return -1;
}
static EIGEN_STRONG_INLINE uint64_t GlobalThreadIdHash() {
return std::hash<std::thread::id>()(std::this_thread::get_id());
}
EIGEN_STRONG_INLINE PerThread* GetPerThread() {
#ifndef EIGEN_THREAD_LOCAL
static PerThread dummy;
auto it = per_thread_map_.find(GlobalThreadIdHash());
if (it == per_thread_map_.end()) {
return &dummy;
} else {
return it->second.get();
}
#else
EIGEN_THREAD_LOCAL PerThread per_thread_;
PerThread* pt = &per_thread_;
return pt;
#endif
}
static EIGEN_STRONG_INLINE unsigned Rand(uint64_t* state) {
uint64_t current = *state;
// Update the internal state
*state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
// Generate the random output (using the PCG-XSH-RS scheme)
return static_cast<unsigned>((current ^ (current >> 22)) >>
(22 + (current >> 61)));
}
};
typedef ThreadPoolTempl<StlThreadEnvironment> ThreadPool;
} // namespace Eigen
#endif // EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H
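// Illustrative usage sketch (not part of this header): schedule a batch of
// closures on the pool and wait for their completion with a Barrier. The
// constants and the helper function name are made up for the example.
#include <atomic>

inline void ThreadPoolExample() {
  constexpr int kTasks = 8;
  Eigen::ThreadPool pool(/*num_threads=*/4);
  Eigen::Barrier barrier(kTasks);
  std::atomic<int> sum(0);
  for (int i = 0; i < kTasks; ++i) {
    pool.Schedule([i, &sum, &barrier] {
      sum.fetch_add(i, std::memory_order_relaxed);
      barrier.Notify();  // report this task's completion
    });
  }
  barrier.Wait();  // all kTasks closures have run at this point
  // sum now holds 0 + 1 + ... + 7 = 28.
}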

View File

@@ -0,0 +1,236 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_
#define EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_
namespace Eigen {
// RunQueue is a fixed-size, partially non-blocking deque of Work items.
// Operations on front of the queue must be done by a single thread (owner),
// operations on back of the queue can be done by multiple threads concurrently.
//
// Algorithm outline:
// All remote threads operating on the queue back are serialized by a mutex.
// This ensures that at most two threads access state: owner and one remote
// thread (Size aside). The algorithm ensures that the occupied region of the
// underlying array is logically contiguous (it can wrap around, but no stray
// occupied elements). Owner operates on one end of this region, remote thread
// operates on the other end. Synchronization between these threads
// (potential consumption of the last element and take up of the last empty
// element) happens by means of state variable in each element. States are:
// empty, busy (in process of insertion or removal) and ready. Threads claim
// elements (empty->busy and ready->busy transitions) by means of a CAS
// operation. The finishing transitions (busy->empty and busy->ready) are done
// with plain store as the element is exclusively owned by the current thread.
//
// Note: we could permit only pointers as elements, then we would not need
// separate state variable as null/non-null pointer value would serve as state,
// but that would require malloc/free per operation for large, complex values
// (and this is designed to store std::function<void()>).
template <typename Work, unsigned kSize>
class RunQueue {
public:
RunQueue() : front_(0), back_(0) {
// require power-of-two for fast masking
eigen_plain_assert((kSize & (kSize - 1)) == 0);
eigen_plain_assert(kSize > 2); // why would you do this?
eigen_plain_assert(kSize <= (64 << 10)); // leave enough space for counter
for (unsigned i = 0; i < kSize; i++)
array_[i].state.store(kEmpty, std::memory_order_relaxed);
}
~RunQueue() { eigen_plain_assert(Size() == 0); }
// PushFront inserts w at the beginning of the queue.
// If queue is full returns w, otherwise returns default-constructed Work.
Work PushFront(Work w) {
unsigned front = front_.load(std::memory_order_relaxed);
Elem* e = &array_[front & kMask];
uint8_t s = e->state.load(std::memory_order_relaxed);
if (s != kEmpty ||
!e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
return w;
front_.store(front + 1 + (kSize << 1), std::memory_order_relaxed);
e->w = std::move(w);
e->state.store(kReady, std::memory_order_release);
return Work();
}
// PopFront removes and returns the first element in the queue.
// If the queue was empty returns default-constructed Work.
Work PopFront() {
unsigned front = front_.load(std::memory_order_relaxed);
Elem* e = &array_[(front - 1) & kMask];
uint8_t s = e->state.load(std::memory_order_relaxed);
if (s != kReady ||
!e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
return Work();
Work w = std::move(e->w);
e->state.store(kEmpty, std::memory_order_release);
front = ((front - 1) & kMask2) | (front & ~kMask2);
front_.store(front, std::memory_order_relaxed);
return w;
}
// PushBack adds w at the end of the queue.
// If queue is full returns w, otherwise returns default-constructed Work.
Work PushBack(Work w) {
std::unique_lock<std::mutex> lock(mutex_);
unsigned back = back_.load(std::memory_order_relaxed);
Elem* e = &array_[(back - 1) & kMask];
uint8_t s = e->state.load(std::memory_order_relaxed);
if (s != kEmpty ||
!e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
return w;
back = ((back - 1) & kMask2) | (back & ~kMask2);
back_.store(back, std::memory_order_relaxed);
e->w = std::move(w);
e->state.store(kReady, std::memory_order_release);
return Work();
}
// PopBack removes and returns the last element in the queue.
// If the queue was empty returns default-constructed Work.
Work PopBack() {
if (Empty()) return Work();
std::unique_lock<std::mutex> lock(mutex_);
unsigned back = back_.load(std::memory_order_relaxed);
Elem* e = &array_[back & kMask];
uint8_t s = e->state.load(std::memory_order_relaxed);
if (s != kReady ||
!e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire))
return Work();
Work w = std::move(e->w);
e->state.store(kEmpty, std::memory_order_release);
back_.store(back + 1 + (kSize << 1), std::memory_order_relaxed);
return w;
}
// PopBackHalf removes and returns up to half of the last elements in the queue.
// Returns the number of elements removed.
unsigned PopBackHalf(std::vector<Work>* result) {
if (Empty()) return 0;
std::unique_lock<std::mutex> lock(mutex_);
unsigned back = back_.load(std::memory_order_relaxed);
unsigned size = Size();
unsigned mid = back;
if (size > 1) mid = back + (size - 1) / 2;
unsigned n = 0;
unsigned start = 0;
for (; static_cast<int>(mid - back) >= 0; mid--) {
Elem* e = &array_[mid & kMask];
uint8_t s = e->state.load(std::memory_order_relaxed);
if (n == 0) {
if (s != kReady || !e->state.compare_exchange_strong(
s, kBusy, std::memory_order_acquire))
continue;
start = mid;
} else {
// Note: no need to store a temporary kBusy, we exclusively own these
// elements.
eigen_plain_assert(s == kReady);
}
result->push_back(std::move(e->w));
e->state.store(kEmpty, std::memory_order_release);
n++;
}
if (n != 0)
back_.store(start + 1 + (kSize << 1), std::memory_order_relaxed);
return n;
}
// Size returns current queue size.
// Can be called by any thread at any time.
unsigned Size() const { return SizeOrNotEmpty<true>(); }
// Empty tests whether container is empty.
// Can be called by any thread at any time.
bool Empty() const { return SizeOrNotEmpty<false>() == 0; }
// Delete all the elements from the queue.
void Flush() {
while (!Empty()) {
PopFront();
}
}
private:
static const unsigned kMask = kSize - 1;
static const unsigned kMask2 = (kSize << 1) - 1;
struct Elem {
std::atomic<uint8_t> state;
Work w;
};
enum {
kEmpty,
kBusy,
kReady,
};
std::mutex mutex_;
// Low log(kSize) + 1 bits in front_ and back_ contain rolling index of
// front/back, respectively. The remaining bits contain modification counters
// that are incremented on Push operations. This allows us to (1) distinguish
// between empty and full conditions (if we would use log(kSize) bits for
// position, these conditions would be indistinguishable); (2) obtain
// consistent snapshot of front_/back_ for Size operation using the
// modification counters.
std::atomic<unsigned> front_;
std::atomic<unsigned> back_;
Elem array_[kSize];
// SizeOrNotEmpty returns current queue size; if NeedSizeEstimate is false,
// only whether the size is 0 is guaranteed to be correct.
// Can be called by any thread at any time.
template<bool NeedSizeEstimate>
unsigned SizeOrNotEmpty() const {
// Emptiness plays critical role in thread pool blocking. So we go to great
// effort to not produce false positives (claim non-empty queue as empty).
unsigned front = front_.load(std::memory_order_acquire);
for (;;) {
// Capture a consistent snapshot of front/tail.
unsigned back = back_.load(std::memory_order_acquire);
unsigned front1 = front_.load(std::memory_order_relaxed);
if (front != front1) {
front = front1;
std::atomic_thread_fence(std::memory_order_acquire);
continue;
}
if (NeedSizeEstimate) {
return CalculateSize(front, back);
} else {
// This value will be 0 if the queue is empty, and undefined otherwise.
unsigned maybe_zero = ((front ^ back) & kMask2);
// Queue size estimate must agree with maybe zero check on the queue
// empty/non-empty state.
eigen_assert((CalculateSize(front, back) == 0) == (maybe_zero == 0));
return maybe_zero;
}
}
}
EIGEN_ALWAYS_INLINE
unsigned CalculateSize(unsigned front, unsigned back) const {
int size = (front & kMask2) - (back & kMask2);
// Fix overflow.
if (size < 0) size += 2 * kSize;
// Order of modification in push/pop is crafted to make the queue look
// larger than it is during concurrent modifications. E.g. push can
// increment size before the corresponding pop has decremented it.
// So the computed size can be up to kSize + 1, fix it.
if (size > static_cast<int>(kSize)) size = kSize;
return static_cast<unsigned>(size);
}
RunQueue(const RunQueue&) = delete;
void operator=(const RunQueue&) = delete;
};
} // namespace Eigen
#endif // EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_
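// Illustrative sketch (not part of this header): the owner thread works on the
// front of the queue, any other thread may steal from the back. The function
// name is made up for the example.
inline void RunQueueExample() {
  Eigen::RunQueue<int, 64> q;  // Work type int, power-of-two capacity

  // Owner thread only:
  q.PushFront(1);
  q.PushFront(2);
  int newest = q.PopFront();   // 2: the front behaves like a LIFO stack

  // Any thread (serialized internally by a mutex):
  int oldest = q.PopBack();    // 1: stealing takes the oldest item

  (void)newest;
  (void)oldest;
}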

View File

@@ -0,0 +1,23 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H
#define EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H
// Try to come up with a portable way to cancel a thread
#if EIGEN_OS_GNULINUX
#define EIGEN_THREAD_CANCEL(t) \
pthread_cancel(t.native_handle());
#define EIGEN_SUPPORTS_THREAD_CANCELLATION 1
#else
#define EIGEN_THREAD_CANCEL(t)
#endif
#endif // EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H

View File

@@ -0,0 +1,40 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H
#define EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H
namespace Eigen {
struct StlThreadEnvironment {
struct Task {
std::function<void()> f;
};
// EnvThread constructor must start the thread,
// destructor must join the thread.
class EnvThread {
public:
EnvThread(std::function<void()> f) : thr_(std::move(f)) {}
~EnvThread() { thr_.join(); }
// This function is called when the threadpool is cancelled.
void OnCancel() { }
private:
std::thread thr_;
};
EnvThread* CreateThread(std::function<void()> f) { return new EnvThread(std::move(f)); }
Task CreateTask(std::function<void()> f) { return Task{std::move(f)}; }
void ExecuteTask(const Task& t) { t.f(); }
};
} // namespace Eigen
#endif // EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H
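// Illustrative sketch (not part of this header): a custom environment that
// counts how many tasks were created. CountingEnvironment is hypothetical; it
// reuses the StlThreadEnvironment types and only wraps CreateTask.
#include <atomic>
#include <functional>
#include <memory>

struct CountingEnvironment : Eigen::StlThreadEnvironment {
  std::shared_ptr<std::atomic<int>> created =
      std::make_shared<std::atomic<int>>(0);
  Task CreateTask(std::function<void()> f) {
    created->fetch_add(1, std::memory_order_relaxed);  // count every closure
    return Task{std::move(f)};
  }
};
// Possible use with the pool from NonBlockingThreadPool.h (the counter is
// shared between `env` and the pool's internal copy):
//   CountingEnvironment env;
//   Eigen::ThreadPoolTempl<CountingEnvironment> pool(4, env);
//   pool.Schedule([] { /* work */ });
//   // ... later, env.created->load() gives the number of created tasks.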

View File

@@ -0,0 +1,301 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
#define EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
#ifdef EIGEN_AVOID_THREAD_LOCAL
#ifdef EIGEN_THREAD_LOCAL
#undef EIGEN_THREAD_LOCAL
#endif
#else
#if EIGEN_MAX_CPP_VER >= 11 && \
((EIGEN_COMP_GNUC && EIGEN_GNUC_AT_LEAST(4, 8)) || \
__has_feature(cxx_thread_local) || \
(EIGEN_COMP_MSVC >= 1900) )
#define EIGEN_THREAD_LOCAL static thread_local
#endif
// Disable TLS for Apple and Android builds with older toolchains.
#if defined(__APPLE__)
// Included for TARGET_OS_IPHONE, __IPHONE_OS_VERSION_MIN_REQUIRED,
// __IPHONE_8_0.
#include <Availability.h>
#include <TargetConditionals.h>
#endif
// Checks whether C++11's `thread_local` storage duration specifier is
// supported.
#if defined(__apple_build_version__) && \
((__apple_build_version__ < 8000042) || \
(TARGET_OS_IPHONE && __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_9_0))
// Notes: Xcode's clang did not support `thread_local` until version
// 8, and even then not for all iOS < 9.0.
#undef EIGEN_THREAD_LOCAL
#elif defined(__ANDROID__) && EIGEN_COMP_CLANG
// There are platforms for which TLS should not be used even though the compiler
// makes it seem like it's supported (Android NDK < r12b for example).
// This is primarily because of linker problems and toolchain misconfiguration:
// TLS isn't supported until NDK r12b per
// https://developer.android.com/ndk/downloads/revision_history.html
// Since NDK r16, `__NDK_MAJOR__` and `__NDK_MINOR__` are defined in
// <android/ndk-version.h>. For NDK < r16, users should define these macros,
// e.g. `-D__NDK_MAJOR__=11 -D__NDK_MINOR__=0` for NDK r11.
#if __has_include(<android/ndk-version.h>)
#include <android/ndk-version.h>
#endif // __has_include(<android/ndk-version.h>)
#if defined(__ANDROID__) && defined(__clang__) && defined(__NDK_MAJOR__) && \
defined(__NDK_MINOR__) && \
((__NDK_MAJOR__ < 12) || ((__NDK_MAJOR__ == 12) && (__NDK_MINOR__ < 1)))
#undef EIGEN_THREAD_LOCAL
#endif
#endif // defined(__ANDROID__) && defined(__clang__)
#endif // EIGEN_AVOID_THREAD_LOCAL
namespace Eigen {
namespace internal {
template <typename T>
struct ThreadLocalNoOpInitialize {
void operator()(T&) const {}
};
template <typename T>
struct ThreadLocalNoOpRelease {
void operator()(T&) const {}
};
} // namespace internal
// Thread local container for elements of type T that does not use thread local
// storage. As long as the number of unique threads accessing this storage
// is smaller than `capacity_`, it is lock-free and wait-free. Otherwise it will
// use a mutex for synchronization.
//
// Type `T` has to be default constructible, and by default each thread will get
// a default constructed value. It is possible to specify a custom `initialize`
// callable that will be called lazily from each thread accessing this object
// and will be passed a default initialized object of type `T`. It is also
// possible to pass a custom `release` callable that will be invoked before
// calling ~T().
//
// Example:
//
// struct Counter {
// int value = 0;
// };
//
// Eigen::ThreadLocal<Counter> counter(10);
//
// // Each thread will have access to its own counter object.
// Counter& cnt = counter.local();
// cnt.value++;
//
// WARNING: Eigen::ThreadLocal uses the OS-specific value returned by
// std::this_thread::get_id() to identify threads. This value is not guaranteed
// to be unique except for the life of the thread. A newly created thread may
// get an OS-specific ID equal to that of an already destroyed thread.
//
// Somewhat similar to TBB thread local storage, with similar restrictions:
// https://www.threadingbuildingblocks.org/docs/help/reference/thread_local_storage/enumerable_thread_specific_cls.html
//
template <typename T,
typename Initialize = internal::ThreadLocalNoOpInitialize<T>,
typename Release = internal::ThreadLocalNoOpRelease<T>>
class ThreadLocal {
// We preallocate default constructed elements in MaxSizeVector.
static_assert(std::is_default_constructible<T>::value,
"ThreadLocal data type must be default constructible");
public:
explicit ThreadLocal(int capacity)
: ThreadLocal(capacity, internal::ThreadLocalNoOpInitialize<T>(),
internal::ThreadLocalNoOpRelease<T>()) {}
ThreadLocal(int capacity, Initialize initialize)
: ThreadLocal(capacity, std::move(initialize),
internal::ThreadLocalNoOpRelease<T>()) {}
ThreadLocal(int capacity, Initialize initialize, Release release)
: initialize_(std::move(initialize)),
release_(std::move(release)),
capacity_(capacity),
data_(capacity_),
ptr_(capacity_),
filled_records_(0) {
eigen_assert(capacity_ >= 0);
data_.resize(capacity_);
for (int i = 0; i < capacity_; ++i) {
ptr_.emplace_back(nullptr);
}
}
T& local() {
std::thread::id this_thread = std::this_thread::get_id();
if (capacity_ == 0) return SpilledLocal(this_thread);
std::size_t h = std::hash<std::thread::id>()(this_thread);
const int start_idx = h % capacity_;
// NOTE: From the definition of `std::this_thread::get_id()` it is
// guaranteed that we never can have concurrent insertions with the same key
// to our hash-map like data structure. If we didn't find an element during
// the initial traversal, it's guaranteed that no one else could have
// inserted it while we are in this function. This allows us to massively
// simplify our lock-free insert-only hash map.
// Check if we already have an element for `this_thread`.
int idx = start_idx;
while (ptr_[idx].load() != nullptr) {
ThreadIdAndValue& record = *(ptr_[idx].load());
if (record.thread_id == this_thread) return record.value;
idx += 1;
if (idx >= capacity_) idx -= capacity_;
if (idx == start_idx) break;
}
// If we are here, it means that we found an insertion point in the lookup
// table at `idx`, or we did a full traversal and the table is full.
// If the lock-free storage is full, fall back on the mutex.
if (filled_records_.load() >= capacity_) return SpilledLocal(this_thread);
// We double check that we still have space to insert an element into the
// lock-free storage. If the old value in `filled_records_` is larger than the
// capacity, it means that some other thread added an element while we were
// traversing the lookup table.
int insertion_index =
filled_records_.fetch_add(1, std::memory_order_relaxed);
if (insertion_index >= capacity_) return SpilledLocal(this_thread);
// At this point it's guaranteed that we can access
// data_[insertion_index] without a data race.
data_[insertion_index].thread_id = this_thread;
initialize_(data_[insertion_index].value);
// That's the pointer we'll put into the lookup table.
ThreadIdAndValue* inserted = &data_[insertion_index];
// We'll use nullptr pointer to ThreadIdAndValue in a compare-and-swap loop.
ThreadIdAndValue* empty = nullptr;
// Now we have to find an insertion point into the lookup table. We start
// from the `idx` that was identified as an insertion point above; it's
// guaranteed that we will have an empty record somewhere in the lookup table
// (because we created a record in `data_`).
const int insertion_idx = idx;
do {
// Always start search from the original insertion candidate.
idx = insertion_idx;
while (ptr_[idx].load() != nullptr) {
idx += 1;
if (idx >= capacity_) idx -= capacity_;
// If we did a full loop, it means that we don't have any free entries
// in the lookup table, and this means that something is terribly wrong.
eigen_assert(idx != insertion_idx);
}
// Atomic CAS of the pointer guarantees that any other thread, that will
// follow this pointer will see all the mutations in the `data_`.
} while (!ptr_[idx].compare_exchange_weak(empty, inserted));
return inserted->value;
}
// WARN: It's not thread safe to call it concurrently with `local()`.
void ForEach(std::function<void(std::thread::id, T&)> f) {
// Reading directly from `data_` is unsafe, because only CAS to the
// record in `ptr_` makes all changes visible to other threads.
for (auto& ptr : ptr_) {
ThreadIdAndValue* record = ptr.load();
if (record == nullptr) continue;
f(record->thread_id, record->value);
}
// We did not spill into the map based storage.
if (filled_records_.load(std::memory_order_relaxed) < capacity_) return;
// Adds a happens before edge from the last call to SpilledLocal().
std::unique_lock<std::mutex> lock(mu_);
for (auto& kv : per_thread_map_) {
f(kv.first, kv.second);
}
}
// WARN: It's not thread safe to call it concurrently with `local()`.
~ThreadLocal() {
// Reading directly from `data_` is unsafe, because only CAS to the record
// in `ptr_` makes all changes visible to other threads.
for (auto& ptr : ptr_) {
ThreadIdAndValue* record = ptr.load();
if (record == nullptr) continue;
release_(record->value);
}
// We did not spill into the map based storage.
if (filled_records_.load(std::memory_order_relaxed) < capacity_) return;
// Adds a happens before edge from the last call to SpilledLocal().
std::unique_lock<std::mutex> lock(mu_);
for (auto& kv : per_thread_map_) {
release_(kv.second);
}
}
private:
struct ThreadIdAndValue {
std::thread::id thread_id;
T value;
};
// Use unordered map guarded by a mutex when lock free storage is full.
T& SpilledLocal(std::thread::id this_thread) {
std::unique_lock<std::mutex> lock(mu_);
auto it = per_thread_map_.find(this_thread);
if (it == per_thread_map_.end()) {
auto result = per_thread_map_.emplace(this_thread, T());
eigen_assert(result.second);
initialize_((*result.first).second);
return (*result.first).second;
} else {
return it->second;
}
}
Initialize initialize_;
Release release_;
const int capacity_;
// Storage that backs the lock-free lookup table `ptr_`. Records are stored in
// this storage contiguously, starting from index 0.
MaxSizeVector<ThreadIdAndValue> data_;
// Atomic pointers to the data stored in `data_`. Used as a lookup table for
// linear probing hash map (https://en.wikipedia.org/wiki/Linear_probing).
MaxSizeVector<std::atomic<ThreadIdAndValue*>> ptr_;
// Number of records stored in the `data_`.
std::atomic<int> filled_records_;
// We fall back on a per-thread map if the lock-free storage is full. In
// practice this should never happen if `capacity_` is a reasonable estimate of
// the number of threads running in the system.
std::mutex mu_; // Protects per_thread_map_.
std::unordered_map<std::thread::id, T> per_thread_map_;
};
} // namespace Eigen
#endif // EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
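// Illustrative usage sketch (not part of this header): a per-thread counter
// with the default initializer. ExampleCounter and the function name are made
// up for the example.
#include <cstdio>
#include <thread>

struct ExampleCounter { int value = 0; };

inline void ThreadLocalExample() {
  // Lock-free for up to 16 distinct threads, then spills to a mutex-guarded map.
  Eigen::ThreadLocal<ExampleCounter> counters(/*capacity=*/16);

  // Each thread bumps its own copy; no synchronization is needed on `value`.
  counters.local().value++;

  // Inspect all per-thread values (not safe concurrently with local()).
  counters.ForEach([](std::thread::id, ExampleCounter& c) {
    std::printf("count = %d\n", c.value);
  });
}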

View File

@@ -0,0 +1,48 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H
#define EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H
namespace Eigen {
// This defines an interface that ThreadPoolDevice can take to use
// custom thread pools underneath.
class ThreadPoolInterface {
public:
// Submits a closure to be run by a thread in the pool.
virtual void Schedule(std::function<void()> fn) = 0;
// Submits a closure to be run by threads in the range [start, end) in the
// pool.
virtual void ScheduleWithHint(std::function<void()> fn, int /*start*/,
int /*end*/) {
// Just defer to Schedule in case sub-classes aren't interested in
// overriding this functionality.
Schedule(fn);
}
// If implemented, stop processing the closures that have been enqueued.
// Currently running closures may still be processed.
// If not implemented, does nothing.
virtual void Cancel() {}
// Returns the number of threads in the pool.
virtual int NumThreads() const = 0;
// Returns a logical thread index between 0 and NumThreads() - 1 if called
// from one of the threads in the pool. Returns -1 otherwise.
virtual int CurrentThreadId() const = 0;
virtual ~ThreadPoolInterface() {}
};
} // namespace Eigen
#endif // EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H
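// Illustrative sketch (not part of this header): a trivial implementation of
// the interface that runs every closure inline on the calling thread, e.g. as
// a stand-in for tests. The class name is made up for the example.
#include <functional>

class InlineThreadPool : public Eigen::ThreadPoolInterface {
 public:
  void Schedule(std::function<void()> fn) override { fn(); }
  int NumThreads() const override { return 1; }
  int CurrentThreadId() const override { return -1; }
};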

View File

@@ -0,0 +1,20 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H
#define EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H
// Try to come up with a portable way to yield
#if EIGEN_COMP_GNUC && EIGEN_GNUC_AT_MOST(4, 7)
#define EIGEN_THREAD_YIELD() sched_yield()
#else
#define EIGEN_THREAD_YIELD() std::this_thread::yield()
#endif
#endif // EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H

View File

@@ -0,0 +1,537 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11META_H
#define EIGEN_CXX11META_H
#include <vector>
#include "EmulateArray.h"
#include "CXX11Workarounds.h"
namespace Eigen {
namespace internal {
/** \internal
* \file CXX11/util/CXX11Meta.h
* This file contains generic metaprogramming classes which are not specifically related to Eigen.
* This file expands upon Core/util/Meta.h and adds support for C++11 specific features.
*/
template<typename... tt>
struct type_list { constexpr static int count = sizeof...(tt); };
template<typename t, typename... tt>
struct type_list<t, tt...> { constexpr static int count = sizeof...(tt) + 1; typedef t first_type; };
template<typename T, T... nn>
struct numeric_list { constexpr static std::size_t count = sizeof...(nn); };
template<typename T, T n, T... nn>
struct numeric_list<T, n, nn...> { static const std::size_t count = sizeof...(nn) + 1; const static T first_value = n; };
#ifndef EIGEN_PARSED_BY_DOXYGEN
/* numeric list constructors
*
* equivalencies:
* constructor result
* typename gen_numeric_list<int, 5>::type numeric_list<int, 0,1,2,3,4>
* typename gen_numeric_list_reversed<int, 5>::type numeric_list<int, 4,3,2,1,0>
* typename gen_numeric_list_swapped_pair<int, 5,1,2>::type numeric_list<int, 0,2,1,3,4>
* typename gen_numeric_list_repeated<int, 5, 0>::type numeric_list<int, 0,0,0,0,0>
*/
template<typename T, std::size_t n, T start = 0, T... ii> struct gen_numeric_list : gen_numeric_list<T, n-1, start, start + n-1, ii...> {};
template<typename T, T start, T... ii> struct gen_numeric_list<T, 0, start, ii...> { typedef numeric_list<T, ii...> type; };
template<typename T, std::size_t n, T start = 0, T... ii> struct gen_numeric_list_reversed : gen_numeric_list_reversed<T, n-1, start, ii..., start + n-1> {};
template<typename T, T start, T... ii> struct gen_numeric_list_reversed<T, 0, start, ii...> { typedef numeric_list<T, ii...> type; };
template<typename T, std::size_t n, T a, T b, T start = 0, T... ii> struct gen_numeric_list_swapped_pair : gen_numeric_list_swapped_pair<T, n-1, a, b, start, (start + n-1) == a ? b : ((start + n-1) == b ? a : (start + n-1)), ii...> {};
template<typename T, T a, T b, T start, T... ii> struct gen_numeric_list_swapped_pair<T, 0, a, b, start, ii...> { typedef numeric_list<T, ii...> type; };
template<typename T, std::size_t n, T V, T... nn> struct gen_numeric_list_repeated : gen_numeric_list_repeated<T, n-1, V, V, nn...> {};
template<typename T, T V, T... nn> struct gen_numeric_list_repeated<T, 0, V, nn...> { typedef numeric_list<T, nn...> type; };
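// Illustrative checks (a sketch, not part of the original header), assuming the
// internal is_same trait is visible at this point; they mirror the table above:
//   static_assert(is_same<gen_numeric_list<int, 3>::type,
//                         numeric_list<int, 0, 1, 2>>::value, "ascending");
//   static_assert(is_same<gen_numeric_list_reversed<int, 3>::type,
//                         numeric_list<int, 2, 1, 0>>::value, "descending");
//   static_assert(is_same<gen_numeric_list_repeated<int, 3, 7>::type,
//                         numeric_list<int, 7, 7, 7>>::value, "repeated");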
/* list manipulation: concatenate */
template<class a, class b> struct concat;
template<typename... as, typename... bs> struct concat<type_list<as...>, type_list<bs...>> { typedef type_list<as..., bs...> type; };
template<typename T, T... as, T... bs> struct concat<numeric_list<T, as...>, numeric_list<T, bs...> > { typedef numeric_list<T, as..., bs...> type; };
template<typename... p> struct mconcat;
template<typename a> struct mconcat<a> { typedef a type; };
template<typename a, typename b> struct mconcat<a, b> : concat<a, b> {};
template<typename a, typename b, typename... cs> struct mconcat<a, b, cs...> : concat<a, typename mconcat<b, cs...>::type> {};
/* list manipulation: extract slices */
template<int n, typename x> struct take;
template<int n, typename a, typename... as> struct take<n, type_list<a, as...>> : concat<type_list<a>, typename take<n-1, type_list<as...>>::type> {};
template<int n> struct take<n, type_list<>> { typedef type_list<> type; };
template<typename a, typename... as> struct take<0, type_list<a, as...>> { typedef type_list<> type; };
template<> struct take<0, type_list<>> { typedef type_list<> type; };
template<typename T, int n, T a, T... as> struct take<n, numeric_list<T, a, as...>> : concat<numeric_list<T, a>, typename take<n-1, numeric_list<T, as...>>::type> {};
template<typename T, int n> struct take<n, numeric_list<T>> { typedef numeric_list<T> type; };
template<typename T, T a, T... as> struct take<0, numeric_list<T, a, as...>> { typedef numeric_list<T> type; };
template<typename T> struct take<0, numeric_list<T>> { typedef numeric_list<T> type; };
template<typename T, int n, T... ii> struct h_skip_helper_numeric;
template<typename T, int n, T i, T... ii> struct h_skip_helper_numeric<T, n, i, ii...> : h_skip_helper_numeric<T, n-1, ii...> {};
template<typename T, T i, T... ii> struct h_skip_helper_numeric<T, 0, i, ii...> { typedef numeric_list<T, i, ii...> type; };
template<typename T, int n> struct h_skip_helper_numeric<T, n> { typedef numeric_list<T> type; };
template<typename T> struct h_skip_helper_numeric<T, 0> { typedef numeric_list<T> type; };
template<int n, typename... tt> struct h_skip_helper_type;
template<int n, typename t, typename... tt> struct h_skip_helper_type<n, t, tt...> : h_skip_helper_type<n-1, tt...> {};
template<typename t, typename... tt> struct h_skip_helper_type<0, t, tt...> { typedef type_list<t, tt...> type; };
template<int n> struct h_skip_helper_type<n> { typedef type_list<> type; };
template<> struct h_skip_helper_type<0> { typedef type_list<> type; };
#endif //not EIGEN_PARSED_BY_DOXYGEN
template<int n>
struct h_skip {
template<typename T, T... ii>
constexpr static EIGEN_STRONG_INLINE typename h_skip_helper_numeric<T, n, ii...>::type helper(numeric_list<T, ii...>) { return typename h_skip_helper_numeric<T, n, ii...>::type(); }
template<typename... tt>
constexpr static EIGEN_STRONG_INLINE typename h_skip_helper_type<n, tt...>::type helper(type_list<tt...>) { return typename h_skip_helper_type<n, tt...>::type(); }
};
template<int n, typename a> struct skip { typedef decltype(h_skip<n>::helper(a())) type; };
template<int start, int count, typename a> struct slice : take<count, typename skip<start, a>::type> {};
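// Illustrative use (a sketch, not part of the original header):
//   skip<1, numeric_list<int, 0, 1, 2, 3>>::type      is numeric_list<int, 1, 2, 3>
//   take<2, numeric_list<int, 1, 2, 3>>::type         is numeric_list<int, 1, 2>
//   slice<1, 2, numeric_list<int, 0, 1, 2, 3>>::type  combines both: numeric_list<int, 1, 2>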
/* list manipulation: retrieve single element from list */
template<int n, typename x> struct get;
template<int n, typename a, typename... as> struct get<n, type_list<a, as...>> : get<n-1, type_list<as...>> {};
template<typename a, typename... as> struct get<0, type_list<a, as...>> { typedef a type; };
template<typename T, int n, T a, T... as> struct get<n, numeric_list<T, a, as...>> : get<n-1, numeric_list<T, as...>> {};
template<typename T, T a, T... as> struct get<0, numeric_list<T, a, as...>> { constexpr static T value = a; };
template<std::size_t n, typename T, T a, T... as> constexpr T array_get(const numeric_list<T, a, as...>&) {
return get<(int)n, numeric_list<T, a, as...>>::value;
}
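// Illustrative use (a sketch, not part of the original header):
//   get<1, numeric_list<int, 4, 8, 15>>::value     is 8
//   array_get<1>(numeric_list<int, 4, 8, 15>())    is 8 as well
//   get<0, type_list<float, double>>::type         is float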
/* always get type, regardless of dummy; good for parameter pack expansion */
template<typename T, T dummy, typename t> struct id_numeric { typedef t type; };
template<typename dummy, typename t> struct id_type { typedef t type; };
/* equality checking, flagged version */
template<typename a, typename b> struct is_same_gf : is_same<a, b> { constexpr static int global_flags = 0; };
/* apply_op to list */
template<
bool from_left, // false
template<typename, typename> class op,
typename additional_param,
typename... values
>
struct h_apply_op_helper { typedef type_list<typename op<values, additional_param>::type...> type; };
template<
template<typename, typename> class op,
typename additional_param,
typename... values
>
struct h_apply_op_helper<true, op, additional_param, values...> { typedef type_list<typename op<additional_param, values>::type...> type; };
template<
bool from_left,
template<typename, typename> class op,
typename additional_param
>
struct h_apply_op
{
template<typename... values>
constexpr static typename h_apply_op_helper<from_left, op, additional_param, values...>::type helper(type_list<values...>)
{ return typename h_apply_op_helper<from_left, op, additional_param, values...>::type(); }
};
template<
template<typename, typename> class op,
typename additional_param,
typename a
>
struct apply_op_from_left { typedef decltype(h_apply_op<true, op, additional_param>::helper(a())) type; };
template<
template<typename, typename> class op,
typename additional_param,
typename a
>
struct apply_op_from_right { typedef decltype(h_apply_op<false, op, additional_param>::helper(a())) type; };
/* see if an element is in a list */
template<
template<typename, typename> class test,
typename check_against,
typename h_list,
bool last_check_positive = false
>
struct contained_in_list;
template<
template<typename, typename> class test,
typename check_against,
typename h_list
>
struct contained_in_list<test, check_against, h_list, true>
{
constexpr static bool value = true;
};
template<
template<typename, typename> class test,
typename check_against,
typename a,
typename... as
>
struct contained_in_list<test, check_against, type_list<a, as...>, false> : contained_in_list<test, check_against, type_list<as...>, test<check_against, a>::value> {};
template<
template<typename, typename> class test,
typename check_against
EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, empty)
>
struct contained_in_list<test, check_against, type_list<EIGEN_TPL_PP_SPEC_HACK_USE(empty)>, false> { constexpr static bool value = false; };
/* see if an element is in a list and check for global flags */
template<
template<typename, typename> class test,
typename check_against,
typename h_list,
int default_flags = 0,
bool last_check_positive = false,
int last_check_flags = default_flags
>
struct contained_in_list_gf;
template<
template<typename, typename> class test,
typename check_against,
typename h_list,
int default_flags,
int last_check_flags
>
struct contained_in_list_gf<test, check_against, h_list, default_flags, true, last_check_flags>
{
constexpr static bool value = true;
constexpr static int global_flags = last_check_flags;
};
template<
template<typename, typename> class test,
typename check_against,
typename a,
typename... as,
int default_flags,
int last_check_flags
>
struct contained_in_list_gf<test, check_against, type_list<a, as...>, default_flags, false, last_check_flags> : contained_in_list_gf<test, check_against, type_list<as...>, default_flags, test<check_against, a>::value, test<check_against, a>::global_flags> {};
template<
template<typename, typename> class test,
typename check_against
EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, empty),
int default_flags,
int last_check_flags
>
struct contained_in_list_gf<test, check_against, type_list<EIGEN_TPL_PP_SPEC_HACK_USE(empty)>, default_flags, false, last_check_flags> { constexpr static bool value = false; constexpr static int global_flags = default_flags; };
/* generic reductions */
template<
typename Reducer,
typename... Ts
> struct reduce;
template<
typename Reducer
> struct reduce<Reducer>
{
EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE int run() { return Reducer::Identity; }
};
template<
typename Reducer,
typename A
> struct reduce<Reducer, A>
{
EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE A run(A a) { return a; }
};
template<
typename Reducer,
typename A,
typename... Ts
> struct reduce<Reducer, A, Ts...>
{
EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(A a, Ts... ts) -> decltype(Reducer::run(a, reduce<Reducer, Ts...>::run(ts...))) {
return Reducer::run(a, reduce<Reducer, Ts...>::run(ts...));
}
};
/* generic binary operations */
struct sum_op {
template<typename A, typename B> EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a + b) { return a + b; }
static constexpr int Identity = 0;
};
struct product_op {
template<typename A, typename B> EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a * b) { return a * b; }
static constexpr int Identity = 1;
};
struct logical_and_op { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a && b) { return a && b; } };
struct logical_or_op { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a || b) { return a || b; } };
struct equal_op { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a == b) { return a == b; } };
struct not_equal_op { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a != b) { return a != b; } };
struct lesser_op { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a < b) { return a < b; } };
struct lesser_equal_op { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a <= b) { return a <= b; } };
struct greater_op { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a > b) { return a > b; } };
struct greater_equal_op { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a >= b) { return a >= b; } };
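// Illustrative use of the reducers together with the operations above (a sketch,
// not part of the original header):
//   reduce<sum_op, int, int, int>::run(1, 2, 3)      evaluates to 6
//   reduce<product_op, int, int, int>::run(2, 3, 4)  evaluates to 24
//   reduce<sum_op>::run()                            evaluates to sum_op::Identity, i.e. 0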
/* generic unary operations */
struct not_op { template<typename A> constexpr static EIGEN_STRONG_INLINE auto run(A a) -> decltype(!a) { return !a; } };
struct negation_op { template<typename A> constexpr static EIGEN_STRONG_INLINE auto run(A a) -> decltype(-a) { return -a; } };
struct greater_equal_zero_op { template<typename A> constexpr static EIGEN_STRONG_INLINE auto run(A a) -> decltype(a >= 0) { return a >= 0; } };
/* reductions for lists */
// using auto -> return value spec makes ICC 13.0 and 13.1 crash here, so we have to hack it
// together in front... (13.0 doesn't work with array_prod/array_reduce/... anyway, but 13.1
// does...)
template<typename... Ts>
EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE decltype(reduce<product_op, Ts...>::run((*((Ts*)0))...)) arg_prod(Ts... ts)
{
return reduce<product_op, Ts...>::run(ts...);
}
template<typename... Ts>
constexpr EIGEN_STRONG_INLINE decltype(reduce<sum_op, Ts...>::run((*((Ts*)0))...)) arg_sum(Ts... ts)
{
return reduce<sum_op, Ts...>::run(ts...);
}
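// Illustrative use (a sketch, not part of the original header): these are variadic
// front-ends to the reducers above.
//   arg_sum(1, 2, 3)   evaluates to 6
//   arg_prod(2, 3, 4)  evaluates to 24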
/* reverse arrays */
template<typename Array, int... n>
constexpr EIGEN_STRONG_INLINE Array h_array_reverse(Array arr, numeric_list<int, n...>)
{
return {{array_get<sizeof...(n) - n - 1>(arr)...}};
}
template<typename T, std::size_t N>
constexpr EIGEN_STRONG_INLINE array<T, N> array_reverse(array<T, N> arr)
{
return h_array_reverse(arr, typename gen_numeric_list<int, N>::type());
}
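// Illustrative use (a sketch, not part of the original header):
//   array<int, 3> a = {{1, 2, 3}};
//   array_reverse(a)   yields {{3, 2, 1}}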
/* generic array reductions */
// can't reuse standard reduce() interface above because Intel's Compiler
// *really* doesn't like it, so we just reimplement the stuff
// (start from N - 1 and work down to 0 because specialization for
// n == N - 1 also doesn't work in Intel's compiler, so it goes into
// an infinite loop)
template<typename Reducer, typename T, std::size_t N, std::size_t n = N - 1>
struct h_array_reduce {
EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(array<T, N> arr, T identity) -> decltype(Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr, identity), array_get<n>(arr)))
{
return Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr, identity), array_get<n>(arr));
}
};
template<typename Reducer, typename T, std::size_t N>
struct h_array_reduce<Reducer, T, N, 0>
{
EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE T run(const array<T, N>& arr, T)
{
return array_get<0>(arr);
}
};
template<typename Reducer, typename T>
struct h_array_reduce<Reducer, T, 0>
{
EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE T run(const array<T, 0>&, T identity)
{
return identity;
}
};
template<typename Reducer, typename T, std::size_t N>
EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE auto array_reduce(const array<T, N>& arr, T identity) -> decltype(h_array_reduce<Reducer, T, N>::run(arr, identity))
{
return h_array_reduce<Reducer, T, N>::run(arr, identity);
}
/* standard array reductions */
template<typename T, std::size_t N>
EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE auto array_sum(const array<T, N>& arr) -> decltype(array_reduce<sum_op, T, N>(arr, static_cast<T>(0)))
{
return array_reduce<sum_op, T, N>(arr, static_cast<T>(0));
}
template<typename T, std::size_t N>
EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE auto array_prod(const array<T, N>& arr) -> decltype(array_reduce<product_op, T, N>(arr, static_cast<T>(1)))
{
return array_reduce<product_op, T, N>(arr, static_cast<T>(1));
}
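// Illustrative use (a sketch, not part of the original header):
//   array<int, 3> a = {{1, 2, 3}};
//   array_sum(a)    evaluates to 6
//   array_prod(a)   evaluates to 6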
template<typename t>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector<t>& a) {
eigen_assert(a.size() > 0);
t prod = 1;
for (size_t i = 0; i < a.size(); ++i) { prod *= a[i]; }
return prod;
}
/* zip an array */
template<typename Op, typename A, typename B, std::size_t N, int... n>
constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A(), B())),N> h_array_zip(array<A, N> a, array<B, N> b, numeric_list<int, n...>)
{
return array<decltype(Op::run(A(), B())),N>{{ Op::run(array_get<n>(a), array_get<n>(b))... }};
}
template<typename Op, typename A, typename B, std::size_t N>
constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A(), B())),N> array_zip(array<A, N> a, array<B, N> b)
{
return h_array_zip<Op>(a, b, typename gen_numeric_list<int, N>::type());
}
/* zip an array and reduce the result */
template<typename Reducer, typename Op, typename A, typename B, std::size_t N, int... n>
constexpr EIGEN_STRONG_INLINE auto h_array_zip_and_reduce(array<A, N> a, array<B, N> b, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(array_get<n>(a), array_get<n>(b))...))
{
return reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(array_get<n>(a), array_get<n>(b))...);
}
template<typename Reducer, typename Op, typename A, typename B, std::size_t N>
constexpr EIGEN_STRONG_INLINE auto array_zip_and_reduce(array<A, N> a, array<B, N> b) -> decltype(h_array_zip_and_reduce<Reducer, Op, A, B, N>(a, b, typename gen_numeric_list<int, N>::type()))
{
return h_array_zip_and_reduce<Reducer, Op, A, B, N>(a, b, typename gen_numeric_list<int, N>::type());
}
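// Illustrative use (a sketch, not part of the original header):
//   array<int, 3> a = {{1, 2, 3}}, b = {{4, 5, 6}};
//   array_zip<sum_op>(a, b)                         yields {{5, 7, 9}}
//   array_zip_and_reduce<sum_op, product_op>(a, b)  evaluates to 32, i.e. the dot product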
/* apply stuff to an array */
template<typename Op, typename A, std::size_t N, int... n>
constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A())),N> h_array_apply(array<A, N> a, numeric_list<int, n...>)
{
return array<decltype(Op::run(A())),N>{{ Op::run(array_get<n>(a))... }};
}
template<typename Op, typename A, std::size_t N>
constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A())),N> array_apply(array<A, N> a)
{
return h_array_apply<Op>(a, typename gen_numeric_list<int, N>::type());
}
/* apply stuff to an array and reduce */
template<typename Reducer, typename Op, typename A, std::size_t N, int... n>
constexpr EIGEN_STRONG_INLINE auto h_array_apply_and_reduce(array<A, N> arr, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(array_get<n>(arr))...))
{
return reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(array_get<n>(arr))...);
}
template<typename Reducer, typename Op, typename A, std::size_t N>
constexpr EIGEN_STRONG_INLINE auto array_apply_and_reduce(array<A, N> a) -> decltype(h_array_apply_and_reduce<Reducer, Op, A, N>(a, typename gen_numeric_list<int, N>::type()))
{
return h_array_apply_and_reduce<Reducer, Op, A, N>(a, typename gen_numeric_list<int, N>::type());
}
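// Illustrative use (a sketch, not part of the original header):
//   array<int, 3> a = {{1, 2, 3}};
//   array_apply<negation_op>(a)                     yields {{-1, -2, -3}}
//   array_apply_and_reduce<sum_op, negation_op>(a)  evaluates to -6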
/* repeat a value n times (and make an array out of it)
 * usage:
 *   array<int, 16> a = repeat<16>(42);
*/
template<int n>
struct h_repeat
{
template<typename t, int... ii>
constexpr static EIGEN_STRONG_INLINE array<t, n> run(t v, numeric_list<int, ii...>)
{
return {{ typename id_numeric<int, ii, t>::type(v)... }};
}
};
template<int n, typename t>
constexpr array<t, n> repeat(t v) { return h_repeat<n>::run(v, typename gen_numeric_list<int, n>::type()); }
/* instantiate a class by a C-style array */
template<class InstType, typename ArrType, std::size_t N, bool Reverse, typename... Ps>
struct h_instantiate_by_c_array;
template<class InstType, typename ArrType, std::size_t N, typename... Ps>
struct h_instantiate_by_c_array<InstType, ArrType, N, false, Ps...>
{
static InstType run(ArrType* arr, Ps... args)
{
return h_instantiate_by_c_array<InstType, ArrType, N - 1, false, Ps..., ArrType>::run(arr + 1, args..., arr[0]);
}
};
template<class InstType, typename ArrType, std::size_t N, typename... Ps>
struct h_instantiate_by_c_array<InstType, ArrType, N, true, Ps...>
{
static InstType run(ArrType* arr, Ps... args)
{
return h_instantiate_by_c_array<InstType, ArrType, N - 1, false, ArrType, Ps...>::run(arr + 1, arr[0], args...);
}
};
template<class InstType, typename ArrType, typename... Ps>
struct h_instantiate_by_c_array<InstType, ArrType, 0, false, Ps...>
{
static InstType run(ArrType* arr, Ps... args)
{
(void)arr;
return InstType(args...);
}
};
template<class InstType, typename ArrType, typename... Ps>
struct h_instantiate_by_c_array<InstType, ArrType, 0, true, Ps...>
{
static InstType run(ArrType* arr, Ps... args)
{
(void)arr;
return InstType(args...);
}
};
template<class InstType, typename ArrType, std::size_t N, bool Reverse = false>
InstType instantiate_by_c_array(ArrType* arr)
{
return h_instantiate_by_c_array<InstType, ArrType, N, Reverse>::run(arr);
}
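// Illustrative use (a sketch; Vec3 is a hypothetical type, not part of Eigen):
//   struct Vec3 { Vec3(int x, int y, int z); };
//   int coords[3] = {1, 2, 3};
//   Vec3 v = instantiate_by_c_array<Vec3, int, 3>(coords);  // calls Vec3(1, 2, 3)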
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_CXX11META_H

View File

@@ -0,0 +1,88 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_CXX11WORKAROUNDS_H
#define EIGEN_CXX11WORKAROUNDS_H
/* COMPATIBILITY CHECKS
* (so users of compilers that are too old get some realistic error messages)
*/
#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 1310)
#error Intel Compiler only supports required C++ features since version 13.1.
// note that most stuff in principle works with 13.0 but when combining
// some features, at some point 13.0 will just fail with an internal assertion
#elif defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6))
// G++ < 4.6 by default will continue processing the source files - even if we use #error to make
// it error out. For this reason, we use the pragma to make sure G++ aborts at the first error
// it sees. Unfortunately, that is still not our #error directive, but at least the output is
// short enough the user has a chance to see that the compiler version is not sufficient for
// the funky template mojo we use.
#pragma GCC diagnostic error "-Wfatal-errors"
#error GNU C++ Compiler (g++) only supports required C++ features since version 4.6.
#endif
/* Check that the compiler at least claims to support C++11. It might not be sufficient
* because the compiler may not implement it correctly, but at least we'll know.
* On the other hand, Visual Studio still doesn't claim to support C++11 although it's
* compliant enough for our purpose.
*/
#if (EIGEN_COMP_CXXVER < 11)
#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER)
#pragma GCC diagnostic error "-Wfatal-errors"
#endif
#error This library needs at least a C++11 compliant compiler. If you use g++/clang, please enable the -std=c++11 compiler flag. (-std=c++0x on older versions.)
#endif
namespace Eigen {
namespace internal {
/* std::get is only constexpr in C++14, not yet in C++11
*/
template<std::size_t I_, class T> constexpr inline T& array_get(std::vector<T>& a) { return a[I_]; }
template<std::size_t I_, class T> constexpr inline T&& array_get(std::vector<T>&& a) { return a[I_]; }
template<std::size_t I_, class T> constexpr inline T const& array_get(std::vector<T> const& a) { return a[I_]; }
/* Suppose you have a template of the form
* template<typename T> struct X;
* And you want to specialize it in such a way:
* template<typename S1, typename... SN> struct X<Foo<S1, SN...>> { ::: };
* template<> struct X<Foo<>> { ::: };
* This will work in Intel's compiler 13.0, but only to some extent in g++ 4.6, since
* g++ can only match templates called with parameter packs if the number of template
* arguments is not a fixed size (so inside the first specialization, referencing
* X<Foo<SN...>> will fail in g++). On the other hand, g++ will accept the following:
* template<typename... S> struct X<Foo<S...>> { ::: };
* as an additional (!) specialization, which will then only match the empty case.
* But Intel's compiler 13.0 won't accept that, it will only accept the empty syntax,
* so we have to create a workaround for this.
*/
#if defined(__GNUC__) && !defined(__INTEL_COMPILER)
#define EIGEN_TPL_PP_SPEC_HACK_DEF(mt, n) mt... n
#define EIGEN_TPL_PP_SPEC_HACK_DEFC(mt, n) , EIGEN_TPL_PP_SPEC_HACK_DEF(mt, n)
#define EIGEN_TPL_PP_SPEC_HACK_USE(n) n...
#define EIGEN_TPL_PP_SPEC_HACK_USEC(n) , n...
#else
#define EIGEN_TPL_PP_SPEC_HACK_DEF(mt, n)
#define EIGEN_TPL_PP_SPEC_HACK_DEFC(mt, n)
#define EIGEN_TPL_PP_SPEC_HACK_USE(n)
#define EIGEN_TPL_PP_SPEC_HACK_USEC(n)
#endif
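// Illustrative expansion (a sketch; X and C are placeholders, not part of Eigen):
// a partial specialization written once as
//   template<typename C EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, empty)>
//   struct X<C, type_list<EIGEN_TPL_PP_SPEC_HACK_USE(empty)>> { /* ... */ };
// expands on g++ to
//   template<typename C, typename... empty>
//   struct X<C, type_list<empty...>> { /* ... */ };
// and on the other compilers to the plain empty-list form
//   template<typename C>
//   struct X<C, type_list<>> { /* ... */ };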
} // end namespace internal
} // end namespace Eigen
#endif // EIGEN_CXX11WORKAROUNDS_H
/*
* kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
*/

View File

@@ -0,0 +1,261 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_EMULATE_ARRAY_H
#define EIGEN_EMULATE_ARRAY_H
// The array class is only available starting with cxx11. Emulate our own here
// if needed. Beware, msvc still doesn't advertise itself as a c++11 compiler!
// Moreover, CUDA doesn't support the STL containers, so we use our own instead.
#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(EIGEN_GPUCC) || defined(EIGEN_AVOID_STL_ARRAY)
namespace Eigen {
template <typename T, size_t n> class array {
public:
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE T& operator[] (size_t index) { eigen_internal_assert(index < size()); return values[index]; }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { eigen_internal_assert(index < size()); return values[index]; }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE T& at(size_t index) { eigen_assert(index < size()); return values[index]; }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const T& at(size_t index) const { eigen_assert(index < size()); return values[index]; }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE T& front() { return values[0]; }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const T& front() const { return values[0]; }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE T& back() { return values[n-1]; }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const T& back() const { return values[n-1]; }
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
static std::size_t size() { return n; }
T values[n];
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE array() { }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE array(const T& v) {
EIGEN_STATIC_ASSERT(n==1, YOU_MADE_A_PROGRAMMING_MISTAKE)
values[0] = v;
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE array(const T& v1, const T& v2) {
EIGEN_STATIC_ASSERT(n==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
values[0] = v1;
values[1] = v2;
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3) {
EIGEN_STATIC_ASSERT(n==3, YOU_MADE_A_PROGRAMMING_MISTAKE)
values[0] = v1;
values[1] = v2;
values[2] = v3;
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3,
const T& v4) {
EIGEN_STATIC_ASSERT(n==4, YOU_MADE_A_PROGRAMMING_MISTAKE)
values[0] = v1;
values[1] = v2;
values[2] = v3;
values[3] = v4;
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4,
const T& v5) {
EIGEN_STATIC_ASSERT(n==5, YOU_MADE_A_PROGRAMMING_MISTAKE)
values[0] = v1;
values[1] = v2;
values[2] = v3;
values[3] = v4;
values[4] = v5;
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4,
const T& v5, const T& v6) {
EIGEN_STATIC_ASSERT(n==6, YOU_MADE_A_PROGRAMMING_MISTAKE)
values[0] = v1;
values[1] = v2;
values[2] = v3;
values[3] = v4;
values[4] = v5;
values[5] = v6;
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4,
const T& v5, const T& v6, const T& v7) {
EIGEN_STATIC_ASSERT(n==7, YOU_MADE_A_PROGRAMMING_MISTAKE)
values[0] = v1;
values[1] = v2;
values[2] = v3;
values[3] = v4;
values[4] = v5;
values[5] = v6;
values[6] = v7;
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE array(
const T& v1, const T& v2, const T& v3, const T& v4,
const T& v5, const T& v6, const T& v7, const T& v8) {
EIGEN_STATIC_ASSERT(n==8, YOU_MADE_A_PROGRAMMING_MISTAKE)
values[0] = v1;
values[1] = v2;
values[2] = v3;
values[3] = v4;
values[4] = v5;
values[5] = v6;
values[6] = v7;
values[7] = v8;
}
#if EIGEN_HAS_VARIADIC_TEMPLATES
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE array(std::initializer_list<T> l) {
eigen_assert(l.size() == n);
internal::smart_copy(l.begin(), l.end(), values);
}
#endif
};
// Specialize array for zero size
template <typename T> class array<T, 0> {
public:
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE T& operator[] (size_t) {
eigen_assert(false && "Can't index a zero size array");
return dummy;
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const T& operator[] (size_t) const {
eigen_assert(false && "Can't index a zero size array");
return dummy;
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE T& front() {
eigen_assert(false && "Can't index a zero size array");
return dummy;
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const T& front() const {
eigen_assert(false && "Can't index a zero size array");
return dummy;
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE T& back() {
eigen_assert(false && "Can't index a zero size array");
return dummy;
}
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE const T& back() const {
eigen_assert(false && "Can't index a zero size array");
return dummy;
}
static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::size_t size() { return 0; }
EIGEN_DEVICE_FUNC
EIGEN_STRONG_INLINE array() : dummy() { }
#if EIGEN_HAS_VARIADIC_TEMPLATES
EIGEN_DEVICE_FUNC array(std::initializer_list<T> l) : dummy() {
EIGEN_UNUSED_VARIABLE(l);
eigen_assert(l.size() == 0);
}
#endif
private:
T dummy;
};
// Comparison operator
// Todo: implement !=, <, <=, >, and >=
template<class T, std::size_t N>
EIGEN_DEVICE_FUNC bool operator==(const array<T,N>& lhs, const array<T,N>& rhs) {
for (std::size_t i = 0; i < N; ++i) {
if (lhs[i] != rhs[i]) {
return false;
}
}
return true;
}
namespace internal {
template<std::size_t I_, class T, std::size_t N>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(array<T,N>& a) {
return a[I_];
}
template<std::size_t I_, class T, std::size_t N>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const array<T,N>& a) {
return a[I_];
}
template<class T, std::size_t N> struct array_size<array<T,N> > {
enum { value = N };
};
template<class T, std::size_t N> struct array_size<array<T,N>& > {
enum { value = N };
};
template<class T, std::size_t N> struct array_size<const array<T,N> > {
enum { value = N };
};
template<class T, std::size_t N> struct array_size<const array<T,N>& > {
enum { value = N };
};
} // end namespace internal
} // end namespace Eigen
#else
// The compiler supports c++11, and we're not targeting cuda: use std::array as Eigen::array
#include <array>
namespace Eigen {
template <typename T, std::size_t N> using array = std::array<T, N>;
namespace internal {
/* std::get is only constexpr in C++14, not yet in C++11
* - libstdc++ from version 4.7 onwards has it nevertheless,
* so use that
* - libstdc++ older versions: use _M_instance directly
* - libc++ all versions so far: use __elems_ directly
* - all other libs: use std::get to be portable, but
* this may not be constexpr
*/
#if defined(__GLIBCXX__) && __GLIBCXX__ < 20120322
#define STD_GET_ARR_HACK a._M_instance[I_]
#elif defined(_LIBCPP_VERSION)
#define STD_GET_ARR_HACK a.__elems_[I_]
#else
#define STD_GET_ARR_HACK std::template get<I_, T, N>(a)
#endif
template<std::size_t I_, class T, std::size_t N> constexpr inline T& array_get(std::array<T,N>& a) { return (T&) STD_GET_ARR_HACK; }
template<std::size_t I_, class T, std::size_t N> constexpr inline T&& array_get(std::array<T,N>&& a) { return (T&&) STD_GET_ARR_HACK; }
template<std::size_t I_, class T, std::size_t N> constexpr inline T const& array_get(std::array<T,N> const& a) { return (T const&) STD_GET_ARR_HACK; }
#undef STD_GET_ARR_HACK
} // end namespace internal
} // end namespace Eigen
#endif
#endif // EIGEN_EMULATE_ARRAY_H

View File

@@ -0,0 +1,158 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_FIXEDSIZEVECTOR_H
#define EIGEN_FIXEDSIZEVECTOR_H
namespace Eigen {
/** \class MaxSizeVector
* \ingroup Core
*
* \brief The MaxSizeVector class.
*
* The %MaxSizeVector provides a subset of std::vector functionality.
*
* The goal is to provide basic std::vector operations when using
* std::vector is not an option (e.g. on GPU or when compiling using
* FMA/AVX, as this can cause either compilation failures or illegal
* instruction failures).
*
* Beware: The constructors are not API compatible with those of
* std::vector.
*/
template <typename T>
class MaxSizeVector {
static const size_t alignment = EIGEN_PLAIN_ENUM_MAX(EIGEN_ALIGNOF(T), sizeof(void*));
public:
// Construct a new MaxSizeVector, reserve n elements.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
explicit MaxSizeVector(size_t n)
: reserve_(n), size_(0),
data_(static_cast<T*>(internal::handmade_aligned_malloc(n * sizeof(T), alignment))) {
}
// Construct a new MaxSizeVector, reserve and resize to n.
// Copy the init value to all elements.
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
MaxSizeVector(size_t n, const T& init)
: reserve_(n), size_(n),
data_(static_cast<T*>(internal::handmade_aligned_malloc(n * sizeof(T), alignment))) {
size_t i = 0;
EIGEN_TRY
{
for(; i < size_; ++i) { new (&data_[i]) T(init); }
}
EIGEN_CATCH(...)
{
// Construction failed, destruct in reverse order:
for(; (i+1) > 0; --i) { data_[i-1].~T(); }
internal::handmade_aligned_free(data_);
EIGEN_THROW;
}
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
~MaxSizeVector() {
for (size_t i = size_; i > 0; --i) {
data_[i-1].~T();
}
internal::handmade_aligned_free(data_);
}
void resize(size_t n) {
eigen_assert(n <= reserve_);
for (; size_ < n; ++size_) {
new (&data_[size_]) T;
}
for (; size_ > n; --size_) {
data_[size_-1].~T();
}
eigen_assert(size_ == n);
}
// Append new elements (up to reserved size).
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void push_back(const T& t) {
eigen_assert(size_ < reserve_);
new (&data_[size_++]) T(t);
}
// For C++03 compatibility this only takes one argument
template<class X>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void emplace_back(const X& x) {
eigen_assert(size_ < reserve_);
new (&data_[size_++]) T(x);
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const T& operator[] (size_t i) const {
eigen_assert(i < size_);
return data_[i];
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
T& operator[] (size_t i) {
eigen_assert(i < size_);
return data_[i];
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
T& back() {
eigen_assert(size_ > 0);
return data_[size_ - 1];
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const T& back() const {
eigen_assert(size_ > 0);
return data_[size_ - 1];
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pop_back() {
eigen_assert(size_ > 0);
data_[--size_].~T();
}
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
size_t size() const { return size_; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
bool empty() const { return size_ == 0; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
T* data() { return data_; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const T* data() const { return data_; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
T* begin() { return data_; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
T* end() { return data_ + size_; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const T* begin() const { return data_; }
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
const T* end() const { return data_ + size_; }
private:
size_t reserve_;
size_t size_;
T* data_;
};
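// Illustrative use (a sketch, not part of the original header):
//   MaxSizeVector<float> v(4);  // room for at most 4 elements, size() == 0
//   v.push_back(1.f);
//   v.push_back(2.f);           // size() == 2, v[0] == 1.f, v.back() == 2.f
//   v.pop_back();               // destroys the last element, size() == 1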
} // namespace Eigen
#endif // EIGEN_FIXEDSIZEVECTOR_H

View File

@@ -0,0 +1,43 @@
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2015 Tal Hadad <tal_hd@hotmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
#ifndef EIGEN_EULERANGLES_MODULE_H
#define EIGEN_EULERANGLES_MODULE_H
#include "../../Eigen/Core"
#include "../../Eigen/Geometry"
#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
namespace Eigen {
/**
* \defgroup EulerAngles_Module EulerAngles module
* \brief This module provides generic Euler angle rotations.
*
* Euler angles are a way to represent 3D rotation.
*
* In order to use this module in your code, include this header:
* \code
* #include <unsupported/Eigen/EulerAngles>
* \endcode
*
* See \ref EulerAngles for more information.
*
*/
}
#include "src/EulerAngles/EulerSystem.h"
#include "src/EulerAngles/EulerAngles.h"
#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
#endif // EIGEN_EULERANGLES_MODULE_H

Some files were not shown because too many files have changed in this diff.